xref: /PHP-8.3/ext/pcre/php_pcre.c (revision 4d91665f)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Andrei Zmievski <andrei@php.net>                             |
14    +----------------------------------------------------------------------+
15  */
16 
17 #include "php.h"
18 #include "php_ini.h"
19 #include "php_globals.h"
20 #include "php_pcre.h"
21 #include "ext/standard/info.h"
22 #include "ext/standard/basic_functions.h"
23 #include "zend_smart_str.h"
24 #include "SAPI.h"
25 
26 #include "ext/standard/php_string.h"
27 
/* Match-result layout selectors and capture flag bits. */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
#define PREG_OFFSET_CAPTURE			(1 << 8)
#define PREG_UNMATCHED_AS_NULL		(1 << 9)

/* Flag bits for splitting. */
#define PREG_SPLIT_NO_EMPTY			(1 << 0)
#define PREG_SPLIT_DELIM_CAPTURE	(1 << 1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1 << 2)

/* Set in the parsed pattern options when the 'e' modifier is seen; the
 * regex compiler in this file rejects any pattern carrying it. */
#define PREG_REPLACE_EVAL			(1 << 0)

#define PREG_GREP_INVERT			(1 << 0)

/* Recorded in a cache entry's preg_options when JIT compilation produced code. */
#define PREG_JIT					(1 << 3)

/* Number of cached patterns at which the compiled-regex cache is pruned. */
#define PCRE_CACHE_SIZE 4096

/* Normalise HAVE_PCRE_JIT_SUPPORT into a macro that is always defined (0 or 1). */
#if defined(HAVE_PCRE_JIT_SUPPORT)
#define PHP_PCRE_JIT_SUPPORT 1
#else
#define PHP_PCRE_JIT_SUPPORT 0
#endif

/* PCRE2 version string; assigned in MINIT and freed in MSHUTDOWN. */
char *php_pcre_version;
52 
53 #include "php_pcre_arginfo.h"
54 
55 struct _pcre_cache_entry {
56 	pcre2_code *re;
57 	uint32_t preg_options;
58 	uint32_t capture_count;
59 	uint32_t name_count;
60 	uint32_t compile_options;
61 	uint32_t refcount;
62 };
63 
64 PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre)
65 
66 #ifdef HAVE_PCRE_JIT_SUPPORT
67 #define PCRE_JIT_STACK_MIN_SIZE (32 * 1024)
68 #define PCRE_JIT_STACK_MAX_SIZE (192 * 1024)
69 ZEND_TLS pcre2_jit_stack *jit_stack = NULL;
70 #endif
71 /* General context using (infallible) system allocator. */
72 ZEND_TLS pcre2_general_context *gctx = NULL;
73 /* These two are global per thread for now. Though it is possible to use these
74  	per pattern. Either one can copy it and use in pce, or one does no global
75 	contexts at all, but creates for every pce. */
76 ZEND_TLS pcre2_compile_context *cctx = NULL;
77 ZEND_TLS pcre2_match_context   *mctx = NULL;
78 ZEND_TLS pcre2_match_data      *mdata = NULL;
79 ZEND_TLS bool              mdata_used = 0;
80 ZEND_TLS uint8_t pcre2_init_ok = 0;
/*
 * Mutex helpers guarding the PCRE2 re-initialisation in ZTS + JIT builds.
 * In every other configuration they expand to a no-op expression.
 *
 * Each macro expands to a single statement-like construct WITHOUT a trailing
 * semicolon, so `php_pcre_mutex_xxx();` is safe everywhere, including inside
 * unbraced if/else bodies.
 */
#if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT)
static MUTEX_T pcre_mt = NULL;
/* Allocate the mutex once, and only from the main thread. */
#define php_pcre_mutex_alloc() \
	do { \
		if (tsrm_is_main_thread() && !pcre_mt) { \
			pcre_mt = tsrm_mutex_alloc(); \
		} \
	} while (0)
/* Release the mutex from the main thread and forget the handle. */
#define php_pcre_mutex_free() \
	do { \
		if (tsrm_is_main_thread() && pcre_mt) { \
			tsrm_mutex_free(pcre_mt); \
			pcre_mt = NULL; \
		} \
	} while (0)
#define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt)
#define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt)
#else
#define php_pcre_mutex_alloc() ((void)0)
#define php_pcre_mutex_free() ((void)0)
#define php_pcre_mutex_lock() ((void)0)
#define php_pcre_mutex_unlock() ((void)0)
#endif
95 
96 ZEND_TLS HashTable char_tables;
97 
php_pcre_free_char_table(zval * data)98 static void php_pcre_free_char_table(zval *data)
99 {/*{{{*/
100 	void *ptr = Z_PTR_P(data);
101 	pefree(ptr, 1);
102 }/*}}}*/
103 
pcre_handle_exec_error(int pcre_code)104 static void pcre_handle_exec_error(int pcre_code) /* {{{ */
105 {
106 	int preg_code = 0;
107 
108 	switch (pcre_code) {
109 		case PCRE2_ERROR_MATCHLIMIT:
110 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
111 			break;
112 
113 		case PCRE2_ERROR_RECURSIONLIMIT:
114 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
115 			break;
116 
117 		case PCRE2_ERROR_BADUTFOFFSET:
118 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
119 			break;
120 
121 #ifdef HAVE_PCRE_JIT_SUPPORT
122 		case PCRE2_ERROR_JIT_STACKLIMIT:
123 			preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR;
124 			break;
125 #endif
126 
127 		default:
128 			if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) {
129 				preg_code = PHP_PCRE_BAD_UTF8_ERROR;
130 			} else  {
131 				preg_code = PHP_PCRE_INTERNAL_ERROR;
132 			}
133 			break;
134 	}
135 
136 	PCRE_G(error_code) = preg_code;
137 }
138 /* }}} */
139 
/*
 * Returns the static, human-readable message for a PHP_PCRE_* error code.
 * Codes without a dedicated message yield "Unknown error".
 */
static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */
{
	const char *msg = "Unknown error";

	switch (error_code) {
		case PHP_PCRE_NO_ERROR:
			msg = "No error";
			break;
		case PHP_PCRE_INTERNAL_ERROR:
			msg = "Internal error";
			break;
		case PHP_PCRE_BAD_UTF8_ERROR:
			msg = "Malformed UTF-8 characters, possibly incorrectly encoded";
			break;
		case PHP_PCRE_BAD_UTF8_OFFSET_ERROR:
			msg = "The offset did not correspond to the beginning of a valid UTF-8 code point";
			break;
		case PHP_PCRE_BACKTRACK_LIMIT_ERROR:
			msg = "Backtrack limit exhausted";
			break;
		case PHP_PCRE_RECURSION_LIMIT_ERROR:
			msg = "Recursion limit exhausted";
			break;

#ifdef HAVE_PCRE_JIT_SUPPORT
		case PHP_PCRE_JIT_STACKLIMIT_ERROR:
			msg = "JIT stack limit exhausted";
			break;
#endif

		default:
			break;
	}

	return msg;
}
/* }}} */
166 
/* Destructor for the persistent regex cache: frees the compiled pattern and
 * the entry itself, which was allocated with the system allocator. */
static void php_free_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		free(entry);
	}
}
/* }}} */
175 
/* Destructor for the per-request regex cache: frees the compiled pattern and
 * the entry itself, which was allocated with the request allocator. */
static void php_efree_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		efree(entry);
	}
}
/* }}} */
184 
/* PCRE2 allocation callback backed by the persistent allocator; `data` is unused. */
static void *php_pcre_malloc(PCRE2_SIZE size, void *data)
{
	(void)data;
	return pemalloc(size, 1);
}
189 
/* PCRE2 release callback paired with php_pcre_malloc(); `data` is unused. */
static void php_pcre_free(void *block, void *data)
{
	(void)data;
	pefree(block, 1);
}
194 
/* PCRE2 allocation callback backed by the request allocator; `data` is unused. */
static void *php_pcre_emalloc(PCRE2_SIZE size, void *data)
{
	(void)data;
	return emalloc(size);
}
199 
/* PCRE2 release callback paired with php_pcre_emalloc(); `data` is unused. */
static void php_pcre_efree(void *block, void *data)
{
	(void)data;
	efree(block);
}
204 
/* Extra compile options applied to the shared compile context. */
#ifndef PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
#define PHP_PCRE_DEFAULT_EXTRA_COPTIONS 0
#else
	/* pcre 10.38 needs PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, disabled by default */
#define PHP_PCRE_DEFAULT_EXTRA_COPTIONS PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
#endif

/* Number of offset pairs in the preallocated, per-thread match data block. */
#define PHP_PCRE_PREALLOC_MDATA_SIZE 32
213 
php_pcre_init_pcre2(uint8_t jit)214 static void php_pcre_init_pcre2(uint8_t jit)
215 {/*{{{*/
216 	if (!gctx) {
217 		gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL);
218 		if (!gctx) {
219 			pcre2_init_ok = 0;
220 			return;
221 		}
222 	}
223 
224 	if (!cctx) {
225 		cctx = pcre2_compile_context_create(gctx);
226 		if (!cctx) {
227 			pcre2_init_ok = 0;
228 			return;
229 		}
230 	}
231 
232 	pcre2_set_compile_extra_options(cctx, PHP_PCRE_DEFAULT_EXTRA_COPTIONS);
233 
234 	if (!mctx) {
235 		mctx = pcre2_match_context_create(gctx);
236 		if (!mctx) {
237 			pcre2_init_ok = 0;
238 			return;
239 		}
240 	}
241 
242 #ifdef HAVE_PCRE_JIT_SUPPORT
243 	if (jit && !jit_stack) {
244 		jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx);
245 		if (!jit_stack) {
246 			pcre2_init_ok = 0;
247 			return;
248 		}
249 	}
250 #endif
251 
252 	if (!mdata) {
253 		mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx);
254 		if (!mdata) {
255 			pcre2_init_ok = 0;
256 			return;
257 		}
258 	}
259 
260 	pcre2_init_ok = 1;
261 }/*}}}*/
262 
php_pcre_shutdown_pcre2(void)263 static void php_pcre_shutdown_pcre2(void)
264 {/*{{{*/
265 	if (gctx) {
266 		pcre2_general_context_free(gctx);
267 		gctx = NULL;
268 	}
269 
270 	if (cctx) {
271 		pcre2_compile_context_free(cctx);
272 		cctx = NULL;
273 	}
274 
275 	if (mctx) {
276 		pcre2_match_context_free(mctx);
277 		mctx = NULL;
278 	}
279 
280 #ifdef HAVE_PCRE_JIT_SUPPORT
281 	/* Stack may only be destroyed when no cached patterns
282 	 	possibly associated with it do exist. */
283 	if (jit_stack) {
284 		pcre2_jit_stack_free(jit_stack);
285 		jit_stack = NULL;
286 	}
287 #endif
288 
289 	if (mdata) {
290 		pcre2_match_data_free(mdata);
291 		mdata = NULL;
292 	}
293 
294 	pcre2_init_ok = 0;
295 }/*}}}*/
296 
PHP_GINIT_FUNCTION(pcre)297 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
298 {
299 	php_pcre_mutex_alloc();
300 
301 	/* If we're on the CLI SAPI, there will only be one request, so we don't need the
302 	 * cache to survive after RSHUTDOWN. */
303 	pcre_globals->per_request_cache = strcmp(sapi_module.name, "cli") == 0;
304 	if (!pcre_globals->per_request_cache) {
305 		zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
306 	}
307 
308 	pcre_globals->backtrack_limit = 0;
309 	pcre_globals->recursion_limit = 0;
310 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
311 	ZVAL_UNDEF(&pcre_globals->unmatched_null_pair);
312 	ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair);
313 #ifdef HAVE_PCRE_JIT_SUPPORT
314 	pcre_globals->jit = 1;
315 #endif
316 
317 	php_pcre_init_pcre2(1);
318 	zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1);
319 }
320 /* }}} */
321 
PHP_GSHUTDOWN_FUNCTION(pcre)322 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
323 {
324 	if (!pcre_globals->per_request_cache) {
325 		zend_hash_destroy(&pcre_globals->pcre_cache);
326 	}
327 
328 	php_pcre_shutdown_pcre2();
329 	zend_hash_destroy(&char_tables);
330 	php_pcre_mutex_free();
331 }
332 /* }}} */
333 
PHP_INI_MH(OnUpdateBacktrackLimit)334 static PHP_INI_MH(OnUpdateBacktrackLimit)
335 {/*{{{*/
336 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
337 	if (mctx) {
338 		pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit));
339 	}
340 
341 	return SUCCESS;
342 }/*}}}*/
343 
PHP_INI_MH(OnUpdateRecursionLimit)344 static PHP_INI_MH(OnUpdateRecursionLimit)
345 {/*{{{*/
346 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
347 	if (mctx) {
348 		pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit));
349 	}
350 
351 	return SUCCESS;
352 }/*}}}*/
353 
#ifdef HAVE_PCRE_JIT_SUPPORT
/* INI handler for pcre.jit: stores the flag, then attaches the per-thread JIT
 * stack to the match context when JIT is on and a stack exists, or detaches
 * any stack otherwise. */
static PHP_INI_MH(OnUpdateJit)
{/*{{{*/
	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);

	pcre2_jit_stack_assign(mctx, NULL, (PCRE_G(jit) && jit_stack) ? jit_stack : NULL);

	return SUCCESS;
}/*}}}*/
#endif
367 
368 PHP_INI_BEGIN()
369 	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals)
370 	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000",  PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals)
371 #ifdef HAVE_PCRE_JIT_SUPPORT
372 	STD_PHP_INI_BOOLEAN("pcre.jit",           "1",       PHP_INI_ALL, OnUpdateJit,            jit,             zend_pcre_globals, pcre_globals)
373 #endif
PHP_INI_END()374 PHP_INI_END()
375 
/*
 * Fetches a string-valued pcre2_config() item.
 *
 * what: one of the PCRE2_CONFIG_* selectors that yields a string.
 *
 * Returns a malloc()ed, NUL-terminated string that the caller must free(),
 * or NULL when the query fails (pcre2_config() reports errors as negative
 * values), yields nothing, or memory cannot be allocated.
 */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	char *ret;
	/* With a NULL buffer pcre2_config() only reports the required length. */
	int len = pcre2_config(what, NULL);

	if (len <= 0) {
		return NULL;
	}

	ret = malloc((size_t)len + 1);
	if (ret == NULL) {
		return NULL;
	}

	len = pcre2_config(what, ret);
	if (len <= 0) {
		/* Never hand an unfilled buffer back to the caller. */
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
389 
390 /* {{{ PHP_MINFO_FUNCTION(pcre) */
PHP_MINFO_FUNCTION(pcre)391 static PHP_MINFO_FUNCTION(pcre)
392 {
393 #ifdef HAVE_PCRE_JIT_SUPPORT
394 	uint32_t flag = 0;
395 	char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET);
396 #endif
397 	char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
398 	char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION);
399 
400 	php_info_print_table_start();
401 	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
402 	php_info_print_table_row(2, "PCRE Library Version", version);
403 	free(version);
404 	php_info_print_table_row(2, "PCRE Unicode Version", unicode);
405 	free(unicode);
406 
407 #ifdef HAVE_PCRE_JIT_SUPPORT
408 	if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) {
409 		php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled");
410 	} else {
411 		php_info_print_table_row(2, "PCRE JIT Support", "unknown" );
412 	}
413 	if (jit_target) {
414 		php_info_print_table_row(2, "PCRE JIT Target", jit_target);
415 	}
416 	free(jit_target);
417 #else
418 	php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" );
419 #endif
420 
421 #ifdef HAVE_PCRE_VALGRIND_SUPPORT
422 	php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" );
423 #endif
424 
425 	php_info_print_table_end();
426 
427 	DISPLAY_INI_ENTRIES();
428 }
429 /* }}} */
430 
431 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)432 static PHP_MINIT_FUNCTION(pcre)
433 {
434 #ifdef HAVE_PCRE_JIT_SUPPORT
435 	if (UNEXPECTED(!pcre2_init_ok)) {
436 		/* Retry. */
437 		php_pcre_init_pcre2(PCRE_G(jit));
438 		if (!pcre2_init_ok) {
439 			return FAILURE;
440 		}
441 	}
442 #endif
443 
444 	REGISTER_INI_ENTRIES();
445 
446 	php_pcre_version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
447 
448 	register_php_pcre_symbols(module_number);
449 
450 	return SUCCESS;
451 }
452 /* }}} */
453 
454 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)455 static PHP_MSHUTDOWN_FUNCTION(pcre)
456 {
457 	UNREGISTER_INI_ENTRIES();
458 
459 	free(php_pcre_version);
460 
461 	return SUCCESS;
462 }
463 /* }}} */
464 
465 /* {{{ PHP_RINIT_FUNCTION(pcre) */
PHP_RINIT_FUNCTION(pcre)466 static PHP_RINIT_FUNCTION(pcre)
467 {
468 #ifdef HAVE_PCRE_JIT_SUPPORT
469 	if (UNEXPECTED(!pcre2_init_ok)) {
470 		/* Retry. */
471 		php_pcre_mutex_lock();
472 		php_pcre_init_pcre2(PCRE_G(jit));
473 		if (!pcre2_init_ok) {
474 			php_pcre_mutex_unlock();
475 			return FAILURE;
476 		}
477 		php_pcre_mutex_unlock();
478 	}
479 
480 	mdata_used = 0;
481 #endif
482 
483 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
484 	PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL);
485 	if (!PCRE_G(gctx_zmm)) {
486 		return FAILURE;
487 	}
488 
489 	if (PCRE_G(per_request_cache)) {
490 		zend_hash_init(&PCRE_G(pcre_cache), 0, NULL, php_efree_pcre_cache, 0);
491 	}
492 
493 	return SUCCESS;
494 }
495 /* }}} */
496 
PHP_RSHUTDOWN_FUNCTION(pcre)497 static PHP_RSHUTDOWN_FUNCTION(pcre)
498 {
499 	pcre2_general_context_free(PCRE_G(gctx_zmm));
500 	PCRE_G(gctx_zmm) = NULL;
501 
502 	if (PCRE_G(per_request_cache)) {
503 		zend_hash_destroy(&PCRE_G(pcre_cache));
504 	}
505 
506 	zval_ptr_dtor(&PCRE_G(unmatched_null_pair));
507 	zval_ptr_dtor(&PCRE_G(unmatched_empty_pair));
508 	ZVAL_UNDEF(&PCRE_G(unmatched_null_pair));
509 	ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair));
510 	return SUCCESS;
511 }
512 
513 /* {{{ static pcre_clean_cache */
pcre_clean_cache(zval * data,void * arg)514 static int pcre_clean_cache(zval *data, void *arg)
515 {
516 	pcre_cache_entry *pce = (pcre_cache_entry *) Z_PTR_P(data);
517 	int *num_clean = (int *)arg;
518 
519 	if (*num_clean > 0 && !pce->refcount) {
520 		(*num_clean)--;
521 		return ZEND_HASH_APPLY_REMOVE;
522 	} else {
523 		return ZEND_HASH_APPLY_KEEP;
524 	}
525 }
526 /* }}} */
527 
/* Releases every non-NULL name in a table built by make_subpats_table() and
 * then frees the table itself. */
static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) {
	for (uint32_t idx = 0; idx < num_subpats; idx++) {
		zend_string *name = subpat_names[idx];

		if (name != NULL) {
			zend_string_release(name);
		}
	}
	efree(subpat_names);
}
537 
538 /* {{{ static make_subpats_table */
make_subpats_table(uint32_t num_subpats,pcre_cache_entry * pce)539 static zend_string **make_subpats_table(uint32_t num_subpats, pcre_cache_entry *pce)
540 {
541 	uint32_t name_cnt = pce->name_count, name_size, ni = 0;
542 	char *name_table;
543 	zend_string **subpat_names;
544 	int rc1, rc2;
545 
546 	rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table);
547 	rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size);
548 	if (rc1 < 0 || rc2 < 0) {
549 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2);
550 		return NULL;
551 	}
552 
553 	subpat_names = ecalloc(num_subpats, sizeof(zend_string *));
554 	while (ni++ < name_cnt) {
555 		unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
556 		const char *name = name_table + 2;
557 		subpat_names[name_idx] = zend_string_init(name, strlen(name), 0);
558 		if (is_numeric_string(ZSTR_VAL(subpat_names[name_idx]), ZSTR_LEN(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
559 			php_error_docref(NULL, E_WARNING, "Numeric named subpatterns are not allowed");
560 			free_subpats_table(subpat_names, num_subpats);
561 			return NULL;
562 		}
563 		name_table += name_size;
564 	}
565 	return subpat_names;
566 }
567 /* }}} */
568 
569 /* {{{ static calculate_unit_length */
570 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */
calculate_unit_length(pcre_cache_entry * pce,const char * start)571 static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start)
572 {
573 	size_t unit_len;
574 
575 	if (pce->compile_options & PCRE2_UTF) {
576 		const char *end = start;
577 
578 		/* skip continuation bytes */
579 		while ((*++end & 0xC0) == 0x80);
580 		unit_len = end - start;
581 	} else {
582 		unit_len = 1;
583 	}
584 	return unit_len;
585 }
586 /* }}} */
587 
588 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache_ex(zend_string * regex,int locale_aware)589 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, int locale_aware)
590 {
591 	pcre2_code			*re = NULL;
592 #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !HAVE_BUNDLED_PCRE
593 	uint32_t			 coptions = PCRE2_NO_START_OPTIMIZE;
594 #else
595 	uint32_t			 coptions = 0;
596 #endif
597 	PCRE2_UCHAR	         error[128];
598 	PCRE2_SIZE           erroffset;
599 	int                  errnumber;
600 	char				 delimiter;
601 	char				 start_delimiter;
602 	char				 end_delimiter;
603 	char				*p, *pp;
604 	char				*pattern;
605 	size_t				 pattern_len;
606 	uint32_t			 poptions = 0;
607 	const uint8_t       *tables = NULL;
608 	zval                *zv;
609 	pcre_cache_entry	 new_entry;
610 	int					 rc;
611 	zend_string 		*key;
612 	pcre_cache_entry	*ret;
613 
614 	if (locale_aware && BG(ctype_string)) {
615 		key = zend_string_concat2(
616 			ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)),
617 			ZSTR_VAL(regex), ZSTR_LEN(regex));
618 	} else {
619 		key = regex;
620 	}
621 
622 	/* Try to lookup the cached regex entry, and if successful, just pass
623 	   back the compiled pattern, otherwise go on and compile it. */
624 	zv = zend_hash_find(&PCRE_G(pcre_cache), key);
625 	if (zv) {
626 		if (key != regex) {
627 			zend_string_release_ex(key, 0);
628 		}
629 		return (pcre_cache_entry*)Z_PTR_P(zv);
630 	}
631 
632 	p = ZSTR_VAL(regex);
633 	const char* end_p = ZSTR_VAL(regex) + ZSTR_LEN(regex);
634 
635 	/* Parse through the leading whitespace, and display a warning if we
636 	   get to the end without encountering a delimiter. */
637 	while (isspace((int)*(unsigned char *)p)) p++;
638 	if (p >= end_p) {
639 		if (key != regex) {
640 			zend_string_release_ex(key, 0);
641 		}
642 		php_error_docref(NULL, E_WARNING, "Empty regular expression");
643 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
644 		return NULL;
645 	}
646 
647 	/* Get the delimiter and display a warning if it is alphanumeric
648 	   or a backslash. */
649 	delimiter = *p++;
650 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\' || delimiter == '\0') {
651 		if (key != regex) {
652 			zend_string_release_ex(key, 0);
653 		}
654 		php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric, backslash, or NUL");
655 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
656 		return NULL;
657 	}
658 
659 	start_delimiter = delimiter;
660 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
661 		delimiter = pp[5];
662 	end_delimiter = delimiter;
663 
664 	pp = p;
665 
666 	if (start_delimiter == end_delimiter) {
667 		/* We need to iterate through the pattern, searching for the ending delimiter,
668 		   but skipping the backslashed delimiters.  If the ending delimiter is not
669 		   found, display a warning. */
670 		while (pp < end_p) {
671 			if (*pp == '\\' && pp + 1 < end_p) pp++;
672 			else if (*pp == delimiter)
673 				break;
674 			pp++;
675 		}
676 	} else {
677 		/* We iterate through the pattern, searching for the matching ending
678 		 * delimiter. For each matching starting delimiter, we increment nesting
679 		 * level, and decrement it for each matching ending delimiter. If we
680 		 * reach the end of the pattern without matching, display a warning.
681 		 */
682 		int brackets = 1; 	/* brackets nesting level */
683 		while (pp < end_p) {
684 			if (*pp == '\\' && pp + 1 < end_p) pp++;
685 			else if (*pp == end_delimiter && --brackets <= 0)
686 				break;
687 			else if (*pp == start_delimiter)
688 				brackets++;
689 			pp++;
690 		}
691 	}
692 
693 	if (pp >= end_p) {
694 		if (key != regex) {
695 			zend_string_release_ex(key, 0);
696 		}
697 		if (start_delimiter == end_delimiter) {
698 			php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter);
699 		} else {
700 			php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
701 		}
702 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
703 		return NULL;
704 	}
705 
706 	/* Make a copy of the actual pattern. */
707 	pattern_len = pp - p;
708 	pattern = estrndup(p, pattern_len);
709 
710 	/* Move on to the options */
711 	pp++;
712 
713 	/* Parse through the options, setting appropriate flags.  Display
714 	   a warning if we encounter an unknown modifier. */
715 	while (pp < end_p) {
716 		switch (*pp++) {
717 			/* Perl compatible options */
718 			case 'i':	coptions |= PCRE2_CASELESS;		break;
719 			case 'm':	coptions |= PCRE2_MULTILINE;		break;
720 			case 'n':	coptions |= PCRE2_NO_AUTO_CAPTURE;	break;
721 			case 's':	coptions |= PCRE2_DOTALL;		break;
722 			case 'x':	coptions |= PCRE2_EXTENDED;		break;
723 
724 			/* PCRE specific options */
725 			case 'A':	coptions |= PCRE2_ANCHORED;		break;
726 			case 'D':	coptions |= PCRE2_DOLLAR_ENDONLY;break;
727 			case 'S':	/* Pass. */					break;
728 			case 'X':	/* Pass. */					break;
729 			case 'U':	coptions |= PCRE2_UNGREEDY;		break;
730 			case 'u':	coptions |= PCRE2_UTF;
731 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
732 	   characters, even in UTF-8 mode. However, this can be changed by setting
733 	   the PCRE2_UCP option. */
734 #ifdef PCRE2_UCP
735 						coptions |= PCRE2_UCP;
736 #endif
737 				break;
738 			case 'J':	coptions |= PCRE2_DUPNAMES;		break;
739 
740 			/* Custom preg options */
741 			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;
742 
743 			case ' ':
744 			case '\n':
745 			case '\r':
746 				break;
747 
748 			default:
749 				if (pp[-1]) {
750 					php_error_docref(NULL, E_WARNING, "Unknown modifier '%c'", pp[-1]);
751 				} else {
752 					php_error_docref(NULL, E_WARNING, "NUL is not a valid modifier");
753 				}
754 				pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
755 				efree(pattern);
756 				if (key != regex) {
757 					zend_string_release_ex(key, 0);
758 				}
759 				return NULL;
760 		}
761 	}
762 
763 	if (poptions & PREG_REPLACE_EVAL) {
764 		php_error_docref(NULL, E_WARNING, "The /e modifier is no longer supported, use preg_replace_callback instead");
765 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
766 		efree(pattern);
767 		if (key != regex) {
768 			zend_string_release_ex(key, 0);
769 		}
770 		return NULL;
771 	}
772 
773 	if (key != regex) {
774 		tables = (uint8_t *)zend_hash_find_ptr(&char_tables, BG(ctype_string));
775 		if (!tables) {
776 			zend_string *_k;
777 			tables = pcre2_maketables(gctx);
778 			if (UNEXPECTED(!tables)) {
779 				php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables");
780 				pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY);
781 				zend_string_release_ex(key, 0);
782 				efree(pattern);
783 				return NULL;
784 			}
785 			_k = zend_string_init(ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), 1);
786 			GC_MAKE_PERSISTENT_LOCAL(_k);
787 			zend_hash_add_ptr(&char_tables, _k, (void *)tables);
788 			zend_string_release(_k);
789 		}
790 	}
791 	pcre2_set_character_tables(cctx, tables);
792 
793 	/* Compile pattern and display a warning if compilation failed. */
794 	re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);
795 
796 	if (re == NULL) {
797 		if (key != regex) {
798 			zend_string_release_ex(key, 0);
799 		}
800 		pcre2_get_error_message(errnumber, error, sizeof(error));
801 		php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset);
802 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
803 		efree(pattern);
804 		return NULL;
805 	}
806 
807 #ifdef HAVE_PCRE_JIT_SUPPORT
808 	if (PCRE_G(jit)) {
809 		/* Enable PCRE JIT compiler */
810 		rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
811 		if (EXPECTED(rc >= 0)) {
812 			size_t jit_size = 0;
813 			if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) {
814 				poptions |= PREG_JIT;
815 			}
816 		} else if (rc == PCRE2_ERROR_NOMEMORY) {
817 			php_error_docref(NULL, E_WARNING,
818 				"Allocation of JIT memory failed, PCRE JIT will be disabled. "
819 				"This is likely caused by security restrictions. "
820 				"Either grant PHP permission to allocate executable memory, or set pcre.jit=0");
821 			PCRE_G(jit) = 0;
822 		} else {
823 			pcre2_get_error_message(rc, error, sizeof(error));
824 			php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error);
825 			pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
826 		}
827 	}
828 #endif
829 	efree(pattern);
830 
831 	/*
832 	 * If we reached cache limit, clean out the items from the head of the list;
833 	 * these are supposedly the oldest ones (but not necessarily the least used
834 	 * ones).
835 	 */
836 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
837 		int num_clean = PCRE_CACHE_SIZE / 8;
838 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean);
839 	}
840 
841 	/* Store the compiled pattern and extra info in the cache. */
842 	new_entry.re = re;
843 	new_entry.preg_options = poptions;
844 	new_entry.compile_options = coptions;
845 	new_entry.refcount = 0;
846 
847 	rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count);
848 	if (rc < 0) {
849 		if (key != regex) {
850 			zend_string_release_ex(key, 0);
851 		}
852 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc);
853 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
854 		return NULL;
855 	}
856 
857 	rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count);
858 	if (rc < 0) {
859 		if (key != regex) {
860 			zend_string_release_ex(key, 0);
861 		}
862 		php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc);
863 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
864 		return NULL;
865 	}
866 
867 	/*
868 	 * Interned strings are not duplicated when stored in HashTable,
869 	 * but all the interned strings created during HTTP request are removed
870 	 * at end of request. However PCRE_G(pcre_cache) must be consistent
871 	 * on the next request as well. So we disable usage of interned strings
872 	 * as hash keys especually for this table.
873 	 * See bug #63180
874 	 */
875 	if (!(GC_FLAGS(key) & IS_STR_PERMANENT) && !PCRE_G(per_request_cache)) {
876 		zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1);
877 		GC_MAKE_PERSISTENT_LOCAL(str);
878 
879 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry));
880 		zend_string_release(str);
881 	} else {
882 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry));
883 	}
884 
885 	if (key != regex) {
886 		zend_string_release_ex(key, 0);
887 	}
888 
889 	return ret;
890 }
891 /* }}} */
892 
893 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache(zend_string * regex)894 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
895 {
896 	return pcre_get_compiled_regex_cache_ex(regex, 1);
897 }
898 /* }}} */
899 
900 /* {{{ pcre_get_compiled_regex */
pcre_get_compiled_regex(zend_string * regex,uint32_t * capture_count)901 PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count)
902 {
903 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
904 
905 	if (capture_count) {
906 		*capture_count = pce ? pce->capture_count : 0;
907 	}
908 
909 	return pce ? pce->re : NULL;
910 }
911 /* }}} */
912 
913 /* XXX For the cases where it's only about match yes/no and no capture
914 		required, perhaps just a minimum sized data would suffice. */
php_pcre_create_match_data(uint32_t capture_count,pcre2_code * re)915 PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re)
916 {/*{{{*/
917 
918 	assert(NULL != re);
919 
920 	if (EXPECTED(!mdata_used)) {
921 		int rc = 0;
922 
923 		if (!capture_count) {
924 			/* As we deal with a non cached pattern, no other way to gather this info. */
925 			rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);
926 		}
927 
928 		if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
929 			mdata_used = 1;
930 			return mdata;
931 		}
932 	}
933 
934 	return pcre2_match_data_create_from_pattern(re, gctx);
935 }/*}}}*/
936 
php_pcre_free_match_data(pcre2_match_data * match_data)937 PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data)
938 {/*{{{*/
939 	if (UNEXPECTED(match_data != mdata)) {
940 		pcre2_match_data_free(match_data);
941 	} else {
942 		mdata_used = 0;
943 	}
944 }/*}}}*/
945 
init_unmatched_null_pair(void)946 static void init_unmatched_null_pair(void) {
947 	zval val1, val2;
948 	ZVAL_NULL(&val1);
949 	ZVAL_LONG(&val2, -1);
950 	ZVAL_ARR(&PCRE_G(unmatched_null_pair), zend_new_pair(&val1, &val2));
951 }
952 
init_unmatched_empty_pair(void)953 static void init_unmatched_empty_pair(void) {
954 	zval val1, val2;
955 	ZVAL_EMPTY_STRING(&val1);
956 	ZVAL_LONG(&val2, -1);
957 	ZVAL_ARR(&PCRE_G(unmatched_empty_pair), zend_new_pair(&val1, &val2));
958 }
959 
populate_match_value_str(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset)960 static zend_always_inline void populate_match_value_str(
961 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) {
962 	ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset);
963 }
964 
/*
 * Stores one capture into `val`: the matched substring when the group took
 * part in the match, otherwise null or an empty string depending on
 * `unmatched_as_null`.
 */
static inline void populate_match_value(
		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
		uint32_t unmatched_as_null) {
	if (PCRE2_UNSET != start_offset) {
		populate_match_value_str(val, subject, start_offset, end_offset);
		return;
	}

	/* The group did not participate in the match. */
	if (unmatched_as_null) {
		ZVAL_NULL(val);
	} else {
		ZVAL_EMPTY_STRING(val);
	}
}
978 
/* Store val under the group name in subpats, taking a reference on success.
 *
 * With the DUPNAMES option several subpatterns may share one name. A matched
 * value always replaces what is stored; an unmatched one is only added when
 * the name is not present yet, so the entry that actually has a value wins. */
static inline void add_named(
		zval *subpats, zend_string *name, zval *val, bool unmatched) {
	if (unmatched) {
		if (zend_hash_add(Z_ARRVAL_P(subpats), name, val) == NULL) {
			/* Name already present: keep the existing entry, no reference taken. */
			return;
		}
	} else {
		zend_hash_update(Z_ARRVAL_P(subpats), name, val);
	}

	Z_TRY_ADDREF_P(val);
}
992 
993 /* {{{ add_offset_pair */
add_offset_pair(zval * result,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset,zend_string * name,uint32_t unmatched_as_null)994 static inline void add_offset_pair(
995 		zval *result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
996 		zend_string *name, uint32_t unmatched_as_null)
997 {
998 	zval match_pair;
999 
1000 	/* Add (match, offset) to the return value */
1001 	if (PCRE2_UNSET == start_offset) {
1002 		if (unmatched_as_null) {
1003 			if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) {
1004 				init_unmatched_null_pair();
1005 			}
1006 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_null_pair));
1007 		} else {
1008 			if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) {
1009 				init_unmatched_empty_pair();
1010 			}
1011 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_empty_pair));
1012 		}
1013 	} else {
1014 		zval val1, val2;
1015 		populate_match_value_str(&val1, subject, start_offset, end_offset);
1016 		ZVAL_LONG(&val2, start_offset);
1017 		ZVAL_ARR(&match_pair, zend_new_pair(&val1, &val2));
1018 	}
1019 
1020 	if (name) {
1021 		add_named(result, name, &match_pair, start_offset == PCRE2_UNSET);
1022 	}
1023 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair);
1024 }
1025 /* }}} */
1026 
/* Fill subpats with the capture groups of a single match.
 *
 * offsets holds (start, end) pairs for the first `count` groups. When
 * subpat_names is non-NULL, a group with a name is stored under that name
 * first and then under the next numeric index. With PREG_OFFSET_CAPTURE each
 * entry is a (match, offset) pair. With PREG_UNMATCHED_AS_NULL the groups
 * from `count` up to num_subpats are appended as unmatched entries too.
 * A non-NULL mark is stored last under the "MARK" key. */
static void populate_subpat_array(
		zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names,
		uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) {
	const bool with_offsets = (flags & PREG_OFFSET_CAPTURE) != 0;
	const bool null_for_unmatched = (flags & PREG_UNMATCHED_AS_NULL) != 0;
	zend_string *name;
	zval val;
	int i;

	/* Groups reported for this match. */
	for (i = 0; i < count; i++) {
		name = subpat_names ? subpat_names[i] : NULL;

		if (with_offsets) {
			add_offset_pair(
				subpats, subject, offsets[2*i], offsets[2*i+1], name, null_for_unmatched);
		} else {
			populate_match_value(
				&val, subject, offsets[2*i], offsets[2*i+1], null_for_unmatched);
			if (name) {
				add_named(subpats, name, &val, offsets[2*i] == PCRE2_UNSET);
			}
			zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &val);
		}
	}

	/* Trailing groups that were not reported are only padded in NULL mode. */
	if (null_for_unmatched) {
		for (i = count; i < num_subpats; i++) {
			name = subpat_names ? subpat_names[i] : NULL;

			if (with_offsets) {
				add_offset_pair(subpats, NULL, PCRE2_UNSET, PCRE2_UNSET, name, 1);
			} else {
				ZVAL_NULL(&val);
				if (name) {
					/* Add only: an existing entry for this name is kept. */
					zend_hash_add(Z_ARRVAL_P(subpats), name, &val);
				}
				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &val);
			}
		}
	}

	/* Add MARK, if available */
	if (mark) {
		add_assoc_string_ex(subpats, "MARK", sizeof("MARK") - 1, (char *)mark);
	}
}
1094 
/* Common body of preg_match() and preg_match_all(): parses the PHP arguments,
 * fetches the compiled pattern and delegates to php_pcre_match_impl().
 * `global` is forwarded unchanged. Returns false to PHP when the pattern
 * cannot be obtained. */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	zend_string		 *regex;			/* Regular expression */
	zend_string		 *subject;			/* String to match against */
	zval			 *subpats = NULL;	/* Array for subpatterns */
	zend_long		  flags = 0;		/* Match control flags */
	zend_long		  start_offset = 0;	/* Where the new search starts */
	pcre_cache_entry *pce;				/* Compiled regular expression */
	int				  flags_given;		/* Was the 4th argument passed? */

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(regex)
		Z_PARAM_STR(subject)
		Z_PARAM_OPTIONAL
		Z_PARAM_ZVAL(subpats)
		Z_PARAM_LONG(flags)
		Z_PARAM_LONG(start_offset)
	ZEND_PARSE_PARAMETERS_END();

	/* Compile regex or get it from cache. */
	pce = pcre_get_compiled_regex_cache(regex);
	if (pce == NULL) {
		RETURN_FALSE;
	}

	flags_given = ZEND_NUM_ARGS() >= 4;

	/* Hold a reference on the cache entry for the duration of the match
	 * (presumably so it is not evicted while in use). */
	pce->refcount++;
	php_pcre_match_impl(pce, subject, return_value, subpats,
		global, flags_given, flags, start_offset);
	pce->refcount--;
}
/* }}} */
1125 
is_known_valid_utf8(zend_string * subject_str,PCRE2_SIZE start_offset)1126 static zend_always_inline bool is_known_valid_utf8(
1127 		zend_string *subject_str, PCRE2_SIZE start_offset) {
1128 	if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
1129 		/* We don't know whether the string is valid UTF-8 or not. */
1130 		return 0;
1131 	}
1132 
1133 	if (start_offset == ZSTR_LEN(subject_str)) {
1134 		/* Degenerate case: Offset points to end of string. */
1135 		return 1;
1136 	}
1137 
1138 	/* Check that the offset does not point to an UTF-8 continuation byte. */
1139 	return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
1140 }
1141 
1142 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zval * subpats,int global,int use_flags,zend_long flags,zend_off_t start_offset)1143 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
1144 	zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
1145 {
1146 	zval			 result_set,		/* Holds a set of subpatterns after
1147 										   a global match */
1148 					*match_sets = NULL;	/* An array of sets of matches for each
1149 										   subpattern after a global match */
1150 	uint32_t		 options;			/* Execution options */
1151 	int				 count;				/* Count of matched subpatterns */
1152 	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
1153 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1154 	int				 matched;			/* Has anything matched */
1155 	zend_string	   **subpat_names;		/* Array for named subpatterns */
1156 	size_t			 i;
1157 	uint32_t		 subpats_order;		/* Order of subpattern matches */
1158 	uint32_t		 offset_capture;	/* Capture match offsets: yes/no */
1159 	uint32_t		 unmatched_as_null;	/* Null non-matches: yes/no */
1160 	PCRE2_SPTR       mark = NULL;		/* Target for MARK name */
1161 	zval			 marks;				/* Array of marks for PREG_PATTERN_ORDER */
1162 	pcre2_match_data *match_data;
1163 	PCRE2_SIZE		 start_offset2, orig_start_offset;
1164 
1165 	char *subject = ZSTR_VAL(subject_str);
1166 	size_t subject_len = ZSTR_LEN(subject_str);
1167 
1168 	ZVAL_UNDEF(&marks);
1169 
1170 	/* Overwrite the passed-in value for subpatterns with an empty array. */
1171 	if (subpats != NULL) {
1172 		subpats = zend_try_array_init(subpats);
1173 		if (!subpats) {
1174 			RETURN_THROWS();
1175 		}
1176 	}
1177 
1178 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
1179 
1180 	if (use_flags) {
1181 		offset_capture = flags & PREG_OFFSET_CAPTURE;
1182 		unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;
1183 
1184 		/*
1185 		 * subpats_order is pre-set to pattern mode so we change it only if
1186 		 * necessary.
1187 		 */
1188 		if (flags & 0xff) {
1189 			subpats_order = flags & 0xff;
1190 		}
1191 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
1192 			(!global && subpats_order != 0)) {
1193 			zend_argument_value_error(4, "must be a PREG_* constant");
1194 			RETURN_THROWS();
1195 		}
1196 	} else {
1197 		offset_capture = 0;
1198 		unmatched_as_null = 0;
1199 	}
1200 
1201 	/* Negative offset counts from the end of the string. */
1202 	if (start_offset < 0) {
1203 		if ((PCRE2_SIZE)-start_offset <= subject_len) {
1204 			start_offset2 = subject_len + start_offset;
1205 		} else {
1206 			start_offset2 = 0;
1207 		}
1208 	} else {
1209 		start_offset2 = (PCRE2_SIZE)start_offset;
1210 	}
1211 
1212 	if (start_offset2 > subject_len) {
1213 		pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1214 		RETURN_FALSE;
1215 	}
1216 
1217 	/* Calculate the size of the offsets array, and allocate memory for it. */
1218 	num_subpats = pce->capture_count + 1;
1219 
1220 	/*
1221 	 * Build a mapping from subpattern numbers to their names. We will
1222 	 * allocate the table only if there are any named subpatterns.
1223 	 */
1224 	subpat_names = NULL;
1225 	if (subpats && pce->name_count > 0) {
1226 		subpat_names = make_subpats_table(num_subpats, pce);
1227 		if (!subpat_names) {
1228 			RETURN_FALSE;
1229 		}
1230 	}
1231 
1232 	/* Allocate match sets array and initialize the values. */
1233 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1234 		match_sets = (zval *)safe_emalloc(num_subpats, sizeof(zval), 0);
1235 		for (i=0; i<num_subpats; i++) {
1236 			array_init(&match_sets[i]);
1237 		}
1238 	}
1239 
1240 	matched = 0;
1241 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1242 
1243 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1244 		match_data = mdata;
1245 	} else {
1246 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1247 		if (!match_data) {
1248 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1249 			if (subpat_names) {
1250 				free_subpats_table(subpat_names, num_subpats);
1251 			}
1252 			if (match_sets) {
1253 				efree(match_sets);
1254 			}
1255 			RETURN_FALSE;
1256 		}
1257 	}
1258 
1259 	orig_start_offset = start_offset2;
1260 	options =
1261 		(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
1262 			? 0 : PCRE2_NO_UTF_CHECK;
1263 
1264 	/* Execute the regular expression. */
1265 #ifdef HAVE_PCRE_JIT_SUPPORT
1266 	if ((pce->preg_options & PREG_JIT) && options) {
1267 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1268 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1269 	} else
1270 #endif
1271 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1272 			options, match_data, mctx);
1273 
1274 	while (1) {
1275 		/* If something has matched */
1276 		if (count >= 0) {
1277 			/* Check for too many substrings condition. */
1278 			if (UNEXPECTED(count == 0)) {
1279 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
1280 				count = num_subpats;
1281 			}
1282 
1283 matched:
1284 			matched++;
1285 
1286 			offsets = pcre2_get_ovector_pointer(match_data);
1287 
1288 			/* If subpatterns array has been passed, fill it in with values. */
1289 			if (subpats != NULL) {
1290 				/* Try to get the list of substrings and display a warning if failed. */
1291 				if (offsets[1] < offsets[0]) {
1292 					if (subpat_names) {
1293 						free_subpats_table(subpat_names, num_subpats);
1294 					}
1295 					if (match_sets) efree(match_sets);
1296 					php_error_docref(NULL, E_WARNING, "Get subpatterns list failed");
1297 					RETURN_FALSE;
1298 				}
1299 
1300 				if (global) {	/* global pattern matching */
1301 					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
1302 						/* For each subpattern, insert it into the appropriate array. */
1303 						if (offset_capture) {
1304 							for (i = 0; i < count; i++) {
1305 								add_offset_pair(
1306 									&match_sets[i], subject, offsets[2*i], offsets[2*i+1],
1307 									NULL, unmatched_as_null);
1308 							}
1309 						} else {
1310 							for (i = 0; i < count; i++) {
1311 								zval val;
1312 								populate_match_value(
1313 									&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
1314 								zend_hash_next_index_insert_new(Z_ARRVAL(match_sets[i]), &val);
1315 							}
1316 						}
1317 						mark = pcre2_get_mark(match_data);
1318 						/* Add MARK, if available */
1319 						if (mark) {
1320 							if (Z_TYPE(marks) == IS_UNDEF) {
1321 								array_init(&marks);
1322 							}
1323 							add_index_string(&marks, matched - 1, (char *) mark);
1324 						}
1325 						/*
1326 						 * If the number of captured subpatterns on this run is
1327 						 * less than the total possible number, pad the result
1328 						 * arrays with NULLs or empty strings.
1329 						 */
1330 						if (count < num_subpats) {
1331 							for (; i < num_subpats; i++) {
1332 								if (offset_capture) {
1333 									add_offset_pair(
1334 										&match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
1335 										NULL, unmatched_as_null);
1336 								} else if (unmatched_as_null) {
1337 									add_next_index_null(&match_sets[i]);
1338 								} else {
1339 									add_next_index_str(&match_sets[i], ZSTR_EMPTY_ALLOC());
1340 								}
1341 							}
1342 						}
1343 					} else {
1344 						/* Allocate and populate the result set array */
1345 						array_init_size(&result_set, count + (mark ? 1 : 0));
1346 						mark = pcre2_get_mark(match_data);
1347 						populate_subpat_array(
1348 							&result_set, subject, offsets, subpat_names,
1349 							num_subpats, count, mark, flags);
1350 						/* And add it to the output array */
1351 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set);
1352 					}
1353 				} else {			/* single pattern matching */
1354 					/* For each subpattern, insert it into the subpatterns array. */
1355 					mark = pcre2_get_mark(match_data);
1356 					populate_subpat_array(
1357 						subpats, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1358 					break;
1359 				}
1360 			}
1361 
1362 			/* Advance to the next piece. */
1363 			start_offset2 = offsets[1];
1364 
1365 			/* If we have matched an empty string, mimic what Perl's /g options does.
1366 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1367 			   the match again at the same point. If this fails (picked up above) we
1368 			   advance to the next character. */
1369 			if (start_offset2 == offsets[0]) {
1370 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1371 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1372 				if (count >= 0) {
1373 					if (global) {
1374 						goto matched;
1375 					} else {
1376 						break;
1377 					}
1378 				} else if (count == PCRE2_ERROR_NOMATCH) {
1379 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1380 					   this is not necessarily the end. We need to advance
1381 					   the start offset, and continue. Fudge the offset values
1382 					   to achieve this, unless we're already at the end of the string. */
1383 					if (start_offset2 < subject_len) {
1384 						size_t unit_len = calculate_unit_length(pce, subject + start_offset2);
1385 
1386 						start_offset2 += unit_len;
1387 					} else {
1388 						break;
1389 					}
1390 				} else {
1391 					goto error;
1392 				}
1393 			}
1394 		} else if (count == PCRE2_ERROR_NOMATCH) {
1395 			break;
1396 		} else {
1397 error:
1398 			pcre_handle_exec_error(count);
1399 			break;
1400 		}
1401 
1402 		if (!global) {
1403 			break;
1404 		}
1405 
1406 		/* Execute the regular expression. */
1407 #ifdef HAVE_PCRE_JIT_SUPPORT
1408 		if ((pce->preg_options & PREG_JIT)) {
1409 			if (PCRE2_UNSET == start_offset2 || start_offset2 > subject_len) {
1410 				pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1411 				break;
1412 			}
1413 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1414 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1415 		} else
1416 #endif
1417 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1418 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1419 	}
1420 	if (match_data != mdata) {
1421 		pcre2_match_data_free(match_data);
1422 	}
1423 
1424 	/* Add the match sets to the output array and clean up */
1425 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1426 		if (subpat_names) {
1427 			for (i = 0; i < num_subpats; i++) {
1428 				if (subpat_names[i]) {
1429 					zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &match_sets[i]);
1430 					Z_ADDREF(match_sets[i]);
1431 				}
1432 				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
1433 			}
1434 		} else {
1435 			for (i = 0; i < num_subpats; i++) {
1436 				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
1437 			}
1438 		}
1439 		efree(match_sets);
1440 
1441 		if (Z_TYPE(marks) != IS_UNDEF) {
1442 			add_assoc_zval(subpats, "MARK", &marks);
1443 		}
1444 	}
1445 
1446 	if (subpat_names) {
1447 		free_subpats_table(subpat_names, num_subpats);
1448 	}
1449 
1450 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
1451 		/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1452 		if ((pce->compile_options & PCRE2_UTF)
1453 				&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
1454 			GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
1455 		}
1456 
1457 		RETVAL_LONG(matched);
1458 	} else {
1459 		RETVAL_FALSE;
1460 	}
1461 }
1462 /* }}} */
1463 
1464 /* {{{ Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)1465 PHP_FUNCTION(preg_match)
1466 {
1467 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
1468 }
1469 /* }}} */
1470 
1471 /* {{{ Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)1472 PHP_FUNCTION(preg_match_all)
1473 {
1474 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
1475 }
1476 /* }}} */
1477 
/* {{{ preg_get_backref */
/* Parse a back-reference of the form <c>N, <c>NN or ${N} / ${NN}, where
 * *str points at the introducer character <c> (the caller has already
 * checked it) and N is a decimal digit.
 *
 * On success returns 1, stores the group number (0-99) in *backref and moves
 * *str past the reference. On failure returns 0 and leaves *str untouched;
 * *backref may already have been written in that case. */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int braced = 0;

	/* An introducer at the very end of the string refers to nothing. */
	if (cursor[1] == '\0') {
		return 0;
	}

	/* Step over the introducer, plus the '{' of the "${" form. */
	if (cursor[0] == '$' && cursor[1] == '{') {
		braced = 1;
		cursor += 2;
	} else {
		cursor += 1;
	}

	/* The first digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* An optional second digit extends the number. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	/* The braced form must be closed right after the digits. */
	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
/* }}} */
1515 
1516 /* {{{ preg_do_repl_func */
preg_do_repl_func(zend_fcall_info * fci,zend_fcall_info_cache * fcc,const char * subject,PCRE2_SIZE * offsets,zend_string ** subpat_names,uint32_t num_subpats,int count,const PCRE2_SPTR mark,zend_long flags)1517 static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags)
1518 {
1519 	zend_string *result_str;
1520 	zval		 retval;			/* Function return value */
1521 	zval	     arg;				/* Argument to pass to function */
1522 
1523 	array_init_size(&arg, count + (mark ? 1 : 0));
1524 	populate_subpat_array(&arg, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1525 
1526 	fci->retval = &retval;
1527 	fci->param_count = 1;
1528 	fci->params = &arg;
1529 
1530 	if (zend_call_function(fci, fcc) == SUCCESS && Z_TYPE(retval) != IS_UNDEF) {
1531 		if (EXPECTED(Z_TYPE(retval) == IS_STRING)) {
1532 			result_str = Z_STR(retval);
1533 		} else {
1534 			result_str = zval_get_string_func(&retval);
1535 			zval_ptr_dtor(&retval);
1536 		}
1537 	} else {
1538 		if (!EG(exception)) {
1539 			php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
1540 		}
1541 
1542 		result_str = zend_string_init(&subject[offsets[0]], offsets[1] - offsets[0], 0);
1543 	}
1544 
1545 	zval_ptr_dtor(&arg);
1546 
1547 	return result_str;
1548 }
1549 /* }}} */
1550 
1551 /* {{{ php_pcre_replace */
php_pcre_replace(zend_string * regex,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1552 PHPAPI zend_string *php_pcre_replace(zend_string *regex,
1553 							  zend_string *subject_str,
1554 							  const char *subject, size_t subject_len,
1555 							  zend_string *replace_str,
1556 							  size_t limit, size_t *replace_count)
1557 {
1558 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1559 	zend_string	 		*result;			/* Function result */
1560 
1561 	/* Abort on pending exception, e.g. thrown from __toString(). */
1562 	if (UNEXPECTED(EG(exception))) {
1563 		return NULL;
1564 	}
1565 
1566 	/* Compile regex or get it from cache. */
1567 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1568 		return NULL;
1569 	}
1570 	pce->refcount++;
1571 	result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str,
1572 		limit, replace_count);
1573 	pce->refcount--;
1574 
1575 	return result;
1576 }
1577 /* }}} */
1578 
1579 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1580 PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count)
1581 {
1582 	uint32_t		 options;			/* Execution options */
1583 	int				 count;				/* Count of matched subpatterns */
1584 	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
1585 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1586 	size_t			 new_len;			/* Length of needed storage */
1587 	size_t			 alloc_len;			/* Actual allocated length */
1588 	size_t			 match_len;			/* Length of the current match */
1589 	int				 backref;			/* Backreference number */
1590 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
1591 	size_t			 last_end_offset;	/* Where the last search ended */
1592 	char			*walkbuf,			/* Location of current replacement in the result */
1593 					*walk,				/* Used to walk the replacement string */
1594 					 walk_last;			/* Last walked character */
1595 	const char		*match,				/* The current match */
1596 					*piece,				/* The current piece of subject */
1597 					*replace_end;		/* End of replacement string */
1598 	size_t			result_len; 		/* Length of result */
1599 	zend_string		*result;			/* Result of replacement */
1600 	pcre2_match_data *match_data;
1601 
1602 	/* Calculate the size of the offsets array, and allocate memory for it. */
1603 	num_subpats = pce->capture_count + 1;
1604 	alloc_len = 0;
1605 	result = NULL;
1606 
1607 	/* Initialize */
1608 	match = NULL;
1609 	start_offset = 0;
1610 	last_end_offset = 0;
1611 	result_len = 0;
1612 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1613 
1614 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1615 		match_data = mdata;
1616 	} else {
1617 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1618 		if (!match_data) {
1619 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1620 			return NULL;
1621 		}
1622 	}
1623 
1624 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1625 
1626 	/* Execute the regular expression. */
1627 #ifdef HAVE_PCRE_JIT_SUPPORT
1628 	if ((pce->preg_options & PREG_JIT) && options) {
1629 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1630 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1631 	} else
1632 #endif
1633 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1634 			options, match_data, mctx);
1635 
1636 	while (1) {
1637 		piece = subject + last_end_offset;
1638 
1639 		if (count >= 0 && limit > 0) {
1640 			bool simple_string;
1641 
1642 			/* Check for too many substrings condition. */
1643 			if (UNEXPECTED(count == 0)) {
1644 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1645 				count = num_subpats;
1646 			}
1647 
1648 matched:
1649 			offsets = pcre2_get_ovector_pointer(match_data);
1650 
1651 			if (UNEXPECTED(offsets[1] < offsets[0])) {
1652 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1653 				if (result) {
1654 					zend_string_release_ex(result, 0);
1655 					result = NULL;
1656 				}
1657 				break;
1658 			}
1659 
1660 			if (replace_count) {
1661 				++*replace_count;
1662 			}
1663 
1664 			/* Set the match location in subject */
1665 			match = subject + offsets[0];
1666 
1667 			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */
1668 
1669 			walk = ZSTR_VAL(replace_str);
1670 			replace_end = walk + ZSTR_LEN(replace_str);
1671 			walk_last = 0;
1672 			simple_string = 1;
1673 			while (walk < replace_end) {
1674 				if ('\\' == *walk || '$' == *walk) {
1675 					simple_string = 0;
1676 					if (walk_last == '\\') {
1677 						walk++;
1678 						walk_last = 0;
1679 						continue;
1680 					}
1681 					if (preg_get_backref(&walk, &backref)) {
1682 						if (backref < count)
1683 							new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1684 						continue;
1685 					}
1686 				}
1687 				new_len++;
1688 				walk++;
1689 				walk_last = walk[-1];
1690 			}
1691 
1692 			if (new_len >= alloc_len) {
1693 				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
1694 				if (result == NULL) {
1695 					result = zend_string_alloc(alloc_len, 0);
1696 				} else {
1697 					result = zend_string_extend(result, alloc_len, 0);
1698 				}
1699 			}
1700 
1701 			if (match-piece > 0) {
1702 				/* copy the part of the string before the match */
1703 				memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece);
1704 				result_len += (match-piece);
1705 			}
1706 
1707 			if (simple_string) {
1708 				/* copy replacement */
1709 				memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1);
1710 				result_len += ZSTR_LEN(replace_str);
1711 			} else {
1712 				/* copy replacement and backrefs */
1713 				walkbuf = ZSTR_VAL(result) + result_len;
1714 
1715 				walk = ZSTR_VAL(replace_str);
1716 				walk_last = 0;
1717 				while (walk < replace_end) {
1718 					if ('\\' == *walk || '$' == *walk) {
1719 						if (walk_last == '\\') {
1720 							*(walkbuf-1) = *walk++;
1721 							walk_last = 0;
1722 							continue;
1723 						}
1724 						if (preg_get_backref(&walk, &backref)) {
1725 							if (backref < count) {
1726 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1727 								memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1728 								walkbuf += match_len;
1729 							}
1730 							continue;
1731 						}
1732 					}
1733 					*walkbuf++ = *walk++;
1734 					walk_last = walk[-1];
1735 				}
1736 				*walkbuf = '\0';
1737 				/* increment the result length by how much we've added to the string */
1738 				result_len += (walkbuf - (ZSTR_VAL(result) + result_len));
1739 			}
1740 
1741 			limit--;
1742 
1743 			/* Advance to the next piece. */
1744 			start_offset = last_end_offset = offsets[1];
1745 
1746 			/* If we have matched an empty string, mimic what Perl's /g options does.
1747 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1748 			   the match again at the same point. If this fails (picked up above) we
1749 			   advance to the next character. */
1750 			if (start_offset == offsets[0]) {
1751 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1752 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1753 
1754 				piece = subject + start_offset;
1755 				if (count >= 0 && limit > 0) {
1756 					goto matched;
1757 				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1758 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1759 					   this is not necessarily the end. We need to advance
1760 					   the start offset, and continue. Fudge the offset values
1761 					   to achieve this, unless we're already at the end of the string. */
1762 					if (start_offset < subject_len) {
1763 						size_t unit_len = calculate_unit_length(pce, piece);
1764 						start_offset += unit_len;
1765 					} else {
1766 						goto not_matched;
1767 					}
1768 				} else {
1769 					goto error;
1770 				}
1771 			}
1772 
1773 		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1774 not_matched:
1775 			if (!result && subject_str) {
1776 				result = zend_string_copy(subject_str);
1777 				break;
1778 			}
1779 			/* now we know exactly how long it is */
1780 			alloc_len = result_len + subject_len - last_end_offset;
1781 			if (NULL != result) {
1782 				result = zend_string_realloc(result, alloc_len, 0);
1783 			} else {
1784 				result = zend_string_alloc(alloc_len, 0);
1785 			}
1786 			/* stick that last bit of string on our output */
1787 			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
1788 			result_len += subject_len - last_end_offset;
1789 			ZSTR_VAL(result)[result_len] = '\0';
1790 			ZSTR_LEN(result) = result_len;
1791 			break;
1792 		} else {
1793 error:
1794 			pcre_handle_exec_error(count);
1795 			if (result) {
1796 				zend_string_release_ex(result, 0);
1797 				result = NULL;
1798 			}
1799 			break;
1800 		}
1801 
1802 #ifdef HAVE_PCRE_JIT_SUPPORT
1803 		if (pce->preg_options & PREG_JIT) {
1804 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1805 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1806 		} else
1807 #endif
1808 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1809 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1810 	}
1811 	if (match_data != mdata) {
1812 		pcre2_match_data_free(match_data);
1813 	}
1814 
1815 	return result;
1816 }
1817 /* }}} */
1818 
1819 /* {{{ php_pcre_replace_func_impl() */
/*
 * Replace matches of the compiled pattern `pce` in `subject` (of length
 * `subject_len`) with the string produced by preg_do_repl_func() for the
 * callback described by `fci`/`fcc`.
 *
 * At most `limit` replacements are made. When `replace_count` is non-NULL it
 * is incremented once per replacement. `flags` is forwarded unchanged to
 * preg_do_repl_func().
 *
 * Returns the resulting string. If nothing was replaced and `subject_str` is
 * non-NULL, the result is zend_string_copy(subject_str). Returns NULL when the
 * subpattern name table or the match data cannot be created, when PCRE
 * reports an execution error, or when a match reports an end offset that
 * precedes its start offset.
 */
static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_fcall_info *fci, zend_fcall_info_cache *fcc, size_t limit, size_t *replace_count, zend_long flags)
{
	uint32_t		 options;			/* Execution options */
	int				 count;				/* Count of matched subpatterns */
	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
	zend_string		**subpat_names;		/* Array for named subpatterns */
	uint32_t		 num_subpats;		/* Number of captured subpatterns */
	size_t			 new_len;			/* Length of needed storage */
	size_t			 alloc_len;			/* Actual allocated length */
	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
	size_t			 last_end_offset;	/* Where the last search ended */
	const char		*match,				/* The current match */
					*piece;				/* The current piece of subject */
	size_t			result_len; 		/* Length of result */
	zend_string		*result;			/* Result of replacement */
	zend_string     *eval_result;		/* Result of custom function */
	pcre2_match_data *match_data;
	bool old_mdata_used;

	/* Calculate the size of the offsets array, and allocate memory for it. */
	num_subpats = pce->capture_count + 1;

	/*
	 * Build a mapping from subpattern numbers to their names. We will
	 * allocate the table only if there are any named subpatterns.
	 */
	subpat_names = NULL;
	if (UNEXPECTED(pce->name_count > 0)) {
		subpat_names = make_subpats_table(num_subpats, pce);
		if (!subpat_names) {
			return NULL;
		}
	}

	alloc_len = 0;
	result = NULL;

	/* Initialize */
	match = NULL;
	start_offset = 0;
	last_end_offset = 0;
	result_len = 0;
	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	/* Use the preallocated match data (mdata) when it is free and large enough
	   for this pattern, and mark it as in use; otherwise create a dedicated
	   match data block. The previous mdata_used value is restored on exit.
	   NOTE(review): presumably mdata is flagged because the callback may call
	   back into PCRE functions -- confirm. */
	old_mdata_used = mdata_used;
	if (!old_mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
		mdata_used = 1;
		match_data = mdata;
	} else {
		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
		if (!match_data) {
			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
			if (subpat_names) {
				free_subpats_table(subpat_names, num_subpats);
			}
			mdata_used = old_mdata_used;
			return NULL;
		}
	}

	/* The first match passes no options for PCRE2_UTF patterns and
	   PCRE2_NO_UTF_CHECK otherwise; every later match in this function
	   passes PCRE2_NO_UTF_CHECK. */
	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;

	/* Execute the regular expression. */
#ifdef HAVE_PCRE_JIT_SUPPORT
	/* The JIT entry point is used for the first match only when options is
	   non-zero, i.e. when the pattern was not compiled with PCRE2_UTF. */
	if ((pce->preg_options & PREG_JIT) && options) {
		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
				PCRE2_NO_UTF_CHECK, match_data, mctx);
	} else
#endif
	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
			options, match_data, mctx);

	while (1) {
		piece = subject + last_end_offset;

		if (count >= 0 && limit) {
			/* Check for too many substrings condition. */
			if (UNEXPECTED(count == 0)) {
				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
				count = num_subpats;
			}

			/* Also entered via goto from the empty-match retry below. */
matched:
			offsets = pcre2_get_ovector_pointer(match_data);

			/* A match whose end precedes its start is treated as an internal error. */
			if (UNEXPECTED(offsets[1] < offsets[0])) {
				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
				if (result) {
					zend_string_release_ex(result, 0);
					result = NULL;
				}
				break;
			}

			if (replace_count) {
				++*replace_count;
			}

			/* Set the match location in subject */
			match = subject + offsets[0];

			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */

			/* Use custom function to get replacement string and its length. */
			eval_result = preg_do_repl_func(
				fci, fcc, subject, offsets, subpat_names, num_subpats, count,
				pcre2_get_mark(match_data), flags);

			ZEND_ASSERT(eval_result);
			/* Add the replacement length to new_len using the guarded (overflow-checked) helper. */
			new_len = zend_safe_address_guarded(1, ZSTR_LEN(eval_result) + ZSTR_MAX_OVERHEAD, new_len) -ZSTR_MAX_OVERHEAD;
			if (new_len >= alloc_len) {
				/* Grow the buffer to roughly twice the needed length. */
				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
				if (result == NULL) {
					result = zend_string_alloc(alloc_len, 0);
				} else {
					result = zend_string_extend(result, alloc_len, 0);
				}
			}

			if (match-piece > 0) {
				/* copy the part of the string before the match */
				memcpy(ZSTR_VAL(result) + result_len, piece, match-piece);
				result_len += (match-piece);
			}

			/* If using custom function, copy result to the buffer and clean up. */
			memcpy(ZSTR_VAL(result) + result_len, ZSTR_VAL(eval_result), ZSTR_LEN(eval_result));
			result_len += ZSTR_LEN(eval_result);
			zend_string_release_ex(eval_result, 0);

			limit--;

			/* Advance to the next piece. */
			start_offset = last_end_offset = offsets[1];

			/* If we have matched an empty string, mimic what Perl's /g options does.
			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
			   the match again at the same point. If this fails (picked up above) we
			   advance to the next character. */
			if (start_offset == offsets[0]) {
				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);

				piece = subject + start_offset;
				if (count >= 0 && limit) {
					goto matched;
				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
					   this is not necessarily the end. We need to advance
					   the start offset, and continue. Fudge the offset values
					   to achieve this, unless we're already at the end of the string. */
					if (start_offset < subject_len) {
						size_t unit_len = calculate_unit_length(pce, piece);
						start_offset += unit_len;
					} else {
						goto not_matched;
					}
				} else {
					goto error;
				}
			}

		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
			/* No further match, or the replacement limit is exhausted:
			   finish the result with the remainder of the subject. */
not_matched:
			/* Nothing was replaced: hand back the original string when we have it. */
			if (!result && subject_str) {
				result = zend_string_copy(subject_str);
				break;
			}
			/* now we know exactly how long it is */
			alloc_len = result_len + subject_len - last_end_offset;
			if (NULL != result) {
				result = zend_string_realloc(result, alloc_len, 0);
			} else {
				result = zend_string_alloc(alloc_len, 0);
			}
			/* stick that last bit of string on our output */
			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
			result_len += subject_len - last_end_offset;
			ZSTR_VAL(result)[result_len] = '\0';
			ZSTR_LEN(result) = result_len;
			break;
		} else {
			/* Any other negative count is an execution error: report it and
			   discard the partial result. */
error:
			pcre_handle_exec_error(count);
			if (result) {
				zend_string_release_ex(result, 0);
				result = NULL;
			}
			break;
		}
		/* Look for the next match starting at start_offset. */
#ifdef HAVE_PCRE_JIT_SUPPORT
		if ((pce->preg_options & PREG_JIT)) {
			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
					PCRE2_NO_UTF_CHECK, match_data, mctx);
		} else
#endif
		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
				PCRE2_NO_UTF_CHECK, match_data, mctx);
	}
	/* Free the match data only if it was created above rather than borrowed. */
	if (match_data != mdata) {
		pcre2_match_data_free(match_data);
	}
	mdata_used = old_mdata_used;

	if (UNEXPECTED(subpat_names)) {
		free_subpats_table(subpat_names, num_subpats);
	}

	return result;
}
2030 /* }}} */
2031 
2032 /* {{{ php_pcre_replace_func */
php_pcre_replace_func(zend_string * regex,zend_string * subject_str,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)2033 static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex,
2034 							  zend_string *subject_str,
2035 							  zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2036 							  size_t limit, size_t *replace_count, zend_long flags)
2037 {
2038 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
2039 	zend_string	 		*result;			/* Function result */
2040 
2041 	/* Compile regex or get it from cache. */
2042 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2043 		return NULL;
2044 	}
2045 	pce->refcount++;
2046 	result = php_pcre_replace_func_impl(
2047 		pce, subject_str, ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), fci, fcc,
2048 		limit, replace_count, flags);
2049 	pce->refcount--;
2050 
2051 	return result;
2052 }
2053 /* }}} */
2054 
2055 /* {{{ php_pcre_replace_array */
/*
 * Apply every pattern in the `regex` array to `subject_str` in turn, feeding
 * the output of one replacement into the next.
 *
 * When `replace_ht` is given, each pattern is paired with the next defined
 * element of `replace_ht`; once those run out the empty string is used.
 * Otherwise every pattern uses `replace_str`.
 *
 * Returns the final string, or NULL as soon as one replacement fails.
 */
static zend_string *php_pcre_replace_array(HashTable *regex,
	zend_string *replace_str, HashTable *replace_ht,
	zend_string *subject_str, size_t limit, size_t *replace_count)
{
	zval *pattern_zv;
	uint32_t next_slot = 0;	/* Cursor into the element storage of replace_ht */

	ZEND_ASSERT(replace_ht != NULL || replace_str != NULL);

	/* The loop releases subject_str after each step, so take our own reference first. */
	zend_string_addref(subject_str);

	ZEND_HASH_FOREACH_VAL(regex, pattern_zv) {
		/* Make sure we're dealing with strings. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);
		zend_string *replacement = replace_str;
		zend_string *replacement_tmp = NULL;

		if (replace_ht) {
			/* Falls back to the empty string when no defined element is left. */
			replacement = ZSTR_EMPTY_ALLOC();
			while (next_slot != replace_ht->nNumUsed) {
				zval *slot = ZEND_HASH_ELEMENT(replace_ht, next_slot);

				next_slot++;
				if (Z_TYPE_P(slot) != IS_UNDEF) {
					replacement = zval_get_tmp_string(slot, &replacement_tmp);
					break;
				}
			}
		}

		/* Replace, then make the output the subject of the next pattern. */
		zend_string *replaced = php_pcre_replace(pattern, subject_str, ZSTR_VAL(subject_str),
			ZSTR_LEN(subject_str), replacement, limit, replace_count);

		zend_tmp_string_release(replacement_tmp);
		zend_tmp_string_release(pattern_tmp);
		zend_string_release_ex(subject_str, 0);
		subject_str = replaced;

		if (UNEXPECTED(replaced == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject_str;
}
2129 /* }}} */
2130 
2131 /* {{{ php_replace_in_subject */
php_replace_in_subject(zend_string * regex_str,HashTable * regex_ht,zend_string * replace_str,HashTable * replace_ht,zend_string * subject,size_t limit,size_t * replace_count)2132 static zend_always_inline zend_string *php_replace_in_subject(
2133 	zend_string *regex_str, HashTable *regex_ht,
2134 	zend_string *replace_str, HashTable *replace_ht,
2135 	zend_string *subject, size_t limit, size_t *replace_count)
2136 {
2137 	zend_string *result;
2138 
2139 	if (regex_str) {
2140 		ZEND_ASSERT(replace_str != NULL);
2141 		result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject),
2142 			replace_str, limit, replace_count);
2143 	} else {
2144 		ZEND_ASSERT(regex_ht != NULL);
2145 		result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject,
2146 			limit, replace_count);
2147 	}
2148 	return result;
2149 }
2150 /* }}} */
2151 
2152 /* {{{ php_replace_in_subject_func */
/*
 * Callback-based replacement for one subject. With a single pattern
 * (`regex_str`) the call is forwarded directly; with an array of patterns
 * (`regex_ht`) each pattern is applied in turn to the output of the previous
 * one. Returns the final string, or NULL once any step fails.
 */
static zend_string *php_replace_in_subject_func(zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject, size_t limit, size_t *replace_count, zend_long flags)
{
	zval *pattern_zv;

	if (regex_str) {
		return php_pcre_replace_func(
			regex_str, subject, fci, fcc, limit, replace_count, flags);
	}

	/* If regex is an array */
	ZEND_ASSERT(regex_ht != NULL);

	/* Each iteration releases the current subject, so hold a reference of our own. */
	zend_string_addref(subject);

	ZEND_HASH_FOREACH_VAL(regex_ht, pattern_zv) {
		/* Make sure we're dealing with strings. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);

		/* Replace, then make the output the subject of the next pattern. */
		zend_string *replaced = php_pcre_replace_func(
			pattern, subject, fci, fcc, limit, replace_count, flags);

		zend_tmp_string_release(pattern_tmp);
		zend_string_release(subject);
		subject = replaced;

		if (UNEXPECTED(replaced == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject;
}
2192 /* }}} */
2193 
2194 /* {{{ preg_replace_func_impl */
preg_replace_func_impl(zval * return_value,zend_string * regex_str,HashTable * regex_ht,zend_fcall_info * fci,zend_fcall_info_cache * fcc,zend_string * subject_str,HashTable * subject_ht,zend_long limit_val,zend_long flags)2195 static size_t preg_replace_func_impl(zval *return_value,
2196 	zend_string *regex_str, HashTable *regex_ht,
2197 	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2198 	zend_string *subject_str, HashTable *subject_ht, zend_long limit_val, zend_long flags)
2199 {
2200 	zend_string	*result;
2201 	size_t replace_count = 0;
2202 
2203 	if (subject_str) {
2204 		result = php_replace_in_subject_func(
2205 			regex_str, regex_ht, fci, fcc, subject_str, limit_val, &replace_count, flags);
2206 		if (result != NULL) {
2207 			RETVAL_STR(result);
2208 		} else {
2209 			RETVAL_NULL();
2210 		}
2211 	} else {
2212 		/* if subject is an array */
2213 		zval		*subject_entry, zv;
2214 		zend_string	*string_key;
2215 		zend_ulong	 num_key;
2216 
2217 		ZEND_ASSERT(subject_ht != NULL);
2218 
2219 		array_init_size(return_value, zend_hash_num_elements(subject_ht));
2220 
2221 		/* For each subject entry, convert it to string, then perform replacement
2222 		   and add the result to the return_value array. */
2223 		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, num_key, string_key, subject_entry) {
2224 			zend_string *tmp_subject_entry_str;
2225 			zend_string *subject_entry_str = zval_get_tmp_string(subject_entry, &tmp_subject_entry_str);
2226 
2227 			result = php_replace_in_subject_func(
2228 				regex_str, regex_ht, fci, fcc, subject_entry_str, limit_val, &replace_count, flags);
2229 			if (result != NULL) {
2230 				/* Add to return array */
2231 				ZVAL_STR(&zv, result);
2232 				if (string_key) {
2233 					zend_hash_add_new(Z_ARRVAL_P(return_value), string_key, &zv);
2234 				} else {
2235 					zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, &zv);
2236 				}
2237 			}
2238 			zend_tmp_string_release(tmp_subject_entry_str);
2239 		} ZEND_HASH_FOREACH_END();
2240 	}
2241 
2242 	return replace_count;
2243 }
2244 /* }}} */
2245 
2246 /* {{{ preg_replace_common */
/*
 * Shared implementation of preg_replace() and preg_filter().
 *
 * Parses (pattern, replacement, subject[, limit[, &count]]), performs the
 * replacement on a string subject or on every entry of an array subject, and
 * stores the result in return_value. With `is_filter` set, a subject is kept
 * only if at least one replacement was made in it.
 */
static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
{
	zend_string *regex_str, *replace_str, *subject_str;
	HashTable *regex_ht, *replace_ht, *subject_ht;
	zend_long limit = -1;
	zval *zcount = NULL;
	size_t replace_count = 0;

	/* Get function parameters and do error-checking. */
	ZEND_PARSE_PARAMETERS_START(3, 5)
		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
		Z_PARAM_ARRAY_HT_OR_STR(replace_ht, replace_str)
		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(limit)
		Z_PARAM_ZVAL(zcount)
	ZEND_PARSE_PARAMETERS_END();

	/* If replace is an array then the regex argument needs to also be an array */
	if (replace_ht && !regex_ht) {
		zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given");
		RETURN_THROWS();
	}

	if (subject_str) {
		size_t count_before = replace_count;
		zend_string *replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
			subject_str, limit, &replace_count);

		if (replaced == NULL) {
			RETVAL_NULL();
		} else if (is_filter && replace_count <= count_before) {
			/* Filter mode and nothing was replaced: drop the result. */
			zend_string_release_ex(replaced, 0);
			RETVAL_NULL();
		} else {
			RETVAL_STR(replaced);
		}
	} else {
		/* if subject is an array */
		zval *entry;
		zend_string *key;
		zend_ulong idx;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));

		/* Convert each entry to a string, replace, and store the result under the same key. */
		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, idx, key, entry) {
			size_t count_before = replace_count;
			zend_string *entry_tmp;
			zend_string *entry_str = zval_get_tmp_string(entry, &entry_tmp);
			zend_string *replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
				entry_str, limit, &replace_count);

			if (replaced != NULL) {
				if (is_filter && replace_count <= count_before) {
					/* Filter mode and nothing was replaced in this entry: skip it. */
					zend_string_release_ex(replaced, 0);
				} else {
					zval element;

					/* Add to return array */
					ZVAL_STR(&element, replaced);
					if (key) {
						zend_hash_add_new(Z_ARRVAL_P(return_value), key, &element);
					} else {
						zend_hash_index_add_new(Z_ARRVAL_P(return_value), idx, &element);
					}
				}
			}
			zend_tmp_string_release(entry_tmp);
		} ZEND_HASH_FOREACH_END();
	}

	if (zcount) {
		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
	}
}
2331 /* }}} */
2332 
2333 /* {{{ Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)
{
	/* is_filter = false: preg_replace_common() returns every successfully
	   processed subject, whether or not a replacement was made in it. */
	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
}
2338 /* }}} */
2339 
2340 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)2341 PHP_FUNCTION(preg_replace_callback)
2342 {
2343 	zval *zcount = NULL;
2344 	zend_string *regex_str;
2345 	HashTable *regex_ht;
2346 	zend_string *subject_str;
2347 	HashTable *subject_ht;
2348 	zend_long limit = -1, flags = 0;
2349 	size_t replace_count;
2350 	zend_fcall_info fci;
2351 	zend_fcall_info_cache fcc;
2352 
2353 	/* Get function parameters and do error-checking. */
2354 	ZEND_PARSE_PARAMETERS_START(3, 6)
2355 		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
2356 		Z_PARAM_FUNC(fci, fcc)
2357 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2358 		Z_PARAM_OPTIONAL
2359 		Z_PARAM_LONG(limit)
2360 		Z_PARAM_ZVAL(zcount)
2361 		Z_PARAM_LONG(flags)
2362 	ZEND_PARSE_PARAMETERS_END();
2363 
2364 	replace_count = preg_replace_func_impl(return_value, regex_str, regex_ht,
2365 		&fci, &fcc,
2366 		subject_str, subject_ht, limit, flags);
2367 	if (zcount) {
2368 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2369 	}
2370 }
2371 /* }}} */
2372 
/* {{{ Perform Perl-style regular expression replacement using an array of pattern => callback pairs. */
PHP_FUNCTION(preg_replace_callback_array)2374 PHP_FUNCTION(preg_replace_callback_array)
2375 {
2376 	zval zv, *replace, *zcount = NULL;
2377 	HashTable *pattern, *subject_ht;
2378 	zend_string *subject_str, *str_idx_regex;
2379 	zend_long limit = -1, flags = 0;
2380 	size_t replace_count = 0;
2381 	zend_fcall_info fci;
2382 	zend_fcall_info_cache fcc;
2383 
2384 	/* Get function parameters and do error-checking. */
2385 	ZEND_PARSE_PARAMETERS_START(2, 5)
2386 		Z_PARAM_ARRAY_HT(pattern)
2387 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2388 		Z_PARAM_OPTIONAL
2389 		Z_PARAM_LONG(limit)
2390 		Z_PARAM_ZVAL(zcount)
2391 		Z_PARAM_LONG(flags)
2392 	ZEND_PARSE_PARAMETERS_END();
2393 
2394 	fci.size = sizeof(fci);
2395 	fci.object = NULL;
2396 	fci.named_params = NULL;
2397 
2398 	if (subject_ht) {
2399 		GC_TRY_ADDREF(subject_ht);
2400 	} else {
2401 		GC_TRY_ADDREF(subject_str);
2402 	}
2403 
2404 	ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) {
2405 		if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) {
2406 			zend_argument_type_error(1, "must contain only valid callbacks");
2407 			goto error;
2408 		}
2409 		if (!str_idx_regex) {
2410 			zend_argument_type_error(1, "must contain only string patterns as keys");
2411 			goto error;
2412 		}
2413 
2414 		ZVAL_COPY_VALUE(&fci.function_name, replace);
2415 
2416 		replace_count += preg_replace_func_impl(&zv, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc,
2417 			subject_str, subject_ht, limit, flags);
2418 		switch (Z_TYPE(zv)) {
2419 			case IS_ARRAY:
2420 				ZEND_ASSERT(subject_ht);
2421 				zend_array_release(subject_ht);
2422 				subject_ht = Z_ARR(zv);
2423 				break;
2424 			case IS_STRING:
2425 				ZEND_ASSERT(subject_str);
2426 				zend_string_release(subject_str);
2427 				subject_str = Z_STR(zv);
2428 				break;
2429 			case IS_NULL:
2430 				RETVAL_NULL();
2431 				goto error;
2432 			EMPTY_SWITCH_DEFAULT_CASE()
2433 		}
2434 
2435 		if (EG(exception)) {
2436 			goto error;
2437 		}
2438 	} ZEND_HASH_FOREACH_END();
2439 
2440 	if (zcount) {
2441 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2442 	}
2443 
2444 	if (subject_ht) {
2445 		RETVAL_ARR(subject_ht);
2446 		// Unset the type_flags of immutable arrays to prevent the VM from performing refcounting
2447 		if (GC_FLAGS(subject_ht) & IS_ARRAY_IMMUTABLE) {
2448 			Z_TYPE_FLAGS_P(return_value) = 0;
2449 		}
2450 		return;
2451 	} else {
2452 		RETURN_STR(subject_str);
2453 	}
2454 
2455 error:
2456 	if (subject_ht) {
2457 		zend_array_release(subject_ht);
2458 	} else {
2459 		zend_string_release(subject_str);
2460 	}
2461 }
2462 /* }}} */
2463 
2464 /* {{{ Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)
{
	/* is_filter = true: preg_replace_common() keeps a subject only when at
	   least one replacement was made in it. */
	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
}
2469 /* }}} */
2470 
2471 /* {{{ Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)2472 PHP_FUNCTION(preg_split)
2473 {
2474 	zend_string			*regex;			/* Regular expression */
2475 	zend_string			*subject;		/* String to match against */
2476 	zend_long			 limit_val = -1;/* Integer value of limit */
2477 	zend_long			 flags = 0;		/* Match control flags */
2478 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2479 
2480 	/* Get function parameters and do error checking */
2481 	ZEND_PARSE_PARAMETERS_START(2, 4)
2482 		Z_PARAM_STR(regex)
2483 		Z_PARAM_STR(subject)
2484 		Z_PARAM_OPTIONAL
2485 		Z_PARAM_LONG(limit_val)
2486 		Z_PARAM_LONG(flags)
2487 	ZEND_PARSE_PARAMETERS_END();
2488 
2489 	/* Compile regex or get it from cache. */
2490 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2491 		RETURN_FALSE;
2492 	}
2493 
2494 	pce->refcount++;
2495 	php_pcre_split_impl(pce, subject, return_value, limit_val, flags);
2496 	pce->refcount--;
2497 }
2498 /* }}} */
2499 
/* {{{ php_pcre_split_impl */
/*
 * Split `subject_str` at every match of the compiled pattern `pce` and store
 * the pieces in `return_value` as an array.
 *
 * `limit_val`: -1 and 0 both mean "no limit"; any other value of 1 or less
 * skips matching altogether, so the whole subject is the only candidate piece.
 * `flags`: PREG_SPLIT_NO_EMPTY drops empty pieces, PREG_SPLIT_DELIM_CAPTURE
 * also adds the captured subpatterns of each delimiter, and
 * PREG_SPLIT_OFFSET_CAPTURE adds pieces through add_offset_pair() together
 * with their offsets.
 *
 * If the match data cannot be created, or PCRE_G(error_code) is set once
 * matching ends, the array is destroyed and `return_value` is set to false.
 */
PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
	zend_long limit_val, zend_long flags)
{
	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
	uint32_t		 options;			/* Execution options */
	int				 count;				/* Count of matched subpatterns */
	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
	PCRE2_SIZE		 last_match_offset;	/* Location of last match */
	uint32_t		 no_empty;			/* If NO_EMPTY flag is set */
	uint32_t		 delim_capture; 	/* If delimiters should be captured */
	uint32_t		 offset_capture;	/* If offsets should be captured */
	uint32_t		 num_subpats;		/* Number of captured subpatterns */
	zval			 tmp;
	pcre2_match_data *match_data;
	char *subject = ZSTR_VAL(subject_str);

	no_empty = flags & PREG_SPLIT_NO_EMPTY;
	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;

	/* Initialize return value */
	array_init(return_value);

	/* Calculate the size of the offsets array, and allocate memory for it. */
	num_subpats = pce->capture_count + 1;

	/* Start at the beginning of the string */
	start_offset = 0;
	last_match_offset = 0;
	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	/* Normalize the limit: 0 is treated like -1 (no limit); any other value
	   of 1 or less means no splitting is done at all. */
	if (limit_val == -1) {
		/* pass */
	} else if (limit_val == 0) {
		limit_val = -1;
	} else if (limit_val <= 1) {
		goto last;
	}

	/* Use the preallocated match data (mdata) when it is not in use and large
	   enough for this pattern; otherwise create a dedicated block. Note that
	   mdata_used is only read here, not set. */
	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
		match_data = mdata;
	} else {
		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
		if (!match_data) {
			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
			zval_ptr_dtor(return_value);
			RETURN_FALSE;
		}
	}

	/* The first match passes no options for PCRE2_UTF patterns and
	   PCRE2_NO_UTF_CHECK otherwise; every later match in this function
	   passes PCRE2_NO_UTF_CHECK. */
	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;

#ifdef HAVE_PCRE_JIT_SUPPORT
	/* The JIT entry point is used for the first match only when options is
	   non-zero, i.e. when the pattern was not compiled with PCRE2_UTF. */
	if ((pce->preg_options & PREG_JIT) && options) {
		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
				PCRE2_NO_UTF_CHECK, match_data, mctx);
	} else
#endif
	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
			options, match_data, mctx);

	while (1) {
		/* If something matched */
		if (count >= 0) {
			/* Check for too many substrings condition. */
			if (UNEXPECTED(count == 0)) {
				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
				count = num_subpats;
			}

			/* Also entered via goto from the empty-match retry below. */
matched:
			offsets = pcre2_get_ovector_pointer(match_data);

			/* A match whose end precedes its start is treated as an internal error. */
			if (UNEXPECTED(offsets[1] < offsets[0])) {
				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
				break;
			}

			/* Emit the piece between the previous match and this one, unless
			   it is empty and empty pieces are being dropped. */
			if (!no_empty || offsets[0] != last_match_offset) {
				if (offset_capture) {
					/* Add (match, offset) pair to the return value */
					add_offset_pair(
						return_value, subject, last_match_offset, offsets[0],
						NULL, 0);
				} else {
					/* Add the piece to the return value */
					populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]);
					zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
				}

				/* One less left to do */
				if (limit_val != -1)
					limit_val--;
			}

			/* Captured subpatterns (index 1 and up) are added as pieces of their own. */
			if (delim_capture) {
				size_t i;
				for (i = 1; i < count; i++) {
					/* If we have matched a delimiter */
					if (!no_empty || offsets[2*i] != offsets[2*i+1]) {
						if (offset_capture) {
							add_offset_pair(
								return_value, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
						} else {
							populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]);
							zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
						}
					}
				}
			}

			/* Advance to the position right after the last full match */
			start_offset = last_match_offset = offsets[1];

			/* If we have matched an empty string, mimic what Perl's /g options does.
			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
			   the match again at the same point. If this fails (picked up above) we
			   advance to the next character. */
			if (start_offset == offsets[0]) {
				/* Get next piece if no limit or limit not yet reached and something matched*/
				if (limit_val != -1 && limit_val <= 1) {
					break;
				}
				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
				if (count >= 0) {
					goto matched;
				} else if (count == PCRE2_ERROR_NOMATCH) {
					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
					   this is not necessarily the end. We need to advance
					   the start offset, and continue. Fudge the offset values
					   to achieve this, unless we're already at the end of the string. */
					if (start_offset < ZSTR_LEN(subject_str)) {
						start_offset += calculate_unit_length(pce, subject + start_offset);
					} else {
						break;
					}
				} else {
					goto error;
				}
			}

		} else if (count == PCRE2_ERROR_NOMATCH) {
			/* No more delimiters: the remainder is added after the loop. */
			break;
		} else {
			/* Any other negative count is an execution error. */
error:
			pcre_handle_exec_error(count);
			break;
		}

		/* Get next piece if no limit or limit not yet reached and something matched*/
		if (limit_val != -1 && limit_val <= 1) {
			break;
		}

		/* Look for the next delimiter starting at start_offset. */
#ifdef HAVE_PCRE_JIT_SUPPORT
		if (pce->preg_options & PREG_JIT) {
			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
					PCRE2_NO_UTF_CHECK, match_data, mctx);
		} else
#endif
		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
				PCRE2_NO_UTF_CHECK, match_data, mctx);
	}
	/* Free the match data only if it was created above rather than borrowed. */
	if (match_data != mdata) {
		pcre2_match_data_free(match_data);
	}

	/* Any error recorded during matching discards the partial array. */
	if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) {
		zval_ptr_dtor(return_value);
		RETURN_FALSE;
	}

last:
	start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */

	/* Add whatever follows the last delimiter, unless it is empty and empty
	   pieces are being dropped. */
	if (!no_empty || start_offset < ZSTR_LEN(subject_str)) {
		if (offset_capture) {
			/* Add the last (match, offset) pair to the return value */
			add_offset_pair(return_value, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
		} else {
			/* Add the last piece to the return value */
			if (start_offset == 0) {
				/* The remainder is the whole subject: reuse the original string. */
				ZVAL_STR_COPY(&tmp, subject_str);
			} else {
				populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str));
			}
			zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
		}
	}
}
2692 /* }}} */
2693 
2694 /* {{{ Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)2695 PHP_FUNCTION(preg_quote)
2696 {
2697 	zend_string *str;       		/* Input string argument */
2698 	zend_string	*delim = NULL;		/* Additional delimiter argument */
2699 	char		*in_str;			/* Input string */
2700 	char		*in_str_end;    	/* End of the input string */
2701 	zend_string	*out_str;			/* Output string with quoted characters */
2702 	size_t       extra_len;         /* Number of additional characters */
2703 	char 		*p,					/* Iterator for input string */
2704 				*q,					/* Iterator for output string */
2705 				 delim_char = '\0',	/* Delimiter character to be quoted */
2706 				 c;					/* Current character */
2707 
2708 	/* Get the arguments and check for errors */
2709 	ZEND_PARSE_PARAMETERS_START(1, 2)
2710 		Z_PARAM_STR(str)
2711 		Z_PARAM_OPTIONAL
2712 		Z_PARAM_STR_OR_NULL(delim)
2713 	ZEND_PARSE_PARAMETERS_END();
2714 
2715 	/* Nothing to do if we got an empty string */
2716 	if (ZSTR_LEN(str) == 0) {
2717 		RETURN_EMPTY_STRING();
2718 	}
2719 
2720 	in_str = ZSTR_VAL(str);
2721 	in_str_end = in_str + ZSTR_LEN(str);
2722 
2723 	if (delim) {
2724 		delim_char = ZSTR_VAL(delim)[0];
2725 	}
2726 
2727 	/* Go through the string and quote necessary characters */
2728 	extra_len = 0;
2729 	p = in_str;
2730 	do {
2731 		c = *p;
2732 		switch(c) {
2733 			case '.':
2734 			case '\\':
2735 			case '+':
2736 			case '*':
2737 			case '?':
2738 			case '[':
2739 			case '^':
2740 			case ']':
2741 			case '$':
2742 			case '(':
2743 			case ')':
2744 			case '{':
2745 			case '}':
2746 			case '=':
2747 			case '!':
2748 			case '>':
2749 			case '<':
2750 			case '|':
2751 			case ':':
2752 			case '-':
2753 			case '#':
2754 				extra_len++;
2755 				break;
2756 
2757 			case '\0':
2758 				extra_len+=3;
2759 				break;
2760 
2761 			default:
2762 				if (c == delim_char) {
2763 					extra_len++;
2764 				}
2765 				break;
2766 		}
2767 		p++;
2768 	} while (p != in_str_end);
2769 
2770 	if (extra_len == 0) {
2771 		RETURN_STR_COPY(str);
2772 	}
2773 
2774 	/* Allocate enough memory so that even if each character
2775 	   is quoted, we won't run out of room */
2776 	out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0);
2777 	q = ZSTR_VAL(out_str);
2778 	p = in_str;
2779 
2780 	do {
2781 		c = *p;
2782 		switch(c) {
2783 			case '.':
2784 			case '\\':
2785 			case '+':
2786 			case '*':
2787 			case '?':
2788 			case '[':
2789 			case '^':
2790 			case ']':
2791 			case '$':
2792 			case '(':
2793 			case ')':
2794 			case '{':
2795 			case '}':
2796 			case '=':
2797 			case '!':
2798 			case '>':
2799 			case '<':
2800 			case '|':
2801 			case ':':
2802 			case '-':
2803 			case '#':
2804 				*q++ = '\\';
2805 				*q++ = c;
2806 				break;
2807 
2808 			case '\0':
2809 				*q++ = '\\';
2810 				*q++ = '0';
2811 				*q++ = '0';
2812 				*q++ = '0';
2813 				break;
2814 
2815 			default:
2816 				if (c == delim_char) {
2817 					*q++ = '\\';
2818 				}
2819 				*q++ = c;
2820 				break;
2821 		}
2822 		p++;
2823 	} while (p != in_str_end);
2824 	*q = '\0';
2825 
2826 	RETURN_NEW_STR(out_str);
2827 }
2828 /* }}} */
2829 
2830 /* {{{ Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)2831 PHP_FUNCTION(preg_grep)
2832 {
2833 	zend_string			*regex;			/* Regular expression */
2834 	zval				*input;			/* Input array */
2835 	zend_long			 flags = 0;		/* Match control flags */
2836 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2837 
2838 	/* Get arguments and do error checking */
2839 	ZEND_PARSE_PARAMETERS_START(2, 3)
2840 		Z_PARAM_STR(regex)
2841 		Z_PARAM_ARRAY(input)
2842 		Z_PARAM_OPTIONAL
2843 		Z_PARAM_LONG(flags)
2844 	ZEND_PARSE_PARAMETERS_END();
2845 
2846 	/* Compile regex or get it from cache. */
2847 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2848 		RETURN_FALSE;
2849 	}
2850 
2851 	pce->refcount++;
2852 	php_pcre_grep_impl(pce, input, return_value, flags);
2853 	pce->refcount--;
2854 }
2855 /* }}} */
2856 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,zend_long flags)2857 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */
2858 {
2859 	zval            *entry;             /* An entry in the input array */
2860 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2861 	int				 count;				/* Count of matched subpatterns */
2862 	uint32_t		 options;			/* Execution options */
2863 	zend_string		*string_key;
2864 	zend_ulong		 num_key;
2865 	bool		 invert;			/* Whether to return non-matching
2866 										   entries */
2867 	pcre2_match_data *match_data;
2868 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
2869 
2870 	/* Calculate the size of the offsets array, and allocate memory for it. */
2871 	num_subpats = pce->capture_count + 1;
2872 
2873 	/* Initialize return array */
2874 	array_init(return_value);
2875 
2876 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2877 
2878 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2879 		match_data = mdata;
2880 	} else {
2881 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2882 		if (!match_data) {
2883 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2884 			return;
2885 		}
2886 	}
2887 
2888 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2889 
2890 	/* Go through the input array */
2891 	ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) {
2892 		zend_string *tmp_subject_str;
2893 		zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str);
2894 
2895 		/* Perform the match */
2896 #ifdef HAVE_PCRE_JIT_SUPPORT
2897 		if ((pce->preg_options & PREG_JIT) && options) {
2898 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2899 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2900 		} else
2901 #endif
2902 		count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2903 				options, match_data, mctx);
2904 
2905 		/* If the entry fits our requirements */
2906 		if (count >= 0) {
2907 			/* Check for too many substrings condition. */
2908 			if (UNEXPECTED(count == 0)) {
2909 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
2910 			}
2911 			if (!invert) {
2912 				Z_TRY_ADDREF_P(entry);
2913 
2914 				/* Add to return array */
2915 				if (string_key) {
2916 					zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
2917 				} else {
2918 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
2919 				}
2920 			}
2921 		} else if (count == PCRE2_ERROR_NOMATCH) {
2922 			if (invert) {
2923 				Z_TRY_ADDREF_P(entry);
2924 
2925 				/* Add to return array */
2926 				if (string_key) {
2927 					zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
2928 				} else {
2929 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
2930 				}
2931 			}
2932 		} else {
2933 			pcre_handle_exec_error(count);
2934 			zend_tmp_string_release(tmp_subject_str);
2935 			break;
2936 		}
2937 
2938 		zend_tmp_string_release(tmp_subject_str);
2939 	} ZEND_HASH_FOREACH_END();
2940 	if (match_data != mdata) {
2941 		pcre2_match_data_free(match_data);
2942 	}
2943 }
2944 /* }}} */
2945 
2946 /* {{{ Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)2947 PHP_FUNCTION(preg_last_error)
2948 {
2949 	ZEND_PARSE_PARAMETERS_NONE();
2950 
2951 	RETURN_LONG(PCRE_G(error_code));
2952 }
2953 /* }}} */
2954 
2955 /* {{{ Returns the error message of the last regexp execution. */
PHP_FUNCTION(preg_last_error_msg)2956 PHP_FUNCTION(preg_last_error_msg)
2957 {
2958 	ZEND_PARSE_PARAMETERS_NONE();
2959 
2960 	RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code)));
2961 }
2962 /* }}} */
2963 
2964 /* {{{ module definition structures */
2965 
2966 zend_module_entry pcre_module_entry = {
2967 	STANDARD_MODULE_HEADER,
2968 	"pcre",
2969 	ext_functions,
2970 	PHP_MINIT(pcre),
2971 	PHP_MSHUTDOWN(pcre),
2972 	PHP_RINIT(pcre),
2973 	PHP_RSHUTDOWN(pcre),
2974 	PHP_MINFO(pcre),
2975 	PHP_PCRE_VERSION,
2976 	PHP_MODULE_GLOBALS(pcre),
2977 	PHP_GINIT(pcre),
2978 	PHP_GSHUTDOWN(pcre),
2979 	NULL,
2980 	STANDARD_MODULE_PROPERTIES_EX
2981 };
2982 
2983 #ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)2984 ZEND_GET_MODULE(pcre)
2985 #endif
2986 
2987 /* }}} */
2988 
2989 PHPAPI pcre2_match_context *php_pcre_mctx(void)
2990 {/*{{{*/
2991 	return mctx;
2992 }/*}}}*/
2993 
php_pcre_gctx(void)2994 PHPAPI pcre2_general_context *php_pcre_gctx(void)
2995 {/*{{{*/
2996 	return gctx;
2997 }/*}}}*/
2998 
php_pcre_cctx(void)2999 PHPAPI pcre2_compile_context *php_pcre_cctx(void)
3000 {/*{{{*/
3001 	return cctx;
3002 }/*}}}*/
3003 
php_pcre_pce_incref(pcre_cache_entry * pce)3004 PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce)
3005 {/*{{{*/
3006 	assert(NULL != pce);
3007 	pce->refcount++;
3008 }/*}}}*/
3009 
php_pcre_pce_decref(pcre_cache_entry * pce)3010 PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce)
3011 {/*{{{*/
3012 	assert(NULL != pce);
3013 	assert(0 != pce->refcount);
3014 	pce->refcount--;
3015 }/*}}}*/
3016 
php_pcre_pce_re(pcre_cache_entry * pce)3017 PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce)
3018 {/*{{{*/
3019 	assert(NULL != pce);
3020 	return pce->re;
3021 }/*}}}*/
3022