xref: /PHP-8.2/ext/pcre/php_pcre.c (revision 631bc816)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Andrei Zmievski <andrei@php.net>                             |
14    +----------------------------------------------------------------------+
15  */
16 
17 #include "php.h"
18 #include "php_ini.h"
19 #include "php_globals.h"
20 #include "php_pcre.h"
21 #include "ext/standard/info.h"
22 #include "ext/standard/basic_functions.h"
23 #include "zend_smart_str.h"
24 #include "SAPI.h"
25 
26 #include "ext/standard/php_string.h"
27 
28 #define PREG_PATTERN_ORDER			1
29 #define PREG_SET_ORDER				2
30 #define PREG_OFFSET_CAPTURE			(1<<8)
31 #define PREG_UNMATCHED_AS_NULL		(1<<9)
32 
33 #define	PREG_SPLIT_NO_EMPTY			(1<<0)
34 #define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
35 #define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)
36 
37 #define PREG_GREP_INVERT			(1<<0)
38 
39 #define PREG_JIT                    (1<<3)
40 
41 #define PCRE_CACHE_SIZE 4096
42 
43 #ifdef HAVE_PCRE_JIT_SUPPORT
44 #define PHP_PCRE_JIT_SUPPORT 1
45 #else
46 #define PHP_PCRE_JIT_SUPPORT 0
47 #endif
48 
49 char *php_pcre_version;
50 
51 #include "php_pcre_arginfo.h"
52 
/* One compiled pattern as stored in PCRE_G(pcre_cache). */
53 struct _pcre_cache_entry {
54 	pcre2_code *re; /* compiled pattern; the cache destructors release it with pcre2_code_free() */
55 	uint32_t preg_options; /* PHP-side flags; PREG_JIT is set when JIT compilation produced code */
56 	uint32_t capture_count; /* PCRE2_INFO_CAPTURECOUNT of re */
57 	uint32_t name_count; /* PCRE2_INFO_NAMECOUNT of re */
58 	uint32_t compile_options; /* PCRE2 option bits passed to pcre2_compile() */
59 	uint32_t refcount; /* initialised to 0; pcre_clean_cache() only evicts entries whose count is 0 */
60 };
61 
62 PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre)
63 
64 #ifdef HAVE_PCRE_JIT_SUPPORT
65 #define PCRE_JIT_STACK_MIN_SIZE (32 * 1024)
66 #define PCRE_JIT_STACK_MAX_SIZE (192 * 1024)
67 ZEND_TLS pcre2_jit_stack *jit_stack = NULL;
68 #endif
69 /* General context using (infallible) system allocator. */
70 ZEND_TLS pcre2_general_context *gctx = NULL;
71 /* These two are global per thread for now. Though it is possible to use these
72  	per pattern. Either one can copy it and use in pce, or one does no global
73 	contexts at all, but creates for every pce. */
74 ZEND_TLS pcre2_compile_context *cctx = NULL;
75 ZEND_TLS pcre2_match_context   *mctx = NULL;
76 ZEND_TLS pcre2_match_data      *mdata = NULL;
77 ZEND_TLS bool              mdata_used = 0;
78 ZEND_TLS uint8_t pcre2_init_ok = 0;
79 #if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT)
80 static MUTEX_T pcre_mt = NULL;
81 #define php_pcre_mutex_alloc() \
82 	if (tsrm_is_main_thread() && !pcre_mt) pcre_mt = tsrm_mutex_alloc();
83 #define php_pcre_mutex_free() \
84 	if (tsrm_is_main_thread() && pcre_mt) { tsrm_mutex_free(pcre_mt); pcre_mt = NULL; }
85 #define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt);
86 #define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt);
87 #else
88 #define php_pcre_mutex_alloc()
89 #define php_pcre_mutex_free()
90 #define php_pcre_mutex_lock()
91 #define php_pcre_mutex_unlock()
92 #endif
93 
94 ZEND_TLS HashTable char_tables;
95 
static void php_pcre_free_char_table(zval *data)
{/*{{{*/
	/* Destructor of the char_tables hash: the stored pointer is a
	 * persistently allocated block, so release it as such. */
	pefree(Z_PTR_P(data), 1);
}/*}}}*/
101 
pcre_handle_exec_error(int pcre_code)102 static void pcre_handle_exec_error(int pcre_code) /* {{{ */
103 {
104 	int preg_code = 0;
105 
106 	switch (pcre_code) {
107 		case PCRE2_ERROR_MATCHLIMIT:
108 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
109 			break;
110 
111 		case PCRE2_ERROR_RECURSIONLIMIT:
112 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
113 			break;
114 
115 		case PCRE2_ERROR_BADUTFOFFSET:
116 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
117 			break;
118 
119 #ifdef HAVE_PCRE_JIT_SUPPORT
120 		case PCRE2_ERROR_JIT_STACKLIMIT:
121 			preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR;
122 			break;
123 #endif
124 
125 		default:
126 			if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) {
127 				preg_code = PHP_PCRE_BAD_UTF8_ERROR;
128 			} else  {
129 				preg_code = PHP_PCRE_INTERNAL_ERROR;
130 			}
131 			break;
132 	}
133 
134 	PCRE_G(error_code) = preg_code;
135 }
136 /* }}} */
137 
static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */
{
	/* Human readable text for a PHP_PCRE_* error code; unrecognised codes
	 * yield "Unknown error". */
	const char *msg;

	switch (error_code) {
		case PHP_PCRE_NO_ERROR:
			msg = "No error";
			break;
		case PHP_PCRE_INTERNAL_ERROR:
			msg = "Internal error";
			break;
		case PHP_PCRE_BAD_UTF8_ERROR:
			msg = "Malformed UTF-8 characters, possibly incorrectly encoded";
			break;
		case PHP_PCRE_BAD_UTF8_OFFSET_ERROR:
			msg = "The offset did not correspond to the beginning of a valid UTF-8 code point";
			break;
		case PHP_PCRE_BACKTRACK_LIMIT_ERROR:
			msg = "Backtrack limit exhausted";
			break;
		case PHP_PCRE_RECURSION_LIMIT_ERROR:
			msg = "Recursion limit exhausted";
			break;

#ifdef HAVE_PCRE_JIT_SUPPORT
		case PHP_PCRE_JIT_STACKLIMIT_ERROR:
			msg = "JIT stack limit exhausted";
			break;
#endif

		default:
			msg = "Unknown error";
			break;
	}

	return msg;
}
163 /* }}} */
164 
static void php_free_pcre_cache(zval *data) /* {{{ */
{
	/* Destructor for the persistent pattern cache: entries were allocated
	 * with the system allocator, hence free(). */
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		free(entry);
	}
}
172 /* }}} */
173 
static void php_efree_pcre_cache(zval *data) /* {{{ */
{
	/* Destructor for the per-request pattern cache: entries live in
	 * request memory, hence efree(). */
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		efree(entry);
	}
}
181 /* }}} */
182 
static void *php_pcre_malloc(PCRE2_SIZE size, void *data)
{
	/* PCRE2 allocation callback backed by the persistent allocator;
	 * the user data pointer is not used. */
	void *block = pemalloc(size, 1);

	return block;
}
187 
static void php_pcre_free(void *block, void *data)
{
	/* PCRE2 release callback matching php_pcre_malloc(); the user data
	 * pointer is not used. */
	pefree(block, 1);
}
192 
static void *php_pcre_emalloc(PCRE2_SIZE size, void *data)
{
	/* PCRE2 allocation callback backed by emalloc(); the user data
	 * pointer is not used. */
	void *block = emalloc(size);

	return block;
}
197 
static void php_pcre_efree(void *block, void *data)
{
	/* PCRE2 release callback matching php_pcre_emalloc(); the user data
	 * pointer is not used. */
	efree(block);
}
202 
203 #ifdef PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
204 	/* pcre 10.38 needs PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, disabled by default */
205 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
206 #else
207 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS 0
208 #endif
209 
210 #define PHP_PCRE_PREALLOC_MDATA_SIZE 32
211 
php_pcre_init_pcre2(uint8_t jit)212 static void php_pcre_init_pcre2(uint8_t jit)
213 {/*{{{*/
214 	if (!gctx) {
215 		gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL);
216 		if (!gctx) {
217 			pcre2_init_ok = 0;
218 			return;
219 		}
220 	}
221 
222 	if (!cctx) {
223 		cctx = pcre2_compile_context_create(gctx);
224 		if (!cctx) {
225 			pcre2_init_ok = 0;
226 			return;
227 		}
228 	}
229 
230 	pcre2_set_compile_extra_options(cctx, PHP_PCRE_DEFAULT_EXTRA_COPTIONS);
231 
232 	if (!mctx) {
233 		mctx = pcre2_match_context_create(gctx);
234 		if (!mctx) {
235 			pcre2_init_ok = 0;
236 			return;
237 		}
238 	}
239 
240 #ifdef HAVE_PCRE_JIT_SUPPORT
241 	if (jit && !jit_stack) {
242 		jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx);
243 		if (!jit_stack) {
244 			pcre2_init_ok = 0;
245 			return;
246 		}
247 	}
248 #endif
249 
250 	if (!mdata) {
251 		mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx);
252 		if (!mdata) {
253 			pcre2_init_ok = 0;
254 			return;
255 		}
256 	}
257 
258 	pcre2_init_ok = 1;
259 }/*}}}*/
260 
php_pcre_shutdown_pcre2(void)261 static void php_pcre_shutdown_pcre2(void)
262 {/*{{{*/
263 	if (gctx) {
264 		pcre2_general_context_free(gctx);
265 		gctx = NULL;
266 	}
267 
268 	if (cctx) {
269 		pcre2_compile_context_free(cctx);
270 		cctx = NULL;
271 	}
272 
273 	if (mctx) {
274 		pcre2_match_context_free(mctx);
275 		mctx = NULL;
276 	}
277 
278 #ifdef HAVE_PCRE_JIT_SUPPORT
279 	/* Stack may only be destroyed when no cached patterns
280 	 	possibly associated with it do exist. */
281 	if (jit_stack) {
282 		pcre2_jit_stack_free(jit_stack);
283 		jit_stack = NULL;
284 	}
285 #endif
286 
287 	if (mdata) {
288 		pcre2_match_data_free(mdata);
289 		mdata = NULL;
290 	}
291 
292 	pcre2_init_ok = 0;
293 }/*}}}*/
294 
PHP_GINIT_FUNCTION(pcre)295 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
296 {
297 	php_pcre_mutex_alloc();
298 
299 	/* If we're on the CLI SAPI, there will only be one request, so we don't need the
300 	 * cache to survive after RSHUTDOWN. */
301 	pcre_globals->per_request_cache = strcmp(sapi_module.name, "cli") == 0;
302 	if (!pcre_globals->per_request_cache) {
303 		zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
304 	}
305 
306 	pcre_globals->backtrack_limit = 0;
307 	pcre_globals->recursion_limit = 0;
308 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
309 	ZVAL_UNDEF(&pcre_globals->unmatched_null_pair);
310 	ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair);
311 #ifdef HAVE_PCRE_JIT_SUPPORT
312 	pcre_globals->jit = 1;
313 #endif
314 
315 	php_pcre_init_pcre2(1);
316 	zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1);
317 }
318 /* }}} */
319 
PHP_GSHUTDOWN_FUNCTION(pcre)320 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
321 {
322 	if (!pcre_globals->per_request_cache) {
323 		zend_hash_destroy(&pcre_globals->pcre_cache);
324 	}
325 
326 	php_pcre_shutdown_pcre2();
327 	zend_hash_destroy(&char_tables);
328 	php_pcre_mutex_free();
329 }
330 /* }}} */
331 
PHP_INI_MH(OnUpdateBacktrackLimit)332 static PHP_INI_MH(OnUpdateBacktrackLimit)
333 {/*{{{*/
334 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
335 	if (mctx) {
336 		pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit));
337 	}
338 
339 	return SUCCESS;
340 }/*}}}*/
341 
PHP_INI_MH(OnUpdateRecursionLimit)342 static PHP_INI_MH(OnUpdateRecursionLimit)
343 {/*{{{*/
344 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
345 	if (mctx) {
346 		pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit));
347 	}
348 
349 	return SUCCESS;
350 }/*}}}*/
351 
352 #ifdef HAVE_PCRE_JIT_SUPPORT
PHP_INI_MH(OnUpdateJit)353 static PHP_INI_MH(OnUpdateJit)
354 {/*{{{*/
355 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
356 	if (PCRE_G(jit) && jit_stack) {
357 		pcre2_jit_stack_assign(mctx, NULL, jit_stack);
358 	} else {
359 		pcre2_jit_stack_assign(mctx, NULL, NULL);
360 	}
361 
362 	return SUCCESS;
363 }/*}}}*/
364 #endif
365 
/* INI directives of the extension. Each entry names the zend_pcre_globals
 * member it is bound to and the OnUpdate* handler that is invoked for it.
 * pcre.jit is only registered when HAVE_PCRE_JIT_SUPPORT is defined. */
366 PHP_INI_BEGIN()
367 	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals)
368 	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000",  PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals)
369 #ifdef HAVE_PCRE_JIT_SUPPORT
370 	STD_PHP_INI_BOOLEAN("pcre.jit",           "1",       PHP_INI_ALL, OnUpdateJit,            jit,             zend_pcre_globals, pcre_globals)
371 #endif
PHP_INI_END()372 PHP_INI_END()
373 
/* Fetch a string-valued PCRE2 build-time setting.
 *
 * what: one of the PCRE2_CONFIG_* selectors that yields a string.
 *
 * Returns a malloc()ed, NUL-terminated copy that the caller must free(), or
 * NULL if the selector is rejected by pcre2_config(), the value is empty, or
 * memory could not be allocated. */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	char *ret;
	/* With a NULL destination pcre2_config() only reports the required
	 * size; a negative result is an error code, not a length. */
	int len = pcre2_config(what, NULL);

	if (len <= 0) {
		return NULL;
	}

	ret = (char *) malloc((size_t) len + 1);
	if (ret == NULL) {
		return NULL;
	}

	len = pcre2_config(what, ret);
	if (len <= 0) {
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
387 
388 /* {{{ PHP_MINFO_FUNCTION(pcre) */
PHP_MINFO_FUNCTION(pcre)389 static PHP_MINFO_FUNCTION(pcre)
390 {
391 #ifdef HAVE_PCRE_JIT_SUPPORT
392 	uint32_t flag = 0;
393 	char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET);
394 #endif
395 	char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
396 	char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION);
397 
398 	php_info_print_table_start();
399 	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
400 	php_info_print_table_row(2, "PCRE Library Version", version);
401 	free(version);
402 	php_info_print_table_row(2, "PCRE Unicode Version", unicode);
403 	free(unicode);
404 
405 #ifdef HAVE_PCRE_JIT_SUPPORT
406 	if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) {
407 		php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled");
408 	} else {
409 		php_info_print_table_row(2, "PCRE JIT Support", "unknown" );
410 	}
411 	if (jit_target) {
412 		php_info_print_table_row(2, "PCRE JIT Target", jit_target);
413 	}
414 	free(jit_target);
415 #else
416 	php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" );
417 #endif
418 
419 #ifdef HAVE_PCRE_VALGRIND_SUPPORT
420 	php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" );
421 #endif
422 
423 	php_info_print_table_end();
424 
425 	DISPLAY_INI_ENTRIES();
426 }
427 /* }}} */
428 
429 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)430 static PHP_MINIT_FUNCTION(pcre)
431 {
432 #ifdef HAVE_PCRE_JIT_SUPPORT
433 	if (UNEXPECTED(!pcre2_init_ok)) {
434 		/* Retry. */
435 		php_pcre_init_pcre2(PCRE_G(jit));
436 		if (!pcre2_init_ok) {
437 			return FAILURE;
438 		}
439 	}
440 #endif
441 
442 	REGISTER_INI_ENTRIES();
443 
444 	php_pcre_version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
445 
446 	register_php_pcre_symbols(module_number);
447 
448 	return SUCCESS;
449 }
450 /* }}} */
451 
452 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)453 static PHP_MSHUTDOWN_FUNCTION(pcre)
454 {
455 	UNREGISTER_INI_ENTRIES();
456 
457 	free(php_pcre_version);
458 
459 	return SUCCESS;
460 }
461 /* }}} */
462 
463 /* {{{ PHP_RINIT_FUNCTION(pcre) */
PHP_RINIT_FUNCTION(pcre)464 static PHP_RINIT_FUNCTION(pcre)
465 {
466 #ifdef HAVE_PCRE_JIT_SUPPORT
467 	if (UNEXPECTED(!pcre2_init_ok)) {
468 		/* Retry. */
469 		php_pcre_mutex_lock();
470 		php_pcre_init_pcre2(PCRE_G(jit));
471 		if (!pcre2_init_ok) {
472 			php_pcre_mutex_unlock();
473 			return FAILURE;
474 		}
475 		php_pcre_mutex_unlock();
476 	}
477 
478 	mdata_used = 0;
479 #endif
480 
481 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
482 	PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL);
483 	if (!PCRE_G(gctx_zmm)) {
484 		return FAILURE;
485 	}
486 
487 	if (PCRE_G(per_request_cache)) {
488 		zend_hash_init(&PCRE_G(pcre_cache), 0, NULL, php_efree_pcre_cache, 0);
489 	}
490 
491 	return SUCCESS;
492 }
493 /* }}} */
494 
PHP_RSHUTDOWN_FUNCTION(pcre)495 static PHP_RSHUTDOWN_FUNCTION(pcre)
496 {
497 	pcre2_general_context_free(PCRE_G(gctx_zmm));
498 	PCRE_G(gctx_zmm) = NULL;
499 
500 	if (PCRE_G(per_request_cache)) {
501 		zend_hash_destroy(&PCRE_G(pcre_cache));
502 	}
503 
504 	zval_ptr_dtor(&PCRE_G(unmatched_null_pair));
505 	zval_ptr_dtor(&PCRE_G(unmatched_empty_pair));
506 	ZVAL_UNDEF(&PCRE_G(unmatched_null_pair));
507 	ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair));
508 	return SUCCESS;
509 }
510 
511 /* {{{ static pcre_clean_cache */
static int pcre_clean_cache(zval *data, void *arg)
{
	/* Hash apply callback: evicts unreferenced entries until the budget
	 * pointed to by arg is used up. */
	int *budget = (int *)arg;
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (*budget <= 0 || entry->refcount != 0) {
		return ZEND_HASH_APPLY_KEEP;
	}

	--*budget;
	return ZEND_HASH_APPLY_REMOVE;
}
524 /* }}} */
525 
static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) {
	/* Release every name that was set, then the table itself. */
	zend_string **slot = subpat_names;
	zend_string **const end = subpat_names + num_subpats;

	for (; slot != end; slot++) {
		if (*slot != NULL) {
			zend_string_release_ex(*slot, false);
		}
	}
	efree(subpat_names);
}
535 
536 /* {{{ static make_subpats_table */
make_subpats_table(uint32_t num_subpats,pcre_cache_entry * pce)537 static zend_string **make_subpats_table(uint32_t num_subpats, pcre_cache_entry *pce)
538 {
539 	uint32_t name_cnt = pce->name_count, name_size, ni = 0;
540 	char *name_table;
541 	zend_string **subpat_names;
542 	int rc1, rc2;
543 
544 	rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table);
545 	rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size);
546 	if (rc1 < 0 || rc2 < 0) {
547 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2);
548 		return NULL;
549 	}
550 
551 	subpat_names = ecalloc(num_subpats, sizeof(zend_string *));
552 	while (ni++ < name_cnt) {
553 		unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
554 		const char *name = name_table + 2;
555 		subpat_names[name_idx] = zend_string_init(name, strlen(name), 0);
556 		if (is_numeric_string(ZSTR_VAL(subpat_names[name_idx]), ZSTR_LEN(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
557 			php_error_docref(NULL, E_WARNING, "Numeric named subpatterns are not allowed");
558 			free_subpats_table(subpat_names, num_subpats);
559 			return NULL;
560 		}
561 		name_table += name_size;
562 	}
563 	return subpat_names;
564 }
565 /* }}} */
566 
567 /* {{{ static calculate_unit_length */
568 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */
calculate_unit_length(pcre_cache_entry * pce,const char * start)569 static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start)
570 {
571 	size_t unit_len;
572 
573 	if (pce->compile_options & PCRE2_UTF) {
574 		const char *end = start;
575 
576 		/* skip continuation bytes */
577 		while ((*++end & 0xC0) == 0x80);
578 		unit_len = end - start;
579 	} else {
580 		unit_len = 1;
581 	}
582 	return unit_len;
583 }
584 /* }}} */
585 
586 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache_ex(zend_string * regex,bool locale_aware)587 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, bool locale_aware)
588 {
589 	pcre2_code			*re = NULL;
590 #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !HAVE_BUNDLED_PCRE
591 	uint32_t			 coptions = PCRE2_NO_START_OPTIMIZE;
592 #else
593 	uint32_t			 coptions = 0;
594 #endif
595 	PCRE2_UCHAR	         error[128];
596 	PCRE2_SIZE           erroffset;
597 	int                  errnumber;
598 	char				 delimiter;
599 	char				 start_delimiter;
600 	char				 end_delimiter;
601 	char				*p, *pp;
602 	char				*pattern;
603 	size_t				 pattern_len;
604 	uint32_t			 poptions = 0;
605 	const uint8_t       *tables = NULL;
606 	zval                *zv;
607 	pcre_cache_entry	 new_entry;
608 	int					 rc;
609 	zend_string 		*key;
610 	pcre_cache_entry	*ret;
611 
612 	if (locale_aware && BG(ctype_string)) {
613 		key = zend_string_concat2(
614 			ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)),
615 			ZSTR_VAL(regex), ZSTR_LEN(regex));
616 	} else {
617 		key = regex;
618 	}
619 
620 	/* Try to lookup the cached regex entry, and if successful, just pass
621 	   back the compiled pattern, otherwise go on and compile it. */
622 	zv = zend_hash_find(&PCRE_G(pcre_cache), key);
623 	if (zv) {
624 		if (key != regex) {
625 			zend_string_release_ex(key, 0);
626 		}
627 		return (pcre_cache_entry*)Z_PTR_P(zv);
628 	}
629 
630 	p = ZSTR_VAL(regex);
631 	const char* end_p = ZSTR_VAL(regex) + ZSTR_LEN(regex);
632 
633 	/* Parse through the leading whitespace, and display a warning if we
634 	   get to the end without encountering a delimiter. */
635 	while (isspace((int)*(unsigned char *)p)) p++;
636 	if (p >= end_p) {
637 		if (key != regex) {
638 			zend_string_release_ex(key, 0);
639 		}
640 		php_error_docref(NULL, E_WARNING, "Empty regular expression");
641 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
642 		return NULL;
643 	}
644 
645 	/* Get the delimiter and display a warning if it is alphanumeric
646 	   or a backslash. */
647 	delimiter = *p++;
648 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\' || delimiter == '\0') {
649 		if (key != regex) {
650 			zend_string_release_ex(key, 0);
651 		}
652 		php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric, backslash, or NUL byte");
653 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
654 		return NULL;
655 	}
656 
657 	start_delimiter = delimiter;
658 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
659 		delimiter = pp[5];
660 	end_delimiter = delimiter;
661 
662 	pp = p;
663 
664 	if (start_delimiter == end_delimiter) {
665 		/* We need to iterate through the pattern, searching for the ending delimiter,
666 		   but skipping the backslashed delimiters.  If the ending delimiter is not
667 		   found, display a warning. */
668 		while (pp < end_p) {
669 			if (*pp == '\\' && pp + 1 < end_p) pp++;
670 			else if (*pp == delimiter)
671 				break;
672 			pp++;
673 		}
674 	} else {
675 		/* We iterate through the pattern, searching for the matching ending
676 		 * delimiter. For each matching starting delimiter, we increment nesting
677 		 * level, and decrement it for each matching ending delimiter. If we
678 		 * reach the end of the pattern without matching, display a warning.
679 		 */
680 		int brackets = 1; 	/* brackets nesting level */
681 		while (pp < end_p) {
682 			if (*pp == '\\' && pp + 1 < end_p) pp++;
683 			else if (*pp == end_delimiter && --brackets <= 0)
684 				break;
685 			else if (*pp == start_delimiter)
686 				brackets++;
687 			pp++;
688 		}
689 	}
690 
691 	if (pp >= end_p) {
692 		if (key != regex) {
693 			zend_string_release_ex(key, 0);
694 		}
695 		if (start_delimiter == end_delimiter) {
696 			php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter);
697 		} else {
698 			php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
699 		}
700 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
701 		return NULL;
702 	}
703 
704 	/* Make a copy of the actual pattern. */
705 	pattern_len = pp - p;
706 	pattern = estrndup(p, pattern_len);
707 
708 	/* Move on to the options */
709 	pp++;
710 
711 	/* Parse through the options, setting appropriate flags.  Display
712 	   a warning if we encounter an unknown modifier. */
713 	while (pp < end_p) {
714 		switch (*pp++) {
715 			/* Perl compatible options */
716 			case 'i':	coptions |= PCRE2_CASELESS;		break;
717 			case 'm':	coptions |= PCRE2_MULTILINE;		break;
718 			case 'n':	coptions |= PCRE2_NO_AUTO_CAPTURE;	break;
719 			case 's':	coptions |= PCRE2_DOTALL;		break;
720 			case 'x':	coptions |= PCRE2_EXTENDED;		break;
721 
722 			/* PCRE specific options */
723 			case 'A':	coptions |= PCRE2_ANCHORED;		break;
724 			case 'D':	coptions |= PCRE2_DOLLAR_ENDONLY;break;
725 			case 'S':	/* Pass. */					break;
726 			case 'X':	/* Pass. */					break;
727 			case 'U':	coptions |= PCRE2_UNGREEDY;		break;
728 			case 'u':	coptions |= PCRE2_UTF;
729 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
730 	   characters, even in UTF-8 mode. However, this can be changed by setting
731 	   the PCRE2_UCP option. */
732 #ifdef PCRE2_UCP
733 						coptions |= PCRE2_UCP;
734 #endif
735 				break;
736 			case 'J':	coptions |= PCRE2_DUPNAMES;		break;
737 
738 			case ' ':
739 			case '\n':
740 			case '\r':
741 				break;
742 
743 			case 'e': /* legacy eval */
744 			default:
745 				if (pp[-1]) {
746 					php_error_docref(NULL, E_WARNING, "Unknown modifier '%c'", pp[-1]);
747 				} else {
748 					php_error_docref(NULL, E_WARNING, "NUL byte is not a valid modifier");
749 				}
750 				pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
751 				efree(pattern);
752 				if (key != regex) {
753 					zend_string_release_ex(key, 0);
754 				}
755 				return NULL;
756 		}
757 	}
758 
759 	if (key != regex) {
760 		tables = (uint8_t *)zend_hash_find_ptr(&char_tables, BG(ctype_string));
761 		if (!tables) {
762 			zend_string *_k;
763 			tables = pcre2_maketables(gctx);
764 			if (UNEXPECTED(!tables)) {
765 				php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables");
766 				pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY);
767 				zend_string_release_ex(key, 0);
768 				efree(pattern);
769 				return NULL;
770 			}
771 			_k = zend_string_init(ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), 1);
772 			GC_MAKE_PERSISTENT_LOCAL(_k);
773 			zend_hash_add_ptr(&char_tables, _k, (void *)tables);
774 			zend_string_release(_k);
775 		}
776 	}
777 	pcre2_set_character_tables(cctx, tables);
778 
779 	/* Compile pattern and display a warning if compilation failed. */
780 	re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);
781 
782 	if (re == NULL) {
783 		if (key != regex) {
784 			zend_string_release_ex(key, 0);
785 		}
786 		pcre2_get_error_message(errnumber, error, sizeof(error));
787 		php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset);
788 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
789 		efree(pattern);
790 		return NULL;
791 	}
792 
793 #ifdef HAVE_PCRE_JIT_SUPPORT
794 	if (PCRE_G(jit)) {
795 		/* Enable PCRE JIT compiler */
796 		rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
797 		if (EXPECTED(rc >= 0)) {
798 			size_t jit_size = 0;
799 			if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) {
800 				poptions |= PREG_JIT;
801 			}
802 		} else if (rc == PCRE2_ERROR_NOMEMORY) {
803 			php_error_docref(NULL, E_WARNING,
804 				"Allocation of JIT memory failed, PCRE JIT will be disabled. "
805 				"This is likely caused by security restrictions. "
806 				"Either grant PHP permission to allocate executable memory, or set pcre.jit=0");
807 			PCRE_G(jit) = 0;
808 		} else {
809 			pcre2_get_error_message(rc, error, sizeof(error));
810 			php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error);
811 			pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
812 		}
813 	}
814 #endif
815 	efree(pattern);
816 
817 	/*
818 	 * If we reached cache limit, clean out the items from the head of the list;
819 	 * these are supposedly the oldest ones (but not necessarily the least used
820 	 * ones).
821 	 */
822 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
823 		int num_clean = PCRE_CACHE_SIZE / 8;
824 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean);
825 	}
826 
827 	/* Store the compiled pattern and extra info in the cache. */
828 	new_entry.re = re;
829 	new_entry.preg_options = poptions;
830 	new_entry.compile_options = coptions;
831 	new_entry.refcount = 0;
832 
833 	rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count);
834 	if (rc < 0) {
835 		if (key != regex) {
836 			zend_string_release_ex(key, 0);
837 		}
838 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc);
839 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
840 		return NULL;
841 	}
842 
843 	rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count);
844 	if (rc < 0) {
845 		if (key != regex) {
846 			zend_string_release_ex(key, 0);
847 		}
848 		php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc);
849 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
850 		return NULL;
851 	}
852 
853 	/*
854 	 * Interned strings are not duplicated when stored in HashTable,
855 	 * but all the interned strings created during HTTP request are removed
856 	 * at end of request. However PCRE_G(pcre_cache) must be consistent
857 	 * on the next request as well. So we disable usage of interned strings
858 	 * as hash keys especually for this table.
859 	 * See bug #63180
860 	 */
861 	if (!(GC_FLAGS(key) & IS_STR_PERMANENT) && !PCRE_G(per_request_cache)) {
862 		zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1);
863 		GC_MAKE_PERSISTENT_LOCAL(str);
864 
865 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry));
866 		zend_string_release(str);
867 	} else {
868 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry));
869 	}
870 
871 	if (key != regex) {
872 		zend_string_release_ex(key, 0);
873 	}
874 
875 	return ret;
876 }
877 /* }}} */
878 
879 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache(zend_string * regex)880 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
881 {
882 	return pcre_get_compiled_regex_cache_ex(regex, true);
883 }
884 /* }}} */
885 
886 /* {{{ pcre_get_compiled_regex */
pcre_get_compiled_regex(zend_string * regex,uint32_t * capture_count)887 PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count)
888 {
889 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
890 
891 	if (capture_count) {
892 		*capture_count = pce ? pce->capture_count : 0;
893 	}
894 
895 	return pce ? pce->re : NULL;
896 }
897 /* }}} */
898 
899 /* XXX For the cases where it's only about match yes/no and no capture
900 		required, perhaps just a minimum sized data would suffice. */
php_pcre_create_match_data(uint32_t capture_count,pcre2_code * re)901 PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re)
902 {/*{{{*/
903 
904 	assert(NULL != re);
905 
906 	if (EXPECTED(!mdata_used)) {
907 		int rc = 0;
908 
909 		if (!capture_count) {
910 			/* As we deal with a non cached pattern, no other way to gather this info. */
911 			rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);
912 		}
913 
914 		if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
915 			mdata_used = 1;
916 			return mdata;
917 		}
918 	}
919 
920 	return pcre2_match_data_create_from_pattern(re, gctx);
921 }/*}}}*/
922 
php_pcre_free_match_data(pcre2_match_data * match_data)923 PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data)
924 {/*{{{*/
925 	if (UNEXPECTED(match_data != mdata)) {
926 		pcre2_match_data_free(match_data);
927 	} else {
928 		mdata_used = 0;
929 	}
930 }/*}}}*/
931 
init_unmatched_null_pair(void)932 static void init_unmatched_null_pair(void) {
933 	zval val1, val2;
934 	ZVAL_NULL(&val1);
935 	ZVAL_LONG(&val2, -1);
936 	ZVAL_ARR(&PCRE_G(unmatched_null_pair), zend_new_pair(&val1, &val2));
937 }
938 
init_unmatched_empty_pair(void)939 static void init_unmatched_empty_pair(void) {
940 	zval val1, val2;
941 	ZVAL_EMPTY_STRING(&val1);
942 	ZVAL_LONG(&val2, -1);
943 	ZVAL_ARR(&PCRE_G(unmatched_empty_pair), zend_new_pair(&val1, &val2));
944 }
945 
populate_match_value_str(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset)946 static zend_always_inline void populate_match_value_str(
947 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) {
948 	ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset);
949 }
950 
populate_match_value(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset,bool unmatched_as_null)951 static zend_always_inline void populate_match_value(
952 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
953 		bool unmatched_as_null) {
954 	if (PCRE2_UNSET == start_offset) {
955 		if (unmatched_as_null) {
956 			ZVAL_NULL(val);
957 		} else {
958 			ZVAL_EMPTY_STRING(val);
959 		}
960 	} else {
961 		populate_match_value_str(val, subject, start_offset, end_offset);
962 	}
963 }
964 
static inline void add_named(
		HashTable *const subpats, zend_string *name, zval *val, bool unmatched) {
	/* With the DUPNAMES option several subpatterns may share one name; the
	 * one that actually has a value must win. An unmatched value is
	 * therefore only added when the name is not present yet, while a
	 * matched value overwrites whatever is stored. */
	if (unmatched) {
		if (zend_hash_add(subpats, name, val) == NULL) {
			/* Name already present: val was not stored, no reference taken. */
			return;
		}
	} else {
		zend_hash_update(subpats, name, val);
	}

	/* The table now holds val as well. */
	Z_TRY_ADDREF_P(val);
}
978 
979 /* {{{ add_offset_pair */
add_offset_pair(HashTable * const result,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset,zend_string * name,zend_long unmatched_as_null)980 static inline void add_offset_pair(
981 		HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
982 		zend_string *name, zend_long unmatched_as_null)
983 {
984 	zval match_pair;
985 
986 	/* Add (match, offset) to the return value */
987 	if (PCRE2_UNSET == start_offset) {
988 		if (unmatched_as_null) {
989 			if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) {
990 				init_unmatched_null_pair();
991 			}
992 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_null_pair));
993 		} else {
994 			if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) {
995 				init_unmatched_empty_pair();
996 			}
997 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_empty_pair));
998 		}
999 	} else {
1000 		zval val1, val2;
1001 		populate_match_value_str(&val1, subject, start_offset, end_offset);
1002 		ZVAL_LONG(&val2, start_offset);
1003 		ZVAL_ARR(&match_pair, zend_new_pair(&val1, &val2));
1004 	}
1005 
1006 	if (name) {
1007 		add_named(result, name, &match_pair, start_offset == PCRE2_UNSET);
1008 	}
1009 	zend_hash_next_index_insert_new(result, &match_pair);
1010 }
1011 /* }}} */
1012 
/*
 * Fills the array held by `subpats` with the captures of a single match.
 *
 *  - The first `count` captures are read from `offsets` (two entries per
 *    capture: start and end) and stored either as plain values or, with
 *    PREG_OFFSET_CAPTURE, as (match, offset) pairs.
 *  - With PREG_UNMATCHED_AS_NULL the remaining captures up to `num_subpats`
 *    are padded with NULL entries.
 *  - When `subpat_names` is given, named captures are stored under their
 *    name in addition to their numeric index.
 *  - A non-NULL `mark` is stored under the "MARK" key.
 */
static void populate_subpat_array(
		zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names,
		uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) {
	const zend_long want_offsets = flags & PREG_OFFSET_CAPTURE;
	const zend_long null_unmatched = flags & PREG_UNMATCHED_AS_NULL;
	HashTable *ht = Z_ARRVAL_P(subpats);
	zval val;
	int i;

	/* Captures reported by the match. */
	for (i = 0; i < count; i++) {
		zend_string *name = subpat_names ? subpat_names[i] : NULL;
		PCRE2_SIZE from = offsets[2*i];
		PCRE2_SIZE to = offsets[2*i+1];

		if (want_offsets) {
			add_offset_pair(ht, subject, from, to, name, null_unmatched);
			continue;
		}

		populate_match_value(&val, subject, from, to, null_unmatched);
		if (name) {
			add_named(ht, name, &val, from == PCRE2_UNSET);
		}
		zend_hash_next_index_insert_new(ht, &val);
	}

	/* Trailing captures the match did not report. */
	if (null_unmatched) {
		for (i = count; i < num_subpats; i++) {
			if (want_offsets) {
				add_offset_pair(
					ht, NULL, PCRE2_UNSET, PCRE2_UNSET,
					subpat_names ? subpat_names[i] : NULL, 1);
			} else if (subpat_names) {
				ZVAL_NULL(&val);
				if (subpat_names[i]) {
					/* Add only: an existing entry of the same name is kept. */
					zend_hash_add(ht, subpat_names[i], &val);
				}
				zend_hash_next_index_insert_new(ht, &val);
			} else {
				add_next_index_null(subpats);
			}
		}
	}

	/* Add MARK, if available */
	if (mark) {
		add_assoc_string_ex(subpats, "MARK", sizeof("MARK") - 1, (char *)mark);
	}
}
1081 
/* Shared implementation of preg_match() (global == false) and
 * preg_match_all() (global == true): parses the PHP arguments, fetches the
 * compiled pattern and delegates to php_pcre_match_impl(). */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, bool global) /* {{{ */
{
	zend_string		 *regex;			/* Regular expression */
	zend_string		 *subject;			/* String to match against */
	zval			 *subpats = NULL;	/* Array for subpatterns */
	zend_long		  flags = 0;		/* Match control flags */
	zend_long		  start_offset = 0;	/* Where the new search starts */
	pcre_cache_entry *pce;				/* Compiled regular expression */

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(regex)
		Z_PARAM_STR(subject)
		Z_PARAM_OPTIONAL
		Z_PARAM_ZVAL(subpats)
		Z_PARAM_LONG(flags)
		Z_PARAM_LONG(start_offset)
	ZEND_PARSE_PARAMETERS_END();

	/* Look the pattern up in the cache, compiling it when necessary. */
	pce = pcre_get_compiled_regex_cache(regex);
	if (pce == NULL) {
		RETURN_FALSE;
	}

	/* The reference count is raised for the duration of the match. */
	pce->refcount++;
	php_pcre_match_impl(pce, subject, return_value, subpats,
		global, flags, start_offset);
	pce->refcount--;
}
/* }}} */
1112 
is_known_valid_utf8(zend_string * subject_str,PCRE2_SIZE start_offset)1113 static zend_always_inline bool is_known_valid_utf8(
1114 		zend_string *subject_str, PCRE2_SIZE start_offset) {
1115 	if (!ZSTR_IS_VALID_UTF8(subject_str)) {
1116 		/* We don't know whether the string is valid UTF-8 or not. */
1117 		return 0;
1118 	}
1119 
1120 	if (start_offset == ZSTR_LEN(subject_str)) {
1121 		/* Degenerate case: Offset points to end of string. */
1122 		return 1;
1123 	}
1124 
1125 	/* Check that the offset does not point to an UTF-8 continuation byte. */
1126 	return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
1127 }
1128 
1129 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zval * subpats,bool global,zend_long flags,zend_off_t start_offset)1130 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
1131 	zval *subpats, bool global, zend_long flags, zend_off_t start_offset)
1132 {
1133 	zval			 result_set;		/* Holds a set of subpatterns after
1134 										   a global match */
1135 	HashTable	   **match_sets = NULL;	/* An array of sets of matches for each
1136 										   subpattern after a global match */
1137 	uint32_t		 options;			/* Execution options */
1138 	int				 count;				/* Count of matched subpatterns */
1139 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1140 	int				 matched;			/* Has anything matched */
1141 	zend_string	   **subpat_names;		/* Array for named subpatterns */
1142 	size_t			 i;
1143 	uint32_t		 subpats_order;		/* Order of subpattern matches */
1144 	uint32_t		 offset_capture;	/* Capture match offsets: yes/no */
1145 	zend_long		 unmatched_as_null;	/* Null non-matches: yes/no */
1146 	PCRE2_SPTR       mark = NULL;		/* Target for MARK name */
1147 	HashTable		*marks = NULL;		/* Array of marks for PREG_PATTERN_ORDER */
1148 	pcre2_match_data *match_data;
1149 	PCRE2_SIZE		 start_offset2, orig_start_offset;
1150 
1151 	char *subject = ZSTR_VAL(subject_str);
1152 	size_t subject_len = ZSTR_LEN(subject_str);
1153 
1154 	/* Overwrite the passed-in value for subpatterns with an empty array. */
1155 	if (subpats != NULL) {
1156 		subpats = zend_try_array_init(subpats);
1157 		if (!subpats) {
1158 			RETURN_THROWS();
1159 		}
1160 	}
1161 
1162 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
1163 
1164 	if (flags) {
1165 		offset_capture = flags & PREG_OFFSET_CAPTURE;
1166 		unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;
1167 
1168 		/*
1169 		 * subpats_order is pre-set to pattern mode so we change it only if
1170 		 * necessary.
1171 		 */
1172 		if (flags & 0xff) {
1173 			subpats_order = flags & 0xff;
1174 			if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
1175 				(!global && subpats_order != 0)) {
1176 				zend_argument_value_error(4, "must be a PREG_* constant");
1177 				RETURN_THROWS();
1178 			}
1179 		}
1180 	} else {
1181 		offset_capture = 0;
1182 		unmatched_as_null = 0;
1183 	}
1184 
1185 	/* Negative offset counts from the end of the string. */
1186 	if (start_offset < 0) {
1187 		if ((PCRE2_SIZE)-start_offset <= subject_len) {
1188 			start_offset2 = subject_len + start_offset;
1189 		} else {
1190 			start_offset2 = 0;
1191 		}
1192 	} else {
1193 		start_offset2 = (PCRE2_SIZE)start_offset;
1194 	}
1195 
1196 	if (start_offset2 > subject_len) {
1197 		pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1198 		RETURN_FALSE;
1199 	}
1200 
1201 	/* Calculate the size of the offsets array, and allocate memory for it. */
1202 	num_subpats = pce->capture_count + 1;
1203 
1204 	/*
1205 	 * Build a mapping from subpattern numbers to their names. We will
1206 	 * allocate the table only if there are any named subpatterns.
1207 	 */
1208 	subpat_names = NULL;
1209 	if (subpats && pce->name_count > 0) {
1210 		subpat_names = make_subpats_table(num_subpats, pce);
1211 		if (!subpat_names) {
1212 			RETURN_FALSE;
1213 		}
1214 	}
1215 
1216 	matched = 0;
1217 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1218 
1219 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1220 		match_data = mdata;
1221 	} else {
1222 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1223 		if (!match_data) {
1224 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1225 			if (subpat_names) {
1226 				free_subpats_table(subpat_names, num_subpats);
1227 			}
1228 			RETURN_FALSE;
1229 		}
1230 	}
1231 
1232 	/* Allocate match sets array and initialize the values. */
1233 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1234 		match_sets = safe_emalloc(num_subpats, sizeof(HashTable *), 0);
1235 		for (i=0; i<num_subpats; i++) {
1236 			match_sets[i] = zend_new_array(0);
1237 		}
1238 	}
1239 
1240 	/* Array of subpattern offsets */
1241 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1242 
1243 	orig_start_offset = start_offset2;
1244 	options =
1245 		(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
1246 			? 0 : PCRE2_NO_UTF_CHECK;
1247 
1248 	/* Execute the regular expression. */
1249 #ifdef HAVE_PCRE_JIT_SUPPORT
1250 	if ((pce->preg_options & PREG_JIT) && options) {
1251 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1252 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1253 	} else
1254 #endif
1255 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1256 			options, match_data, mctx);
1257 
1258 	while (1) {
1259 		/* If something has matched */
1260 		if (count >= 0) {
1261 			/* Check for too many substrings condition. */
1262 			if (UNEXPECTED(count == 0)) {
1263 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
1264 				count = num_subpats;
1265 			}
1266 
1267 matched:
1268 			matched++;
1269 
1270 			/* If subpatterns array has been passed, fill it in with values. */
1271 			if (subpats != NULL) {
1272 				/* Try to get the list of substrings and display a warning if failed. */
1273 				if (UNEXPECTED(offsets[1] < offsets[0])) {
1274 					if (subpat_names) {
1275 						free_subpats_table(subpat_names, num_subpats);
1276 					}
1277 					if (match_sets) efree(match_sets);
1278 					php_error_docref(NULL, E_WARNING, "Get subpatterns list failed");
1279 					RETURN_FALSE;
1280 				}
1281 
1282 				if (global) {	/* global pattern matching */
1283 					if (subpats_order == PREG_PATTERN_ORDER) {
1284 						/* For each subpattern, insert it into the appropriate array. */
1285 						if (offset_capture) {
1286 							for (i = 0; i < count; i++) {
1287 								add_offset_pair(
1288 									match_sets[i], subject, offsets[2*i], offsets[2*i+1],
1289 									NULL, unmatched_as_null);
1290 							}
1291 						} else {
1292 							for (i = 0; i < count; i++) {
1293 								zval val;
1294 								populate_match_value(
1295 									&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
1296 								zend_hash_next_index_insert_new(match_sets[i], &val);
1297 							}
1298 						}
1299 						mark = pcre2_get_mark(match_data);
1300 						/* Add MARK, if available */
1301 						if (mark) {
1302 							if (!marks) {
1303 								marks = zend_new_array(0);
1304 							}
1305 							zval tmp;
1306 							ZVAL_STRING(&tmp, (char *) mark);
1307 							zend_hash_index_add_new(marks, matched - 1, &tmp);
1308 						}
1309 						/*
1310 						 * If the number of captured subpatterns on this run is
1311 						 * less than the total possible number, pad the result
1312 						 * arrays with NULLs or empty strings.
1313 						 */
1314 						if (count < num_subpats) {
1315 							for (int i = count; i < num_subpats; i++) {
1316 								if (offset_capture) {
1317 									add_offset_pair(
1318 										match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
1319 										NULL, unmatched_as_null);
1320 								} else if (unmatched_as_null) {
1321 									zval tmp;
1322 									ZVAL_NULL(&tmp);
1323 									zend_hash_next_index_insert_new(match_sets[i], &tmp);
1324 								} else {
1325 									zval tmp;
1326 									ZVAL_EMPTY_STRING(&tmp);
1327 									zend_hash_next_index_insert_new(match_sets[i], &tmp);
1328 								}
1329 							}
1330 						}
1331 					} else {
1332 						/* Allocate and populate the result set array */
1333 						mark = pcre2_get_mark(match_data);
1334 						array_init_size(&result_set, count + (mark ? 1 : 0));
1335 						populate_subpat_array(
1336 							&result_set, subject, offsets, subpat_names,
1337 							num_subpats, count, mark, flags);
1338 						/* And add it to the output array */
1339 						zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &result_set);
1340 					}
1341 				} else {			/* single pattern matching */
1342 					/* For each subpattern, insert it into the subpatterns array. */
1343 					mark = pcre2_get_mark(match_data);
1344 					populate_subpat_array(
1345 						subpats, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1346 					break;
1347 				}
1348 			}
1349 
1350 			/* Advance to the next piece. */
1351 			start_offset2 = offsets[1];
1352 
1353 			/* If we have matched an empty string, mimic what Perl's /g options does.
1354 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1355 			   the match again at the same point. If this fails (picked up above) we
1356 			   advance to the next character. */
1357 			if (start_offset2 == offsets[0]) {
1358 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1359 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1360 				if (count >= 0) {
1361 					if (global) {
1362 						goto matched;
1363 					} else {
1364 						break;
1365 					}
1366 				} else if (count == PCRE2_ERROR_NOMATCH) {
1367 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1368 					   this is not necessarily the end. We need to advance
1369 					   the start offset, and continue. Fudge the offset values
1370 					   to achieve this, unless we're already at the end of the string. */
1371 					if (start_offset2 < subject_len) {
1372 						size_t unit_len = calculate_unit_length(pce, subject + start_offset2);
1373 
1374 						start_offset2 += unit_len;
1375 					} else {
1376 						break;
1377 					}
1378 				} else {
1379 					goto error;
1380 				}
1381 			}
1382 		} else if (count == PCRE2_ERROR_NOMATCH) {
1383 			break;
1384 		} else {
1385 error:
1386 			pcre_handle_exec_error(count);
1387 			break;
1388 		}
1389 
1390 		if (!global) {
1391 			break;
1392 		}
1393 
1394 		/* Execute the regular expression. */
1395 #ifdef HAVE_PCRE_JIT_SUPPORT
1396 		if ((pce->preg_options & PREG_JIT)) {
1397 			if (start_offset2 > subject_len) {
1398 				pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1399 				break;
1400 			}
1401 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1402 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1403 		} else
1404 #endif
1405 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1406 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1407 	}
1408 	if (match_data != mdata) {
1409 		pcre2_match_data_free(match_data);
1410 	}
1411 
1412 	/* Add the match sets to the output array and clean up */
1413 	if (match_sets) {
1414 		if (subpat_names) {
1415 			for (i = 0; i < num_subpats; i++) {
1416 				zval wrapper;
1417 				ZVAL_ARR(&wrapper, match_sets[i]);
1418 				if (subpat_names[i]) {
1419 					zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper);
1420 					GC_ADDREF(match_sets[i]);
1421 				}
1422 				zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
1423 			}
1424 		} else {
1425 			for (i = 0; i < num_subpats; i++) {
1426 				zval wrapper;
1427 				ZVAL_ARR(&wrapper, match_sets[i]);
1428 				zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
1429 			}
1430 		}
1431 		efree(match_sets);
1432 
1433 		if (marks) {
1434 			zval tmp;
1435 			ZVAL_ARR(&tmp, marks);
1436 			zend_hash_str_update(Z_ARRVAL_P(subpats), "MARK", sizeof("MARK") - 1, &tmp);
1437 		}
1438 	}
1439 
1440 	if (subpat_names) {
1441 		free_subpats_table(subpat_names, num_subpats);
1442 	}
1443 
1444 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
1445 		/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1446 		if ((pce->compile_options & PCRE2_UTF)
1447 				&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
1448 			GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
1449 		}
1450 
1451 		RETVAL_LONG(matched);
1452 	} else {
1453 		RETVAL_FALSE;
1454 	}
1455 }
1456 /* }}} */
1457 
1458 /* {{{ Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)1459 PHP_FUNCTION(preg_match)
1460 {
1461 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
1462 }
1463 /* }}} */
1464 
1465 ZEND_FRAMELESS_FUNCTION(preg_match, 2)
1466 {
1467 	zval regex_tmp, subject_tmp;
1468 	zend_string *regex, *subject;
1469 
1470 	Z_FLF_PARAM_STR(1, regex, regex_tmp);
1471 	Z_FLF_PARAM_STR(2, subject, subject_tmp);
1472 
1473 	/* Compile regex or get it from cache. */
1474 	pcre_cache_entry *pce;
1475 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1476 		RETURN_FALSE;
1477 	}
1478 
1479 	pce->refcount++;
1480 	php_pcre_match_impl(pce, subject, return_value, /* subpats */ NULL,
1481 		/* global */ false, /* flags */ 0, /* start_offset */ 0);
1482 	pce->refcount--;
1483 
1484 flf_clean:
1485 	Z_FLF_PARAM_FREE_STR(1, regex_tmp);
1486 	Z_FLF_PARAM_FREE_STR(2, subject_tmp);
1487 }
1488 
1489 /* {{{ Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)1490 PHP_FUNCTION(preg_match_all)
1491 {
1492 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
1493 }
1494 /* }}} */
1495 
/* {{{ preg_get_backref */
/* Parses a back-reference at *str. The first character is the introducer
 * (the caller positions *str on a backslash or dollar sign); it is followed
 * by one or two decimal digits, optionally wrapped in braces when the
 * introducer is a dollar sign.
 *
 * On success returns 1, stores the number in *backref and moves *str past the
 * reference. On failure returns 0 and leaves *str untouched; *backref may
 * already have been written. */
static int preg_get_backref(char **str, int *backref)
{
	const char *start = *str;
	char *cursor;
	int braced;

	/* A lone introducer at the end of the string is not a reference. */
	if (start[1] == '\0') {
		return 0;
	}

	braced = (start[0] == '$' && start[1] == '{');
	cursor = *str + (braced ? 2 : 1);

	/* First digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* Second digit is optional. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	/* A braced reference needs its closing brace. */
	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
/* }}} */
1533 
1534 /* {{{ preg_do_repl_func */
preg_do_repl_func(zend_fcall_info * fci,zend_fcall_info_cache * fcc,const char * subject,PCRE2_SIZE * offsets,zend_string ** subpat_names,uint32_t num_subpats,int count,const PCRE2_SPTR mark,zend_long flags)1535 static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags)
1536 {
1537 	zend_string *result_str;
1538 	zval		 retval;			/* Function return value */
1539 	zval	     arg;				/* Argument to pass to function */
1540 
1541 	array_init_size(&arg, count + (mark ? 1 : 0));
1542 	populate_subpat_array(&arg, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1543 
1544 	fci->retval = &retval;
1545 	fci->param_count = 1;
1546 	fci->params = &arg;
1547 
1548 	if (zend_call_function(fci, fcc) == SUCCESS && Z_TYPE(retval) != IS_UNDEF) {
1549 		if (EXPECTED(Z_TYPE(retval) == IS_STRING)) {
1550 			result_str = Z_STR(retval);
1551 		} else {
1552 			result_str = zval_get_string_func(&retval);
1553 			zval_ptr_dtor(&retval);
1554 		}
1555 	} else {
1556 		if (!EG(exception)) {
1557 			php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
1558 		}
1559 
1560 		result_str = zend_string_init(&subject[offsets[0]], offsets[1] - offsets[0], 0);
1561 	}
1562 
1563 	zval_ptr_dtor(&arg);
1564 
1565 	return result_str;
1566 }
1567 /* }}} */
1568 
1569 /* {{{ php_pcre_replace */
php_pcre_replace(zend_string * regex,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1570 PHPAPI zend_string *php_pcre_replace(zend_string *regex,
1571 							  zend_string *subject_str,
1572 							  const char *subject, size_t subject_len,
1573 							  zend_string *replace_str,
1574 							  size_t limit, size_t *replace_count)
1575 {
1576 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1577 	zend_string	 		*result;			/* Function result */
1578 
1579 	/* Abort on pending exception, e.g. thrown from __toString(). */
1580 	if (UNEXPECTED(EG(exception))) {
1581 		return NULL;
1582 	}
1583 
1584 	/* Compile regex or get it from cache. */
1585 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1586 		return NULL;
1587 	}
1588 	pce->refcount++;
1589 	result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str,
1590 		limit, replace_count);
1591 	pce->refcount--;
1592 
1593 	return result;
1594 }
1595 /* }}} */
1596 
1597 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1598 PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count)
1599 {
1600 	uint32_t		 options;			/* Execution options */
1601 	int				 count;				/* Count of matched subpatterns */
1602 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1603 	size_t			 new_len;			/* Length of needed storage */
1604 	size_t			 alloc_len;			/* Actual allocated length */
1605 	size_t			 match_len;			/* Length of the current match */
1606 	int				 backref;			/* Backreference number */
1607 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
1608 	size_t			 last_end_offset;	/* Where the last search ended */
1609 	char			*walkbuf,			/* Location of current replacement in the result */
1610 					*walk,				/* Used to walk the replacement string */
1611 					 walk_last;			/* Last walked character */
1612 	const char		*match,				/* The current match */
1613 					*piece,				/* The current piece of subject */
1614 					*replace_end;		/* End of replacement string */
1615 	size_t			result_len; 		/* Length of result */
1616 	zend_string		*result;			/* Result of replacement */
1617 	pcre2_match_data *match_data;
1618 
1619 	/* Calculate the size of the offsets array, and allocate memory for it. */
1620 	num_subpats = pce->capture_count + 1;
1621 	alloc_len = 0;
1622 	result = NULL;
1623 
1624 	/* Initialize */
1625 	match = NULL;
1626 	start_offset = 0;
1627 	last_end_offset = 0;
1628 	result_len = 0;
1629 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1630 
1631 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1632 		match_data = mdata;
1633 	} else {
1634 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1635 		if (!match_data) {
1636 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1637 			return NULL;
1638 		}
1639 	}
1640 
1641 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
1642 
1643 	/* Array of subpattern offsets */
1644 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
1645 
1646 	/* Execute the regular expression. */
1647 #ifdef HAVE_PCRE_JIT_SUPPORT
1648 	if ((pce->preg_options & PREG_JIT) && options) {
1649 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1650 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1651 	} else
1652 #endif
1653 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1654 			options, match_data, mctx);
1655 
1656 	while (1) {
1657 		piece = subject + last_end_offset;
1658 
1659 		if (count >= 0 && limit > 0) {
1660 			bool simple_string;
1661 
1662 			/* Check for too many substrings condition. */
1663 			if (UNEXPECTED(count == 0)) {
1664 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1665 				count = num_subpats;
1666 			}
1667 
1668 matched:
1669 			if (UNEXPECTED(offsets[1] < offsets[0])) {
1670 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1671 				if (result) {
1672 					zend_string_release_ex(result, 0);
1673 					result = NULL;
1674 				}
1675 				break;
1676 			}
1677 
1678 			if (replace_count) {
1679 				++*replace_count;
1680 			}
1681 
1682 			/* Set the match location in subject */
1683 			match = subject + offsets[0];
1684 
1685 			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */
1686 
1687 			walk = ZSTR_VAL(replace_str);
1688 			replace_end = walk + ZSTR_LEN(replace_str);
1689 			walk_last = 0;
1690 			simple_string = 1;
1691 			while (walk < replace_end) {
1692 				if ('\\' == *walk || '$' == *walk) {
1693 					simple_string = 0;
1694 					if (walk_last == '\\') {
1695 						walk++;
1696 						walk_last = 0;
1697 						continue;
1698 					}
1699 					if (preg_get_backref(&walk, &backref)) {
1700 						if (backref < count)
1701 							new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1702 						continue;
1703 					}
1704 				}
1705 				new_len++;
1706 				walk++;
1707 				walk_last = walk[-1];
1708 			}
1709 
1710 			if (new_len >= alloc_len) {
1711 				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
1712 				if (result == NULL) {
1713 					result = zend_string_alloc(alloc_len, 0);
1714 				} else {
1715 					result = zend_string_extend(result, alloc_len, 0);
1716 				}
1717 			}
1718 
1719 			if (match-piece > 0) {
1720 				/* copy the part of the string before the match */
1721 				memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece);
1722 				result_len += (match-piece);
1723 			}
1724 
1725 			if (simple_string) {
1726 				/* copy replacement */
1727 				memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1);
1728 				result_len += ZSTR_LEN(replace_str);
1729 			} else {
1730 				/* copy replacement and backrefs */
1731 				walkbuf = ZSTR_VAL(result) + result_len;
1732 
1733 				walk = ZSTR_VAL(replace_str);
1734 				walk_last = 0;
1735 				while (walk < replace_end) {
1736 					if ('\\' == *walk || '$' == *walk) {
1737 						if (walk_last == '\\') {
1738 							*(walkbuf-1) = *walk++;
1739 							walk_last = 0;
1740 							continue;
1741 						}
1742 						if (preg_get_backref(&walk, &backref)) {
1743 							if (backref < count) {
1744 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1745 								walkbuf = zend_mempcpy(walkbuf, subject + offsets[backref << 1], match_len);
1746 							}
1747 							continue;
1748 						}
1749 					}
1750 					*walkbuf++ = *walk++;
1751 					walk_last = walk[-1];
1752 				}
1753 				*walkbuf = '\0';
1754 				/* increment the result length by how much we've added to the string */
1755 				result_len += (walkbuf - (ZSTR_VAL(result) + result_len));
1756 			}
1757 
1758 			limit--;
1759 
1760 			/* Advance to the next piece. */
1761 			start_offset = last_end_offset = offsets[1];
1762 
1763 			/* If we have matched an empty string, mimic what Perl's /g options does.
1764 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1765 			   the match again at the same point. If this fails (picked up above) we
1766 			   advance to the next character. */
1767 			if (start_offset == offsets[0]) {
1768 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1769 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1770 
1771 				piece = subject + start_offset;
1772 				if (count >= 0 && limit > 0) {
1773 					goto matched;
1774 				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1775 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1776 					   this is not necessarily the end. We need to advance
1777 					   the start offset, and continue. Fudge the offset values
1778 					   to achieve this, unless we're already at the end of the string. */
1779 					if (start_offset < subject_len) {
1780 						size_t unit_len = calculate_unit_length(pce, piece);
1781 						start_offset += unit_len;
1782 					} else {
1783 						goto not_matched;
1784 					}
1785 				} else {
1786 					goto error;
1787 				}
1788 			}
1789 
1790 		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
1791 not_matched:
1792 			if (!result && subject_str) {
1793 				result = zend_string_copy(subject_str);
1794 				break;
1795 			}
1796 			/* now we know exactly how long it is */
1797 			alloc_len = result_len + subject_len - last_end_offset;
1798 			if (NULL != result) {
1799 				result = zend_string_realloc(result, alloc_len, 0);
1800 			} else {
1801 				result = zend_string_alloc(alloc_len, 0);
1802 			}
1803 			/* stick that last bit of string on our output */
1804 			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
1805 			result_len += subject_len - last_end_offset;
1806 			ZSTR_VAL(result)[result_len] = '\0';
1807 			ZSTR_LEN(result) = result_len;
1808 			break;
1809 		} else {
1810 error:
1811 			pcre_handle_exec_error(count);
1812 			if (result) {
1813 				zend_string_release_ex(result, 0);
1814 				result = NULL;
1815 			}
1816 			break;
1817 		}
1818 
1819 #ifdef HAVE_PCRE_JIT_SUPPORT
1820 		if (pce->preg_options & PREG_JIT) {
1821 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1822 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1823 		} else
1824 #endif
1825 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
1826 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1827 	}
1828 	if (match_data != mdata) {
1829 		pcre2_match_data_free(match_data);
1830 	}
1831 
1832 	return result;
1833 }
1834 /* }}} */
1835 
1836 /* {{{ php_pcre_replace_func_impl() */
/*
 * Replace matches of the compiled pattern `pce` in `subject` with whatever
 * the user callback described by `fci`/`fcc` produces for each match.
 *
 * At most `limit` replacements are performed.  When `replace_count` is
 * non-NULL it is incremented once for every replacement.  `flags` is passed
 * through to preg_do_repl_func() untouched.
 *
 * Returns the new string, or NULL on failure.  When nothing was replaced and
 * `subject_str` is non-NULL, a new reference to `subject_str` is returned
 * instead of building a copy.
 */
static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_fcall_info *fci, zend_fcall_info_cache *fcc, size_t limit, size_t *replace_count, zend_long flags)
{
	/* Whole match plus one slot per capturing group. */
	const uint32_t	 num_subpats = pce->capture_count + 1;
	zend_string		**group_names = NULL;	/* Only built when the pattern has named groups */
	zend_string		*out = NULL;			/* Output under construction */
	zend_string		*replacement;			/* What the callback returned */
	size_t			 out_len = 0;			/* Bytes of `out` already filled */
	size_t			 out_cap = 0;			/* Bytes currently reserved for `out` */
	size_t			 needed;				/* Bytes required after the current match */
	size_t			 rest_len;				/* Unmatched tail still to be copied */
	PCRE2_SIZE		 search_from = 0;		/* Where the next search begins */
	size_t			 consumed = 0;			/* End offset of the previous match */
	const char		*pending;				/* First subject byte not yet copied */
	const char		*hit;					/* Start of the current match */
	pcre2_match_data *match_data;
	int				 rc;					/* pcre2 return code / group count */

	if (UNEXPECTED(pce->name_count > 0)) {
		group_names = make_subpats_table(num_subpats, pce);
		if (group_names == NULL) {
			return NULL;
		}
	}

	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	/*
	 * Use the shared preallocated match data when it is not already claimed
	 * and the group count fits, marking it as claimed while we hold it.
	 * Otherwise allocate a private block.  The previous flag value is put
	 * back before every return below.
	 */
	const bool mdata_was_claimed = mdata_used;
	if (mdata_was_claimed || num_subpats > PHP_PCRE_PREALLOC_MDATA_SIZE) {
		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
		if (match_data == NULL) {
			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
			if (group_names) {
				free_subpats_table(group_names, num_subpats);
			}
			mdata_used = mdata_was_claimed;
			return NULL;
		}
	} else {
		mdata_used = 1;
		match_data = mdata;
	}

	/* UTF patterns get one validity check, on the very first match only. */
	const uint32_t options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;

	/* Start/end offset pairs filled in by each match call. */
	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);

	/* First match attempt; the JIT path is only taken when no UTF check is due. */
#ifdef HAVE_PCRE_JIT_SUPPORT
	if ((pce->preg_options & PREG_JIT) && options) {
		rc = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
				PCRE2_NO_UTF_CHECK, match_data, mctx);
	} else
#endif
	rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
			options, match_data, mctx);

	while (1) {
		pending = subject + consumed;

		if (rc >= 0 && limit) {
			/* A zero return means the offset vector was too small. */
			if (UNEXPECTED(rc == 0)) {
				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
				rc = num_subpats;
			}

matched:
			/* An end offset before the start offset is treated as an internal error. */
			if (UNEXPECTED(offsets[1] < offsets[0])) {
				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
				if (out) {
					zend_string_release_ex(out, 0);
					out = NULL;
				}
				break;
			}

			if (replace_count) {
				++*replace_count;
			}

			hit = subject + offsets[0];

			/* Room for everything copied so far plus the text ahead of the match. */
			needed = out_len + offsets[0] - consumed;

			/* Ask the callback for the replacement text. */
			replacement = preg_do_repl_func(
				fci, fcc, subject, offsets, group_names, num_subpats, rc,
				pcre2_get_mark(match_data), flags);

			ZEND_ASSERT(replacement);

			/* Add the replacement length, then grow the buffer to twice that when short. */
			needed = zend_safe_address_guarded(1, ZSTR_LEN(replacement) + ZSTR_MAX_OVERHEAD, needed) -ZSTR_MAX_OVERHEAD;
			if (needed >= out_cap) {
				out_cap = zend_safe_address_guarded(2, needed, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
				out = (out == NULL)
					? zend_string_alloc(out_cap, 0)
					: zend_string_extend(out, out_cap, 0);
			}

			/* Unmatched text between the previous match and this one. */
			if (hit > pending) {
				const size_t gap = (size_t)(hit - pending);
				memcpy(ZSTR_VAL(out) + out_len, pending, gap);
				out_len += gap;
			}

			/* The callback's text, after which its string is no longer needed. */
			memcpy(ZSTR_VAL(out) + out_len, ZSTR_VAL(replacement), ZSTR_LEN(replacement));
			out_len += ZSTR_LEN(replacement);
			zend_string_release_ex(replacement, 0);

			limit--;

			/* Continue right after this match. */
			search_from = consumed = offsets[1];

			/*
			 * Empty match: mimic Perl's /g.  Retry at the same position,
			 * anchored and with empty matches forbidden; if that finds
			 * nothing, step over one unit and resume the normal search.
			 */
			if (search_from == offsets[0]) {
				rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);

				pending = subject + search_from;
				if (rc >= 0 && limit) {
					goto matched;
				}
				if (rc != PCRE2_ERROR_NOMATCH && limit != 0) {
					goto error;
				}
				if (search_from >= subject_len) {
					/* Already at the end of the subject: nothing left to skip. */
					goto not_matched;
				}
				search_from += calculate_unit_length(pce, pending);
			}

		} else if (rc == PCRE2_ERROR_NOMATCH || limit == 0) {
not_matched:
			/* Nothing was ever replaced: hand back the caller's string as is. */
			if (!out && subject_str) {
				out = zend_string_copy(subject_str);
				break;
			}
			/* The final length is known now; size the buffer exactly. */
			rest_len = subject_len - consumed;
			out_cap = out_len + rest_len;
			if (NULL != out) {
				out = zend_string_realloc(out, out_cap, 0);
			} else {
				out = zend_string_alloc(out_cap, 0);
			}
			/* Append whatever follows the last match and terminate. */
			memcpy(ZSTR_VAL(out) + out_len, pending, rest_len);
			out_len += rest_len;
			ZSTR_VAL(out)[out_len] = '\0';
			ZSTR_LEN(out) = out_len;
			break;
		} else {
error:
			pcre_handle_exec_error(rc);
			if (out) {
				zend_string_release_ex(out, 0);
				out = NULL;
			}
			break;
		}

		/* Look for the next match. */
#ifdef HAVE_PCRE_JIT_SUPPORT
		if ((pce->preg_options & PREG_JIT)) {
			rc = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
					PCRE2_NO_UTF_CHECK, match_data, mctx);
		} else
#endif
		rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
				PCRE2_NO_UTF_CHECK, match_data, mctx);
	}

	if (match_data != mdata) {
		pcre2_match_data_free(match_data);
	}
	mdata_used = mdata_was_claimed;

	if (UNEXPECTED(group_names)) {
		free_subpats_table(group_names, num_subpats);
	}

	return out;
}
2047 /* }}} */
2048 
2049 /* {{{ php_pcre_replace_func */
php_pcre_replace_func(zend_string * regex,zend_string * subject_str,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)2050 static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex,
2051 							  zend_string *subject_str,
2052 							  zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2053 							  size_t limit, size_t *replace_count, zend_long flags)
2054 {
2055 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
2056 	zend_string	 		*result;			/* Function result */
2057 
2058 	/* Compile regex or get it from cache. */
2059 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2060 		return NULL;
2061 	}
2062 	pce->refcount++;
2063 	result = php_pcre_replace_func_impl(
2064 		pce, subject_str, ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), fci, fcc,
2065 		limit, replace_count, flags);
2066 	pce->refcount--;
2067 
2068 	return result;
2069 }
2070 /* }}} */
2071 
2072 /* {{{ php_pcre_replace_array */
/*
 * Apply every pattern in `regex` to the subject in turn, feeding the output
 * of one replacement into the next.
 *
 * When `replace_ht` is given, patterns are paired with its entries in order
 * (unused slots are skipped); once the entries run out the empty string is
 * used.  Otherwise `replace_str` is the replacement for every pattern.
 *
 * Returns the final string, or NULL as soon as one replacement fails.
 */
static zend_string *php_pcre_replace_array(HashTable *regex,
	zend_string *replace_str, HashTable *replace_ht,
	zend_string *subject_str, size_t limit, size_t *replace_count)
{
	zval		*pattern_zv;
	uint32_t	 next_slot = 0;	/* Next position to inspect in replace_ht */

	/* The loop below drops one reference per step, so take our own first. */
	zend_string_addref(subject_str);

	if (!replace_ht) {
		ZEND_ASSERT(replace_str != NULL);
	}

	ZEND_HASH_FOREACH_VAL(regex, pattern_zv) {
		/* Patterns are always handled as strings. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);
		zend_string *repl_tmp = NULL;
		zend_string *repl = replace_str;

		if (replace_ht) {
			/* Pick the next used entry of the replacement array, if any. */
			bool have_entry = false;

			while (!have_entry && next_slot != replace_ht->nNumUsed) {
				zval *slot = ZEND_HASH_ELEMENT(replace_ht, next_slot);
				next_slot++;
				if (Z_TYPE_P(slot) != IS_UNDEF) {
					repl = zval_get_tmp_string(slot, &repl_tmp);
					have_entry = true;
				}
			}
			if (!have_entry) {
				repl = ZSTR_EMPTY_ALLOC();
				repl_tmp = NULL;
			}
		}

		/* Replace, then make the result the subject of the next round. */
		zend_string *next_subject = php_pcre_replace(pattern, subject_str, ZSTR_VAL(subject_str),
			ZSTR_LEN(subject_str), repl, limit, replace_count);
		zend_tmp_string_release(repl_tmp);
		zend_tmp_string_release(pattern_tmp);
		zend_string_release_ex(subject_str, 0);
		subject_str = next_subject;

		if (UNEXPECTED(next_subject == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject_str;
}
2146 /* }}} */
2147 
2148 /* {{{ php_replace_in_subject */
php_replace_in_subject(zend_string * regex_str,HashTable * regex_ht,zend_string * replace_str,HashTable * replace_ht,zend_string * subject,size_t limit,size_t * replace_count)2149 static zend_always_inline zend_string *php_replace_in_subject(
2150 	zend_string *regex_str, HashTable *regex_ht,
2151 	zend_string *replace_str, HashTable *replace_ht,
2152 	zend_string *subject, size_t limit, size_t *replace_count)
2153 {
2154 	zend_string *result;
2155 
2156 	if (regex_str) {
2157 		ZEND_ASSERT(replace_str != NULL);
2158 		result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject),
2159 			replace_str, limit, replace_count);
2160 	} else {
2161 		ZEND_ASSERT(regex_ht != NULL);
2162 		result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject,
2163 			limit, replace_count);
2164 	}
2165 	return result;
2166 }
2167 /* }}} */
2168 
2169 /* {{{ php_replace_in_subject_func */
/*
 * Callback-based counterpart of php_replace_in_subject(): apply one pattern
 * (`regex_str`) or each pattern of `regex_ht` in sequence to `subject`,
 * using the callback in `fci`/`fcc` to produce replacements.
 *
 * Returns the final string, or NULL if any replacement fails.
 */
static zend_string *php_replace_in_subject_func(zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject, size_t limit, size_t *replace_count, zend_long flags)
{
	zval *pattern_zv;

	if (regex_str) {
		return php_pcre_replace_func(
			regex_str, subject, fci, fcc, limit, replace_count, flags);
	}

	/* Array of patterns from here on. */
	ZEND_ASSERT(regex_ht != NULL);

	/* Each round releases the current subject, so own a reference up front. */
	zend_string_addref(subject);

	ZEND_HASH_FOREACH_VAL(regex_ht, pattern_zv) {
		/* Patterns are always handled as strings. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);

		/* Replace, then chain the output into the next pattern. */
		zend_string *next_subject = php_pcre_replace_func(
			pattern, subject, fci, fcc, limit, replace_count, flags);
		zend_tmp_string_release(pattern_tmp);
		zend_string_release(subject);
		subject = next_subject;

		if (UNEXPECTED(next_subject == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject;
}
2209 /* }}} */
2210 
2211 /* {{{ preg_replace_func_impl */
/*
 * Shared worker for the callback-based replace functions.
 *
 * For a string subject, `return_value` becomes the replaced string, or NULL
 * on failure.  For an array subject, `return_value` becomes an array holding
 * the replaced entries under their original keys; entries whose replacement
 * failed are left out.
 *
 * Returns the total number of replacements performed.
 */
static size_t preg_replace_func_impl(zval *return_value,
	zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject_str, HashTable *subject_ht, zend_long limit_val, zend_long flags)
{
	size_t total = 0;

	if (subject_str) {
		zend_string *replaced = php_replace_in_subject_func(
			regex_str, regex_ht, fci, fcc, subject_str, limit_val, &total, flags);

		if (replaced == NULL) {
			RETVAL_NULL();
		} else {
			RETVAL_STR(replaced);
		}
		return total;
	}

	/* Array subject. */
	ZEND_ASSERT(subject_ht != NULL);

	array_init_size(return_value, zend_hash_num_elements(subject_ht));
	HashTable *out_ht = Z_ARRVAL_P(return_value);

	zval		*entry;
	zend_string	*key;
	zend_ulong	 idx;

	/* Stringify each entry, replace in it, and store the result under the same key. */
	ZEND_HASH_FOREACH_KEY_VAL(subject_ht, idx, key, entry) {
		zend_string *entry_tmp;
		zend_string *entry_str = zval_get_tmp_string(entry, &entry_tmp);
		zend_string *replaced = php_replace_in_subject_func(
			regex_str, regex_ht, fci, fcc, entry_str, limit_val, &total, flags);

		if (replaced != NULL) {
			zval slot;

			ZVAL_STR(&slot, replaced);
			if (key) {
				zend_hash_add_new(out_ht, key, &slot);
			} else {
				zend_hash_index_add_new(out_ht, idx, &slot);
			}
		}
		zend_tmp_string_release(entry_tmp);
	} ZEND_HASH_FOREACH_END();

	return total;
}
2262 /* }}} */
2263 
/*
 * Core of preg_replace() and preg_filter() once arguments are parsed.
 *
 * With `is_filter` set, a subject is only reported when at least one
 * replacement was made in it; otherwise every successfully processed subject
 * is reported.  A string subject yields a string or NULL in `return_value`;
 * an array subject yields an array keyed like the input.  When `zcount` is
 * non-NULL it receives the total number of replacements.
 */
static void _preg_replace_common(
	zval *return_value,
	HashTable *regex_ht, zend_string *regex_str,
	HashTable *replace_ht, zend_string *replace_str,
	HashTable *subject_ht, zend_string *subject_str,
	zend_long limit,
	zval *zcount,
	bool is_filter
) {
	size_t total = 0;

	/* An array of replacements only makes sense with an array of patterns. */
	if (replace_ht && !regex_ht) {
		zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given");
		RETURN_THROWS();
	}

	if (subject_str) {
		const size_t before = total;
		zend_string *replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
			subject_str, limit, &total);

		if (replaced == NULL) {
			RETVAL_NULL();
		} else if (is_filter && total <= before) {
			/* Filtering and nothing changed: discard the result. */
			zend_string_release_ex(replaced, 0);
			RETVAL_NULL();
		} else {
			RETVAL_STR(replaced);
		}
	} else {
		zval		*entry;
		zend_string	*key;
		zend_ulong	 idx;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));
		HashTable *out_ht = Z_ARRVAL_P(return_value);

		/* Stringify each entry, replace in it, and keep the result under the same key. */
		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, idx, key, entry) {
			const size_t before = total;
			zend_string *entry_tmp;
			zend_string *entry_str = zval_get_tmp_string(entry, &entry_tmp);
			zend_string *replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
				entry_str, limit, &total);

			if (replaced != NULL) {
				if (is_filter && total <= before) {
					/* Filtering and nothing changed: drop this entry. */
					zend_string_release_ex(replaced, 0);
				} else {
					zval slot;

					ZVAL_STR(&slot, replaced);
					if (key) {
						zend_hash_add_new(out_ht, key, &slot);
					} else {
						zend_hash_index_add_new(out_ht, idx, &slot);
					}
				}
			}
			zend_tmp_string_release(entry_tmp);
		} ZEND_HASH_FOREACH_END();
	}

	if (zcount) {
		ZEND_TRY_ASSIGN_REF_LONG(zcount, total);
	}
}
2338 
2339 /* {{{ preg_replace_common */
/*
 * Parse the userland arguments shared by preg_replace() and preg_filter()
 * (pattern, replacement, subject, optional limit and by-reference count) and
 * forward them to _preg_replace_common().
 */
static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
{
	zend_string	*regex_str;
	zend_string	*replace_str;
	zend_string	*subject_str;
	HashTable	*regex_ht;
	HashTable	*replace_ht;
	HashTable	*subject_ht;
	zend_long	 limit = -1;	/* Default when the caller passes no limit */
	zval		*zcount = NULL;	/* Stays NULL when no count argument is passed */

	/* Each of the first three arguments may be a string or an array. */
	ZEND_PARSE_PARAMETERS_START(3, 5)
		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
		Z_PARAM_ARRAY_HT_OR_STR(replace_ht, replace_str)
		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(limit)
		Z_PARAM_ZVAL(zcount)
	ZEND_PARSE_PARAMETERS_END();

	_preg_replace_common(
		return_value,
		regex_ht, regex_str,
		replace_ht, replace_str,
		subject_ht, subject_str,
		limit, zcount, is_filter);
}
2364 /* }}} */
2365 
2366 /* {{{ Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)2367 PHP_FUNCTION(preg_replace)
2368 {
2369 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
2370 }
2371 /* }}} */
2372 
2373 ZEND_FRAMELESS_FUNCTION(preg_replace, 3)
2374 {
2375 	zend_string *regex_str, *replace_str, *subject_str;
2376 	HashTable *regex_ht, *replace_ht, *subject_ht;
2377 	zval regex_tmp, replace_tmp, subject_tmp;
2378 
2379 	Z_FLF_PARAM_ARRAY_HT_OR_STR(1, regex_ht, regex_str, regex_tmp);
2380 	Z_FLF_PARAM_ARRAY_HT_OR_STR(2, replace_ht, replace_str, replace_tmp);
2381 	Z_FLF_PARAM_ARRAY_HT_OR_STR(3, subject_ht, subject_str, subject_tmp);
2382 
2383 	_preg_replace_common(
2384 		return_value,
2385 		regex_ht, regex_str,
2386 		replace_ht, replace_str,
2387 		subject_ht, subject_str,
2388 		/* limit */ -1, /* zcount */ NULL, /* is_filter */ false);
2389 
2390 flf_clean:;
2391 	Z_FLF_PARAM_FREE_STR(1, regex_tmp);
2392 	Z_FLF_PARAM_FREE_STR(2, replace_tmp);
2393 	Z_FLF_PARAM_FREE_STR(3, subject_tmp);
2394 }
2395 
2396 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)2397 PHP_FUNCTION(preg_replace_callback)
2398 {
2399 	zval *zcount = NULL;
2400 	zend_string *regex_str;
2401 	HashTable *regex_ht;
2402 	zend_string *subject_str;
2403 	HashTable *subject_ht;
2404 	zend_long limit = -1, flags = 0;
2405 	size_t replace_count;
2406 	zend_fcall_info fci;
2407 	zend_fcall_info_cache fcc;
2408 
2409 	/* Get function parameters and do error-checking. */
2410 	ZEND_PARSE_PARAMETERS_START(3, 6)
2411 		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
2412 		Z_PARAM_FUNC(fci, fcc)
2413 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2414 		Z_PARAM_OPTIONAL
2415 		Z_PARAM_LONG(limit)
2416 		Z_PARAM_ZVAL(zcount)
2417 		Z_PARAM_LONG(flags)
2418 	ZEND_PARSE_PARAMETERS_END();
2419 
2420 	replace_count = preg_replace_func_impl(return_value, regex_str, regex_ht,
2421 		&fci, &fcc,
2422 		subject_str, subject_ht, limit, flags);
2423 	if (zcount) {
2424 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2425 	}
2426 }
2427 /* }}} */
2428 
2429 /* {{{ Perform Perl-style regular expression replacement using an array of pattern => callback pairs. */
PHP_FUNCTION(preg_replace_callback_array)2430 PHP_FUNCTION(preg_replace_callback_array)
2431 {
2432 	zval zv, *replace, *zcount = NULL;
2433 	HashTable *pattern, *subject_ht;
2434 	zend_string *subject_str, *str_idx_regex;
2435 	zend_long limit = -1, flags = 0;
2436 	size_t replace_count = 0;
2437 	zend_fcall_info fci;
2438 	zend_fcall_info_cache fcc;
2439 
2440 	/* Get function parameters and do error-checking. */
2441 	ZEND_PARSE_PARAMETERS_START(2, 5)
2442 		Z_PARAM_ARRAY_HT(pattern)
2443 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2444 		Z_PARAM_OPTIONAL
2445 		Z_PARAM_LONG(limit)
2446 		Z_PARAM_ZVAL(zcount)
2447 		Z_PARAM_LONG(flags)
2448 	ZEND_PARSE_PARAMETERS_END();
2449 
2450 	fci.size = sizeof(fci);
2451 	fci.object = NULL;
2452 	fci.named_params = NULL;
2453 
2454 	if (subject_ht) {
2455 		GC_TRY_ADDREF(subject_ht);
2456 	} else {
2457 		GC_TRY_ADDREF(subject_str);
2458 	}
2459 
2460 	ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) {
2461 		if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) {
2462 			zend_argument_type_error(1, "must contain only valid callbacks");
2463 			goto error;
2464 		}
2465 		if (!str_idx_regex) {
2466 			zend_argument_type_error(1, "must contain only string patterns as keys");
2467 			goto error;
2468 		}
2469 
2470 		ZVAL_COPY_VALUE(&fci.function_name, replace);
2471 
2472 		replace_count += preg_replace_func_impl(&zv, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc,
2473 			subject_str, subject_ht, limit, flags);
2474 		switch (Z_TYPE(zv)) {
2475 			case IS_ARRAY:
2476 				ZEND_ASSERT(subject_ht);
2477 				zend_array_release(subject_ht);
2478 				subject_ht = Z_ARR(zv);
2479 				break;
2480 			case IS_STRING:
2481 				ZEND_ASSERT(subject_str);
2482 				zend_string_release(subject_str);
2483 				subject_str = Z_STR(zv);
2484 				break;
2485 			case IS_NULL:
2486 				RETVAL_NULL();
2487 				goto error;
2488 			EMPTY_SWITCH_DEFAULT_CASE()
2489 		}
2490 
2491 		if (EG(exception)) {
2492 			goto error;
2493 		}
2494 	} ZEND_HASH_FOREACH_END();
2495 
2496 	if (zcount) {
2497 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2498 	}
2499 
2500 	if (subject_ht) {
2501 		RETVAL_ARR(subject_ht);
2502 		// Unset the type_flags of immutable arrays to prevent the VM from performing refcounting
2503 		if (GC_FLAGS(subject_ht) & IS_ARRAY_IMMUTABLE) {
2504 			Z_TYPE_FLAGS_P(return_value) = 0;
2505 		}
2506 		return;
2507 	} else {
2508 		RETURN_STR(subject_str);
2509 	}
2510 
2511 error:
2512 	if (subject_ht) {
2513 		zend_array_release(subject_ht);
2514 	} else {
2515 		zend_string_release(subject_str);
2516 	}
2517 }
2518 /* }}} */
2519 
2520 /* {{{ Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)2521 PHP_FUNCTION(preg_filter)
2522 {
2523 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
2524 }
2525 /* }}} */
2526 
2527 /* {{{ Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)2528 PHP_FUNCTION(preg_split)
2529 {
2530 	zend_string			*regex;			/* Regular expression */
2531 	zend_string			*subject;		/* String to match against */
2532 	zend_long			 limit_val = -1;/* Integer value of limit */
2533 	zend_long			 flags = 0;		/* Match control flags */
2534 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2535 
2536 	/* Get function parameters and do error checking */
2537 	ZEND_PARSE_PARAMETERS_START(2, 4)
2538 		Z_PARAM_STR(regex)
2539 		Z_PARAM_STR(subject)
2540 		Z_PARAM_OPTIONAL
2541 		Z_PARAM_LONG(limit_val)
2542 		Z_PARAM_LONG(flags)
2543 	ZEND_PARSE_PARAMETERS_END();
2544 
2545 	/* Compile regex or get it from cache. */
2546 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2547 		RETURN_FALSE;
2548 	}
2549 
2550 	pce->refcount++;
2551 	php_pcre_split_impl(pce, subject, return_value, limit_val, flags);
2552 	pce->refcount--;
2553 }
2554 /* }}} */
2555 
2556 /* {{{ php_pcre_split_impl */
php_pcre_split_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zend_long limit_val,zend_long flags)2557 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
2558 	zend_long limit_val, zend_long flags)
2559 {
2560 	uint32_t		 options;			/* Execution options */
2561 	int				 count;				/* Count of matched subpatterns */
2562 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
2563 	PCRE2_SIZE		 last_match_offset;	/* Location of last match */
2564 	uint32_t		 no_empty;			/* If NO_EMPTY flag is set */
2565 	uint32_t		 delim_capture; 	/* If delimiters should be captured */
2566 	uint32_t		 offset_capture;	/* If offsets should be captured */
2567 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2568 	zval			 tmp;
2569 	pcre2_match_data *match_data;
2570 	char *subject = ZSTR_VAL(subject_str);
2571 
2572 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
2573 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
2574 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
2575 
2576 	/* Initialize return value */
2577 	array_init(return_value);
2578 	HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2579 
2580 	/* Calculate the size of the offsets array, and allocate memory for it. */
2581 	num_subpats = pce->capture_count + 1;
2582 
2583 	/* Start at the beginning of the string */
2584 	start_offset = 0;
2585 	last_match_offset = 0;
2586 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2587 
2588 	if (limit_val == -1) {
2589 		/* pass */
2590 	} else if (limit_val == 0) {
2591 		limit_val = -1;
2592 	} else if (limit_val <= 1) {
2593 		goto last;
2594 	}
2595 
2596 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2597 		match_data = mdata;
2598 	} else {
2599 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2600 		if (!match_data) {
2601 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2602 			zval_ptr_dtor(return_value);
2603 			RETURN_FALSE;
2604 		}
2605 	}
2606 
2607 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2608 
2609 	/* Array of subpattern offsets */
2610 	PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
2611 
2612 #ifdef HAVE_PCRE_JIT_SUPPORT
2613 	if ((pce->preg_options & PREG_JIT) && options) {
2614 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2615 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2616 	} else
2617 #endif
2618 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2619 			options, match_data, mctx);
2620 
2621 	while (1) {
2622 		/* If something matched */
2623 		if (count >= 0) {
2624 			/* Check for too many substrings condition. */
2625 			if (UNEXPECTED(count == 0)) {
2626 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
2627 				count = num_subpats;
2628 			}
2629 
2630 matched:
2631 			if (UNEXPECTED(offsets[1] < offsets[0])) {
2632 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2633 				break;
2634 			}
2635 
2636 			if (!no_empty || offsets[0] != last_match_offset) {
2637 				if (offset_capture) {
2638 					/* Add (match, offset) pair to the return value */
2639 					add_offset_pair(
2640 						return_value_ht, subject, last_match_offset, offsets[0],
2641 						NULL, 0);
2642 				} else {
2643 					/* Add the piece to the return value */
2644 					populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]);
2645 					zend_hash_next_index_insert_new(return_value_ht, &tmp);
2646 				}
2647 
2648 				/* One less left to do */
2649 				if (limit_val != -1)
2650 					limit_val--;
2651 			}
2652 
2653 			if (delim_capture) {
2654 				size_t i;
2655 				for (i = 1; i < count; i++) {
2656 					/* If we have matched a delimiter */
2657 					if (!no_empty || offsets[2*i] != offsets[2*i+1]) {
2658 						if (offset_capture) {
2659 							add_offset_pair(
2660 								return_value_ht, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
2661 						} else {
2662 							populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]);
2663 							zend_hash_next_index_insert_new(return_value_ht, &tmp);
2664 						}
2665 					}
2666 				}
2667 			}
2668 
2669 			/* Advance to the position right after the last full match */
2670 			start_offset = last_match_offset = offsets[1];
2671 
2672 			/* If we have matched an empty string, mimic what Perl's /g options does.
2673 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
2674 			   the match again at the same point. If this fails (picked up above) we
2675 			   advance to the next character. */
2676 			if (start_offset == offsets[0]) {
2677 				/* Get next piece if no limit or limit not yet reached and something matched*/
2678 				if (limit_val != -1 && limit_val <= 1) {
2679 					break;
2680 				}
2681 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2682 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
2683 				if (count >= 0) {
2684 					goto matched;
2685 				} else if (count == PCRE2_ERROR_NOMATCH) {
2686 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
2687 					   this is not necessarily the end. We need to advance
2688 					   the start offset, and continue. Fudge the offset values
2689 					   to achieve this, unless we're already at the end of the string. */
2690 					if (start_offset < ZSTR_LEN(subject_str)) {
2691 						start_offset += calculate_unit_length(pce, subject + start_offset);
2692 					} else {
2693 						break;
2694 					}
2695 				} else {
2696 					goto error;
2697 				}
2698 			}
2699 
2700 		} else if (count == PCRE2_ERROR_NOMATCH) {
2701 			break;
2702 		} else {
2703 error:
2704 			pcre_handle_exec_error(count);
2705 			break;
2706 		}
2707 
2708 		/* Get next piece if no limit or limit not yet reached and something matched*/
2709 		if (limit_val != -1 && limit_val <= 1) {
2710 			break;
2711 		}
2712 
2713 #ifdef HAVE_PCRE_JIT_SUPPORT
2714 		if (pce->preg_options & PREG_JIT) {
2715 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2716 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2717 		} else
2718 #endif
2719 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2720 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2721 	}
2722 	if (match_data != mdata) {
2723 		pcre2_match_data_free(match_data);
2724 	}
2725 
2726 	if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) {
2727 		zval_ptr_dtor(return_value);
2728 		RETURN_FALSE;
2729 	}
2730 
2731 last:
2732 	start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */
2733 
2734 	if (!no_empty || start_offset < ZSTR_LEN(subject_str)) {
2735 		if (offset_capture) {
2736 			/* Add the last (match, offset) pair to the return value */
2737 			add_offset_pair(return_value_ht, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
2738 		} else {
2739 			/* Add the last piece to the return value */
2740 			if (start_offset == 0) {
2741 				ZVAL_STR_COPY(&tmp, subject_str);
2742 			} else {
2743 				populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str));
2744 			}
2745 			zend_hash_next_index_insert_new(return_value_ht, &tmp);
2746 		}
2747 	}
2748 }
2749 /* }}} */
2750 
2751 /* {{{ Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)2752 PHP_FUNCTION(preg_quote)
2753 {
2754 	zend_string *str;       		/* Input string argument */
2755 	zend_string	*delim = NULL;		/* Additional delimiter argument */
2756 	char		*in_str;			/* Input string */
2757 	char		*in_str_end;    	/* End of the input string */
2758 	zend_string	*out_str;			/* Output string with quoted characters */
2759 	size_t       extra_len;         /* Number of additional characters */
2760 	char 		*p,					/* Iterator for input string */
2761 				*q,					/* Iterator for output string */
2762 				 delim_char = '\0',	/* Delimiter character to be quoted */
2763 				 c;					/* Current character */
2764 
2765 	/* Get the arguments and check for errors */
2766 	ZEND_PARSE_PARAMETERS_START(1, 2)
2767 		Z_PARAM_STR(str)
2768 		Z_PARAM_OPTIONAL
2769 		Z_PARAM_STR_OR_NULL(delim)
2770 	ZEND_PARSE_PARAMETERS_END();
2771 
2772 	/* Nothing to do if we got an empty string */
2773 	if (ZSTR_LEN(str) == 0) {
2774 		RETURN_EMPTY_STRING();
2775 	}
2776 
2777 	in_str = ZSTR_VAL(str);
2778 	in_str_end = in_str + ZSTR_LEN(str);
2779 
2780 	if (delim) {
2781 		delim_char = ZSTR_VAL(delim)[0];
2782 	}
2783 
2784 	/* Go through the string and quote necessary characters */
2785 	extra_len = 0;
2786 	p = in_str;
2787 	do {
2788 		c = *p;
2789 		switch(c) {
2790 			case '.':
2791 			case '\\':
2792 			case '+':
2793 			case '*':
2794 			case '?':
2795 			case '[':
2796 			case '^':
2797 			case ']':
2798 			case '$':
2799 			case '(':
2800 			case ')':
2801 			case '{':
2802 			case '}':
2803 			case '=':
2804 			case '!':
2805 			case '>':
2806 			case '<':
2807 			case '|':
2808 			case ':':
2809 			case '-':
2810 			case '#':
2811 				extra_len++;
2812 				break;
2813 
2814 			case '\0':
2815 				extra_len+=3;
2816 				break;
2817 
2818 			default:
2819 				if (c == delim_char) {
2820 					extra_len++;
2821 				}
2822 				break;
2823 		}
2824 		p++;
2825 	} while (p != in_str_end);
2826 
2827 	if (extra_len == 0) {
2828 		RETURN_STR_COPY(str);
2829 	}
2830 
2831 	/* Allocate enough memory so that even if each character
2832 	   is quoted, we won't run out of room */
2833 	out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0);
2834 	q = ZSTR_VAL(out_str);
2835 	p = in_str;
2836 
2837 	do {
2838 		c = *p;
2839 		switch(c) {
2840 			case '.':
2841 			case '\\':
2842 			case '+':
2843 			case '*':
2844 			case '?':
2845 			case '[':
2846 			case '^':
2847 			case ']':
2848 			case '$':
2849 			case '(':
2850 			case ')':
2851 			case '{':
2852 			case '}':
2853 			case '=':
2854 			case '!':
2855 			case '>':
2856 			case '<':
2857 			case '|':
2858 			case ':':
2859 			case '-':
2860 			case '#':
2861 				*q++ = '\\';
2862 				*q++ = c;
2863 				break;
2864 
2865 			case '\0':
2866 				*q++ = '\\';
2867 				*q++ = '0';
2868 				*q++ = '0';
2869 				*q++ = '0';
2870 				break;
2871 
2872 			default:
2873 				if (c == delim_char) {
2874 					*q++ = '\\';
2875 				}
2876 				*q++ = c;
2877 				break;
2878 		}
2879 		p++;
2880 	} while (p != in_str_end);
2881 	*q = '\0';
2882 
2883 	RETURN_NEW_STR(out_str);
2884 }
2885 /* }}} */
2886 
2887 /* {{{ Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)2888 PHP_FUNCTION(preg_grep)
2889 {
2890 	zend_string			*regex;			/* Regular expression */
2891 	zval				*input;			/* Input array */
2892 	zend_long			 flags = 0;		/* Match control flags */
2893 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2894 
2895 	/* Get arguments and do error checking */
2896 	ZEND_PARSE_PARAMETERS_START(2, 3)
2897 		Z_PARAM_STR(regex)
2898 		Z_PARAM_ARRAY(input)
2899 		Z_PARAM_OPTIONAL
2900 		Z_PARAM_LONG(flags)
2901 	ZEND_PARSE_PARAMETERS_END();
2902 
2903 	/* Compile regex or get it from cache. */
2904 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2905 		RETURN_FALSE;
2906 	}
2907 
2908 	pce->refcount++;
2909 	php_pcre_grep_impl(pce, input, return_value, flags);
2910 	pce->refcount--;
2911 }
2912 /* }}} */
2913 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,zend_long flags)2914 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */
2915 {
2916 	zval            *entry;             /* An entry in the input array */
2917 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2918 	int				 count;				/* Count of matched subpatterns */
2919 	uint32_t		 options;			/* Execution options */
2920 	zend_string		*string_key;
2921 	zend_ulong		 num_key;
2922 	bool		 invert;			/* Whether to return non-matching
2923 										   entries */
2924 	pcre2_match_data *match_data;
2925 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
2926 
2927 	/* Calculate the size of the offsets array, and allocate memory for it. */
2928 	num_subpats = pce->capture_count + 1;
2929 
2930 	/* Initialize return array */
2931 	array_init(return_value);
2932 	HashTable *return_value_ht = Z_ARRVAL_P(return_value);
2933 
2934 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2935 
2936 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2937 		match_data = mdata;
2938 	} else {
2939 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2940 		if (!match_data) {
2941 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2942 			return;
2943 		}
2944 	}
2945 
2946 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2947 
2948 	/* Go through the input array */
2949 	ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) {
2950 		zend_string *tmp_subject_str;
2951 		zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str);
2952 
2953 		/* Perform the match */
2954 #ifdef HAVE_PCRE_JIT_SUPPORT
2955 		if ((pce->preg_options & PREG_JIT) && options) {
2956 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2957 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2958 		} else
2959 #endif
2960 		count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2961 				options, match_data, mctx);
2962 
2963 		/* If the entry fits our requirements */
2964 		if (count >= 0) {
2965 			/* Check for too many substrings condition. */
2966 			if (UNEXPECTED(count == 0)) {
2967 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
2968 			}
2969 			if (!invert) {
2970 				Z_TRY_ADDREF_P(entry);
2971 
2972 				/* Add to return array */
2973 				if (string_key) {
2974 					zend_hash_update(return_value_ht, string_key, entry);
2975 				} else {
2976 					zend_hash_index_update(return_value_ht, num_key, entry);
2977 				}
2978 			}
2979 		} else if (count == PCRE2_ERROR_NOMATCH) {
2980 			if (invert) {
2981 				Z_TRY_ADDREF_P(entry);
2982 
2983 				/* Add to return array */
2984 				if (string_key) {
2985 					zend_hash_update(return_value_ht, string_key, entry);
2986 				} else {
2987 					zend_hash_index_update(return_value_ht, num_key, entry);
2988 				}
2989 			}
2990 		} else {
2991 			pcre_handle_exec_error(count);
2992 			zend_tmp_string_release(tmp_subject_str);
2993 			break;
2994 		}
2995 
2996 		zend_tmp_string_release(tmp_subject_str);
2997 	} ZEND_HASH_FOREACH_END();
2998 	if (match_data != mdata) {
2999 		pcre2_match_data_free(match_data);
3000 	}
3001 }
3002 /* }}} */
3003 
3004 /* {{{ Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)3005 PHP_FUNCTION(preg_last_error)
3006 {
3007 	ZEND_PARSE_PARAMETERS_NONE();
3008 
3009 	RETURN_LONG(PCRE_G(error_code));
3010 }
3011 /* }}} */
3012 
3013 /* {{{ Returns the error message of the last regexp execution. */
PHP_FUNCTION(preg_last_error_msg)3014 PHP_FUNCTION(preg_last_error_msg)
3015 {
3016 	ZEND_PARSE_PARAMETERS_NONE();
3017 
3018 	RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code)));
3019 }
3020 /* }}} */
3021 
3022 /* {{{ module definition structures */
3023 
3024 zend_module_entry pcre_module_entry = {
3025 	STANDARD_MODULE_HEADER,
3026 	"pcre",
3027 	ext_functions,
3028 	PHP_MINIT(pcre),
3029 	PHP_MSHUTDOWN(pcre),
3030 	PHP_RINIT(pcre),
3031 	PHP_RSHUTDOWN(pcre),
3032 	PHP_MINFO(pcre),
3033 	PHP_PCRE_VERSION,
3034 	PHP_MODULE_GLOBALS(pcre),
3035 	PHP_GINIT(pcre),
3036 	PHP_GSHUTDOWN(pcre),
3037 	NULL,
3038 	STANDARD_MODULE_PROPERTIES_EX
3039 };
3040 
3041 #ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)3042 ZEND_GET_MODULE(pcre)
3043 #endif
3044 
3045 /* }}} */
3046 
3047 PHPAPI pcre2_match_context *php_pcre_mctx(void)
3048 {/*{{{*/
3049 	return mctx;
3050 }/*}}}*/
3051 
php_pcre_gctx(void)3052 PHPAPI pcre2_general_context *php_pcre_gctx(void)
3053 {/*{{{*/
3054 	return gctx;
3055 }/*}}}*/
3056 
php_pcre_cctx(void)3057 PHPAPI pcre2_compile_context *php_pcre_cctx(void)
3058 {/*{{{*/
3059 	return cctx;
3060 }/*}}}*/
3061 
php_pcre_pce_incref(pcre_cache_entry * pce)3062 PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce)
3063 {/*{{{*/
3064 	assert(NULL != pce);
3065 	pce->refcount++;
3066 }/*}}}*/
3067 
php_pcre_pce_decref(pcre_cache_entry * pce)3068 PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce)
3069 {/*{{{*/
3070 	assert(NULL != pce);
3071 	assert(0 != pce->refcount);
3072 	pce->refcount--;
3073 }/*}}}*/
3074 
php_pcre_pce_re(pcre_cache_entry * pce)3075 PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce)
3076 {/*{{{*/
3077 	assert(NULL != pce);
3078 	return pce->re;
3079 }/*}}}*/
3080