xref: /php-src/ext/pcre/php_pcre.c (revision 712fc54e)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Andrei Zmievski <andrei@php.net>                             |
14    +----------------------------------------------------------------------+
15  */
16 
17 #include "php.h"
18 #include "php_ini.h"
19 #include "php_globals.h"
20 #include "php_pcre.h"
21 #include "php_pcre_arginfo.h"
22 #include "ext/standard/info.h"
23 #include "ext/standard/basic_functions.h"
24 #include "zend_smart_str.h"
25 #include "SAPI.h"
26 
27 #include "ext/standard/php_string.h"
28 
29 #define PREG_PATTERN_ORDER			1
30 #define PREG_SET_ORDER				2
31 #define PREG_OFFSET_CAPTURE			(1<<8)
32 #define PREG_UNMATCHED_AS_NULL		(1<<9)
33 
34 #define	PREG_SPLIT_NO_EMPTY			(1<<0)
35 #define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
36 #define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)
37 
38 #define PREG_REPLACE_EVAL			(1<<0)
39 
40 #define PREG_GREP_INVERT			(1<<0)
41 
42 #define PREG_JIT                    (1<<3)
43 
44 #define PCRE_CACHE_SIZE 4096
45 
/* One compiled pattern together with the metadata gathered at compile time;
 * this is the value type stored in PCRE_G(pcre_cache). */
46 struct _pcre_cache_entry {
47 	pcre2_code *re; /* compiled pattern; released with pcre2_code_free() */
48 	uint32_t preg_options; /* PHP-side PREG_* flags (e.g. PREG_JIT) */
49 	uint32_t capture_count; /* value of PCRE2_INFO_CAPTURECOUNT */
50 	uint32_t name_count; /* value of PCRE2_INFO_NAMECOUNT */
51 	uint32_t compile_options; /* PCRE2_* options passed to pcre2_compile() */
52 	uint32_t refcount; /* entries with a non-zero count are kept by pcre_clean_cache() */
53 };
54 
55 PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre)
56 
57 #ifdef HAVE_PCRE_JIT_SUPPORT
58 #define PCRE_JIT_STACK_MIN_SIZE (32 * 1024)
59 #define PCRE_JIT_STACK_MAX_SIZE (192 * 1024)
60 ZEND_TLS pcre2_jit_stack *jit_stack = NULL;
61 #endif
62 /* General context using (infallible) system allocator. */
63 ZEND_TLS pcre2_general_context *gctx = NULL;
/* The compile and match contexts are global per thread for now, although they
	could also be used per pattern: either copy them into each pce, or drop the
	global contexts entirely and create them for every pce. */
67 ZEND_TLS pcre2_compile_context *cctx = NULL;
68 ZEND_TLS pcre2_match_context   *mctx = NULL;
69 ZEND_TLS pcre2_match_data      *mdata = NULL;
70 ZEND_TLS bool              mdata_used = 0;
71 ZEND_TLS uint8_t pcre2_init_ok = 0;
72 #if defined(ZTS) && defined(HAVE_PCRE_JIT_SUPPORT)
73 static MUTEX_T pcre_mt = NULL;
74 #define php_pcre_mutex_alloc() \
75 	if (tsrm_is_main_thread() && !pcre_mt) pcre_mt = tsrm_mutex_alloc();
76 #define php_pcre_mutex_free() \
77 	if (tsrm_is_main_thread() && pcre_mt) { tsrm_mutex_free(pcre_mt); pcre_mt = NULL; }
78 #define php_pcre_mutex_lock() tsrm_mutex_lock(pcre_mt);
79 #define php_pcre_mutex_unlock() tsrm_mutex_unlock(pcre_mt);
80 #else
81 #define php_pcre_mutex_alloc()
82 #define php_pcre_mutex_free()
83 #define php_pcre_mutex_lock()
84 #define php_pcre_mutex_unlock()
85 #endif
86 
87 ZEND_TLS HashTable char_tables;
88 
/* Hash destructor for char_tables: frees the character table stored as the
 * entry's pointer payload (pefree with the persistent flag set). */
static void php_pcre_free_char_table(zval *data)
{/*{{{*/
	pefree(Z_PTR_P(data), 1);
}/*}}}*/
94 
pcre_handle_exec_error(int pcre_code)95 static void pcre_handle_exec_error(int pcre_code) /* {{{ */
96 {
97 	int preg_code = 0;
98 
99 	switch (pcre_code) {
100 		case PCRE2_ERROR_MATCHLIMIT:
101 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
102 			break;
103 
104 		case PCRE2_ERROR_RECURSIONLIMIT:
105 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
106 			break;
107 
108 		case PCRE2_ERROR_BADUTFOFFSET:
109 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
110 			break;
111 
112 #ifdef HAVE_PCRE_JIT_SUPPORT
113 		case PCRE2_ERROR_JIT_STACKLIMIT:
114 			preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR;
115 			break;
116 #endif
117 
118 		default:
119 			if (pcre_code <= PCRE2_ERROR_UTF8_ERR1 && pcre_code >= PCRE2_ERROR_UTF8_ERR21) {
120 				preg_code = PHP_PCRE_BAD_UTF8_ERROR;
121 			} else  {
122 				preg_code = PHP_PCRE_INTERNAL_ERROR;
123 			}
124 			break;
125 	}
126 
127 	PCRE_G(error_code) = preg_code;
128 }
129 /* }}} */
130 
/* Returns the static, human-readable message for a PHP_PCRE_* error code;
 * codes without a dedicated message yield "Unknown error". */
static const char *php_pcre_get_error_msg(php_pcre_error_code error_code) /* {{{ */
{
	const char *msg = "Unknown error";

	switch (error_code) {
		case PHP_PCRE_NO_ERROR:
			msg = "No error";
			break;
		case PHP_PCRE_INTERNAL_ERROR:
			msg = "Internal error";
			break;
		case PHP_PCRE_BAD_UTF8_ERROR:
			msg = "Malformed UTF-8 characters, possibly incorrectly encoded";
			break;
		case PHP_PCRE_BAD_UTF8_OFFSET_ERROR:
			msg = "The offset did not correspond to the beginning of a valid UTF-8 code point";
			break;
		case PHP_PCRE_BACKTRACK_LIMIT_ERROR:
			msg = "Backtrack limit exhausted";
			break;
		case PHP_PCRE_RECURSION_LIMIT_ERROR:
			msg = "Recursion limit exhausted";
			break;

#ifdef HAVE_PCRE_JIT_SUPPORT
		case PHP_PCRE_JIT_STACKLIMIT_ERROR:
			msg = "JIT stack limit exhausted";
			break;
#endif

		default:
			break;
	}

	return msg;
}
/* }}} */
157 
/* Destructor for cache entries allocated with the system allocator: frees
 * the compiled pattern, then the entry itself with free(). */
static void php_free_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		free(entry);
	}
}
/* }}} */
166 
/* Destructor for cache entries allocated with emalloc: frees the compiled
 * pattern, then the entry itself with efree(). */
static void php_efree_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry != NULL) {
		pcre2_code_free(entry->re);
		efree(entry);
	}
}
/* }}} */
175 
/* PCRE2 allocation callback: pemalloc() with the persistent flag set.
 * The user-data pointer is not used. */
static void *php_pcre_malloc(PCRE2_SIZE size, void *data)
{
	(void) data;

	return pemalloc(size, 1);
}
180 
/* PCRE2 release callback paired with php_pcre_malloc(): pefree() with the
 * persistent flag set. The user-data pointer is not used. */
static void php_pcre_free(void *block, void *data)
{
	(void) data;

	pefree(block, 1);
}
185 
/* PCRE2 allocation callback backed by emalloc(). The user-data pointer is
 * not used. */
static void *php_pcre_emalloc(PCRE2_SIZE size, void *data)
{
	(void) data;

	return emalloc(size);
}
190 
/* PCRE2 release callback paired with php_pcre_emalloc(): efree(). The
 * user-data pointer is not used. */
static void php_pcre_efree(void *block, void *data)
{
	(void) data;

	efree(block);
}
195 
196 #ifdef PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
197 	/* pcre 10.38 needs PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK, disabled by default */
198 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK
199 #else
200 #define PHP_PCRE_DEFAULT_EXTRA_COPTIONS 0
201 #endif
202 
203 #define PHP_PCRE_PREALLOC_MDATA_SIZE 32
204 
php_pcre_init_pcre2(uint8_t jit)205 static void php_pcre_init_pcre2(uint8_t jit)
206 {/*{{{*/
207 	if (!gctx) {
208 		gctx = pcre2_general_context_create(php_pcre_malloc, php_pcre_free, NULL);
209 		if (!gctx) {
210 			pcre2_init_ok = 0;
211 			return;
212 		}
213 	}
214 
215 	if (!cctx) {
216 		cctx = pcre2_compile_context_create(gctx);
217 		if (!cctx) {
218 			pcre2_init_ok = 0;
219 			return;
220 		}
221 	}
222 
223 	pcre2_set_compile_extra_options(cctx, PHP_PCRE_DEFAULT_EXTRA_COPTIONS);
224 
225 	if (!mctx) {
226 		mctx = pcre2_match_context_create(gctx);
227 		if (!mctx) {
228 			pcre2_init_ok = 0;
229 			return;
230 		}
231 	}
232 
233 #ifdef HAVE_PCRE_JIT_SUPPORT
234 	if (jit && !jit_stack) {
235 		jit_stack = pcre2_jit_stack_create(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE, gctx);
236 		if (!jit_stack) {
237 			pcre2_init_ok = 0;
238 			return;
239 		}
240 	}
241 #endif
242 
243 	if (!mdata) {
244 		mdata = pcre2_match_data_create(PHP_PCRE_PREALLOC_MDATA_SIZE, gctx);
245 		if (!mdata) {
246 			pcre2_init_ok = 0;
247 			return;
248 		}
249 	}
250 
251 	pcre2_init_ok = 1;
252 }/*}}}*/
253 
php_pcre_shutdown_pcre2(void)254 static void php_pcre_shutdown_pcre2(void)
255 {/*{{{*/
256 	if (gctx) {
257 		pcre2_general_context_free(gctx);
258 		gctx = NULL;
259 	}
260 
261 	if (cctx) {
262 		pcre2_compile_context_free(cctx);
263 		cctx = NULL;
264 	}
265 
266 	if (mctx) {
267 		pcre2_match_context_free(mctx);
268 		mctx = NULL;
269 	}
270 
271 #ifdef HAVE_PCRE_JIT_SUPPORT
272 	/* Stack may only be destroyed when no cached patterns
273 	 	possibly associated with it do exist. */
274 	if (jit_stack) {
275 		pcre2_jit_stack_free(jit_stack);
276 		jit_stack = NULL;
277 	}
278 #endif
279 
280 	if (mdata) {
281 		pcre2_match_data_free(mdata);
282 		mdata = NULL;
283 	}
284 
285 	pcre2_init_ok = 0;
286 }/*}}}*/
287 
PHP_GINIT_FUNCTION(pcre)288 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
289 {
290 	php_pcre_mutex_alloc();
291 
292 	/* If we're on the CLI SAPI, there will only be one request, so we don't need the
293 	 * cache to survive after RSHUTDOWN. */
294 	pcre_globals->per_request_cache = strcmp(sapi_module.name, "cli") == 0;
295 	if (!pcre_globals->per_request_cache) {
296 		zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
297 	}
298 
299 	pcre_globals->backtrack_limit = 0;
300 	pcre_globals->recursion_limit = 0;
301 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
302 	ZVAL_UNDEF(&pcre_globals->unmatched_null_pair);
303 	ZVAL_UNDEF(&pcre_globals->unmatched_empty_pair);
304 #ifdef HAVE_PCRE_JIT_SUPPORT
305 	pcre_globals->jit = 1;
306 #endif
307 
308 	php_pcre_init_pcre2(1);
309 	zend_hash_init(&char_tables, 1, NULL, php_pcre_free_char_table, 1);
310 }
311 /* }}} */
312 
PHP_GSHUTDOWN_FUNCTION(pcre)313 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
314 {
315 	if (!pcre_globals->per_request_cache) {
316 		zend_hash_destroy(&pcre_globals->pcre_cache);
317 	}
318 
319 	php_pcre_shutdown_pcre2();
320 	zend_hash_destroy(&char_tables);
321 	php_pcre_mutex_free();
322 }
323 /* }}} */
324 
PHP_INI_MH(OnUpdateBacktrackLimit)325 static PHP_INI_MH(OnUpdateBacktrackLimit)
326 {/*{{{*/
327 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
328 	if (mctx) {
329 		pcre2_set_match_limit(mctx, (uint32_t)PCRE_G(backtrack_limit));
330 	}
331 
332 	return SUCCESS;
333 }/*}}}*/
334 
PHP_INI_MH(OnUpdateRecursionLimit)335 static PHP_INI_MH(OnUpdateRecursionLimit)
336 {/*{{{*/
337 	OnUpdateLong(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
338 	if (mctx) {
339 		pcre2_set_depth_limit(mctx, (uint32_t)PCRE_G(recursion_limit));
340 	}
341 
342 	return SUCCESS;
343 }/*}}}*/
344 
345 #ifdef HAVE_PCRE_JIT_SUPPORT
PHP_INI_MH(OnUpdateJit)346 static PHP_INI_MH(OnUpdateJit)
347 {/*{{{*/
348 	OnUpdateBool(entry, new_value, mh_arg1, mh_arg2, mh_arg3, stage);
349 	if (PCRE_G(jit) && jit_stack) {
350 		pcre2_jit_stack_assign(mctx, NULL, jit_stack);
351 	} else {
352 		pcre2_jit_stack_assign(mctx, NULL, NULL);
353 	}
354 
355 	return SUCCESS;
356 }/*}}}*/
357 #endif
358 
/* INI settings of the extension. Each entry names its default value, the
 * OnUpdate* handler defined above and the zend_pcre_globals field it is
 * bound to; pcre.jit only exists when JIT support is compiled in. */
359 PHP_INI_BEGIN()
360 	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateBacktrackLimit, backtrack_limit, zend_pcre_globals, pcre_globals)
361 	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000",  PHP_INI_ALL, OnUpdateRecursionLimit, recursion_limit, zend_pcre_globals, pcre_globals)
362 #ifdef HAVE_PCRE_JIT_SUPPORT
363 	STD_PHP_INI_ENTRY("pcre.jit",             "1",       PHP_INI_ALL, OnUpdateJit, jit,             zend_pcre_globals, pcre_globals)
364 #endif
PHP_INI_END()365 PHP_INI_END()
366 
/*
 * Returns a malloc()ed, NUL-terminated copy of the string-valued PCRE2
 * configuration item `what`, or NULL when the item is unavailable, empty, or
 * memory could not be allocated. The caller owns the result and releases it
 * with free().
 */
static char *_pcre2_config_str(uint32_t what)
{/*{{{*/
	/* With a NULL buffer pcre2_config() only reports the required length;
	 * a negative result is an error code and must not be used as a size. */
	int len = pcre2_config(what, NULL);
	char *ret;

	if (len <= 0) {
		return NULL;
	}

	ret = malloc((size_t) len + 1);
	if (!ret) {
		return NULL;
	}

	/* The second call fills the buffer; treat errors like an empty result. */
	if (pcre2_config(what, ret) <= 0) {
		free(ret);
		return NULL;
	}

	return ret;
}/*}}}*/
380 
381 /* {{{ PHP_MINFO_FUNCTION(pcre) */
PHP_MINFO_FUNCTION(pcre)382 static PHP_MINFO_FUNCTION(pcre)
383 {
384 #ifdef HAVE_PCRE_JIT_SUPPORT
385 	uint32_t flag = 0;
386 	char *jit_target = _pcre2_config_str(PCRE2_CONFIG_JITTARGET);
387 #endif
388 	char *version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
389 	char *unicode = _pcre2_config_str(PCRE2_CONFIG_UNICODE_VERSION);
390 
391 	php_info_print_table_start();
392 	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
393 	php_info_print_table_row(2, "PCRE Library Version", version);
394 	free(version);
395 	php_info_print_table_row(2, "PCRE Unicode Version", unicode);
396 	free(unicode);
397 
398 #ifdef HAVE_PCRE_JIT_SUPPORT
399 	if (!pcre2_config(PCRE2_CONFIG_JIT, &flag)) {
400 		php_info_print_table_row(2, "PCRE JIT Support", flag ? "enabled" : "disabled");
401 	} else {
402 		php_info_print_table_row(2, "PCRE JIT Support", "unknown" );
403 	}
404 	if (jit_target) {
405 		php_info_print_table_row(2, "PCRE JIT Target", jit_target);
406 	}
407 	free(jit_target);
408 #else
409 	php_info_print_table_row(2, "PCRE JIT Support", "not compiled in" );
410 #endif
411 
412 #ifdef HAVE_PCRE_VALGRIND_SUPPORT
413 	php_info_print_table_row(2, "PCRE Valgrind Support", "enabled" );
414 #endif
415 
416 	php_info_print_table_end();
417 
418 	DISPLAY_INI_ENTRIES();
419 }
420 /* }}} */
421 
422 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)423 static PHP_MINIT_FUNCTION(pcre)
424 {
425 	char *version;
426 
427 #ifdef HAVE_PCRE_JIT_SUPPORT
428 	if (UNEXPECTED(!pcre2_init_ok)) {
429 		/* Retry. */
430 		php_pcre_init_pcre2(PCRE_G(jit));
431 		if (!pcre2_init_ok) {
432 			return FAILURE;
433 		}
434 	}
435 #endif
436 
437 	REGISTER_INI_ENTRIES();
438 
439 	REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
440 	REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
441 	REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
442 	REGISTER_LONG_CONSTANT("PREG_UNMATCHED_AS_NULL", PREG_UNMATCHED_AS_NULL, CONST_CS | CONST_PERSISTENT);
443 	REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
444 	REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
445 	REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
446 	REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);
447 
448 	REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
449 	REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
450 	REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
451 	REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
452 	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
453 	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
454 	REGISTER_LONG_CONSTANT("PREG_JIT_STACKLIMIT_ERROR", PHP_PCRE_JIT_STACKLIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
455 	version = _pcre2_config_str(PCRE2_CONFIG_VERSION);
456 	REGISTER_STRING_CONSTANT("PCRE_VERSION", version, CONST_CS | CONST_PERSISTENT);
457 	free(version);
458 	REGISTER_LONG_CONSTANT("PCRE_VERSION_MAJOR", PCRE2_MAJOR, CONST_CS | CONST_PERSISTENT);
459 	REGISTER_LONG_CONSTANT("PCRE_VERSION_MINOR", PCRE2_MINOR, CONST_CS | CONST_PERSISTENT);
460 
461 #ifdef HAVE_PCRE_JIT_SUPPORT
462 	REGISTER_BOOL_CONSTANT("PCRE_JIT_SUPPORT", 1, CONST_CS | CONST_PERSISTENT);
463 #else
464 	REGISTER_BOOL_CONSTANT("PCRE_JIT_SUPPORT", 0, CONST_CS | CONST_PERSISTENT);
465 #endif
466 
467 	return SUCCESS;
468 }
469 /* }}} */
470 
471 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
/* Module shutdown: unregisters the INI entries of the extension and reports
 * SUCCESS. */
PHP_MSHUTDOWN_FUNCTION(pcre)472 static PHP_MSHUTDOWN_FUNCTION(pcre)
473 {
474 	UNREGISTER_INI_ENTRIES();
475 
476 	return SUCCESS;
477 }
478 /* }}} */
479 
480 /* {{{ PHP_RINIT_FUNCTION(pcre) */
PHP_RINIT_FUNCTION(pcre)481 static PHP_RINIT_FUNCTION(pcre)
482 {
483 #ifdef HAVE_PCRE_JIT_SUPPORT
484 	if (UNEXPECTED(!pcre2_init_ok)) {
485 		/* Retry. */
486 		php_pcre_mutex_lock();
487 		php_pcre_init_pcre2(PCRE_G(jit));
488 		if (!pcre2_init_ok) {
489 			php_pcre_mutex_unlock();
490 			return FAILURE;
491 		}
492 		php_pcre_mutex_unlock();
493 	}
494 
495 	mdata_used = 0;
496 #endif
497 
498 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
499 	PCRE_G(gctx_zmm) = pcre2_general_context_create(php_pcre_emalloc, php_pcre_efree, NULL);
500 	if (!PCRE_G(gctx_zmm)) {
501 		return FAILURE;
502 	}
503 
504 	if (PCRE_G(per_request_cache)) {
505 		zend_hash_init(&PCRE_G(pcre_cache), 0, NULL, php_efree_pcre_cache, 0);
506 	}
507 
508 	return SUCCESS;
509 }
510 /* }}} */
511 
PHP_RSHUTDOWN_FUNCTION(pcre)512 static PHP_RSHUTDOWN_FUNCTION(pcre)
513 {
514 	pcre2_general_context_free(PCRE_G(gctx_zmm));
515 	PCRE_G(gctx_zmm) = NULL;
516 
517 	if (PCRE_G(per_request_cache)) {
518 		zend_hash_destroy(&PCRE_G(pcre_cache));
519 	}
520 
521 	zval_ptr_dtor(&PCRE_G(unmatched_null_pair));
522 	zval_ptr_dtor(&PCRE_G(unmatched_empty_pair));
523 	ZVAL_UNDEF(&PCRE_G(unmatched_null_pair));
524 	ZVAL_UNDEF(&PCRE_G(unmatched_empty_pair));
525 	return SUCCESS;
526 }
527 
528 /* {{{ static pcre_clean_cache */
/* Hash apply callback used to evict cache entries. `arg` points to an int
 * holding how many entries may still be removed; an entry is removed (and
 * the counter decremented) only while that budget is positive and the
 * entry's refcount is zero. */
static int pcre_clean_cache(zval *data, void *arg)
{
	const pcre_cache_entry *entry = (const pcre_cache_entry *) Z_PTR_P(data);
	int *remaining = (int *) arg;

	if (*remaining <= 0 || entry->refcount != 0) {
		return ZEND_HASH_APPLY_KEEP;
	}

	--*remaining;
	return ZEND_HASH_APPLY_REMOVE;
}
541 /* }}} */
542 
/* Releases every non-NULL name in a table of `num_subpats` slots, then the
 * table itself. */
static void free_subpats_table(zend_string **subpat_names, uint32_t num_subpats) {
	uint32_t slot = 0;

	while (slot != num_subpats) {
		zend_string *name = subpat_names[slot++];

		if (name != NULL) {
			zend_string_release(name);
		}
	}

	efree(subpat_names);
}
552 
553 /* {{{ static make_subpats_table */
make_subpats_table(uint32_t num_subpats,pcre_cache_entry * pce)554 static zend_string **make_subpats_table(uint32_t num_subpats, pcre_cache_entry *pce)
555 {
556 	uint32_t name_cnt = pce->name_count, name_size, ni = 0;
557 	char *name_table;
558 	zend_string **subpat_names;
559 	int rc1, rc2;
560 
561 	rc1 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMETABLE, &name_table);
562 	rc2 = pcre2_pattern_info(pce->re, PCRE2_INFO_NAMEENTRYSIZE, &name_size);
563 	if (rc1 < 0 || rc2 < 0) {
564 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc1 < 0 ? rc1 : rc2);
565 		return NULL;
566 	}
567 
568 	subpat_names = ecalloc(num_subpats, sizeof(zend_string *));
569 	while (ni++ < name_cnt) {
570 		unsigned short name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
571 		const char *name = name_table + 2;
572 		subpat_names[name_idx] = zend_string_init(name, strlen(name), 0);
573 		if (is_numeric_string(ZSTR_VAL(subpat_names[name_idx]), ZSTR_LEN(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
574 			php_error_docref(NULL, E_WARNING, "Numeric named subpatterns are not allowed");
575 			free_subpats_table(subpat_names, num_subpats);
576 			return NULL;
577 		}
578 		name_table += name_size;
579 	}
580 	return subpat_names;
581 }
582 /* }}} */
583 
584 /* {{{ static calculate_unit_length */
585 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE2_UTF. */
calculate_unit_length(pcre_cache_entry * pce,const char * start)586 static zend_always_inline size_t calculate_unit_length(pcre_cache_entry *pce, const char *start)
587 {
588 	size_t unit_len;
589 
590 	if (pce->compile_options & PCRE2_UTF) {
591 		const char *end = start;
592 
593 		/* skip continuation bytes */
594 		while ((*++end & 0xC0) == 0x80);
595 		unit_len = end - start;
596 	} else {
597 		unit_len = 1;
598 	}
599 	return unit_len;
600 }
601 /* }}} */
602 
603 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache_ex(zend_string * regex,int locale_aware)604 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache_ex(zend_string *regex, int locale_aware)
605 {
606 	pcre2_code			*re = NULL;
607 #if 10 == PCRE2_MAJOR && 37 == PCRE2_MINOR && !HAVE_BUNDLED_PCRE
608 	uint32_t			 coptions = PCRE2_NO_START_OPTIMIZE;
609 #else
610 	uint32_t			 coptions = 0;
611 #endif
612 	PCRE2_UCHAR	         error[128];
613 	PCRE2_SIZE           erroffset;
614 	int                  errnumber;
615 	char				 delimiter;
616 	char				 start_delimiter;
617 	char				 end_delimiter;
618 	char				*p, *pp;
619 	char				*pattern;
620 	size_t				 pattern_len;
621 	uint32_t			 poptions = 0;
622 	const uint8_t       *tables = NULL;
623 	zval                *zv;
624 	pcre_cache_entry	 new_entry;
625 	int					 rc;
626 	zend_string 		*key;
627 	pcre_cache_entry *ret;
628 
629 	if (locale_aware && BG(ctype_string)) {
630 		key = zend_string_concat2(
631 			ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)),
632 			ZSTR_VAL(regex), ZSTR_LEN(regex));
633 	} else {
634 		key = regex;
635 	}
636 
637 	/* Try to lookup the cached regex entry, and if successful, just pass
638 	   back the compiled pattern, otherwise go on and compile it. */
639 	zv = zend_hash_find(&PCRE_G(pcre_cache), key);
640 	if (zv) {
641 		if (key != regex) {
642 			zend_string_release_ex(key, 0);
643 		}
644 		return (pcre_cache_entry*)Z_PTR_P(zv);
645 	}
646 
647 	p = ZSTR_VAL(regex);
648 
649 	/* Parse through the leading whitespace, and display a warning if we
650 	   get to the end without encountering a delimiter. */
651 	while (isspace((int)*(unsigned char *)p)) p++;
652 	if (*p == 0) {
653 		if (key != regex) {
654 			zend_string_release_ex(key, 0);
655 		}
656 		php_error_docref(NULL, E_WARNING,
657 						 p < ZSTR_VAL(regex) + ZSTR_LEN(regex) ? "Null byte in regex" : "Empty regular expression");
658 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
659 		return NULL;
660 	}
661 
662 	/* Get the delimiter and display a warning if it is alphanumeric
663 	   or a backslash. */
664 	delimiter = *p++;
665 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
666 		if (key != regex) {
667 			zend_string_release_ex(key, 0);
668 		}
669 		php_error_docref(NULL,E_WARNING, "Delimiter must not be alphanumeric or backslash");
670 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
671 		return NULL;
672 	}
673 
674 	start_delimiter = delimiter;
675 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
676 		delimiter = pp[5];
677 	end_delimiter = delimiter;
678 
679 	pp = p;
680 
681 	if (start_delimiter == end_delimiter) {
682 		/* We need to iterate through the pattern, searching for the ending delimiter,
683 		   but skipping the backslashed delimiters.  If the ending delimiter is not
684 		   found, display a warning. */
685 		while (*pp != 0) {
686 			if (*pp == '\\' && pp[1] != 0) pp++;
687 			else if (*pp == delimiter)
688 				break;
689 			pp++;
690 		}
691 	} else {
692 		/* We iterate through the pattern, searching for the matching ending
693 		 * delimiter. For each matching starting delimiter, we increment nesting
694 		 * level, and decrement it for each matching ending delimiter. If we
695 		 * reach the end of the pattern without matching, display a warning.
696 		 */
697 		int brackets = 1; 	/* brackets nesting level */
698 		while (*pp != 0) {
699 			if (*pp == '\\' && pp[1] != 0) pp++;
700 			else if (*pp == end_delimiter && --brackets <= 0)
701 				break;
702 			else if (*pp == start_delimiter)
703 				brackets++;
704 			pp++;
705 		}
706 	}
707 
708 	if (*pp == 0) {
709 		if (key != regex) {
710 			zend_string_release_ex(key, 0);
711 		}
712 		if (pp < ZSTR_VAL(regex) + ZSTR_LEN(regex)) {
713 			php_error_docref(NULL,E_WARNING, "Null byte in regex");
714 		} else if (start_delimiter == end_delimiter) {
715 			php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter);
716 		} else {
717 			php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
718 		}
719 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
720 		return NULL;
721 	}
722 
723 	/* Make a copy of the actual pattern. */
724 	pattern_len = pp - p;
725 	pattern = estrndup(p, pattern_len);
726 
727 	/* Move on to the options */
728 	pp++;
729 
730 	/* Parse through the options, setting appropriate flags.  Display
731 	   a warning if we encounter an unknown modifier. */
732 	while (pp < ZSTR_VAL(regex) + ZSTR_LEN(regex)) {
733 		switch (*pp++) {
734 			/* Perl compatible options */
735 			case 'i':	coptions |= PCRE2_CASELESS;		break;
736 			case 'm':	coptions |= PCRE2_MULTILINE;		break;
737 			case 'n':	coptions |= PCRE2_NO_AUTO_CAPTURE;	break;
738 			case 's':	coptions |= PCRE2_DOTALL;		break;
739 			case 'x':	coptions |= PCRE2_EXTENDED;		break;
740 
741 			/* PCRE specific options */
742 			case 'A':	coptions |= PCRE2_ANCHORED;		break;
743 			case 'D':	coptions |= PCRE2_DOLLAR_ENDONLY;break;
744 			case 'S':	/* Pass. */					break;
745 			case 'X':	/* Pass. */					break;
746 			case 'U':	coptions |= PCRE2_UNGREEDY;		break;
747 			case 'u':	coptions |= PCRE2_UTF;
748 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
749 	   characters, even in UTF-8 mode. However, this can be changed by setting
750 	   the PCRE2_UCP option. */
751 #ifdef PCRE2_UCP
752 						coptions |= PCRE2_UCP;
753 #endif
754 				break;
755 			case 'J':	coptions |= PCRE2_DUPNAMES;		break;
756 
757 			/* Custom preg options */
758 			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;
759 
760 			case ' ':
761 			case '\n':
762 			case '\r':
763 				break;
764 
765 			default:
766 				if (pp[-1]) {
767 					php_error_docref(NULL,E_WARNING, "Unknown modifier '%c'", pp[-1]);
768 				} else {
769 					php_error_docref(NULL,E_WARNING, "Null byte in regex");
770 				}
771 				pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
772 				efree(pattern);
773 				if (key != regex) {
774 					zend_string_release_ex(key, 0);
775 				}
776 				return NULL;
777 		}
778 	}
779 
780 	if (poptions & PREG_REPLACE_EVAL) {
781 		php_error_docref(NULL, E_WARNING, "The /e modifier is no longer supported, use preg_replace_callback instead");
782 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
783 		efree(pattern);
784 		if (key != regex) {
785 			zend_string_release_ex(key, 0);
786 		}
787 		return NULL;
788 	}
789 
790 	if (key != regex) {
791 		tables = (uint8_t *)zend_hash_find_ptr(&char_tables, BG(ctype_string));
792 		if (!tables) {
793 			zend_string *_k;
794 			tables = pcre2_maketables(gctx);
795 			if (UNEXPECTED(!tables)) {
796 				php_error_docref(NULL,E_WARNING, "Failed to generate locale character tables");
797 				pcre_handle_exec_error(PCRE2_ERROR_NOMEMORY);
798 				zend_string_release_ex(key, 0);
799 				efree(pattern);
800 				return NULL;
801 			}
802 			_k = zend_string_init(ZSTR_VAL(BG(ctype_string)), ZSTR_LEN(BG(ctype_string)), 1);
803 			GC_MAKE_PERSISTENT_LOCAL(_k);
804 			zend_hash_add_ptr(&char_tables, _k, (void *)tables);
805 			zend_string_release(_k);
806 		}
807 	}
808 	pcre2_set_character_tables(cctx, tables);
809 
810 	/* Compile pattern and display a warning if compilation failed. */
811 	re = pcre2_compile((PCRE2_SPTR)pattern, pattern_len, coptions, &errnumber, &erroffset, cctx);
812 
813 	if (re == NULL) {
814 		if (key != regex) {
815 			zend_string_release_ex(key, 0);
816 		}
817 		pcre2_get_error_message(errnumber, error, sizeof(error));
818 		php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %zu", error, erroffset);
819 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
820 		efree(pattern);
821 		return NULL;
822 	}
823 
824 #ifdef HAVE_PCRE_JIT_SUPPORT
825 	if (PCRE_G(jit)) {
826 		/* Enable PCRE JIT compiler */
827 		rc = pcre2_jit_compile(re, PCRE2_JIT_COMPLETE);
828 		if (EXPECTED(rc >= 0)) {
829 			size_t jit_size = 0;
830 			if (!pcre2_pattern_info(re, PCRE2_INFO_JITSIZE, &jit_size) && jit_size > 0) {
831 				poptions |= PREG_JIT;
832 			}
833 		} else if (rc == PCRE2_ERROR_NOMEMORY) {
834 			php_error_docref(NULL, E_WARNING,
835 				"Allocation of JIT memory failed, PCRE JIT will be disabled. "
836 				"This is likely caused by security restrictions. "
837 				"Either grant PHP permission to allocate executable memory, or set pcre.jit=0");
838 			PCRE_G(jit) = 0;
839 		} else {
840 			pcre2_get_error_message(rc, error, sizeof(error));
841 			php_error_docref(NULL, E_WARNING, "JIT compilation failed: %s", error);
842 			pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
843 		}
844 	}
845 #endif
846 	efree(pattern);
847 
848 	/*
849 	 * If we reached cache limit, clean out the items from the head of the list;
850 	 * these are supposedly the oldest ones (but not necessarily the least used
851 	 * ones).
852 	 */
853 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
854 		int num_clean = PCRE_CACHE_SIZE / 8;
855 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean);
856 	}
857 
858 	/* Store the compiled pattern and extra info in the cache. */
859 	new_entry.re = re;
860 	new_entry.preg_options = poptions;
861 	new_entry.compile_options = coptions;
862 	new_entry.refcount = 0;
863 
864 	rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &new_entry.capture_count);
865 	if (rc < 0) {
866 		if (key != regex) {
867 			zend_string_release_ex(key, 0);
868 		}
869 		php_error_docref(NULL, E_WARNING, "Internal pcre2_pattern_info() error %d", rc);
870 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
871 		return NULL;
872 	}
873 
874 	rc = pcre2_pattern_info(re, PCRE2_INFO_NAMECOUNT, &new_entry.name_count);
875 	if (rc < 0) {
876 		if (key != regex) {
877 			zend_string_release_ex(key, 0);
878 		}
879 		php_error_docref(NULL, E_WARNING, "Internal pcre_pattern_info() error %d", rc);
880 		pcre_handle_exec_error(PCRE2_ERROR_INTERNAL);
881 		return NULL;
882 	}
883 
884 	/*
885 	 * Interned strings are not duplicated when stored in HashTable,
886 	 * but all the interned strings created during HTTP request are removed
887 	 * at end of request. However PCRE_G(pcre_cache) must be consistent
888 	 * on the next request as well. So we disable usage of interned strings
889 	 * as hash keys especually for this table.
890 	 * See bug #63180
891 	 */
892 	if (!(GC_FLAGS(key) & IS_STR_PERMANENT) && !PCRE_G(per_request_cache)) {
893 		zend_string *str = zend_string_init(ZSTR_VAL(key), ZSTR_LEN(key), 1);
894 		GC_MAKE_PERSISTENT_LOCAL(str);
895 
896 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), str, &new_entry, sizeof(pcre_cache_entry));
897 		zend_string_release(str);
898 	} else {
899 		ret = zend_hash_add_new_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry));
900 	}
901 
902 	if (key != regex) {
903 		zend_string_release_ex(key, 0);
904 	}
905 
906 	return ret;
907 }
908 /* }}} */
909 
910 /* {{{ pcre_get_compiled_regex_cache */
pcre_get_compiled_regex_cache(zend_string * regex)911 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
912 {
913 	return pcre_get_compiled_regex_cache_ex(regex, 1);
914 }
915 /* }}} */
916 
917 /* {{{ pcre_get_compiled_regex */
pcre_get_compiled_regex(zend_string * regex,uint32_t * capture_count)918 PHPAPI pcre2_code *pcre_get_compiled_regex(zend_string *regex, uint32_t *capture_count)
919 {
920 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
921 
922 	if (capture_count) {
923 		*capture_count = pce ? pce->capture_count : 0;
924 	}
925 
926 	return pce ? pce->re : NULL;
927 }
928 /* }}} */
929 
930 /* {{{ pcre_get_compiled_regex_ex */
pcre_get_compiled_regex_ex(zend_string * regex,uint32_t * capture_count,uint32_t * preg_options,uint32_t * compile_options)931 PHPAPI pcre2_code* pcre_get_compiled_regex_ex(zend_string *regex, uint32_t *capture_count, uint32_t *preg_options, uint32_t *compile_options)
932 {
933 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
934 
935 	if (preg_options) {
936 		*preg_options = pce ? pce->preg_options : 0;
937 	}
938 	if (compile_options) {
939 		*compile_options = pce ? pce->compile_options : 0;
940 	}
941 	if (capture_count) {
942 		*capture_count = pce ? pce->capture_count : 0;
943 	}
944 
945 	return pce ? pce->re : NULL;
946 }
947 /* }}} */
948 
949 /* XXX For the cases where it's only about match yes/no and no capture
950 		required, perhaps just a minimum sized data would suffice. */
php_pcre_create_match_data(uint32_t capture_count,pcre2_code * re)951 PHPAPI pcre2_match_data *php_pcre_create_match_data(uint32_t capture_count, pcre2_code *re)
952 {/*{{{*/
953 
954 	assert(NULL != re);
955 
956 	if (EXPECTED(!mdata_used)) {
957 		int rc = 0;
958 
959 		if (!capture_count) {
960 			/* As we deal with a non cached pattern, no other way to gather this info. */
961 			rc = pcre2_pattern_info(re, PCRE2_INFO_CAPTURECOUNT, &capture_count);
962 		}
963 
964 		if (rc >= 0 && capture_count + 1 <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
965 			mdata_used = 1;
966 			return mdata;
967 		}
968 	}
969 
970 	return pcre2_match_data_create_from_pattern(re, gctx);
971 }/*}}}*/
972 
php_pcre_free_match_data(pcre2_match_data * match_data)973 PHPAPI void php_pcre_free_match_data(pcre2_match_data *match_data)
974 {/*{{{*/
975 	if (UNEXPECTED(match_data != mdata)) {
976 		pcre2_match_data_free(match_data);
977 	} else {
978 		mdata_used = 0;
979 	}
980 }/*}}}*/
981 
init_unmatched_null_pair(void)982 static void init_unmatched_null_pair(void) {
983 	zval val1, val2;
984 	ZVAL_NULL(&val1);
985 	ZVAL_LONG(&val2, -1);
986 	ZVAL_ARR(&PCRE_G(unmatched_null_pair), zend_new_pair(&val1, &val2));
987 }
988 
init_unmatched_empty_pair(void)989 static void init_unmatched_empty_pair(void) {
990 	zval val1, val2;
991 	ZVAL_EMPTY_STRING(&val1);
992 	ZVAL_LONG(&val2, -1);
993 	ZVAL_ARR(&PCRE_G(unmatched_empty_pair), zend_new_pair(&val1, &val2));
994 }
995 
populate_match_value_str(zval * val,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset)996 static zend_always_inline void populate_match_value_str(
997 		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset) {
998 	ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset);
999 }
1000 
/* Fill `val` for one capture group: the captured text when the group took
 * part in the match, otherwise NULL (unmatched_as_null set) or "". */
static inline void populate_match_value(
		zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
		uint32_t unmatched_as_null) {
	if (start_offset != PCRE2_UNSET) {
		populate_match_value_str(val, subject, start_offset, end_offset);
		return;
	}

	if (unmatched_as_null) {
		ZVAL_NULL(val);
	} else {
		ZVAL_EMPTY_STRING(val);
	}
}
1014 
/* Store `val` under `name` in the `subpats` array and take a reference on it
 * when it was actually stored.
 *
 * With the DUPNAMES option several groups may share one name. A group that
 * did not participate (`unmatched`) must therefore never replace an existing
 * entry, whereas a participating group always wins. */
static inline void add_named(
		zval *subpats, zend_string *name, zval *val, bool unmatched) {
	HashTable *table = Z_ARRVAL_P(subpats);

	if (unmatched) {
		if (zend_hash_add(table, name, val) == NULL) {
			/* Name already present: keep the existing value, no reference taken. */
			return;
		}
	} else {
		zend_hash_update(table, name, val);
	}

	Z_TRY_ADDREF_P(val);
}
1028 
1029 /* {{{ add_offset_pair */
add_offset_pair(zval * result,const char * subject,PCRE2_SIZE start_offset,PCRE2_SIZE end_offset,zend_string * name,uint32_t unmatched_as_null)1030 static inline void add_offset_pair(
1031 		zval *result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
1032 		zend_string *name, uint32_t unmatched_as_null)
1033 {
1034 	zval match_pair;
1035 
1036 	/* Add (match, offset) to the return value */
1037 	if (PCRE2_UNSET == start_offset) {
1038 		if (unmatched_as_null) {
1039 			if (Z_ISUNDEF(PCRE_G(unmatched_null_pair))) {
1040 				init_unmatched_null_pair();
1041 			}
1042 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_null_pair));
1043 		} else {
1044 			if (Z_ISUNDEF(PCRE_G(unmatched_empty_pair))) {
1045 				init_unmatched_empty_pair();
1046 			}
1047 			ZVAL_COPY(&match_pair, &PCRE_G(unmatched_empty_pair));
1048 		}
1049 	} else {
1050 		zval val1, val2;
1051 		populate_match_value_str(&val1, subject, start_offset, end_offset);
1052 		ZVAL_LONG(&val2, start_offset);
1053 		ZVAL_ARR(&match_pair, zend_new_pair(&val1, &val2));
1054 	}
1055 
1056 	if (name) {
1057 		add_named(result, name, &match_pair, start_offset == PCRE2_UNSET);
1058 	}
1059 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair);
1060 }
1061 /* }}} */
1062 
/* Fill the `subpats` array with the result of one match.
 *
 * subpats      - array zval receiving the entries
 * subject      - subject buffer the offsets refer to
 * offsets      - start/end offset pairs, two entries per group
 * subpat_names - per-group name table (entries may be NULL), or NULL when the
 *                pattern has no names to report
 * num_subpats  - total number of groups, including group 0
 * count        - number of groups for which `offsets` holds data
 * mark         - mark name to add under the "MARK" key, or NULL
 * flags        - PREG_OFFSET_CAPTURE and PREG_UNMATCHED_AS_NULL are honoured
 *
 * Groups in [count, num_subpats) are only added when PREG_UNMATCHED_AS_NULL
 * is set. */
static void populate_subpat_array(
		zval *subpats, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names,
		uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags) {
	bool offset_capture = (flags & PREG_OFFSET_CAPTURE) != 0;
	bool unmatched_as_null = (flags & PREG_UNMATCHED_AS_NULL) != 0;
	HashTable *table = Z_ARRVAL_P(subpats);
	zend_string *name;
	zval val;
	int i;

	/* Groups that were reported by the match. */
	for (i = 0; i < count; i++) {
		name = subpat_names ? subpat_names[i] : NULL;

		if (offset_capture) {
			add_offset_pair(
				subpats, subject, offsets[2*i], offsets[2*i+1], name, unmatched_as_null);
			continue;
		}

		populate_match_value(
			&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
		if (name) {
			add_named(subpats, name, &val, offsets[2*i] == PCRE2_UNSET);
		}
		zend_hash_next_index_insert(table, &val);
	}

	/* Trailing groups the match did not report. */
	if (unmatched_as_null) {
		for (i = count; i < num_subpats; i++) {
			name = subpat_names ? subpat_names[i] : NULL;

			if (offset_capture) {
				add_offset_pair(subpats, NULL, PCRE2_UNSET, PCRE2_UNSET, name, 1);
				continue;
			}

			ZVAL_NULL(&val);
			if (name) {
				zend_hash_add(table, name, &val);
			}
			zend_hash_next_index_insert(table, &val);
		}
	}

	/* Add MARK, if available */
	if (mark) {
		add_assoc_string_ex(subpats, "MARK", sizeof("MARK") - 1, (char *)mark);
	}
}
1130 
/* Shared body of preg_match() and preg_match_all(): parses the PHP-level
 * arguments, fetches the compiled pattern and runs php_pcre_match_impl().
 * `global` is passed through unchanged. Returns false to PHP when the
 * pattern cannot be obtained. */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	zend_string		 *regex, *subject;		/* Pattern and string to match against */
	zval			 *subpats = NULL;		/* Optional array for subpatterns */
	zend_long		  flags = 0;			/* Match control flags */
	zend_long		  start_offset = 0;		/* Where the search starts */
	pcre_cache_entry *pce;					/* Compiled regular expression */

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(regex)
		Z_PARAM_STR(subject)
		Z_PARAM_OPTIONAL
		Z_PARAM_ZVAL(subpats)
		Z_PARAM_LONG(flags)
		Z_PARAM_LONG(start_offset)
	ZEND_PARSE_PARAMETERS_END();

	/* Compile regex or get it from cache. */
	pce = pcre_get_compiled_regex_cache(regex);
	if (pce == NULL) {
		RETURN_FALSE;
	}

	/* The entry's refcount is raised for the duration of the match. */
	++pce->refcount;
	php_pcre_match_impl(pce, subject, return_value, subpats,
		global, ZEND_NUM_ARGS() >= 4, flags, start_offset);
	--pce->refcount;
}
/* }}} */
1161 
is_known_valid_utf8(zend_string * subject_str,PCRE2_SIZE start_offset)1162 static zend_always_inline bool is_known_valid_utf8(
1163 		zend_string *subject_str, PCRE2_SIZE start_offset) {
1164 	if (!(GC_FLAGS(subject_str) & IS_STR_VALID_UTF8)) {
1165 		/* We don't know whether the string is valid UTF-8 or not. */
1166 		return 0;
1167 	}
1168 
1169 	if (start_offset == ZSTR_LEN(subject_str)) {
1170 		/* Degenerate case: Offset points to end of string. */
1171 		return 1;
1172 	}
1173 
1174 	/* Check that the offset does not point to an UTF-8 continuation byte. */
1175 	return (ZSTR_VAL(subject_str)[start_offset] & 0xc0) != 0x80;
1176 }
1177 
1178 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zval * subpats,int global,int use_flags,zend_long flags,zend_off_t start_offset)1179 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
1180 	zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
1181 {
1182 	zval			 result_set,		/* Holds a set of subpatterns after
1183 										   a global match */
1184 					*match_sets = NULL;	/* An array of sets of matches for each
1185 										   subpattern after a global match */
1186 	uint32_t		 options;			/* Execution options */
1187 	int				 count;				/* Count of matched subpatterns */
1188 	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
1189 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
1190 	int				 matched;			/* Has anything matched */
1191 	zend_string	   **subpat_names;		/* Array for named subpatterns */
1192 	size_t			 i;
1193 	uint32_t		 subpats_order;		/* Order of subpattern matches */
1194 	uint32_t		 offset_capture;	/* Capture match offsets: yes/no */
1195 	uint32_t		 unmatched_as_null;	/* Null non-matches: yes/no */
1196 	PCRE2_SPTR       mark = NULL;		/* Target for MARK name */
1197 	zval			 marks;				/* Array of marks for PREG_PATTERN_ORDER */
1198 	pcre2_match_data *match_data;
1199 	PCRE2_SIZE		 start_offset2, orig_start_offset;
1200 
1201 	char *subject = ZSTR_VAL(subject_str);
1202 	size_t subject_len = ZSTR_LEN(subject_str);
1203 
1204 	ZVAL_UNDEF(&marks);
1205 
1206 	/* Overwrite the passed-in value for subpatterns with an empty array. */
1207 	if (subpats != NULL) {
1208 		subpats = zend_try_array_init(subpats);
1209 		if (!subpats) {
1210 			RETURN_THROWS();
1211 		}
1212 	}
1213 
1214 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
1215 
1216 	if (use_flags) {
1217 		offset_capture = flags & PREG_OFFSET_CAPTURE;
1218 		unmatched_as_null = flags & PREG_UNMATCHED_AS_NULL;
1219 
1220 		/*
1221 		 * subpats_order is pre-set to pattern mode so we change it only if
1222 		 * necessary.
1223 		 */
1224 		if (flags & 0xff) {
1225 			subpats_order = flags & 0xff;
1226 		}
1227 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
1228 			(!global && subpats_order != 0)) {
1229 			zend_argument_value_error(4, "must be a PREG_* constant");
1230 			RETURN_THROWS();
1231 		}
1232 	} else {
1233 		offset_capture = 0;
1234 		unmatched_as_null = 0;
1235 	}
1236 
1237 	/* Negative offset counts from the end of the string. */
1238 	if (start_offset < 0) {
1239 		if ((PCRE2_SIZE)-start_offset <= subject_len) {
1240 			start_offset2 = subject_len + start_offset;
1241 		} else {
1242 			start_offset2 = 0;
1243 		}
1244 	} else {
1245 		start_offset2 = (PCRE2_SIZE)start_offset;
1246 	}
1247 
1248 	if (start_offset2 > subject_len) {
1249 		pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1250 		RETURN_FALSE;
1251 	}
1252 
1253 	/* Calculate the size of the offsets array, and allocate memory for it. */
1254 	num_subpats = pce->capture_count + 1;
1255 
1256 	/*
1257 	 * Build a mapping from subpattern numbers to their names. We will
1258 	 * allocate the table only if there are any named subpatterns.
1259 	 */
1260 	subpat_names = NULL;
1261 	if (subpats && pce->name_count > 0) {
1262 		subpat_names = make_subpats_table(num_subpats, pce);
1263 		if (!subpat_names) {
1264 			RETURN_FALSE;
1265 		}
1266 	}
1267 
1268 	/* Allocate match sets array and initialize the values. */
1269 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1270 		match_sets = (zval *)safe_emalloc(num_subpats, sizeof(zval), 0);
1271 		for (i=0; i<num_subpats; i++) {
1272 			array_init(&match_sets[i]);
1273 		}
1274 	}
1275 
1276 	matched = 0;
1277 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1278 
1279 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
1280 		match_data = mdata;
1281 	} else {
1282 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
1283 		if (!match_data) {
1284 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
1285 			if (subpat_names) {
1286 				free_subpats_table(subpat_names, num_subpats);
1287 			}
1288 			if (match_sets) {
1289 				efree(match_sets);
1290 			}
1291 			RETURN_FALSE;
1292 		}
1293 	}
1294 
1295 	orig_start_offset = start_offset2;
1296 	options =
1297 		(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
1298 			? 0 : PCRE2_NO_UTF_CHECK;
1299 
1300 	/* Execute the regular expression. */
1301 #ifdef HAVE_PCRE_JIT_SUPPORT
1302 	if ((pce->preg_options & PREG_JIT) && options) {
1303 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1304 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1305 	} else
1306 #endif
1307 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1308 			options, match_data, mctx);
1309 
1310 	while (1) {
1311 		/* If something has matched */
1312 		if (count >= 0) {
1313 			/* Check for too many substrings condition. */
1314 			if (UNEXPECTED(count == 0)) {
1315 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
1316 				count = num_subpats;
1317 			}
1318 
1319 matched:
1320 			matched++;
1321 
1322 			offsets = pcre2_get_ovector_pointer(match_data);
1323 
1324 			/* If subpatterns array has been passed, fill it in with values. */
1325 			if (subpats != NULL) {
1326 				/* Try to get the list of substrings and display a warning if failed. */
1327 				if (offsets[1] < offsets[0]) {
1328 					if (subpat_names) {
1329 						free_subpats_table(subpat_names, num_subpats);
1330 					}
1331 					if (match_sets) efree(match_sets);
1332 					php_error_docref(NULL, E_WARNING, "Get subpatterns list failed");
1333 					RETURN_FALSE;
1334 				}
1335 
1336 				if (global) {	/* global pattern matching */
1337 					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
1338 						/* For each subpattern, insert it into the appropriate array. */
1339 						if (offset_capture) {
1340 							for (i = 0; i < count; i++) {
1341 								add_offset_pair(
1342 									&match_sets[i], subject, offsets[2*i], offsets[2*i+1],
1343 									NULL, unmatched_as_null);
1344 							}
1345 						} else {
1346 							for (i = 0; i < count; i++) {
1347 								zval val;
1348 								populate_match_value(
1349 									&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
1350 								zend_hash_next_index_insert_new(Z_ARRVAL(match_sets[i]), &val);
1351 							}
1352 						}
1353 						mark = pcre2_get_mark(match_data);
1354 						/* Add MARK, if available */
1355 						if (mark) {
1356 							if (Z_TYPE(marks) == IS_UNDEF) {
1357 								array_init(&marks);
1358 							}
1359 							add_index_string(&marks, matched - 1, (char *) mark);
1360 						}
1361 						/*
1362 						 * If the number of captured subpatterns on this run is
1363 						 * less than the total possible number, pad the result
1364 						 * arrays with NULLs or empty strings.
1365 						 */
1366 						if (count < num_subpats) {
1367 							for (; i < num_subpats; i++) {
1368 								if (offset_capture) {
1369 									add_offset_pair(
1370 										&match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
1371 										NULL, unmatched_as_null);
1372 								} else if (unmatched_as_null) {
1373 									add_next_index_null(&match_sets[i]);
1374 								} else {
1375 									add_next_index_str(&match_sets[i], ZSTR_EMPTY_ALLOC());
1376 								}
1377 							}
1378 						}
1379 					} else {
1380 						/* Allocate and populate the result set array */
1381 						array_init_size(&result_set, count + (mark ? 1 : 0));
1382 						mark = pcre2_get_mark(match_data);
1383 						populate_subpat_array(
1384 							&result_set, subject, offsets, subpat_names,
1385 							num_subpats, count, mark, flags);
1386 						/* And add it to the output array */
1387 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set);
1388 					}
1389 				} else {			/* single pattern matching */
1390 					/* For each subpattern, insert it into the subpatterns array. */
1391 					mark = pcre2_get_mark(match_data);
1392 					populate_subpat_array(
1393 						subpats, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1394 					break;
1395 				}
1396 			}
1397 
1398 			/* Advance to the next piece. */
1399 			start_offset2 = offsets[1];
1400 
1401 			/* If we have matched an empty string, mimic what Perl's /g options does.
1402 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
1403 			   the match again at the same point. If this fails (picked up above) we
1404 			   advance to the next character. */
1405 			if (start_offset2 == offsets[0]) {
1406 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1407 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
1408 				if (count >= 0) {
1409 					if (global) {
1410 						goto matched;
1411 					} else {
1412 						break;
1413 					}
1414 				} else if (count == PCRE2_ERROR_NOMATCH) {
1415 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
1416 					   this is not necessarily the end. We need to advance
1417 					   the start offset, and continue. Fudge the offset values
1418 					   to achieve this, unless we're already at the end of the string. */
1419 					if (start_offset2 < subject_len) {
1420 						size_t unit_len = calculate_unit_length(pce, subject + start_offset2);
1421 
1422 						start_offset2 += unit_len;
1423 					} else {
1424 						break;
1425 					}
1426 				} else {
1427 					goto error;
1428 				}
1429 			}
1430 		} else if (count == PCRE2_ERROR_NOMATCH) {
1431 			break;
1432 		} else {
1433 error:
1434 			pcre_handle_exec_error(count);
1435 			break;
1436 		}
1437 
1438 		if (!global) {
1439 			break;
1440 		}
1441 
1442 		/* Execute the regular expression. */
1443 #ifdef HAVE_PCRE_JIT_SUPPORT
1444 		if ((pce->preg_options & PREG_JIT)) {
1445 			if (PCRE2_UNSET == start_offset2 || start_offset2 > subject_len) {
1446 				pcre_handle_exec_error(PCRE2_ERROR_BADOFFSET);
1447 				break;
1448 			}
1449 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1450 					PCRE2_NO_UTF_CHECK, match_data, mctx);
1451 		} else
1452 #endif
1453 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset2,
1454 				PCRE2_NO_UTF_CHECK, match_data, mctx);
1455 	}
1456 	if (match_data != mdata) {
1457 		pcre2_match_data_free(match_data);
1458 	}
1459 
1460 	/* Add the match sets to the output array and clean up */
1461 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
1462 		if (subpat_names) {
1463 			for (i = 0; i < num_subpats; i++) {
1464 				if (subpat_names[i]) {
1465 					zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &match_sets[i]);
1466 					Z_ADDREF(match_sets[i]);
1467 				}
1468 				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
1469 			}
1470 		} else {
1471 			for (i = 0; i < num_subpats; i++) {
1472 				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
1473 			}
1474 		}
1475 		efree(match_sets);
1476 
1477 		if (Z_TYPE(marks) != IS_UNDEF) {
1478 			add_assoc_zval(subpats, "MARK", &marks);
1479 		}
1480 	}
1481 
1482 	if (subpat_names) {
1483 		free_subpats_table(subpat_names, num_subpats);
1484 	}
1485 
1486 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
1487 		/* If there was no error and we're in /u mode, remember that the string is valid UTF-8. */
1488 		if ((pce->compile_options & PCRE2_UTF)
1489 				&& !ZSTR_IS_INTERNED(subject_str) && orig_start_offset == 0) {
1490 			GC_ADD_FLAGS(subject_str, IS_STR_VALID_UTF8);
1491 		}
1492 
1493 		RETVAL_LONG(matched);
1494 	} else {
1495 		RETVAL_FALSE;
1496 	}
1497 }
1498 /* }}} */
1499 
/* {{{ Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)
{
	/* Shared implementation; the trailing 0 is its `global` argument. */
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
/* }}} */
1506 
/* {{{ Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)
{
	/* Shared implementation; the trailing 1 is its `global` argument. */
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
/* }}} */
1513 
/* {{{ preg_get_backref
 * Try to read a back-reference at *str, which must point at the introducing
 * character of a reference written as "<c>N", "<c>NN" or "${N}" / "${NN}"
 * (N being a decimal digit; the braced form requires '$' as introducer).
 *
 * On success the reference number is stored in *backref, *str is advanced
 * past the reference and 1 is returned. On failure 0 is returned and *str is
 * left untouched; *backref may already have been written. */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int braced;

	/* An introducer that is the last character cannot start a reference. */
	if (cursor[1] == '\0') {
		return 0;
	}

	braced = (cursor[0] == '$' && cursor[1] == '{');

	/* Step over the introducer and, for the braced form, over the '{'. */
	cursor += braced ? 2 : 1;

	/* First digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* Second digit is optional. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
/* }}} */
1551 
1552 /* {{{ preg_do_repl_func */
preg_do_repl_func(zend_fcall_info * fci,zend_fcall_info_cache * fcc,const char * subject,PCRE2_SIZE * offsets,zend_string ** subpat_names,uint32_t num_subpats,int count,const PCRE2_SPTR mark,zend_long flags)1553 static zend_string *preg_do_repl_func(zend_fcall_info *fci, zend_fcall_info_cache *fcc, const char *subject, PCRE2_SIZE *offsets, zend_string **subpat_names, uint32_t num_subpats, int count, const PCRE2_SPTR mark, zend_long flags)
1554 {
1555 	zend_string *result_str;
1556 	zval		 retval;			/* Function return value */
1557 	zval	     arg;				/* Argument to pass to function */
1558 
1559 	array_init_size(&arg, count + (mark ? 1 : 0));
1560 	populate_subpat_array(&arg, subject, offsets, subpat_names, num_subpats, count, mark, flags);
1561 
1562 	fci->retval = &retval;
1563 	fci->param_count = 1;
1564 	fci->params = &arg;
1565 
1566 	if (zend_call_function(fci, fcc) == SUCCESS && Z_TYPE(retval) != IS_UNDEF) {
1567 		if (EXPECTED(Z_TYPE(retval) == IS_STRING)) {
1568 			result_str = Z_STR(retval);
1569 		} else {
1570 			result_str = zval_get_string_func(&retval);
1571 			zval_ptr_dtor(&retval);
1572 		}
1573 	} else {
1574 		if (!EG(exception)) {
1575 			php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
1576 		}
1577 
1578 		result_str = zend_string_init(&subject[offsets[0]], offsets[1] - offsets[0], 0);
1579 	}
1580 
1581 	zval_ptr_dtor(&arg);
1582 
1583 	return result_str;
1584 }
1585 /* }}} */
1586 
1587 /* {{{ php_pcre_replace */
php_pcre_replace(zend_string * regex,zend_string * subject_str,const char * subject,size_t subject_len,zend_string * replace_str,size_t limit,size_t * replace_count)1588 PHPAPI zend_string *php_pcre_replace(zend_string *regex,
1589 							  zend_string *subject_str,
1590 							  const char *subject, size_t subject_len,
1591 							  zend_string *replace_str,
1592 							  size_t limit, size_t *replace_count)
1593 {
1594 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1595 	zend_string	 		*result;			/* Function result */
1596 
1597 	/* Abort on pending exception, e.g. thrown from __toString(). */
1598 	if (UNEXPECTED(EG(exception))) {
1599 		return NULL;
1600 	}
1601 
1602 	/* Compile regex or get it from cache. */
1603 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1604 		return NULL;
1605 	}
1606 	pce->refcount++;
1607 	result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_str,
1608 		limit, replace_count);
1609 	pce->refcount--;
1610 
1611 	return result;
1612 }
1613 /* }}} */
1614 
/* {{{ php_pcre_replace_impl()
 * Replace up to `limit` matches of `pce` in the subject with `replace_str`.
 *
 * pce           - compiled pattern entry
 * subject_str   - optional zend_string for the subject; when non-NULL and
 *                 nothing was replaced, a copy of it (zend_string_copy) is
 *                 returned instead of building a new string
 * subject       - subject bytes
 * subject_len   - length of `subject`
 * replace_str   - replacement template; back-references recognised by
 *                 preg_get_backref() are expanded, and a '\\' or '$' that
 *                 directly follows a backslash is taken literally
 * limit         - maximum number of replacements to perform
 * replace_count - when non-NULL, incremented once per replacement
 *
 * Returns the resulting string, or NULL on error (PCRE_G(error_code) is set
 * in that case).
 */
PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, const char *subject, size_t subject_len, zend_string *replace_str, size_t limit, size_t *replace_count)
{
	uint32_t		 options;			/* Execution options */
	int				 count;				/* Count of matched subpatterns */
	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
	uint32_t		 num_subpats;		/* Number of captured subpatterns */
	size_t			 new_len;			/* Length of needed storage */
	size_t			 alloc_len;			/* Actual allocated length */
	size_t			 match_len;			/* Length of the current match */
	int				 backref;			/* Backreference number */
	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
	size_t			 last_end_offset;	/* Where the last search ended */
	char			*walkbuf,			/* Location of current replacement in the result */
					*walk,				/* Used to walk the replacement string */
					 walk_last;			/* Last walked character */
	const char		*match,				/* The current match */
					*piece,				/* The current piece of subject */
					*replace_end;		/* End of replacement string */
	size_t			result_len; 		/* Length of result */
	zend_string		*result;			/* Result of replacement */
	pcre2_match_data *match_data;

	/* Calculate the size of the offsets array, and allocate memory for it. */
	num_subpats = pce->capture_count + 1;
	alloc_len = 0;
	result = NULL;

	/* Initialize */
	match = NULL;
	start_offset = 0;
	last_end_offset = 0;
	result_len = 0;
	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	/* Use the file-level match data block when it is free and large enough,
	 * otherwise create one for this pattern. */
	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
		match_data = mdata;
	} else {
		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
		if (!match_data) {
			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
			return NULL;
		}
	}

	/* For UTF patterns the first match lets PCRE2 validate the subject. */
	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;

	/* Execute the regular expression. */
#ifdef HAVE_PCRE_JIT_SUPPORT
	/* `options` is non-zero only when no UTF check is wanted. */
	if ((pce->preg_options & PREG_JIT) && options) {
		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
				PCRE2_NO_UTF_CHECK, match_data, mctx);
	} else
#endif
	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
			options, match_data, mctx);

	while (1) {
		/* Everything from the end of the previous match is still uncopied. */
		piece = subject + last_end_offset;

		if (count >= 0 && limit > 0) {
			bool simple_string;

			/* Check for too many substrings condition. */
			if (UNEXPECTED(count == 0)) {
				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
				count = num_subpats;
			}

matched:
			offsets = pcre2_get_ovector_pointer(match_data);

			/* A match that ends before it starts cannot be spliced into the result. */
			if (UNEXPECTED(offsets[1] < offsets[0])) {
				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
				if (result) {
					zend_string_release_ex(result, 0);
					result = NULL;
				}
				break;
			}

			if (replace_count) {
				++*replace_count;
			}

			/* Set the match location in subject */
			match = subject + offsets[0];

			new_len = result_len + offsets[0] - last_end_offset; /* part before the match */

			/* First pass over the replacement: compute the space it needs and
			 * find out whether it contains any '\\' or '$' at all. */
			walk = ZSTR_VAL(replace_str);
			replace_end = walk + ZSTR_LEN(replace_str);
			walk_last = 0;
			simple_string = 1;
			while (walk < replace_end) {
				if ('\\' == *walk || '$' == *walk) {
					simple_string = 0;
					if (walk_last == '\\') {
						/* Escaped character: it overwrites the backslash that
						 * was already counted, so the length does not grow. */
						walk++;
						walk_last = 0;
						continue;
					}
					if (preg_get_backref(&walk, &backref)) {
						/* References to groups beyond `count` expand to nothing. */
						if (backref < count)
							new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
						continue;
					}
				}
				new_len++;
				walk++;
				walk_last = walk[-1];
			}

			/* Grow the result buffer to twice the required length when needed. */
			if (new_len >= alloc_len) {
				alloc_len = zend_safe_address_guarded(2, new_len, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
				if (result == NULL) {
					result = zend_string_alloc(alloc_len, 0);
				} else {
					result = zend_string_extend(result, alloc_len, 0);
				}
			}

			if (match-piece > 0) {
				/* copy the part of the string before the match */
				memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece);
				result_len += (match-piece);
			}

			if (simple_string) {
				/* copy replacement */
				memcpy(&ZSTR_VAL(result)[result_len], ZSTR_VAL(replace_str), ZSTR_LEN(replace_str)+1);
				result_len += ZSTR_LEN(replace_str);
			} else {
				/* copy replacement and backrefs */
				walkbuf = ZSTR_VAL(result) + result_len;

				/* Second pass: same walk as above, this time writing output. */
				walk = ZSTR_VAL(replace_str);
				walk_last = 0;
				while (walk < replace_end) {
					if ('\\' == *walk || '$' == *walk) {
						if (walk_last == '\\') {
							/* Overwrite the backslash written in the previous step. */
							*(walkbuf-1) = *walk++;
							walk_last = 0;
							continue;
						}
						if (preg_get_backref(&walk, &backref)) {
							if (backref < count) {
								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
								memcpy(walkbuf, subject + offsets[backref<<1], match_len);
								walkbuf += match_len;
							}
							continue;
						}
					}
					*walkbuf++ = *walk++;
					walk_last = walk[-1];
				}
				*walkbuf = '\0';
				/* increment the result length by how much we've added to the string */
				result_len += (walkbuf - (ZSTR_VAL(result) + result_len));
			}

			limit--;

			/* Advance to the next piece. */
			start_offset = last_end_offset = offsets[1];

			/* If we have matched an empty string, mimic what Perl's /g options does.
			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
			   the match again at the same point. If this fails (picked up above) we
			   advance to the next character. */
			if (start_offset == offsets[0]) {
				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);

				piece = subject + start_offset;
				if (count >= 0 && limit > 0) {
					goto matched;
				} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
					   this is not necessarily the end. We need to advance
					   the start offset, and continue. Fudge the offset values
					   to achieve this, unless we're already at the end of the string. */
					if (start_offset < subject_len) {
						/* NOTE(review): calculate_unit_length() is defined outside this
						 * chunk; presumably the length of one character at `piece`. */
						size_t unit_len = calculate_unit_length(pce, piece);
						start_offset += unit_len;
					} else {
						goto not_matched;
					}
				} else {
					goto error;
				}
			}

		} else if (count == PCRE2_ERROR_NOMATCH || limit == 0) {
not_matched:
			/* Nothing was replaced: hand back the caller's string when available. */
			if (!result && subject_str) {
				result = zend_string_copy(subject_str);
				break;
			}
			/* now we know exactly how long it is */
			alloc_len = result_len + subject_len - last_end_offset;
			if (NULL != result) {
				result = zend_string_realloc(result, alloc_len, 0);
			} else {
				result = zend_string_alloc(alloc_len, 0);
			}
			/* stick that last bit of string on our output */
			memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - last_end_offset);
			result_len += subject_len - last_end_offset;
			ZSTR_VAL(result)[result_len] = '\0';
			ZSTR_LEN(result) = result_len;
			break;
		} else {
error:
			/* Any other negative count is a matching error: drop the partial result. */
			pcre_handle_exec_error(count);
			if (result) {
				zend_string_release_ex(result, 0);
				result = NULL;
			}
			break;
		}

		/* Look for the next match, starting at start_offset. */
#ifdef HAVE_PCRE_JIT_SUPPORT
		if (pce->preg_options & PREG_JIT) {
			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
					PCRE2_NO_UTF_CHECK, match_data, mctx);
		} else
#endif
		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, start_offset,
					PCRE2_NO_UTF_CHECK, match_data, mctx);
	}
	/* Only a block created above is freed; the file-level one is left alone. */
	if (match_data != mdata) {
		pcre2_match_data_free(match_data);
	}

	return result;
}
/* }}} */
1854 
1855 /* {{{ php_pcre_replace_func_impl() */
/*
 * Replace matches of the compiled pattern `pce` inside `subject` with the
 * strings produced by the user callback described by `fci` / `fcc`.
 *
 *   pce            compiled pattern (cache entry)
 *   subject_str    optional zend_string owning `subject`; when it is non-NULL
 *                  and no replacement was made, a copy of it is returned
 *                  instead of building a new string
 *   subject        subject bytes, `subject_len` long
 *   limit          maximum number of replacements; decremented once per
 *                  replacement, and no further replacement happens at zero
 *   replace_count  if non-NULL, incremented once per replacement
 *   flags          forwarded untouched to preg_do_repl_func()
 *
 * Returns the resulting string, or NULL when the named-group table or the
 * match data cannot be created, when PCRE2 reports a match whose end
 * precedes its start, or when matching fails with an error other than
 * "no match" (reported through pcre_handle_exec_error()).
 */
static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_string *subject_str,
	const char *subject, size_t subject_len, zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	size_t limit, size_t *replace_count, zend_long flags)
{
	const uint32_t    group_count = pce->capture_count + 1; /* whole match + capture groups */
	zend_string     **group_names = NULL;  /* group number -> name; only built for named groups */
	pcre2_match_data *md;                  /* match data block in use */
	PCRE2_SIZE       *ovector;             /* offset pairs of the current match */
	PCRE2_SIZE        search_from = 0;     /* where the next search starts */
	size_t            copied_upto = 0;     /* subject offset already reflected in the output */
	size_t            capacity = 0;        /* allocated length of the output */
	size_t            out_len = 0;         /* bytes written to the output so far */
	size_t            needed;              /* length required after the current replacement */
	zend_string      *out = NULL;          /* output being assembled */
	zend_string      *replacement;         /* string produced by the callback */
	const char       *match_start = NULL;  /* start of the current match in subject */
	const char       *tail;                /* first subject byte not yet copied */
	uint32_t          first_opts;          /* options for the very first match */
	int               rc;                  /* return code of the last match call */
	bool              mdata_was_busy;      /* state of mdata_used on entry */

	/* The number -> name mapping is only needed when the pattern has named groups. */
	if (UNEXPECTED(pce->name_count > 0)) {
		group_names = make_subpats_table(group_count, pce);
		if (group_names == NULL) {
			return NULL;
		}
	}

	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	/* Use the preallocated match data when it is free and big enough, flagging
	 * it as taken for the duration of this call; otherwise allocate a private
	 * block. NOTE(review): presumably the flag exists because the callback may
	 * itself call preg functions - confirm. */
	mdata_was_busy = mdata_used;
	if (mdata_was_busy || group_count > PHP_PCRE_PREALLOC_MDATA_SIZE) {
		md = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
		if (md == NULL) {
			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
			if (group_names) {
				free_subpats_table(group_names, group_count);
			}
			mdata_used = mdata_was_busy;
			return NULL;
		}
	} else {
		mdata_used = 1;
		md = mdata;
	}

	/* Only the first match lets PCRE2 validate UTF input (UTF patterns only);
	 * every later call passes PCRE2_NO_UTF_CHECK. */
	first_opts = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;

#ifdef HAVE_PCRE_JIT_SUPPORT
	if ((pce->preg_options & PREG_JIT) && first_opts) {
		rc = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
				PCRE2_NO_UTF_CHECK, md, mctx);
	} else
#endif
	rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
			first_opts, md, mctx);

	for (;;) {
		tail = subject + copied_upto;

		if (rc >= 0 && limit) {
			/* A zero return means the ovector was too small for all groups. */
			if (UNEXPECTED(rc == 0)) {
				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
				rc = group_count;
			}

matched:
			ovector = pcre2_get_ovector_pointer(md);

			/* A match that ends before it starts cannot be spliced. */
			if (UNEXPECTED(ovector[1] < ovector[0])) {
				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
				if (out) {
					zend_string_release_ex(out, 0);
					out = NULL;
				}
				break;
			}

			if (replace_count) {
				++*replace_count;
			}

			match_start = subject + ovector[0];

			/* Output so far plus the untouched text in front of the match. */
			needed = out_len + ovector[0] - copied_upto;

			/* Ask the callback for the replacement text. */
			replacement = preg_do_repl_func(
				fci, fcc, subject, ovector, group_names, group_count, rc,
				pcre2_get_mark(md), flags);
			ZEND_ASSERT(replacement);

			/* Add the replacement length with overflow checking, and grow the
			 * buffer to twice the required size whenever it is too small. */
			needed = zend_safe_address_guarded(1, ZSTR_LEN(replacement) + ZSTR_MAX_OVERHEAD, needed) - ZSTR_MAX_OVERHEAD;
			if (needed >= capacity) {
				capacity = zend_safe_address_guarded(2, needed, ZSTR_MAX_OVERHEAD) - ZSTR_MAX_OVERHEAD;
				out = (out == NULL)
					? zend_string_alloc(capacity, 0)
					: zend_string_extend(out, capacity, 0);
			}

			/* Text between the previous match and this one. */
			if (match_start - tail > 0) {
				memcpy(ZSTR_VAL(out) + out_len, tail, match_start - tail);
				out_len += (match_start - tail);
			}

			/* The callback's replacement, which is released once copied. */
			memcpy(ZSTR_VAL(out) + out_len, ZSTR_VAL(replacement), ZSTR_LEN(replacement));
			out_len += ZSTR_LEN(replacement);
			zend_string_release_ex(replacement, 0);

			limit--;

			/* Continue right behind the match. */
			search_from = copied_upto = ovector[1];

			/* Empty match: mimic Perl's /g by retrying at the same position
			 * with PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; if that finds
			 * nothing, step over one unit and resume the normal search. */
			if (search_from == ovector[0]) {
				size_t step;

				rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, md, mctx);

				tail = subject + search_from;
				if (rc >= 0 && limit) {
					goto matched;
				}
				if (rc != PCRE2_ERROR_NOMATCH && limit != 0) {
					goto error;
				}
				/* Nothing to step over once the end of the subject is reached. */
				if (search_from >= subject_len) {
					goto not_matched;
				}
				step = calculate_unit_length(pce, tail);
				search_from += step;
			}

		} else if (rc == PCRE2_ERROR_NOMATCH || limit == 0) {
			size_t rest;

not_matched:
			/* Nothing was replaced: hand back the caller's string when there is one. */
			if (!out && subject_str) {
				out = zend_string_copy(subject_str);
				break;
			}

			/* The final length is known now: size the buffer exactly and
			 * append whatever follows the last match. */
			rest = subject_len - copied_upto;
			capacity = out_len + rest;
			if (out != NULL) {
				out = zend_string_realloc(out, capacity, 0);
			} else {
				out = zend_string_alloc(capacity, 0);
			}
			memcpy(ZSTR_VAL(out) + out_len, tail, rest);
			out_len += rest;
			ZSTR_VAL(out)[out_len] = '\0';
			ZSTR_LEN(out) = out_len;
			break;
		} else {
error:
			/* Any other negative code is a real matching error. */
			pcre_handle_exec_error(rc);
			if (out) {
				zend_string_release_ex(out, 0);
				out = NULL;
			}
			break;
		}

		/* Look for the next match. */
#ifdef HAVE_PCRE_JIT_SUPPORT
		if ((pce->preg_options & PREG_JIT)) {
			rc = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
					PCRE2_NO_UTF_CHECK, md, mctx);
		} else
#endif
		rc = pcre2_match(pce->re, (PCRE2_SPTR)subject, subject_len, search_from,
				PCRE2_NO_UTF_CHECK, md, mctx);
	}

	/* Free a private match data block and restore the shared-block flag. */
	if (md != mdata) {
		pcre2_match_data_free(md);
	}
	mdata_used = mdata_was_busy;

	if (UNEXPECTED(group_names)) {
		free_subpats_table(group_names, group_count);
	}

	return out;
}
2066 /* }}} */
2067 
2068 /* {{{ php_pcre_replace_func */
php_pcre_replace_func(zend_string * regex,zend_string * subject_str,zend_fcall_info * fci,zend_fcall_info_cache * fcc,size_t limit,size_t * replace_count,zend_long flags)2069 static zend_always_inline zend_string *php_pcre_replace_func(zend_string *regex,
2070 							  zend_string *subject_str,
2071 							  zend_fcall_info *fci, zend_fcall_info_cache *fcc,
2072 							  size_t limit, size_t *replace_count, zend_long flags)
2073 {
2074 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
2075 	zend_string	 		*result;			/* Function result */
2076 
2077 	/* Compile regex or get it from cache. */
2078 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2079 		return NULL;
2080 	}
2081 	pce->refcount++;
2082 	result = php_pcre_replace_func_impl(
2083 		pce, subject_str, ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), fci, fcc,
2084 		limit, replace_count, flags);
2085 	pce->refcount--;
2086 
2087 	return result;
2088 }
2089 /* }}} */
2090 
2091 /* {{{ php_pcre_replace_array */
/*
 * Apply every pattern in `regex` to `subject_str` in turn, feeding the output
 * of one replacement into the next.
 *
 * When `replace_ht` is given, the n-th pattern is paired with the n-th defined
 * slot of `replace_ht`; once the replacements run out, the empty string is
 * used. Otherwise `replace_str` is used for every pattern.
 *
 * Returns the final string (a new reference to `subject_str` itself when
 * `regex` has no entries), or NULL as soon as one replacement fails.
 */
static zend_string *php_pcre_replace_array(HashTable *regex,
	zend_string *replace_str, HashTable *replace_ht,
	zend_string *subject_str, size_t limit, size_t *replace_count)
{
	zval        *pattern_zv;
	zend_string *replaced;
	uint32_t     next_slot = 0;  /* next slot of replace_ht to inspect */

	/* Hold our own reference: the loop releases the current subject each round. */
	zend_string_addref(subject_str);

	ZEND_ASSERT(replace_ht || replace_str != NULL);

	ZEND_HASH_FOREACH_VAL(regex, pattern_zv) {
		/* Patterns are converted to strings before anything else happens. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);
		zend_string *with_tmp = NULL;
		zend_string *with;

		if (replace_ht) {
			/* Advance to the next defined slot, skipping IS_UNDEF holes. */
			zval *slot = NULL;

			while (next_slot != replace_ht->nNumUsed) {
				zval *candidate = ZEND_HASH_ELEMENT(replace_ht, next_slot);

				next_slot++;
				if (Z_TYPE_P(candidate) != IS_UNDEF) {
					slot = candidate;
					break;
				}
			}

			if (slot != NULL) {
				with = zval_get_tmp_string(slot, &with_tmp);
			} else {
				/* Replacements exhausted: substitute the empty string. */
				with = ZSTR_EMPTY_ALLOC();
			}
		} else {
			with = replace_str;
		}

		/* Replace, then make the result the subject of the next round. */
		replaced = php_pcre_replace(pattern, subject_str, ZSTR_VAL(subject_str),
			ZSTR_LEN(subject_str), with, limit, replace_count);
		zend_tmp_string_release(with_tmp);
		zend_tmp_string_release(pattern_tmp);
		zend_string_release_ex(subject_str, 0);
		subject_str = replaced;

		if (UNEXPECTED(replaced == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject_str;
}
2165 /* }}} */
2166 
2167 /* {{{ php_replace_in_subject */
php_replace_in_subject(zend_string * regex_str,HashTable * regex_ht,zend_string * replace_str,HashTable * replace_ht,zend_string * subject,size_t limit,size_t * replace_count)2168 static zend_always_inline zend_string *php_replace_in_subject(
2169 	zend_string *regex_str, HashTable *regex_ht,
2170 	zend_string *replace_str, HashTable *replace_ht,
2171 	zend_string *subject, size_t limit, size_t *replace_count)
2172 {
2173 	zend_string *result;
2174 
2175 	if (regex_str) {
2176 		ZEND_ASSERT(replace_str != NULL);
2177 		result = php_pcre_replace(regex_str, subject, ZSTR_VAL(subject), ZSTR_LEN(subject),
2178 			replace_str, limit, replace_count);
2179 	} else {
2180 		ZEND_ASSERT(regex_ht != NULL);
2181 		result = php_pcre_replace_array(regex_ht, replace_str, replace_ht, subject,
2182 			limit, replace_count);
2183 	}
2184 	return result;
2185 }
2186 /* }}} */
2187 
2188 /* {{{ php_replace_in_subject_func */
/*
 * Callback flavour of php_replace_in_subject(): run the callback-based
 * replacement for one pattern string, or for each pattern of `regex_ht` in
 * sequence, with every result becoming the subject of the next pattern.
 *
 * Returns the final string, or NULL once a replacement fails. With an array
 * of patterns that has no entries, a new reference to `subject` is returned.
 */
static zend_string *php_replace_in_subject_func(zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject, size_t limit, size_t *replace_count, zend_long flags)
{
	zval        *pattern_zv;
	zend_string *replaced;

	/* Single pattern: nothing to chain. */
	if (regex_str) {
		return php_pcre_replace_func(
			regex_str, subject, fci, fcc, limit, replace_count, flags);
	}

	ZEND_ASSERT(regex_ht != NULL);

	/* Own a reference, since the current subject is released every round. */
	zend_string_addref(subject);

	ZEND_HASH_FOREACH_VAL(regex_ht, pattern_zv) {
		/* Each pattern entry is used in its string form. */
		zend_string *pattern_tmp;
		zend_string *pattern = zval_get_tmp_string(pattern_zv, &pattern_tmp);

		replaced = php_pcre_replace_func(
			pattern, subject, fci, fcc, limit, replace_count, flags);
		zend_tmp_string_release(pattern_tmp);
		zend_string_release(subject);

		/* The output becomes the input of the next pattern. */
		subject = replaced;
		if (UNEXPECTED(replaced == NULL)) {
			break;
		}
	} ZEND_HASH_FOREACH_END();

	return subject;
}
2228 /* }}} */
2229 
2230 /* {{{ preg_replace_func_impl */
/*
 * Shared body of the callback-based replace functions.
 *
 * For a string subject, `return_value` becomes the replaced string, or NULL
 * if the replacement failed. For an array subject, `return_value` becomes an
 * array holding the replaced form of every entry under its original key;
 * entries whose replacement failed are left out.
 *
 * Returns the total number of replacements performed.
 */
static size_t preg_replace_func_impl(zval *return_value,
	zend_string *regex_str, HashTable *regex_ht,
	zend_fcall_info *fci, zend_fcall_info_cache *fcc,
	zend_string *subject_str, HashTable *subject_ht, zend_long limit_val, zend_long flags)
{
	size_t       replace_count = 0;
	zend_string *replaced;

	if (subject_str == NULL) {
		/* Array subject: process the entries one by one. */
		zval        *entry;
		zval         out_zv;
		zend_string *key;
		zend_ulong   idx;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));

		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, idx, key, entry) {
			/* Entries are replaced in their string form. */
			zend_string *entry_tmp;
			zend_string *entry_str = zval_get_tmp_string(entry, &entry_tmp);

			replaced = php_replace_in_subject_func(
				regex_str, regex_ht, fci, fcc, entry_str, limit_val, &replace_count, flags);
			if (replaced != NULL) {
				/* Store the result under the entry's own key. */
				ZVAL_STR(&out_zv, replaced);
				if (key) {
					zend_hash_add_new(Z_ARRVAL_P(return_value), key, &out_zv);
				} else {
					zend_hash_index_add_new(Z_ARRVAL_P(return_value), idx, &out_zv);
				}
			}
			zend_tmp_string_release(entry_tmp);
		} ZEND_HASH_FOREACH_END();

		return replace_count;
	}

	/* String subject. */
	replaced = php_replace_in_subject_func(
		regex_str, regex_ht, fci, fcc, subject_str, limit_val, &replace_count, flags);
	if (replaced == NULL) {
		RETVAL_NULL();
	} else {
		RETVAL_STR(replaced);
	}

	return replace_count;
}
2280 /* }}} */
2281 
2282 /* {{{ preg_replace_common */
/*
 * Implementation shared by preg_replace() and preg_filter().
 *
 * Parses (pattern, replacement, subject [, limit [, &count]]), rejects an
 * array replacement combined with a string pattern, and then replaces in the
 * string subject or in every entry of the array subject. With `is_filter`
 * set, only subjects in which at least one replacement happened are returned:
 * a string subject without one yields NULL, an array entry without one is
 * dropped. The total number of replacements is written to the by-reference
 * count argument when that argument was passed.
 */
static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
{
	zend_string *regex_str;
	HashTable   *regex_ht;
	zend_string *replace_str;
	HashTable   *replace_ht;
	zend_string *subject_str;
	HashTable   *subject_ht;
	zval        *zcount = NULL;
	zend_long    limit = -1;
	size_t       replace_count = 0;
	size_t       count_before;   /* replace_count prior to the current subject */
	zend_string *replaced;

	/* Get function parameters and do error-checking. */
	ZEND_PARSE_PARAMETERS_START(3, 5)
		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
		Z_PARAM_ARRAY_HT_OR_STR(replace_ht, replace_str)
		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
		Z_PARAM_OPTIONAL
		Z_PARAM_LONG(limit)
		Z_PARAM_ZVAL(zcount)
	ZEND_PARSE_PARAMETERS_END();

	/* An array of replacements requires an array of patterns. */
	if (replace_ht && !regex_ht) {
		zend_argument_type_error(1, "must be of type array when argument #2 ($replacement) is an array, string given");
		RETURN_THROWS();
	}

	if (subject_str) {
		count_before = replace_count;
		replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
			subject_str, limit, &replace_count);

		if (replaced == NULL) {
			RETVAL_NULL();
		} else if (!is_filter || replace_count > count_before) {
			RETVAL_STR(replaced);
		} else {
			/* Filter mode and nothing was replaced: discard the result. */
			zend_string_release_ex(replaced, 0);
			RETVAL_NULL();
		}
	} else {
		/* Array subject: replace in each entry and collect the results. */
		zval        *entry;
		zval         out_zv;
		zend_string *key;
		zend_ulong   idx;

		ZEND_ASSERT(subject_ht != NULL);

		array_init_size(return_value, zend_hash_num_elements(subject_ht));

		ZEND_HASH_FOREACH_KEY_VAL(subject_ht, idx, key, entry) {
			zend_string *entry_tmp;
			zend_string *entry_str;

			count_before = replace_count;
			entry_str = zval_get_tmp_string(entry, &entry_tmp);
			replaced = php_replace_in_subject(regex_str, regex_ht, replace_str, replace_ht,
				entry_str, limit, &replace_count);

			if (replaced != NULL) {
				if (is_filter && replace_count <= count_before) {
					/* Filter mode drops entries that were left untouched. */
					zend_string_release_ex(replaced, 0);
				} else {
					/* Keep the result under the entry's original key. */
					ZVAL_STR(&out_zv, replaced);
					if (key) {
						zend_hash_add_new(Z_ARRVAL_P(return_value), key, &out_zv);
					} else {
						zend_hash_index_add_new(Z_ARRVAL_P(return_value), idx, &out_zv);
					}
				}
			}
			zend_tmp_string_release(entry_tmp);
		} ZEND_HASH_FOREACH_END();
	}

	if (zcount) {
		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
	}
}
2367 /* }}} */
2368 
2369 /* {{{ Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)2370 PHP_FUNCTION(preg_replace)
2371 {
2372 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, false);
2373 }
2374 /* }}} */
2375 
2376 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)2377 PHP_FUNCTION(preg_replace_callback)
2378 {
2379 	zval *zcount = NULL;
2380 	zend_string *regex_str;
2381 	HashTable *regex_ht;
2382 	zend_string *subject_str;
2383 	HashTable *subject_ht;
2384 	zend_long limit = -1, flags = 0;
2385 	size_t replace_count;
2386 	zend_fcall_info fci;
2387 	zend_fcall_info_cache fcc;
2388 
2389 	/* Get function parameters and do error-checking. */
2390 	ZEND_PARSE_PARAMETERS_START(3, 6)
2391 		Z_PARAM_ARRAY_HT_OR_STR(regex_ht, regex_str)
2392 		Z_PARAM_FUNC(fci, fcc)
2393 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2394 		Z_PARAM_OPTIONAL
2395 		Z_PARAM_LONG(limit)
2396 		Z_PARAM_ZVAL(zcount)
2397 		Z_PARAM_LONG(flags)
2398 	ZEND_PARSE_PARAMETERS_END();
2399 
2400 	replace_count = preg_replace_func_impl(return_value, regex_str, regex_ht,
2401 		&fci, &fcc,
2402 		subject_str, subject_ht, limit, flags);
2403 	if (zcount) {
2404 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2405 	}
2406 }
2407 /* }}} */
2408 
2409 /* {{{ Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback_array)2410 PHP_FUNCTION(preg_replace_callback_array)
2411 {
2412 	zval zv, *replace, *zcount = NULL;
2413 	HashTable *pattern, *subject_ht;
2414 	zend_string *subject_str, *str_idx_regex;
2415 	zend_long limit = -1, flags = 0;
2416 	size_t replace_count = 0;
2417 	zend_fcall_info fci;
2418 	zend_fcall_info_cache fcc;
2419 
2420 	/* Get function parameters and do error-checking. */
2421 	ZEND_PARSE_PARAMETERS_START(2, 5)
2422 		Z_PARAM_ARRAY_HT(pattern)
2423 		Z_PARAM_ARRAY_HT_OR_STR(subject_ht, subject_str)
2424 		Z_PARAM_OPTIONAL
2425 		Z_PARAM_LONG(limit)
2426 		Z_PARAM_ZVAL(zcount)
2427 		Z_PARAM_LONG(flags)
2428 	ZEND_PARSE_PARAMETERS_END();
2429 
2430 	fci.size = sizeof(fci);
2431 	fci.object = NULL;
2432 	fci.named_params = NULL;
2433 
2434 	if (subject_ht) {
2435 		GC_TRY_ADDREF(subject_ht);
2436 	} else {
2437 		GC_TRY_ADDREF(subject_str);
2438 	}
2439 
2440 	ZEND_HASH_FOREACH_STR_KEY_VAL(pattern, str_idx_regex, replace) {
2441 		if (!str_idx_regex) {
2442 			php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric or backslash");
2443 			RETVAL_NULL();
2444 			goto error;
2445 		}
2446 
2447 		if (!zend_is_callable_ex(replace, NULL, 0, NULL, &fcc, NULL)) {
2448 			zend_argument_type_error(1, "must contain only valid callbacks");
2449 			goto error;
2450 		}
2451 
2452 		ZVAL_COPY_VALUE(&fci.function_name, replace);
2453 
2454 		replace_count += preg_replace_func_impl(&zv, str_idx_regex, /* regex_ht */ NULL, &fci, &fcc,
2455 			subject_str, subject_ht, limit, flags);
2456 		switch (Z_TYPE(zv)) {
2457 			case IS_ARRAY:
2458 				ZEND_ASSERT(subject_ht);
2459 				zend_array_release(subject_ht);
2460 				subject_ht = Z_ARR(zv);
2461 				break;
2462 			case IS_STRING:
2463 				ZEND_ASSERT(subject_str);
2464 				zend_string_release(subject_str);
2465 				subject_str = Z_STR(zv);
2466 				break;
2467 			case IS_NULL:
2468 				RETVAL_NULL();
2469 				goto error;
2470 			EMPTY_SWITCH_DEFAULT_CASE()
2471 		}
2472 
2473 		if (EG(exception)) {
2474 			goto error;
2475 		}
2476 	} ZEND_HASH_FOREACH_END();
2477 
2478 	if (zcount) {
2479 		ZEND_TRY_ASSIGN_REF_LONG(zcount, replace_count);
2480 	}
2481 
2482 	if (subject_ht) {
2483 		RETURN_ARR(subject_ht);
2484 	} else {
2485 		RETURN_STR(subject_str);
2486 	}
2487 
2488 error:
2489 	if (subject_ht) {
2490 		zend_array_release(subject_ht);
2491 	} else {
2492 		zend_string_release(subject_str);
2493 	}
2494 }
2495 /* }}} */
2496 
2497 /* {{{ Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)2498 PHP_FUNCTION(preg_filter)
2499 {
2500 	preg_replace_common(INTERNAL_FUNCTION_PARAM_PASSTHRU, true);
2501 }
2502 /* }}} */
2503 
2504 /* {{{ Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)2505 PHP_FUNCTION(preg_split)
2506 {
2507 	zend_string			*regex;			/* Regular expression */
2508 	zend_string			*subject;		/* String to match against */
2509 	zend_long			 limit_val = -1;/* Integer value of limit */
2510 	zend_long			 flags = 0;		/* Match control flags */
2511 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2512 
2513 	/* Get function parameters and do error checking */
2514 	ZEND_PARSE_PARAMETERS_START(2, 4)
2515 		Z_PARAM_STR(regex)
2516 		Z_PARAM_STR(subject)
2517 		Z_PARAM_OPTIONAL
2518 		Z_PARAM_LONG(limit_val)
2519 		Z_PARAM_LONG(flags)
2520 	ZEND_PARSE_PARAMETERS_END();
2521 
2522 	/* Compile regex or get it from cache. */
2523 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2524 		RETURN_FALSE;
2525 	}
2526 
2527 	pce->refcount++;
2528 	php_pcre_split_impl(pce, subject, return_value, limit_val, flags);
2529 	pce->refcount--;
2530 }
2531 /* }}} */
2532 
2533 /* {{{ php_pcre_split */
php_pcre_split_impl(pcre_cache_entry * pce,zend_string * subject_str,zval * return_value,zend_long limit_val,zend_long flags)2534 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
2535 	zend_long limit_val, zend_long flags)
2536 {
2537 	PCRE2_SIZE		*offsets;			/* Array of subpattern offsets */
2538 	uint32_t		 options;			/* Execution options */
2539 	int				 count;				/* Count of matched subpatterns */
2540 	PCRE2_SIZE		 start_offset;		/* Where the new search starts */
2541 	PCRE2_SIZE		 last_match_offset;	/* Location of last match */
2542 	uint32_t		 no_empty;			/* If NO_EMPTY flag is set */
2543 	uint32_t		 delim_capture; 	/* If delimiters should be captured */
2544 	uint32_t		 offset_capture;	/* If offsets should be captured */
2545 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2546 	zval			 tmp;
2547 	pcre2_match_data *match_data;
2548 	char *subject = ZSTR_VAL(subject_str);
2549 
2550 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
2551 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
2552 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
2553 
2554 	/* Initialize return value */
2555 	array_init(return_value);
2556 
2557 	/* Calculate the size of the offsets array, and allocate memory for it. */
2558 	num_subpats = pce->capture_count + 1;
2559 
2560 	/* Start at the beginning of the string */
2561 	start_offset = 0;
2562 	last_match_offset = 0;
2563 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2564 
2565 	if (limit_val == -1) {
2566 		/* pass */
2567 	} else if (limit_val == 0) {
2568 		limit_val = -1;
2569 	} else if (limit_val <= 1) {
2570 		goto last;
2571 	}
2572 
2573 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2574 		match_data = mdata;
2575 	} else {
2576 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2577 		if (!match_data) {
2578 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2579 			zval_ptr_dtor(return_value);
2580 			RETURN_FALSE;
2581 		}
2582 	}
2583 
2584 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2585 
2586 #ifdef HAVE_PCRE_JIT_SUPPORT
2587 	if ((pce->preg_options & PREG_JIT) && options) {
2588 		count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2589 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2590 	} else
2591 #endif
2592 	count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2593 			options, match_data, mctx);
2594 
2595 	while (1) {
2596 		/* If something matched */
2597 		if (count >= 0) {
2598 			/* Check for too many substrings condition. */
2599 			if (UNEXPECTED(count == 0)) {
2600 				php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
2601 				count = num_subpats;
2602 			}
2603 
2604 matched:
2605 			offsets = pcre2_get_ovector_pointer(match_data);
2606 
2607 			if (UNEXPECTED(offsets[1] < offsets[0])) {
2608 				PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2609 				break;
2610 			}
2611 
2612 			if (!no_empty || offsets[0] != last_match_offset) {
2613 				if (offset_capture) {
2614 					/* Add (match, offset) pair to the return value */
2615 					add_offset_pair(
2616 						return_value, subject, last_match_offset, offsets[0],
2617 						NULL, 0);
2618 				} else {
2619 					/* Add the piece to the return value */
2620 					populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]);
2621 					zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
2622 				}
2623 
2624 				/* One less left to do */
2625 				if (limit_val != -1)
2626 					limit_val--;
2627 			}
2628 
2629 			if (delim_capture) {
2630 				size_t i;
2631 				for (i = 1; i < count; i++) {
2632 					/* If we have matched a delimiter */
2633 					if (!no_empty || offsets[2*i] != offsets[2*i+1]) {
2634 						if (offset_capture) {
2635 							add_offset_pair(
2636 								return_value, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
2637 						} else {
2638 							populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]);
2639 							zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
2640 						}
2641 					}
2642 				}
2643 			}
2644 
2645 			/* Advance to the position right after the last full match */
2646 			start_offset = last_match_offset = offsets[1];
2647 
2648 			/* If we have matched an empty string, mimic what Perl's /g options does.
2649 			   This turns out to be rather cunning. First we set PCRE2_NOTEMPTY_ATSTART and try
2650 			   the match again at the same point. If this fails (picked up above) we
2651 			   advance to the next character. */
2652 			if (start_offset == offsets[0]) {
2653 				/* Get next piece if no limit or limit not yet reached and something matched*/
2654 				if (limit_val != -1 && limit_val <= 1) {
2655 					break;
2656 				}
2657 				count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2658 					PCRE2_NO_UTF_CHECK | PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED, match_data, mctx);
2659 				if (count >= 0) {
2660 					goto matched;
2661 				} else if (count == PCRE2_ERROR_NOMATCH) {
2662 					/* If we previously set PCRE2_NOTEMPTY_ATSTART after a null match,
2663 					   this is not necessarily the end. We need to advance
2664 					   the start offset, and continue. Fudge the offset values
2665 					   to achieve this, unless we're already at the end of the string. */
2666 					if (start_offset < ZSTR_LEN(subject_str)) {
2667 						start_offset += calculate_unit_length(pce, subject + start_offset);
2668 					} else {
2669 						break;
2670 					}
2671 				} else {
2672 					goto error;
2673 				}
2674 			}
2675 
2676 		} else if (count == PCRE2_ERROR_NOMATCH) {
2677 			break;
2678 		} else {
2679 error:
2680 			pcre_handle_exec_error(count);
2681 			break;
2682 		}
2683 
2684 		/* Get next piece if no limit or limit not yet reached and something matched*/
2685 		if (limit_val != -1 && limit_val <= 1) {
2686 			break;
2687 		}
2688 
2689 #ifdef HAVE_PCRE_JIT_SUPPORT
2690 		if (pce->preg_options & PREG_JIT) {
2691 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2692 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2693 		} else
2694 #endif
2695 		count = pcre2_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
2696 				PCRE2_NO_UTF_CHECK, match_data, mctx);
2697 	}
2698 	if (match_data != mdata) {
2699 		pcre2_match_data_free(match_data);
2700 	}
2701 
2702 	if (PCRE_G(error_code) != PHP_PCRE_NO_ERROR) {
2703 		zval_ptr_dtor(return_value);
2704 		RETURN_FALSE;
2705 	}
2706 
2707 last:
2708 	start_offset = last_match_offset; /* the offset might have been incremented, but without further successful matches */
2709 
2710 	if (!no_empty || start_offset < ZSTR_LEN(subject_str)) {
2711 		if (offset_capture) {
2712 			/* Add the last (match, offset) pair to the return value */
2713 			add_offset_pair(return_value, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
2714 		} else {
2715 			/* Add the last piece to the return value */
2716 			if (start_offset == 0) {
2717 				ZVAL_STR_COPY(&tmp, subject_str);
2718 			} else {
2719 				populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str));
2720 			}
2721 			zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
2722 		}
2723 	}
2724 }
2725 /* }}} */
2726 
2727 /* {{{ Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)2728 PHP_FUNCTION(preg_quote)
2729 {
2730 	zend_string *str;       		/* Input string argument */
2731 	zend_string	*delim = NULL;		/* Additional delimiter argument */
2732 	char		*in_str;			/* Input string */
2733 	char		*in_str_end;    	/* End of the input string */
2734 	zend_string	*out_str;			/* Output string with quoted characters */
2735 	size_t       extra_len;         /* Number of additional characters */
2736 	char 		*p,					/* Iterator for input string */
2737 				*q,					/* Iterator for output string */
2738 				 delim_char = '\0',	/* Delimiter character to be quoted */
2739 				 c;					/* Current character */
2740 
2741 	/* Get the arguments and check for errors */
2742 	ZEND_PARSE_PARAMETERS_START(1, 2)
2743 		Z_PARAM_STR(str)
2744 		Z_PARAM_OPTIONAL
2745 		Z_PARAM_STR_OR_NULL(delim)
2746 	ZEND_PARSE_PARAMETERS_END();
2747 
2748 	/* Nothing to do if we got an empty string */
2749 	if (ZSTR_LEN(str) == 0) {
2750 		RETURN_EMPTY_STRING();
2751 	}
2752 
2753 	in_str = ZSTR_VAL(str);
2754 	in_str_end = in_str + ZSTR_LEN(str);
2755 
2756 	if (delim) {
2757 		delim_char = ZSTR_VAL(delim)[0];
2758 	}
2759 
2760 	/* Go through the string and quote necessary characters */
2761 	extra_len = 0;
2762 	p = in_str;
2763 	do {
2764 		c = *p;
2765 		switch(c) {
2766 			case '.':
2767 			case '\\':
2768 			case '+':
2769 			case '*':
2770 			case '?':
2771 			case '[':
2772 			case '^':
2773 			case ']':
2774 			case '$':
2775 			case '(':
2776 			case ')':
2777 			case '{':
2778 			case '}':
2779 			case '=':
2780 			case '!':
2781 			case '>':
2782 			case '<':
2783 			case '|':
2784 			case ':':
2785 			case '-':
2786 			case '#':
2787 				extra_len++;
2788 				break;
2789 
2790 			case '\0':
2791 				extra_len+=3;
2792 				break;
2793 
2794 			default:
2795 				if (c == delim_char) {
2796 					extra_len++;
2797 				}
2798 				break;
2799 		}
2800 		p++;
2801 	} while (p != in_str_end);
2802 
2803 	if (extra_len == 0) {
2804 		RETURN_STR_COPY(str);
2805 	}
2806 
2807 	/* Allocate enough memory so that even if each character
2808 	   is quoted, we won't run out of room */
2809 	out_str = zend_string_safe_alloc(1, ZSTR_LEN(str), extra_len, 0);
2810 	q = ZSTR_VAL(out_str);
2811 	p = in_str;
2812 
2813 	do {
2814 		c = *p;
2815 		switch(c) {
2816 			case '.':
2817 			case '\\':
2818 			case '+':
2819 			case '*':
2820 			case '?':
2821 			case '[':
2822 			case '^':
2823 			case ']':
2824 			case '$':
2825 			case '(':
2826 			case ')':
2827 			case '{':
2828 			case '}':
2829 			case '=':
2830 			case '!':
2831 			case '>':
2832 			case '<':
2833 			case '|':
2834 			case ':':
2835 			case '-':
2836 			case '#':
2837 				*q++ = '\\';
2838 				*q++ = c;
2839 				break;
2840 
2841 			case '\0':
2842 				*q++ = '\\';
2843 				*q++ = '0';
2844 				*q++ = '0';
2845 				*q++ = '0';
2846 				break;
2847 
2848 			default:
2849 				if (c == delim_char) {
2850 					*q++ = '\\';
2851 				}
2852 				*q++ = c;
2853 				break;
2854 		}
2855 		p++;
2856 	} while (p != in_str_end);
2857 	*q = '\0';
2858 
2859 	RETURN_NEW_STR(out_str);
2860 }
2861 /* }}} */
2862 
2863 /* {{{ Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)2864 PHP_FUNCTION(preg_grep)
2865 {
2866 	zend_string			*regex;			/* Regular expression */
2867 	zval				*input;			/* Input array */
2868 	zend_long			 flags = 0;		/* Match control flags */
2869 	pcre_cache_entry	*pce;			/* Compiled regular expression */
2870 
2871 	/* Get arguments and do error checking */
2872 	ZEND_PARSE_PARAMETERS_START(2, 3)
2873 		Z_PARAM_STR(regex)
2874 		Z_PARAM_ARRAY(input)
2875 		Z_PARAM_OPTIONAL
2876 		Z_PARAM_LONG(flags)
2877 	ZEND_PARSE_PARAMETERS_END();
2878 
2879 	/* Compile regex or get it from cache. */
2880 	if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2881 		RETURN_FALSE;
2882 	}
2883 
2884 	pce->refcount++;
2885 	php_pcre_grep_impl(pce, input, return_value, flags);
2886 	pce->refcount--;
2887 }
2888 /* }}} */
2889 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,zend_long flags)2890 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */
2891 {
2892 	zval            *entry;             /* An entry in the input array */
2893 	uint32_t		 num_subpats;		/* Number of captured subpatterns */
2894 	int				 count;				/* Count of matched subpatterns */
2895 	uint32_t		 options;			/* Execution options */
2896 	zend_string		*string_key;
2897 	zend_ulong		 num_key;
2898 	bool		 invert;			/* Whether to return non-matching
2899 										   entries */
2900 	pcre2_match_data *match_data;
2901 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
2902 
2903 	/* Calculate the size of the offsets array, and allocate memory for it. */
2904 	num_subpats = pce->capture_count + 1;
2905 
2906 	/* Initialize return array */
2907 	array_init(return_value);
2908 
2909 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2910 
2911 	if (!mdata_used && num_subpats <= PHP_PCRE_PREALLOC_MDATA_SIZE) {
2912 		match_data = mdata;
2913 	} else {
2914 		match_data = pcre2_match_data_create_from_pattern(pce->re, PCRE_G(gctx_zmm));
2915 		if (!match_data) {
2916 			PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
2917 			return;
2918 		}
2919 	}
2920 
2921 	options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
2922 
2923 	/* Go through the input array */
2924 	ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) {
2925 		zend_string *tmp_subject_str;
2926 		zend_string *subject_str = zval_get_tmp_string(entry, &tmp_subject_str);
2927 
2928 		/* Perform the match */
2929 #ifdef HAVE_PCRE_JIT_SUPPORT
2930 		if ((pce->preg_options & PREG_JIT) && options) {
2931 			count = pcre2_jit_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2932 					PCRE2_NO_UTF_CHECK, match_data, mctx);
2933 		} else
2934 #endif
2935 		count = pcre2_match(pce->re, (PCRE2_SPTR)ZSTR_VAL(subject_str), ZSTR_LEN(subject_str), 0,
2936 				options, match_data, mctx);
2937 
2938 		/* If the entry fits our requirements */
2939 		if (count >= 0) {
2940 			/* Check for too many substrings condition. */
2941 			if (UNEXPECTED(count == 0)) {
2942 				php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
2943 			}
2944 			if (!invert) {
2945 				Z_TRY_ADDREF_P(entry);
2946 
2947 				/* Add to return array */
2948 				if (string_key) {
2949 					zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
2950 				} else {
2951 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
2952 				}
2953 			}
2954 		} else if (count == PCRE2_ERROR_NOMATCH) {
2955 			if (invert) {
2956 				Z_TRY_ADDREF_P(entry);
2957 
2958 				/* Add to return array */
2959 				if (string_key) {
2960 					zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
2961 				} else {
2962 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
2963 				}
2964 			}
2965 		} else {
2966 			pcre_handle_exec_error(count);
2967 			zend_tmp_string_release(tmp_subject_str);
2968 			break;
2969 		}
2970 
2971 		zend_tmp_string_release(tmp_subject_str);
2972 	} ZEND_HASH_FOREACH_END();
2973 	if (match_data != mdata) {
2974 		pcre2_match_data_free(match_data);
2975 	}
2976 }
2977 /* }}} */
2978 
2979 /* {{{ Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)2980 PHP_FUNCTION(preg_last_error)
2981 {
2982 	ZEND_PARSE_PARAMETERS_NONE();
2983 
2984 	RETURN_LONG(PCRE_G(error_code));
2985 }
2986 /* }}} */
2987 
2988 /* {{{ Returns the error message of the last regexp execution. */
PHP_FUNCTION(preg_last_error_msg)2989 PHP_FUNCTION(preg_last_error_msg)
2990 {
2991 	ZEND_PARSE_PARAMETERS_NONE();
2992 
2993 	RETURN_STRING(php_pcre_get_error_msg(PCRE_G(error_code)));
2994 }
2995 /* }}} */
2996 
2997 /* {{{ module definition structures */
2998 
2999 zend_module_entry pcre_module_entry = {
3000 	STANDARD_MODULE_HEADER,
3001 	"pcre",
3002 	ext_functions,
3003 	PHP_MINIT(pcre),
3004 	PHP_MSHUTDOWN(pcre),
3005 	PHP_RINIT(pcre),
3006 	PHP_RSHUTDOWN(pcre),
3007 	PHP_MINFO(pcre),
3008 	PHP_PCRE_VERSION,
3009 	PHP_MODULE_GLOBALS(pcre),
3010 	PHP_GINIT(pcre),
3011 	PHP_GSHUTDOWN(pcre),
3012 	NULL,
3013 	STANDARD_MODULE_PROPERTIES_EX
3014 };
3015 
3016 #ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)3017 ZEND_GET_MODULE(pcre)
3018 #endif
3019 
3020 /* }}} */
3021 
3022 PHPAPI pcre2_match_context *php_pcre_mctx(void)
3023 {/*{{{*/
3024 	return mctx;
3025 }/*}}}*/
3026 
php_pcre_gctx(void)3027 PHPAPI pcre2_general_context *php_pcre_gctx(void)
3028 {/*{{{*/
3029 	return gctx;
3030 }/*}}}*/
3031 
php_pcre_cctx(void)3032 PHPAPI pcre2_compile_context *php_pcre_cctx(void)
3033 {/*{{{*/
3034 	return cctx;
3035 }/*}}}*/
3036 
php_pcre_pce_incref(pcre_cache_entry * pce)3037 PHPAPI void php_pcre_pce_incref(pcre_cache_entry *pce)
3038 {/*{{{*/
3039 	assert(NULL != pce);
3040 	pce->refcount++;
3041 }/*}}}*/
3042 
php_pcre_pce_decref(pcre_cache_entry * pce)3043 PHPAPI void php_pcre_pce_decref(pcre_cache_entry *pce)
3044 {/*{{{*/
3045 	assert(NULL != pce);
3046 	assert(0 != pce->refcount);
3047 	pce->refcount--;
3048 }/*}}}*/
3049 
php_pcre_pce_re(pcre_cache_entry * pce)3050 PHPAPI pcre2_code *php_pcre_pce_re(pcre_cache_entry *pce)
3051 {/*{{{*/
3052 	assert(NULL != pce);
3053 	return pce->re;
3054 }/*}}}*/
3055