xref: /PHP-5.3/ext/pcre/php_pcre.c (revision e43c5a83)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2013 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Andrei Zmievski <andrei@php.net>                             |
16    +----------------------------------------------------------------------+
17  */
18 
19 /* $Id$ */
20 
21 #include "php.h"
22 #include "php_ini.h"
23 #include "php_globals.h"
24 #include "php_pcre.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/php_smart_str.h"
27 
28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
29 
30 #include "ext/standard/php_string.h"
31 
/* Result ordering for global matches; carried in the low byte of the flags argument. */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
/* Flag bit asking for (match, offset) pairs instead of plain match strings. */
#define PREG_OFFSET_CAPTURE			(1 << 8)

/* Flag bits for splitting. */
#define PREG_SPLIT_NO_EMPTY			(1 << 0)
#define PREG_SPLIT_DELIM_CAPTURE	(1 << 1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1 << 2)

/* Internal option recorded when a pattern carries the 'e' modifier. */
#define PREG_REPLACE_EVAL			(1 << 0)

/* Flag bit for grep-style filtering. */
#define PREG_GREP_INVERT			(1 << 0)

/* Number of compiled patterns the cache holds before old entries are evicted. */
#define PCRE_CACHE_SIZE 4096

/* Error codes kept in PCRE_G(error_code) and exported to userland as PREG_*_ERROR. */
enum {
	PHP_PCRE_NO_ERROR              = 0,
	PHP_PCRE_INTERNAL_ERROR        = 1,
	PHP_PCRE_BACKTRACK_LIMIT_ERROR = 2,
	PHP_PCRE_RECURSION_LIMIT_ERROR = 3,
	PHP_PCRE_BAD_UTF8_ERROR        = 4,
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR = 5
};
54 
55 
/* Declares this extension's module globals (presumably the zend_pcre_globals
   structure reached through PCRE_G() below; confirm against php_pcre.h). */
ZEND_DECLARE_MODULE_GLOBALS(pcre)
57 
58 
59 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
60 {
61 	int preg_code = 0;
62 
63 	switch (pcre_code) {
64 		case PCRE_ERROR_MATCHLIMIT:
65 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
66 			break;
67 
68 		case PCRE_ERROR_RECURSIONLIMIT:
69 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
70 			break;
71 
72 		case PCRE_ERROR_BADUTF8:
73 			preg_code = PHP_PCRE_BAD_UTF8_ERROR;
74 			break;
75 
76 		case PCRE_ERROR_BADUTF8_OFFSET:
77 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
78 			break;
79 
80 		default:
81 			preg_code = PHP_PCRE_INTERNAL_ERROR;
82 			break;
83 	}
84 
85 	PCRE_G(error_code) = preg_code;
86 }
87 /* }}} */
88 
php_free_pcre_cache(void * data)89 static void php_free_pcre_cache(void *data) /* {{{ */
90 {
91 	pcre_cache_entry *pce = (pcre_cache_entry *) data;
92 	if (!pce) return;
93 	pefree(pce->re, 1);
94 	if (pce->extra) pefree(pce->extra, 1);
95 #if HAVE_SETLOCALE
96 	if ((void*)pce->tables) pefree((void*)pce->tables, 1);
97 	pefree(pce->locale, 1);
98 #endif
99 }
100 /* }}} */
101 
PHP_GINIT_FUNCTION(pcre)102 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
103 {
104 	zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
105 	pcre_globals->backtrack_limit = 0;
106 	pcre_globals->recursion_limit = 0;
107 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
108 }
109 /* }}} */
110 
PHP_GSHUTDOWN_FUNCTION(pcre)111 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
112 {
113 	zend_hash_destroy(&pcre_globals->pcre_cache);
114 }
115 /* }}} */
116 
117 PHP_INI_BEGIN()
118 	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
119 	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
PHP_INI_END()120 PHP_INI_END()
121 
122 
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Emit the module information table (support status and the PCRE library
 * version string) followed by the extension's INI entries.
 */
static PHP_MINFO_FUNCTION(pcre)
{
	php_info_print_table_start();
	php_info_print_table_row(2,
		"PCRE (Perl Compatible Regular Expressions) Support",
		"enabled");
	php_info_print_table_row(2,
		"PCRE Library Version",
		pcre_version());
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
134 
135 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)136 static PHP_MINIT_FUNCTION(pcre)
137 {
138 	REGISTER_INI_ENTRIES();
139 
140 	REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
141 	REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
142 	REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
143 	REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
144 	REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
145 	REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
146 	REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);
147 
148 	REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
149 	REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
150 	REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
151 	REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
152 	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
153 	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
154 	REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);
155 
156 	return SUCCESS;
157 }
158 /* }}} */
159 
160 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)161 static PHP_MSHUTDOWN_FUNCTION(pcre)
162 {
163 	UNREGISTER_INI_ENTRIES();
164 
165 	return SUCCESS;
166 }
167 /* }}} */
168 
169 /* {{{ static pcre_clean_cache */
pcre_clean_cache(void * data,void * arg TSRMLS_DC)170 static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
171 {
172 	int *num_clean = (int *)arg;
173 
174 	if (*num_clean > 0) {
175 		(*num_clean)--;
176 		return 1;
177 	} else {
178 		return 0;
179 	}
180 }
181 /* }}} */
182 
183 /* {{{ static make_subpats_table */
make_subpats_table(int num_subpats,pcre_cache_entry * pce TSRMLS_DC)184 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
185 {
186 	pcre_extra *extra = pce->extra;
187 	int name_cnt = 0, name_size, ni = 0;
188 	int rc;
189 	char *name_table;
190 	unsigned short name_idx;
191 	char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
192 
193 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
194 	if (rc < 0) {
195 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
196 		efree(subpat_names);
197 		return NULL;
198 	}
199 	if (name_cnt > 0) {
200 		int rc1, rc2;
201 
202 		rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
203 		rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
204 		rc = rc2 ? rc2 : rc1;
205 		if (rc < 0) {
206 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
207 			efree(subpat_names);
208 			return NULL;
209 		}
210 
211 		while (ni++ < name_cnt) {
212 			name_idx = 0xff * (unsigned char)name_table[0] + (unsigned char)name_table[1];
213 			subpat_names[name_idx] = name_table + 2;
214 			if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
215 				php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
216 				efree(subpat_names);
217 				return NULL;
218 			}
219 			name_table += name_size;
220 		}
221 	}
222 
223 	return subpat_names;
224 }
225 /* }}} */
226 
227 /* {{{ pcre_get_compiled_regex_cache
228  */
pcre_get_compiled_regex_cache(char * regex,int regex_len TSRMLS_DC)229 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
230 {
231 	pcre				*re = NULL;
232 	pcre_extra			*extra;
233 	int					 coptions = 0;
234 	int					 soptions = 0;
235 	const char			*error;
236 	int					 erroffset;
237 	char				 delimiter;
238 	char				 start_delimiter;
239 	char				 end_delimiter;
240 	char				*p, *pp;
241 	char				*pattern;
242 	int					 do_study = 0;
243 	int					 poptions = 0;
244 	int				count = 0;
245 	unsigned const char *tables = NULL;
246 #if HAVE_SETLOCALE
247 	char				*locale;
248 #endif
249 	pcre_cache_entry	*pce;
250 	pcre_cache_entry	 new_entry;
251 
252 #if HAVE_SETLOCALE
253 # if defined(PHP_WIN32) && defined(ZTS)
254 	_configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
255 # endif
256 	locale = setlocale(LC_CTYPE, NULL);
257 #endif
258 
259 	/* Try to lookup the cached regex entry, and if successful, just pass
260 	   back the compiled pattern, otherwise go on and compile it. */
261 	if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
262 		/*
263 		 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
264 		 * is, we flush it and compile the pattern from scratch.
265 		 */
266 		if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
267 			zend_hash_clean(&PCRE_G(pcre_cache));
268 		} else {
269 #if HAVE_SETLOCALE
270 			if (!strcmp(pce->locale, locale)) {
271 #endif
272 				return pce;
273 #if HAVE_SETLOCALE
274 			}
275 #endif
276 		}
277 	}
278 
279 	p = regex;
280 
281 	/* Parse through the leading whitespace, and display a warning if we
282 	   get to the end without encountering a delimiter. */
283 	while (isspace((int)*(unsigned char *)p)) p++;
284 	if (*p == 0) {
285 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Empty regular expression");
286 		return NULL;
287 	}
288 
289 	/* Get the delimiter and display a warning if it is alphanumeric
290 	   or a backslash. */
291 	delimiter = *p++;
292 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
293 		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
294 		return NULL;
295 	}
296 
297 	start_delimiter = delimiter;
298 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
299 		delimiter = pp[5];
300 	end_delimiter = delimiter;
301 
302 	if (start_delimiter == end_delimiter) {
303 		/* We need to iterate through the pattern, searching for the ending delimiter,
304 		   but skipping the backslashed delimiters.  If the ending delimiter is not
305 		   found, display a warning. */
306 		pp = p;
307 		while (*pp != 0) {
308 			if (*pp == '\\' && pp[1] != 0) pp++;
309 			else if (*pp == delimiter)
310 				break;
311 			pp++;
312 		}
313 		if (*pp == 0) {
314 			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
315 			return NULL;
316 		}
317 	} else {
318 		/* We iterate through the pattern, searching for the matching ending
319 		 * delimiter. For each matching starting delimiter, we increment nesting
320 		 * level, and decrement it for each matching ending delimiter. If we
321 		 * reach the end of the pattern without matching, display a warning.
322 		 */
323 		int brackets = 1; 	/* brackets nesting level */
324 		pp = p;
325 		while (*pp != 0) {
326 			if (*pp == '\\' && pp[1] != 0) pp++;
327 			else if (*pp == end_delimiter && --brackets <= 0)
328 				break;
329 			else if (*pp == start_delimiter)
330 				brackets++;
331 			pp++;
332 		}
333 		if (*pp == 0) {
334 			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", end_delimiter);
335 			return NULL;
336 		}
337 	}
338 
339 	/* Make a copy of the actual pattern. */
340 	pattern = estrndup(p, pp-p);
341 
342 	/* Move on to the options */
343 	pp++;
344 
345 	/* Parse through the options, setting appropriate flags.  Display
346 	   a warning if we encounter an unknown modifier. */
347 	while (*pp != 0) {
348 		switch (*pp++) {
349 			/* Perl compatible options */
350 			case 'i':	coptions |= PCRE_CASELESS;		break;
351 			case 'm':	coptions |= PCRE_MULTILINE;		break;
352 			case 's':	coptions |= PCRE_DOTALL;		break;
353 			case 'x':	coptions |= PCRE_EXTENDED;		break;
354 
355 			/* PCRE specific options */
356 			case 'A':	coptions |= PCRE_ANCHORED;		break;
357 			case 'D':	coptions |= PCRE_DOLLAR_ENDONLY;break;
358 			case 'S':	do_study  = 1;					break;
359 			case 'U':	coptions |= PCRE_UNGREEDY;		break;
360 			case 'X':	coptions |= PCRE_EXTRA;			break;
361 			case 'u':	coptions |= PCRE_UTF8;
362 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
363        characters, even in UTF-8 mode. However, this can be changed by setting
364        the PCRE_UCP option. */
365 #ifdef PCRE_UCP
366 						coptions |= PCRE_UCP;
367 #endif
368 				break;
369 
370 			/* Custom preg options */
371 			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;
372 
373 			case ' ':
374 			case '\n':
375 				break;
376 
377 			default:
378 				php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
379 				efree(pattern);
380 				return NULL;
381 		}
382 	}
383 
384 #if HAVE_SETLOCALE
385 	if (strcmp(locale, "C"))
386 		tables = pcre_maketables();
387 #endif
388 
389 	/* Compile pattern and display a warning if compilation failed. */
390 	re = pcre_compile(pattern,
391 					  coptions,
392 					  &error,
393 					  &erroffset,
394 					  tables);
395 
396 	if (re == NULL) {
397 		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
398 		efree(pattern);
399 		if (tables) {
400 			pefree((void*)tables, 1);
401 		}
402 		return NULL;
403 	}
404 
405 	/* If study option was specified, study the pattern and
406 	   store the result in extra for passing to pcre_exec. */
407 	if (do_study) {
408 		extra = pcre_study(re, soptions, &error);
409 		if (extra) {
410 			extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
411 		}
412 		if (error != NULL) {
413 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
414 		}
415 	} else {
416 		extra = NULL;
417 	}
418 
419 	efree(pattern);
420 
421 	/*
422 	 * If we reached cache limit, clean out the items from the head of the list;
423 	 * these are supposedly the oldest ones (but not necessarily the least used
424 	 * ones).
425 	 */
426 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
427 		int num_clean = PCRE_CACHE_SIZE / 8;
428 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
429 	}
430 
431 	/* Store the compiled pattern and extra info in the cache. */
432 	new_entry.re = re;
433 	new_entry.extra = extra;
434 	new_entry.preg_options = poptions;
435 	new_entry.compile_options = coptions;
436 #if HAVE_SETLOCALE
437 	new_entry.locale = pestrdup(locale, 1);
438 	new_entry.tables = tables;
439 #endif
440 	zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
441 						sizeof(pcre_cache_entry), (void**)&pce);
442 
443 	return pce;
444 }
445 /* }}} */
446 
447 /* {{{ pcre_get_compiled_regex
448  */
pcre_get_compiled_regex(char * regex,pcre_extra ** extra,int * preg_options TSRMLS_DC)449 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
450 {
451 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
452 
453 	if (extra) {
454 		*extra = pce ? pce->extra : NULL;
455 	}
456 	if (preg_options) {
457 		*preg_options = pce ? pce->preg_options : 0;
458 	}
459 
460 	return pce ? pce->re : NULL;
461 }
462 /* }}} */
463 
464 /* {{{ pcre_get_compiled_regex_ex
465  */
pcre_get_compiled_regex_ex(char * regex,pcre_extra ** extra,int * preg_options,int * compile_options TSRMLS_DC)466 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
467 {
468 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
469 
470 	if (extra) {
471 		*extra = pce ? pce->extra : NULL;
472 	}
473 	if (preg_options) {
474 		*preg_options = pce ? pce->preg_options : 0;
475 	}
476 	if (compile_options) {
477 		*compile_options = pce ? pce->compile_options : 0;
478 	}
479 
480 	return pce ? pce->re : NULL;
481 }
482 /* }}} */
483 
484 /* {{{ add_offset_pair */
add_offset_pair(zval * result,char * str,int len,int offset,char * name)485 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
486 {
487 	zval *match_pair;
488 
489 	ALLOC_ZVAL(match_pair);
490 	array_init(match_pair);
491 	INIT_PZVAL(match_pair);
492 
493 	/* Add (match, offset) to the return value */
494 	add_next_index_stringl(match_pair, str, len, 1);
495 	add_next_index_long(match_pair, offset);
496 
497 	if (name) {
498 		zval_add_ref(&match_pair);
499 		zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
500 	}
501 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
502 }
503 /* }}} */
504 
/* {{{ php_do_pcre_match
 * Shared body of preg_match() (global == 0) and preg_match_all()
 * (global != 0): parse the userland arguments, fetch the compiled pattern and
 * delegate to php_pcre_match_impl(). Returns FALSE to userland when argument
 * parsing or pattern compilation fails.
 */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
{
	char			 *regex;				/* Pattern text */
	int				  regex_len;
	char			 *subject;				/* Text to search */
	int				  subject_len;
	zval			 *subpats = NULL;		/* Receives the captured subpatterns */
	long			  flags = 0;			/* Match control flags */
	long			  start_offset = 0;		/* Where the search begins */
	pcre_cache_entry *pce;					/* Compiled pattern */
	/* The subpatterns argument is mandatory only for the global variant. */
	char			 *arg_spec = global ? "ssz|ll" : "ss|zll";

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, arg_spec, &regex, &regex_len,
							  &subject, &subject_len, &subpats, &flags, &start_offset) == FAILURE) {
		RETURN_FALSE;
	}

	/* Compile the pattern or take it from the cache. */
	pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC);
	if (pce == NULL) {
		RETURN_FALSE;
	}

	/* The flags argument counts as supplied once four arguments were passed. */
	php_pcre_match_impl(pce, subject, subject_len, return_value, subpats,
		global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
}
/* }}} */
531 
532 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,zval * subpats,int global,int use_flags,long flags,long start_offset TSRMLS_DC)533 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
534 	zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
535 {
536 	zval			*result_set,		/* Holds a set of subpatterns after
537 										   a global match */
538 				   **match_sets = NULL;	/* An array of sets of matches for each
539 										   subpattern after a global match */
540 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
541 	pcre_extra		 extra_data;		/* Used locally for exec options */
542 	int				 exoptions = 0;		/* Execution options */
543 	int				 count = 0;			/* Count of matched subpatterns */
544 	int				*offsets;			/* Array of subpattern offsets */
545 	int				 num_subpats;		/* Number of captured subpatterns */
546 	int				 size_offsets;		/* Size of the offsets array */
547 	int				 matched;			/* Has anything matched */
548 	int				 g_notempty = 0;	/* If the match should not be empty */
549 	const char	   **stringlist;		/* Holds list of subpatterns */
550 	char 		   **subpat_names;		/* Array for named subpatterns */
551 	int				 i, rc;
552 	int				 subpats_order;		/* Order of subpattern matches */
553 	int				 offset_capture;    /* Capture match offsets: yes/no */
554 
555 	/* Overwrite the passed-in value for subpatterns with an empty array. */
556 	if (subpats != NULL) {
557 		zval_dtor(subpats);
558 		array_init(subpats);
559 	}
560 
561 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
562 
563 	if (use_flags) {
564 		offset_capture = flags & PREG_OFFSET_CAPTURE;
565 
566 		/*
567 		 * subpats_order is pre-set to pattern mode so we change it only if
568 		 * necessary.
569 		 */
570 		if (flags & 0xff) {
571 			subpats_order = flags & 0xff;
572 		}
573 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
574 			(!global && subpats_order != 0)) {
575 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
576 			return;
577 		}
578 	} else {
579 		offset_capture = 0;
580 	}
581 
582 	/* Negative offset counts from the end of the string. */
583 	if (start_offset < 0) {
584 		start_offset = subject_len + start_offset;
585 		if (start_offset < 0) {
586 			start_offset = 0;
587 		}
588 	}
589 
590 	if (extra == NULL) {
591 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
592 		extra = &extra_data;
593 	}
594 	extra->match_limit = PCRE_G(backtrack_limit);
595 	extra->match_limit_recursion = PCRE_G(recursion_limit);
596 
597 	/* Calculate the size of the offsets array, and allocate memory for it. */
598 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
599 	if (rc < 0) {
600 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
601 		RETURN_FALSE;
602 	}
603 	num_subpats++;
604 	size_offsets = num_subpats * 3;
605 
606 	/*
607 	 * Build a mapping from subpattern numbers to their names. We will always
608 	 * allocate the table, even though there may be no named subpatterns. This
609 	 * avoids somewhat more complicated logic in the inner loops.
610 	 */
611 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
612 	if (!subpat_names) {
613 		RETURN_FALSE;
614 	}
615 
616 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
617 
618 	/* Allocate match sets array and initialize the values. */
619 	if (global && subpats_order == PREG_PATTERN_ORDER) {
620 		match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
621 		for (i=0; i<num_subpats; i++) {
622 			ALLOC_ZVAL(match_sets[i]);
623 			array_init(match_sets[i]);
624 			INIT_PZVAL(match_sets[i]);
625 		}
626 	}
627 
628 	matched = 0;
629 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
630 
631 	do {
632 		/* Execute the regular expression. */
633 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
634 						  exoptions|g_notempty, offsets, size_offsets);
635 
636 		/* the string was already proved to be valid UTF-8 */
637 		exoptions |= PCRE_NO_UTF8_CHECK;
638 
639 		/* Check for too many substrings condition. */
640 		if (count == 0) {
641 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
642 			count = size_offsets/3;
643 		}
644 
645 		/* If something has matched */
646 		if (count > 0) {
647 			matched++;
648 
649 			/* If subpatterns array has been passed, fill it in with values. */
650 			if (subpats != NULL) {
651 				/* Try to get the list of substrings and display a warning if failed. */
652 				if (pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
653 					efree(subpat_names);
654 					efree(offsets);
655 					if (match_sets) efree(match_sets);
656 					php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
657 					RETURN_FALSE;
658 				}
659 
660 				if (global) {	/* global pattern matching */
661 					if (subpats_order == PREG_PATTERN_ORDER) {
662 						/* For each subpattern, insert it into the appropriate array. */
663 						for (i = 0; i < count; i++) {
664 							if (offset_capture) {
665 								add_offset_pair(match_sets[i], (char *)stringlist[i],
666 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
667 							} else {
668 								add_next_index_stringl(match_sets[i], (char *)stringlist[i],
669 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
670 							}
671 						}
672 						/*
673 						 * If the number of captured subpatterns on this run is
674 						 * less than the total possible number, pad the result
675 						 * arrays with empty strings.
676 						 */
677 						if (count < num_subpats) {
678 							for (; i < num_subpats; i++) {
679 								add_next_index_string(match_sets[i], "", 1);
680 							}
681 						}
682 					} else {
683 						/* Allocate the result set array */
684 						ALLOC_ZVAL(result_set);
685 						array_init(result_set);
686 						INIT_PZVAL(result_set);
687 
688 						/* Add all the subpatterns to it */
689 						for (i = 0; i < count; i++) {
690 							if (offset_capture) {
691 								add_offset_pair(result_set, (char *)stringlist[i],
692 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
693 							} else {
694 								if (subpat_names[i]) {
695 									add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
696 														   offsets[(i<<1)+1] - offsets[i<<1], 1);
697 								}
698 								add_next_index_stringl(result_set, (char *)stringlist[i],
699 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
700 							}
701 						}
702 						/* And add it to the output array */
703 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
704 					}
705 				} else {			/* single pattern matching */
706 					/* For each subpattern, insert it into the subpatterns array. */
707 					for (i = 0; i < count; i++) {
708 						if (offset_capture) {
709 							add_offset_pair(subpats, (char *)stringlist[i],
710 											offsets[(i<<1)+1] - offsets[i<<1],
711 											offsets[i<<1], subpat_names[i]);
712 						} else {
713 							if (subpat_names[i]) {
714 								add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
715 												  offsets[(i<<1)+1] - offsets[i<<1], 1);
716 							}
717 							add_next_index_stringl(subpats, (char *)stringlist[i],
718 												   offsets[(i<<1)+1] - offsets[i<<1], 1);
719 						}
720 					}
721 				}
722 
723 				pcre_free((void *) stringlist);
724 			}
725 		} else if (count == PCRE_ERROR_NOMATCH) {
726 			/* If we previously set PCRE_NOTEMPTY after a null match,
727 			   this is not necessarily the end. We need to advance
728 			   the start offset, and continue. Fudge the offset values
729 			   to achieve this, unless we're already at the end of the string. */
730 			if (g_notempty != 0 && start_offset < subject_len) {
731 				offsets[0] = start_offset;
732 				offsets[1] = start_offset + 1;
733 			} else
734 				break;
735 		} else {
736 			pcre_handle_exec_error(count TSRMLS_CC);
737 			break;
738 		}
739 
740 		/* If we have matched an empty string, mimic what Perl's /g options does.
741 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
742 		   the match again at the same point. If this fails (picked up above) we
743 		   advance to the next character. */
744 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
745 
746 		/* Advance to the position right after the last full match */
747 		start_offset = offsets[1];
748 	} while (global);
749 
750 	/* Add the match sets to the output array and clean up */
751 	if (global && subpats_order == PREG_PATTERN_ORDER) {
752 		for (i = 0; i < num_subpats; i++) {
753 			if (subpat_names[i]) {
754 				zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
755 								 strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
756 				Z_ADDREF_P(match_sets[i]);
757 			}
758 			zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
759 		}
760 		efree(match_sets);
761 	}
762 
763 	efree(offsets);
764 	efree(subpat_names);
765 
766 	/* Did we encounter an error? */
767 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
768 		RETVAL_LONG(matched);
769 	} else {
770 		RETVAL_FALSE;
771 	}
772 }
773 /* }}} */
774 
775 /* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
776    Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)777 static PHP_FUNCTION(preg_match)
778 {
779 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
780 }
781 /* }}} */
782 
783 /* {{{ proto int preg_match_all(string pattern, string subject, array &subpatterns [, int flags [, int offset]])
784    Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)785 static PHP_FUNCTION(preg_match_all)
786 {
787 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
788 }
789 /* }}} */
790 
/* {{{ preg_get_backref
 * Parse a one- or two-digit back-reference at *str. *str must point at the
 * introducing character; the accepted shapes are <c>N, <c>NN and, only when
 * the introducer is '$', ${N} and ${NN}.
 *
 * On success the number is stored in *backref, *str is moved past the
 * reference and 1 is returned. On failure 0 is returned and *str is left
 * alone; *backref may already have been written if digits were seen.
 */
static int preg_get_backref(char **str, int *backref)
{
	char *pos = *str;
	int braced;

	/* An introducer with nothing after it cannot be a reference. */
	if (pos[1] == 0) {
		return 0;
	}

	/* Step over the introducer, and over the '{' of the "${" form. */
	braced = (pos[0] == '$' && pos[1] == '{');
	pos += braced ? 2 : 1;

	/* First digit is mandatory. */
	if (*pos < '0' || *pos > '9') {
		return 0;
	}
	*backref = *pos - '0';
	pos++;

	/* Second digit is optional. */
	if (*pos >= '0' && *pos <= '9') {
		*backref = *backref * 10 + *pos - '0';
		pos++;
	}

	/* The braced form has to be closed. */
	if (braced) {
		if (*pos != '}') {
			return 0;
		}
		pos++;
	}

	*str = pos;
	return 1;
}
/* }}} */
829 
830 /* {{{ preg_do_repl_func
831  */
preg_do_repl_func(zval * function,char * subject,int * offsets,char ** subpat_names,int count,char ** result TSRMLS_DC)832 static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, char **result TSRMLS_DC)
833 {
834 	zval		*retval_ptr;		/* Function return value */
835 	zval	   **args[1];			/* Argument to pass to function */
836 	zval		*subpats;			/* Captured subpatterns */
837 	int			 result_len;		/* Return value length */
838 	int			 i;
839 
840 	MAKE_STD_ZVAL(subpats);
841 	array_init(subpats);
842 	for (i = 0; i < count; i++) {
843 		if (subpat_names[i]) {
844 			add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
845 		}
846 		add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
847 	}
848 	args[0] = &subpats;
849 
850 	if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
851 		convert_to_string_ex(&retval_ptr);
852 		*result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
853 		result_len = Z_STRLEN_P(retval_ptr);
854 		zval_ptr_dtor(&retval_ptr);
855 	} else {
856 		if (!EG(exception)) {
857 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
858 		}
859 		result_len = offsets[1] - offsets[0];
860 		*result = estrndup(&subject[offsets[0]], result_len);
861 	}
862 
863 	zval_ptr_dtor(&subpats);
864 
865 	return result_len;
866 }
867 /* }}} */
868 
869 /* {{{ preg_do_eval
870  */
preg_do_eval(char * eval_str,int eval_str_len,char * subject,int * offsets,int count,char ** result TSRMLS_DC)871 static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
872 						int *offsets, int count, char **result TSRMLS_DC)
873 {
874 	zval		 retval;			/* Return value from evaluation */
875 	char		*eval_str_end,		/* End of eval string */
876 				*match,				/* Current match for a backref */
877 				*esc_match,			/* Quote-escaped match */
878 				*walk,				/* Used to walk the code string */
879 				*segment,			/* Start of segment to append while walking */
880 				 walk_last;			/* Last walked character */
881 	int			 match_len;			/* Length of the match */
882 	int			 esc_match_len;		/* Length of the quote-escaped match */
883 	int			 result_len;		/* Length of the result of the evaluation */
884 	int			 backref;			/* Current backref */
885 	char        *compiled_string_description;
886 	smart_str    code = {0};
887 
888 	eval_str_end = eval_str + eval_str_len;
889 	walk = segment = eval_str;
890 	walk_last = 0;
891 
892 	while (walk < eval_str_end) {
893 		/* If found a backreference.. */
894 		if ('\\' == *walk || '$' == *walk) {
895 			smart_str_appendl(&code, segment, walk - segment);
896 			if (walk_last == '\\') {
897 				code.c[code.len-1] = *walk++;
898 				segment = walk;
899 				walk_last = 0;
900 				continue;
901 			}
902 			segment = walk;
903 			if (preg_get_backref(&walk, &backref)) {
904 				if (backref < count) {
905 					/* Find the corresponding string match and substitute it
906 					   in instead of the backref */
907 					match = subject + offsets[backref<<1];
908 					match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
909 					if (match_len) {
910 						esc_match = php_addslashes_ex(match, match_len, &esc_match_len, 0, 1 TSRMLS_CC);
911 					} else {
912 						esc_match = match;
913 						esc_match_len = 0;
914 					}
915 				} else {
916 					esc_match = "";
917 					esc_match_len = 0;
918 				}
919 				smart_str_appendl(&code, esc_match, esc_match_len);
920 
921 				segment = walk;
922 
923 				/* Clean up and reassign */
924 				if (esc_match_len)
925 					efree(esc_match);
926 				continue;
927 			}
928 		}
929 		walk++;
930 		walk_last = walk[-1];
931 	}
932 	smart_str_appendl(&code, segment, walk - segment);
933 	smart_str_0(&code);
934 
935 	compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
936 	/* Run the code */
937 	if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
938 		efree(compiled_string_description);
939 		php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
940 		/* zend_error() does not return in this case */
941 	}
942 	efree(compiled_string_description);
943 	convert_to_string(&retval);
944 
945 	/* Save the return value and its length */
946 	*result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
947 	result_len = Z_STRLEN(retval);
948 
949 	/* Clean up */
950 	zval_dtor(&retval);
951 	smart_str_free(&code);
952 
953 	return result_len;
954 }
955 /* }}} */
956 
957 /* {{{ php_pcre_replace
958  */
php_pcre_replace(char * regex,int regex_len,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)959 PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
960 							  char *subject, int subject_len,
961 							  zval *replace_val, int is_callable_replace,
962 							  int *result_len, int limit, int *replace_count TSRMLS_DC)
963 {
964 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
965 
966 	/* Compile regex or get it from cache. */
967 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
968 		return NULL;
969 	}
970 
971 	return php_pcre_replace_impl(pce, subject, subject_len, replace_val,
972 		is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
973 }
974 /* }}} */
975 
976 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)977 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val,
978 	int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
979 {
980 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
981 	pcre_extra		 extra_data;		/* Used locally for exec options */
982 	int				 exoptions = 0;		/* Execution options */
983 	int				 count = 0;			/* Count of matched subpatterns */
984 	int				*offsets;			/* Array of subpattern offsets */
985 	char 			**subpat_names;		/* Array for named subpatterns */
986 	int				 num_subpats;		/* Number of captured subpatterns */
987 	int				 size_offsets;		/* Size of the offsets array */
988 	int				 new_len;			/* Length of needed storage */
989 	int				 alloc_len;			/* Actual allocated length */
990 	int				 eval_result_len=0;	/* Length of the eval'ed or
991 										   function-returned string */
992 	int				 match_len;			/* Length of the current match */
993 	int				 backref;			/* Backreference number */
994 	int				 eval;				/* If the replacement string should be eval'ed */
995 	int				 start_offset;		/* Where the new search starts */
996 	int				 g_notempty=0;		/* If the match should not be empty */
997 	int				 replace_len=0;		/* Length of replacement string */
998 	char			*result,			/* Result of replacement */
999 					*replace=NULL,		/* Replacement string */
1000 					*new_buf,			/* Temporary buffer for re-allocation */
1001 					*walkbuf,			/* Location of current replacement in the result */
1002 					*walk,				/* Used to walk the replacement string */
1003 					*match,				/* The current match */
1004 					*piece,				/* The current piece of subject */
1005 					*replace_end=NULL,	/* End of replacement string */
1006 					*eval_result,		/* Result of eval or custom function */
1007 					 walk_last;			/* Last walked character */
1008 	int				 rc;
1009 
1010 	if (extra == NULL) {
1011 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1012 		extra = &extra_data;
1013 	}
1014 	extra->match_limit = PCRE_G(backtrack_limit);
1015 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1016 
1017 	eval = pce->preg_options & PREG_REPLACE_EVAL;
1018 	if (is_callable_replace) {
1019 		if (eval) {
1020 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
1021 			return NULL;
1022 		}
1023 	} else {
1024 		replace = Z_STRVAL_P(replace_val);
1025 		replace_len = Z_STRLEN_P(replace_val);
1026 		replace_end = replace + replace_len;
1027 	}
1028 
1029 	/* Calculate the size of the offsets array, and allocate memory for it. */
1030 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
1031 	if (rc < 0) {
1032 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1033 		return NULL;
1034 	}
1035 	num_subpats++;
1036 	size_offsets = num_subpats * 3;
1037 
1038 	/*
1039 	 * Build a mapping from subpattern numbers to their names. We will always
1040 	 * allocate the table, even though there may be no named subpatterns. This
1041 	 * avoids somewhat more complicated logic in the inner loops.
1042 	 */
1043 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
1044 	if (!subpat_names) {
1045 		return NULL;
1046 	}
1047 
1048 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1049 
1050 	alloc_len = 2 * subject_len + 1;
1051 	result = safe_emalloc(alloc_len, sizeof(char), 0);
1052 
1053 	/* Initialize */
1054 	match = NULL;
1055 	*result_len = 0;
1056 	start_offset = 0;
1057 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1058 
1059 	while (1) {
1060 		/* Execute the regular expression. */
1061 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1062 						  exoptions|g_notempty, offsets, size_offsets);
1063 
1064 		/* the string was already proved to be valid UTF-8 */
1065 		exoptions |= PCRE_NO_UTF8_CHECK;
1066 
1067 		/* Check for too many substrings condition. */
1068 		if (count == 0) {
1069 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1070 			count = size_offsets/3;
1071 		}
1072 
1073 		piece = subject + start_offset;
1074 
1075 		if (count > 0 && (limit == -1 || limit > 0)) {
1076 			if (replace_count) {
1077 				++*replace_count;
1078 			}
1079 			/* Set the match location in subject */
1080 			match = subject + offsets[0];
1081 
1082 			new_len = *result_len + offsets[0] - start_offset; /* part before the match */
1083 
1084 			/* If evaluating, do it and add the return string's length */
1085 			if (eval) {
1086 				eval_result_len = preg_do_eval(replace, replace_len, subject,
1087 											   offsets, count, &eval_result TSRMLS_CC);
1088 				new_len += eval_result_len;
1089 			} else if (is_callable_replace) {
1090 				/* Use custom function to get replacement string and its length. */
1091 				eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, &eval_result TSRMLS_CC);
1092 				new_len += eval_result_len;
1093 			} else { /* do regular substitution */
1094 				walk = replace;
1095 				walk_last = 0;
1096 				while (walk < replace_end) {
1097 					if ('\\' == *walk || '$' == *walk) {
1098 						if (walk_last == '\\') {
1099 							walk++;
1100 							walk_last = 0;
1101 							continue;
1102 						}
1103 						if (preg_get_backref(&walk, &backref)) {
1104 							if (backref < count)
1105 								new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1106 							continue;
1107 						}
1108 					}
1109 					new_len++;
1110 					walk++;
1111 					walk_last = walk[-1];
1112 				}
1113 			}
1114 
1115 			if (new_len + 1 > alloc_len) {
1116 				alloc_len = 1 + alloc_len + 2 * new_len;
1117 				new_buf = emalloc(alloc_len);
1118 				memcpy(new_buf, result, *result_len);
1119 				efree(result);
1120 				result = new_buf;
1121 			}
1122 			/* copy the part of the string before the match */
1123 			memcpy(&result[*result_len], piece, match-piece);
1124 			*result_len += match-piece;
1125 
1126 			/* copy replacement and backrefs */
1127 			walkbuf = result + *result_len;
1128 
1129 			/* If evaluating or using custom function, copy result to the buffer
1130 			 * and clean up. */
1131 			if (eval || is_callable_replace) {
1132 				memcpy(walkbuf, eval_result, eval_result_len);
1133 				*result_len += eval_result_len;
1134 				STR_FREE(eval_result);
1135 			} else { /* do regular backreference copying */
1136 				walk = replace;
1137 				walk_last = 0;
1138 				while (walk < replace_end) {
1139 					if ('\\' == *walk || '$' == *walk) {
1140 						if (walk_last == '\\') {
1141 							*(walkbuf-1) = *walk++;
1142 							walk_last = 0;
1143 							continue;
1144 						}
1145 						if (preg_get_backref(&walk, &backref)) {
1146 							if (backref < count) {
1147 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1148 								memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1149 								walkbuf += match_len;
1150 							}
1151 							continue;
1152 						}
1153 					}
1154 					*walkbuf++ = *walk++;
1155 					walk_last = walk[-1];
1156 				}
1157 				*walkbuf = '\0';
1158 				/* increment the result length by how much we've added to the string */
1159 				*result_len += walkbuf - (result + *result_len);
1160 			}
1161 
1162 			if (limit != -1)
1163 				limit--;
1164 
1165 		} else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1166 			/* If we previously set PCRE_NOTEMPTY after a null match,
1167 			   this is not necessarily the end. We need to advance
1168 			   the start offset, and continue. Fudge the offset values
1169 			   to achieve this, unless we're already at the end of the string. */
1170 			if (g_notempty != 0 && start_offset < subject_len) {
1171 				offsets[0] = start_offset;
1172 				offsets[1] = start_offset + 1;
1173 				memcpy(&result[*result_len], piece, 1);
1174 				(*result_len)++;
1175 			} else {
1176 				new_len = *result_len + subject_len - start_offset;
1177 				if (new_len + 1 > alloc_len) {
1178 					alloc_len = new_len + 1; /* now we know exactly how long it is */
1179 					new_buf = safe_emalloc(alloc_len, sizeof(char), 0);
1180 					memcpy(new_buf, result, *result_len);
1181 					efree(result);
1182 					result = new_buf;
1183 				}
1184 				/* stick that last bit of string on our output */
1185 				memcpy(&result[*result_len], piece, subject_len - start_offset);
1186 				*result_len += subject_len - start_offset;
1187 				result[*result_len] = '\0';
1188 				break;
1189 			}
1190 		} else {
1191 			pcre_handle_exec_error(count TSRMLS_CC);
1192 			efree(result);
1193 			result = NULL;
1194 			break;
1195 		}
1196 
1197 		/* If we have matched an empty string, mimic what Perl's /g options does.
1198 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1199 		   the match again at the same point. If this fails (picked up above) we
1200 		   advance to the next character. */
1201 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1202 
1203 		/* Advance to the next piece. */
1204 		start_offset = offsets[1];
1205 	}
1206 
1207 	efree(offsets);
1208 	efree(subpat_names);
1209 
1210 	return result;
1211 }
1212 /* }}} */
1213 
1214 /* {{{ php_replace_in_subject
1215  */
php_replace_in_subject(zval * regex,zval * replace,zval ** subject,int * result_len,int limit,int is_callable_replace,int * replace_count TSRMLS_DC)1216 static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
1217 {
1218 	zval		**regex_entry,
1219 				**replace_entry = NULL,
1220 				 *replace_value,
1221 				  empty_replace;
1222 	char		*subject_value,
1223 				*result;
1224 	int			 subject_len;
1225 
1226 	/* Make sure we're dealing with strings. */
1227 	convert_to_string_ex(subject);
1228 	/* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1229 	ZVAL_STRINGL(&empty_replace, "", 0, 0);
1230 
1231 	/* If regex is an array */
1232 	if (Z_TYPE_P(regex) == IS_ARRAY) {
1233 		/* Duplicate subject string for repeated replacement */
1234 		subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
1235 		subject_len = Z_STRLEN_PP(subject);
1236 		*result_len = subject_len;
1237 
1238 		zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
1239 
1240 		replace_value = replace;
1241 		if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
1242 			zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
1243 
1244 		/* For each entry in the regex array, get the entry */
1245 		while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)&regex_entry) == SUCCESS) {
1246 			/* Make sure we're dealing with strings. */
1247 			convert_to_string_ex(regex_entry);
1248 
1249 			/* If replace is an array and not a callable construct */
1250 			if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1251 				/* Get current entry */
1252 				if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
1253 					if (!is_callable_replace) {
1254 						convert_to_string_ex(replace_entry);
1255 					}
1256 					replace_value = *replace_entry;
1257 					zend_hash_move_forward(Z_ARRVAL_P(replace));
1258 				} else {
1259 					/* We've run out of replacement strings, so use an empty one */
1260 					replace_value = &empty_replace;
1261 				}
1262 			}
1263 
1264 			/* Do the actual replacement and put the result back into subject_value
1265 			   for further replacements. */
1266 			if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
1267 										   Z_STRLEN_PP(regex_entry),
1268 										   subject_value,
1269 										   subject_len,
1270 										   replace_value,
1271 										   is_callable_replace,
1272 										   result_len,
1273 										   limit,
1274 										   replace_count TSRMLS_CC)) != NULL) {
1275 				efree(subject_value);
1276 				subject_value = result;
1277 				subject_len = *result_len;
1278 			} else {
1279 				efree(subject_value);
1280 				return NULL;
1281 			}
1282 
1283 			zend_hash_move_forward(Z_ARRVAL_P(regex));
1284 		}
1285 
1286 		return subject_value;
1287 	} else {
1288 		result = php_pcre_replace(Z_STRVAL_P(regex),
1289 								  Z_STRLEN_P(regex),
1290 								  Z_STRVAL_PP(subject),
1291 								  Z_STRLEN_PP(subject),
1292 								  replace,
1293 								  is_callable_replace,
1294 								  result_len,
1295 								  limit,
1296 								  replace_count TSRMLS_CC);
1297 		return result;
1298 	}
1299 }
1300 /* }}} */
1301 
1302 /* {{{ preg_replace_impl
1303  */
preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS,int is_callable_replace,int is_filter)1304 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
1305 {
1306 	zval		   **regex,
1307 				   **replace,
1308 				   **subject,
1309 				   **subject_entry,
1310 				   **zcount = NULL;
1311 	char			*result;
1312 	int				 result_len;
1313 	int				 limit_val = -1;
1314 	long			limit = -1;
1315 	char			*string_key;
1316 	ulong			 num_key;
1317 	char			*callback_name;
1318 	int				 replace_count=0, old_replace_count;
1319 
1320 	/* Get function parameters and do error-checking. */
1321 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", &regex, &replace, &subject, &limit, &zcount) == FAILURE) {
1322 		return;
1323 	}
1324 
1325 	if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
1326 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1327 		RETURN_FALSE;
1328 	}
1329 
1330 	SEPARATE_ZVAL(replace);
1331 	if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
1332 		convert_to_string_ex(replace);
1333 	}
1334 	if (is_callable_replace) {
1335 		if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
1336 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
1337 			efree(callback_name);
1338 			MAKE_COPY_ZVAL(subject, return_value);
1339 			return;
1340 		}
1341 		efree(callback_name);
1342 	}
1343 
1344 	SEPARATE_ZVAL(regex);
1345 	SEPARATE_ZVAL(subject);
1346 
1347 	if (ZEND_NUM_ARGS() > 3) {
1348 		limit_val = limit;
1349 	}
1350 
1351 	if (Z_TYPE_PP(regex) != IS_ARRAY)
1352 		convert_to_string_ex(regex);
1353 
1354 	/* if subject is an array */
1355 	if (Z_TYPE_PP(subject) == IS_ARRAY) {
1356 		array_init(return_value);
1357 		zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
1358 
1359 		/* For each subject entry, convert it to string, then perform replacement
1360 		   and add the result to the return_value array. */
1361 		while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
1362 			SEPARATE_ZVAL(subject_entry);
1363 			old_replace_count = replace_count;
1364 			if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1365 				if (!is_filter || replace_count > old_replace_count) {
1366 					/* Add to return array */
1367 					switch(zend_hash_get_current_key(Z_ARRVAL_PP(subject), &string_key, &num_key, 0))
1368 					{
1369 					case HASH_KEY_IS_STRING:
1370 						add_assoc_stringl(return_value, string_key, result, result_len, 0);
1371 						break;
1372 
1373 					case HASH_KEY_IS_LONG:
1374 						add_index_stringl(return_value, num_key, result, result_len, 0);
1375 						break;
1376 					}
1377 				} else {
1378 					efree(result);
1379 				}
1380 			}
1381 
1382 			zend_hash_move_forward(Z_ARRVAL_PP(subject));
1383 		}
1384 	} else {	/* if subject is not an array */
1385 		old_replace_count = replace_count;
1386 		if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1387 			if (!is_filter || replace_count > old_replace_count) {
1388 				RETVAL_STRINGL(result, result_len, 0);
1389 			} else {
1390 				efree(result);
1391 			}
1392 		}
1393 	}
1394 	if (ZEND_NUM_ARGS() > 4) {
1395 		zval_dtor(*zcount);
1396 		ZVAL_LONG(*zcount, replace_count);
1397 	}
1398 
1399 }
1400 /* }}} */
1401 
1402 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1403    Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)1404 static PHP_FUNCTION(preg_replace)
1405 {
1406 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1407 }
1408 /* }}} */
1409 
1410 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1411    Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)1412 static PHP_FUNCTION(preg_replace_callback)
1413 {
1414 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
1415 }
1416 /* }}} */
1417 
1418 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1419    Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)1420 static PHP_FUNCTION(preg_filter)
1421 {
1422 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1423 }
1424 /* }}} */
1425 
1426 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1427    Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)1428 static PHP_FUNCTION(preg_split)
1429 {
1430 	char				*regex;			/* Regular expression */
1431 	char				*subject;		/* String to match against */
1432 	int					 regex_len;
1433 	int					 subject_len;
1434 	long				 limit_val = -1;/* Integer value of limit */
1435 	long				 flags = 0;		/* Match control flags */
1436 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1437 
1438 	/* Get function parameters and do error checking */
1439 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", &regex, &regex_len,
1440 							  &subject, &subject_len, &limit_val, &flags) == FAILURE) {
1441 		RETURN_FALSE;
1442 	}
1443 
1444 	/* Compile regex or get it from cache. */
1445 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1446 		RETURN_FALSE;
1447 	}
1448 
1449 	php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
1450 }
1451 /* }}} */
1452 
1453 /* {{{ php_pcre_split
1454  */
php_pcre_split_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,long limit_val,long flags TSRMLS_DC)1455 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
1456 	long limit_val, long flags TSRMLS_DC)
1457 {
1458 	pcre_extra		*extra = NULL;		/* Holds results of studying */
1459 	pcre			*re_bump = NULL;	/* Regex instance for empty matches */
1460 	pcre_extra		*extra_bump = NULL;	/* Almost dummy */
1461 	pcre_extra		 extra_data;		/* Used locally for exec options */
1462 	int				*offsets;			/* Array of subpattern offsets */
1463 	int				 size_offsets;		/* Size of the offsets array */
1464 	int				 exoptions = 0;		/* Execution options */
1465 	int				 count = 0;			/* Count of matched subpatterns */
1466 	int				 start_offset;		/* Where the new search starts */
1467 	int				 next_offset;		/* End of the last delimiter match + 1 */
1468 	int				 g_notempty = 0;	/* If the match should not be empty */
1469 	char			*last_match;		/* Location of last match */
1470 	int				 rc;
1471 	int				 no_empty;			/* If NO_EMPTY flag is set */
1472 	int				 delim_capture; 	/* If delimiters should be captured */
1473 	int				 offset_capture;	/* If offsets should be captured */
1474 
1475 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
1476 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1477 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1478 
1479 	if (limit_val == 0) {
1480 		limit_val = -1;
1481 	}
1482 
1483 	if (extra == NULL) {
1484 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1485 		extra = &extra_data;
1486 	}
1487 	extra->match_limit = PCRE_G(backtrack_limit);
1488 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1489 
1490 	/* Initialize return value */
1491 	array_init(return_value);
1492 
1493 	/* Calculate the size of the offsets array, and allocate memory for it. */
1494 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1495 	if (rc < 0) {
1496 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1497 		RETURN_FALSE;
1498 	}
1499 	size_offsets = (size_offsets + 1) * 3;
1500 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1501 
1502 	/* Start at the beginning of the string */
1503 	start_offset = 0;
1504 	next_offset = 0;
1505 	last_match = subject;
1506 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1507 
1508 	/* Get next piece if no limit or limit not yet reached and something matched*/
1509 	while ((limit_val == -1 || limit_val > 1)) {
1510 		count = pcre_exec(pce->re, extra, subject,
1511 						  subject_len, start_offset,
1512 						  exoptions|g_notempty, offsets, size_offsets);
1513 
1514 		/* the string was already proved to be valid UTF-8 */
1515 		exoptions |= PCRE_NO_UTF8_CHECK;
1516 
1517 		/* Check for too many substrings condition. */
1518 		if (count == 0) {
1519 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1520 			count = size_offsets/3;
1521 		}
1522 
1523 		/* If something matched */
1524 		if (count > 0) {
1525 			if (!no_empty || &subject[offsets[0]] != last_match) {
1526 
1527 				if (offset_capture) {
1528 					/* Add (match, offset) pair to the return value */
1529 					add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
1530 				} else {
1531 					/* Add the piece to the return value */
1532 					add_next_index_stringl(return_value, last_match,
1533 								   	   &subject[offsets[0]]-last_match, 1);
1534 				}
1535 
1536 				/* One less left to do */
1537 				if (limit_val != -1)
1538 					limit_val--;
1539 			}
1540 
1541 			last_match = &subject[offsets[1]];
1542 			next_offset = offsets[1];
1543 
1544 			if (delim_capture) {
1545 				int i, match_len;
1546 				for (i = 1; i < count; i++) {
1547 					match_len = offsets[(i<<1)+1] - offsets[i<<1];
1548 					/* If we have matched a delimiter */
1549 					if (!no_empty || match_len > 0) {
1550 						if (offset_capture) {
1551 							add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
1552 						} else {
1553 							add_next_index_stringl(return_value,
1554 												   &subject[offsets[i<<1]],
1555 												   match_len, 1);
1556 						}
1557 					}
1558 				}
1559 			}
1560 		} else if (count == PCRE_ERROR_NOMATCH) {
1561 			/* If we previously set PCRE_NOTEMPTY after a null match,
1562 			   this is not necessarily the end. We need to advance
1563 			   the start offset, and continue. Fudge the offset values
1564 			   to achieve this, unless we're already at the end of the string. */
1565 			if (g_notempty != 0 && start_offset < subject_len) {
1566 				if (pce->compile_options & PCRE_UTF8) {
1567 					if (re_bump == NULL) {
1568 						int dummy;
1569 
1570 						if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
1571 							RETURN_FALSE;
1572 						}
1573 					}
1574 					count = pcre_exec(re_bump, extra_bump, subject,
1575 							  subject_len, start_offset,
1576 							  exoptions, offsets, size_offsets);
1577 					if (count < 1) {
1578 						php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
1579 						RETURN_FALSE;
1580 					}
1581 				} else {
1582 					offsets[0] = start_offset;
1583 					offsets[1] = start_offset + 1;
1584 				}
1585 			} else
1586 				break;
1587 		} else {
1588 			pcre_handle_exec_error(count TSRMLS_CC);
1589 			break;
1590 		}
1591 
1592 		/* If we have matched an empty string, mimic what Perl's /g options does.
1593 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1594 		   the match again at the same point. If this fails (picked up above) we
1595 		   advance to the next character. */
1596 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1597 
1598 		/* Advance to the position right after the last full match */
1599 		start_offset = offsets[1];
1600 	}
1601 
1602 
1603 	start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
1604 
1605 	if (!no_empty || start_offset < subject_len)
1606 	{
1607 		if (offset_capture) {
1608 			/* Add the last (match, offset) pair to the return value */
1609 			add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
1610 		} else {
1611 			/* Add the last piece to the return value */
1612 			add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
1613 		}
1614 	}
1615 
1616 
1617 	/* Clean up */
1618 	efree(offsets);
1619 }
1620 /* }}} */
1621 
1622 /* {{{ proto string preg_quote(string str [, string delim_char])
1623    Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)1624 static PHP_FUNCTION(preg_quote)
1625 {
1626 	int		 in_str_len;
1627 	char	*in_str;		/* Input string argument */
1628 	char	*in_str_end;    /* End of the input string */
1629 	int		 delim_len = 0;
1630 	char	*delim = NULL;	/* Additional delimiter argument */
1631 	char	*out_str,		/* Output string with quoted characters */
1632 		 	*p,				/* Iterator for input string */
1633 			*q,				/* Iterator for output string */
1634 			 delim_char=0,	/* Delimiter character to be quoted */
1635 			 c;				/* Current character */
1636 	zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1637 
1638 	/* Get the arguments and check for errors */
1639 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
1640 							  &delim, &delim_len) == FAILURE) {
1641 		return;
1642 	}
1643 
1644 	in_str_end = in_str + in_str_len;
1645 
1646 	/* Nothing to do if we got an empty string */
1647 	if (in_str == in_str_end) {
1648 		RETURN_EMPTY_STRING();
1649 	}
1650 
1651 	if (delim && *delim) {
1652 		delim_char = delim[0];
1653 		quote_delim = 1;
1654 	}
1655 
1656 	/* Allocate enough memory so that even if each character
1657 	   is quoted, we won't run out of room */
1658 	out_str = safe_emalloc(4, in_str_len, 1);
1659 
1660 	/* Go through the string and quote necessary characters */
1661 	for(p = in_str, q = out_str; p != in_str_end; p++) {
1662 		c = *p;
1663 		switch(c) {
1664 			case '.':
1665 			case '\\':
1666 			case '+':
1667 			case '*':
1668 			case '?':
1669 			case '[':
1670 			case '^':
1671 			case ']':
1672 			case '$':
1673 			case '(':
1674 			case ')':
1675 			case '{':
1676 			case '}':
1677 			case '=':
1678 			case '!':
1679 			case '>':
1680 			case '<':
1681 			case '|':
1682 			case ':':
1683 			case '-':
1684 				*q++ = '\\';
1685 				*q++ = c;
1686 				break;
1687 
1688 			case '\0':
1689 				*q++ = '\\';
1690 				*q++ = '0';
1691 				*q++ = '0';
1692 				*q++ = '0';
1693 				break;
1694 
1695 			default:
1696 				if (quote_delim && c == delim_char)
1697 					*q++ = '\\';
1698 				*q++ = c;
1699 				break;
1700 		}
1701 	}
1702 	*q = '\0';
1703 
1704 	/* Reallocate string and return it */
1705 	RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
1706 }
1707 /* }}} */
1708 
1709 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1710    Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)1711 static PHP_FUNCTION(preg_grep)
1712 {
1713 	char				*regex;			/* Regular expression */
1714 	int				 	 regex_len;
1715 	zval				*input;			/* Input array */
1716 	long				 flags = 0;		/* Match control flags */
1717 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1718 
1719 	/* Get arguments and do error checking */
1720 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", &regex, &regex_len,
1721 							  &input, &flags) == FAILURE) {
1722 		return;
1723 	}
1724 
1725 	/* Compile regex or get it from cache. */
1726 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1727 		RETURN_FALSE;
1728 	}
1729 
1730 	php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
1731 }
1732 /* }}} */
1733 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,long flags TSRMLS_DC)1734 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
1735 {
1736 	zval		   **entry;				/* An entry in the input array */
1737 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
1738 	pcre_extra		 extra_data;		/* Used locally for exec options */
1739 	int				*offsets;			/* Array of subpattern offsets */
1740 	int				 size_offsets;		/* Size of the offsets array */
1741 	int				 count = 0;			/* Count of matched subpatterns */
1742 	char			*string_key;
1743 	ulong			 num_key;
1744 	zend_bool		 invert;			/* Whether to return non-matching
1745 										   entries */
1746 	int				 rc;
1747 
1748 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
1749 
1750 	if (extra == NULL) {
1751 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1752 		extra = &extra_data;
1753 	}
1754 	extra->match_limit = PCRE_G(backtrack_limit);
1755 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1756 
1757 	/* Calculate the size of the offsets array, and allocate memory for it. */
1758 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1759 	if (rc < 0) {
1760 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1761 		RETURN_FALSE;
1762 	}
1763 	size_offsets = (size_offsets + 1) * 3;
1764 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1765 
1766 	/* Initialize return array */
1767 	array_init(return_value);
1768 
1769 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1770 
1771 	/* Go through the input array */
1772 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1773 	while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
1774 		zval subject = **entry;
1775 
1776 		if (Z_TYPE_PP(entry) != IS_STRING) {
1777 			zval_copy_ctor(&subject);
1778 			convert_to_string(&subject);
1779 		}
1780 
1781 		/* Perform the match */
1782 		count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
1783 						  Z_STRLEN(subject), 0,
1784 						  0, offsets, size_offsets);
1785 
1786 		/* Check for too many substrings condition. */
1787 		if (count == 0) {
1788 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
1789 			count = size_offsets/3;
1790 		} else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1791 			pcre_handle_exec_error(count TSRMLS_CC);
1792 			break;
1793 		}
1794 
1795 		/* If the entry fits our requirements */
1796 		if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
1797 
1798 			Z_ADDREF_PP(entry);
1799 
1800 			/* Add to return array */
1801 			switch (zend_hash_get_current_key(Z_ARRVAL_P(input), &string_key, &num_key, 0))
1802 			{
1803 				case HASH_KEY_IS_STRING:
1804 					zend_hash_update(Z_ARRVAL_P(return_value), string_key,
1805 									 strlen(string_key)+1, entry, sizeof(zval *), NULL);
1806 					break;
1807 
1808 				case HASH_KEY_IS_LONG:
1809 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
1810 										   sizeof(zval *), NULL);
1811 					break;
1812 			}
1813 		}
1814 
1815 		if (Z_TYPE_PP(entry) != IS_STRING) {
1816 			zval_dtor(&subject);
1817 		}
1818 
1819 		zend_hash_move_forward(Z_ARRVAL_P(input));
1820 	}
1821 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1822 	/* Clean up */
1823 	efree(offsets);
1824 }
1825 /* }}} */
1826 
1827 /* {{{ proto int preg_last_error()
1828    Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)1829 static PHP_FUNCTION(preg_last_error)
1830 {
1831 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
1832 		return;
1833 	}
1834 
1835 	RETURN_LONG(PCRE_G(error_code));
1836 }
1837 /* }}} */
1838 
1839 /* {{{ module definition structures */
1840 
1841 /* {{{ arginfo */
1842 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
1843     ZEND_ARG_INFO(0, pattern)
1844     ZEND_ARG_INFO(0, subject)
1845     ZEND_ARG_INFO(1, subpatterns) /* array */
1846     ZEND_ARG_INFO(0, flags)
1847     ZEND_ARG_INFO(0, offset)
1848 ZEND_END_ARG_INFO()
1849 
1850 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 3)
1851     ZEND_ARG_INFO(0, pattern)
1852     ZEND_ARG_INFO(0, subject)
1853     ZEND_ARG_INFO(1, subpatterns) /* array */
1854     ZEND_ARG_INFO(0, flags)
1855     ZEND_ARG_INFO(0, offset)
1856 ZEND_END_ARG_INFO()
1857 
1858 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
1859     ZEND_ARG_INFO(0, regex)
1860     ZEND_ARG_INFO(0, replace)
1861     ZEND_ARG_INFO(0, subject)
1862     ZEND_ARG_INFO(0, limit)
1863     ZEND_ARG_INFO(1, count)
1864 ZEND_END_ARG_INFO()
1865 
1866 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
1867     ZEND_ARG_INFO(0, regex)
1868     ZEND_ARG_INFO(0, callback)
1869     ZEND_ARG_INFO(0, subject)
1870     ZEND_ARG_INFO(0, limit)
1871     ZEND_ARG_INFO(1, count)
1872 ZEND_END_ARG_INFO()
1873 
1874 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
1875     ZEND_ARG_INFO(0, pattern)
1876     ZEND_ARG_INFO(0, subject)
1877     ZEND_ARG_INFO(0, limit)
1878     ZEND_ARG_INFO(0, flags)
1879 ZEND_END_ARG_INFO()
1880 
1881 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
1882     ZEND_ARG_INFO(0, str)
1883     ZEND_ARG_INFO(0, delim_char)
1884 ZEND_END_ARG_INFO()
1885 
1886 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
1887     ZEND_ARG_INFO(0, regex)
1888     ZEND_ARG_INFO(0, input) /* array */
1889     ZEND_ARG_INFO(0, flags)
1890 ZEND_END_ARG_INFO()
1891 
1892 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
1893 ZEND_END_ARG_INFO()
1894 /* }}} */
1895 
1896 static const zend_function_entry pcre_functions[] = {
1897 	PHP_FE(preg_match,				arginfo_preg_match)
1898 	PHP_FE(preg_match_all,			arginfo_preg_match_all)
1899 	PHP_FE(preg_replace,			arginfo_preg_replace)
1900 	PHP_FE(preg_replace_callback,	arginfo_preg_replace_callback)
1901 	PHP_FE(preg_filter,				arginfo_preg_replace)
1902 	PHP_FE(preg_split,				arginfo_preg_split)
1903 	PHP_FE(preg_quote,				arginfo_preg_quote)
1904 	PHP_FE(preg_grep,				arginfo_preg_grep)
1905 	PHP_FE(preg_last_error,			arginfo_preg_last_error)
1906 	PHP_FE_END
1907 };
1908 
1909 zend_module_entry pcre_module_entry = {
1910 	STANDARD_MODULE_HEADER,
1911    "pcre",
1912 	pcre_functions,
1913 	PHP_MINIT(pcre),
1914 	PHP_MSHUTDOWN(pcre),
1915 	NULL,
1916 	NULL,
1917 	PHP_MINFO(pcre),
1918 	NO_VERSION_YET,
1919 	PHP_MODULE_GLOBALS(pcre),
1920 	PHP_GINIT(pcre),
1921 	PHP_GSHUTDOWN(pcre),
1922 	NULL,
1923 	STANDARD_MODULE_PROPERTIES_EX
1924 };
1925 
/* Emitted only when COMPILE_DL_PCRE is defined (presumably the
 * dynamically loadable build of the extension; confirm in the build config). */
#if defined(COMPILE_DL_PCRE)
ZEND_GET_MODULE(pcre)
#endif
1929 
1930 /* }}} */
1931 
1932 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
1933 
1934 /*
1935  * Local variables:
1936  * tab-width: 4
1937  * c-basic-offset: 4
1938  * End:
1939  * vim600: sw=4 ts=4 fdm=marker
1940  * vim<600: sw=4 ts=4
1941  */
1942