xref: /PHP-5.4/ext/pcre/php_pcre.c (revision 03964892)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2014 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Andrei Zmievski <andrei@php.net>                             |
16    +----------------------------------------------------------------------+
17  */
18 
19 /* $Id$ */
20 
21 #include "php.h"
22 #include "php_ini.h"
23 #include "php_globals.h"
24 #include "php_pcre.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/php_smart_str.h"
27 
28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
29 
30 #include "ext/standard/php_string.h"
31 
/* Result-ordering modes for global matching. They live in the low byte of the
   flags argument (the matcher extracts them with `flags & 0xff`). */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
/* Flag bit: report every captured string together with its offset. */
#define PREG_OFFSET_CAPTURE			(1<<8)

/* Flag bits exported to userland as the PREG_SPLIT_* constants. */
#define	PREG_SPLIT_NO_EMPTY			(1<<0)
#define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)

/* Internal option stored in a cache entry's preg_options when the pattern
   carries the 'e' modifier. */
#define PREG_REPLACE_EVAL			(1<<0)

/* Flag bit exported to userland as PREG_GREP_INVERT. */
#define PREG_GREP_INVERT			(1<<0)

/* Number of cached compiled patterns at which one eighth of the cache is
   evicted before a new entry is stored. */
#define PCRE_CACHE_SIZE 4096
45 
/* Error codes kept in PCRE_G(error_code). The values are registered for
   userland as the PREG_*_ERROR constants at module startup. */
enum {
	PHP_PCRE_NO_ERROR = 0,
	PHP_PCRE_INTERNAL_ERROR,
	PHP_PCRE_BACKTRACK_LIMIT_ERROR,
	PHP_PCRE_RECURSION_LIMIT_ERROR,
	PHP_PCRE_BAD_UTF8_ERROR,
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR
};
54 
55 
/* Declares the extension's globals (pattern cache, limits, last error code)
   that the rest of this file reaches through PCRE_G(). */
ZEND_DECLARE_MODULE_GLOBALS(pcre)
57 
58 
59 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
60 {
61 	int preg_code = 0;
62 
63 	switch (pcre_code) {
64 		case PCRE_ERROR_MATCHLIMIT:
65 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
66 			break;
67 
68 		case PCRE_ERROR_RECURSIONLIMIT:
69 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
70 			break;
71 
72 		case PCRE_ERROR_BADUTF8:
73 			preg_code = PHP_PCRE_BAD_UTF8_ERROR;
74 			break;
75 
76 		case PCRE_ERROR_BADUTF8_OFFSET:
77 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
78 			break;
79 
80 		default:
81 			preg_code = PHP_PCRE_INTERNAL_ERROR;
82 			break;
83 	}
84 
85 	PCRE_G(error_code) = preg_code;
86 }
87 /* }}} */
88 
php_free_pcre_cache(void * data)89 static void php_free_pcre_cache(void *data) /* {{{ */
90 {
91 	pcre_cache_entry *pce = (pcre_cache_entry *) data;
92 	if (!pce) return;
93 	pefree(pce->re, 1);
94 	if (pce->extra) pefree(pce->extra, 1);
95 #if HAVE_SETLOCALE
96 	if ((void*)pce->tables) pefree((void*)pce->tables, 1);
97 	pefree(pce->locale, 1);
98 #endif
99 }
100 /* }}} */
101 
PHP_GINIT_FUNCTION(pcre)102 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
103 {
104 	zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
105 	pcre_globals->backtrack_limit = 0;
106 	pcre_globals->recursion_limit = 0;
107 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
108 }
109 /* }}} */
110 
/* Globals destructor: tears down the pattern cache. Each entry is released by
   the destructor that was registered when the table was initialised. */
static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
{
	zend_hash_destroy(&pcre_globals->pcre_cache);
}
/* }}} */
116 
/* INI settings. Both are changeable everywhere (PHP_INI_ALL) and are stored
   as longs in the module globals; the matcher passes them to PCRE as the
   match limit and the recursion limit. */
PHP_INI_BEGIN()
	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
PHP_INI_END()
121 
122 
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Emits the extension's info table (support status and the version string
 * reported by pcre_version()), followed by the INI entries.
 */
static PHP_MINFO_FUNCTION(pcre)
{
	php_info_print_table_start();
	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
	php_info_print_table_row(2, "PCRE Library Version", pcre_version() );
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
134 
/* {{{ PHP_MINIT_FUNCTION(pcre)
 * Module startup: registers the INI entries and the userland constants
 * (match/split/grep flags, error codes, and the PCRE library version).
 * Always returns SUCCESS.
 */
static PHP_MINIT_FUNCTION(pcre)
{
	REGISTER_INI_ENTRIES();

	/* Flags accepted by the matching, splitting and grep functions. */
	REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);

	/* Error codes, mirroring the values stored in PCRE_G(error_code). */
	REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
	/* Version string of the PCRE library as reported by pcre_version(). */
	REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);

	return SUCCESS;
}
/* }}} */
159 
/* {{{ PHP_MSHUTDOWN_FUNCTION(pcre)
 * Module shutdown: unregisters the INI entries registered at startup.
 * Always returns SUCCESS.
 */
static PHP_MSHUTDOWN_FUNCTION(pcre)
{
	UNREGISTER_INI_ENTRIES();

	return SUCCESS;
}
/* }}} */
168 
169 /* {{{ static pcre_clean_cache */
pcre_clean_cache(void * data,void * arg TSRMLS_DC)170 static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
171 {
172 	int *num_clean = (int *)arg;
173 
174 	if (*num_clean > 0) {
175 		(*num_clean)--;
176 		return 1;
177 	} else {
178 		return 0;
179 	}
180 }
181 /* }}} */
182 
183 /* {{{ static make_subpats_table */
make_subpats_table(int num_subpats,pcre_cache_entry * pce TSRMLS_DC)184 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
185 {
186 	pcre_extra *extra = pce->extra;
187 	int name_cnt = 0, name_size, ni = 0;
188 	int rc;
189 	char *name_table;
190 	unsigned short name_idx;
191 	char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
192 
193 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
194 	if (rc < 0) {
195 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
196 		efree(subpat_names);
197 		return NULL;
198 	}
199 	if (name_cnt > 0) {
200 		int rc1, rc2;
201 
202 		rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
203 		rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
204 		rc = rc2 ? rc2 : rc1;
205 		if (rc < 0) {
206 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
207 			efree(subpat_names);
208 			return NULL;
209 		}
210 
211 		while (ni++ < name_cnt) {
212 			name_idx = 0xff * (unsigned char)name_table[0] + (unsigned char)name_table[1];
213 			subpat_names[name_idx] = name_table + 2;
214 			if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
215 				php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
216 				efree(subpat_names);
217 				return NULL;
218 			}
219 			name_table += name_size;
220 		}
221 	}
222 
223 	return subpat_names;
224 }
225 /* }}} */
226 
/* {{{ pcre_get_compiled_regex_cache
 * Return the cache entry for a delimited PHP regex such as "/foo/i", compiling
 * and caching it when it is not cached yet (or was cached under another
 * locale).
 *
 * regex      full pattern text including delimiters and modifiers
 * regex_len  length of regex; used to tell an embedded NUL byte from the end
 *
 * Returns a pointer to the entry stored in PCRE_G(pcre_cache), or NULL after
 * emitting an E_WARNING when the pattern is empty, contains a NUL byte, has a
 * bad or missing delimiter, uses an unknown modifier, or fails to compile.
 */
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
{
	pcre				*re = NULL;
	pcre_extra			*extra;
	int					 coptions = 0;
	int					 soptions = 0;
	const char			*error;
	int					 erroffset;
	char				 delimiter;
	char				 start_delimiter;
	char				 end_delimiter;
	char				*p, *pp;
	char				*pattern;
	int					 do_study = 0;
	int					 poptions = 0;
	int				count = 0;
	unsigned const char *tables = NULL;
#if HAVE_SETLOCALE
	char				*locale;
#endif
	pcre_cache_entry	*pce;
	pcre_cache_entry	 new_entry;
	char                *tmp = NULL;

#if HAVE_SETLOCALE
# if defined(PHP_WIN32) && defined(ZTS)
	_configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
# endif
	/* Query (not change) the current LC_CTYPE locale name. */
	locale = setlocale(LC_CTYPE, NULL);
#endif

	/* Try to lookup the cached regex entry, and if successful, just pass
	   back the compiled pattern, otherwise go on and compile it. */
	if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
		/*
		 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
		 * is, we flush it and compile the pattern from scratch.
		 */
		if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
			zend_hash_clean(&PCRE_G(pcre_cache));
		} else {
#if HAVE_SETLOCALE
			/* A hit only counts if it was compiled under the current locale;
			   otherwise fall through and recompile. */
			if (!strcmp(pce->locale, locale)) {
#endif
				return pce;
#if HAVE_SETLOCALE
			}
#endif
		}
	}

	p = regex;

	/* Parse through the leading whitespace, and display a warning if we
	   get to the end without encountering a delimiter. */
	while (isspace((int)*(unsigned char *)p)) p++;
	if (*p == 0) {
		/* A NUL before regex_len bytes were consumed is an embedded NUL. */
		php_error_docref(NULL TSRMLS_CC, E_WARNING,
						 p < regex + regex_len ? "Null byte in regex" : "Empty regular expression");
		return NULL;
	}

	/* Get the delimiter and display a warning if it is alphanumeric
	   or a backslash. */
	delimiter = *p++;
	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
		return NULL;
	}

	start_delimiter = delimiter;
	/* In the lookup string every bracket character is followed, five
	   positions later, by the closing bracket to use: an opening bracket maps
	   to its closer and a closing bracket maps to itself. */
	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
		delimiter = pp[5];
	end_delimiter = delimiter;

	pp = p;

	if (start_delimiter == end_delimiter) {
		/* We need to iterate through the pattern, searching for the ending delimiter,
		   but skipping the backslashed delimiters.  If the ending delimiter is not
		   found, display a warning. */
		while (*pp != 0) {
			if (*pp == '\\' && pp[1] != 0) pp++;
			else if (*pp == delimiter)
				break;
			pp++;
		}
	} else {
		/* We iterate through the pattern, searching for the matching ending
		 * delimiter. For each matching starting delimiter, we increment nesting
		 * level, and decrement it for each matching ending delimiter. If we
		 * reach the end of the pattern without matching, display a warning.
		 */
		int brackets = 1; 	/* brackets nesting level */
		while (*pp != 0) {
			if (*pp == '\\' && pp[1] != 0) pp++;
			else if (*pp == end_delimiter && --brackets <= 0)
				break;
			else if (*pp == start_delimiter)
				brackets++;
			pp++;
		}
	}

	/* The scan stopped on a NUL: either an embedded NUL byte or the end of
	   the string without a closing delimiter. */
	if (*pp == 0) {
		if (pp < regex + regex_len) {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
		} else if (start_delimiter == end_delimiter) {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
		} else {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
		}
		return NULL;
	}

	/* Make a copy of the actual pattern. */
	pattern = estrndup(p, pp-p);

	/* Move on to the options */
	pp++;

	/* Parse through the options, setting appropriate flags.  Display
	   a warning if we encounter an unknown modifier. */
	while (pp < regex + regex_len) {
		switch (*pp++) {
			/* Perl compatible options */
			case 'i':	coptions |= PCRE_CASELESS;		break;
			case 'm':	coptions |= PCRE_MULTILINE;		break;
			case 's':	coptions |= PCRE_DOTALL;		break;
			case 'x':	coptions |= PCRE_EXTENDED;		break;

			/* PCRE specific options */
			case 'A':	coptions |= PCRE_ANCHORED;		break;
			case 'D':	coptions |= PCRE_DOLLAR_ENDONLY;break;
			case 'S':	do_study  = 1;					break;
			case 'U':	coptions |= PCRE_UNGREEDY;		break;
			case 'X':	coptions |= PCRE_EXTRA;			break;
			case 'u':	coptions |= PCRE_UTF8;
	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
       characters, even in UTF-8 mode. However, this can be changed by setting
       the PCRE_UCP option. */
#ifdef PCRE_UCP
						coptions |= PCRE_UCP;
#endif
				break;

			/* Custom preg options */
			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;

			/* Spaces and newlines among the modifiers are ignored. */
			case ' ':
			case '\n':
				break;

			default:
				if (pp[-1]) {
					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
				} else {
					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
				}
				efree(pattern);
				return NULL;
		}
	}

#if HAVE_SETLOCALE
	/* Only build locale-specific character tables outside the "C" locale. */
	if (strcmp(locale, "C"))
		tables = pcre_maketables();
#endif

	/* Compile pattern and display a warning if compilation failed. */
	re = pcre_compile(pattern,
					  coptions,
					  &error,
					  &erroffset,
					  tables);

	if (re == NULL) {
		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
		efree(pattern);
		if (tables) {
			pefree((void*)tables, 1);
		}
		return NULL;
	}

	/* If study option was specified, study the pattern and
	   store the result in extra for passing to pcre_exec. */
	if (do_study) {
		extra = pcre_study(re, soptions, &error);
		if (extra) {
			/* Mark the limit fields as in use; the matcher fills them in
			   before each pcre_exec() call. */
			extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
		}
		/* A study failure is only warned about; the pattern is still cached. */
		if (error != NULL) {
			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
		}
	} else {
		extra = NULL;
	}

	efree(pattern);

	/*
	 * If we reached cache limit, clean out the items from the head of the list;
	 * these are supposedly the oldest ones (but not necessarily the least used
	 * ones).
	 */
	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
		int num_clean = PCRE_CACHE_SIZE / 8;
		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
	}

	/* Store the compiled pattern and extra info in the cache. */
	new_entry.re = re;
	new_entry.extra = extra;
	new_entry.preg_options = poptions;
	new_entry.compile_options = coptions;
#if HAVE_SETLOCALE
	new_entry.locale = pestrdup(locale, 1);
	new_entry.tables = tables;
#endif

	/*
	 * Interned strings are not duplicated when stored in HashTable,
	 * but all the interned strings created during HTTP request are removed
	 * at end of request. However PCRE_G(pcre_cache) must be consistent
	 * on the next request as well. So we disable usage of interned strings
	 * as hash keys especially for this table.
	 * See bug #63180
	 */
	if (IS_INTERNED(regex)) {
		regex = tmp = estrndup(regex, regex_len);
	}

	/* The table stores its own copy of new_entry; pce is pointed at that
	   stored copy, which is what gets returned. */
	zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
						sizeof(pcre_cache_entry), (void**)&pce);

	if (tmp) {
		efree(tmp);
	}

	return pce;
}
/* }}} */
472 
473 /* {{{ pcre_get_compiled_regex
474  */
pcre_get_compiled_regex(char * regex,pcre_extra ** extra,int * preg_options TSRMLS_DC)475 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
476 {
477 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
478 
479 	if (extra) {
480 		*extra = pce ? pce->extra : NULL;
481 	}
482 	if (preg_options) {
483 		*preg_options = pce ? pce->preg_options : 0;
484 	}
485 
486 	return pce ? pce->re : NULL;
487 }
488 /* }}} */
489 
490 /* {{{ pcre_get_compiled_regex_ex
491  */
pcre_get_compiled_regex_ex(char * regex,pcre_extra ** extra,int * preg_options,int * compile_options TSRMLS_DC)492 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
493 {
494 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
495 
496 	if (extra) {
497 		*extra = pce ? pce->extra : NULL;
498 	}
499 	if (preg_options) {
500 		*preg_options = pce ? pce->preg_options : 0;
501 	}
502 	if (compile_options) {
503 		*compile_options = pce ? pce->compile_options : 0;
504 	}
505 
506 	return pce ? pce->re : NULL;
507 }
508 /* }}} */
509 
510 /* {{{ add_offset_pair */
add_offset_pair(zval * result,char * str,int len,int offset,char * name)511 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
512 {
513 	zval *match_pair;
514 
515 	ALLOC_ZVAL(match_pair);
516 	array_init(match_pair);
517 	INIT_PZVAL(match_pair);
518 
519 	/* Add (match, offset) to the return value */
520 	add_next_index_stringl(match_pair, str, len, 1);
521 	add_next_index_long(match_pair, offset);
522 
523 	if (name) {
524 		zval_add_ref(&match_pair);
525 		zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
526 	}
527 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
528 }
529 /* }}} */
530 
/* Shared implementation of preg_match() and preg_match_all(): parses the
 * userland arguments, fetches (or compiles) the pattern and hands over to
 * php_pcre_match_impl(). `global` selects match-all behaviour. Returns FALSE
 * to userland when argument parsing or pattern compilation fails.
 */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	char			 *pattern_str;			/* Regular expression text */
	int				  pattern_len;
	char			 *haystack;				/* String to match against */
	int				  haystack_len;
	zval			 *matches = NULL;		/* Array receiving the subpatterns */
	long			  match_flags = 0;		/* Match control flags */
	long			  offset = 0;			/* Where the search starts */
	pcre_cache_entry *compiled;				/* Compiled regular expression */

	if (FAILURE == zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|zll", &pattern_str, &pattern_len,
										 &haystack, &haystack_len, &matches, &match_flags, &offset)) {
		RETURN_FALSE;
	}

	/* Take the pattern from the cache, compiling it on a miss. */
	compiled = pcre_get_compiled_regex_cache(pattern_str, pattern_len TSRMLS_CC);
	if (compiled == NULL) {
		RETURN_FALSE;
	}

	/* Flags are only honoured when the caller actually passed them. */
	php_pcre_match_impl(compiled, haystack, haystack_len, return_value, matches,
		global, ZEND_NUM_ARGS() >= 4, match_flags, offset TSRMLS_CC);
}
/* }}} */
557 
558 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,zval * subpats,int global,int use_flags,long flags,long start_offset TSRMLS_DC)559 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
560 	zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
561 {
562 	zval			*result_set,		/* Holds a set of subpatterns after
563 										   a global match */
564 				   **match_sets = NULL;	/* An array of sets of matches for each
565 										   subpattern after a global match */
566 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
567 	pcre_extra		 extra_data;		/* Used locally for exec options */
568 	int				 exoptions = 0;		/* Execution options */
569 	int				 count = 0;			/* Count of matched subpatterns */
570 	int				*offsets;			/* Array of subpattern offsets */
571 	int				 num_subpats;		/* Number of captured subpatterns */
572 	int				 size_offsets;		/* Size of the offsets array */
573 	int				 matched;			/* Has anything matched */
574 	int				 g_notempty = 0;	/* If the match should not be empty */
575 	const char	   **stringlist;		/* Holds list of subpatterns */
576 	char 		   **subpat_names;		/* Array for named subpatterns */
577 	int				 i, rc;
578 	int				 subpats_order;		/* Order of subpattern matches */
579 	int				 offset_capture;    /* Capture match offsets: yes/no */
580 
581 	/* Overwrite the passed-in value for subpatterns with an empty array. */
582 	if (subpats != NULL) {
583 		zval_dtor(subpats);
584 		array_init(subpats);
585 	}
586 
587 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
588 
589 	if (use_flags) {
590 		offset_capture = flags & PREG_OFFSET_CAPTURE;
591 
592 		/*
593 		 * subpats_order is pre-set to pattern mode so we change it only if
594 		 * necessary.
595 		 */
596 		if (flags & 0xff) {
597 			subpats_order = flags & 0xff;
598 		}
599 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
600 			(!global && subpats_order != 0)) {
601 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
602 			return;
603 		}
604 	} else {
605 		offset_capture = 0;
606 	}
607 
608 	/* Negative offset counts from the end of the string. */
609 	if (start_offset < 0) {
610 		start_offset = subject_len + start_offset;
611 		if (start_offset < 0) {
612 			start_offset = 0;
613 		}
614 	}
615 
616 	if (extra == NULL) {
617 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
618 		extra = &extra_data;
619 	}
620 	extra->match_limit = PCRE_G(backtrack_limit);
621 	extra->match_limit_recursion = PCRE_G(recursion_limit);
622 
623 	/* Calculate the size of the offsets array, and allocate memory for it. */
624 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
625 	if (rc < 0) {
626 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
627 		RETURN_FALSE;
628 	}
629 	num_subpats++;
630 	size_offsets = num_subpats * 3;
631 
632 	/*
633 	 * Build a mapping from subpattern numbers to their names. We will always
634 	 * allocate the table, even though there may be no named subpatterns. This
635 	 * avoids somewhat more complicated logic in the inner loops.
636 	 */
637 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
638 	if (!subpat_names) {
639 		RETURN_FALSE;
640 	}
641 
642 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
643 	memset(offsets, 0, size_offsets*sizeof(int));
644 	/* Allocate match sets array and initialize the values. */
645 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
646 		match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
647 		for (i=0; i<num_subpats; i++) {
648 			ALLOC_ZVAL(match_sets[i]);
649 			array_init(match_sets[i]);
650 			INIT_PZVAL(match_sets[i]);
651 		}
652 	}
653 
654 	matched = 0;
655 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
656 
657 	do {
658 		/* Execute the regular expression. */
659 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
660 						  exoptions|g_notempty, offsets, size_offsets);
661 
662 		/* the string was already proved to be valid UTF-8 */
663 		exoptions |= PCRE_NO_UTF8_CHECK;
664 
665 		/* Check for too many substrings condition. */
666 		if (count == 0) {
667 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
668 			count = size_offsets/3;
669 		}
670 
671 		/* If something has matched */
672 		if (count > 0) {
673 			matched++;
674 
675 			/* If subpatterns array has been passed, fill it in with values. */
676 			if (subpats != NULL) {
677 				/* Try to get the list of substrings and display a warning if failed. */
678 				if ((offsets[1] - offsets[0] < 0) || pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
679 					efree(subpat_names);
680 					efree(offsets);
681 					if (match_sets) efree(match_sets);
682 					php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
683 					RETURN_FALSE;
684 				}
685 
686 				if (global) {	/* global pattern matching */
687 					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
688 						/* For each subpattern, insert it into the appropriate array. */
689 						for (i = 0; i < count; i++) {
690 							if (offset_capture) {
691 								add_offset_pair(match_sets[i], (char *)stringlist[i],
692 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
693 							} else {
694 								add_next_index_stringl(match_sets[i], (char *)stringlist[i],
695 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
696 							}
697 						}
698 						/*
699 						 * If the number of captured subpatterns on this run is
700 						 * less than the total possible number, pad the result
701 						 * arrays with empty strings.
702 						 */
703 						if (count < num_subpats) {
704 							for (; i < num_subpats; i++) {
705 								add_next_index_string(match_sets[i], "", 1);
706 							}
707 						}
708 					} else {
709 						/* Allocate the result set array */
710 						ALLOC_ZVAL(result_set);
711 						array_init(result_set);
712 						INIT_PZVAL(result_set);
713 
714 						/* Add all the subpatterns to it */
715 						for (i = 0; i < count; i++) {
716 							if (offset_capture) {
717 								add_offset_pair(result_set, (char *)stringlist[i],
718 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
719 							} else {
720 								if (subpat_names[i]) {
721 									add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
722 														   offsets[(i<<1)+1] - offsets[i<<1], 1);
723 								}
724 								add_next_index_stringl(result_set, (char *)stringlist[i],
725 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
726 							}
727 						}
728 						/* And add it to the output array */
729 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
730 					}
731 				} else {			/* single pattern matching */
732 					/* For each subpattern, insert it into the subpatterns array. */
733 					for (i = 0; i < count; i++) {
734 						if (offset_capture) {
735 							add_offset_pair(subpats, (char *)stringlist[i],
736 											offsets[(i<<1)+1] - offsets[i<<1],
737 											offsets[i<<1], subpat_names[i]);
738 						} else {
739 							if (subpat_names[i]) {
740 								add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
741 												  offsets[(i<<1)+1] - offsets[i<<1], 1);
742 							}
743 							add_next_index_stringl(subpats, (char *)stringlist[i],
744 												   offsets[(i<<1)+1] - offsets[i<<1], 1);
745 						}
746 					}
747 				}
748 
749 				pcre_free((void *) stringlist);
750 			}
751 		} else if (count == PCRE_ERROR_NOMATCH) {
752 			/* If we previously set PCRE_NOTEMPTY after a null match,
753 			   this is not necessarily the end. We need to advance
754 			   the start offset, and continue. Fudge the offset values
755 			   to achieve this, unless we're already at the end of the string. */
756 			if (g_notempty != 0 && start_offset < subject_len) {
757 				offsets[0] = start_offset;
758 				offsets[1] = start_offset + 1;
759 			} else
760 				break;
761 		} else {
762 			pcre_handle_exec_error(count TSRMLS_CC);
763 			break;
764 		}
765 
766 		/* If we have matched an empty string, mimic what Perl's /g options does.
767 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
768 		   the match again at the same point. If this fails (picked up above) we
769 		   advance to the next character. */
770 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
771 
772 		/* Advance to the position right after the last full match */
773 		start_offset = offsets[1];
774 	} while (global);
775 
776 	/* Add the match sets to the output array and clean up */
777 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
778 		for (i = 0; i < num_subpats; i++) {
779 			if (subpat_names[i]) {
780 				zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
781 								 strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
782 				Z_ADDREF_P(match_sets[i]);
783 			}
784 			zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
785 		}
786 		efree(match_sets);
787 	}
788 
789 	efree(offsets);
790 	efree(subpat_names);
791 
792 	/* Did we encounter an error? */
793 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
794 		RETVAL_LONG(matched);
795 	} else {
796 		RETVAL_FALSE;
797 	}
798 }
799 /* }}} */
800 
/* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style regular expression match.
   Delegates to php_do_pcre_match() in non-global mode (global = 0). */
static PHP_FUNCTION(preg_match)
{
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
/* }}} */
808 
/* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style global regular expression match.
   Delegates to php_do_pcre_match() in global mode (global = 1). */
static PHP_FUNCTION(preg_match_all)
{
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
/* }}} */
816 
/* {{{ preg_get_backref
 * Parse a back-reference at *str. *str must point at the introducing
 * character; the accepted forms are <intro>N, <intro>NN and, when the
 * introducer is '$', ${N} / ${NN}.
 *
 * On success stores the one- or two-digit group number in *backref, advances
 * *str past the reference and returns 1. On failure returns 0 and leaves *str
 * unchanged (*backref may already have been written).
 */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int braced;

	/* Nothing follows the introducer: cannot be a back-reference. */
	if (cursor[1] == '\0') {
		return 0;
	}

	/* Only the "${" form is brace-delimited. */
	braced = (cursor[0] == '$' && cursor[1] == '{');
	cursor += braced ? 2 : 1;

	/* First digit is mandatory. */
	if (*cursor < '0' || *cursor > '9') {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* Optional second digit. */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
/* }}} */
855 
856 /* {{{ preg_do_repl_func
857  */
preg_do_repl_func(zval * function,char * subject,int * offsets,char ** subpat_names,int count,char ** result TSRMLS_DC)858 static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, char **result TSRMLS_DC)
859 {
860 	zval		*retval_ptr;		/* Function return value */
861 	zval	   **args[1];			/* Argument to pass to function */
862 	zval		*subpats;			/* Captured subpatterns */
863 	int			 result_len;		/* Return value length */
864 	int			 i;
865 
866 	MAKE_STD_ZVAL(subpats);
867 	array_init(subpats);
868 	for (i = 0; i < count; i++) {
869 		if (subpat_names[i]) {
870 			add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
871 		}
872 		add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
873 	}
874 	args[0] = &subpats;
875 
876 	if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
877 		convert_to_string_ex(&retval_ptr);
878 		*result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
879 		result_len = Z_STRLEN_P(retval_ptr);
880 		zval_ptr_dtor(&retval_ptr);
881 	} else {
882 		if (!EG(exception)) {
883 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
884 		}
885 		result_len = offsets[1] - offsets[0];
886 		*result = estrndup(&subject[offsets[0]], result_len);
887 	}
888 
889 	zval_ptr_dtor(&subpats);
890 
891 	return result_len;
892 }
893 /* }}} */
894 
895 /* {{{ preg_do_eval
896  */
preg_do_eval(char * eval_str,int eval_str_len,char * subject,int * offsets,int count,char ** result TSRMLS_DC)897 static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
898 						int *offsets, int count, char **result TSRMLS_DC)
899 {
900 	zval		 retval;			/* Return value from evaluation */
901 	char		*eval_str_end,		/* End of eval string */
902 				*match,				/* Current match for a backref */
903 				*esc_match,			/* Quote-escaped match */
904 				*walk,				/* Used to walk the code string */
905 				*segment,			/* Start of segment to append while walking */
906 				 walk_last;			/* Last walked character */
907 	int			 match_len;			/* Length of the match */
908 	int			 esc_match_len;		/* Length of the quote-escaped match */
909 	int			 result_len;		/* Length of the result of the evaluation */
910 	int			 backref;			/* Current backref */
911 	char        *compiled_string_description;
912 	smart_str    code = {0};
913 
914 	eval_str_end = eval_str + eval_str_len;
915 	walk = segment = eval_str;
916 	walk_last = 0;
917 
918 	while (walk < eval_str_end) {
919 		/* If found a backreference.. */
920 		if ('\\' == *walk || '$' == *walk) {
921 			smart_str_appendl(&code, segment, walk - segment);
922 			if (walk_last == '\\') {
923 				code.c[code.len-1] = *walk++;
924 				segment = walk;
925 				walk_last = 0;
926 				continue;
927 			}
928 			segment = walk;
929 			if (preg_get_backref(&walk, &backref)) {
930 				if (backref < count) {
931 					/* Find the corresponding string match and substitute it
932 					   in instead of the backref */
933 					match = subject + offsets[backref<<1];
934 					match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
935 					if (match_len) {
936 						esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
937 					} else {
938 						esc_match = match;
939 						esc_match_len = 0;
940 					}
941 				} else {
942 					esc_match = "";
943 					esc_match_len = 0;
944 				}
945 				smart_str_appendl(&code, esc_match, esc_match_len);
946 
947 				segment = walk;
948 
949 				/* Clean up and reassign */
950 				if (esc_match_len)
951 					efree(esc_match);
952 				continue;
953 			}
954 		}
955 		walk++;
956 		walk_last = walk[-1];
957 	}
958 	smart_str_appendl(&code, segment, walk - segment);
959 	smart_str_0(&code);
960 
961 	compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
962 	/* Run the code */
963 	if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
964 		efree(compiled_string_description);
965 		php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
966 		/* zend_error() does not return in this case */
967 	}
968 	efree(compiled_string_description);
969 	convert_to_string(&retval);
970 
971 	/* Save the return value and its length */
972 	*result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
973 	result_len = Z_STRLEN(retval);
974 
975 	/* Clean up */
976 	zval_dtor(&retval);
977 	smart_str_free(&code);
978 
979 	return result_len;
980 }
981 /* }}} */
982 
983 /* {{{ php_pcre_replace
984  */
php_pcre_replace(char * regex,int regex_len,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)985 PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
986 							  char *subject, int subject_len,
987 							  zval *replace_val, int is_callable_replace,
988 							  int *result_len, int limit, int *replace_count TSRMLS_DC)
989 {
990 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
991 
992 	/* Compile regex or get it from cache. */
993 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
994 		return NULL;
995 	}
996 
997 	return php_pcre_replace_impl(pce, subject, subject_len, replace_val,
998 		is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
999 }
1000 /* }}} */
1001 
1002 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1003 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val,
1004 	int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
1005 {
1006 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
1007 	pcre_extra		 extra_data;		/* Used locally for exec options */
1008 	int				 exoptions = 0;		/* Execution options */
1009 	int				 count = 0;			/* Count of matched subpatterns */
1010 	int				*offsets;			/* Array of subpattern offsets */
1011 	char 			**subpat_names;		/* Array for named subpatterns */
1012 	int				 num_subpats;		/* Number of captured subpatterns */
1013 	int				 size_offsets;		/* Size of the offsets array */
1014 	int				 new_len;			/* Length of needed storage */
1015 	int				 alloc_len;			/* Actual allocated length */
1016 	int				 eval_result_len=0;	/* Length of the eval'ed or
1017 										   function-returned string */
1018 	int				 match_len;			/* Length of the current match */
1019 	int				 backref;			/* Backreference number */
1020 	int				 eval;				/* If the replacement string should be eval'ed */
1021 	int				 start_offset;		/* Where the new search starts */
1022 	int				 g_notempty=0;		/* If the match should not be empty */
1023 	int				 replace_len=0;		/* Length of replacement string */
1024 	char			*result,			/* Result of replacement */
1025 					*replace=NULL,		/* Replacement string */
1026 					*new_buf,			/* Temporary buffer for re-allocation */
1027 					*walkbuf,			/* Location of current replacement in the result */
1028 					*walk,				/* Used to walk the replacement string */
1029 					*match,				/* The current match */
1030 					*piece,				/* The current piece of subject */
1031 					*replace_end=NULL,	/* End of replacement string */
1032 					*eval_result,		/* Result of eval or custom function */
1033 					 walk_last;			/* Last walked character */
1034 	int				 rc;
1035 
1036 	if (extra == NULL) {
1037 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1038 		extra = &extra_data;
1039 	}
1040 	extra->match_limit = PCRE_G(backtrack_limit);
1041 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1042 
1043 	eval = pce->preg_options & PREG_REPLACE_EVAL;
1044 	if (is_callable_replace) {
1045 		if (eval) {
1046 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
1047 			return NULL;
1048 		}
1049 	} else {
1050 		replace = Z_STRVAL_P(replace_val);
1051 		replace_len = Z_STRLEN_P(replace_val);
1052 		replace_end = replace + replace_len;
1053 	}
1054 
1055 	/* Calculate the size of the offsets array, and allocate memory for it. */
1056 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
1057 	if (rc < 0) {
1058 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1059 		return NULL;
1060 	}
1061 	num_subpats++;
1062 	size_offsets = num_subpats * 3;
1063 
1064 	/*
1065 	 * Build a mapping from subpattern numbers to their names. We will always
1066 	 * allocate the table, even though there may be no named subpatterns. This
1067 	 * avoids somewhat more complicated logic in the inner loops.
1068 	 */
1069 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
1070 	if (!subpat_names) {
1071 		return NULL;
1072 	}
1073 
1074 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1075 
1076 	alloc_len = 2 * subject_len + 1;
1077 	result = safe_emalloc(alloc_len, sizeof(char), 0);
1078 
1079 	/* Initialize */
1080 	match = NULL;
1081 	*result_len = 0;
1082 	start_offset = 0;
1083 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1084 
1085 	while (1) {
1086 		/* Execute the regular expression. */
1087 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1088 						  exoptions|g_notempty, offsets, size_offsets);
1089 
1090 		/* the string was already proved to be valid UTF-8 */
1091 		exoptions |= PCRE_NO_UTF8_CHECK;
1092 
1093 		/* Check for too many substrings condition. */
1094 		if (count == 0) {
1095 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1096 			count = size_offsets/3;
1097 		}
1098 
1099 		piece = subject + start_offset;
1100 
1101 		if (count > 0 && (offsets[1] - offsets[0] >= 0) && (limit == -1 || limit > 0)) {
1102 			if (replace_count) {
1103 				++*replace_count;
1104 			}
1105 			/* Set the match location in subject */
1106 			match = subject + offsets[0];
1107 
1108 			new_len = *result_len + offsets[0] - start_offset; /* part before the match */
1109 
1110 			/* If evaluating, do it and add the return string's length */
1111 			if (eval) {
1112 				eval_result_len = preg_do_eval(replace, replace_len, subject,
1113 											   offsets, count, &eval_result TSRMLS_CC);
1114 				new_len += eval_result_len;
1115 			} else if (is_callable_replace) {
1116 				/* Use custom function to get replacement string and its length. */
1117 				eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, &eval_result TSRMLS_CC);
1118 				new_len += eval_result_len;
1119 			} else { /* do regular substitution */
1120 				walk = replace;
1121 				walk_last = 0;
1122 				while (walk < replace_end) {
1123 					if ('\\' == *walk || '$' == *walk) {
1124 						if (walk_last == '\\') {
1125 							walk++;
1126 							walk_last = 0;
1127 							continue;
1128 						}
1129 						if (preg_get_backref(&walk, &backref)) {
1130 							if (backref < count)
1131 								new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1132 							continue;
1133 						}
1134 					}
1135 					new_len++;
1136 					walk++;
1137 					walk_last = walk[-1];
1138 				}
1139 			}
1140 
1141 			if (new_len + 1 > alloc_len) {
1142 				alloc_len = 1 + alloc_len + 2 * new_len;
1143 				new_buf = emalloc(alloc_len);
1144 				memcpy(new_buf, result, *result_len);
1145 				efree(result);
1146 				result = new_buf;
1147 			}
1148 			/* copy the part of the string before the match */
1149 			memcpy(&result[*result_len], piece, match-piece);
1150 			*result_len += match-piece;
1151 
1152 			/* copy replacement and backrefs */
1153 			walkbuf = result + *result_len;
1154 
1155 			/* If evaluating or using custom function, copy result to the buffer
1156 			 * and clean up. */
1157 			if (eval || is_callable_replace) {
1158 				memcpy(walkbuf, eval_result, eval_result_len);
1159 				*result_len += eval_result_len;
1160 				STR_FREE(eval_result);
1161 			} else { /* do regular backreference copying */
1162 				walk = replace;
1163 				walk_last = 0;
1164 				while (walk < replace_end) {
1165 					if ('\\' == *walk || '$' == *walk) {
1166 						if (walk_last == '\\') {
1167 							*(walkbuf-1) = *walk++;
1168 							walk_last = 0;
1169 							continue;
1170 						}
1171 						if (preg_get_backref(&walk, &backref)) {
1172 							if (backref < count) {
1173 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1174 								memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1175 								walkbuf += match_len;
1176 							}
1177 							continue;
1178 						}
1179 					}
1180 					*walkbuf++ = *walk++;
1181 					walk_last = walk[-1];
1182 				}
1183 				*walkbuf = '\0';
1184 				/* increment the result length by how much we've added to the string */
1185 				*result_len += walkbuf - (result + *result_len);
1186 			}
1187 
1188 			if (limit != -1)
1189 				limit--;
1190 
1191 		} else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1192 			/* If we previously set PCRE_NOTEMPTY after a null match,
1193 			   this is not necessarily the end. We need to advance
1194 			   the start offset, and continue. Fudge the offset values
1195 			   to achieve this, unless we're already at the end of the string. */
1196 			if (g_notempty != 0 && start_offset < subject_len) {
1197 				offsets[0] = start_offset;
1198 				offsets[1] = start_offset + 1;
1199 				memcpy(&result[*result_len], piece, 1);
1200 				(*result_len)++;
1201 			} else {
1202 				new_len = *result_len + subject_len - start_offset;
1203 				if (new_len + 1 > alloc_len) {
1204 					alloc_len = new_len + 1; /* now we know exactly how long it is */
1205 					new_buf = safe_emalloc(alloc_len, sizeof(char), 0);
1206 					memcpy(new_buf, result, *result_len);
1207 					efree(result);
1208 					result = new_buf;
1209 				}
1210 				/* stick that last bit of string on our output */
1211 				memcpy(&result[*result_len], piece, subject_len - start_offset);
1212 				*result_len += subject_len - start_offset;
1213 				result[*result_len] = '\0';
1214 				break;
1215 			}
1216 		} else {
1217 			pcre_handle_exec_error(count TSRMLS_CC);
1218 			efree(result);
1219 			result = NULL;
1220 			break;
1221 		}
1222 
1223 		/* If we have matched an empty string, mimic what Perl's /g options does.
1224 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1225 		   the match again at the same point. If this fails (picked up above) we
1226 		   advance to the next character. */
1227 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1228 
1229 		/* Advance to the next piece. */
1230 		start_offset = offsets[1];
1231 	}
1232 
1233 	efree(offsets);
1234 	efree(subpat_names);
1235 
1236 	return result;
1237 }
1238 /* }}} */
1239 
1240 /* {{{ php_replace_in_subject
1241  */
php_replace_in_subject(zval * regex,zval * replace,zval ** subject,int * result_len,int limit,int is_callable_replace,int * replace_count TSRMLS_DC)1242 static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
1243 {
1244 	zval		**regex_entry,
1245 				**replace_entry = NULL,
1246 				 *replace_value,
1247 				  empty_replace;
1248 	char		*subject_value,
1249 				*result;
1250 	int			 subject_len;
1251 
1252 	/* Make sure we're dealing with strings. */
1253 	convert_to_string_ex(subject);
1254 	/* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1255 	ZVAL_STRINGL(&empty_replace, "", 0, 0);
1256 
1257 	/* If regex is an array */
1258 	if (Z_TYPE_P(regex) == IS_ARRAY) {
1259 		/* Duplicate subject string for repeated replacement */
1260 		subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
1261 		subject_len = Z_STRLEN_PP(subject);
1262 		*result_len = subject_len;
1263 
1264 		zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
1265 
1266 		replace_value = replace;
1267 		if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
1268 			zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
1269 
1270 		/* For each entry in the regex array, get the entry */
1271 		while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)&regex_entry) == SUCCESS) {
1272 			/* Make sure we're dealing with strings. */
1273 			convert_to_string_ex(regex_entry);
1274 
1275 			/* If replace is an array and not a callable construct */
1276 			if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1277 				/* Get current entry */
1278 				if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
1279 					if (!is_callable_replace) {
1280 						convert_to_string_ex(replace_entry);
1281 					}
1282 					replace_value = *replace_entry;
1283 					zend_hash_move_forward(Z_ARRVAL_P(replace));
1284 				} else {
1285 					/* We've run out of replacement strings, so use an empty one */
1286 					replace_value = &empty_replace;
1287 				}
1288 			}
1289 
1290 			/* Do the actual replacement and put the result back into subject_value
1291 			   for further replacements. */
1292 			if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
1293 										   Z_STRLEN_PP(regex_entry),
1294 										   subject_value,
1295 										   subject_len,
1296 										   replace_value,
1297 										   is_callable_replace,
1298 										   result_len,
1299 										   limit,
1300 										   replace_count TSRMLS_CC)) != NULL) {
1301 				efree(subject_value);
1302 				subject_value = result;
1303 				subject_len = *result_len;
1304 			} else {
1305 				efree(subject_value);
1306 				return NULL;
1307 			}
1308 
1309 			zend_hash_move_forward(Z_ARRVAL_P(regex));
1310 		}
1311 
1312 		return subject_value;
1313 	} else {
1314 		result = php_pcre_replace(Z_STRVAL_P(regex),
1315 								  Z_STRLEN_P(regex),
1316 								  Z_STRVAL_PP(subject),
1317 								  Z_STRLEN_PP(subject),
1318 								  replace,
1319 								  is_callable_replace,
1320 								  result_len,
1321 								  limit,
1322 								  replace_count TSRMLS_CC);
1323 		return result;
1324 	}
1325 }
1326 /* }}} */
1327 
1328 /* {{{ preg_replace_impl
1329  */
preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS,int is_callable_replace,int is_filter)1330 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
1331 {
1332 	zval		   **regex,
1333 				   **replace,
1334 				   **subject,
1335 				   **subject_entry,
1336 				   **zcount = NULL;
1337 	char			*result;
1338 	int				 result_len;
1339 	int				 limit_val = -1;
1340 	long			limit = -1;
1341 	char			*string_key;
1342 	ulong			 num_key;
1343 	char			*callback_name;
1344 	int				 replace_count=0, old_replace_count;
1345 
1346 	/* Get function parameters and do error-checking. */
1347 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", &regex, &replace, &subject, &limit, &zcount) == FAILURE) {
1348 		return;
1349 	}
1350 
1351 	if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
1352 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1353 		RETURN_FALSE;
1354 	}
1355 
1356 	SEPARATE_ZVAL(replace);
1357 	if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
1358 		convert_to_string_ex(replace);
1359 	}
1360 	if (is_callable_replace) {
1361 		if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
1362 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
1363 			efree(callback_name);
1364 			MAKE_COPY_ZVAL(subject, return_value);
1365 			return;
1366 		}
1367 		efree(callback_name);
1368 	}
1369 
1370 	SEPARATE_ZVAL(regex);
1371 	SEPARATE_ZVAL(subject);
1372 
1373 	if (ZEND_NUM_ARGS() > 3) {
1374 		limit_val = limit;
1375 	}
1376 
1377 	if (Z_TYPE_PP(regex) != IS_ARRAY)
1378 		convert_to_string_ex(regex);
1379 
1380 	/* if subject is an array */
1381 	if (Z_TYPE_PP(subject) == IS_ARRAY) {
1382 		array_init(return_value);
1383 		zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
1384 
1385 		/* For each subject entry, convert it to string, then perform replacement
1386 		   and add the result to the return_value array. */
1387 		while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
1388 			SEPARATE_ZVAL(subject_entry);
1389 			old_replace_count = replace_count;
1390 			if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1391 				if (!is_filter || replace_count > old_replace_count) {
1392 					/* Add to return array */
1393 					switch(zend_hash_get_current_key(Z_ARRVAL_PP(subject), &string_key, &num_key, 0))
1394 					{
1395 					case HASH_KEY_IS_STRING:
1396 						add_assoc_stringl(return_value, string_key, result, result_len, 0);
1397 						break;
1398 
1399 					case HASH_KEY_IS_LONG:
1400 						add_index_stringl(return_value, num_key, result, result_len, 0);
1401 						break;
1402 					}
1403 				} else {
1404 					efree(result);
1405 				}
1406 			}
1407 
1408 			zend_hash_move_forward(Z_ARRVAL_PP(subject));
1409 		}
1410 	} else {	/* if subject is not an array */
1411 		old_replace_count = replace_count;
1412 		if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1413 			if (!is_filter || replace_count > old_replace_count) {
1414 				RETVAL_STRINGL(result, result_len, 0);
1415 			} else {
1416 				efree(result);
1417 			}
1418 		}
1419 	}
1420 	if (ZEND_NUM_ARGS() > 4) {
1421 		zval_dtor(*zcount);
1422 		ZVAL_LONG(*zcount, replace_count);
1423 	}
1424 
1425 }
1426 /* }}} */
1427 
1428 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1429    Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)1430 static PHP_FUNCTION(preg_replace)
1431 {
1432 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1433 }
1434 /* }}} */
1435 
1436 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1437    Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)1438 static PHP_FUNCTION(preg_replace_callback)
1439 {
1440 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
1441 }
1442 /* }}} */
1443 
1444 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1445    Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)1446 static PHP_FUNCTION(preg_filter)
1447 {
1448 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1449 }
1450 /* }}} */
1451 
1452 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1453    Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)1454 static PHP_FUNCTION(preg_split)
1455 {
1456 	char				*regex;			/* Regular expression */
1457 	char				*subject;		/* String to match against */
1458 	int					 regex_len;
1459 	int					 subject_len;
1460 	long				 limit_val = -1;/* Integer value of limit */
1461 	long				 flags = 0;		/* Match control flags */
1462 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1463 
1464 	/* Get function parameters and do error checking */
1465 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", &regex, &regex_len,
1466 							  &subject, &subject_len, &limit_val, &flags) == FAILURE) {
1467 		RETURN_FALSE;
1468 	}
1469 
1470 	/* Compile regex or get it from cache. */
1471 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1472 		RETURN_FALSE;
1473 	}
1474 
1475 	php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
1476 }
1477 /* }}} */
1478 
1479 /* {{{ php_pcre_split
1480  */
php_pcre_split_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,long limit_val,long flags TSRMLS_DC)1481 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
1482 	long limit_val, long flags TSRMLS_DC)
1483 {
1484 	pcre_extra		*extra = NULL;		/* Holds results of studying */
1485 	pcre			*re_bump = NULL;	/* Regex instance for empty matches */
1486 	pcre_extra		*extra_bump = NULL;	/* Almost dummy */
1487 	pcre_extra		 extra_data;		/* Used locally for exec options */
1488 	int				*offsets;			/* Array of subpattern offsets */
1489 	int				 size_offsets;		/* Size of the offsets array */
1490 	int				 exoptions = 0;		/* Execution options */
1491 	int				 count = 0;			/* Count of matched subpatterns */
1492 	int				 start_offset;		/* Where the new search starts */
1493 	int				 next_offset;		/* End of the last delimiter match + 1 */
1494 	int				 g_notempty = 0;	/* If the match should not be empty */
1495 	char			*last_match;		/* Location of last match */
1496 	int				 rc;
1497 	int				 no_empty;			/* If NO_EMPTY flag is set */
1498 	int				 delim_capture; 	/* If delimiters should be captured */
1499 	int				 offset_capture;	/* If offsets should be captured */
1500 
1501 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
1502 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1503 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1504 
1505 	if (limit_val == 0) {
1506 		limit_val = -1;
1507 	}
1508 
1509 	if (extra == NULL) {
1510 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1511 		extra = &extra_data;
1512 	}
1513 	extra->match_limit = PCRE_G(backtrack_limit);
1514 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1515 
1516 	/* Initialize return value */
1517 	array_init(return_value);
1518 
1519 	/* Calculate the size of the offsets array, and allocate memory for it. */
1520 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1521 	if (rc < 0) {
1522 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1523 		RETURN_FALSE;
1524 	}
1525 	size_offsets = (size_offsets + 1) * 3;
1526 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1527 
1528 	/* Start at the beginning of the string */
1529 	start_offset = 0;
1530 	next_offset = 0;
1531 	last_match = subject;
1532 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1533 
1534 	/* Get next piece if no limit or limit not yet reached and something matched*/
1535 	while ((limit_val == -1 || limit_val > 1)) {
1536 		count = pcre_exec(pce->re, extra, subject,
1537 						  subject_len, start_offset,
1538 						  exoptions|g_notempty, offsets, size_offsets);
1539 
1540 		/* the string was already proved to be valid UTF-8 */
1541 		exoptions |= PCRE_NO_UTF8_CHECK;
1542 
1543 		/* Check for too many substrings condition. */
1544 		if (count == 0) {
1545 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1546 			count = size_offsets/3;
1547 		}
1548 
1549 		/* If something matched */
1550 		if (count > 0 && (offsets[1] - offsets[0] >= 0)) {
1551 			if (!no_empty || &subject[offsets[0]] != last_match) {
1552 
1553 				if (offset_capture) {
1554 					/* Add (match, offset) pair to the return value */
1555 					add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
1556 				} else {
1557 					/* Add the piece to the return value */
1558 					add_next_index_stringl(return_value, last_match,
1559 								   	   &subject[offsets[0]]-last_match, 1);
1560 				}
1561 
1562 				/* One less left to do */
1563 				if (limit_val != -1)
1564 					limit_val--;
1565 			}
1566 
1567 			last_match = &subject[offsets[1]];
1568 			next_offset = offsets[1];
1569 
1570 			if (delim_capture) {
1571 				int i, match_len;
1572 				for (i = 1; i < count; i++) {
1573 					match_len = offsets[(i<<1)+1] - offsets[i<<1];
1574 					/* If we have matched a delimiter */
1575 					if (!no_empty || match_len > 0) {
1576 						if (offset_capture) {
1577 							add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
1578 						} else {
1579 							add_next_index_stringl(return_value,
1580 												   &subject[offsets[i<<1]],
1581 												   match_len, 1);
1582 						}
1583 					}
1584 				}
1585 			}
1586 		} else if (count == PCRE_ERROR_NOMATCH) {
1587 			/* If we previously set PCRE_NOTEMPTY after a null match,
1588 			   this is not necessarily the end. We need to advance
1589 			   the start offset, and continue. Fudge the offset values
1590 			   to achieve this, unless we're already at the end of the string. */
1591 			if (g_notempty != 0 && start_offset < subject_len) {
1592 				if (pce->compile_options & PCRE_UTF8) {
1593 					if (re_bump == NULL) {
1594 						int dummy;
1595 
1596 						if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
1597 							RETURN_FALSE;
1598 						}
1599 					}
1600 					count = pcre_exec(re_bump, extra_bump, subject,
1601 							  subject_len, start_offset,
1602 							  exoptions, offsets, size_offsets);
1603 					if (count < 1) {
1604 						php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
1605 						RETURN_FALSE;
1606 					}
1607 				} else {
1608 					offsets[0] = start_offset;
1609 					offsets[1] = start_offset + 1;
1610 				}
1611 			} else
1612 				break;
1613 		} else {
1614 			pcre_handle_exec_error(count TSRMLS_CC);
1615 			break;
1616 		}
1617 
1618 		/* If we have matched an empty string, mimic what Perl's /g options does.
1619 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1620 		   the match again at the same point. If this fails (picked up above) we
1621 		   advance to the next character. */
1622 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1623 
1624 		/* Advance to the position right after the last full match */
1625 		start_offset = offsets[1];
1626 	}
1627 
1628 
1629 	start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
1630 
1631 	if (!no_empty || start_offset < subject_len)
1632 	{
1633 		if (offset_capture) {
1634 			/* Add the last (match, offset) pair to the return value */
1635 			add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
1636 		} else {
1637 			/* Add the last piece to the return value */
1638 			add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
1639 		}
1640 	}
1641 
1642 
1643 	/* Clean up */
1644 	efree(offsets);
1645 }
1646 /* }}} */
1647 
1648 /* {{{ proto string preg_quote(string str [, string delim_char])
1649    Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)1650 static PHP_FUNCTION(preg_quote)
1651 {
1652 	int		 in_str_len;
1653 	char	*in_str;		/* Input string argument */
1654 	char	*in_str_end;    /* End of the input string */
1655 	int		 delim_len = 0;
1656 	char	*delim = NULL;	/* Additional delimiter argument */
1657 	char	*out_str,		/* Output string with quoted characters */
1658 		 	*p,				/* Iterator for input string */
1659 			*q,				/* Iterator for output string */
1660 			 delim_char=0,	/* Delimiter character to be quoted */
1661 			 c;				/* Current character */
1662 	zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1663 
1664 	/* Get the arguments and check for errors */
1665 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
1666 							  &delim, &delim_len) == FAILURE) {
1667 		return;
1668 	}
1669 
1670 	in_str_end = in_str + in_str_len;
1671 
1672 	/* Nothing to do if we got an empty string */
1673 	if (in_str == in_str_end) {
1674 		RETURN_EMPTY_STRING();
1675 	}
1676 
1677 	if (delim && *delim) {
1678 		delim_char = delim[0];
1679 		quote_delim = 1;
1680 	}
1681 
1682 	/* Allocate enough memory so that even if each character
1683 	   is quoted, we won't run out of room */
1684 	out_str = safe_emalloc(4, in_str_len, 1);
1685 
1686 	/* Go through the string and quote necessary characters */
1687 	for(p = in_str, q = out_str; p != in_str_end; p++) {
1688 		c = *p;
1689 		switch(c) {
1690 			case '.':
1691 			case '\\':
1692 			case '+':
1693 			case '*':
1694 			case '?':
1695 			case '[':
1696 			case '^':
1697 			case ']':
1698 			case '$':
1699 			case '(':
1700 			case ')':
1701 			case '{':
1702 			case '}':
1703 			case '=':
1704 			case '!':
1705 			case '>':
1706 			case '<':
1707 			case '|':
1708 			case ':':
1709 			case '-':
1710 				*q++ = '\\';
1711 				*q++ = c;
1712 				break;
1713 
1714 			case '\0':
1715 				*q++ = '\\';
1716 				*q++ = '0';
1717 				*q++ = '0';
1718 				*q++ = '0';
1719 				break;
1720 
1721 			default:
1722 				if (quote_delim && c == delim_char)
1723 					*q++ = '\\';
1724 				*q++ = c;
1725 				break;
1726 		}
1727 	}
1728 	*q = '\0';
1729 
1730 	/* Reallocate string and return it */
1731 	RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
1732 }
1733 /* }}} */
1734 
1735 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1736    Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)1737 static PHP_FUNCTION(preg_grep)
1738 {
1739 	char				*regex;			/* Regular expression */
1740 	int				 	 regex_len;
1741 	zval				*input;			/* Input array */
1742 	long				 flags = 0;		/* Match control flags */
1743 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1744 
1745 	/* Get arguments and do error checking */
1746 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", &regex, &regex_len,
1747 							  &input, &flags) == FAILURE) {
1748 		return;
1749 	}
1750 
1751 	/* Compile regex or get it from cache. */
1752 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1753 		RETURN_FALSE;
1754 	}
1755 
1756 	php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
1757 }
1758 /* }}} */
1759 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,long flags TSRMLS_DC)1760 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
1761 {
1762 	zval		   **entry;				/* An entry in the input array */
1763 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
1764 	pcre_extra		 extra_data;		/* Used locally for exec options */
1765 	int				*offsets;			/* Array of subpattern offsets */
1766 	int				 size_offsets;		/* Size of the offsets array */
1767 	int				 count = 0;			/* Count of matched subpatterns */
1768 	char			*string_key;
1769 	ulong			 num_key;
1770 	zend_bool		 invert;			/* Whether to return non-matching
1771 										   entries */
1772 	int				 rc;
1773 
1774 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
1775 
1776 	if (extra == NULL) {
1777 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1778 		extra = &extra_data;
1779 	}
1780 	extra->match_limit = PCRE_G(backtrack_limit);
1781 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1782 
1783 	/* Calculate the size of the offsets array, and allocate memory for it. */
1784 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1785 	if (rc < 0) {
1786 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1787 		RETURN_FALSE;
1788 	}
1789 	size_offsets = (size_offsets + 1) * 3;
1790 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1791 
1792 	/* Initialize return array */
1793 	array_init(return_value);
1794 
1795 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1796 
1797 	/* Go through the input array */
1798 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1799 	while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
1800 		zval subject = **entry;
1801 
1802 		if (Z_TYPE_PP(entry) != IS_STRING) {
1803 			zval_copy_ctor(&subject);
1804 			convert_to_string(&subject);
1805 		}
1806 
1807 		/* Perform the match */
1808 		count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
1809 						  Z_STRLEN(subject), 0,
1810 						  0, offsets, size_offsets);
1811 
1812 		/* Check for too many substrings condition. */
1813 		if (count == 0) {
1814 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
1815 			count = size_offsets/3;
1816 		} else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1817 			pcre_handle_exec_error(count TSRMLS_CC);
1818 			break;
1819 		}
1820 
1821 		/* If the entry fits our requirements */
1822 		if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
1823 
1824 			Z_ADDREF_PP(entry);
1825 
1826 			/* Add to return array */
1827 			switch (zend_hash_get_current_key(Z_ARRVAL_P(input), &string_key, &num_key, 0))
1828 			{
1829 				case HASH_KEY_IS_STRING:
1830 					zend_hash_update(Z_ARRVAL_P(return_value), string_key,
1831 									 strlen(string_key)+1, entry, sizeof(zval *), NULL);
1832 					break;
1833 
1834 				case HASH_KEY_IS_LONG:
1835 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
1836 										   sizeof(zval *), NULL);
1837 					break;
1838 			}
1839 		}
1840 
1841 		if (Z_TYPE_PP(entry) != IS_STRING) {
1842 			zval_dtor(&subject);
1843 		}
1844 
1845 		zend_hash_move_forward(Z_ARRVAL_P(input));
1846 	}
1847 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1848 	/* Clean up */
1849 	efree(offsets);
1850 }
1851 /* }}} */
1852 
1853 /* {{{ proto int preg_last_error()
1854    Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)1855 static PHP_FUNCTION(preg_last_error)
1856 {
1857 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
1858 		return;
1859 	}
1860 
1861 	RETURN_LONG(PCRE_G(error_code));
1862 }
1863 /* }}} */
1864 
1865 /* {{{ module definition structures */
1866 
1867 /* {{{ arginfo */
1868 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
1869     ZEND_ARG_INFO(0, pattern)
1870     ZEND_ARG_INFO(0, subject)
1871     ZEND_ARG_INFO(1, subpatterns) /* array */
1872     ZEND_ARG_INFO(0, flags)
1873     ZEND_ARG_INFO(0, offset)
1874 ZEND_END_ARG_INFO()
1875 
1876 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2)
1877     ZEND_ARG_INFO(0, pattern)
1878     ZEND_ARG_INFO(0, subject)
1879     ZEND_ARG_INFO(1, subpatterns) /* array */
1880     ZEND_ARG_INFO(0, flags)
1881     ZEND_ARG_INFO(0, offset)
1882 ZEND_END_ARG_INFO()
1883 
1884 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
1885     ZEND_ARG_INFO(0, regex)
1886     ZEND_ARG_INFO(0, replace)
1887     ZEND_ARG_INFO(0, subject)
1888     ZEND_ARG_INFO(0, limit)
1889     ZEND_ARG_INFO(1, count)
1890 ZEND_END_ARG_INFO()
1891 
1892 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
1893     ZEND_ARG_INFO(0, regex)
1894     ZEND_ARG_INFO(0, callback)
1895     ZEND_ARG_INFO(0, subject)
1896     ZEND_ARG_INFO(0, limit)
1897     ZEND_ARG_INFO(1, count)
1898 ZEND_END_ARG_INFO()
1899 
1900 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
1901     ZEND_ARG_INFO(0, pattern)
1902     ZEND_ARG_INFO(0, subject)
1903     ZEND_ARG_INFO(0, limit)
1904     ZEND_ARG_INFO(0, flags)
1905 ZEND_END_ARG_INFO()
1906 
1907 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
1908     ZEND_ARG_INFO(0, str)
1909     ZEND_ARG_INFO(0, delim_char)
1910 ZEND_END_ARG_INFO()
1911 
1912 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
1913     ZEND_ARG_INFO(0, regex)
1914     ZEND_ARG_INFO(0, input) /* array */
1915     ZEND_ARG_INFO(0, flags)
1916 ZEND_END_ARG_INFO()
1917 
1918 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
1919 ZEND_END_ARG_INFO()
1920 /* }}} */
1921 
1922 static const zend_function_entry pcre_functions[] = {
1923 	PHP_FE(preg_match,				arginfo_preg_match)
1924 	PHP_FE(preg_match_all,			arginfo_preg_match_all)
1925 	PHP_FE(preg_replace,			arginfo_preg_replace)
1926 	PHP_FE(preg_replace_callback,	arginfo_preg_replace_callback)
1927 	PHP_FE(preg_filter,				arginfo_preg_replace)
1928 	PHP_FE(preg_split,				arginfo_preg_split)
1929 	PHP_FE(preg_quote,				arginfo_preg_quote)
1930 	PHP_FE(preg_grep,				arginfo_preg_grep)
1931 	PHP_FE(preg_last_error,			arginfo_preg_last_error)
1932 	PHP_FE_END
1933 };
1934 
1935 zend_module_entry pcre_module_entry = {
1936 	STANDARD_MODULE_HEADER,
1937    "pcre",
1938 	pcre_functions,
1939 	PHP_MINIT(pcre),
1940 	PHP_MSHUTDOWN(pcre),
1941 	NULL,
1942 	NULL,
1943 	PHP_MINFO(pcre),
1944 	NO_VERSION_YET,
1945 	PHP_MODULE_GLOBALS(pcre),
1946 	PHP_GINIT(pcre),
1947 	PHP_GSHUTDOWN(pcre),
1948 	NULL,
1949 	STANDARD_MODULE_PROPERTIES_EX
1950 };
1951 
1952 #ifdef COMPILE_DL_PCRE
1953 ZEND_GET_MODULE(pcre)
1954 #endif
1955 
1956 /* }}} */
1957 
1958 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
1959 
1960 /*
1961  * Local variables:
1962  * tab-width: 4
1963  * c-basic-offset: 4
1964  * End:
1965  * vim600: sw=4 ts=4 fdm=marker
1966  * vim<600: sw=4 ts=4
1967  */
1968