xref: /PHP-5.6/ext/pcre/php_pcre.c (revision 29e2a204)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2016 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Andrei Zmievski <andrei@php.net>                             |
16    +----------------------------------------------------------------------+
17  */
18 
19 /* $Id$ */
20 
21 #include "php.h"
22 #include "php_ini.h"
23 #include "php_globals.h"
24 #include "php_pcre.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/php_smart_str.h"
27 
28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
29 
30 #include "ext/standard/php_string.h"
31 
/* Result-ordering modes for global matching. php_pcre_match_impl() reads the
 * mode from the low byte of the flags argument (flags & 0xff). */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2
/* Flag bit (outside the low byte) asking for every captured string to be
 * returned together with its offset in the subject. */
#define PREG_OFFSET_CAPTURE			(1<<8)

/* Flag bits exported to userland as the PREG_SPLIT_* constants. */
#define	PREG_SPLIT_NO_EMPTY			(1<<0)
#define PREG_SPLIT_DELIM_CAPTURE	(1<<1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1<<2)

/* Internal option bit recorded in a cache entry's preg_options when the
 * pattern carries the 'e' modifier. */
#define PREG_REPLACE_EVAL			(1<<0)

/* Flag bit exported to userland as PREG_GREP_INVERT. */
#define PREG_GREP_INVERT			(1<<0)

/* Number of compiled patterns kept in the per-process cache before a batch of
 * PCRE_CACHE_SIZE / 8 unreferenced entries is evicted. */
#define PCRE_CACHE_SIZE 4096

/* not fully functional workaround for libpcre < 8.0, see bug #70232 */
#ifndef PCRE_NOTEMPTY_ATSTART
# define PCRE_NOTEMPTY_ATSTART PCRE_NOTEMPTY
#endif
50 
/* Error states stored in PCRE_G(error_code); each one is exported to userland
 * under the matching PREG_*_ERROR constant name. Values are spelled out so the
 * userland-visible numbers are obvious at a glance. */
enum {
	PHP_PCRE_NO_ERROR              = 0,
	PHP_PCRE_INTERNAL_ERROR        = 1,
	PHP_PCRE_BACKTRACK_LIMIT_ERROR = 2,
	PHP_PCRE_RECURSION_LIMIT_ERROR = 3,
	PHP_PCRE_BAD_UTF8_ERROR        = 4,
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR = 5
};
59 
60 
/* Declares this extension's module globals (pattern cache, match limits and
 * last error code); the rest of the file reaches them through PCRE_G(). */
ZEND_DECLARE_MODULE_GLOBALS(pcre)
62 
63 
64 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
65 {
66 	int preg_code = 0;
67 
68 	switch (pcre_code) {
69 		case PCRE_ERROR_MATCHLIMIT:
70 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
71 			break;
72 
73 		case PCRE_ERROR_RECURSIONLIMIT:
74 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
75 			break;
76 
77 		case PCRE_ERROR_BADUTF8:
78 			preg_code = PHP_PCRE_BAD_UTF8_ERROR;
79 			break;
80 
81 		case PCRE_ERROR_BADUTF8_OFFSET:
82 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
83 			break;
84 
85 		default:
86 			preg_code = PHP_PCRE_INTERNAL_ERROR;
87 			break;
88 	}
89 
90 	PCRE_G(error_code) = preg_code;
91 }
92 /* }}} */
93 
php_free_pcre_cache(void * data)94 static void php_free_pcre_cache(void *data) /* {{{ */
95 {
96 	pcre_cache_entry *pce = (pcre_cache_entry *) data;
97 	if (!pce) return;
98 	pefree(pce->re, 1);
99 	if (pce->extra) pefree(pce->extra, 1);
100 #if HAVE_SETLOCALE
101 	if ((void*)pce->tables) pefree((void*)pce->tables, 1);
102 	pefree(pce->locale, 1);
103 #endif
104 }
105 /* }}} */
106 
PHP_GINIT_FUNCTION(pcre)107 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
108 {
109 	zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
110 	pcre_globals->backtrack_limit = 0;
111 	pcre_globals->recursion_limit = 0;
112 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
113 }
114 /* }}} */
115 
/* Globals destructor: tears down the pattern cache; each remaining entry is
 * released through the destructor given to zend_hash_init() in GINIT. */
static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
{
	zend_hash_destroy(&pcre_globals->pcre_cache);
}
/* }}} */
121 
/* INI directives of the extension. Both are registered with PHP_INI_ALL and
 * stored by OnUpdateLong into the backtrack_limit / recursion_limit fields of
 * the module globals; php_pcre_match_impl() copies them into the pcre_extra
 * block before every match. Defaults: 1000000 and 100000. */
PHP_INI_BEGIN()
	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
PHP_INI_END()
126 
127 
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Emits the extension's info section: a two-column table with an "enabled"
 * row and the library version string returned by pcre_version(), followed by
 * the extension's INI entries. */
static PHP_MINFO_FUNCTION(pcre)
{
	php_info_print_table_start();
	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled" );
	php_info_print_table_row(2, "PCRE Library Version", pcre_version() );
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
139 
/* {{{ PHP_MINIT_FUNCTION(pcre)
 * Module startup: registers the INI directives and exports the PREG_* flag
 * and error constants plus PCRE_VERSION. All constants are case-sensitive and
 * persistent. Always returns SUCCESS. */
static PHP_MINIT_FUNCTION(pcre)
{
	REGISTER_INI_ENTRIES();

	/* Flags accepted by the matching / splitting / grep functions */
	REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);

	/* Error codes, i.e. the values stored in PCRE_G(error_code) */
	REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
	/* Version string of the PCRE library in use, as reported by pcre_version() */
	REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);

	return SUCCESS;
}
/* }}} */
164 
/* {{{ PHP_MSHUTDOWN_FUNCTION(pcre)
 * Module shutdown: removes the INI directives registered in MINIT. Always
 * returns SUCCESS. */
static PHP_MSHUTDOWN_FUNCTION(pcre)
{
	UNREGISTER_INI_ENTRIES();

	return SUCCESS;
}
/* }}} */
173 
174 /* {{{ static pcre_clean_cache */
pcre_clean_cache(void * data,void * arg TSRMLS_DC)175 static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
176 {
177 	pcre_cache_entry *pce = (pcre_cache_entry *) data;
178 	int *num_clean = (int *)arg;
179 
180 	if (*num_clean > 0 && !pce->refcount) {
181 		(*num_clean)--;
182 		return ZEND_HASH_APPLY_REMOVE;
183 	} else {
184 		return ZEND_HASH_APPLY_KEEP;
185 	}
186 }
187 /* }}} */
188 
189 /* {{{ static make_subpats_table */
make_subpats_table(int num_subpats,pcre_cache_entry * pce TSRMLS_DC)190 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
191 {
192 	pcre_extra *extra = pce->extra;
193 	int name_cnt = 0, name_size, ni = 0;
194 	int rc;
195 	char *name_table;
196 	unsigned short name_idx;
197 	char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
198 
199 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
200 	if (rc < 0) {
201 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
202 		efree(subpat_names);
203 		return NULL;
204 	}
205 	if (name_cnt > 0) {
206 		int rc1, rc2;
207 
208 		rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
209 		rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
210 		rc = rc2 ? rc2 : rc1;
211 		if (rc < 0) {
212 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
213 			efree(subpat_names);
214 			return NULL;
215 		}
216 
217 		while (ni++ < name_cnt) {
218 			name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
219 			subpat_names[name_idx] = name_table + 2;
220 			if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
221 				php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
222 				efree(subpat_names);
223 				return NULL;
224 			}
225 			name_table += name_size;
226 		}
227 	}
228 
229 	return subpat_names;
230 }
231 /* }}} */
232 
233 /* {{{ static calculate_unit_length */
234 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE_UTF8. */
calculate_unit_length(pcre_cache_entry * pce,char * start)235 static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
236 {
237 	int unit_len;
238 
239 	if (pce->compile_options & PCRE_UTF8) {
240 		char *end = start;
241 
242 		/* skip continuation bytes */
243 		while ((*++end & 0xC0) == 0x80);
244 		unit_len = end - start;
245 	} else {
246 		unit_len = 1;
247 	}
248 	return unit_len;
249 }
250 /* }}} */
251 
/* {{{ pcre_get_compiled_regex_cache
 * Returns the cache entry for a PHP-style regex string ("<delim>pattern<delim>modifiers"),
 * compiling and caching it when no usable entry exists.
 *
 * regex / regex_len: the full regex string and its length; the cache key is
 *                    the string including its terminating NUL (regex_len+1).
 *
 * Returns a pointer to the entry stored in PCRE_G(pcre_cache), or NULL after
 * raising an E_WARNING when the string is empty, contains a NUL byte, has a
 * bad or missing delimiter, carries an unknown modifier, or fails to compile.
 */
PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
{
	pcre				*re = NULL;
	pcre_extra			*extra;
	int					 coptions = 0;	/* options handed to pcre_compile() */
	int					 soptions = 0;	/* options handed to pcre_study() */
	const char			*error;
	int					 erroffset;
	char				 delimiter;
	char				 start_delimiter;
	char				 end_delimiter;
	char				*p, *pp;
	char				*pattern;
	int					 do_study = 0;
	int					 poptions = 0;	/* PHP-level options (PREG_REPLACE_EVAL) */
	int				count = 0;
	unsigned const char *tables = NULL;
#if HAVE_SETLOCALE
	char				*locale;
#endif
	pcre_cache_entry	*pce;
	pcre_cache_entry	 new_entry;
	char                *tmp = NULL;

#if HAVE_SETLOCALE
# if defined(PHP_WIN32) && defined(ZTS)
	_configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
# endif
	/* Query (not change) the current LC_CTYPE locale */
	locale = setlocale(LC_CTYPE, NULL);
#endif

	/* Try to lookup the cached regex entry, and if successful, just pass
	   back the compiled pattern, otherwise go on and compile it. */
	if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
		/*
		 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
		 * is, we flush it and compile the pattern from scratch.
		 */
		if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
			zend_hash_clean(&PCRE_G(pcre_cache));
		} else {
#if HAVE_SETLOCALE
			/* A hit only counts if it was compiled under the current locale;
			   otherwise fall through and recompile, replacing the entry. */
			if (!strcmp(pce->locale, locale)) {
#endif
				return pce;
#if HAVE_SETLOCALE
			}
#endif
		}
	}

	p = regex;

	/* Parse through the leading whitespace, and display a warning if we
	   get to the end without encountering a delimiter. */
	while (isspace((int)*(unsigned char *)p)) p++;
	if (*p == 0) {
		/* A NUL before regex_len bytes were consumed is an embedded NUL byte */
		php_error_docref(NULL TSRMLS_CC, E_WARNING,
						 p < regex + regex_len ? "Null byte in regex" : "Empty regular expression");
		return NULL;
	}

	/* Get the delimiter and display a warning if it is alphanumeric
	   or a backslash. */
	delimiter = *p++;
	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
		return NULL;
	}

	start_delimiter = delimiter;
	/* Lookup trick: the character five positions after the first occurrence
	   is the closing counterpart of an opening bracket, and a closing bracket
	   maps to itself. */
	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
		delimiter = pp[5];
	end_delimiter = delimiter;

	pp = p;

	if (start_delimiter == end_delimiter) {
		/* We need to iterate through the pattern, searching for the ending delimiter,
		   but skipping the backslashed delimiters.  If the ending delimiter is not
		   found, display a warning. */
		while (*pp != 0) {
			if (*pp == '\\' && pp[1] != 0) pp++;
			else if (*pp == delimiter)
				break;
			pp++;
		}
	} else {
		/* We iterate through the pattern, searching for the matching ending
		 * delimiter. For each matching starting delimiter, we increment nesting
		 * level, and decrement it for each matching ending delimiter. If we
		 * reach the end of the pattern without matching, display a warning.
		 */
		int brackets = 1; 	/* brackets nesting level */
		while (*pp != 0) {
			if (*pp == '\\' && pp[1] != 0) pp++;
			else if (*pp == end_delimiter && --brackets <= 0)
				break;
			else if (*pp == start_delimiter)
				brackets++;
			pp++;
		}
	}

	/* The scan stopped on a NUL: either an embedded NUL byte or no closing
	   delimiter at all. */
	if (*pp == 0) {
		if (pp < regex + regex_len) {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
		} else if (start_delimiter == end_delimiter) {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
		} else {
			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
		}
		return NULL;
	}

	/* Make a copy of the actual pattern. */
	pattern = estrndup(p, pp-p);

	/* Move on to the options */
	pp++;

	/* Parse through the options, setting appropriate flags.  Display
	   a warning if we encounter an unknown modifier. */
	while (pp < regex + regex_len) {
		switch (*pp++) {
			/* Perl compatible options */
			case 'i':	coptions |= PCRE_CASELESS;		break;
			case 'm':	coptions |= PCRE_MULTILINE;		break;
			case 's':	coptions |= PCRE_DOTALL;		break;
			case 'x':	coptions |= PCRE_EXTENDED;		break;

			/* PCRE specific options */
			case 'A':	coptions |= PCRE_ANCHORED;		break;
			case 'D':	coptions |= PCRE_DOLLAR_ENDONLY;break;
			case 'S':	do_study  = 1;					break;
			case 'U':	coptions |= PCRE_UNGREEDY;		break;
			case 'X':	coptions |= PCRE_EXTRA;			break;
			case 'u':	coptions |= PCRE_UTF8;
	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
       characters, even in UTF-8 mode. However, this can be changed by setting
       the PCRE_UCP option. */
#ifdef PCRE_UCP
						coptions |= PCRE_UCP;
#endif
				break;
			case 'J':	coptions |= PCRE_DUPNAMES;		break;

			/* Custom preg options */
			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;

			/* Spaces and newlines between modifiers are ignored */
			case ' ':
			case '\n':
				break;

			default:
				if (pp[-1]) {
					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
				} else {
					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
				}
				efree(pattern);
				return NULL;
		}
	}

#if HAVE_SETLOCALE
	/* Locale-specific character tables are only built outside the "C" locale */
	if (strcmp(locale, "C"))
		tables = pcre_maketables();
#endif

	/* Compile pattern and display a warning if compilation failed. */
	re = pcre_compile(pattern,
					  coptions,
					  &error,
					  &erroffset,
					  tables);

	if (re == NULL) {
		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
		efree(pattern);
		if (tables) {
			pefree((void*)tables, 1);
		}
		return NULL;
	}

	/* If study option was specified, study the pattern and
	   store the result in extra for passing to pcre_exec. */
	if (do_study) {
		extra = pcre_study(re, soptions, &error);
		if (extra) {
			extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
		}
		/* A study error is reported but does not abort: the pattern is still cached */
		if (error != NULL) {
			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
		}
	} else {
		extra = NULL;
	}

	efree(pattern);

	/*
	 * If we reached cache limit, clean out the items from the head of the list;
	 * these are supposedly the oldest ones (but not necessarily the least used
	 * ones).
	 */
	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
		int num_clean = PCRE_CACHE_SIZE / 8;
		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
	}

	/* Store the compiled pattern and extra info in the cache. */
	new_entry.re = re;
	new_entry.extra = extra;
	new_entry.preg_options = poptions;
	new_entry.compile_options = coptions;
#if HAVE_SETLOCALE
	new_entry.locale = pestrdup(locale, 1);
	new_entry.tables = tables;
#endif
	new_entry.refcount = 0;

	/*
	 * Interned strings are not duplicated when stored in HashTable,
	 * but all the interned strings created during HTTP request are removed
	 * at end of request. However PCRE_G(pcre_cache) must be consistent
	 * on the next request as well. So we disable usage of interned strings
	 * as hash keys especially for this table.
	 * See bug #63180
	 */
	if (IS_INTERNED(regex)) {
		regex = tmp = estrndup(regex, regex_len);
	}

	/* The entry is copied into the table; pce receives the address of that copy */
	zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
						sizeof(pcre_cache_entry), (void**)&pce);

	if (tmp) {
		efree(tmp);
	}

	return pce;
}
/* }}} */
499 
500 /* {{{ pcre_get_compiled_regex
501  */
pcre_get_compiled_regex(char * regex,pcre_extra ** extra,int * preg_options TSRMLS_DC)502 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
503 {
504 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
505 
506 	if (extra) {
507 		*extra = pce ? pce->extra : NULL;
508 	}
509 	if (preg_options) {
510 		*preg_options = pce ? pce->preg_options : 0;
511 	}
512 
513 	return pce ? pce->re : NULL;
514 }
515 /* }}} */
516 
517 /* {{{ pcre_get_compiled_regex_ex
518  */
pcre_get_compiled_regex_ex(char * regex,pcre_extra ** extra,int * preg_options,int * compile_options TSRMLS_DC)519 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
520 {
521 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
522 
523 	if (extra) {
524 		*extra = pce ? pce->extra : NULL;
525 	}
526 	if (preg_options) {
527 		*preg_options = pce ? pce->preg_options : 0;
528 	}
529 	if (compile_options) {
530 		*compile_options = pce ? pce->compile_options : 0;
531 	}
532 
533 	return pce ? pce->re : NULL;
534 }
535 /* }}} */
536 
537 /* {{{ add_offset_pair */
add_offset_pair(zval * result,char * str,int len,int offset,char * name)538 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
539 {
540 	zval *match_pair;
541 
542 	ALLOC_ZVAL(match_pair);
543 	array_init(match_pair);
544 	INIT_PZVAL(match_pair);
545 
546 	/* Add (match, offset) to the return value */
547 	add_next_index_stringl(match_pair, str, len, 1);
548 	add_next_index_long(match_pair, offset);
549 
550 	if (name) {
551 		zval_add_ref(&match_pair);
552 		zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
553 	}
554 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
555 }
556 /* }}} */
557 
/* Shared entry point of preg_match() (global == 0) and preg_match_all()
 * (global != 0): parses the userland arguments, fetches the compiled pattern
 * from the cache and delegates to php_pcre_match_impl(). Returns FALSE to
 * userland when argument parsing or pattern compilation fails. */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	char             *pattern_str;          /* Regular expression */
	int               pattern_len;
	char             *haystack;             /* String to match against */
	int               haystack_len;
	zval             *captures = NULL;      /* Array for subpatterns */
	long              match_flags = 0;      /* Match control flags */
	long              offset = 0;           /* Where the search starts */
	pcre_cache_entry *entry;                /* Compiled regular expression */

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|zll",
			&pattern_str, &pattern_len, &haystack, &haystack_len,
			&captures, &match_flags, &offset) == FAILURE) {
		RETURN_FALSE;
	}

	/* Compile regex or get it from cache. */
	entry = pcre_get_compiled_regex_cache(pattern_str, pattern_len TSRMLS_CC);
	if (entry == NULL) {
		RETURN_FALSE;
	}

	/* Hold a reference for the duration of the match so that cache cleanup
	   skips this entry. Flags only count when a 4th argument was passed. */
	entry->refcount++;
	php_pcre_match_impl(entry, haystack, haystack_len, return_value, captures,
		global, ZEND_NUM_ARGS() >= 4, match_flags, offset TSRMLS_CC);
	entry->refcount--;
}
/* }}} */
586 
587 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,zval * subpats,int global,int use_flags,long flags,long start_offset TSRMLS_DC)588 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
589 	zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
590 {
591 	zval			*result_set,		/* Holds a set of subpatterns after
592 										   a global match */
593 				   **match_sets = NULL;	/* An array of sets of matches for each
594 										   subpattern after a global match */
595 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
596 	pcre_extra		 extra_data;		/* Used locally for exec options */
597 	int				 exoptions = 0;		/* Execution options */
598 	int				 count = 0;			/* Count of matched subpatterns */
599 	int				*offsets;			/* Array of subpattern offsets */
600 	int				 num_subpats;		/* Number of captured subpatterns */
601 	int				 size_offsets;		/* Size of the offsets array */
602 	int				 matched;			/* Has anything matched */
603 	int				 g_notempty = 0;	/* If the match should not be empty */
604 	const char	   **stringlist;		/* Holds list of subpatterns */
605 	char 		   **subpat_names;		/* Array for named subpatterns */
606 	int				 i, rc;
607 	int				 subpats_order;		/* Order of subpattern matches */
608 	int				 offset_capture;    /* Capture match offsets: yes/no */
609 	unsigned char   *mark = NULL;       /* Target for MARK name */
610 	zval            *marks = NULL;      /* Array of marks for PREG_PATTERN_ORDER */
611 
612 	/* Overwrite the passed-in value for subpatterns with an empty array. */
613 	if (subpats != NULL) {
614 		zval_dtor(subpats);
615 		array_init(subpats);
616 	}
617 
618 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
619 
620 	if (use_flags) {
621 		offset_capture = flags & PREG_OFFSET_CAPTURE;
622 
623 		/*
624 		 * subpats_order is pre-set to pattern mode so we change it only if
625 		 * necessary.
626 		 */
627 		if (flags & 0xff) {
628 			subpats_order = flags & 0xff;
629 		}
630 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
631 			(!global && subpats_order != 0)) {
632 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
633 			return;
634 		}
635 	} else {
636 		offset_capture = 0;
637 	}
638 
639 	/* Negative offset counts from the end of the string. */
640 	if (start_offset < 0) {
641 		start_offset = subject_len + start_offset;
642 		if (start_offset < 0) {
643 			start_offset = 0;
644 		}
645 	}
646 
647 	if (extra == NULL) {
648 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
649 		extra = &extra_data;
650 	}
651 	extra->match_limit = PCRE_G(backtrack_limit);
652 	extra->match_limit_recursion = PCRE_G(recursion_limit);
653 #ifdef PCRE_EXTRA_MARK
654 	extra->mark = &mark;
655 	extra->flags |= PCRE_EXTRA_MARK;
656 #endif
657 
658 	/* Calculate the size of the offsets array, and allocate memory for it. */
659 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
660 	if (rc < 0) {
661 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
662 		RETURN_FALSE;
663 	}
664 	num_subpats++;
665 	size_offsets = num_subpats * 3;
666 
667 	/*
668 	 * Build a mapping from subpattern numbers to their names. We will always
669 	 * allocate the table, even though there may be no named subpatterns. This
670 	 * avoids somewhat more complicated logic in the inner loops.
671 	 */
672 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
673 	if (!subpat_names) {
674 		RETURN_FALSE;
675 	}
676 
677 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
678 	memset(offsets, 0, size_offsets*sizeof(int));
679 	/* Allocate match sets array and initialize the values. */
680 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
681 		match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
682 		for (i=0; i<num_subpats; i++) {
683 			ALLOC_ZVAL(match_sets[i]);
684 			array_init(match_sets[i]);
685 			INIT_PZVAL(match_sets[i]);
686 		}
687 	}
688 
689 	matched = 0;
690 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
691 
692 	do {
693 		/* Execute the regular expression. */
694 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
695 						  exoptions|g_notempty, offsets, size_offsets);
696 
697 		/* the string was already proved to be valid UTF-8 */
698 		exoptions |= PCRE_NO_UTF8_CHECK;
699 
700 		/* Check for too many substrings condition. */
701 		if (count == 0) {
702 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
703 			count = size_offsets/3;
704 		}
705 
706 		/* If something has matched */
707 		if (count > 0) {
708 			matched++;
709 
710 			/* If subpatterns array has been passed, fill it in with values. */
711 			if (subpats != NULL) {
712 				/* Try to get the list of substrings and display a warning if failed. */
713 				if ((offsets[1] - offsets[0] < 0) || pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
714 					efree(subpat_names);
715 					efree(offsets);
716 					if (match_sets) efree(match_sets);
717 					php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
718 					RETURN_FALSE;
719 				}
720 
721 				if (global) {	/* global pattern matching */
722 					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
723 						/* For each subpattern, insert it into the appropriate array. */
724 						for (i = 0; i < count; i++) {
725 							if (offset_capture) {
726 								add_offset_pair(match_sets[i], (char *)stringlist[i],
727 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
728 							} else {
729 								add_next_index_stringl(match_sets[i], (char *)stringlist[i],
730 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
731 							}
732 						}
733 						/* Add MARK, if available */
734 						if (mark) {
735 							if (!marks) {
736 								MAKE_STD_ZVAL(marks);
737 								array_init(marks);
738 							}
739 							add_index_string(marks, matched - 1, (char *) mark, 1);
740 						}
741 						/*
742 						 * If the number of captured subpatterns on this run is
743 						 * less than the total possible number, pad the result
744 						 * arrays with empty strings.
745 						 */
746 						if (count < num_subpats) {
747 							for (; i < num_subpats; i++) {
748 								add_next_index_string(match_sets[i], "", 1);
749 							}
750 						}
751 					} else {
752 						/* Allocate the result set array */
753 						ALLOC_ZVAL(result_set);
754 						array_init(result_set);
755 						INIT_PZVAL(result_set);
756 
757 						/* Add all the subpatterns to it */
758 						for (i = 0; i < count; i++) {
759 							if (offset_capture) {
760 								add_offset_pair(result_set, (char *)stringlist[i],
761 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
762 							} else {
763 								if (subpat_names[i]) {
764 									add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
765 														   offsets[(i<<1)+1] - offsets[i<<1], 1);
766 								}
767 								add_next_index_stringl(result_set, (char *)stringlist[i],
768 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
769 							}
770 						}
771 						/* Add MARK, if available */
772 						if (mark) {
773 							add_assoc_string(result_set, "MARK", (char *) mark, 1);
774 						}
775 						/* And add it to the output array */
776 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
777 					}
778 				} else {			/* single pattern matching */
779 					/* For each subpattern, insert it into the subpatterns array. */
780 					for (i = 0; i < count; i++) {
781 						if (offset_capture) {
782 							add_offset_pair(subpats, (char *)stringlist[i],
783 											offsets[(i<<1)+1] - offsets[i<<1],
784 											offsets[i<<1], subpat_names[i]);
785 						} else {
786 							if (subpat_names[i]) {
787 								add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
788 												  offsets[(i<<1)+1] - offsets[i<<1], 1);
789 							}
790 							add_next_index_stringl(subpats, (char *)stringlist[i],
791 												   offsets[(i<<1)+1] - offsets[i<<1], 1);
792 						}
793 					}
794 					/* Add MARK, if available */
795 					if (mark) {
796 						add_assoc_string(subpats, "MARK", (char *) mark, 1);
797 					}
798 				}
799 
800 				pcre_free((void *) stringlist);
801 			}
802 		} else if (count == PCRE_ERROR_NOMATCH) {
803 			/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
804 			   this is not necessarily the end. We need to advance
805 			   the start offset, and continue. Fudge the offset values
806 			   to achieve this, unless we're already at the end of the string. */
807 			if (g_notempty != 0 && start_offset < subject_len) {
808 				int unit_len = calculate_unit_length(pce, subject + start_offset);
809 
810 				offsets[0] = start_offset;
811 				offsets[1] = start_offset + unit_len;
812 			} else
813 				break;
814 		} else {
815 			pcre_handle_exec_error(count TSRMLS_CC);
816 			break;
817 		}
818 
819 		/* If we have matched an empty string, mimic what Perl's /g options does.
820 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
821 		   the match again at the same point. If this fails (picked up above) we
822 		   advance to the next character. */
823 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
824 
825 		/* Advance to the position right after the last full match */
826 		start_offset = offsets[1];
827 	} while (global);
828 
829 	/* Add the match sets to the output array and clean up */
830 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
831 		for (i = 0; i < num_subpats; i++) {
832 			if (subpat_names[i]) {
833 				zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
834 								 strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
835 				Z_ADDREF_P(match_sets[i]);
836 			}
837 			zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
838 		}
839 		efree(match_sets);
840 
841 		if (marks) {
842 			add_assoc_zval(subpats, "MARK", marks);
843 		}
844 	}
845 
846 	efree(offsets);
847 	efree(subpat_names);
848 
849 	/* Did we encounter an error? */
850 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
851 		RETVAL_LONG(matched);
852 	} else {
853 		RETVAL_FALSE;
854 	}
855 }
856 /* }}} */
857 
/* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style regular expression match */
static PHP_FUNCTION(preg_match)
{
	/* global = 0: the match loop runs a single round */
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
/* }}} */
865 
/* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style global regular expression match */
static PHP_FUNCTION(preg_match_all)
{
	/* global = 1: keep matching until no further match is found */
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
/* }}} */
873 
/* {{{ preg_get_backref
 * Parses a back-reference at *str. *str must point at the introducing
 * character: a one-character introducer followed by one or two decimal
 * digits, or "${" followed by one or two digits and a closing "}".
 *
 * On success returns 1, stores the reference number (0-99) in *backref and
 * advances *str past the reference. On failure returns 0 and leaves *str
 * untouched; *backref may already have been overwritten if at least one digit
 * was read before the failure was detected.
 */
static int preg_get_backref(char **str, int *backref)
{
	char *walk = *str;
	int in_brace = 0;

	/* Nothing can follow the introducer */
	if (walk[1] == 0) {
		return 0;
	}

	if (walk[0] == '$' && walk[1] == '{') {
		in_brace = 1;
		walk++;
	}
	walk++;

	/* First digit is mandatory */
	if (*walk < '0' || *walk > '9') {
		return 0;
	}
	*backref = *walk - '0';
	walk++;

	/* Second digit is optional */
	if (*walk >= '0' && *walk <= '9') {
		*backref = *backref * 10 + *walk - '0';
		walk++;
	}

	/* The braced form must be closed right after the digits */
	if (in_brace) {
		if (*walk != '}') {
			return 0;
		}
		walk++;
	}

	*str = walk;
	return 1;
}
/* }}} */
912 
913 /* {{{ preg_do_repl_func
914  */
preg_do_repl_func(zval * function,char * subject,int * offsets,char ** subpat_names,int count,unsigned char * mark,char ** result TSRMLS_DC)915 static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, unsigned char *mark, char **result TSRMLS_DC)
916 {
917 	zval		*retval_ptr;		/* Function return value */
918 	zval	   **args[1];			/* Argument to pass to function */
919 	zval		*subpats;			/* Captured subpatterns */
920 	int			 result_len;		/* Return value length */
921 	int			 i;
922 
923 	MAKE_STD_ZVAL(subpats);
924 	array_init(subpats);
925 	for (i = 0; i < count; i++) {
926 		if (subpat_names[i]) {
927 			add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
928 		}
929 		add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
930 	}
931 	if (mark) {
932 		add_assoc_string(subpats, "MARK", (char *) mark, 1);
933 	}
934 	args[0] = &subpats;
935 
936 	if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
937 		convert_to_string_ex(&retval_ptr);
938 		*result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
939 		result_len = Z_STRLEN_P(retval_ptr);
940 		zval_ptr_dtor(&retval_ptr);
941 	} else {
942 		if (!EG(exception)) {
943 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
944 		}
945 		result_len = offsets[1] - offsets[0];
946 		*result = estrndup(&subject[offsets[0]], result_len);
947 	}
948 
949 	zval_ptr_dtor(&subpats);
950 
951 	return result_len;
952 }
953 /* }}} */
954 
955 /* {{{ preg_do_eval
956  */
preg_do_eval(char * eval_str,int eval_str_len,char * subject,int * offsets,int count,char ** result TSRMLS_DC)957 static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
958 						int *offsets, int count, char **result TSRMLS_DC)
959 {
960 	zval		 retval;			/* Return value from evaluation */
961 	char		*eval_str_end,		/* End of eval string */
962 				*match,				/* Current match for a backref */
963 				*esc_match,			/* Quote-escaped match */
964 				*walk,				/* Used to walk the code string */
965 				*segment,			/* Start of segment to append while walking */
966 				 walk_last;			/* Last walked character */
967 	int			 match_len;			/* Length of the match */
968 	int			 esc_match_len;		/* Length of the quote-escaped match */
969 	int			 result_len;		/* Length of the result of the evaluation */
970 	int			 backref;			/* Current backref */
971 	char        *compiled_string_description;
972 	smart_str    code = {0};
973 
974 	eval_str_end = eval_str + eval_str_len;
975 	walk = segment = eval_str;
976 	walk_last = 0;
977 
978 	while (walk < eval_str_end) {
979 		/* If found a backreference.. */
980 		if ('\\' == *walk || '$' == *walk) {
981 			smart_str_appendl(&code, segment, walk - segment);
982 			if (walk_last == '\\') {
983 				code.c[code.len-1] = *walk++;
984 				segment = walk;
985 				walk_last = 0;
986 				continue;
987 			}
988 			segment = walk;
989 			if (preg_get_backref(&walk, &backref)) {
990 				if (backref < count) {
991 					/* Find the corresponding string match and substitute it
992 					   in instead of the backref */
993 					match = subject + offsets[backref<<1];
994 					match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
995 					if (match_len) {
996 						esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
997 					} else {
998 						esc_match = match;
999 						esc_match_len = 0;
1000 					}
1001 				} else {
1002 					esc_match = "";
1003 					esc_match_len = 0;
1004 				}
1005 				smart_str_appendl(&code, esc_match, esc_match_len);
1006 
1007 				segment = walk;
1008 
1009 				/* Clean up and reassign */
1010 				if (esc_match_len)
1011 					efree(esc_match);
1012 				continue;
1013 			}
1014 		}
1015 		walk++;
1016 		walk_last = walk[-1];
1017 	}
1018 	smart_str_appendl(&code, segment, walk - segment);
1019 	smart_str_0(&code);
1020 
1021 	compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
1022 	/* Run the code */
1023 	if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
1024 		efree(compiled_string_description);
1025 		php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
1026 		/* zend_error() does not return in this case */
1027 	}
1028 	efree(compiled_string_description);
1029 	convert_to_string(&retval);
1030 
1031 	/* Save the return value and its length */
1032 	*result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
1033 	result_len = Z_STRLEN(retval);
1034 
1035 	/* Clean up */
1036 	zval_dtor(&retval);
1037 	smart_str_free(&code);
1038 
1039 	return result_len;
1040 }
1041 /* }}} */
1042 
1043 /* {{{ php_pcre_replace
1044  */
php_pcre_replace(char * regex,int regex_len,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1045 PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
1046 							  char *subject, int subject_len,
1047 							  zval *replace_val, int is_callable_replace,
1048 							  int *result_len, int limit, int *replace_count TSRMLS_DC)
1049 {
1050 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1051 	char		 		*result;			/* Function result */
1052 
1053 	/* Compile regex or get it from cache. */
1054 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1055 		return NULL;
1056 	}
1057 	pce->refcount++;
1058 	result = php_pcre_replace_impl(pce, subject, subject_len, replace_val,
1059 		is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
1060 	pce->refcount--;
1061 
1062 	return result;
1063 }
1064 /* }}} */
1065 
1066 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1067 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val,
1068 	int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
1069 {
1070 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
1071 	pcre_extra		 extra_data;		/* Used locally for exec options */
1072 	int				 exoptions = 0;		/* Execution options */
1073 	int				 count = 0;			/* Count of matched subpatterns */
1074 	int				*offsets;			/* Array of subpattern offsets */
1075 	char 			**subpat_names;		/* Array for named subpatterns */
1076 	int				 num_subpats;		/* Number of captured subpatterns */
1077 	int				 size_offsets;		/* Size of the offsets array */
1078 	size_t			 new_len;			/* Length of needed storage */
1079 	size_t			 alloc_len;			/* Actual allocated length */
1080 	int				 eval_result_len=0;	/* Length of the eval'ed or
1081 										   function-returned string */
1082 	int				 match_len;			/* Length of the current match */
1083 	int				 backref;			/* Backreference number */
1084 	int				 eval;				/* If the replacement string should be eval'ed */
1085 	int				 start_offset;		/* Where the new search starts */
1086 	int				 g_notempty=0;		/* If the match should not be empty */
1087 	int				 replace_len=0;		/* Length of replacement string */
1088 	char			*result,			/* Result of replacement */
1089 					*replace=NULL,		/* Replacement string */
1090 					*new_buf,			/* Temporary buffer for re-allocation */
1091 					*walkbuf,			/* Location of current replacement in the result */
1092 					*walk,				/* Used to walk the replacement string */
1093 					*match,				/* The current match */
1094 					*piece,				/* The current piece of subject */
1095 					*replace_end=NULL,	/* End of replacement string */
1096 					*eval_result,		/* Result of eval or custom function */
1097 					 walk_last;			/* Last walked character */
1098 	int				 rc;
1099 	unsigned char   *mark = NULL;       /* Target for MARK name */
1100 
1101 	if (extra == NULL) {
1102 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1103 		extra = &extra_data;
1104 	}
1105 	extra->match_limit = PCRE_G(backtrack_limit);
1106 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1107 #ifdef PCRE_EXTRA_MARK
1108 	extra->mark = &mark;
1109 	extra->flags |= PCRE_EXTRA_MARK;
1110 #endif
1111 
1112 	eval = pce->preg_options & PREG_REPLACE_EVAL;
1113 	if (is_callable_replace) {
1114 		if (eval) {
1115 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
1116 			return NULL;
1117 		}
1118 	} else {
1119 		replace = Z_STRVAL_P(replace_val);
1120 		replace_len = Z_STRLEN_P(replace_val);
1121 		replace_end = replace + replace_len;
1122 	}
1123 
1124 	if (eval) {
1125 		php_error_docref(NULL TSRMLS_CC, E_DEPRECATED, "The /e modifier is deprecated, use preg_replace_callback instead");
1126 	}
1127 
1128 	/* Calculate the size of the offsets array, and allocate memory for it. */
1129 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
1130 	if (rc < 0) {
1131 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1132 		return NULL;
1133 	}
1134 	num_subpats++;
1135 	size_offsets = num_subpats * 3;
1136 
1137 	/*
1138 	 * Build a mapping from subpattern numbers to their names. We will always
1139 	 * allocate the table, even though there may be no named subpatterns. This
1140 	 * avoids somewhat more complicated logic in the inner loops.
1141 	 */
1142 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
1143 	if (!subpat_names) {
1144 		return NULL;
1145 	}
1146 
1147 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1148 
1149 	result = safe_emalloc(subject_len, 2*sizeof(char), 1);
1150 	alloc_len = 2 * (size_t)subject_len + 1;
1151 
1152 	/* Initialize */
1153 	match = NULL;
1154 	*result_len = 0;
1155 	start_offset = 0;
1156 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1157 
1158 	while (1) {
1159 		/* Execute the regular expression. */
1160 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1161 						  exoptions|g_notempty, offsets, size_offsets);
1162 
1163 		/* the string was already proved to be valid UTF-8 */
1164 		exoptions |= PCRE_NO_UTF8_CHECK;
1165 
1166 		/* Check for too many substrings condition. */
1167 		if (count == 0) {
1168 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1169 			count = size_offsets/3;
1170 		}
1171 
1172 		piece = subject + start_offset;
1173 
1174 		if (count > 0 && (offsets[1] - offsets[0] >= 0) && (limit == -1 || limit > 0)) {
1175 			if (replace_count) {
1176 				++*replace_count;
1177 			}
1178 			/* Set the match location in subject */
1179 			match = subject + offsets[0];
1180 
1181 			new_len = *result_len + offsets[0] - start_offset; /* part before the match */
1182 
1183 			/* If evaluating, do it and add the return string's length */
1184 			if (eval) {
1185 				eval_result_len = preg_do_eval(replace, replace_len, subject,
1186 											   offsets, count, &eval_result TSRMLS_CC);
1187 				new_len += eval_result_len;
1188 			} else if (is_callable_replace) {
1189 				/* Use custom function to get replacement string and its length. */
1190 				eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, mark, &eval_result TSRMLS_CC);
1191 				new_len += eval_result_len;
1192 			} else { /* do regular substitution */
1193 				walk = replace;
1194 				walk_last = 0;
1195 				while (walk < replace_end) {
1196 					if ('\\' == *walk || '$' == *walk) {
1197 						if (walk_last == '\\') {
1198 							walk++;
1199 							walk_last = 0;
1200 							continue;
1201 						}
1202 						if (preg_get_backref(&walk, &backref)) {
1203 							if (backref < count)
1204 								new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1205 							continue;
1206 						}
1207 					}
1208 					new_len++;
1209 					walk++;
1210 					walk_last = walk[-1];
1211 				}
1212 			}
1213 
1214 			if (new_len + 1 > alloc_len) {
1215 				new_buf = safe_emalloc(2, new_len + 1, alloc_len);
1216 				alloc_len = 1 + alloc_len + 2 * (size_t)new_len;
1217 				memcpy(new_buf, result, *result_len);
1218 				efree(result);
1219 				result = new_buf;
1220 			}
1221 			/* copy the part of the string before the match */
1222 			memcpy(&result[*result_len], piece, match-piece);
1223 			*result_len += match-piece;
1224 
1225 			/* copy replacement and backrefs */
1226 			walkbuf = result + *result_len;
1227 
1228 			/* If evaluating or using custom function, copy result to the buffer
1229 			 * and clean up. */
1230 			if (eval || is_callable_replace) {
1231 				memcpy(walkbuf, eval_result, eval_result_len);
1232 				*result_len += eval_result_len;
1233 				STR_FREE(eval_result);
1234 			} else { /* do regular backreference copying */
1235 				walk = replace;
1236 				walk_last = 0;
1237 				while (walk < replace_end) {
1238 					if ('\\' == *walk || '$' == *walk) {
1239 						if (walk_last == '\\') {
1240 							*(walkbuf-1) = *walk++;
1241 							walk_last = 0;
1242 							continue;
1243 						}
1244 						if (preg_get_backref(&walk, &backref)) {
1245 							if (backref < count) {
1246 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1247 								memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1248 								walkbuf += match_len;
1249 							}
1250 							continue;
1251 						}
1252 					}
1253 					*walkbuf++ = *walk++;
1254 					walk_last = walk[-1];
1255 				}
1256 				*walkbuf = '\0';
1257 				/* increment the result length by how much we've added to the string */
1258 				*result_len += walkbuf - (result + *result_len);
1259 			}
1260 
1261 			if (limit != -1)
1262 				limit--;
1263 
1264 		} else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1265 			/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
1266 			   this is not necessarily the end. We need to advance
1267 			   the start offset, and continue. Fudge the offset values
1268 			   to achieve this, unless we're already at the end of the string. */
1269 			if (g_notempty != 0 && start_offset < subject_len) {
1270 				int unit_len = calculate_unit_length(pce, piece);
1271 
1272 				offsets[0] = start_offset;
1273 				offsets[1] = start_offset + unit_len;
1274 				memcpy(&result[*result_len], piece, unit_len);
1275 				*result_len += unit_len;
1276 			} else {
1277 				new_len = *result_len + subject_len - start_offset;
1278 				if (new_len + 1 > alloc_len) {
1279 					new_buf = safe_emalloc(new_len, sizeof(char), 1);
1280 					alloc_len = (size_t)new_len + 1; /* now we know exactly how long it is */
1281 					memcpy(new_buf, result, *result_len);
1282 					efree(result);
1283 					result = new_buf;
1284 				}
1285 				/* stick that last bit of string on our output */
1286 				memcpy(&result[*result_len], piece, subject_len - start_offset);
1287 				*result_len += subject_len - start_offset;
1288 				result[*result_len] = '\0';
1289 				break;
1290 			}
1291 		} else {
1292 			pcre_handle_exec_error(count TSRMLS_CC);
1293 			efree(result);
1294 			result = NULL;
1295 			break;
1296 		}
1297 
1298 		/* If we have matched an empty string, mimic what Perl's /g options does.
1299 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
1300 		   the match again at the same point. If this fails (picked up above) we
1301 		   advance to the next character. */
1302 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
1303 
1304 		/* Advance to the next piece. */
1305 		start_offset = offsets[1];
1306 	}
1307 
1308 	efree(offsets);
1309 	efree(subpat_names);
1310 
1311 	if(result && (size_t)(*result_len) > INT_MAX) {
1312 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Result is too big, max is %d", INT_MAX);
1313 		efree(result);
1314 		result = NULL;
1315 	}
1316 
1317 	return result;
1318 }
1319 /* }}} */
1320 
1321 /* {{{ php_replace_in_subject
1322  */
php_replace_in_subject(zval * regex,zval * replace,zval ** subject,int * result_len,int limit,int is_callable_replace,int * replace_count TSRMLS_DC)1323 static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
1324 {
1325 	zval		**regex_entry,
1326 				**replace_entry = NULL,
1327 				 *replace_value,
1328 				  empty_replace;
1329 	char		*subject_value,
1330 				*result;
1331 	int			 subject_len;
1332 
1333 	/* Make sure we're dealing with strings. */
1334 	convert_to_string_ex(subject);
1335 	/* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1336 	ZVAL_STRINGL(&empty_replace, "", 0, 0);
1337 
1338 	/* If regex is an array */
1339 	if (Z_TYPE_P(regex) == IS_ARRAY) {
1340 		/* Duplicate subject string for repeated replacement */
1341 		subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
1342 		subject_len = Z_STRLEN_PP(subject);
1343 		*result_len = subject_len;
1344 
1345 		zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
1346 
1347 		replace_value = replace;
1348 		if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
1349 			zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
1350 
1351 		/* For each entry in the regex array, get the entry */
1352 		while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)&regex_entry) == SUCCESS) {
1353 			/* Make sure we're dealing with strings. */
1354 			convert_to_string_ex(regex_entry);
1355 
1356 			/* If replace is an array and not a callable construct */
1357 			if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1358 				/* Get current entry */
1359 				if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
1360 					if (!is_callable_replace) {
1361 						convert_to_string_ex(replace_entry);
1362 					}
1363 					replace_value = *replace_entry;
1364 					zend_hash_move_forward(Z_ARRVAL_P(replace));
1365 				} else {
1366 					/* We've run out of replacement strings, so use an empty one */
1367 					replace_value = &empty_replace;
1368 				}
1369 			}
1370 
1371 			/* Do the actual replacement and put the result back into subject_value
1372 			   for further replacements. */
1373 			if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
1374 										   Z_STRLEN_PP(regex_entry),
1375 										   subject_value,
1376 										   subject_len,
1377 										   replace_value,
1378 										   is_callable_replace,
1379 										   result_len,
1380 										   limit,
1381 										   replace_count TSRMLS_CC)) != NULL) {
1382 				efree(subject_value);
1383 				subject_value = result;
1384 				subject_len = *result_len;
1385 			} else {
1386 				efree(subject_value);
1387 				return NULL;
1388 			}
1389 
1390 			zend_hash_move_forward(Z_ARRVAL_P(regex));
1391 		}
1392 
1393 		return subject_value;
1394 	} else {
1395 		result = php_pcre_replace(Z_STRVAL_P(regex),
1396 								  Z_STRLEN_P(regex),
1397 								  Z_STRVAL_PP(subject),
1398 								  Z_STRLEN_PP(subject),
1399 								  replace,
1400 								  is_callable_replace,
1401 								  result_len,
1402 								  limit,
1403 								  replace_count TSRMLS_CC);
1404 		return result;
1405 	}
1406 }
1407 /* }}} */
1408 
1409 /* {{{ preg_replace_impl
1410  */
preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS,int is_callable_replace,int is_filter)1411 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
1412 {
1413 	zval		   **regex,
1414 				   **replace,
1415 				   **subject,
1416 				   **subject_entry,
1417 				   **zcount = NULL;
1418 	char			*result;
1419 	int				 result_len;
1420 	int				 limit_val = -1;
1421 	long			limit = -1;
1422 	char			*string_key;
1423 	uint			 string_key_len;
1424 	ulong			 num_key;
1425 	char			*callback_name;
1426 	int				 replace_count=0, old_replace_count;
1427 
1428 	/* Get function parameters and do error-checking. */
1429 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", &regex, &replace, &subject, &limit, &zcount) == FAILURE) {
1430 		return;
1431 	}
1432 
1433 	if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
1434 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1435 		RETURN_FALSE;
1436 	}
1437 
1438 	SEPARATE_ZVAL(replace);
1439 	if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
1440 		convert_to_string_ex(replace);
1441 	}
1442 	if (is_callable_replace) {
1443 		if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
1444 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
1445 			efree(callback_name);
1446 			MAKE_COPY_ZVAL(subject, return_value);
1447 			return;
1448 		}
1449 		efree(callback_name);
1450 	}
1451 
1452 	SEPARATE_ZVAL(regex);
1453 	SEPARATE_ZVAL(subject);
1454 
1455 	if (ZEND_NUM_ARGS() > 3) {
1456 		limit_val = limit;
1457 	}
1458 
1459 	if (Z_TYPE_PP(regex) != IS_ARRAY)
1460 		convert_to_string_ex(regex);
1461 
1462 	/* if subject is an array */
1463 	if (Z_TYPE_PP(subject) == IS_ARRAY) {
1464 		array_init(return_value);
1465 		zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
1466 
1467 		/* For each subject entry, convert it to string, then perform replacement
1468 		   and add the result to the return_value array. */
1469 		while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
1470 			SEPARATE_ZVAL(subject_entry);
1471 			old_replace_count = replace_count;
1472 			if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1473 				if (!is_filter || replace_count > old_replace_count) {
1474 					/* Add to return array */
1475 					switch(zend_hash_get_current_key_ex(Z_ARRVAL_PP(subject), &string_key, &string_key_len, &num_key, 0, NULL))
1476 					{
1477 					case HASH_KEY_IS_STRING:
1478 						add_assoc_stringl_ex(return_value, string_key, string_key_len, result, result_len, 0);
1479 						break;
1480 
1481 					case HASH_KEY_IS_LONG:
1482 						add_index_stringl(return_value, num_key, result, result_len, 0);
1483 						break;
1484 					}
1485 				} else {
1486 					efree(result);
1487 				}
1488 			}
1489 
1490 			zend_hash_move_forward(Z_ARRVAL_PP(subject));
1491 		}
1492 	} else {	/* if subject is not an array */
1493 		old_replace_count = replace_count;
1494 		if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1495 			if (!is_filter || replace_count > old_replace_count) {
1496 				RETVAL_STRINGL(result, result_len, 0);
1497 			} else {
1498 				efree(result);
1499 			}
1500 		}
1501 	}
1502 	if (ZEND_NUM_ARGS() > 4) {
1503 		zval_dtor(*zcount);
1504 		ZVAL_LONG(*zcount, replace_count);
1505 	}
1506 
1507 }
1508 /* }}} */
1509 
1510 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1511    Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)1512 static PHP_FUNCTION(preg_replace)
1513 {
1514 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1515 }
1516 /* }}} */
1517 
1518 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1519    Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)1520 static PHP_FUNCTION(preg_replace_callback)
1521 {
1522 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
1523 }
1524 /* }}} */
1525 
1526 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1527    Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)1528 static PHP_FUNCTION(preg_filter)
1529 {
1530 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1531 }
1532 /* }}} */
1533 
1534 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1535    Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)1536 static PHP_FUNCTION(preg_split)
1537 {
1538 	char				*regex;			/* Regular expression */
1539 	char				*subject;		/* String to match against */
1540 	int					 regex_len;
1541 	int					 subject_len;
1542 	long				 limit_val = -1;/* Integer value of limit */
1543 	long				 flags = 0;		/* Match control flags */
1544 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1545 
1546 	/* Get function parameters and do error checking */
1547 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", &regex, &regex_len,
1548 							  &subject, &subject_len, &limit_val, &flags) == FAILURE) {
1549 		RETURN_FALSE;
1550 	}
1551 
1552 	/* Compile regex or get it from cache. */
1553 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1554 		RETURN_FALSE;
1555 	}
1556 
1557 	pce->refcount++;
1558 	php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
1559 	pce->refcount--;
1560 }
1561 /* }}} */
1562 
1563 /* {{{ php_pcre_split
1564  */
php_pcre_split_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,long limit_val,long flags TSRMLS_DC)1565 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
1566 	long limit_val, long flags TSRMLS_DC)
1567 {
1568 	pcre_extra		*extra = NULL;		/* Holds results of studying */
1569 	pcre			*re_bump = NULL;	/* Regex instance for empty matches */
1570 	pcre_extra		*extra_bump = NULL;	/* Almost dummy */
1571 	pcre_extra		 extra_data;		/* Used locally for exec options */
1572 	int				*offsets;			/* Array of subpattern offsets */
1573 	int				 size_offsets;		/* Size of the offsets array */
1574 	int				 exoptions = 0;		/* Execution options */
1575 	int				 count = 0;			/* Count of matched subpatterns */
1576 	int				 start_offset;		/* Where the new search starts */
1577 	int				 next_offset;		/* End of the last delimiter match + 1 */
1578 	int				 g_notempty = 0;	/* If the match should not be empty */
1579 	char			*last_match;		/* Location of last match */
1580 	int				 rc;
1581 	int				 no_empty;			/* If NO_EMPTY flag is set */
1582 	int				 delim_capture; 	/* If delimiters should be captured */
1583 	int				 offset_capture;	/* If offsets should be captured */
1584 
1585 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
1586 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1587 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1588 
1589 	if (limit_val == 0) {
1590 		limit_val = -1;
1591 	}
1592 
1593 	if (extra == NULL) {
1594 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1595 		extra = &extra_data;
1596 	}
1597 	extra->match_limit = PCRE_G(backtrack_limit);
1598 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1599 #ifdef PCRE_EXTRA_MARK
1600 	extra->flags &= ~PCRE_EXTRA_MARK;
1601 #endif
1602 
1603 	/* Initialize return value */
1604 	array_init(return_value);
1605 
1606 	/* Calculate the size of the offsets array, and allocate memory for it. */
1607 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1608 	if (rc < 0) {
1609 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1610 		RETURN_FALSE;
1611 	}
1612 	size_offsets = (size_offsets + 1) * 3;
1613 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1614 
1615 	/* Start at the beginning of the string */
1616 	start_offset = 0;
1617 	next_offset = 0;
1618 	last_match = subject;
1619 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1620 
1621 	/* Get next piece if no limit or limit not yet reached and something matched*/
1622 	while ((limit_val == -1 || limit_val > 1)) {
1623 		count = pcre_exec(pce->re, extra, subject,
1624 						  subject_len, start_offset,
1625 						  exoptions|g_notempty, offsets, size_offsets);
1626 
1627 		/* the string was already proved to be valid UTF-8 */
1628 		exoptions |= PCRE_NO_UTF8_CHECK;
1629 
1630 		/* Check for too many substrings condition. */
1631 		if (count == 0) {
1632 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1633 			count = size_offsets/3;
1634 		}
1635 
1636 		/* If something matched */
1637 		if (count > 0 && (offsets[1] - offsets[0] >= 0)) {
1638 			if (!no_empty || &subject[offsets[0]] != last_match) {
1639 
1640 				if (offset_capture) {
1641 					/* Add (match, offset) pair to the return value */
1642 					add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
1643 				} else {
1644 					/* Add the piece to the return value */
1645 					add_next_index_stringl(return_value, last_match,
1646 								   	   &subject[offsets[0]]-last_match, 1);
1647 				}
1648 
1649 				/* One less left to do */
1650 				if (limit_val != -1)
1651 					limit_val--;
1652 			}
1653 
1654 			last_match = &subject[offsets[1]];
1655 			next_offset = offsets[1];
1656 
1657 			if (delim_capture) {
1658 				int i, match_len;
1659 				for (i = 1; i < count; i++) {
1660 					match_len = offsets[(i<<1)+1] - offsets[i<<1];
1661 					/* If we have matched a delimiter */
1662 					if (!no_empty || match_len > 0) {
1663 						if (offset_capture) {
1664 							add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
1665 						} else {
1666 							add_next_index_stringl(return_value,
1667 												   &subject[offsets[i<<1]],
1668 												   match_len, 1);
1669 						}
1670 					}
1671 				}
1672 			}
1673 		} else if (count == PCRE_ERROR_NOMATCH) {
1674 			/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
1675 			   this is not necessarily the end. We need to advance
1676 			   the start offset, and continue. Fudge the offset values
1677 			   to achieve this, unless we're already at the end of the string. */
1678 			if (g_notempty != 0 && start_offset < subject_len) {
1679 				if (pce->compile_options & PCRE_UTF8) {
1680 					if (re_bump == NULL) {
1681 						int dummy;
1682 
1683 						if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
1684 							RETURN_FALSE;
1685 						}
1686 					}
1687 					count = pcre_exec(re_bump, extra_bump, subject,
1688 							  subject_len, start_offset,
1689 							  exoptions, offsets, size_offsets);
1690 					if (count < 1) {
1691 						php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
1692 						RETURN_FALSE;
1693 					}
1694 				} else {
1695 					offsets[0] = start_offset;
1696 					offsets[1] = start_offset + 1;
1697 				}
1698 			} else
1699 				break;
1700 		} else {
1701 			pcre_handle_exec_error(count TSRMLS_CC);
1702 			break;
1703 		}
1704 
1705 		/* If we have matched an empty string, mimic what Perl's /g options does.
1706 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
1707 		   the match again at the same point. If this fails (picked up above) we
1708 		   advance to the next character. */
1709 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
1710 
1711 		/* Advance to the position right after the last full match */
1712 		start_offset = offsets[1];
1713 	}
1714 
1715 
1716 	start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
1717 
1718 	if (!no_empty || start_offset < subject_len)
1719 	{
1720 		if (offset_capture) {
1721 			/* Add the last (match, offset) pair to the return value */
1722 			add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
1723 		} else {
1724 			/* Add the last piece to the return value */
1725 			add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
1726 		}
1727 	}
1728 
1729 
1730 	/* Clean up */
1731 	efree(offsets);
1732 }
1733 /* }}} */
1734 
1735 /* {{{ proto string preg_quote(string str [, string delim_char])
1736    Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)1737 static PHP_FUNCTION(preg_quote)
1738 {
1739 	int		 in_str_len;
1740 	char	*in_str;		/* Input string argument */
1741 	char	*in_str_end;    /* End of the input string */
1742 	int		 delim_len = 0;
1743 	char	*delim = NULL;	/* Additional delimiter argument */
1744 	char	*out_str,		/* Output string with quoted characters */
1745 		 	*p,				/* Iterator for input string */
1746 			*q,				/* Iterator for output string */
1747 			 delim_char=0,	/* Delimiter character to be quoted */
1748 			 c;				/* Current character */
1749 	zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1750 
1751 	/* Get the arguments and check for errors */
1752 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
1753 							  &delim, &delim_len) == FAILURE) {
1754 		return;
1755 	}
1756 
1757 	in_str_end = in_str + in_str_len;
1758 
1759 	/* Nothing to do if we got an empty string */
1760 	if (in_str == in_str_end) {
1761 		RETURN_EMPTY_STRING();
1762 	}
1763 
1764 	if (delim && *delim) {
1765 		delim_char = delim[0];
1766 		quote_delim = 1;
1767 	}
1768 
1769 	/* Allocate enough memory so that even if each character
1770 	   is quoted, we won't run out of room */
1771 	out_str = safe_emalloc_string(4, in_str_len, 1);
1772 
1773 	/* Go through the string and quote necessary characters */
1774 	for(p = in_str, q = out_str; p != in_str_end; p++) {
1775 		c = *p;
1776 		switch(c) {
1777 			case '.':
1778 			case '\\':
1779 			case '+':
1780 			case '*':
1781 			case '?':
1782 			case '[':
1783 			case '^':
1784 			case ']':
1785 			case '$':
1786 			case '(':
1787 			case ')':
1788 			case '{':
1789 			case '}':
1790 			case '=':
1791 			case '!':
1792 			case '>':
1793 			case '<':
1794 			case '|':
1795 			case ':':
1796 			case '-':
1797 				*q++ = '\\';
1798 				*q++ = c;
1799 				break;
1800 
1801 			case '\0':
1802 				*q++ = '\\';
1803 				*q++ = '0';
1804 				*q++ = '0';
1805 				*q++ = '0';
1806 				break;
1807 
1808 			default:
1809 				if (quote_delim && c == delim_char)
1810 					*q++ = '\\';
1811 				*q++ = c;
1812 				break;
1813 		}
1814 	}
1815 	*q = '\0';
1816 
1817 	/* Reallocate string and return it */
1818 	RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
1819 }
1820 /* }}} */
1821 
1822 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1823    Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)1824 static PHP_FUNCTION(preg_grep)
1825 {
1826 	char				*regex;			/* Regular expression */
1827 	int				 	 regex_len;
1828 	zval				*input;			/* Input array */
1829 	long				 flags = 0;		/* Match control flags */
1830 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1831 
1832 	/* Get arguments and do error checking */
1833 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", &regex, &regex_len,
1834 							  &input, &flags) == FAILURE) {
1835 		return;
1836 	}
1837 
1838 	/* Compile regex or get it from cache. */
1839 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1840 		RETURN_FALSE;
1841 	}
1842 
1843 	pce->refcount++;
1844 	php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
1845 	pce->refcount--;
1846 }
1847 /* }}} */
1848 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,long flags TSRMLS_DC)1849 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
1850 {
1851 	zval		   **entry;				/* An entry in the input array */
1852 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
1853 	pcre_extra		 extra_data;		/* Used locally for exec options */
1854 	int				*offsets;			/* Array of subpattern offsets */
1855 	int				 size_offsets;		/* Size of the offsets array */
1856 	int				 count = 0;			/* Count of matched subpatterns */
1857 	char			*string_key;
1858 	uint			 string_key_len;
1859 	ulong			 num_key;
1860 	zend_bool		 invert;			/* Whether to return non-matching
1861 										   entries */
1862 	int				 rc;
1863 
1864 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
1865 
1866 	if (extra == NULL) {
1867 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1868 		extra = &extra_data;
1869 	}
1870 	extra->match_limit = PCRE_G(backtrack_limit);
1871 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1872 #ifdef PCRE_EXTRA_MARK
1873 	extra->flags &= ~PCRE_EXTRA_MARK;
1874 #endif
1875 
1876 	/* Calculate the size of the offsets array, and allocate memory for it. */
1877 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1878 	if (rc < 0) {
1879 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1880 		RETURN_FALSE;
1881 	}
1882 	size_offsets = (size_offsets + 1) * 3;
1883 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1884 
1885 	/* Initialize return array */
1886 	array_init(return_value);
1887 
1888 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1889 
1890 	/* Go through the input array */
1891 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1892 	while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
1893 		zval subject = **entry;
1894 
1895 		if (Z_TYPE_PP(entry) != IS_STRING) {
1896 			zval_copy_ctor(&subject);
1897 			convert_to_string(&subject);
1898 		}
1899 
1900 		/* Perform the match */
1901 		count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
1902 						  Z_STRLEN(subject), 0,
1903 						  0, offsets, size_offsets);
1904 
1905 		/* Check for too many substrings condition. */
1906 		if (count == 0) {
1907 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
1908 			count = size_offsets/3;
1909 		} else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1910 			pcre_handle_exec_error(count TSRMLS_CC);
1911 			break;
1912 		}
1913 
1914 		/* If the entry fits our requirements */
1915 		if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
1916 
1917 			Z_ADDREF_PP(entry);
1918 
1919 			/* Add to return array */
1920 			switch (zend_hash_get_current_key_ex(Z_ARRVAL_P(input), &string_key, &string_key_len, &num_key, 0, NULL))
1921 			{
1922 				case HASH_KEY_IS_STRING:
1923 					zend_hash_update(Z_ARRVAL_P(return_value), string_key,
1924 									 string_key_len, entry, sizeof(zval *), NULL);
1925 					break;
1926 
1927 				case HASH_KEY_IS_LONG:
1928 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
1929 										   sizeof(zval *), NULL);
1930 					break;
1931 			}
1932 		}
1933 
1934 		if (Z_TYPE_PP(entry) != IS_STRING) {
1935 			zval_dtor(&subject);
1936 		}
1937 
1938 		zend_hash_move_forward(Z_ARRVAL_P(input));
1939 	}
1940 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1941 	/* Clean up */
1942 	efree(offsets);
1943 }
1944 /* }}} */
1945 
1946 /* {{{ proto int preg_last_error()
1947    Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)1948 static PHP_FUNCTION(preg_last_error)
1949 {
1950 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
1951 		return;
1952 	}
1953 
1954 	RETURN_LONG(PCRE_G(error_code));
1955 }
1956 /* }}} */
1957 
1958 /* {{{ module definition structures */
1959 
1960 /* {{{ arginfo */
/* Argument descriptors for the functions listed in pcre_functions[].
 * Each ZEND_BEGIN_ARG_INFO_EX ... ZEND_END_ARG_INFO group names one table
 * and lists the parameters of the corresponding preg_* function in order.
 * NOTE(review): the last argument of ZEND_BEGIN_ARG_INFO_EX looks like the
 * number of required parameters (it agrees with the "s|s" and "sa|l" parse
 * specs used by preg_quote and preg_grep in this file), and the first
 * argument of ZEND_ARG_INFO looks like a by-reference flag (set to 1 only
 * for `subpatterns` and `count`) - confirm against zend_API.h. */
1961 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
1962     ZEND_ARG_INFO(0, pattern)
1963     ZEND_ARG_INFO(0, subject)
1964     ZEND_ARG_INFO(1, subpatterns) /* array */
1965     ZEND_ARG_INFO(0, flags)
1966     ZEND_ARG_INFO(0, offset)
1967 ZEND_END_ARG_INFO()
1968 
/* Same parameter list as arginfo_preg_match. */
1969 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2)
1970     ZEND_ARG_INFO(0, pattern)
1971     ZEND_ARG_INFO(0, subject)
1972     ZEND_ARG_INFO(1, subpatterns) /* array */
1973     ZEND_ARG_INFO(0, flags)
1974     ZEND_ARG_INFO(0, offset)
1975 ZEND_END_ARG_INFO()
1976 
/* Also used for preg_filter in pcre_functions[]. */
1977 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
1978     ZEND_ARG_INFO(0, regex)
1979     ZEND_ARG_INFO(0, replace)
1980     ZEND_ARG_INFO(0, subject)
1981     ZEND_ARG_INFO(0, limit)
1982     ZEND_ARG_INFO(1, count)
1983 ZEND_END_ARG_INFO()
1984 
/* Differs from arginfo_preg_replace only in its second parameter
 * (`callback` instead of `replace`). */
1985 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
1986     ZEND_ARG_INFO(0, regex)
1987     ZEND_ARG_INFO(0, callback)
1988     ZEND_ARG_INFO(0, subject)
1989     ZEND_ARG_INFO(0, limit)
1990     ZEND_ARG_INFO(1, count)
1991 ZEND_END_ARG_INFO()
1992 
1993 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
1994     ZEND_ARG_INFO(0, pattern)
1995     ZEND_ARG_INFO(0, subject)
1996     ZEND_ARG_INFO(0, limit)
1997     ZEND_ARG_INFO(0, flags)
1998 ZEND_END_ARG_INFO()
1999 
2000 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
2001     ZEND_ARG_INFO(0, str)
2002     ZEND_ARG_INFO(0, delim_char)
2003 ZEND_END_ARG_INFO()
2004 
2005 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
2006     ZEND_ARG_INFO(0, regex)
2007     ZEND_ARG_INFO(0, input) /* array */
2008     ZEND_ARG_INFO(0, flags)
2009 ZEND_END_ARG_INFO()
2010 
/* preg_last_error() takes no parameters, so its table is empty. */
2011 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
2012 ZEND_END_ARG_INFO()
2013 /* }}} */
2014 
/* Function table of the extension: pairs each preg_* function with its
 * argument descriptor table. It is referenced from pcre_module_entry.
 * preg_filter has no table of its own and reuses arginfo_preg_replace.
 * The list is terminated by PHP_FE_END. */
2015 static const zend_function_entry pcre_functions[] = {
2016 	PHP_FE(preg_match,				arginfo_preg_match)
2017 	PHP_FE(preg_match_all,			arginfo_preg_match_all)
2018 	PHP_FE(preg_replace,			arginfo_preg_replace)
2019 	PHP_FE(preg_replace_callback,	arginfo_preg_replace_callback)
2020 	PHP_FE(preg_filter,				arginfo_preg_replace)
2021 	PHP_FE(preg_split,				arginfo_preg_split)
2022 	PHP_FE(preg_quote,				arginfo_preg_quote)
2023 	PHP_FE(preg_grep,				arginfo_preg_grep)
2024 	PHP_FE(preg_last_error,			arginfo_preg_last_error)
2025 	PHP_FE_END
2026 };
2027 
/* Module descriptor for the "pcre" extension. It wires up the function
 * table (pcre_functions), the module init/shutdown hooks, the info hook,
 * and the module globals with their init/shutdown hooks.
 * NOTE(review): the two NULL slots after PHP_MSHUTDOWN are presumably the
 * per-request startup/shutdown handlers, and the NULL after PHP_GSHUTDOWN a
 * post-deactivate handler, all unused here - confirm against the
 * zend_module_entry definition. */
2028 zend_module_entry pcre_module_entry = {
2029 	STANDARD_MODULE_HEADER,
2030    "pcre",
2031 	pcre_functions,
2032 	PHP_MINIT(pcre),
2033 	PHP_MSHUTDOWN(pcre),
2034 	NULL,
2035 	NULL,
2036 	PHP_MINFO(pcre),
2037 	NO_VERSION_YET,
2038 	PHP_MODULE_GLOBALS(pcre),
2039 	PHP_GINIT(pcre),
2040 	PHP_GSHUTDOWN(pcre),
2041 	NULL,
2042 	STANDARD_MODULE_PROPERTIES_EX
2043 };
2044 
/* Emitted only when COMPILE_DL_PCRE is defined.
 * NOTE(review): presumably this is the build as a dynamically loadable
 * module, with ZEND_GET_MODULE providing the loader's lookup entry point -
 * confirm against the build configuration. */
2045 #ifdef COMPILE_DL_PCRE
2046 ZEND_GET_MODULE(pcre)
2047 #endif
2048 
2049 /* }}} */
2050 
2051 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
2052 
2053 /*
2054  * Local variables:
2055  * tab-width: 4
2056  * c-basic-offset: 4
2057  * End:
2058  * vim600: sw=4 ts=4 fdm=marker
2059  * vim<600: sw=4 ts=4
2060  */
2061