xref: /PHP-5.5/ext/pcre/php_pcre.c (revision 03964892)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2015 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Andrei Zmievski <andrei@php.net>                             |
16    +----------------------------------------------------------------------+
17  */
18 
19 /* $Id$ */
20 
21 #include "php.h"
22 #include "php_ini.h"
23 #include "php_globals.h"
24 #include "php_pcre.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/php_smart_str.h"
27 
28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
29 
30 #include "ext/standard/php_string.h"
31 
/* Result ordering for global matches (kept in the low byte of the flags). */
#define PREG_PATTERN_ORDER          1
#define PREG_SET_ORDER              2
/* OR-ed onto the ordering value to request (match, offset) pairs. */
#define PREG_OFFSET_CAPTURE         0x100

/* Bit flags for splitting (presumably consumed by preg_split(); not visible here). */
#define PREG_SPLIT_NO_EMPTY         0x1
#define PREG_SPLIT_DELIM_CAPTURE    0x2
#define PREG_SPLIT_OFFSET_CAPTURE   0x4

/* Set in a cache entry's preg_options when the 'e' modifier is present. */
#define PREG_REPLACE_EVAL           0x1

/* Bit flag for grep-style filtering (presumably preg_grep(); not visible here). */
#define PREG_GREP_INVERT            0x1

/* Number of entries at which the compiled-pattern cache gets pruned. */
#define PCRE_CACHE_SIZE             4096
45 
/*
 * Values stored in PCRE_G(error_code). The same numbers are exported to
 * userland as the PREG_*_ERROR constants during module startup.
 */
enum {
	PHP_PCRE_NO_ERROR              = 0,	/* nothing went wrong */
	PHP_PCRE_INTERNAL_ERROR        = 1,	/* any PCRE status without a dedicated code */
	PHP_PCRE_BACKTRACK_LIMIT_ERROR = 2,	/* PCRE_ERROR_MATCHLIMIT */
	PHP_PCRE_RECURSION_LIMIT_ERROR = 3,	/* PCRE_ERROR_RECURSIONLIMIT */
	PHP_PCRE_BAD_UTF8_ERROR        = 4,	/* PCRE_ERROR_BADUTF8 */
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR = 5	/* PCRE_ERROR_BADUTF8_OFFSET */
};
54 
55 
ZEND_DECLARE_MODULE_GLOBALS(pcre)56 ZEND_DECLARE_MODULE_GLOBALS(pcre)
57 
58 
59 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
60 {
61 	int preg_code = 0;
62 
63 	switch (pcre_code) {
64 		case PCRE_ERROR_MATCHLIMIT:
65 			preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
66 			break;
67 
68 		case PCRE_ERROR_RECURSIONLIMIT:
69 			preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
70 			break;
71 
72 		case PCRE_ERROR_BADUTF8:
73 			preg_code = PHP_PCRE_BAD_UTF8_ERROR;
74 			break;
75 
76 		case PCRE_ERROR_BADUTF8_OFFSET:
77 			preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
78 			break;
79 
80 		default:
81 			preg_code = PHP_PCRE_INTERNAL_ERROR;
82 			break;
83 	}
84 
85 	PCRE_G(error_code) = preg_code;
86 }
87 /* }}} */
88 
php_free_pcre_cache(void * data)89 static void php_free_pcre_cache(void *data) /* {{{ */
90 {
91 	pcre_cache_entry *pce = (pcre_cache_entry *) data;
92 	if (!pce) return;
93 	pefree(pce->re, 1);
94 	if (pce->extra) pefree(pce->extra, 1);
95 #if HAVE_SETLOCALE
96 	if ((void*)pce->tables) pefree((void*)pce->tables, 1);
97 	pefree(pce->locale, 1);
98 #endif
99 }
100 /* }}} */
101 
PHP_GINIT_FUNCTION(pcre)102 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
103 {
104 	zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
105 	pcre_globals->backtrack_limit = 0;
106 	pcre_globals->recursion_limit = 0;
107 	pcre_globals->error_code      = PHP_PCRE_NO_ERROR;
108 }
109 /* }}} */
110 
PHP_GSHUTDOWN_FUNCTION(pcre)111 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
112 {
113 	zend_hash_destroy(&pcre_globals->pcre_cache);
114 }
115 /* }}} */
116 
117 PHP_INI_BEGIN()
118 	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
119 	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
PHP_INI_END()120 PHP_INI_END()
121 
122 
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Emits the extension's info table (support flag and the version string
 * reported by pcre_version()) followed by its INI entries.
 */
static PHP_MINFO_FUNCTION(pcre)
{
	php_info_print_table_start();
	php_info_print_table_row(2,
		"PCRE (Perl Compatible Regular Expressions) Support", "enabled");
	php_info_print_table_row(2,
		"PCRE Library Version", pcre_version());
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
134 
135 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)136 static PHP_MINIT_FUNCTION(pcre)
137 {
138 	REGISTER_INI_ENTRIES();
139 
140 	REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
141 	REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
142 	REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
143 	REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
144 	REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
145 	REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
146 	REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);
147 
148 	REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
149 	REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
150 	REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
151 	REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
152 	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
153 	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
154 	REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);
155 
156 	return SUCCESS;
157 }
158 /* }}} */
159 
160 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)161 static PHP_MSHUTDOWN_FUNCTION(pcre)
162 {
163 	UNREGISTER_INI_ENTRIES();
164 
165 	return SUCCESS;
166 }
167 /* }}} */
168 
169 /* {{{ static pcre_clean_cache */
pcre_clean_cache(void * data,void * arg TSRMLS_DC)170 static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
171 {
172 	pcre_cache_entry *pce = (pcre_cache_entry *) data;
173 	int *num_clean = (int *)arg;
174 
175 	if (*num_clean > 0 && !pce->refcount) {
176 		(*num_clean)--;
177 		return ZEND_HASH_APPLY_REMOVE;
178 	} else {
179 		return ZEND_HASH_APPLY_KEEP;
180 	}
181 }
182 /* }}} */
183 
184 /* {{{ static make_subpats_table */
make_subpats_table(int num_subpats,pcre_cache_entry * pce TSRMLS_DC)185 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
186 {
187 	pcre_extra *extra = pce->extra;
188 	int name_cnt = 0, name_size, ni = 0;
189 	int rc;
190 	char *name_table;
191 	unsigned short name_idx;
192 	char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
193 
194 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
195 	if (rc < 0) {
196 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
197 		efree(subpat_names);
198 		return NULL;
199 	}
200 	if (name_cnt > 0) {
201 		int rc1, rc2;
202 
203 		rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
204 		rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
205 		rc = rc2 ? rc2 : rc1;
206 		if (rc < 0) {
207 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
208 			efree(subpat_names);
209 			return NULL;
210 		}
211 
212 		while (ni++ < name_cnt) {
213 			name_idx = 0xff * (unsigned char)name_table[0] + (unsigned char)name_table[1];
214 			subpat_names[name_idx] = name_table + 2;
215 			if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
216 				php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
217 				efree(subpat_names);
218 				return NULL;
219 			}
220 			name_table += name_size;
221 		}
222 	}
223 
224 	return subpat_names;
225 }
226 /* }}} */
227 
228 /* {{{ static calculate_unit_length */
229 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE_UTF8. */
calculate_unit_length(pcre_cache_entry * pce,char * start)230 static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
231 {
232 	int unit_len;
233 
234 	if (pce->compile_options & PCRE_UTF8) {
235 		char *end = start;
236 
237 		/* skip continuation bytes */
238 		while ((*++end & 0xC0) == 0x80);
239 		unit_len = end - start;
240 	} else {
241 		unit_len = 1;
242 	}
243 	return unit_len;
244 }
245 /* }}} */
246 
247 /* {{{ pcre_get_compiled_regex_cache
248  */
pcre_get_compiled_regex_cache(char * regex,int regex_len TSRMLS_DC)249 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
250 {
251 	pcre				*re = NULL;
252 	pcre_extra			*extra;
253 	int					 coptions = 0;
254 	int					 soptions = 0;
255 	const char			*error;
256 	int					 erroffset;
257 	char				 delimiter;
258 	char				 start_delimiter;
259 	char				 end_delimiter;
260 	char				*p, *pp;
261 	char				*pattern;
262 	int					 do_study = 0;
263 	int					 poptions = 0;
264 	int				count = 0;
265 	unsigned const char *tables = NULL;
266 #if HAVE_SETLOCALE
267 	char				*locale;
268 #endif
269 	pcre_cache_entry	*pce;
270 	pcre_cache_entry	 new_entry;
271 	char                *tmp = NULL;
272 
273 #if HAVE_SETLOCALE
274 # if defined(PHP_WIN32) && defined(ZTS)
275 	_configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
276 # endif
277 	locale = setlocale(LC_CTYPE, NULL);
278 #endif
279 
280 	/* Try to lookup the cached regex entry, and if successful, just pass
281 	   back the compiled pattern, otherwise go on and compile it. */
282 	if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
283 		/*
284 		 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
285 		 * is, we flush it and compile the pattern from scratch.
286 		 */
287 		if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
288 			zend_hash_clean(&PCRE_G(pcre_cache));
289 		} else {
290 #if HAVE_SETLOCALE
291 			if (!strcmp(pce->locale, locale)) {
292 #endif
293 				return pce;
294 #if HAVE_SETLOCALE
295 			}
296 #endif
297 		}
298 	}
299 
300 	p = regex;
301 
302 	/* Parse through the leading whitespace, and display a warning if we
303 	   get to the end without encountering a delimiter. */
304 	while (isspace((int)*(unsigned char *)p)) p++;
305 	if (*p == 0) {
306 		php_error_docref(NULL TSRMLS_CC, E_WARNING,
307 						 p < regex + regex_len ? "Null byte in regex" : "Empty regular expression");
308 		return NULL;
309 	}
310 
311 	/* Get the delimiter and display a warning if it is alphanumeric
312 	   or a backslash. */
313 	delimiter = *p++;
314 	if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
315 		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
316 		return NULL;
317 	}
318 
319 	start_delimiter = delimiter;
320 	if ((pp = strchr("([{< )]}> )]}>", delimiter)))
321 		delimiter = pp[5];
322 	end_delimiter = delimiter;
323 
324 	pp = p;
325 
326 	if (start_delimiter == end_delimiter) {
327 		/* We need to iterate through the pattern, searching for the ending delimiter,
328 		   but skipping the backslashed delimiters.  If the ending delimiter is not
329 		   found, display a warning. */
330 		while (*pp != 0) {
331 			if (*pp == '\\' && pp[1] != 0) pp++;
332 			else if (*pp == delimiter)
333 				break;
334 			pp++;
335 		}
336 	} else {
337 		/* We iterate through the pattern, searching for the matching ending
338 		 * delimiter. For each matching starting delimiter, we increment nesting
339 		 * level, and decrement it for each matching ending delimiter. If we
340 		 * reach the end of the pattern without matching, display a warning.
341 		 */
342 		int brackets = 1; 	/* brackets nesting level */
343 		while (*pp != 0) {
344 			if (*pp == '\\' && pp[1] != 0) pp++;
345 			else if (*pp == end_delimiter && --brackets <= 0)
346 				break;
347 			else if (*pp == start_delimiter)
348 				brackets++;
349 			pp++;
350 		}
351 	}
352 
353 	if (*pp == 0) {
354 		if (pp < regex + regex_len) {
355 			php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
356 		} else if (start_delimiter == end_delimiter) {
357 			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
358 		} else {
359 			php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
360 		}
361 		return NULL;
362 	}
363 
364 	/* Make a copy of the actual pattern. */
365 	pattern = estrndup(p, pp-p);
366 
367 	/* Move on to the options */
368 	pp++;
369 
370 	/* Parse through the options, setting appropriate flags.  Display
371 	   a warning if we encounter an unknown modifier. */
372 	while (pp < regex + regex_len) {
373 		switch (*pp++) {
374 			/* Perl compatible options */
375 			case 'i':	coptions |= PCRE_CASELESS;		break;
376 			case 'm':	coptions |= PCRE_MULTILINE;		break;
377 			case 's':	coptions |= PCRE_DOTALL;		break;
378 			case 'x':	coptions |= PCRE_EXTENDED;		break;
379 
380 			/* PCRE specific options */
381 			case 'A':	coptions |= PCRE_ANCHORED;		break;
382 			case 'D':	coptions |= PCRE_DOLLAR_ENDONLY;break;
383 			case 'S':	do_study  = 1;					break;
384 			case 'U':	coptions |= PCRE_UNGREEDY;		break;
385 			case 'X':	coptions |= PCRE_EXTRA;			break;
386 			case 'u':	coptions |= PCRE_UTF8;
387 	/* In  PCRE,  by  default, \d, \D, \s, \S, \w, and \W recognize only ASCII
388        characters, even in UTF-8 mode. However, this can be changed by setting
389        the PCRE_UCP option. */
390 #ifdef PCRE_UCP
391 						coptions |= PCRE_UCP;
392 #endif
393 				break;
394 
395 			/* Custom preg options */
396 			case 'e':	poptions |= PREG_REPLACE_EVAL;	break;
397 
398 			case ' ':
399 			case '\n':
400 				break;
401 
402 			default:
403 				if (pp[-1]) {
404 					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
405 				} else {
406 					php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
407 				}
408 				efree(pattern);
409 				return NULL;
410 		}
411 	}
412 
413 #if HAVE_SETLOCALE
414 	if (strcmp(locale, "C"))
415 		tables = pcre_maketables();
416 #endif
417 
418 	/* Compile pattern and display a warning if compilation failed. */
419 	re = pcre_compile(pattern,
420 					  coptions,
421 					  &error,
422 					  &erroffset,
423 					  tables);
424 
425 	if (re == NULL) {
426 		php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
427 		efree(pattern);
428 		if (tables) {
429 			pefree((void*)tables, 1);
430 		}
431 		return NULL;
432 	}
433 
434 	/* If study option was specified, study the pattern and
435 	   store the result in extra for passing to pcre_exec. */
436 	if (do_study) {
437 		extra = pcre_study(re, soptions, &error);
438 		if (extra) {
439 			extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
440 		}
441 		if (error != NULL) {
442 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
443 		}
444 	} else {
445 		extra = NULL;
446 	}
447 
448 	efree(pattern);
449 
450 	/*
451 	 * If we reached cache limit, clean out the items from the head of the list;
452 	 * these are supposedly the oldest ones (but not necessarily the least used
453 	 * ones).
454 	 */
455 	if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
456 		int num_clean = PCRE_CACHE_SIZE / 8;
457 		zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
458 	}
459 
460 	/* Store the compiled pattern and extra info in the cache. */
461 	new_entry.re = re;
462 	new_entry.extra = extra;
463 	new_entry.preg_options = poptions;
464 	new_entry.compile_options = coptions;
465 #if HAVE_SETLOCALE
466 	new_entry.locale = pestrdup(locale, 1);
467 	new_entry.tables = tables;
468 #endif
469 	new_entry.refcount = 0;
470 
471 	/*
472 	 * Interned strings are not duplicated when stored in HashTable,
473 	 * but all the interned strings created during HTTP request are removed
474 	 * at end of request. However PCRE_G(pcre_cache) must be consistent
475 	 * on the next request as well. So we disable usage of interned strings
476 	 * as hash keys especually for this table.
477 	 * See bug #63180
478 	 */
479 	if (IS_INTERNED(regex)) {
480 		regex = tmp = estrndup(regex, regex_len);
481 	}
482 
483 	zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
484 						sizeof(pcre_cache_entry), (void**)&pce);
485 
486 	if (tmp) {
487 		efree(tmp);
488 	}
489 
490 	return pce;
491 }
492 /* }}} */
493 
494 /* {{{ pcre_get_compiled_regex
495  */
pcre_get_compiled_regex(char * regex,pcre_extra ** extra,int * preg_options TSRMLS_DC)496 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
497 {
498 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
499 
500 	if (extra) {
501 		*extra = pce ? pce->extra : NULL;
502 	}
503 	if (preg_options) {
504 		*preg_options = pce ? pce->preg_options : 0;
505 	}
506 
507 	return pce ? pce->re : NULL;
508 }
509 /* }}} */
510 
511 /* {{{ pcre_get_compiled_regex_ex
512  */
pcre_get_compiled_regex_ex(char * regex,pcre_extra ** extra,int * preg_options,int * compile_options TSRMLS_DC)513 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
514 {
515 	pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
516 
517 	if (extra) {
518 		*extra = pce ? pce->extra : NULL;
519 	}
520 	if (preg_options) {
521 		*preg_options = pce ? pce->preg_options : 0;
522 	}
523 	if (compile_options) {
524 		*compile_options = pce ? pce->compile_options : 0;
525 	}
526 
527 	return pce ? pce->re : NULL;
528 }
529 /* }}} */
530 
531 /* {{{ add_offset_pair */
add_offset_pair(zval * result,char * str,int len,int offset,char * name)532 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
533 {
534 	zval *match_pair;
535 
536 	ALLOC_ZVAL(match_pair);
537 	array_init(match_pair);
538 	INIT_PZVAL(match_pair);
539 
540 	/* Add (match, offset) to the return value */
541 	add_next_index_stringl(match_pair, str, len, 1);
542 	add_next_index_long(match_pair, offset);
543 
544 	if (name) {
545 		zval_add_ref(&match_pair);
546 		zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
547 	}
548 	zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
549 }
550 /* }}} */
551 
/*
 * Shared body of preg_match() (global == 0) and preg_match_all()
 * (global != 0): parses the userland arguments, fetches the compiled
 * pattern and hands over to php_pcre_match_impl(). Returns FALSE to
 * userland when argument parsing or pattern compilation fails.
 */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	char			 *regex;			/* pattern string */
	int				  regex_len;
	char			 *subject;			/* string to search */
	int				  subject_len;
	zval			 *subpats = NULL;	/* optional output array */
	long			  flags = 0;		/* optional match flags */
	long			  start_offset = 0;	/* optional start position */
	pcre_cache_entry *pce;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|zll", &regex, &regex_len,
							  &subject, &subject_len, &subpats, &flags, &start_offset) == FAILURE) {
		RETURN_FALSE;
	}

	pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC);
	if (pce == NULL) {
		RETURN_FALSE;
	}

	/* hold a reference while matching so cache pruning skips this entry;
	   flags only count when a 4th argument was actually passed */
	pce->refcount++;
	php_pcre_match_impl(pce, subject, subject_len, return_value, subpats,
		global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
	pce->refcount--;
}
/* }}} */
580 
581 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,zval * subpats,int global,int use_flags,long flags,long start_offset TSRMLS_DC)582 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
583 	zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
584 {
585 	zval			*result_set,		/* Holds a set of subpatterns after
586 										   a global match */
587 				   **match_sets = NULL;	/* An array of sets of matches for each
588 										   subpattern after a global match */
589 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
590 	pcre_extra		 extra_data;		/* Used locally for exec options */
591 	int				 exoptions = 0;		/* Execution options */
592 	int				 count = 0;			/* Count of matched subpatterns */
593 	int				*offsets;			/* Array of subpattern offsets */
594 	int				 num_subpats;		/* Number of captured subpatterns */
595 	int				 size_offsets;		/* Size of the offsets array */
596 	int				 matched;			/* Has anything matched */
597 	int				 g_notempty = 0;	/* If the match should not be empty */
598 	const char	   **stringlist;		/* Holds list of subpatterns */
599 	char 		   **subpat_names;		/* Array for named subpatterns */
600 	int				 i, rc;
601 	int				 subpats_order;		/* Order of subpattern matches */
602 	int				 offset_capture;    /* Capture match offsets: yes/no */
603 
604 	/* Overwrite the passed-in value for subpatterns with an empty array. */
605 	if (subpats != NULL) {
606 		zval_dtor(subpats);
607 		array_init(subpats);
608 	}
609 
610 	subpats_order = global ? PREG_PATTERN_ORDER : 0;
611 
612 	if (use_flags) {
613 		offset_capture = flags & PREG_OFFSET_CAPTURE;
614 
615 		/*
616 		 * subpats_order is pre-set to pattern mode so we change it only if
617 		 * necessary.
618 		 */
619 		if (flags & 0xff) {
620 			subpats_order = flags & 0xff;
621 		}
622 		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
623 			(!global && subpats_order != 0)) {
624 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
625 			return;
626 		}
627 	} else {
628 		offset_capture = 0;
629 	}
630 
631 	/* Negative offset counts from the end of the string. */
632 	if (start_offset < 0) {
633 		start_offset = subject_len + start_offset;
634 		if (start_offset < 0) {
635 			start_offset = 0;
636 		}
637 	}
638 
639 	if (extra == NULL) {
640 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
641 		extra = &extra_data;
642 	}
643 	extra->match_limit = PCRE_G(backtrack_limit);
644 	extra->match_limit_recursion = PCRE_G(recursion_limit);
645 
646 	/* Calculate the size of the offsets array, and allocate memory for it. */
647 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
648 	if (rc < 0) {
649 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
650 		RETURN_FALSE;
651 	}
652 	num_subpats++;
653 	size_offsets = num_subpats * 3;
654 
655 	/*
656 	 * Build a mapping from subpattern numbers to their names. We will always
657 	 * allocate the table, even though there may be no named subpatterns. This
658 	 * avoids somewhat more complicated logic in the inner loops.
659 	 */
660 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
661 	if (!subpat_names) {
662 		RETURN_FALSE;
663 	}
664 
665 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
666 	memset(offsets, 0, size_offsets*sizeof(int));
667 	/* Allocate match sets array and initialize the values. */
668 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
669 		match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
670 		for (i=0; i<num_subpats; i++) {
671 			ALLOC_ZVAL(match_sets[i]);
672 			array_init(match_sets[i]);
673 			INIT_PZVAL(match_sets[i]);
674 		}
675 	}
676 
677 	matched = 0;
678 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
679 
680 	do {
681 		/* Execute the regular expression. */
682 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
683 						  exoptions|g_notempty, offsets, size_offsets);
684 
685 		/* the string was already proved to be valid UTF-8 */
686 		exoptions |= PCRE_NO_UTF8_CHECK;
687 
688 		/* Check for too many substrings condition. */
689 		if (count == 0) {
690 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
691 			count = size_offsets/3;
692 		}
693 
694 		/* If something has matched */
695 		if (count > 0) {
696 			matched++;
697 
698 			/* If subpatterns array has been passed, fill it in with values. */
699 			if (subpats != NULL) {
700 				/* Try to get the list of substrings and display a warning if failed. */
701 				if ((offsets[1] - offsets[0] < 0) || pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
702 					efree(subpat_names);
703 					efree(offsets);
704 					if (match_sets) efree(match_sets);
705 					php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
706 					RETURN_FALSE;
707 				}
708 
709 				if (global) {	/* global pattern matching */
710 					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
711 						/* For each subpattern, insert it into the appropriate array. */
712 						for (i = 0; i < count; i++) {
713 							if (offset_capture) {
714 								add_offset_pair(match_sets[i], (char *)stringlist[i],
715 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
716 							} else {
717 								add_next_index_stringl(match_sets[i], (char *)stringlist[i],
718 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
719 							}
720 						}
721 						/*
722 						 * If the number of captured subpatterns on this run is
723 						 * less than the total possible number, pad the result
724 						 * arrays with empty strings.
725 						 */
726 						if (count < num_subpats) {
727 							for (; i < num_subpats; i++) {
728 								add_next_index_string(match_sets[i], "", 1);
729 							}
730 						}
731 					} else {
732 						/* Allocate the result set array */
733 						ALLOC_ZVAL(result_set);
734 						array_init(result_set);
735 						INIT_PZVAL(result_set);
736 
737 						/* Add all the subpatterns to it */
738 						for (i = 0; i < count; i++) {
739 							if (offset_capture) {
740 								add_offset_pair(result_set, (char *)stringlist[i],
741 												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
742 							} else {
743 								if (subpat_names[i]) {
744 									add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
745 														   offsets[(i<<1)+1] - offsets[i<<1], 1);
746 								}
747 								add_next_index_stringl(result_set, (char *)stringlist[i],
748 													   offsets[(i<<1)+1] - offsets[i<<1], 1);
749 							}
750 						}
751 						/* And add it to the output array */
752 						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
753 					}
754 				} else {			/* single pattern matching */
755 					/* For each subpattern, insert it into the subpatterns array. */
756 					for (i = 0; i < count; i++) {
757 						if (offset_capture) {
758 							add_offset_pair(subpats, (char *)stringlist[i],
759 											offsets[(i<<1)+1] - offsets[i<<1],
760 											offsets[i<<1], subpat_names[i]);
761 						} else {
762 							if (subpat_names[i]) {
763 								add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
764 												  offsets[(i<<1)+1] - offsets[i<<1], 1);
765 							}
766 							add_next_index_stringl(subpats, (char *)stringlist[i],
767 												   offsets[(i<<1)+1] - offsets[i<<1], 1);
768 						}
769 					}
770 				}
771 
772 				pcre_free((void *) stringlist);
773 			}
774 		} else if (count == PCRE_ERROR_NOMATCH) {
775 			/* If we previously set PCRE_NOTEMPTY after a null match,
776 			   this is not necessarily the end. We need to advance
777 			   the start offset, and continue. Fudge the offset values
778 			   to achieve this, unless we're already at the end of the string. */
779 			if (g_notempty != 0 && start_offset < subject_len) {
780 				int unit_len = calculate_unit_length(pce, subject + start_offset);
781 
782 				offsets[0] = start_offset;
783 				offsets[1] = start_offset + unit_len;
784 			} else
785 				break;
786 		} else {
787 			pcre_handle_exec_error(count TSRMLS_CC);
788 			break;
789 		}
790 
791 		/* If we have matched an empty string, mimic what Perl's /g options does.
792 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
793 		   the match again at the same point. If this fails (picked up above) we
794 		   advance to the next character. */
795 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
796 
797 		/* Advance to the position right after the last full match */
798 		start_offset = offsets[1];
799 	} while (global);
800 
801 	/* Add the match sets to the output array and clean up */
802 	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
803 		for (i = 0; i < num_subpats; i++) {
804 			if (subpat_names[i]) {
805 				zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
806 								 strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
807 				Z_ADDREF_P(match_sets[i]);
808 			}
809 			zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
810 		}
811 		efree(match_sets);
812 	}
813 
814 	efree(offsets);
815 	efree(subpat_names);
816 
817 	/* Did we encounter an error? */
818 	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
819 		RETVAL_LONG(matched);
820 	} else {
821 		RETVAL_FALSE;
822 	}
823 }
824 /* }}} */
825 
826 /* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
827    Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)828 static PHP_FUNCTION(preg_match)
829 {
830 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
831 }
832 /* }}} */
833 
834 /* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
835    Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)836 static PHP_FUNCTION(preg_match_all)
837 {
838 	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
839 }
840 /* }}} */
841 
/* {{{ preg_get_backref
 * Parses a back-reference at *str. *str must point at the introducing
 * character; the accepted shapes are <c>N, <c>NN and ${N} / ${NN}, where
 * <c> is any single character and N a decimal digit (the braced form
 * requires '$').
 *
 * On success returns 1, stores the number in *backref and moves *str past
 * the reference. On failure returns 0 and leaves *str alone; *backref may
 * already have been written with the digits read so far.
 */
static int preg_get_backref(char **str, int *backref)
{
	char *cursor = *str;
	int braced;

	/* nothing follows the introducing character */
	if (cursor[1] == '\0') {
		return 0;
	}

	/* step over the introducer, and over '{' for the ${...} form */
	braced = (cursor[0] == '$' && cursor[1] == '{');
	cursor += braced ? 2 : 1;

	/* first digit is mandatory */
	if (!(*cursor >= '0' && *cursor <= '9')) {
		return 0;
	}
	*backref = *cursor - '0';
	cursor++;

	/* second digit is optional */
	if (*cursor >= '0' && *cursor <= '9') {
		*backref = *backref * 10 + *cursor - '0';
		cursor++;
	}

	if (braced) {
		if (*cursor != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
/* }}} */
880 
881 /* {{{ preg_do_repl_func
882  */
preg_do_repl_func(zval * function,char * subject,int * offsets,char ** subpat_names,int count,char ** result TSRMLS_DC)883 static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, char **result TSRMLS_DC)
884 {
885 	zval		*retval_ptr;		/* Function return value */
886 	zval	   **args[1];			/* Argument to pass to function */
887 	zval		*subpats;			/* Captured subpatterns */
888 	int			 result_len;		/* Return value length */
889 	int			 i;
890 
891 	MAKE_STD_ZVAL(subpats);
892 	array_init(subpats);
893 	for (i = 0; i < count; i++) {
894 		if (subpat_names[i]) {
895 			add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
896 		}
897 		add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
898 	}
899 	args[0] = &subpats;
900 
901 	if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
902 		convert_to_string_ex(&retval_ptr);
903 		*result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
904 		result_len = Z_STRLEN_P(retval_ptr);
905 		zval_ptr_dtor(&retval_ptr);
906 	} else {
907 		if (!EG(exception)) {
908 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
909 		}
910 		result_len = offsets[1] - offsets[0];
911 		*result = estrndup(&subject[offsets[0]], result_len);
912 	}
913 
914 	zval_ptr_dtor(&subpats);
915 
916 	return result_len;
917 }
918 /* }}} */
919 
920 /* {{{ preg_do_eval
921  */
preg_do_eval(char * eval_str,int eval_str_len,char * subject,int * offsets,int count,char ** result TSRMLS_DC)922 static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
923 						int *offsets, int count, char **result TSRMLS_DC)
924 {
925 	zval		 retval;			/* Return value from evaluation */
926 	char		*eval_str_end,		/* End of eval string */
927 				*match,				/* Current match for a backref */
928 				*esc_match,			/* Quote-escaped match */
929 				*walk,				/* Used to walk the code string */
930 				*segment,			/* Start of segment to append while walking */
931 				 walk_last;			/* Last walked character */
932 	int			 match_len;			/* Length of the match */
933 	int			 esc_match_len;		/* Length of the quote-escaped match */
934 	int			 result_len;		/* Length of the result of the evaluation */
935 	int			 backref;			/* Current backref */
936 	char        *compiled_string_description;
937 	smart_str    code = {0};
938 
939 	eval_str_end = eval_str + eval_str_len;
940 	walk = segment = eval_str;
941 	walk_last = 0;
942 
943 	while (walk < eval_str_end) {
944 		/* If found a backreference.. */
945 		if ('\\' == *walk || '$' == *walk) {
946 			smart_str_appendl(&code, segment, walk - segment);
947 			if (walk_last == '\\') {
948 				code.c[code.len-1] = *walk++;
949 				segment = walk;
950 				walk_last = 0;
951 				continue;
952 			}
953 			segment = walk;
954 			if (preg_get_backref(&walk, &backref)) {
955 				if (backref < count) {
956 					/* Find the corresponding string match and substitute it
957 					   in instead of the backref */
958 					match = subject + offsets[backref<<1];
959 					match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
960 					if (match_len) {
961 						esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
962 					} else {
963 						esc_match = match;
964 						esc_match_len = 0;
965 					}
966 				} else {
967 					esc_match = "";
968 					esc_match_len = 0;
969 				}
970 				smart_str_appendl(&code, esc_match, esc_match_len);
971 
972 				segment = walk;
973 
974 				/* Clean up and reassign */
975 				if (esc_match_len)
976 					efree(esc_match);
977 				continue;
978 			}
979 		}
980 		walk++;
981 		walk_last = walk[-1];
982 	}
983 	smart_str_appendl(&code, segment, walk - segment);
984 	smart_str_0(&code);
985 
986 	compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
987 	/* Run the code */
988 	if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
989 		efree(compiled_string_description);
990 		php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
991 		/* zend_error() does not return in this case */
992 	}
993 	efree(compiled_string_description);
994 	convert_to_string(&retval);
995 
996 	/* Save the return value and its length */
997 	*result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
998 	result_len = Z_STRLEN(retval);
999 
1000 	/* Clean up */
1001 	zval_dtor(&retval);
1002 	smart_str_free(&code);
1003 
1004 	return result_len;
1005 }
1006 /* }}} */
1007 
1008 /* {{{ php_pcre_replace
1009  */
php_pcre_replace(char * regex,int regex_len,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1010 PHPAPI char *php_pcre_replace(char *regex,   int regex_len,
1011 							  char *subject, int subject_len,
1012 							  zval *replace_val, int is_callable_replace,
1013 							  int *result_len, int limit, int *replace_count TSRMLS_DC)
1014 {
1015 	pcre_cache_entry	*pce;			    /* Compiled regular expression */
1016 	char		 		*result;			/* Function result */
1017 
1018 	/* Compile regex or get it from cache. */
1019 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1020 		return NULL;
1021 	}
1022 	pce->refcount++;
1023 	result = php_pcre_replace_impl(pce, subject, subject_len, replace_val,
1024 		is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
1025 	pce->refcount--;
1026 
1027 	return result;
1028 }
1029 /* }}} */
1030 
1031 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1032 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val,
1033 	int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
1034 {
1035 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
1036 	pcre_extra		 extra_data;		/* Used locally for exec options */
1037 	int				 exoptions = 0;		/* Execution options */
1038 	int				 count = 0;			/* Count of matched subpatterns */
1039 	int				*offsets;			/* Array of subpattern offsets */
1040 	char 			**subpat_names;		/* Array for named subpatterns */
1041 	int				 num_subpats;		/* Number of captured subpatterns */
1042 	int				 size_offsets;		/* Size of the offsets array */
1043 	int				 new_len;			/* Length of needed storage */
1044 	int				 alloc_len;			/* Actual allocated length */
1045 	int				 eval_result_len=0;	/* Length of the eval'ed or
1046 										   function-returned string */
1047 	int				 match_len;			/* Length of the current match */
1048 	int				 backref;			/* Backreference number */
1049 	int				 eval;				/* If the replacement string should be eval'ed */
1050 	int				 start_offset;		/* Where the new search starts */
1051 	int				 g_notempty=0;		/* If the match should not be empty */
1052 	int				 replace_len=0;		/* Length of replacement string */
1053 	char			*result,			/* Result of replacement */
1054 					*replace=NULL,		/* Replacement string */
1055 					*new_buf,			/* Temporary buffer for re-allocation */
1056 					*walkbuf,			/* Location of current replacement in the result */
1057 					*walk,				/* Used to walk the replacement string */
1058 					*match,				/* The current match */
1059 					*piece,				/* The current piece of subject */
1060 					*replace_end=NULL,	/* End of replacement string */
1061 					*eval_result,		/* Result of eval or custom function */
1062 					 walk_last;			/* Last walked character */
1063 	int				 rc;
1064 
1065 	if (extra == NULL) {
1066 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1067 		extra = &extra_data;
1068 	}
1069 	extra->match_limit = PCRE_G(backtrack_limit);
1070 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1071 
1072 	eval = pce->preg_options & PREG_REPLACE_EVAL;
1073 	if (is_callable_replace) {
1074 		if (eval) {
1075 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
1076 			return NULL;
1077 		}
1078 	} else {
1079 		replace = Z_STRVAL_P(replace_val);
1080 		replace_len = Z_STRLEN_P(replace_val);
1081 		replace_end = replace + replace_len;
1082 	}
1083 
1084 	if (eval) {
1085 		php_error_docref(NULL TSRMLS_CC, E_DEPRECATED, "The /e modifier is deprecated, use preg_replace_callback instead");
1086 	}
1087 
1088 	/* Calculate the size of the offsets array, and allocate memory for it. */
1089 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
1090 	if (rc < 0) {
1091 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1092 		return NULL;
1093 	}
1094 	num_subpats++;
1095 	size_offsets = num_subpats * 3;
1096 
1097 	/*
1098 	 * Build a mapping from subpattern numbers to their names. We will always
1099 	 * allocate the table, even though there may be no named subpatterns. This
1100 	 * avoids somewhat more complicated logic in the inner loops.
1101 	 */
1102 	subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
1103 	if (!subpat_names) {
1104 		return NULL;
1105 	}
1106 
1107 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1108 
1109 	alloc_len = 2 * subject_len + 1;
1110 	result = safe_emalloc(alloc_len, sizeof(char), 0);
1111 
1112 	/* Initialize */
1113 	match = NULL;
1114 	*result_len = 0;
1115 	start_offset = 0;
1116 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1117 
1118 	while (1) {
1119 		/* Execute the regular expression. */
1120 		count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1121 						  exoptions|g_notempty, offsets, size_offsets);
1122 
1123 		/* the string was already proved to be valid UTF-8 */
1124 		exoptions |= PCRE_NO_UTF8_CHECK;
1125 
1126 		/* Check for too many substrings condition. */
1127 		if (count == 0) {
1128 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1129 			count = size_offsets/3;
1130 		}
1131 
1132 		piece = subject + start_offset;
1133 
1134 		if (count > 0 && (offsets[1] - offsets[0] >= 0) && (limit == -1 || limit > 0)) {
1135 			if (replace_count) {
1136 				++*replace_count;
1137 			}
1138 			/* Set the match location in subject */
1139 			match = subject + offsets[0];
1140 
1141 			new_len = *result_len + offsets[0] - start_offset; /* part before the match */
1142 
1143 			/* If evaluating, do it and add the return string's length */
1144 			if (eval) {
1145 				eval_result_len = preg_do_eval(replace, replace_len, subject,
1146 											   offsets, count, &eval_result TSRMLS_CC);
1147 				new_len += eval_result_len;
1148 			} else if (is_callable_replace) {
1149 				/* Use custom function to get replacement string and its length. */
1150 				eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, &eval_result TSRMLS_CC);
1151 				new_len += eval_result_len;
1152 			} else { /* do regular substitution */
1153 				walk = replace;
1154 				walk_last = 0;
1155 				while (walk < replace_end) {
1156 					if ('\\' == *walk || '$' == *walk) {
1157 						if (walk_last == '\\') {
1158 							walk++;
1159 							walk_last = 0;
1160 							continue;
1161 						}
1162 						if (preg_get_backref(&walk, &backref)) {
1163 							if (backref < count)
1164 								new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1165 							continue;
1166 						}
1167 					}
1168 					new_len++;
1169 					walk++;
1170 					walk_last = walk[-1];
1171 				}
1172 			}
1173 
1174 			if (new_len + 1 > alloc_len) {
1175 				alloc_len = 1 + alloc_len + 2 * new_len;
1176 				new_buf = emalloc(alloc_len);
1177 				memcpy(new_buf, result, *result_len);
1178 				efree(result);
1179 				result = new_buf;
1180 			}
1181 			/* copy the part of the string before the match */
1182 			memcpy(&result[*result_len], piece, match-piece);
1183 			*result_len += match-piece;
1184 
1185 			/* copy replacement and backrefs */
1186 			walkbuf = result + *result_len;
1187 
1188 			/* If evaluating or using custom function, copy result to the buffer
1189 			 * and clean up. */
1190 			if (eval || is_callable_replace) {
1191 				memcpy(walkbuf, eval_result, eval_result_len);
1192 				*result_len += eval_result_len;
1193 				STR_FREE(eval_result);
1194 			} else { /* do regular backreference copying */
1195 				walk = replace;
1196 				walk_last = 0;
1197 				while (walk < replace_end) {
1198 					if ('\\' == *walk || '$' == *walk) {
1199 						if (walk_last == '\\') {
1200 							*(walkbuf-1) = *walk++;
1201 							walk_last = 0;
1202 							continue;
1203 						}
1204 						if (preg_get_backref(&walk, &backref)) {
1205 							if (backref < count) {
1206 								match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1207 								memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1208 								walkbuf += match_len;
1209 							}
1210 							continue;
1211 						}
1212 					}
1213 					*walkbuf++ = *walk++;
1214 					walk_last = walk[-1];
1215 				}
1216 				*walkbuf = '\0';
1217 				/* increment the result length by how much we've added to the string */
1218 				*result_len += walkbuf - (result + *result_len);
1219 			}
1220 
1221 			if (limit != -1)
1222 				limit--;
1223 
1224 		} else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1225 			/* If we previously set PCRE_NOTEMPTY after a null match,
1226 			   this is not necessarily the end. We need to advance
1227 			   the start offset, and continue. Fudge the offset values
1228 			   to achieve this, unless we're already at the end of the string. */
1229 			if (g_notempty != 0 && start_offset < subject_len) {
1230 				int unit_len = calculate_unit_length(pce, piece);
1231 
1232 				offsets[0] = start_offset;
1233 				offsets[1] = start_offset + unit_len;
1234 				memcpy(&result[*result_len], piece, unit_len);
1235 				*result_len += unit_len;
1236 			} else {
1237 				new_len = *result_len + subject_len - start_offset;
1238 				if (new_len + 1 > alloc_len) {
1239 					alloc_len = new_len + 1; /* now we know exactly how long it is */
1240 					new_buf = safe_emalloc(alloc_len, sizeof(char), 0);
1241 					memcpy(new_buf, result, *result_len);
1242 					efree(result);
1243 					result = new_buf;
1244 				}
1245 				/* stick that last bit of string on our output */
1246 				memcpy(&result[*result_len], piece, subject_len - start_offset);
1247 				*result_len += subject_len - start_offset;
1248 				result[*result_len] = '\0';
1249 				break;
1250 			}
1251 		} else {
1252 			pcre_handle_exec_error(count TSRMLS_CC);
1253 			efree(result);
1254 			result = NULL;
1255 			break;
1256 		}
1257 
1258 		/* If we have matched an empty string, mimic what Perl's /g options does.
1259 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1260 		   the match again at the same point. If this fails (picked up above) we
1261 		   advance to the next character. */
1262 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1263 
1264 		/* Advance to the next piece. */
1265 		start_offset = offsets[1];
1266 	}
1267 
1268 	efree(offsets);
1269 	efree(subpat_names);
1270 
1271 	return result;
1272 }
1273 /* }}} */
1274 
1275 /* {{{ php_replace_in_subject
1276  */
php_replace_in_subject(zval * regex,zval * replace,zval ** subject,int * result_len,int limit,int is_callable_replace,int * replace_count TSRMLS_DC)1277 static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
1278 {
1279 	zval		**regex_entry,
1280 				**replace_entry = NULL,
1281 				 *replace_value,
1282 				  empty_replace;
1283 	char		*subject_value,
1284 				*result;
1285 	int			 subject_len;
1286 
1287 	/* Make sure we're dealing with strings. */
1288 	convert_to_string_ex(subject);
1289 	/* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1290 	ZVAL_STRINGL(&empty_replace, "", 0, 0);
1291 
1292 	/* If regex is an array */
1293 	if (Z_TYPE_P(regex) == IS_ARRAY) {
1294 		/* Duplicate subject string for repeated replacement */
1295 		subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
1296 		subject_len = Z_STRLEN_PP(subject);
1297 		*result_len = subject_len;
1298 
1299 		zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
1300 
1301 		replace_value = replace;
1302 		if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
1303 			zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
1304 
1305 		/* For each entry in the regex array, get the entry */
1306 		while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)&regex_entry) == SUCCESS) {
1307 			/* Make sure we're dealing with strings. */
1308 			convert_to_string_ex(regex_entry);
1309 
1310 			/* If replace is an array and not a callable construct */
1311 			if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1312 				/* Get current entry */
1313 				if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
1314 					if (!is_callable_replace) {
1315 						convert_to_string_ex(replace_entry);
1316 					}
1317 					replace_value = *replace_entry;
1318 					zend_hash_move_forward(Z_ARRVAL_P(replace));
1319 				} else {
1320 					/* We've run out of replacement strings, so use an empty one */
1321 					replace_value = &empty_replace;
1322 				}
1323 			}
1324 
1325 			/* Do the actual replacement and put the result back into subject_value
1326 			   for further replacements. */
1327 			if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
1328 										   Z_STRLEN_PP(regex_entry),
1329 										   subject_value,
1330 										   subject_len,
1331 										   replace_value,
1332 										   is_callable_replace,
1333 										   result_len,
1334 										   limit,
1335 										   replace_count TSRMLS_CC)) != NULL) {
1336 				efree(subject_value);
1337 				subject_value = result;
1338 				subject_len = *result_len;
1339 			} else {
1340 				efree(subject_value);
1341 				return NULL;
1342 			}
1343 
1344 			zend_hash_move_forward(Z_ARRVAL_P(regex));
1345 		}
1346 
1347 		return subject_value;
1348 	} else {
1349 		result = php_pcre_replace(Z_STRVAL_P(regex),
1350 								  Z_STRLEN_P(regex),
1351 								  Z_STRVAL_PP(subject),
1352 								  Z_STRLEN_PP(subject),
1353 								  replace,
1354 								  is_callable_replace,
1355 								  result_len,
1356 								  limit,
1357 								  replace_count TSRMLS_CC);
1358 		return result;
1359 	}
1360 }
1361 /* }}} */
1362 
1363 /* {{{ preg_replace_impl
1364  */
preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS,int is_callable_replace,int is_filter)1365 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
1366 {
1367 	zval		   **regex,
1368 				   **replace,
1369 				   **subject,
1370 				   **subject_entry,
1371 				   **zcount = NULL;
1372 	char			*result;
1373 	int				 result_len;
1374 	int				 limit_val = -1;
1375 	long			limit = -1;
1376 	char			*string_key;
1377 	ulong			 num_key;
1378 	char			*callback_name;
1379 	int				 replace_count=0, old_replace_count;
1380 
1381 	/* Get function parameters and do error-checking. */
1382 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", &regex, &replace, &subject, &limit, &zcount) == FAILURE) {
1383 		return;
1384 	}
1385 
1386 	if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
1387 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1388 		RETURN_FALSE;
1389 	}
1390 
1391 	SEPARATE_ZVAL(replace);
1392 	if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
1393 		convert_to_string_ex(replace);
1394 	}
1395 	if (is_callable_replace) {
1396 		if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
1397 			php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
1398 			efree(callback_name);
1399 			MAKE_COPY_ZVAL(subject, return_value);
1400 			return;
1401 		}
1402 		efree(callback_name);
1403 	}
1404 
1405 	SEPARATE_ZVAL(regex);
1406 	SEPARATE_ZVAL(subject);
1407 
1408 	if (ZEND_NUM_ARGS() > 3) {
1409 		limit_val = limit;
1410 	}
1411 
1412 	if (Z_TYPE_PP(regex) != IS_ARRAY)
1413 		convert_to_string_ex(regex);
1414 
1415 	/* if subject is an array */
1416 	if (Z_TYPE_PP(subject) == IS_ARRAY) {
1417 		array_init(return_value);
1418 		zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
1419 
1420 		/* For each subject entry, convert it to string, then perform replacement
1421 		   and add the result to the return_value array. */
1422 		while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
1423 			SEPARATE_ZVAL(subject_entry);
1424 			old_replace_count = replace_count;
1425 			if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1426 				if (!is_filter || replace_count > old_replace_count) {
1427 					/* Add to return array */
1428 					switch(zend_hash_get_current_key(Z_ARRVAL_PP(subject), &string_key, &num_key, 0))
1429 					{
1430 					case HASH_KEY_IS_STRING:
1431 						add_assoc_stringl(return_value, string_key, result, result_len, 0);
1432 						break;
1433 
1434 					case HASH_KEY_IS_LONG:
1435 						add_index_stringl(return_value, num_key, result, result_len, 0);
1436 						break;
1437 					}
1438 				} else {
1439 					efree(result);
1440 				}
1441 			}
1442 
1443 			zend_hash_move_forward(Z_ARRVAL_PP(subject));
1444 		}
1445 	} else {	/* if subject is not an array */
1446 		old_replace_count = replace_count;
1447 		if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1448 			if (!is_filter || replace_count > old_replace_count) {
1449 				RETVAL_STRINGL(result, result_len, 0);
1450 			} else {
1451 				efree(result);
1452 			}
1453 		}
1454 	}
1455 	if (ZEND_NUM_ARGS() > 4) {
1456 		zval_dtor(*zcount);
1457 		ZVAL_LONG(*zcount, replace_count);
1458 	}
1459 
1460 }
1461 /* }}} */
1462 
1463 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1464    Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)1465 static PHP_FUNCTION(preg_replace)
1466 {
1467 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1468 }
1469 /* }}} */
1470 
1471 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1472    Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)1473 static PHP_FUNCTION(preg_replace_callback)
1474 {
1475 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
1476 }
1477 /* }}} */
1478 
1479 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1480    Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)1481 static PHP_FUNCTION(preg_filter)
1482 {
1483 	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1484 }
1485 /* }}} */
1486 
1487 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1488    Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)1489 static PHP_FUNCTION(preg_split)
1490 {
1491 	char				*regex;			/* Regular expression */
1492 	char				*subject;		/* String to match against */
1493 	int					 regex_len;
1494 	int					 subject_len;
1495 	long				 limit_val = -1;/* Integer value of limit */
1496 	long				 flags = 0;		/* Match control flags */
1497 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1498 
1499 	/* Get function parameters and do error checking */
1500 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", &regex, &regex_len,
1501 							  &subject, &subject_len, &limit_val, &flags) == FAILURE) {
1502 		RETURN_FALSE;
1503 	}
1504 
1505 	/* Compile regex or get it from cache. */
1506 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1507 		RETURN_FALSE;
1508 	}
1509 
1510 	pce->refcount++;
1511 	php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
1512 	pce->refcount--;
1513 }
1514 /* }}} */
1515 
1516 /* {{{ php_pcre_split
1517  */
php_pcre_split_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,long limit_val,long flags TSRMLS_DC)1518 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
1519 	long limit_val, long flags TSRMLS_DC)
1520 {
1521 	pcre_extra		*extra = NULL;		/* Holds results of studying */
1522 	pcre			*re_bump = NULL;	/* Regex instance for empty matches */
1523 	pcre_extra		*extra_bump = NULL;	/* Almost dummy */
1524 	pcre_extra		 extra_data;		/* Used locally for exec options */
1525 	int				*offsets;			/* Array of subpattern offsets */
1526 	int				 size_offsets;		/* Size of the offsets array */
1527 	int				 exoptions = 0;		/* Execution options */
1528 	int				 count = 0;			/* Count of matched subpatterns */
1529 	int				 start_offset;		/* Where the new search starts */
1530 	int				 next_offset;		/* End of the last delimiter match + 1 */
1531 	int				 g_notempty = 0;	/* If the match should not be empty */
1532 	char			*last_match;		/* Location of last match */
1533 	int				 rc;
1534 	int				 no_empty;			/* If NO_EMPTY flag is set */
1535 	int				 delim_capture; 	/* If delimiters should be captured */
1536 	int				 offset_capture;	/* If offsets should be captured */
1537 
1538 	no_empty = flags & PREG_SPLIT_NO_EMPTY;
1539 	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1540 	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1541 
1542 	if (limit_val == 0) {
1543 		limit_val = -1;
1544 	}
1545 
1546 	if (extra == NULL) {
1547 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1548 		extra = &extra_data;
1549 	}
1550 	extra->match_limit = PCRE_G(backtrack_limit);
1551 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1552 
1553 	/* Initialize return value */
1554 	array_init(return_value);
1555 
1556 	/* Calculate the size of the offsets array, and allocate memory for it. */
1557 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1558 	if (rc < 0) {
1559 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1560 		RETURN_FALSE;
1561 	}
1562 	size_offsets = (size_offsets + 1) * 3;
1563 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1564 
1565 	/* Start at the beginning of the string */
1566 	start_offset = 0;
1567 	next_offset = 0;
1568 	last_match = subject;
1569 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1570 
1571 	/* Get next piece if no limit or limit not yet reached and something matched*/
1572 	while ((limit_val == -1 || limit_val > 1)) {
1573 		count = pcre_exec(pce->re, extra, subject,
1574 						  subject_len, start_offset,
1575 						  exoptions|g_notempty, offsets, size_offsets);
1576 
1577 		/* the string was already proved to be valid UTF-8 */
1578 		exoptions |= PCRE_NO_UTF8_CHECK;
1579 
1580 		/* Check for too many substrings condition. */
1581 		if (count == 0) {
1582 			php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1583 			count = size_offsets/3;
1584 		}
1585 
1586 		/* If something matched */
1587 		if (count > 0 && (offsets[1] - offsets[0] >= 0)) {
1588 			if (!no_empty || &subject[offsets[0]] != last_match) {
1589 
1590 				if (offset_capture) {
1591 					/* Add (match, offset) pair to the return value */
1592 					add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
1593 				} else {
1594 					/* Add the piece to the return value */
1595 					add_next_index_stringl(return_value, last_match,
1596 								   	   &subject[offsets[0]]-last_match, 1);
1597 				}
1598 
1599 				/* One less left to do */
1600 				if (limit_val != -1)
1601 					limit_val--;
1602 			}
1603 
1604 			last_match = &subject[offsets[1]];
1605 			next_offset = offsets[1];
1606 
1607 			if (delim_capture) {
1608 				int i, match_len;
1609 				for (i = 1; i < count; i++) {
1610 					match_len = offsets[(i<<1)+1] - offsets[i<<1];
1611 					/* If we have matched a delimiter */
1612 					if (!no_empty || match_len > 0) {
1613 						if (offset_capture) {
1614 							add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
1615 						} else {
1616 							add_next_index_stringl(return_value,
1617 												   &subject[offsets[i<<1]],
1618 												   match_len, 1);
1619 						}
1620 					}
1621 				}
1622 			}
1623 		} else if (count == PCRE_ERROR_NOMATCH) {
1624 			/* If we previously set PCRE_NOTEMPTY after a null match,
1625 			   this is not necessarily the end. We need to advance
1626 			   the start offset, and continue. Fudge the offset values
1627 			   to achieve this, unless we're already at the end of the string. */
1628 			if (g_notempty != 0 && start_offset < subject_len) {
1629 				if (pce->compile_options & PCRE_UTF8) {
1630 					if (re_bump == NULL) {
1631 						int dummy;
1632 
1633 						if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
1634 							RETURN_FALSE;
1635 						}
1636 					}
1637 					count = pcre_exec(re_bump, extra_bump, subject,
1638 							  subject_len, start_offset,
1639 							  exoptions, offsets, size_offsets);
1640 					if (count < 1) {
1641 						php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
1642 						RETURN_FALSE;
1643 					}
1644 				} else {
1645 					offsets[0] = start_offset;
1646 					offsets[1] = start_offset + 1;
1647 				}
1648 			} else
1649 				break;
1650 		} else {
1651 			pcre_handle_exec_error(count TSRMLS_CC);
1652 			break;
1653 		}
1654 
1655 		/* If we have matched an empty string, mimic what Perl's /g options does.
1656 		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1657 		   the match again at the same point. If this fails (picked up above) we
1658 		   advance to the next character. */
1659 		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1660 
1661 		/* Advance to the position right after the last full match */
1662 		start_offset = offsets[1];
1663 	}
1664 
1665 
1666 	start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
1667 
1668 	if (!no_empty || start_offset < subject_len)
1669 	{
1670 		if (offset_capture) {
1671 			/* Add the last (match, offset) pair to the return value */
1672 			add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
1673 		} else {
1674 			/* Add the last piece to the return value */
1675 			add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
1676 		}
1677 	}
1678 
1679 
1680 	/* Clean up */
1681 	efree(offsets);
1682 }
1683 /* }}} */
1684 
1685 /* {{{ proto string preg_quote(string str [, string delim_char])
1686    Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)1687 static PHP_FUNCTION(preg_quote)
1688 {
1689 	int		 in_str_len;
1690 	char	*in_str;		/* Input string argument */
1691 	char	*in_str_end;    /* End of the input string */
1692 	int		 delim_len = 0;
1693 	char	*delim = NULL;	/* Additional delimiter argument */
1694 	char	*out_str,		/* Output string with quoted characters */
1695 		 	*p,				/* Iterator for input string */
1696 			*q,				/* Iterator for output string */
1697 			 delim_char=0,	/* Delimiter character to be quoted */
1698 			 c;				/* Current character */
1699 	zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1700 
1701 	/* Get the arguments and check for errors */
1702 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
1703 							  &delim, &delim_len) == FAILURE) {
1704 		return;
1705 	}
1706 
1707 	in_str_end = in_str + in_str_len;
1708 
1709 	/* Nothing to do if we got an empty string */
1710 	if (in_str == in_str_end) {
1711 		RETURN_EMPTY_STRING();
1712 	}
1713 
1714 	if (delim && *delim) {
1715 		delim_char = delim[0];
1716 		quote_delim = 1;
1717 	}
1718 
1719 	/* Allocate enough memory so that even if each character
1720 	   is quoted, we won't run out of room */
1721 	out_str = safe_emalloc(4, in_str_len, 1);
1722 
1723 	/* Go through the string and quote necessary characters */
1724 	for(p = in_str, q = out_str; p != in_str_end; p++) {
1725 		c = *p;
1726 		switch(c) {
1727 			case '.':
1728 			case '\\':
1729 			case '+':
1730 			case '*':
1731 			case '?':
1732 			case '[':
1733 			case '^':
1734 			case ']':
1735 			case '$':
1736 			case '(':
1737 			case ')':
1738 			case '{':
1739 			case '}':
1740 			case '=':
1741 			case '!':
1742 			case '>':
1743 			case '<':
1744 			case '|':
1745 			case ':':
1746 			case '-':
1747 				*q++ = '\\';
1748 				*q++ = c;
1749 				break;
1750 
1751 			case '\0':
1752 				*q++ = '\\';
1753 				*q++ = '0';
1754 				*q++ = '0';
1755 				*q++ = '0';
1756 				break;
1757 
1758 			default:
1759 				if (quote_delim && c == delim_char)
1760 					*q++ = '\\';
1761 				*q++ = c;
1762 				break;
1763 		}
1764 	}
1765 	*q = '\0';
1766 
1767 	/* Reallocate string and return it */
1768 	RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
1769 }
1770 /* }}} */
1771 
1772 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1773    Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)1774 static PHP_FUNCTION(preg_grep)
1775 {
1776 	char				*regex;			/* Regular expression */
1777 	int				 	 regex_len;
1778 	zval				*input;			/* Input array */
1779 	long				 flags = 0;		/* Match control flags */
1780 	pcre_cache_entry	*pce;			/* Compiled regular expression */
1781 
1782 	/* Get arguments and do error checking */
1783 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", &regex, &regex_len,
1784 							  &input, &flags) == FAILURE) {
1785 		return;
1786 	}
1787 
1788 	/* Compile regex or get it from cache. */
1789 	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1790 		RETURN_FALSE;
1791 	}
1792 
1793 	pce->refcount++;
1794 	php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
1795 	pce->refcount--;
1796 }
1797 /* }}} */
1798 
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,long flags TSRMLS_DC)1799 PHPAPI void  php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
1800 {
1801 	zval		   **entry;				/* An entry in the input array */
1802 	pcre_extra		*extra = pce->extra;/* Holds results of studying */
1803 	pcre_extra		 extra_data;		/* Used locally for exec options */
1804 	int				*offsets;			/* Array of subpattern offsets */
1805 	int				 size_offsets;		/* Size of the offsets array */
1806 	int				 count = 0;			/* Count of matched subpatterns */
1807 	char			*string_key;
1808 	ulong			 num_key;
1809 	zend_bool		 invert;			/* Whether to return non-matching
1810 										   entries */
1811 	int				 rc;
1812 
1813 	invert = flags & PREG_GREP_INVERT ? 1 : 0;
1814 
1815 	if (extra == NULL) {
1816 		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1817 		extra = &extra_data;
1818 	}
1819 	extra->match_limit = PCRE_G(backtrack_limit);
1820 	extra->match_limit_recursion = PCRE_G(recursion_limit);
1821 
1822 	/* Calculate the size of the offsets array, and allocate memory for it. */
1823 	rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1824 	if (rc < 0) {
1825 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1826 		RETURN_FALSE;
1827 	}
1828 	size_offsets = (size_offsets + 1) * 3;
1829 	offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1830 
1831 	/* Initialize return array */
1832 	array_init(return_value);
1833 
1834 	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1835 
1836 	/* Go through the input array */
1837 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1838 	while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
1839 		zval subject = **entry;
1840 
1841 		if (Z_TYPE_PP(entry) != IS_STRING) {
1842 			zval_copy_ctor(&subject);
1843 			convert_to_string(&subject);
1844 		}
1845 
1846 		/* Perform the match */
1847 		count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
1848 						  Z_STRLEN(subject), 0,
1849 						  0, offsets, size_offsets);
1850 
1851 		/* Check for too many substrings condition. */
1852 		if (count == 0) {
1853 			php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
1854 			count = size_offsets/3;
1855 		} else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1856 			pcre_handle_exec_error(count TSRMLS_CC);
1857 			break;
1858 		}
1859 
1860 		/* If the entry fits our requirements */
1861 		if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
1862 
1863 			Z_ADDREF_PP(entry);
1864 
1865 			/* Add to return array */
1866 			switch (zend_hash_get_current_key(Z_ARRVAL_P(input), &string_key, &num_key, 0))
1867 			{
1868 				case HASH_KEY_IS_STRING:
1869 					zend_hash_update(Z_ARRVAL_P(return_value), string_key,
1870 									 strlen(string_key)+1, entry, sizeof(zval *), NULL);
1871 					break;
1872 
1873 				case HASH_KEY_IS_LONG:
1874 					zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
1875 										   sizeof(zval *), NULL);
1876 					break;
1877 			}
1878 		}
1879 
1880 		if (Z_TYPE_PP(entry) != IS_STRING) {
1881 			zval_dtor(&subject);
1882 		}
1883 
1884 		zend_hash_move_forward(Z_ARRVAL_P(input));
1885 	}
1886 	zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1887 	/* Clean up */
1888 	efree(offsets);
1889 }
1890 /* }}} */
1891 
1892 /* {{{ proto int preg_last_error()
1893    Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)1894 static PHP_FUNCTION(preg_last_error)
1895 {
1896 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
1897 		return;
1898 	}
1899 
1900 	RETURN_LONG(PCRE_G(error_code));
1901 }
1902 /* }}} */
1903 
1904 /* {{{ module definition structures */
1905 
1906 /* {{{ arginfo */
/* Argument descriptors for the functions registered in pcre_functions.
   In each ZEND_BEGIN_ARG_INFO_EX() the last argument is the number of
   required parameters; a first argument of 1 to ZEND_ARG_INFO() marks a
   parameter that is passed by reference. */

/* preg_match: pattern and subject required; subpatterns is by-reference */
1907 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
1908     ZEND_ARG_INFO(0, pattern)
1909     ZEND_ARG_INFO(0, subject)
1910     ZEND_ARG_INFO(1, subpatterns) /* array */
1911     ZEND_ARG_INFO(0, flags)
1912     ZEND_ARG_INFO(0, offset)
1913 ZEND_END_ARG_INFO()
1914 
/* preg_match_all: same parameter list as preg_match */
1915 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2)
1916     ZEND_ARG_INFO(0, pattern)
1917     ZEND_ARG_INFO(0, subject)
1918     ZEND_ARG_INFO(1, subpatterns) /* array */
1919     ZEND_ARG_INFO(0, flags)
1920     ZEND_ARG_INFO(0, offset)
1921 ZEND_END_ARG_INFO()
1922 
/* preg_replace (also used for preg_filter): three required parameters;
   count is by-reference */
1923 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
1924     ZEND_ARG_INFO(0, regex)
1925     ZEND_ARG_INFO(0, replace)
1926     ZEND_ARG_INFO(0, subject)
1927     ZEND_ARG_INFO(0, limit)
1928     ZEND_ARG_INFO(1, count)
1929 ZEND_END_ARG_INFO()
1930 
/* preg_replace_callback: like preg_replace, with a callback in place of
   the replacement */
1931 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
1932     ZEND_ARG_INFO(0, regex)
1933     ZEND_ARG_INFO(0, callback)
1934     ZEND_ARG_INFO(0, subject)
1935     ZEND_ARG_INFO(0, limit)
1936     ZEND_ARG_INFO(1, count)
1937 ZEND_END_ARG_INFO()
1938 
/* preg_split: pattern and subject required; limit and flags optional */
1939 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
1940     ZEND_ARG_INFO(0, pattern)
1941     ZEND_ARG_INFO(0, subject)
1942     ZEND_ARG_INFO(0, limit)
1943     ZEND_ARG_INFO(0, flags)
1944 ZEND_END_ARG_INFO()
1945 
/* preg_quote: str required; delim_char optional */
1946 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
1947     ZEND_ARG_INFO(0, str)
1948     ZEND_ARG_INFO(0, delim_char)
1949 ZEND_END_ARG_INFO()
1950 
/* preg_grep: regex and input required; flags optional */
1951 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
1952     ZEND_ARG_INFO(0, regex)
1953     ZEND_ARG_INFO(0, input) /* array */
1954     ZEND_ARG_INFO(0, flags)
1955 ZEND_END_ARG_INFO()
1956 
/* preg_last_error: declares no parameters */
1957 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
1958 ZEND_END_ARG_INFO()
1959 /* }}} */
1960 
/* Table of userland functions exported by the extension, each paired with
   its argument descriptor. preg_filter has no descriptor of its own and
   reuses arginfo_preg_replace. The table is terminated by PHP_FE_END. */
1961 static const zend_function_entry pcre_functions[] = {
1962 	PHP_FE(preg_match,				arginfo_preg_match)
1963 	PHP_FE(preg_match_all,			arginfo_preg_match_all)
1964 	PHP_FE(preg_replace,			arginfo_preg_replace)
1965 	PHP_FE(preg_replace_callback,	arginfo_preg_replace_callback)
1966 	PHP_FE(preg_filter,				arginfo_preg_replace)
1967 	PHP_FE(preg_split,				arginfo_preg_split)
1968 	PHP_FE(preg_quote,				arginfo_preg_quote)
1969 	PHP_FE(preg_grep,				arginfo_preg_grep)
1970 	PHP_FE(preg_last_error,			arginfo_preg_last_error)
1971 	PHP_FE_END
1972 };
1973 
/* Module descriptor for the "pcre" extension: ties together the function
   table, the module startup/shutdown and info callbacks, and the
   constructor/destructor for the module globals. The positional meaning
   of the NULL slots noted below follows the zend_module_entry layout. */
1974 zend_module_entry pcre_module_entry = {
1975 	STANDARD_MODULE_HEADER,
1976    "pcre",
1977 	pcre_functions,
1978 	PHP_MINIT(pcre),
1979 	PHP_MSHUTDOWN(pcre),
1980 	NULL,	/* no per-request startup handler */
1981 	NULL,	/* no per-request shutdown handler */
1982 	PHP_MINFO(pcre),
1983 	NO_VERSION_YET,
1984 	PHP_MODULE_GLOBALS(pcre),
1985 	PHP_GINIT(pcre),
1986 	PHP_GSHUTDOWN(pcre),
1987 	NULL,	/* no post-deactivate handler */
1988 	STANDARD_MODULE_PROPERTIES_EX
1989 };
1990 
/* Only when COMPILE_DL_PCRE is defined (build as a loadable module):
   emit the module lookup entry point for pcre_module_entry. */
1991 #ifdef COMPILE_DL_PCRE
1992 ZEND_GET_MODULE(pcre)
1993 #endif
1994 
1995 /* }}} */
1996 
1997 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
1998 
1999 /*
2000  * Local variables:
2001  * tab-width: 4
2002  * c-basic-offset: 4
2003  * End:
2004  * vim600: sw=4 ts=4 fdm=marker
2005  * vim<600: sw=4 ts=4
2006  */
2007