1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2015 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Andrei Zmievski <andrei@php.net> |
16 +----------------------------------------------------------------------+
17 */
18
19 /* $Id$ */
20
21 #include "php.h"
22 #include "php_ini.h"
23 #include "php_globals.h"
24 #include "php_pcre.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/php_smart_str.h"
27
28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
29
30 #include "ext/standard/php_string.h"
31
32 #define PREG_PATTERN_ORDER 1
33 #define PREG_SET_ORDER 2
34 #define PREG_OFFSET_CAPTURE (1<<8)
35
36 #define PREG_SPLIT_NO_EMPTY (1<<0)
37 #define PREG_SPLIT_DELIM_CAPTURE (1<<1)
38 #define PREG_SPLIT_OFFSET_CAPTURE (1<<2)
39
40 #define PREG_REPLACE_EVAL (1<<0)
41
42 #define PREG_GREP_INVERT (1<<0)
43
44 #define PCRE_CACHE_SIZE 4096
45
46 enum {
47 PHP_PCRE_NO_ERROR = 0,
48 PHP_PCRE_INTERNAL_ERROR,
49 PHP_PCRE_BACKTRACK_LIMIT_ERROR,
50 PHP_PCRE_RECURSION_LIMIT_ERROR,
51 PHP_PCRE_BAD_UTF8_ERROR,
52 PHP_PCRE_BAD_UTF8_OFFSET_ERROR
53 };
54
55
ZEND_DECLARE_MODULE_GLOBALS(pcre)56 ZEND_DECLARE_MODULE_GLOBALS(pcre)
57
58
59 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
60 {
61 int preg_code = 0;
62
63 switch (pcre_code) {
64 case PCRE_ERROR_MATCHLIMIT:
65 preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
66 break;
67
68 case PCRE_ERROR_RECURSIONLIMIT:
69 preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
70 break;
71
72 case PCRE_ERROR_BADUTF8:
73 preg_code = PHP_PCRE_BAD_UTF8_ERROR;
74 break;
75
76 case PCRE_ERROR_BADUTF8_OFFSET:
77 preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
78 break;
79
80 default:
81 preg_code = PHP_PCRE_INTERNAL_ERROR;
82 break;
83 }
84
85 PCRE_G(error_code) = preg_code;
86 }
87 /* }}} */
88
php_free_pcre_cache(void * data)89 static void php_free_pcre_cache(void *data) /* {{{ */
90 {
91 pcre_cache_entry *pce = (pcre_cache_entry *) data;
92 if (!pce) return;
93 pefree(pce->re, 1);
94 if (pce->extra) pefree(pce->extra, 1);
95 #if HAVE_SETLOCALE
96 if ((void*)pce->tables) pefree((void*)pce->tables, 1);
97 pefree(pce->locale, 1);
98 #endif
99 }
100 /* }}} */
101
PHP_GINIT_FUNCTION(pcre)102 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
103 {
104 zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
105 pcre_globals->backtrack_limit = 0;
106 pcre_globals->recursion_limit = 0;
107 pcre_globals->error_code = PHP_PCRE_NO_ERROR;
108 }
109 /* }}} */
110
PHP_GSHUTDOWN_FUNCTION(pcre)111 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
112 {
113 zend_hash_destroy(&pcre_globals->pcre_cache);
114 }
115 /* }}} */
116
117 PHP_INI_BEGIN()
118 STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
119 STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
PHP_INI_END()120 PHP_INI_END()
121
122
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Prints the extension's phpinfo() section: a two-row table (support status
 * and the linked PCRE library version) followed by the module's INI entries.
 */
static PHP_MINFO_FUNCTION(pcre)
{
	const char *library_version = pcre_version();

	php_info_print_table_start();
	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled");
	php_info_print_table_row(2, "PCRE Library Version", library_version);
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
133 /* }}} */
134
135 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)136 static PHP_MINIT_FUNCTION(pcre)
137 {
138 REGISTER_INI_ENTRIES();
139
140 REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
141 REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
142 REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
143 REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
144 REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
145 REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
146 REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);
147
148 REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
149 REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
150 REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
151 REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
152 REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
153 REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
154 REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);
155
156 return SUCCESS;
157 }
158 /* }}} */
159
160 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)161 static PHP_MSHUTDOWN_FUNCTION(pcre)
162 {
163 UNREGISTER_INI_ENTRIES();
164
165 return SUCCESS;
166 }
167 /* }}} */
168
169 /* {{{ static pcre_clean_cache */
pcre_clean_cache(void * data,void * arg TSRMLS_DC)170 static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
171 {
172 pcre_cache_entry *pce = (pcre_cache_entry *) data;
173 int *num_clean = (int *)arg;
174
175 if (*num_clean > 0 && !pce->refcount) {
176 (*num_clean)--;
177 return ZEND_HASH_APPLY_REMOVE;
178 } else {
179 return ZEND_HASH_APPLY_KEEP;
180 }
181 }
182 /* }}} */
183
184 /* {{{ static make_subpats_table */
make_subpats_table(int num_subpats,pcre_cache_entry * pce TSRMLS_DC)185 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
186 {
187 pcre_extra *extra = pce->extra;
188 int name_cnt = 0, name_size, ni = 0;
189 int rc;
190 char *name_table;
191 unsigned short name_idx;
192 char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
193
194 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
195 if (rc < 0) {
196 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
197 efree(subpat_names);
198 return NULL;
199 }
200 if (name_cnt > 0) {
201 int rc1, rc2;
202
203 rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
204 rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
205 rc = rc2 ? rc2 : rc1;
206 if (rc < 0) {
207 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
208 efree(subpat_names);
209 return NULL;
210 }
211
212 while (ni++ < name_cnt) {
213 name_idx = 0xff * (unsigned char)name_table[0] + (unsigned char)name_table[1];
214 subpat_names[name_idx] = name_table + 2;
215 if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
216 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
217 efree(subpat_names);
218 return NULL;
219 }
220 name_table += name_size;
221 }
222 }
223
224 return subpat_names;
225 }
226 /* }}} */
227
228 /* {{{ static calculate_unit_length */
229 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE_UTF8. */
calculate_unit_length(pcre_cache_entry * pce,char * start)230 static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
231 {
232 int unit_len;
233
234 if (pce->compile_options & PCRE_UTF8) {
235 char *end = start;
236
237 /* skip continuation bytes */
238 while ((*++end & 0xC0) == 0x80);
239 unit_len = end - start;
240 } else {
241 unit_len = 1;
242 }
243 return unit_len;
244 }
245 /* }}} */
246
247 /* {{{ pcre_get_compiled_regex_cache
248 */
pcre_get_compiled_regex_cache(char * regex,int regex_len TSRMLS_DC)249 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
250 {
251 pcre *re = NULL;
252 pcre_extra *extra;
253 int coptions = 0;
254 int soptions = 0;
255 const char *error;
256 int erroffset;
257 char delimiter;
258 char start_delimiter;
259 char end_delimiter;
260 char *p, *pp;
261 char *pattern;
262 int do_study = 0;
263 int poptions = 0;
264 int count = 0;
265 unsigned const char *tables = NULL;
266 #if HAVE_SETLOCALE
267 char *locale;
268 #endif
269 pcre_cache_entry *pce;
270 pcre_cache_entry new_entry;
271 char *tmp = NULL;
272
273 #if HAVE_SETLOCALE
274 # if defined(PHP_WIN32) && defined(ZTS)
275 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
276 # endif
277 locale = setlocale(LC_CTYPE, NULL);
278 #endif
279
280 /* Try to lookup the cached regex entry, and if successful, just pass
281 back the compiled pattern, otherwise go on and compile it. */
282 if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
283 /*
284 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
285 * is, we flush it and compile the pattern from scratch.
286 */
287 if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
288 zend_hash_clean(&PCRE_G(pcre_cache));
289 } else {
290 #if HAVE_SETLOCALE
291 if (!strcmp(pce->locale, locale)) {
292 #endif
293 return pce;
294 #if HAVE_SETLOCALE
295 }
296 #endif
297 }
298 }
299
300 p = regex;
301
302 /* Parse through the leading whitespace, and display a warning if we
303 get to the end without encountering a delimiter. */
304 while (isspace((int)*(unsigned char *)p)) p++;
305 if (*p == 0) {
306 php_error_docref(NULL TSRMLS_CC, E_WARNING,
307 p < regex + regex_len ? "Null byte in regex" : "Empty regular expression");
308 return NULL;
309 }
310
311 /* Get the delimiter and display a warning if it is alphanumeric
312 or a backslash. */
313 delimiter = *p++;
314 if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
315 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
316 return NULL;
317 }
318
319 start_delimiter = delimiter;
320 if ((pp = strchr("([{< )]}> )]}>", delimiter)))
321 delimiter = pp[5];
322 end_delimiter = delimiter;
323
324 pp = p;
325
326 if (start_delimiter == end_delimiter) {
327 /* We need to iterate through the pattern, searching for the ending delimiter,
328 but skipping the backslashed delimiters. If the ending delimiter is not
329 found, display a warning. */
330 while (*pp != 0) {
331 if (*pp == '\\' && pp[1] != 0) pp++;
332 else if (*pp == delimiter)
333 break;
334 pp++;
335 }
336 } else {
337 /* We iterate through the pattern, searching for the matching ending
338 * delimiter. For each matching starting delimiter, we increment nesting
339 * level, and decrement it for each matching ending delimiter. If we
340 * reach the end of the pattern without matching, display a warning.
341 */
342 int brackets = 1; /* brackets nesting level */
343 while (*pp != 0) {
344 if (*pp == '\\' && pp[1] != 0) pp++;
345 else if (*pp == end_delimiter && --brackets <= 0)
346 break;
347 else if (*pp == start_delimiter)
348 brackets++;
349 pp++;
350 }
351 }
352
353 if (*pp == 0) {
354 if (pp < regex + regex_len) {
355 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
356 } else if (start_delimiter == end_delimiter) {
357 php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
358 } else {
359 php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
360 }
361 return NULL;
362 }
363
364 /* Make a copy of the actual pattern. */
365 pattern = estrndup(p, pp-p);
366
367 /* Move on to the options */
368 pp++;
369
370 /* Parse through the options, setting appropriate flags. Display
371 a warning if we encounter an unknown modifier. */
372 while (pp < regex + regex_len) {
373 switch (*pp++) {
374 /* Perl compatible options */
375 case 'i': coptions |= PCRE_CASELESS; break;
376 case 'm': coptions |= PCRE_MULTILINE; break;
377 case 's': coptions |= PCRE_DOTALL; break;
378 case 'x': coptions |= PCRE_EXTENDED; break;
379
380 /* PCRE specific options */
381 case 'A': coptions |= PCRE_ANCHORED; break;
382 case 'D': coptions |= PCRE_DOLLAR_ENDONLY;break;
383 case 'S': do_study = 1; break;
384 case 'U': coptions |= PCRE_UNGREEDY; break;
385 case 'X': coptions |= PCRE_EXTRA; break;
386 case 'u': coptions |= PCRE_UTF8;
387 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
388 characters, even in UTF-8 mode. However, this can be changed by setting
389 the PCRE_UCP option. */
390 #ifdef PCRE_UCP
391 coptions |= PCRE_UCP;
392 #endif
393 break;
394
395 /* Custom preg options */
396 case 'e': poptions |= PREG_REPLACE_EVAL; break;
397
398 case ' ':
399 case '\n':
400 break;
401
402 default:
403 if (pp[-1]) {
404 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
405 } else {
406 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
407 }
408 efree(pattern);
409 return NULL;
410 }
411 }
412
413 #if HAVE_SETLOCALE
414 if (strcmp(locale, "C"))
415 tables = pcre_maketables();
416 #endif
417
418 /* Compile pattern and display a warning if compilation failed. */
419 re = pcre_compile(pattern,
420 coptions,
421 &error,
422 &erroffset,
423 tables);
424
425 if (re == NULL) {
426 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
427 efree(pattern);
428 if (tables) {
429 pefree((void*)tables, 1);
430 }
431 return NULL;
432 }
433
434 /* If study option was specified, study the pattern and
435 store the result in extra for passing to pcre_exec. */
436 if (do_study) {
437 extra = pcre_study(re, soptions, &error);
438 if (extra) {
439 extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
440 }
441 if (error != NULL) {
442 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
443 }
444 } else {
445 extra = NULL;
446 }
447
448 efree(pattern);
449
450 /*
451 * If we reached cache limit, clean out the items from the head of the list;
452 * these are supposedly the oldest ones (but not necessarily the least used
453 * ones).
454 */
455 if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
456 int num_clean = PCRE_CACHE_SIZE / 8;
457 zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
458 }
459
460 /* Store the compiled pattern and extra info in the cache. */
461 new_entry.re = re;
462 new_entry.extra = extra;
463 new_entry.preg_options = poptions;
464 new_entry.compile_options = coptions;
465 #if HAVE_SETLOCALE
466 new_entry.locale = pestrdup(locale, 1);
467 new_entry.tables = tables;
468 #endif
469 new_entry.refcount = 0;
470
471 /*
472 * Interned strings are not duplicated when stored in HashTable,
473 * but all the interned strings created during HTTP request are removed
474 * at end of request. However PCRE_G(pcre_cache) must be consistent
475 * on the next request as well. So we disable usage of interned strings
476 * as hash keys especually for this table.
477 * See bug #63180
478 */
479 if (IS_INTERNED(regex)) {
480 regex = tmp = estrndup(regex, regex_len);
481 }
482
483 zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
484 sizeof(pcre_cache_entry), (void**)&pce);
485
486 if (tmp) {
487 efree(tmp);
488 }
489
490 return pce;
491 }
492 /* }}} */
493
494 /* {{{ pcre_get_compiled_regex
495 */
pcre_get_compiled_regex(char * regex,pcre_extra ** extra,int * preg_options TSRMLS_DC)496 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
497 {
498 pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
499
500 if (extra) {
501 *extra = pce ? pce->extra : NULL;
502 }
503 if (preg_options) {
504 *preg_options = pce ? pce->preg_options : 0;
505 }
506
507 return pce ? pce->re : NULL;
508 }
509 /* }}} */
510
511 /* {{{ pcre_get_compiled_regex_ex
512 */
pcre_get_compiled_regex_ex(char * regex,pcre_extra ** extra,int * preg_options,int * compile_options TSRMLS_DC)513 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
514 {
515 pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
516
517 if (extra) {
518 *extra = pce ? pce->extra : NULL;
519 }
520 if (preg_options) {
521 *preg_options = pce ? pce->preg_options : 0;
522 }
523 if (compile_options) {
524 *compile_options = pce ? pce->compile_options : 0;
525 }
526
527 return pce ? pce->re : NULL;
528 }
529 /* }}} */
530
531 /* {{{ add_offset_pair */
add_offset_pair(zval * result,char * str,int len,int offset,char * name)532 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
533 {
534 zval *match_pair;
535
536 ALLOC_ZVAL(match_pair);
537 array_init(match_pair);
538 INIT_PZVAL(match_pair);
539
540 /* Add (match, offset) to the return value */
541 add_next_index_stringl(match_pair, str, len, 1);
542 add_next_index_long(match_pair, offset);
543
544 if (name) {
545 zval_add_ref(&match_pair);
546 zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
547 }
548 zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
549 }
550 /* }}} */
551
/* Shared implementation of preg_match() (global == 0) and preg_match_all()
 * (global != 0): parses the userland arguments, fetches the compiled pattern
 * and delegates to php_pcre_match_impl(). Returns FALSE to userland when the
 * arguments cannot be parsed or the pattern cannot be compiled. */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	/* parameters */
	char			 *regex;			/* Regular expression */
	char			 *subject;			/* String to match against */
	int				  regex_len;
	int				  subject_len;
	pcre_cache_entry *pce;				/* Compiled regular expression */
	zval			 *subpats = NULL;	/* Array for subpatterns */
	long			  flags = 0;		/* Match control flags */
	long			  start_offset = 0;	/* Where the new search starts */

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|zll", &regex, &regex_len,
							  &subject, &subject_len, &subpats, &flags, &start_offset) == FAILURE) {
		RETURN_FALSE;
	}

	/* Compile regex or get it from cache. */
	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
		RETURN_FALSE;
	}

	/* Hold a reference during the match so that cache eviction
	   (pcre_clean_cache) skips this entry. */
	pce->refcount++;
	php_pcre_match_impl(pce, subject, subject_len, return_value, subpats,
		global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
	pce->refcount--;
}
579 /* }}} */
580
581 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,zval * subpats,int global,int use_flags,long flags,long start_offset TSRMLS_DC)582 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
583 zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
584 {
585 zval *result_set, /* Holds a set of subpatterns after
586 a global match */
587 **match_sets = NULL; /* An array of sets of matches for each
588 subpattern after a global match */
589 pcre_extra *extra = pce->extra;/* Holds results of studying */
590 pcre_extra extra_data; /* Used locally for exec options */
591 int exoptions = 0; /* Execution options */
592 int count = 0; /* Count of matched subpatterns */
593 int *offsets; /* Array of subpattern offsets */
594 int num_subpats; /* Number of captured subpatterns */
595 int size_offsets; /* Size of the offsets array */
596 int matched; /* Has anything matched */
597 int g_notempty = 0; /* If the match should not be empty */
598 const char **stringlist; /* Holds list of subpatterns */
599 char **subpat_names; /* Array for named subpatterns */
600 int i, rc;
601 int subpats_order; /* Order of subpattern matches */
602 int offset_capture; /* Capture match offsets: yes/no */
603
604 /* Overwrite the passed-in value for subpatterns with an empty array. */
605 if (subpats != NULL) {
606 zval_dtor(subpats);
607 array_init(subpats);
608 }
609
610 subpats_order = global ? PREG_PATTERN_ORDER : 0;
611
612 if (use_flags) {
613 offset_capture = flags & PREG_OFFSET_CAPTURE;
614
615 /*
616 * subpats_order is pre-set to pattern mode so we change it only if
617 * necessary.
618 */
619 if (flags & 0xff) {
620 subpats_order = flags & 0xff;
621 }
622 if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
623 (!global && subpats_order != 0)) {
624 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
625 return;
626 }
627 } else {
628 offset_capture = 0;
629 }
630
631 /* Negative offset counts from the end of the string. */
632 if (start_offset < 0) {
633 start_offset = subject_len + start_offset;
634 if (start_offset < 0) {
635 start_offset = 0;
636 }
637 }
638
639 if (extra == NULL) {
640 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
641 extra = &extra_data;
642 }
643 extra->match_limit = PCRE_G(backtrack_limit);
644 extra->match_limit_recursion = PCRE_G(recursion_limit);
645
646 /* Calculate the size of the offsets array, and allocate memory for it. */
647 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
648 if (rc < 0) {
649 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
650 RETURN_FALSE;
651 }
652 num_subpats++;
653 size_offsets = num_subpats * 3;
654
655 /*
656 * Build a mapping from subpattern numbers to their names. We will always
657 * allocate the table, even though there may be no named subpatterns. This
658 * avoids somewhat more complicated logic in the inner loops.
659 */
660 subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
661 if (!subpat_names) {
662 RETURN_FALSE;
663 }
664
665 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
666 memset(offsets, 0, size_offsets*sizeof(int));
667 /* Allocate match sets array and initialize the values. */
668 if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
669 match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
670 for (i=0; i<num_subpats; i++) {
671 ALLOC_ZVAL(match_sets[i]);
672 array_init(match_sets[i]);
673 INIT_PZVAL(match_sets[i]);
674 }
675 }
676
677 matched = 0;
678 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
679
680 do {
681 /* Execute the regular expression. */
682 count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
683 exoptions|g_notempty, offsets, size_offsets);
684
685 /* the string was already proved to be valid UTF-8 */
686 exoptions |= PCRE_NO_UTF8_CHECK;
687
688 /* Check for too many substrings condition. */
689 if (count == 0) {
690 php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
691 count = size_offsets/3;
692 }
693
694 /* If something has matched */
695 if (count > 0) {
696 matched++;
697
698 /* If subpatterns array has been passed, fill it in with values. */
699 if (subpats != NULL) {
700 /* Try to get the list of substrings and display a warning if failed. */
701 if ((offsets[1] - offsets[0] < 0) || pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
702 efree(subpat_names);
703 efree(offsets);
704 if (match_sets) efree(match_sets);
705 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
706 RETURN_FALSE;
707 }
708
709 if (global) { /* global pattern matching */
710 if (subpats && subpats_order == PREG_PATTERN_ORDER) {
711 /* For each subpattern, insert it into the appropriate array. */
712 for (i = 0; i < count; i++) {
713 if (offset_capture) {
714 add_offset_pair(match_sets[i], (char *)stringlist[i],
715 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
716 } else {
717 add_next_index_stringl(match_sets[i], (char *)stringlist[i],
718 offsets[(i<<1)+1] - offsets[i<<1], 1);
719 }
720 }
721 /*
722 * If the number of captured subpatterns on this run is
723 * less than the total possible number, pad the result
724 * arrays with empty strings.
725 */
726 if (count < num_subpats) {
727 for (; i < num_subpats; i++) {
728 add_next_index_string(match_sets[i], "", 1);
729 }
730 }
731 } else {
732 /* Allocate the result set array */
733 ALLOC_ZVAL(result_set);
734 array_init(result_set);
735 INIT_PZVAL(result_set);
736
737 /* Add all the subpatterns to it */
738 for (i = 0; i < count; i++) {
739 if (offset_capture) {
740 add_offset_pair(result_set, (char *)stringlist[i],
741 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
742 } else {
743 if (subpat_names[i]) {
744 add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
745 offsets[(i<<1)+1] - offsets[i<<1], 1);
746 }
747 add_next_index_stringl(result_set, (char *)stringlist[i],
748 offsets[(i<<1)+1] - offsets[i<<1], 1);
749 }
750 }
751 /* And add it to the output array */
752 zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
753 }
754 } else { /* single pattern matching */
755 /* For each subpattern, insert it into the subpatterns array. */
756 for (i = 0; i < count; i++) {
757 if (offset_capture) {
758 add_offset_pair(subpats, (char *)stringlist[i],
759 offsets[(i<<1)+1] - offsets[i<<1],
760 offsets[i<<1], subpat_names[i]);
761 } else {
762 if (subpat_names[i]) {
763 add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
764 offsets[(i<<1)+1] - offsets[i<<1], 1);
765 }
766 add_next_index_stringl(subpats, (char *)stringlist[i],
767 offsets[(i<<1)+1] - offsets[i<<1], 1);
768 }
769 }
770 }
771
772 pcre_free((void *) stringlist);
773 }
774 } else if (count == PCRE_ERROR_NOMATCH) {
775 /* If we previously set PCRE_NOTEMPTY after a null match,
776 this is not necessarily the end. We need to advance
777 the start offset, and continue. Fudge the offset values
778 to achieve this, unless we're already at the end of the string. */
779 if (g_notempty != 0 && start_offset < subject_len) {
780 int unit_len = calculate_unit_length(pce, subject + start_offset);
781
782 offsets[0] = start_offset;
783 offsets[1] = start_offset + unit_len;
784 } else
785 break;
786 } else {
787 pcre_handle_exec_error(count TSRMLS_CC);
788 break;
789 }
790
791 /* If we have matched an empty string, mimic what Perl's /g options does.
792 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
793 the match again at the same point. If this fails (picked up above) we
794 advance to the next character. */
795 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
796
797 /* Advance to the position right after the last full match */
798 start_offset = offsets[1];
799 } while (global);
800
801 /* Add the match sets to the output array and clean up */
802 if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
803 for (i = 0; i < num_subpats; i++) {
804 if (subpat_names[i]) {
805 zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
806 strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
807 Z_ADDREF_P(match_sets[i]);
808 }
809 zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
810 }
811 efree(match_sets);
812 }
813
814 efree(offsets);
815 efree(subpat_names);
816
817 /* Did we encounter an error? */
818 if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
819 RETVAL_LONG(matched);
820 } else {
821 RETVAL_FALSE;
822 }
823 }
824 /* }}} */
825
826 /* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
827 Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)828 static PHP_FUNCTION(preg_match)
829 {
830 php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
831 }
832 /* }}} */
833
834 /* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
835 Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)836 static PHP_FUNCTION(preg_match_all)
837 {
838 php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
839 }
840 /* }}} */
841
/* {{{ preg_get_backref
 * Tries to parse a backreference at *str, which must point at the introducer
 * character (the caller passes a '\\' or '$'). Accepted forms are <c>N, <c>NN
 * and ${N} / ${NN}, where N is a decimal digit; at most two digits are used.
 *
 * On success stores the group number in *backref, advances *str past the
 * reference and returns 1. On failure returns 0 and leaves *str untouched
 * (*backref may already have been written).
 */
static int preg_get_backref(char **str, int *backref)
{
	char *walk = *str;
	int in_brace = 0;

	/* Nothing follows the introducer */
	if (walk[1] == 0) {
		return 0;
	}

	/* Only the '$' introducer supports the braced form */
	if (walk[0] == '$' && walk[1] == '{') {
		in_brace = 1;
		walk++;
	}
	walk++;

	/* First digit is mandatory */
	if (*walk < '0' || *walk > '9') {
		return 0;
	}
	*backref = *walk - '0';
	walk++;

	/* Optional second digit */
	if (*walk >= '0' && *walk <= '9') {
		*backref = *backref * 10 + *walk - '0';
		walk++;
	}

	if (in_brace) {
		if (*walk != '}') {
			return 0;
		}
		walk++;
	}

	*str = walk;
	return 1;
}
879 /* }}} */
880
881 /* {{{ preg_do_repl_func
882 */
preg_do_repl_func(zval * function,char * subject,int * offsets,char ** subpat_names,int count,char ** result TSRMLS_DC)883 static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, char **result TSRMLS_DC)
884 {
885 zval *retval_ptr; /* Function return value */
886 zval **args[1]; /* Argument to pass to function */
887 zval *subpats; /* Captured subpatterns */
888 int result_len; /* Return value length */
889 int i;
890
891 MAKE_STD_ZVAL(subpats);
892 array_init(subpats);
893 for (i = 0; i < count; i++) {
894 if (subpat_names[i]) {
895 add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
896 }
897 add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
898 }
899 args[0] = &subpats;
900
901 if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
902 convert_to_string_ex(&retval_ptr);
903 *result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
904 result_len = Z_STRLEN_P(retval_ptr);
905 zval_ptr_dtor(&retval_ptr);
906 } else {
907 if (!EG(exception)) {
908 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
909 }
910 result_len = offsets[1] - offsets[0];
911 *result = estrndup(&subject[offsets[0]], result_len);
912 }
913
914 zval_ptr_dtor(&subpats);
915
916 return result_len;
917 }
918 /* }}} */
919
920 /* {{{ preg_do_eval
921 */
preg_do_eval(char * eval_str,int eval_str_len,char * subject,int * offsets,int count,char ** result TSRMLS_DC)922 static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
923 int *offsets, int count, char **result TSRMLS_DC)
924 {
925 zval retval; /* Return value from evaluation */
926 char *eval_str_end, /* End of eval string */
927 *match, /* Current match for a backref */
928 *esc_match, /* Quote-escaped match */
929 *walk, /* Used to walk the code string */
930 *segment, /* Start of segment to append while walking */
931 walk_last; /* Last walked character */
932 int match_len; /* Length of the match */
933 int esc_match_len; /* Length of the quote-escaped match */
934 int result_len; /* Length of the result of the evaluation */
935 int backref; /* Current backref */
936 char *compiled_string_description;
937 smart_str code = {0};
938
939 eval_str_end = eval_str + eval_str_len;
940 walk = segment = eval_str;
941 walk_last = 0;
942
943 while (walk < eval_str_end) {
944 /* If found a backreference.. */
945 if ('\\' == *walk || '$' == *walk) {
946 smart_str_appendl(&code, segment, walk - segment);
947 if (walk_last == '\\') {
948 code.c[code.len-1] = *walk++;
949 segment = walk;
950 walk_last = 0;
951 continue;
952 }
953 segment = walk;
954 if (preg_get_backref(&walk, &backref)) {
955 if (backref < count) {
956 /* Find the corresponding string match and substitute it
957 in instead of the backref */
958 match = subject + offsets[backref<<1];
959 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
960 if (match_len) {
961 esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
962 } else {
963 esc_match = match;
964 esc_match_len = 0;
965 }
966 } else {
967 esc_match = "";
968 esc_match_len = 0;
969 }
970 smart_str_appendl(&code, esc_match, esc_match_len);
971
972 segment = walk;
973
974 /* Clean up and reassign */
975 if (esc_match_len)
976 efree(esc_match);
977 continue;
978 }
979 }
980 walk++;
981 walk_last = walk[-1];
982 }
983 smart_str_appendl(&code, segment, walk - segment);
984 smart_str_0(&code);
985
986 compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
987 /* Run the code */
988 if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
989 efree(compiled_string_description);
990 php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
991 /* zend_error() does not return in this case */
992 }
993 efree(compiled_string_description);
994 convert_to_string(&retval);
995
996 /* Save the return value and its length */
997 *result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
998 result_len = Z_STRLEN(retval);
999
1000 /* Clean up */
1001 zval_dtor(&retval);
1002 smart_str_free(&code);
1003
1004 return result_len;
1005 }
1006 /* }}} */
1007
1008 /* {{{ php_pcre_replace
1009 */
php_pcre_replace(char * regex,int regex_len,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1010 PHPAPI char *php_pcre_replace(char *regex, int regex_len,
1011 char *subject, int subject_len,
1012 zval *replace_val, int is_callable_replace,
1013 int *result_len, int limit, int *replace_count TSRMLS_DC)
1014 {
1015 pcre_cache_entry *pce; /* Compiled regular expression */
1016 char *result; /* Function result */
1017
1018 /* Compile regex or get it from cache. */
1019 if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1020 return NULL;
1021 }
1022 pce->refcount++;
1023 result = php_pcre_replace_impl(pce, subject, subject_len, replace_val,
1024 is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
1025 pce->refcount--;
1026
1027 return result;
1028 }
1029 /* }}} */
1030
1031 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1032 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val,
1033 int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
1034 {
1035 pcre_extra *extra = pce->extra;/* Holds results of studying */
1036 pcre_extra extra_data; /* Used locally for exec options */
1037 int exoptions = 0; /* Execution options */
1038 int count = 0; /* Count of matched subpatterns */
1039 int *offsets; /* Array of subpattern offsets */
1040 char **subpat_names; /* Array for named subpatterns */
1041 int num_subpats; /* Number of captured subpatterns */
1042 int size_offsets; /* Size of the offsets array */
1043 int new_len; /* Length of needed storage */
1044 int alloc_len; /* Actual allocated length */
1045 int eval_result_len=0; /* Length of the eval'ed or
1046 function-returned string */
1047 int match_len; /* Length of the current match */
1048 int backref; /* Backreference number */
1049 int eval; /* If the replacement string should be eval'ed */
1050 int start_offset; /* Where the new search starts */
1051 int g_notempty=0; /* If the match should not be empty */
1052 int replace_len=0; /* Length of replacement string */
1053 char *result, /* Result of replacement */
1054 *replace=NULL, /* Replacement string */
1055 *new_buf, /* Temporary buffer for re-allocation */
1056 *walkbuf, /* Location of current replacement in the result */
1057 *walk, /* Used to walk the replacement string */
1058 *match, /* The current match */
1059 *piece, /* The current piece of subject */
1060 *replace_end=NULL, /* End of replacement string */
1061 *eval_result, /* Result of eval or custom function */
1062 walk_last; /* Last walked character */
1063 int rc;
1064
1065 if (extra == NULL) {
1066 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1067 extra = &extra_data;
1068 }
1069 extra->match_limit = PCRE_G(backtrack_limit);
1070 extra->match_limit_recursion = PCRE_G(recursion_limit);
1071
1072 eval = pce->preg_options & PREG_REPLACE_EVAL;
1073 if (is_callable_replace) {
1074 if (eval) {
1075 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
1076 return NULL;
1077 }
1078 } else {
1079 replace = Z_STRVAL_P(replace_val);
1080 replace_len = Z_STRLEN_P(replace_val);
1081 replace_end = replace + replace_len;
1082 }
1083
1084 if (eval) {
1085 php_error_docref(NULL TSRMLS_CC, E_DEPRECATED, "The /e modifier is deprecated, use preg_replace_callback instead");
1086 }
1087
1088 /* Calculate the size of the offsets array, and allocate memory for it. */
1089 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
1090 if (rc < 0) {
1091 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1092 return NULL;
1093 }
1094 num_subpats++;
1095 size_offsets = num_subpats * 3;
1096
1097 /*
1098 * Build a mapping from subpattern numbers to their names. We will always
1099 * allocate the table, even though there may be no named subpatterns. This
1100 * avoids somewhat more complicated logic in the inner loops.
1101 */
1102 subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
1103 if (!subpat_names) {
1104 return NULL;
1105 }
1106
1107 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1108
1109 alloc_len = 2 * subject_len + 1;
1110 result = safe_emalloc(alloc_len, sizeof(char), 0);
1111
1112 /* Initialize */
1113 match = NULL;
1114 *result_len = 0;
1115 start_offset = 0;
1116 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1117
1118 while (1) {
1119 /* Execute the regular expression. */
1120 count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1121 exoptions|g_notempty, offsets, size_offsets);
1122
1123 /* the string was already proved to be valid UTF-8 */
1124 exoptions |= PCRE_NO_UTF8_CHECK;
1125
1126 /* Check for too many substrings condition. */
1127 if (count == 0) {
1128 php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1129 count = size_offsets/3;
1130 }
1131
1132 piece = subject + start_offset;
1133
1134 if (count > 0 && (offsets[1] - offsets[0] >= 0) && (limit == -1 || limit > 0)) {
1135 if (replace_count) {
1136 ++*replace_count;
1137 }
1138 /* Set the match location in subject */
1139 match = subject + offsets[0];
1140
1141 new_len = *result_len + offsets[0] - start_offset; /* part before the match */
1142
1143 /* If evaluating, do it and add the return string's length */
1144 if (eval) {
1145 eval_result_len = preg_do_eval(replace, replace_len, subject,
1146 offsets, count, &eval_result TSRMLS_CC);
1147 new_len += eval_result_len;
1148 } else if (is_callable_replace) {
1149 /* Use custom function to get replacement string and its length. */
1150 eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, &eval_result TSRMLS_CC);
1151 new_len += eval_result_len;
1152 } else { /* do regular substitution */
1153 walk = replace;
1154 walk_last = 0;
1155 while (walk < replace_end) {
1156 if ('\\' == *walk || '$' == *walk) {
1157 if (walk_last == '\\') {
1158 walk++;
1159 walk_last = 0;
1160 continue;
1161 }
1162 if (preg_get_backref(&walk, &backref)) {
1163 if (backref < count)
1164 new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1165 continue;
1166 }
1167 }
1168 new_len++;
1169 walk++;
1170 walk_last = walk[-1];
1171 }
1172 }
1173
1174 if (new_len + 1 > alloc_len) {
1175 alloc_len = 1 + alloc_len + 2 * new_len;
1176 new_buf = emalloc(alloc_len);
1177 memcpy(new_buf, result, *result_len);
1178 efree(result);
1179 result = new_buf;
1180 }
1181 /* copy the part of the string before the match */
1182 memcpy(&result[*result_len], piece, match-piece);
1183 *result_len += match-piece;
1184
1185 /* copy replacement and backrefs */
1186 walkbuf = result + *result_len;
1187
1188 /* If evaluating or using custom function, copy result to the buffer
1189 * and clean up. */
1190 if (eval || is_callable_replace) {
1191 memcpy(walkbuf, eval_result, eval_result_len);
1192 *result_len += eval_result_len;
1193 STR_FREE(eval_result);
1194 } else { /* do regular backreference copying */
1195 walk = replace;
1196 walk_last = 0;
1197 while (walk < replace_end) {
1198 if ('\\' == *walk || '$' == *walk) {
1199 if (walk_last == '\\') {
1200 *(walkbuf-1) = *walk++;
1201 walk_last = 0;
1202 continue;
1203 }
1204 if (preg_get_backref(&walk, &backref)) {
1205 if (backref < count) {
1206 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1207 memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1208 walkbuf += match_len;
1209 }
1210 continue;
1211 }
1212 }
1213 *walkbuf++ = *walk++;
1214 walk_last = walk[-1];
1215 }
1216 *walkbuf = '\0';
1217 /* increment the result length by how much we've added to the string */
1218 *result_len += walkbuf - (result + *result_len);
1219 }
1220
1221 if (limit != -1)
1222 limit--;
1223
1224 } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1225 /* If we previously set PCRE_NOTEMPTY after a null match,
1226 this is not necessarily the end. We need to advance
1227 the start offset, and continue. Fudge the offset values
1228 to achieve this, unless we're already at the end of the string. */
1229 if (g_notempty != 0 && start_offset < subject_len) {
1230 int unit_len = calculate_unit_length(pce, piece);
1231
1232 offsets[0] = start_offset;
1233 offsets[1] = start_offset + unit_len;
1234 memcpy(&result[*result_len], piece, unit_len);
1235 *result_len += unit_len;
1236 } else {
1237 new_len = *result_len + subject_len - start_offset;
1238 if (new_len + 1 > alloc_len) {
1239 alloc_len = new_len + 1; /* now we know exactly how long it is */
1240 new_buf = safe_emalloc(alloc_len, sizeof(char), 0);
1241 memcpy(new_buf, result, *result_len);
1242 efree(result);
1243 result = new_buf;
1244 }
1245 /* stick that last bit of string on our output */
1246 memcpy(&result[*result_len], piece, subject_len - start_offset);
1247 *result_len += subject_len - start_offset;
1248 result[*result_len] = '\0';
1249 break;
1250 }
1251 } else {
1252 pcre_handle_exec_error(count TSRMLS_CC);
1253 efree(result);
1254 result = NULL;
1255 break;
1256 }
1257
1258 /* If we have matched an empty string, mimic what Perl's /g options does.
1259 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1260 the match again at the same point. If this fails (picked up above) we
1261 advance to the next character. */
1262 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1263
1264 /* Advance to the next piece. */
1265 start_offset = offsets[1];
1266 }
1267
1268 efree(offsets);
1269 efree(subpat_names);
1270
1271 return result;
1272 }
1273 /* }}} */
1274
1275 /* {{{ php_replace_in_subject
1276 */
php_replace_in_subject(zval * regex,zval * replace,zval ** subject,int * result_len,int limit,int is_callable_replace,int * replace_count TSRMLS_DC)1277 static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
1278 {
1279 zval **regex_entry,
1280 **replace_entry = NULL,
1281 *replace_value,
1282 empty_replace;
1283 char *subject_value,
1284 *result;
1285 int subject_len;
1286
1287 /* Make sure we're dealing with strings. */
1288 convert_to_string_ex(subject);
1289 /* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1290 ZVAL_STRINGL(&empty_replace, "", 0, 0);
1291
1292 /* If regex is an array */
1293 if (Z_TYPE_P(regex) == IS_ARRAY) {
1294 /* Duplicate subject string for repeated replacement */
1295 subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
1296 subject_len = Z_STRLEN_PP(subject);
1297 *result_len = subject_len;
1298
1299 zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
1300
1301 replace_value = replace;
1302 if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
1303 zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
1304
1305 /* For each entry in the regex array, get the entry */
1306 while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)®ex_entry) == SUCCESS) {
1307 /* Make sure we're dealing with strings. */
1308 convert_to_string_ex(regex_entry);
1309
1310 /* If replace is an array and not a callable construct */
1311 if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1312 /* Get current entry */
1313 if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
1314 if (!is_callable_replace) {
1315 convert_to_string_ex(replace_entry);
1316 }
1317 replace_value = *replace_entry;
1318 zend_hash_move_forward(Z_ARRVAL_P(replace));
1319 } else {
1320 /* We've run out of replacement strings, so use an empty one */
1321 replace_value = &empty_replace;
1322 }
1323 }
1324
1325 /* Do the actual replacement and put the result back into subject_value
1326 for further replacements. */
1327 if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
1328 Z_STRLEN_PP(regex_entry),
1329 subject_value,
1330 subject_len,
1331 replace_value,
1332 is_callable_replace,
1333 result_len,
1334 limit,
1335 replace_count TSRMLS_CC)) != NULL) {
1336 efree(subject_value);
1337 subject_value = result;
1338 subject_len = *result_len;
1339 } else {
1340 efree(subject_value);
1341 return NULL;
1342 }
1343
1344 zend_hash_move_forward(Z_ARRVAL_P(regex));
1345 }
1346
1347 return subject_value;
1348 } else {
1349 result = php_pcre_replace(Z_STRVAL_P(regex),
1350 Z_STRLEN_P(regex),
1351 Z_STRVAL_PP(subject),
1352 Z_STRLEN_PP(subject),
1353 replace,
1354 is_callable_replace,
1355 result_len,
1356 limit,
1357 replace_count TSRMLS_CC);
1358 return result;
1359 }
1360 }
1361 /* }}} */
1362
1363 /* {{{ preg_replace_impl
1364 */
preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS,int is_callable_replace,int is_filter)1365 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
1366 {
1367 zval **regex,
1368 **replace,
1369 **subject,
1370 **subject_entry,
1371 **zcount = NULL;
1372 char *result;
1373 int result_len;
1374 int limit_val = -1;
1375 long limit = -1;
1376 char *string_key;
1377 ulong num_key;
1378 char *callback_name;
1379 int replace_count=0, old_replace_count;
1380
1381 /* Get function parameters and do error-checking. */
1382 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", ®ex, &replace, &subject, &limit, &zcount) == FAILURE) {
1383 return;
1384 }
1385
1386 if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
1387 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1388 RETURN_FALSE;
1389 }
1390
1391 SEPARATE_ZVAL(replace);
1392 if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
1393 convert_to_string_ex(replace);
1394 }
1395 if (is_callable_replace) {
1396 if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
1397 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
1398 efree(callback_name);
1399 MAKE_COPY_ZVAL(subject, return_value);
1400 return;
1401 }
1402 efree(callback_name);
1403 }
1404
1405 SEPARATE_ZVAL(regex);
1406 SEPARATE_ZVAL(subject);
1407
1408 if (ZEND_NUM_ARGS() > 3) {
1409 limit_val = limit;
1410 }
1411
1412 if (Z_TYPE_PP(regex) != IS_ARRAY)
1413 convert_to_string_ex(regex);
1414
1415 /* if subject is an array */
1416 if (Z_TYPE_PP(subject) == IS_ARRAY) {
1417 array_init(return_value);
1418 zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
1419
1420 /* For each subject entry, convert it to string, then perform replacement
1421 and add the result to the return_value array. */
1422 while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
1423 SEPARATE_ZVAL(subject_entry);
1424 old_replace_count = replace_count;
1425 if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1426 if (!is_filter || replace_count > old_replace_count) {
1427 /* Add to return array */
1428 switch(zend_hash_get_current_key(Z_ARRVAL_PP(subject), &string_key, &num_key, 0))
1429 {
1430 case HASH_KEY_IS_STRING:
1431 add_assoc_stringl(return_value, string_key, result, result_len, 0);
1432 break;
1433
1434 case HASH_KEY_IS_LONG:
1435 add_index_stringl(return_value, num_key, result, result_len, 0);
1436 break;
1437 }
1438 } else {
1439 efree(result);
1440 }
1441 }
1442
1443 zend_hash_move_forward(Z_ARRVAL_PP(subject));
1444 }
1445 } else { /* if subject is not an array */
1446 old_replace_count = replace_count;
1447 if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1448 if (!is_filter || replace_count > old_replace_count) {
1449 RETVAL_STRINGL(result, result_len, 0);
1450 } else {
1451 efree(result);
1452 }
1453 }
1454 }
1455 if (ZEND_NUM_ARGS() > 4) {
1456 zval_dtor(*zcount);
1457 ZVAL_LONG(*zcount, replace_count);
1458 }
1459
1460 }
1461 /* }}} */
1462
1463 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1464 Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)1465 static PHP_FUNCTION(preg_replace)
1466 {
1467 preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
1468 }
1469 /* }}} */
1470
1471 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1472 Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)1473 static PHP_FUNCTION(preg_replace_callback)
1474 {
1475 preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
1476 }
1477 /* }}} */
1478
1479 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1480 Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)1481 static PHP_FUNCTION(preg_filter)
1482 {
1483 preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
1484 }
1485 /* }}} */
1486
1487 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1488 Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)1489 static PHP_FUNCTION(preg_split)
1490 {
1491 char *regex; /* Regular expression */
1492 char *subject; /* String to match against */
1493 int regex_len;
1494 int subject_len;
1495 long limit_val = -1;/* Integer value of limit */
1496 long flags = 0; /* Match control flags */
1497 pcre_cache_entry *pce; /* Compiled regular expression */
1498
1499 /* Get function parameters and do error checking */
1500 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", ®ex, ®ex_len,
1501 &subject, &subject_len, &limit_val, &flags) == FAILURE) {
1502 RETURN_FALSE;
1503 }
1504
1505 /* Compile regex or get it from cache. */
1506 if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1507 RETURN_FALSE;
1508 }
1509
1510 pce->refcount++;
1511 php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
1512 pce->refcount--;
1513 }
1514 /* }}} */
1515
1516 /* {{{ php_pcre_split
1517 */
php_pcre_split_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,long limit_val,long flags TSRMLS_DC)1518 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
1519 long limit_val, long flags TSRMLS_DC)
1520 {
1521 pcre_extra *extra = NULL; /* Holds results of studying */
1522 pcre *re_bump = NULL; /* Regex instance for empty matches */
1523 pcre_extra *extra_bump = NULL; /* Almost dummy */
1524 pcre_extra extra_data; /* Used locally for exec options */
1525 int *offsets; /* Array of subpattern offsets */
1526 int size_offsets; /* Size of the offsets array */
1527 int exoptions = 0; /* Execution options */
1528 int count = 0; /* Count of matched subpatterns */
1529 int start_offset; /* Where the new search starts */
1530 int next_offset; /* End of the last delimiter match + 1 */
1531 int g_notempty = 0; /* If the match should not be empty */
1532 char *last_match; /* Location of last match */
1533 int rc;
1534 int no_empty; /* If NO_EMPTY flag is set */
1535 int delim_capture; /* If delimiters should be captured */
1536 int offset_capture; /* If offsets should be captured */
1537
1538 no_empty = flags & PREG_SPLIT_NO_EMPTY;
1539 delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1540 offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1541
1542 if (limit_val == 0) {
1543 limit_val = -1;
1544 }
1545
1546 if (extra == NULL) {
1547 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1548 extra = &extra_data;
1549 }
1550 extra->match_limit = PCRE_G(backtrack_limit);
1551 extra->match_limit_recursion = PCRE_G(recursion_limit);
1552
1553 /* Initialize return value */
1554 array_init(return_value);
1555
1556 /* Calculate the size of the offsets array, and allocate memory for it. */
1557 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1558 if (rc < 0) {
1559 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1560 RETURN_FALSE;
1561 }
1562 size_offsets = (size_offsets + 1) * 3;
1563 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1564
1565 /* Start at the beginning of the string */
1566 start_offset = 0;
1567 next_offset = 0;
1568 last_match = subject;
1569 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1570
1571 /* Get next piece if no limit or limit not yet reached and something matched*/
1572 while ((limit_val == -1 || limit_val > 1)) {
1573 count = pcre_exec(pce->re, extra, subject,
1574 subject_len, start_offset,
1575 exoptions|g_notempty, offsets, size_offsets);
1576
1577 /* the string was already proved to be valid UTF-8 */
1578 exoptions |= PCRE_NO_UTF8_CHECK;
1579
1580 /* Check for too many substrings condition. */
1581 if (count == 0) {
1582 php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1583 count = size_offsets/3;
1584 }
1585
1586 /* If something matched */
1587 if (count > 0 && (offsets[1] - offsets[0] >= 0)) {
1588 if (!no_empty || &subject[offsets[0]] != last_match) {
1589
1590 if (offset_capture) {
1591 /* Add (match, offset) pair to the return value */
1592 add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
1593 } else {
1594 /* Add the piece to the return value */
1595 add_next_index_stringl(return_value, last_match,
1596 &subject[offsets[0]]-last_match, 1);
1597 }
1598
1599 /* One less left to do */
1600 if (limit_val != -1)
1601 limit_val--;
1602 }
1603
1604 last_match = &subject[offsets[1]];
1605 next_offset = offsets[1];
1606
1607 if (delim_capture) {
1608 int i, match_len;
1609 for (i = 1; i < count; i++) {
1610 match_len = offsets[(i<<1)+1] - offsets[i<<1];
1611 /* If we have matched a delimiter */
1612 if (!no_empty || match_len > 0) {
1613 if (offset_capture) {
1614 add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
1615 } else {
1616 add_next_index_stringl(return_value,
1617 &subject[offsets[i<<1]],
1618 match_len, 1);
1619 }
1620 }
1621 }
1622 }
1623 } else if (count == PCRE_ERROR_NOMATCH) {
1624 /* If we previously set PCRE_NOTEMPTY after a null match,
1625 this is not necessarily the end. We need to advance
1626 the start offset, and continue. Fudge the offset values
1627 to achieve this, unless we're already at the end of the string. */
1628 if (g_notempty != 0 && start_offset < subject_len) {
1629 if (pce->compile_options & PCRE_UTF8) {
1630 if (re_bump == NULL) {
1631 int dummy;
1632
1633 if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
1634 RETURN_FALSE;
1635 }
1636 }
1637 count = pcre_exec(re_bump, extra_bump, subject,
1638 subject_len, start_offset,
1639 exoptions, offsets, size_offsets);
1640 if (count < 1) {
1641 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
1642 RETURN_FALSE;
1643 }
1644 } else {
1645 offsets[0] = start_offset;
1646 offsets[1] = start_offset + 1;
1647 }
1648 } else
1649 break;
1650 } else {
1651 pcre_handle_exec_error(count TSRMLS_CC);
1652 break;
1653 }
1654
1655 /* If we have matched an empty string, mimic what Perl's /g options does.
1656 This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
1657 the match again at the same point. If this fails (picked up above) we
1658 advance to the next character. */
1659 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
1660
1661 /* Advance to the position right after the last full match */
1662 start_offset = offsets[1];
1663 }
1664
1665
1666 start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
1667
1668 if (!no_empty || start_offset < subject_len)
1669 {
1670 if (offset_capture) {
1671 /* Add the last (match, offset) pair to the return value */
1672 add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
1673 } else {
1674 /* Add the last piece to the return value */
1675 add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
1676 }
1677 }
1678
1679
1680 /* Clean up */
1681 efree(offsets);
1682 }
1683 /* }}} */
1684
1685 /* {{{ proto string preg_quote(string str [, string delim_char])
1686 Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)1687 static PHP_FUNCTION(preg_quote)
1688 {
1689 int in_str_len;
1690 char *in_str; /* Input string argument */
1691 char *in_str_end; /* End of the input string */
1692 int delim_len = 0;
1693 char *delim = NULL; /* Additional delimiter argument */
1694 char *out_str, /* Output string with quoted characters */
1695 *p, /* Iterator for input string */
1696 *q, /* Iterator for output string */
1697 delim_char=0, /* Delimiter character to be quoted */
1698 c; /* Current character */
1699 zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1700
1701 /* Get the arguments and check for errors */
1702 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
1703 &delim, &delim_len) == FAILURE) {
1704 return;
1705 }
1706
1707 in_str_end = in_str + in_str_len;
1708
1709 /* Nothing to do if we got an empty string */
1710 if (in_str == in_str_end) {
1711 RETURN_EMPTY_STRING();
1712 }
1713
1714 if (delim && *delim) {
1715 delim_char = delim[0];
1716 quote_delim = 1;
1717 }
1718
1719 /* Allocate enough memory so that even if each character
1720 is quoted, we won't run out of room */
1721 out_str = safe_emalloc(4, in_str_len, 1);
1722
1723 /* Go through the string and quote necessary characters */
1724 for(p = in_str, q = out_str; p != in_str_end; p++) {
1725 c = *p;
1726 switch(c) {
1727 case '.':
1728 case '\\':
1729 case '+':
1730 case '*':
1731 case '?':
1732 case '[':
1733 case '^':
1734 case ']':
1735 case '$':
1736 case '(':
1737 case ')':
1738 case '{':
1739 case '}':
1740 case '=':
1741 case '!':
1742 case '>':
1743 case '<':
1744 case '|':
1745 case ':':
1746 case '-':
1747 *q++ = '\\';
1748 *q++ = c;
1749 break;
1750
1751 case '\0':
1752 *q++ = '\\';
1753 *q++ = '0';
1754 *q++ = '0';
1755 *q++ = '0';
1756 break;
1757
1758 default:
1759 if (quote_delim && c == delim_char)
1760 *q++ = '\\';
1761 *q++ = c;
1762 break;
1763 }
1764 }
1765 *q = '\0';
1766
1767 /* Reallocate string and return it */
1768 RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
1769 }
1770 /* }}} */
1771
1772 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1773 Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)1774 static PHP_FUNCTION(preg_grep)
1775 {
1776 char *regex; /* Regular expression */
1777 int regex_len;
1778 zval *input; /* Input array */
1779 long flags = 0; /* Match control flags */
1780 pcre_cache_entry *pce; /* Compiled regular expression */
1781
1782 /* Get arguments and do error checking */
1783 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", ®ex, ®ex_len,
1784 &input, &flags) == FAILURE) {
1785 return;
1786 }
1787
1788 /* Compile regex or get it from cache. */
1789 if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1790 RETURN_FALSE;
1791 }
1792
1793 pce->refcount++;
1794 php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
1795 pce->refcount--;
1796 }
1797 /* }}} */
1798
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,long flags TSRMLS_DC)1799 PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
1800 {
1801 zval **entry; /* An entry in the input array */
1802 pcre_extra *extra = pce->extra;/* Holds results of studying */
1803 pcre_extra extra_data; /* Used locally for exec options */
1804 int *offsets; /* Array of subpattern offsets */
1805 int size_offsets; /* Size of the offsets array */
1806 int count = 0; /* Count of matched subpatterns */
1807 char *string_key;
1808 ulong num_key;
1809 zend_bool invert; /* Whether to return non-matching
1810 entries */
1811 int rc;
1812
1813 invert = flags & PREG_GREP_INVERT ? 1 : 0;
1814
1815 if (extra == NULL) {
1816 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1817 extra = &extra_data;
1818 }
1819 extra->match_limit = PCRE_G(backtrack_limit);
1820 extra->match_limit_recursion = PCRE_G(recursion_limit);
1821
1822 /* Calculate the size of the offsets array, and allocate memory for it. */
1823 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1824 if (rc < 0) {
1825 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1826 RETURN_FALSE;
1827 }
1828 size_offsets = (size_offsets + 1) * 3;
1829 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1830
1831 /* Initialize return array */
1832 array_init(return_value);
1833
1834 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1835
1836 /* Go through the input array */
1837 zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1838 while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
1839 zval subject = **entry;
1840
1841 if (Z_TYPE_PP(entry) != IS_STRING) {
1842 zval_copy_ctor(&subject);
1843 convert_to_string(&subject);
1844 }
1845
1846 /* Perform the match */
1847 count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
1848 Z_STRLEN(subject), 0,
1849 0, offsets, size_offsets);
1850
1851 /* Check for too many substrings condition. */
1852 if (count == 0) {
1853 php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
1854 count = size_offsets/3;
1855 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1856 pcre_handle_exec_error(count TSRMLS_CC);
1857 break;
1858 }
1859
1860 /* If the entry fits our requirements */
1861 if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
1862
1863 Z_ADDREF_PP(entry);
1864
1865 /* Add to return array */
1866 switch (zend_hash_get_current_key(Z_ARRVAL_P(input), &string_key, &num_key, 0))
1867 {
1868 case HASH_KEY_IS_STRING:
1869 zend_hash_update(Z_ARRVAL_P(return_value), string_key,
1870 strlen(string_key)+1, entry, sizeof(zval *), NULL);
1871 break;
1872
1873 case HASH_KEY_IS_LONG:
1874 zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
1875 sizeof(zval *), NULL);
1876 break;
1877 }
1878 }
1879
1880 if (Z_TYPE_PP(entry) != IS_STRING) {
1881 zval_dtor(&subject);
1882 }
1883
1884 zend_hash_move_forward(Z_ARRVAL_P(input));
1885 }
1886 zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1887 /* Clean up */
1888 efree(offsets);
1889 }
1890 /* }}} */
1891
1892 /* {{{ proto int preg_last_error()
1893 Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)1894 static PHP_FUNCTION(preg_last_error)
1895 {
1896 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
1897 return;
1898 }
1899
1900 RETURN_LONG(PCRE_G(error_code));
1901 }
1902 /* }}} */
1903
1904 /* {{{ module definition structures */
1905
1906 /* {{{ arginfo */
1907 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
1908 ZEND_ARG_INFO(0, pattern)
1909 ZEND_ARG_INFO(0, subject)
1910 ZEND_ARG_INFO(1, subpatterns) /* array */
1911 ZEND_ARG_INFO(0, flags)
1912 ZEND_ARG_INFO(0, offset)
1913 ZEND_END_ARG_INFO()
1914
1915 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2)
1916 ZEND_ARG_INFO(0, pattern)
1917 ZEND_ARG_INFO(0, subject)
1918 ZEND_ARG_INFO(1, subpatterns) /* array */
1919 ZEND_ARG_INFO(0, flags)
1920 ZEND_ARG_INFO(0, offset)
1921 ZEND_END_ARG_INFO()
1922
1923 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
1924 ZEND_ARG_INFO(0, regex)
1925 ZEND_ARG_INFO(0, replace)
1926 ZEND_ARG_INFO(0, subject)
1927 ZEND_ARG_INFO(0, limit)
1928 ZEND_ARG_INFO(1, count)
1929 ZEND_END_ARG_INFO()
1930
1931 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
1932 ZEND_ARG_INFO(0, regex)
1933 ZEND_ARG_INFO(0, callback)
1934 ZEND_ARG_INFO(0, subject)
1935 ZEND_ARG_INFO(0, limit)
1936 ZEND_ARG_INFO(1, count)
1937 ZEND_END_ARG_INFO()
1938
1939 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
1940 ZEND_ARG_INFO(0, pattern)
1941 ZEND_ARG_INFO(0, subject)
1942 ZEND_ARG_INFO(0, limit)
1943 ZEND_ARG_INFO(0, flags)
1944 ZEND_END_ARG_INFO()
1945
1946 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
1947 ZEND_ARG_INFO(0, str)
1948 ZEND_ARG_INFO(0, delim_char)
1949 ZEND_END_ARG_INFO()
1950
1951 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
1952 ZEND_ARG_INFO(0, regex)
1953 ZEND_ARG_INFO(0, input) /* array */
1954 ZEND_ARG_INFO(0, flags)
1955 ZEND_END_ARG_INFO()
1956
1957 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
1958 ZEND_END_ARG_INFO()
1959 /* }}} */
1960
1961 static const zend_function_entry pcre_functions[] = {
1962 PHP_FE(preg_match, arginfo_preg_match)
1963 PHP_FE(preg_match_all, arginfo_preg_match_all)
1964 PHP_FE(preg_replace, arginfo_preg_replace)
1965 PHP_FE(preg_replace_callback, arginfo_preg_replace_callback)
1966 PHP_FE(preg_filter, arginfo_preg_replace)
1967 PHP_FE(preg_split, arginfo_preg_split)
1968 PHP_FE(preg_quote, arginfo_preg_quote)
1969 PHP_FE(preg_grep, arginfo_preg_grep)
1970 PHP_FE(preg_last_error, arginfo_preg_last_error)
1971 PHP_FE_END
1972 };
1973
1974 zend_module_entry pcre_module_entry = {
1975 STANDARD_MODULE_HEADER,
1976 "pcre",
1977 pcre_functions,
1978 PHP_MINIT(pcre),
1979 PHP_MSHUTDOWN(pcre),
1980 NULL,
1981 NULL,
1982 PHP_MINFO(pcre),
1983 NO_VERSION_YET,
1984 PHP_MODULE_GLOBALS(pcre),
1985 PHP_GINIT(pcre),
1986 PHP_GSHUTDOWN(pcre),
1987 NULL,
1988 STANDARD_MODULE_PROPERTIES_EX
1989 };
1990
/* Only when COMPILE_DL_PCRE is defined: emit the module lookup entry
   point via ZEND_GET_MODULE. */
#ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)
#endif
1994
1995 /* }}} */
1996
1997 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
1998
1999 /*
2000 * Local variables:
2001 * tab-width: 4
2002 * c-basic-offset: 4
2003 * End:
2004 * vim600: sw=4 ts=4 fdm=marker
2005 * vim<600: sw=4 ts=4
2006 */
2007