1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2016 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Andrei Zmievski <andrei@php.net> |
16 +----------------------------------------------------------------------+
17 */
18
19 /* $Id$ */
20
21 #include "php.h"
22 #include "php_ini.h"
23 #include "php_globals.h"
24 #include "php_pcre.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/php_smart_str.h"
27
28 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
29
30 #include "ext/standard/php_string.h"
31
/* Result ordering modes; these occupy the low byte of the match flags. */
#define PREG_PATTERN_ORDER			1
#define PREG_SET_ORDER				2

/* Match flag bit: report each capture together with its offset. */
#define PREG_OFFSET_CAPTURE			(1 << 8)

/* Flag bits exported to userland as the PREG_SPLIT_* constants. */
#define PREG_SPLIT_NO_EMPTY			(1 << 0)
#define PREG_SPLIT_DELIM_CAPTURE	(1 << 1)
#define PREG_SPLIT_OFFSET_CAPTURE	(1 << 2)

/* Internal preg option recorded when the 'e' modifier is present. */
#define PREG_REPLACE_EVAL			(1 << 0)

/* Flag bit exported to userland as PREG_GREP_INVERT. */
#define PREG_GREP_INVERT			(1 << 0)

/* Number of cached compiled patterns at which the cache gets trimmed. */
#define PCRE_CACHE_SIZE 4096

/* not fully functional workaround for libpcre < 8.0, see bug #70232 */
#ifndef PCRE_NOTEMPTY_ATSTART
# define PCRE_NOTEMPTY_ATSTART PCRE_NOTEMPTY
#endif
50
/* Error codes stored in PCRE_G(error_code) and exported as the PREG_*_ERROR
   userland constants. The numeric values are part of the userland contract. */
enum {
	PHP_PCRE_NO_ERROR              = 0,
	PHP_PCRE_INTERNAL_ERROR        = 1,
	PHP_PCRE_BACKTRACK_LIMIT_ERROR = 2,
	PHP_PCRE_RECURSION_LIMIT_ERROR = 3,
	PHP_PCRE_BAD_UTF8_ERROR        = 4,
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR = 5
};
59
60
ZEND_DECLARE_MODULE_GLOBALS(pcre)61 ZEND_DECLARE_MODULE_GLOBALS(pcre)
62
63
64 static void pcre_handle_exec_error(int pcre_code TSRMLS_DC) /* {{{ */
65 {
66 int preg_code = 0;
67
68 switch (pcre_code) {
69 case PCRE_ERROR_MATCHLIMIT:
70 preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
71 break;
72
73 case PCRE_ERROR_RECURSIONLIMIT:
74 preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
75 break;
76
77 case PCRE_ERROR_BADUTF8:
78 preg_code = PHP_PCRE_BAD_UTF8_ERROR;
79 break;
80
81 case PCRE_ERROR_BADUTF8_OFFSET:
82 preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
83 break;
84
85 default:
86 preg_code = PHP_PCRE_INTERNAL_ERROR;
87 break;
88 }
89
90 PCRE_G(error_code) = preg_code;
91 }
92 /* }}} */
93
php_free_pcre_cache(void * data)94 static void php_free_pcre_cache(void *data) /* {{{ */
95 {
96 pcre_cache_entry *pce = (pcre_cache_entry *) data;
97 if (!pce) return;
98 pefree(pce->re, 1);
99 if (pce->extra) pefree(pce->extra, 1);
100 #if HAVE_SETLOCALE
101 if ((void*)pce->tables) pefree((void*)pce->tables, 1);
102 pefree(pce->locale, 1);
103 #endif
104 }
105 /* }}} */
106
PHP_GINIT_FUNCTION(pcre)107 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
108 {
109 zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
110 pcre_globals->backtrack_limit = 0;
111 pcre_globals->recursion_limit = 0;
112 pcre_globals->error_code = PHP_PCRE_NO_ERROR;
113 }
114 /* }}} */
115
PHP_GSHUTDOWN_FUNCTION(pcre)116 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
117 {
118 zend_hash_destroy(&pcre_globals->pcre_cache);
119 }
120 /* }}} */
121
122 PHP_INI_BEGIN()
123 STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
124 STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
PHP_INI_END()125 PHP_INI_END()
126
127
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Prints the extension's info table (support flag and the library version
 * reported by pcre_version()), followed by the module's INI entries.
 */
static PHP_MINFO_FUNCTION(pcre)
{
	php_info_print_table_start();
	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled");
	php_info_print_table_row(2, "PCRE Library Version", pcre_version());
	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
139
140 /* {{{ PHP_MINIT_FUNCTION(pcre) */
PHP_MINIT_FUNCTION(pcre)141 static PHP_MINIT_FUNCTION(pcre)
142 {
143 REGISTER_INI_ENTRIES();
144
145 REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
146 REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
147 REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
148 REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
149 REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
150 REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
151 REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);
152
153 REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
154 REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
155 REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
156 REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
157 REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
158 REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
159 REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);
160
161 return SUCCESS;
162 }
163 /* }}} */
164
165 /* {{{ PHP_MSHUTDOWN_FUNCTION(pcre) */
PHP_MSHUTDOWN_FUNCTION(pcre)166 static PHP_MSHUTDOWN_FUNCTION(pcre)
167 {
168 UNREGISTER_INI_ENTRIES();
169
170 return SUCCESS;
171 }
172 /* }}} */
173
174 /* {{{ static pcre_clean_cache */
pcre_clean_cache(void * data,void * arg TSRMLS_DC)175 static int pcre_clean_cache(void *data, void *arg TSRMLS_DC)
176 {
177 pcre_cache_entry *pce = (pcre_cache_entry *) data;
178 int *num_clean = (int *)arg;
179
180 if (*num_clean > 0 && !pce->refcount) {
181 (*num_clean)--;
182 return ZEND_HASH_APPLY_REMOVE;
183 } else {
184 return ZEND_HASH_APPLY_KEEP;
185 }
186 }
187 /* }}} */
188
189 /* {{{ static make_subpats_table */
make_subpats_table(int num_subpats,pcre_cache_entry * pce TSRMLS_DC)190 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce TSRMLS_DC)
191 {
192 pcre_extra *extra = pce->extra;
193 int name_cnt = 0, name_size, ni = 0;
194 int rc;
195 char *name_table;
196 unsigned short name_idx;
197 char **subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
198
199 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMECOUNT, &name_cnt);
200 if (rc < 0) {
201 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
202 efree(subpat_names);
203 return NULL;
204 }
205 if (name_cnt > 0) {
206 int rc1, rc2;
207
208 rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
209 rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
210 rc = rc2 ? rc2 : rc1;
211 if (rc < 0) {
212 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
213 efree(subpat_names);
214 return NULL;
215 }
216
217 while (ni++ < name_cnt) {
218 name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
219 subpat_names[name_idx] = name_table + 2;
220 if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
221 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Numeric named subpatterns are not allowed");
222 efree(subpat_names);
223 return NULL;
224 }
225 name_table += name_size;
226 }
227 }
228
229 return subpat_names;
230 }
231 /* }}} */
232
233 /* {{{ static calculate_unit_length */
234 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE_UTF8. */
calculate_unit_length(pcre_cache_entry * pce,char * start)235 static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
236 {
237 int unit_len;
238
239 if (pce->compile_options & PCRE_UTF8) {
240 char *end = start;
241
242 /* skip continuation bytes */
243 while ((*++end & 0xC0) == 0x80);
244 unit_len = end - start;
245 } else {
246 unit_len = 1;
247 }
248 return unit_len;
249 }
250 /* }}} */
251
252 /* {{{ pcre_get_compiled_regex_cache
253 */
pcre_get_compiled_regex_cache(char * regex,int regex_len TSRMLS_DC)254 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(char *regex, int regex_len TSRMLS_DC)
255 {
256 pcre *re = NULL;
257 pcre_extra *extra;
258 int coptions = 0;
259 int soptions = 0;
260 const char *error;
261 int erroffset;
262 char delimiter;
263 char start_delimiter;
264 char end_delimiter;
265 char *p, *pp;
266 char *pattern;
267 int do_study = 0;
268 int poptions = 0;
269 int count = 0;
270 unsigned const char *tables = NULL;
271 #if HAVE_SETLOCALE
272 char *locale;
273 #endif
274 pcre_cache_entry *pce;
275 pcre_cache_entry new_entry;
276 char *tmp = NULL;
277
278 #if HAVE_SETLOCALE
279 # if defined(PHP_WIN32) && defined(ZTS)
280 _configthreadlocale(_ENABLE_PER_THREAD_LOCALE);
281 # endif
282 locale = setlocale(LC_CTYPE, NULL);
283 #endif
284
285 /* Try to lookup the cached regex entry, and if successful, just pass
286 back the compiled pattern, otherwise go on and compile it. */
287 if (zend_hash_find(&PCRE_G(pcre_cache), regex, regex_len+1, (void **)&pce) == SUCCESS) {
288 /*
289 * We use a quick pcre_fullinfo() check to see whether cache is corrupted, and if it
290 * is, we flush it and compile the pattern from scratch.
291 */
292 if (pcre_fullinfo(pce->re, NULL, PCRE_INFO_CAPTURECOUNT, &count) == PCRE_ERROR_BADMAGIC) {
293 zend_hash_clean(&PCRE_G(pcre_cache));
294 } else {
295 #if HAVE_SETLOCALE
296 if (!strcmp(pce->locale, locale)) {
297 #endif
298 return pce;
299 #if HAVE_SETLOCALE
300 }
301 #endif
302 }
303 }
304
305 p = regex;
306
307 /* Parse through the leading whitespace, and display a warning if we
308 get to the end without encountering a delimiter. */
309 while (isspace((int)*(unsigned char *)p)) p++;
310 if (*p == 0) {
311 php_error_docref(NULL TSRMLS_CC, E_WARNING,
312 p < regex + regex_len ? "Null byte in regex" : "Empty regular expression");
313 return NULL;
314 }
315
316 /* Get the delimiter and display a warning if it is alphanumeric
317 or a backslash. */
318 delimiter = *p++;
319 if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
320 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Delimiter must not be alphanumeric or backslash");
321 return NULL;
322 }
323
324 start_delimiter = delimiter;
325 if ((pp = strchr("([{< )]}> )]}>", delimiter)))
326 delimiter = pp[5];
327 end_delimiter = delimiter;
328
329 pp = p;
330
331 if (start_delimiter == end_delimiter) {
332 /* We need to iterate through the pattern, searching for the ending delimiter,
333 but skipping the backslashed delimiters. If the ending delimiter is not
334 found, display a warning. */
335 while (*pp != 0) {
336 if (*pp == '\\' && pp[1] != 0) pp++;
337 else if (*pp == delimiter)
338 break;
339 pp++;
340 }
341 } else {
342 /* We iterate through the pattern, searching for the matching ending
343 * delimiter. For each matching starting delimiter, we increment nesting
344 * level, and decrement it for each matching ending delimiter. If we
345 * reach the end of the pattern without matching, display a warning.
346 */
347 int brackets = 1; /* brackets nesting level */
348 while (*pp != 0) {
349 if (*pp == '\\' && pp[1] != 0) pp++;
350 else if (*pp == end_delimiter && --brackets <= 0)
351 break;
352 else if (*pp == start_delimiter)
353 brackets++;
354 pp++;
355 }
356 }
357
358 if (*pp == 0) {
359 if (pp < regex + regex_len) {
360 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
361 } else if (start_delimiter == end_delimiter) {
362 php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending delimiter '%c' found", delimiter);
363 } else {
364 php_error_docref(NULL TSRMLS_CC,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
365 }
366 return NULL;
367 }
368
369 /* Make a copy of the actual pattern. */
370 pattern = estrndup(p, pp-p);
371
372 /* Move on to the options */
373 pp++;
374
375 /* Parse through the options, setting appropriate flags. Display
376 a warning if we encounter an unknown modifier. */
377 while (pp < regex + regex_len) {
378 switch (*pp++) {
379 /* Perl compatible options */
380 case 'i': coptions |= PCRE_CASELESS; break;
381 case 'm': coptions |= PCRE_MULTILINE; break;
382 case 's': coptions |= PCRE_DOTALL; break;
383 case 'x': coptions |= PCRE_EXTENDED; break;
384
385 /* PCRE specific options */
386 case 'A': coptions |= PCRE_ANCHORED; break;
387 case 'D': coptions |= PCRE_DOLLAR_ENDONLY;break;
388 case 'S': do_study = 1; break;
389 case 'U': coptions |= PCRE_UNGREEDY; break;
390 case 'X': coptions |= PCRE_EXTRA; break;
391 case 'u': coptions |= PCRE_UTF8;
392 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
393 characters, even in UTF-8 mode. However, this can be changed by setting
394 the PCRE_UCP option. */
395 #ifdef PCRE_UCP
396 coptions |= PCRE_UCP;
397 #endif
398 break;
399 case 'J': coptions |= PCRE_DUPNAMES; break;
400
401 /* Custom preg options */
402 case 'e': poptions |= PREG_REPLACE_EVAL; break;
403
404 case ' ':
405 case '\n':
406 break;
407
408 default:
409 if (pp[-1]) {
410 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Unknown modifier '%c'", pp[-1]);
411 } else {
412 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Null byte in regex");
413 }
414 efree(pattern);
415 return NULL;
416 }
417 }
418
419 #if HAVE_SETLOCALE
420 if (strcmp(locale, "C"))
421 tables = pcre_maketables();
422 #endif
423
424 /* Compile pattern and display a warning if compilation failed. */
425 re = pcre_compile(pattern,
426 coptions,
427 &error,
428 &erroffset,
429 tables);
430
431 if (re == NULL) {
432 php_error_docref(NULL TSRMLS_CC,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
433 efree(pattern);
434 if (tables) {
435 pefree((void*)tables, 1);
436 }
437 return NULL;
438 }
439
440 /* If study option was specified, study the pattern and
441 store the result in extra for passing to pcre_exec. */
442 if (do_study) {
443 extra = pcre_study(re, soptions, &error);
444 if (extra) {
445 extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
446 }
447 if (error != NULL) {
448 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Error while studying pattern");
449 }
450 } else {
451 extra = NULL;
452 }
453
454 efree(pattern);
455
456 /*
457 * If we reached cache limit, clean out the items from the head of the list;
458 * these are supposedly the oldest ones (but not necessarily the least used
459 * ones).
460 */
461 if (zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
462 int num_clean = PCRE_CACHE_SIZE / 8;
463 zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean TSRMLS_CC);
464 }
465
466 /* Store the compiled pattern and extra info in the cache. */
467 new_entry.re = re;
468 new_entry.extra = extra;
469 new_entry.preg_options = poptions;
470 new_entry.compile_options = coptions;
471 #if HAVE_SETLOCALE
472 new_entry.locale = pestrdup(locale, 1);
473 new_entry.tables = tables;
474 #endif
475 new_entry.refcount = 0;
476
477 /*
478 * Interned strings are not duplicated when stored in HashTable,
479 * but all the interned strings created during HTTP request are removed
480 * at end of request. However PCRE_G(pcre_cache) must be consistent
481 * on the next request as well. So we disable usage of interned strings
482 * as hash keys especually for this table.
483 * See bug #63180
484 */
485 if (IS_INTERNED(regex)) {
486 regex = tmp = estrndup(regex, regex_len);
487 }
488
489 zend_hash_update(&PCRE_G(pcre_cache), regex, regex_len+1, (void *)&new_entry,
490 sizeof(pcre_cache_entry), (void**)&pce);
491
492 if (tmp) {
493 efree(tmp);
494 }
495
496 return pce;
497 }
498 /* }}} */
499
500 /* {{{ pcre_get_compiled_regex
501 */
pcre_get_compiled_regex(char * regex,pcre_extra ** extra,int * preg_options TSRMLS_DC)502 PHPAPI pcre* pcre_get_compiled_regex(char *regex, pcre_extra **extra, int *preg_options TSRMLS_DC)
503 {
504 pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
505
506 if (extra) {
507 *extra = pce ? pce->extra : NULL;
508 }
509 if (preg_options) {
510 *preg_options = pce ? pce->preg_options : 0;
511 }
512
513 return pce ? pce->re : NULL;
514 }
515 /* }}} */
516
517 /* {{{ pcre_get_compiled_regex_ex
518 */
pcre_get_compiled_regex_ex(char * regex,pcre_extra ** extra,int * preg_options,int * compile_options TSRMLS_DC)519 PHPAPI pcre* pcre_get_compiled_regex_ex(char *regex, pcre_extra **extra, int *preg_options, int *compile_options TSRMLS_DC)
520 {
521 pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex, strlen(regex) TSRMLS_CC);
522
523 if (extra) {
524 *extra = pce ? pce->extra : NULL;
525 }
526 if (preg_options) {
527 *preg_options = pce ? pce->preg_options : 0;
528 }
529 if (compile_options) {
530 *compile_options = pce ? pce->compile_options : 0;
531 }
532
533 return pce ? pce->re : NULL;
534 }
535 /* }}} */
536
537 /* {{{ add_offset_pair */
add_offset_pair(zval * result,char * str,int len,int offset,char * name)538 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
539 {
540 zval *match_pair;
541
542 ALLOC_ZVAL(match_pair);
543 array_init(match_pair);
544 INIT_PZVAL(match_pair);
545
546 /* Add (match, offset) to the return value */
547 add_next_index_stringl(match_pair, str, len, 1);
548 add_next_index_long(match_pair, offset);
549
550 if (name) {
551 zval_add_ref(&match_pair);
552 zend_hash_update(Z_ARRVAL_P(result), name, strlen(name)+1, &match_pair, sizeof(zval *), NULL);
553 }
554 zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair, sizeof(zval *), NULL);
555 }
556 /* }}} */
557
/* {{{ php_do_pcre_match
 * Shared implementation of preg_match() (global == 0) and preg_match_all()
 * (global != 0): parses the userland arguments, fetches the compiled pattern
 * and delegates to php_pcre_match_impl(). Returns FALSE to userland when the
 * arguments cannot be parsed or the pattern cannot be compiled.
 */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global)
{
	/* parameters */
	char			 *regex;			/* Regular expression */
	char			 *subject;			/* String to match against */
	int				  regex_len;
	int				  subject_len;
	pcre_cache_entry *pce;				/* Compiled regular expression */
	zval			 *subpats = NULL;	/* Array for subpatterns */
	long			  flags = 0;		/* Match control flags */
	long			  start_offset = 0;	/* Where the new search starts */

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|zll", &regex, &regex_len,
							  &subject, &subject_len, &subpats, &flags, &start_offset) == FAILURE) {
		RETURN_FALSE;
	}

	/* Compile regex or get it from cache. */
	if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
		RETURN_FALSE;
	}

	/* Hold a reference for the duration of the match so that cache trimming
	   (pcre_clean_cache) cannot evict the entry while it is in use. The flags
	   argument is honoured only when it was actually passed (4+ arguments). */
	pce->refcount++;
	php_pcre_match_impl(pce, subject, subject_len, return_value, subpats,
		global, ZEND_NUM_ARGS() >= 4, flags, start_offset TSRMLS_CC);
	pce->refcount--;
}
/* }}} */
586
587 /* {{{ php_pcre_match_impl() */
php_pcre_match_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,zval * subpats,int global,int use_flags,long flags,long start_offset TSRMLS_DC)588 PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
589 zval *subpats, int global, int use_flags, long flags, long start_offset TSRMLS_DC)
590 {
591 zval *result_set, /* Holds a set of subpatterns after
592 a global match */
593 **match_sets = NULL; /* An array of sets of matches for each
594 subpattern after a global match */
595 pcre_extra *extra = pce->extra;/* Holds results of studying */
596 pcre_extra extra_data; /* Used locally for exec options */
597 int exoptions = 0; /* Execution options */
598 int count = 0; /* Count of matched subpatterns */
599 int *offsets; /* Array of subpattern offsets */
600 int num_subpats; /* Number of captured subpatterns */
601 int size_offsets; /* Size of the offsets array */
602 int matched; /* Has anything matched */
603 int g_notempty = 0; /* If the match should not be empty */
604 const char **stringlist; /* Holds list of subpatterns */
605 char **subpat_names; /* Array for named subpatterns */
606 int i, rc;
607 int subpats_order; /* Order of subpattern matches */
608 int offset_capture; /* Capture match offsets: yes/no */
609 unsigned char *mark = NULL; /* Target for MARK name */
610 zval *marks = NULL; /* Array of marks for PREG_PATTERN_ORDER */
611
612 /* Overwrite the passed-in value for subpatterns with an empty array. */
613 if (subpats != NULL) {
614 zval_dtor(subpats);
615 array_init(subpats);
616 }
617
618 subpats_order = global ? PREG_PATTERN_ORDER : 0;
619
620 if (use_flags) {
621 offset_capture = flags & PREG_OFFSET_CAPTURE;
622
623 /*
624 * subpats_order is pre-set to pattern mode so we change it only if
625 * necessary.
626 */
627 if (flags & 0xff) {
628 subpats_order = flags & 0xff;
629 }
630 if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
631 (!global && subpats_order != 0)) {
632 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Invalid flags specified");
633 return;
634 }
635 } else {
636 offset_capture = 0;
637 }
638
639 /* Negative offset counts from the end of the string. */
640 if (start_offset < 0) {
641 start_offset = subject_len + start_offset;
642 if (start_offset < 0) {
643 start_offset = 0;
644 }
645 }
646
647 if (extra == NULL) {
648 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
649 extra = &extra_data;
650 }
651 extra->match_limit = PCRE_G(backtrack_limit);
652 extra->match_limit_recursion = PCRE_G(recursion_limit);
653 #ifdef PCRE_EXTRA_MARK
654 extra->mark = &mark;
655 extra->flags |= PCRE_EXTRA_MARK;
656 #endif
657
658 /* Calculate the size of the offsets array, and allocate memory for it. */
659 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
660 if (rc < 0) {
661 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
662 RETURN_FALSE;
663 }
664 num_subpats++;
665 size_offsets = num_subpats * 3;
666
667 /*
668 * Build a mapping from subpattern numbers to their names. We will always
669 * allocate the table, even though there may be no named subpatterns. This
670 * avoids somewhat more complicated logic in the inner loops.
671 */
672 subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
673 if (!subpat_names) {
674 RETURN_FALSE;
675 }
676
677 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
678 memset(offsets, 0, size_offsets*sizeof(int));
679 /* Allocate match sets array and initialize the values. */
680 if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
681 match_sets = (zval **)safe_emalloc(num_subpats, sizeof(zval *), 0);
682 for (i=0; i<num_subpats; i++) {
683 ALLOC_ZVAL(match_sets[i]);
684 array_init(match_sets[i]);
685 INIT_PZVAL(match_sets[i]);
686 }
687 }
688
689 matched = 0;
690 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
691
692 do {
693 /* Execute the regular expression. */
694 count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
695 exoptions|g_notempty, offsets, size_offsets);
696
697 /* the string was already proved to be valid UTF-8 */
698 exoptions |= PCRE_NO_UTF8_CHECK;
699
700 /* Check for too many substrings condition. */
701 if (count == 0) {
702 php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
703 count = size_offsets/3;
704 }
705
706 /* If something has matched */
707 if (count > 0) {
708 matched++;
709
710 /* If subpatterns array has been passed, fill it in with values. */
711 if (subpats != NULL) {
712 /* Try to get the list of substrings and display a warning if failed. */
713 if ((offsets[1] - offsets[0] < 0) || pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
714 efree(subpat_names);
715 efree(offsets);
716 if (match_sets) efree(match_sets);
717 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Get subpatterns list failed");
718 RETURN_FALSE;
719 }
720
721 if (global) { /* global pattern matching */
722 if (subpats && subpats_order == PREG_PATTERN_ORDER) {
723 /* For each subpattern, insert it into the appropriate array. */
724 for (i = 0; i < count; i++) {
725 if (offset_capture) {
726 add_offset_pair(match_sets[i], (char *)stringlist[i],
727 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
728 } else {
729 add_next_index_stringl(match_sets[i], (char *)stringlist[i],
730 offsets[(i<<1)+1] - offsets[i<<1], 1);
731 }
732 }
733 /* Add MARK, if available */
734 if (mark) {
735 if (!marks) {
736 MAKE_STD_ZVAL(marks);
737 array_init(marks);
738 }
739 add_index_string(marks, matched - 1, (char *) mark, 1);
740 }
741 /*
742 * If the number of captured subpatterns on this run is
743 * less than the total possible number, pad the result
744 * arrays with empty strings.
745 */
746 if (count < num_subpats) {
747 for (; i < num_subpats; i++) {
748 add_next_index_string(match_sets[i], "", 1);
749 }
750 }
751 } else {
752 /* Allocate the result set array */
753 ALLOC_ZVAL(result_set);
754 array_init(result_set);
755 INIT_PZVAL(result_set);
756
757 /* Add all the subpatterns to it */
758 for (i = 0; i < count; i++) {
759 if (offset_capture) {
760 add_offset_pair(result_set, (char *)stringlist[i],
761 offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
762 } else {
763 if (subpat_names[i]) {
764 add_assoc_stringl(result_set, subpat_names[i], (char *)stringlist[i],
765 offsets[(i<<1)+1] - offsets[i<<1], 1);
766 }
767 add_next_index_stringl(result_set, (char *)stringlist[i],
768 offsets[(i<<1)+1] - offsets[i<<1], 1);
769 }
770 }
771 /* Add MARK, if available */
772 if (mark) {
773 add_assoc_string(result_set, "MARK", (char *) mark, 1);
774 }
775 /* And add it to the output array */
776 zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set, sizeof(zval *), NULL);
777 }
778 } else { /* single pattern matching */
779 /* For each subpattern, insert it into the subpatterns array. */
780 for (i = 0; i < count; i++) {
781 if (offset_capture) {
782 add_offset_pair(subpats, (char *)stringlist[i],
783 offsets[(i<<1)+1] - offsets[i<<1],
784 offsets[i<<1], subpat_names[i]);
785 } else {
786 if (subpat_names[i]) {
787 add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
788 offsets[(i<<1)+1] - offsets[i<<1], 1);
789 }
790 add_next_index_stringl(subpats, (char *)stringlist[i],
791 offsets[(i<<1)+1] - offsets[i<<1], 1);
792 }
793 }
794 /* Add MARK, if available */
795 if (mark) {
796 add_assoc_string(subpats, "MARK", (char *) mark, 1);
797 }
798 }
799
800 pcre_free((void *) stringlist);
801 }
802 } else if (count == PCRE_ERROR_NOMATCH) {
803 /* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
804 this is not necessarily the end. We need to advance
805 the start offset, and continue. Fudge the offset values
806 to achieve this, unless we're already at the end of the string. */
807 if (g_notempty != 0 && start_offset < subject_len) {
808 int unit_len = calculate_unit_length(pce, subject + start_offset);
809
810 offsets[0] = start_offset;
811 offsets[1] = start_offset + unit_len;
812 } else
813 break;
814 } else {
815 pcre_handle_exec_error(count TSRMLS_CC);
816 break;
817 }
818
819 /* If we have matched an empty string, mimic what Perl's /g options does.
820 This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
821 the match again at the same point. If this fails (picked up above) we
822 advance to the next character. */
823 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
824
825 /* Advance to the position right after the last full match */
826 start_offset = offsets[1];
827 } while (global);
828
829 /* Add the match sets to the output array and clean up */
830 if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
831 for (i = 0; i < num_subpats; i++) {
832 if (subpat_names[i]) {
833 zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i],
834 strlen(subpat_names[i])+1, &match_sets[i], sizeof(zval *), NULL);
835 Z_ADDREF_P(match_sets[i]);
836 }
837 zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i], sizeof(zval *), NULL);
838 }
839 efree(match_sets);
840
841 if (marks) {
842 add_assoc_zval(subpats, "MARK", marks);
843 }
844 }
845
846 efree(offsets);
847 efree(subpat_names);
848
849 /* Did we encounter an error? */
850 if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
851 RETVAL_LONG(matched);
852 } else {
853 RETVAL_FALSE;
854 }
855 }
856 /* }}} */
857
858 /* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
859 Perform a Perl-style regular expression match */
PHP_FUNCTION(preg_match)860 static PHP_FUNCTION(preg_match)
861 {
862 php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
863 }
864 /* }}} */
865
866 /* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
867 Perform a Perl-style global regular expression match */
PHP_FUNCTION(preg_match_all)868 static PHP_FUNCTION(preg_match_all)
869 {
870 php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
871 }
872 /* }}} */
873
/* {{{ preg_get_backref
 * Parses a back-reference at *str. The first character is the introducer
 * (the caller invokes this on '\\' or '$'); it is followed by one or two
 * decimal digits, optionally wrapped in braces after a '$' ("${12}").
 *
 * On success returns 1, stores the reference number in *backref and advances
 * *str past the reference. Returns 0 and leaves *str untouched when the text
 * is not a back-reference; *backref may already have been written by then.
 */
static int preg_get_backref(char **str, int *backref)
{
	char *walk = *str;
	int in_brace = 0;

	/* An introducer at the very end of the string cannot be a reference. */
	if (walk[1] == 0) {
		return 0;
	}

	if (*walk == '$' && walk[1] == '{') {
		in_brace = 1;
		walk++;
	}
	walk++;

	/* First digit is mandatory. */
	if (*walk < '0' || *walk > '9') {
		return 0;
	}
	*backref = *walk - '0';
	walk++;

	/* Second digit is optional; a NUL terminator fails the range test. */
	if (*walk >= '0' && *walk <= '9') {
		*backref = *backref * 10 + *walk - '0';
		walk++;
	}

	if (in_brace) {
		if (*walk != '}') {
			return 0;
		}
		walk++;
	}

	*str = walk;
	return 1;
}
/* }}} */
912
913 /* {{{ preg_do_repl_func
914 */
preg_do_repl_func(zval * function,char * subject,int * offsets,char ** subpat_names,int count,unsigned char * mark,char ** result TSRMLS_DC)915 static int preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, unsigned char *mark, char **result TSRMLS_DC)
916 {
917 zval *retval_ptr; /* Function return value */
918 zval **args[1]; /* Argument to pass to function */
919 zval *subpats; /* Captured subpatterns */
920 int result_len; /* Return value length */
921 int i;
922
923 MAKE_STD_ZVAL(subpats);
924 array_init(subpats);
925 for (i = 0; i < count; i++) {
926 if (subpat_names[i]) {
927 add_assoc_stringl(subpats, subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1], 1);
928 }
929 add_next_index_stringl(subpats, &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1], 1);
930 }
931 if (mark) {
932 add_assoc_string(subpats, "MARK", (char *) mark, 1);
933 }
934 args[0] = &subpats;
935
936 if (call_user_function_ex(EG(function_table), NULL, function, &retval_ptr, 1, args, 0, NULL TSRMLS_CC) == SUCCESS && retval_ptr) {
937 convert_to_string_ex(&retval_ptr);
938 *result = estrndup(Z_STRVAL_P(retval_ptr), Z_STRLEN_P(retval_ptr));
939 result_len = Z_STRLEN_P(retval_ptr);
940 zval_ptr_dtor(&retval_ptr);
941 } else {
942 if (!EG(exception)) {
943 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unable to call custom replacement function");
944 }
945 result_len = offsets[1] - offsets[0];
946 *result = estrndup(&subject[offsets[0]], result_len);
947 }
948
949 zval_ptr_dtor(&subpats);
950
951 return result_len;
952 }
953 /* }}} */
954
955 /* {{{ preg_do_eval
956 */
preg_do_eval(char * eval_str,int eval_str_len,char * subject,int * offsets,int count,char ** result TSRMLS_DC)957 static int preg_do_eval(char *eval_str, int eval_str_len, char *subject,
958 int *offsets, int count, char **result TSRMLS_DC)
959 {
960 zval retval; /* Return value from evaluation */
961 char *eval_str_end, /* End of eval string */
962 *match, /* Current match for a backref */
963 *esc_match, /* Quote-escaped match */
964 *walk, /* Used to walk the code string */
965 *segment, /* Start of segment to append while walking */
966 walk_last; /* Last walked character */
967 int match_len; /* Length of the match */
968 int esc_match_len; /* Length of the quote-escaped match */
969 int result_len; /* Length of the result of the evaluation */
970 int backref; /* Current backref */
971 char *compiled_string_description;
972 smart_str code = {0};
973
974 eval_str_end = eval_str + eval_str_len;
975 walk = segment = eval_str;
976 walk_last = 0;
977
978 while (walk < eval_str_end) {
979 /* If found a backreference.. */
980 if ('\\' == *walk || '$' == *walk) {
981 smart_str_appendl(&code, segment, walk - segment);
982 if (walk_last == '\\') {
983 code.c[code.len-1] = *walk++;
984 segment = walk;
985 walk_last = 0;
986 continue;
987 }
988 segment = walk;
989 if (preg_get_backref(&walk, &backref)) {
990 if (backref < count) {
991 /* Find the corresponding string match and substitute it
992 in instead of the backref */
993 match = subject + offsets[backref<<1];
994 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
995 if (match_len) {
996 esc_match = php_addslashes(match, match_len, &esc_match_len, 0 TSRMLS_CC);
997 } else {
998 esc_match = match;
999 esc_match_len = 0;
1000 }
1001 } else {
1002 esc_match = "";
1003 esc_match_len = 0;
1004 }
1005 smart_str_appendl(&code, esc_match, esc_match_len);
1006
1007 segment = walk;
1008
1009 /* Clean up and reassign */
1010 if (esc_match_len)
1011 efree(esc_match);
1012 continue;
1013 }
1014 }
1015 walk++;
1016 walk_last = walk[-1];
1017 }
1018 smart_str_appendl(&code, segment, walk - segment);
1019 smart_str_0(&code);
1020
1021 compiled_string_description = zend_make_compiled_string_description("regexp code" TSRMLS_CC);
1022 /* Run the code */
1023 if (zend_eval_stringl(code.c, code.len, &retval, compiled_string_description TSRMLS_CC) == FAILURE) {
1024 efree(compiled_string_description);
1025 php_error_docref(NULL TSRMLS_CC,E_ERROR, "Failed evaluating code: %s%s", PHP_EOL, code.c);
1026 /* zend_error() does not return in this case */
1027 }
1028 efree(compiled_string_description);
1029 convert_to_string(&retval);
1030
1031 /* Save the return value and its length */
1032 *result = estrndup(Z_STRVAL(retval), Z_STRLEN(retval));
1033 result_len = Z_STRLEN(retval);
1034
1035 /* Clean up */
1036 zval_dtor(&retval);
1037 smart_str_free(&code);
1038
1039 return result_len;
1040 }
1041 /* }}} */
1042
1043 /* {{{ php_pcre_replace
1044 */
php_pcre_replace(char * regex,int regex_len,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1045 PHPAPI char *php_pcre_replace(char *regex, int regex_len,
1046 char *subject, int subject_len,
1047 zval *replace_val, int is_callable_replace,
1048 int *result_len, int limit, int *replace_count TSRMLS_DC)
1049 {
1050 pcre_cache_entry *pce; /* Compiled regular expression */
1051 char *result; /* Function result */
1052
1053 /* Compile regex or get it from cache. */
1054 if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1055 return NULL;
1056 }
1057 pce->refcount++;
1058 result = php_pcre_replace_impl(pce, subject, subject_len, replace_val,
1059 is_callable_replace, result_len, limit, replace_count TSRMLS_CC);
1060 pce->refcount--;
1061
1062 return result;
1063 }
1064 /* }}} */
1065
1066 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int * result_len,int limit,int * replace_count TSRMLS_DC)1067 PHPAPI char *php_pcre_replace_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *replace_val,
1068 int is_callable_replace, int *result_len, int limit, int *replace_count TSRMLS_DC)
1069 {
1070 pcre_extra *extra = pce->extra;/* Holds results of studying */
1071 pcre_extra extra_data; /* Used locally for exec options */
1072 int exoptions = 0; /* Execution options */
1073 int count = 0; /* Count of matched subpatterns */
1074 int *offsets; /* Array of subpattern offsets */
1075 char **subpat_names; /* Array for named subpatterns */
1076 int num_subpats; /* Number of captured subpatterns */
1077 int size_offsets; /* Size of the offsets array */
1078 size_t new_len; /* Length of needed storage */
1079 size_t alloc_len; /* Actual allocated length */
1080 int eval_result_len=0; /* Length of the eval'ed or
1081 function-returned string */
1082 int match_len; /* Length of the current match */
1083 int backref; /* Backreference number */
1084 int eval; /* If the replacement string should be eval'ed */
1085 int start_offset; /* Where the new search starts */
1086 int g_notempty=0; /* If the match should not be empty */
1087 int replace_len=0; /* Length of replacement string */
1088 char *result, /* Result of replacement */
1089 *replace=NULL, /* Replacement string */
1090 *new_buf, /* Temporary buffer for re-allocation */
1091 *walkbuf, /* Location of current replacement in the result */
1092 *walk, /* Used to walk the replacement string */
1093 *match, /* The current match */
1094 *piece, /* The current piece of subject */
1095 *replace_end=NULL, /* End of replacement string */
1096 *eval_result, /* Result of eval or custom function */
1097 walk_last; /* Last walked character */
1098 int rc;
1099 unsigned char *mark = NULL; /* Target for MARK name */
1100
1101 if (extra == NULL) {
1102 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1103 extra = &extra_data;
1104 }
1105 extra->match_limit = PCRE_G(backtrack_limit);
1106 extra->match_limit_recursion = PCRE_G(recursion_limit);
1107 #ifdef PCRE_EXTRA_MARK
1108 extra->mark = &mark;
1109 extra->flags |= PCRE_EXTRA_MARK;
1110 #endif
1111
1112 eval = pce->preg_options & PREG_REPLACE_EVAL;
1113 if (is_callable_replace) {
1114 if (eval) {
1115 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Modifier /e cannot be used with replacement callback");
1116 return NULL;
1117 }
1118 } else {
1119 replace = Z_STRVAL_P(replace_val);
1120 replace_len = Z_STRLEN_P(replace_val);
1121 replace_end = replace + replace_len;
1122 }
1123
1124 if (eval) {
1125 php_error_docref(NULL TSRMLS_CC, E_DEPRECATED, "The /e modifier is deprecated, use preg_replace_callback instead");
1126 }
1127
1128 /* Calculate the size of the offsets array, and allocate memory for it. */
1129 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &num_subpats);
1130 if (rc < 0) {
1131 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1132 return NULL;
1133 }
1134 num_subpats++;
1135 size_offsets = num_subpats * 3;
1136
1137 /*
1138 * Build a mapping from subpattern numbers to their names. We will always
1139 * allocate the table, even though there may be no named subpatterns. This
1140 * avoids somewhat more complicated logic in the inner loops.
1141 */
1142 subpat_names = make_subpats_table(num_subpats, pce TSRMLS_CC);
1143 if (!subpat_names) {
1144 return NULL;
1145 }
1146
1147 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1148
1149 result = safe_emalloc(subject_len, 2*sizeof(char), 1);
1150 alloc_len = 2 * (size_t)subject_len + 1;
1151
1152 /* Initialize */
1153 match = NULL;
1154 *result_len = 0;
1155 start_offset = 0;
1156 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1157
1158 while (1) {
1159 /* Execute the regular expression. */
1160 count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1161 exoptions|g_notempty, offsets, size_offsets);
1162
1163 /* the string was already proved to be valid UTF-8 */
1164 exoptions |= PCRE_NO_UTF8_CHECK;
1165
1166 /* Check for too many substrings condition. */
1167 if (count == 0) {
1168 php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1169 count = size_offsets/3;
1170 }
1171
1172 piece = subject + start_offset;
1173
1174 if (count > 0 && (offsets[1] - offsets[0] >= 0) && (limit == -1 || limit > 0)) {
1175 if (replace_count) {
1176 ++*replace_count;
1177 }
1178 /* Set the match location in subject */
1179 match = subject + offsets[0];
1180
1181 new_len = *result_len + offsets[0] - start_offset; /* part before the match */
1182
1183 /* If evaluating, do it and add the return string's length */
1184 if (eval) {
1185 eval_result_len = preg_do_eval(replace, replace_len, subject,
1186 offsets, count, &eval_result TSRMLS_CC);
1187 new_len += eval_result_len;
1188 } else if (is_callable_replace) {
1189 /* Use custom function to get replacement string and its length. */
1190 eval_result_len = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, mark, &eval_result TSRMLS_CC);
1191 new_len += eval_result_len;
1192 } else { /* do regular substitution */
1193 walk = replace;
1194 walk_last = 0;
1195 while (walk < replace_end) {
1196 if ('\\' == *walk || '$' == *walk) {
1197 if (walk_last == '\\') {
1198 walk++;
1199 walk_last = 0;
1200 continue;
1201 }
1202 if (preg_get_backref(&walk, &backref)) {
1203 if (backref < count)
1204 new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1205 continue;
1206 }
1207 }
1208 new_len++;
1209 walk++;
1210 walk_last = walk[-1];
1211 }
1212 }
1213
1214 if (new_len + 1 > alloc_len) {
1215 new_buf = safe_emalloc(2, new_len + 1, alloc_len);
1216 alloc_len = 1 + alloc_len + 2 * (size_t)new_len;
1217 memcpy(new_buf, result, *result_len);
1218 efree(result);
1219 result = new_buf;
1220 }
1221 /* copy the part of the string before the match */
1222 memcpy(&result[*result_len], piece, match-piece);
1223 *result_len += match-piece;
1224
1225 /* copy replacement and backrefs */
1226 walkbuf = result + *result_len;
1227
1228 /* If evaluating or using custom function, copy result to the buffer
1229 * and clean up. */
1230 if (eval || is_callable_replace) {
1231 memcpy(walkbuf, eval_result, eval_result_len);
1232 *result_len += eval_result_len;
1233 STR_FREE(eval_result);
1234 } else { /* do regular backreference copying */
1235 walk = replace;
1236 walk_last = 0;
1237 while (walk < replace_end) {
1238 if ('\\' == *walk || '$' == *walk) {
1239 if (walk_last == '\\') {
1240 *(walkbuf-1) = *walk++;
1241 walk_last = 0;
1242 continue;
1243 }
1244 if (preg_get_backref(&walk, &backref)) {
1245 if (backref < count) {
1246 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1247 memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1248 walkbuf += match_len;
1249 }
1250 continue;
1251 }
1252 }
1253 *walkbuf++ = *walk++;
1254 walk_last = walk[-1];
1255 }
1256 *walkbuf = '\0';
1257 /* increment the result length by how much we've added to the string */
1258 *result_len += walkbuf - (result + *result_len);
1259 }
1260
1261 if (limit != -1)
1262 limit--;
1263
1264 } else if (count == PCRE_ERROR_NOMATCH || limit == 0) {
1265 /* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
1266 this is not necessarily the end. We need to advance
1267 the start offset, and continue. Fudge the offset values
1268 to achieve this, unless we're already at the end of the string. */
1269 if (g_notempty != 0 && start_offset < subject_len) {
1270 int unit_len = calculate_unit_length(pce, piece);
1271
1272 offsets[0] = start_offset;
1273 offsets[1] = start_offset + unit_len;
1274 memcpy(&result[*result_len], piece, unit_len);
1275 *result_len += unit_len;
1276 } else {
1277 new_len = *result_len + subject_len - start_offset;
1278 if (new_len + 1 > alloc_len) {
1279 new_buf = safe_emalloc(new_len, sizeof(char), 1);
1280 alloc_len = (size_t)new_len + 1; /* now we know exactly how long it is */
1281 memcpy(new_buf, result, *result_len);
1282 efree(result);
1283 result = new_buf;
1284 }
1285 /* stick that last bit of string on our output */
1286 memcpy(&result[*result_len], piece, subject_len - start_offset);
1287 *result_len += subject_len - start_offset;
1288 result[*result_len] = '\0';
1289 break;
1290 }
1291 } else {
1292 pcre_handle_exec_error(count TSRMLS_CC);
1293 efree(result);
1294 result = NULL;
1295 break;
1296 }
1297
1298 /* If we have matched an empty string, mimic what Perl's /g options does.
1299 This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
1300 the match again at the same point. If this fails (picked up above) we
1301 advance to the next character. */
1302 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
1303
1304 /* Advance to the next piece. */
1305 start_offset = offsets[1];
1306 }
1307
1308 efree(offsets);
1309 efree(subpat_names);
1310
1311 if(result && (size_t)(*result_len) > INT_MAX) {
1312 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Result is too big, max is %d", INT_MAX);
1313 efree(result);
1314 result = NULL;
1315 }
1316
1317 return result;
1318 }
1319 /* }}} */
1320
1321 /* {{{ php_replace_in_subject
1322 */
php_replace_in_subject(zval * regex,zval * replace,zval ** subject,int * result_len,int limit,int is_callable_replace,int * replace_count TSRMLS_DC)1323 static char *php_replace_in_subject(zval *regex, zval *replace, zval **subject, int *result_len, int limit, int is_callable_replace, int *replace_count TSRMLS_DC)
1324 {
1325 zval **regex_entry,
1326 **replace_entry = NULL,
1327 *replace_value,
1328 empty_replace;
1329 char *subject_value,
1330 *result;
1331 int subject_len;
1332
1333 /* Make sure we're dealing with strings. */
1334 convert_to_string_ex(subject);
1335 /* FIXME: This might need to be changed to STR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1336 ZVAL_STRINGL(&empty_replace, "", 0, 0);
1337
1338 /* If regex is an array */
1339 if (Z_TYPE_P(regex) == IS_ARRAY) {
1340 /* Duplicate subject string for repeated replacement */
1341 subject_value = estrndup(Z_STRVAL_PP(subject), Z_STRLEN_PP(subject));
1342 subject_len = Z_STRLEN_PP(subject);
1343 *result_len = subject_len;
1344
1345 zend_hash_internal_pointer_reset(Z_ARRVAL_P(regex));
1346
1347 replace_value = replace;
1348 if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace)
1349 zend_hash_internal_pointer_reset(Z_ARRVAL_P(replace));
1350
1351 /* For each entry in the regex array, get the entry */
1352 while (zend_hash_get_current_data(Z_ARRVAL_P(regex), (void **)®ex_entry) == SUCCESS) {
1353 /* Make sure we're dealing with strings. */
1354 convert_to_string_ex(regex_entry);
1355
1356 /* If replace is an array and not a callable construct */
1357 if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1358 /* Get current entry */
1359 if (zend_hash_get_current_data(Z_ARRVAL_P(replace), (void **)&replace_entry) == SUCCESS) {
1360 if (!is_callable_replace) {
1361 convert_to_string_ex(replace_entry);
1362 }
1363 replace_value = *replace_entry;
1364 zend_hash_move_forward(Z_ARRVAL_P(replace));
1365 } else {
1366 /* We've run out of replacement strings, so use an empty one */
1367 replace_value = &empty_replace;
1368 }
1369 }
1370
1371 /* Do the actual replacement and put the result back into subject_value
1372 for further replacements. */
1373 if ((result = php_pcre_replace(Z_STRVAL_PP(regex_entry),
1374 Z_STRLEN_PP(regex_entry),
1375 subject_value,
1376 subject_len,
1377 replace_value,
1378 is_callable_replace,
1379 result_len,
1380 limit,
1381 replace_count TSRMLS_CC)) != NULL) {
1382 efree(subject_value);
1383 subject_value = result;
1384 subject_len = *result_len;
1385 } else {
1386 efree(subject_value);
1387 return NULL;
1388 }
1389
1390 zend_hash_move_forward(Z_ARRVAL_P(regex));
1391 }
1392
1393 return subject_value;
1394 } else {
1395 result = php_pcre_replace(Z_STRVAL_P(regex),
1396 Z_STRLEN_P(regex),
1397 Z_STRVAL_PP(subject),
1398 Z_STRLEN_PP(subject),
1399 replace,
1400 is_callable_replace,
1401 result_len,
1402 limit,
1403 replace_count TSRMLS_CC);
1404 return result;
1405 }
1406 }
1407 /* }}} */
1408
1409 /* {{{ preg_replace_impl
1410 */
preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS,int is_callable_replace,int is_filter)1411 static void preg_replace_impl(INTERNAL_FUNCTION_PARAMETERS, int is_callable_replace, int is_filter)
1412 {
1413 zval **regex,
1414 **replace,
1415 **subject,
1416 **subject_entry,
1417 **zcount = NULL;
1418 char *result;
1419 int result_len;
1420 int limit_val = -1;
1421 long limit = -1;
1422 char *string_key;
1423 uint string_key_len;
1424 ulong num_key;
1425 char *callback_name;
1426 int replace_count=0, old_replace_count;
1427
1428 /* Get function parameters and do error-checking. */
1429 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ZZZ|lZ", ®ex, &replace, &subject, &limit, &zcount) == FAILURE) {
1430 return;
1431 }
1432
1433 if (!is_callable_replace && Z_TYPE_PP(replace) == IS_ARRAY && Z_TYPE_PP(regex) != IS_ARRAY) {
1434 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1435 RETURN_FALSE;
1436 }
1437
1438 SEPARATE_ZVAL(replace);
1439 if (Z_TYPE_PP(replace) != IS_ARRAY && (Z_TYPE_PP(replace) != IS_OBJECT || !is_callable_replace)) {
1440 convert_to_string_ex(replace);
1441 }
1442 if (is_callable_replace) {
1443 if (!zend_is_callable(*replace, 0, &callback_name TSRMLS_CC)) {
1444 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Requires argument 2, '%s', to be a valid callback", callback_name);
1445 efree(callback_name);
1446 MAKE_COPY_ZVAL(subject, return_value);
1447 return;
1448 }
1449 efree(callback_name);
1450 }
1451
1452 SEPARATE_ZVAL(regex);
1453 SEPARATE_ZVAL(subject);
1454
1455 if (ZEND_NUM_ARGS() > 3) {
1456 limit_val = limit;
1457 }
1458
1459 if (Z_TYPE_PP(regex) != IS_ARRAY)
1460 convert_to_string_ex(regex);
1461
1462 /* if subject is an array */
1463 if (Z_TYPE_PP(subject) == IS_ARRAY) {
1464 array_init(return_value);
1465 zend_hash_internal_pointer_reset(Z_ARRVAL_PP(subject));
1466
1467 /* For each subject entry, convert it to string, then perform replacement
1468 and add the result to the return_value array. */
1469 while (zend_hash_get_current_data(Z_ARRVAL_PP(subject), (void **)&subject_entry) == SUCCESS) {
1470 SEPARATE_ZVAL(subject_entry);
1471 old_replace_count = replace_count;
1472 if ((result = php_replace_in_subject(*regex, *replace, subject_entry, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1473 if (!is_filter || replace_count > old_replace_count) {
1474 /* Add to return array */
1475 switch(zend_hash_get_current_key_ex(Z_ARRVAL_PP(subject), &string_key, &string_key_len, &num_key, 0, NULL))
1476 {
1477 case HASH_KEY_IS_STRING:
1478 add_assoc_stringl_ex(return_value, string_key, string_key_len, result, result_len, 0);
1479 break;
1480
1481 case HASH_KEY_IS_LONG:
1482 add_index_stringl(return_value, num_key, result, result_len, 0);
1483 break;
1484 }
1485 } else {
1486 efree(result);
1487 }
1488 }
1489
1490 zend_hash_move_forward(Z_ARRVAL_PP(subject));
1491 }
1492 } else { /* if subject is not an array */
1493 old_replace_count = replace_count;
1494 if ((result = php_replace_in_subject(*regex, *replace, subject, &result_len, limit_val, is_callable_replace, &replace_count TSRMLS_CC)) != NULL) {
1495 if (!is_filter || replace_count > old_replace_count) {
1496 RETVAL_STRINGL(result, result_len, 0);
1497 } else {
1498 efree(result);
1499 }
1500 }
1501 }
1502 if (ZEND_NUM_ARGS() > 4) {
1503 zval_dtor(*zcount);
1504 ZVAL_LONG(*zcount, replace_count);
1505 }
1506
1507 }
1508 /* }}} */
1509
/* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
   Perform Perl-style regular expression replacement. */
static PHP_FUNCTION(preg_replace)
{
	/* String replacement, no filtering: is_callable_replace = 0, is_filter = 0 */
	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 0);
}
1516 /* }}} */
1517
/* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
   Perform Perl-style regular expression replacement using replacement callback. */
static PHP_FUNCTION(preg_replace_callback)
{
	/* Argument 2 is a callback: is_callable_replace = 1, is_filter = 0 */
	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1, 0);
}
1524 /* }}} */
1525
/* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
   Perform Perl-style regular expression replacement and only return matches. */
static PHP_FUNCTION(preg_filter)
{
	/* String replacement with filtering: is_callable_replace = 0, is_filter = 1 */
	preg_replace_impl(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0, 1);
}
1532 /* }}} */
1533
1534 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1535 Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)1536 static PHP_FUNCTION(preg_split)
1537 {
1538 char *regex; /* Regular expression */
1539 char *subject; /* String to match against */
1540 int regex_len;
1541 int subject_len;
1542 long limit_val = -1;/* Integer value of limit */
1543 long flags = 0; /* Match control flags */
1544 pcre_cache_entry *pce; /* Compiled regular expression */
1545
1546 /* Get function parameters and do error checking */
1547 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|ll", ®ex, ®ex_len,
1548 &subject, &subject_len, &limit_val, &flags) == FAILURE) {
1549 RETURN_FALSE;
1550 }
1551
1552 /* Compile regex or get it from cache. */
1553 if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1554 RETURN_FALSE;
1555 }
1556
1557 pce->refcount++;
1558 php_pcre_split_impl(pce, subject, subject_len, return_value, limit_val, flags TSRMLS_CC);
1559 pce->refcount--;
1560 }
1561 /* }}} */
1562
1563 /* {{{ php_pcre_split
1564 */
php_pcre_split_impl(pcre_cache_entry * pce,char * subject,int subject_len,zval * return_value,long limit_val,long flags TSRMLS_DC)1565 PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
1566 long limit_val, long flags TSRMLS_DC)
1567 {
1568 pcre_extra *extra = NULL; /* Holds results of studying */
1569 pcre *re_bump = NULL; /* Regex instance for empty matches */
1570 pcre_extra *extra_bump = NULL; /* Almost dummy */
1571 pcre_extra extra_data; /* Used locally for exec options */
1572 int *offsets; /* Array of subpattern offsets */
1573 int size_offsets; /* Size of the offsets array */
1574 int exoptions = 0; /* Execution options */
1575 int count = 0; /* Count of matched subpatterns */
1576 int start_offset; /* Where the new search starts */
1577 int next_offset; /* End of the last delimiter match + 1 */
1578 int g_notempty = 0; /* If the match should not be empty */
1579 char *last_match; /* Location of last match */
1580 int rc;
1581 int no_empty; /* If NO_EMPTY flag is set */
1582 int delim_capture; /* If delimiters should be captured */
1583 int offset_capture; /* If offsets should be captured */
1584
1585 no_empty = flags & PREG_SPLIT_NO_EMPTY;
1586 delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
1587 offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;
1588
1589 if (limit_val == 0) {
1590 limit_val = -1;
1591 }
1592
1593 if (extra == NULL) {
1594 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1595 extra = &extra_data;
1596 }
1597 extra->match_limit = PCRE_G(backtrack_limit);
1598 extra->match_limit_recursion = PCRE_G(recursion_limit);
1599 #ifdef PCRE_EXTRA_MARK
1600 extra->flags &= ~PCRE_EXTRA_MARK;
1601 #endif
1602
1603 /* Initialize return value */
1604 array_init(return_value);
1605
1606 /* Calculate the size of the offsets array, and allocate memory for it. */
1607 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1608 if (rc < 0) {
1609 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1610 RETURN_FALSE;
1611 }
1612 size_offsets = (size_offsets + 1) * 3;
1613 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1614
1615 /* Start at the beginning of the string */
1616 start_offset = 0;
1617 next_offset = 0;
1618 last_match = subject;
1619 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1620
1621 /* Get next piece if no limit or limit not yet reached and something matched*/
1622 while ((limit_val == -1 || limit_val > 1)) {
1623 count = pcre_exec(pce->re, extra, subject,
1624 subject_len, start_offset,
1625 exoptions|g_notempty, offsets, size_offsets);
1626
1627 /* the string was already proved to be valid UTF-8 */
1628 exoptions |= PCRE_NO_UTF8_CHECK;
1629
1630 /* Check for too many substrings condition. */
1631 if (count == 0) {
1632 php_error_docref(NULL TSRMLS_CC,E_NOTICE, "Matched, but too many substrings");
1633 count = size_offsets/3;
1634 }
1635
1636 /* If something matched */
1637 if (count > 0 && (offsets[1] - offsets[0] >= 0)) {
1638 if (!no_empty || &subject[offsets[0]] != last_match) {
1639
1640 if (offset_capture) {
1641 /* Add (match, offset) pair to the return value */
1642 add_offset_pair(return_value, last_match, &subject[offsets[0]]-last_match, next_offset, NULL);
1643 } else {
1644 /* Add the piece to the return value */
1645 add_next_index_stringl(return_value, last_match,
1646 &subject[offsets[0]]-last_match, 1);
1647 }
1648
1649 /* One less left to do */
1650 if (limit_val != -1)
1651 limit_val--;
1652 }
1653
1654 last_match = &subject[offsets[1]];
1655 next_offset = offsets[1];
1656
1657 if (delim_capture) {
1658 int i, match_len;
1659 for (i = 1; i < count; i++) {
1660 match_len = offsets[(i<<1)+1] - offsets[i<<1];
1661 /* If we have matched a delimiter */
1662 if (!no_empty || match_len > 0) {
1663 if (offset_capture) {
1664 add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
1665 } else {
1666 add_next_index_stringl(return_value,
1667 &subject[offsets[i<<1]],
1668 match_len, 1);
1669 }
1670 }
1671 }
1672 }
1673 } else if (count == PCRE_ERROR_NOMATCH) {
1674 /* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
1675 this is not necessarily the end. We need to advance
1676 the start offset, and continue. Fudge the offset values
1677 to achieve this, unless we're already at the end of the string. */
1678 if (g_notempty != 0 && start_offset < subject_len) {
1679 if (pce->compile_options & PCRE_UTF8) {
1680 if (re_bump == NULL) {
1681 int dummy;
1682
1683 if ((re_bump = pcre_get_compiled_regex("/./us", &extra_bump, &dummy TSRMLS_CC)) == NULL) {
1684 RETURN_FALSE;
1685 }
1686 }
1687 count = pcre_exec(re_bump, extra_bump, subject,
1688 subject_len, start_offset,
1689 exoptions, offsets, size_offsets);
1690 if (count < 1) {
1691 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown error");
1692 RETURN_FALSE;
1693 }
1694 } else {
1695 offsets[0] = start_offset;
1696 offsets[1] = start_offset + 1;
1697 }
1698 } else
1699 break;
1700 } else {
1701 pcre_handle_exec_error(count TSRMLS_CC);
1702 break;
1703 }
1704
1705 /* If we have matched an empty string, mimic what Perl's /g options does.
1706 This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
1707 the match again at the same point. If this fails (picked up above) we
1708 advance to the next character. */
1709 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
1710
1711 /* Advance to the position right after the last full match */
1712 start_offset = offsets[1];
1713 }
1714
1715
1716 start_offset = last_match - subject; /* the offset might have been incremented, but without further successful matches */
1717
1718 if (!no_empty || start_offset < subject_len)
1719 {
1720 if (offset_capture) {
1721 /* Add the last (match, offset) pair to the return value */
1722 add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
1723 } else {
1724 /* Add the last piece to the return value */
1725 add_next_index_stringl(return_value, last_match, subject + subject_len - last_match, 1);
1726 }
1727 }
1728
1729
1730 /* Clean up */
1731 efree(offsets);
1732 }
1733 /* }}} */
1734
1735 /* {{{ proto string preg_quote(string str [, string delim_char])
1736 Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)1737 static PHP_FUNCTION(preg_quote)
1738 {
1739 int in_str_len;
1740 char *in_str; /* Input string argument */
1741 char *in_str_end; /* End of the input string */
1742 int delim_len = 0;
1743 char *delim = NULL; /* Additional delimiter argument */
1744 char *out_str, /* Output string with quoted characters */
1745 *p, /* Iterator for input string */
1746 *q, /* Iterator for output string */
1747 delim_char=0, /* Delimiter character to be quoted */
1748 c; /* Current character */
1749 zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1750
1751 /* Get the arguments and check for errors */
1752 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|s", &in_str, &in_str_len,
1753 &delim, &delim_len) == FAILURE) {
1754 return;
1755 }
1756
1757 in_str_end = in_str + in_str_len;
1758
1759 /* Nothing to do if we got an empty string */
1760 if (in_str == in_str_end) {
1761 RETURN_EMPTY_STRING();
1762 }
1763
1764 if (delim && *delim) {
1765 delim_char = delim[0];
1766 quote_delim = 1;
1767 }
1768
1769 /* Allocate enough memory so that even if each character
1770 is quoted, we won't run out of room */
1771 out_str = safe_emalloc_string(4, in_str_len, 1);
1772
1773 /* Go through the string and quote necessary characters */
1774 for(p = in_str, q = out_str; p != in_str_end; p++) {
1775 c = *p;
1776 switch(c) {
1777 case '.':
1778 case '\\':
1779 case '+':
1780 case '*':
1781 case '?':
1782 case '[':
1783 case '^':
1784 case ']':
1785 case '$':
1786 case '(':
1787 case ')':
1788 case '{':
1789 case '}':
1790 case '=':
1791 case '!':
1792 case '>':
1793 case '<':
1794 case '|':
1795 case ':':
1796 case '-':
1797 *q++ = '\\';
1798 *q++ = c;
1799 break;
1800
1801 case '\0':
1802 *q++ = '\\';
1803 *q++ = '0';
1804 *q++ = '0';
1805 *q++ = '0';
1806 break;
1807
1808 default:
1809 if (quote_delim && c == delim_char)
1810 *q++ = '\\';
1811 *q++ = c;
1812 break;
1813 }
1814 }
1815 *q = '\0';
1816
1817 /* Reallocate string and return it */
1818 RETVAL_STRINGL(erealloc(out_str, q - out_str + 1), q - out_str, 0);
1819 }
1820 /* }}} */
1821
1822 /* {{{ proto array preg_grep(string regex, array input [, int flags])
1823 Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)1824 static PHP_FUNCTION(preg_grep)
1825 {
1826 char *regex; /* Regular expression */
1827 int regex_len;
1828 zval *input; /* Input array */
1829 long flags = 0; /* Match control flags */
1830 pcre_cache_entry *pce; /* Compiled regular expression */
1831
1832 /* Get arguments and do error checking */
1833 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sa|l", ®ex, ®ex_len,
1834 &input, &flags) == FAILURE) {
1835 return;
1836 }
1837
1838 /* Compile regex or get it from cache. */
1839 if ((pce = pcre_get_compiled_regex_cache(regex, regex_len TSRMLS_CC)) == NULL) {
1840 RETURN_FALSE;
1841 }
1842
1843 pce->refcount++;
1844 php_pcre_grep_impl(pce, input, return_value, flags TSRMLS_CC);
1845 pce->refcount--;
1846 }
1847 /* }}} */
1848
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,long flags TSRMLS_DC)1849 PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, long flags TSRMLS_DC) /* {{{ */
1850 {
1851 zval **entry; /* An entry in the input array */
1852 pcre_extra *extra = pce->extra;/* Holds results of studying */
1853 pcre_extra extra_data; /* Used locally for exec options */
1854 int *offsets; /* Array of subpattern offsets */
1855 int size_offsets; /* Size of the offsets array */
1856 int count = 0; /* Count of matched subpatterns */
1857 char *string_key;
1858 uint string_key_len;
1859 ulong num_key;
1860 zend_bool invert; /* Whether to return non-matching
1861 entries */
1862 int rc;
1863
1864 invert = flags & PREG_GREP_INVERT ? 1 : 0;
1865
1866 if (extra == NULL) {
1867 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1868 extra = &extra_data;
1869 }
1870 extra->match_limit = PCRE_G(backtrack_limit);
1871 extra->match_limit_recursion = PCRE_G(recursion_limit);
1872 #ifdef PCRE_EXTRA_MARK
1873 extra->flags &= ~PCRE_EXTRA_MARK;
1874 #endif
1875
1876 /* Calculate the size of the offsets array, and allocate memory for it. */
1877 rc = pcre_fullinfo(pce->re, extra, PCRE_INFO_CAPTURECOUNT, &size_offsets);
1878 if (rc < 0) {
1879 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
1880 RETURN_FALSE;
1881 }
1882 size_offsets = (size_offsets + 1) * 3;
1883 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1884
1885 /* Initialize return array */
1886 array_init(return_value);
1887
1888 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1889
1890 /* Go through the input array */
1891 zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1892 while (zend_hash_get_current_data(Z_ARRVAL_P(input), (void **)&entry) == SUCCESS) {
1893 zval subject = **entry;
1894
1895 if (Z_TYPE_PP(entry) != IS_STRING) {
1896 zval_copy_ctor(&subject);
1897 convert_to_string(&subject);
1898 }
1899
1900 /* Perform the match */
1901 count = pcre_exec(pce->re, extra, Z_STRVAL(subject),
1902 Z_STRLEN(subject), 0,
1903 0, offsets, size_offsets);
1904
1905 /* Check for too many substrings condition. */
1906 if (count == 0) {
1907 php_error_docref(NULL TSRMLS_CC, E_NOTICE, "Matched, but too many substrings");
1908 count = size_offsets/3;
1909 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
1910 pcre_handle_exec_error(count TSRMLS_CC);
1911 break;
1912 }
1913
1914 /* If the entry fits our requirements */
1915 if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
1916
1917 Z_ADDREF_PP(entry);
1918
1919 /* Add to return array */
1920 switch (zend_hash_get_current_key_ex(Z_ARRVAL_P(input), &string_key, &string_key_len, &num_key, 0, NULL))
1921 {
1922 case HASH_KEY_IS_STRING:
1923 zend_hash_update(Z_ARRVAL_P(return_value), string_key,
1924 string_key_len, entry, sizeof(zval *), NULL);
1925 break;
1926
1927 case HASH_KEY_IS_LONG:
1928 zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry,
1929 sizeof(zval *), NULL);
1930 break;
1931 }
1932 }
1933
1934 if (Z_TYPE_PP(entry) != IS_STRING) {
1935 zval_dtor(&subject);
1936 }
1937
1938 zend_hash_move_forward(Z_ARRVAL_P(input));
1939 }
1940 zend_hash_internal_pointer_reset(Z_ARRVAL_P(input));
1941 /* Clean up */
1942 efree(offsets);
1943 }
1944 /* }}} */
1945
1946 /* {{{ proto int preg_last_error()
1947 Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)1948 static PHP_FUNCTION(preg_last_error)
1949 {
1950 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "") == FAILURE) {
1951 return;
1952 }
1953
1954 RETURN_LONG(PCRE_G(error_code));
1955 }
1956 /* }}} */
1957
1958 /* {{{ module definition structures */
1959
1960 /* {{{ arginfo */
1961 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
1962 ZEND_ARG_INFO(0, pattern)
1963 ZEND_ARG_INFO(0, subject)
1964 ZEND_ARG_INFO(1, subpatterns) /* array */
1965 ZEND_ARG_INFO(0, flags)
1966 ZEND_ARG_INFO(0, offset)
1967 ZEND_END_ARG_INFO()
1968
1969 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2)
1970 ZEND_ARG_INFO(0, pattern)
1971 ZEND_ARG_INFO(0, subject)
1972 ZEND_ARG_INFO(1, subpatterns) /* array */
1973 ZEND_ARG_INFO(0, flags)
1974 ZEND_ARG_INFO(0, offset)
1975 ZEND_END_ARG_INFO()
1976
1977 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
1978 ZEND_ARG_INFO(0, regex)
1979 ZEND_ARG_INFO(0, replace)
1980 ZEND_ARG_INFO(0, subject)
1981 ZEND_ARG_INFO(0, limit)
1982 ZEND_ARG_INFO(1, count)
1983 ZEND_END_ARG_INFO()
1984
1985 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
1986 ZEND_ARG_INFO(0, regex)
1987 ZEND_ARG_INFO(0, callback)
1988 ZEND_ARG_INFO(0, subject)
1989 ZEND_ARG_INFO(0, limit)
1990 ZEND_ARG_INFO(1, count)
1991 ZEND_END_ARG_INFO()
1992
1993 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
1994 ZEND_ARG_INFO(0, pattern)
1995 ZEND_ARG_INFO(0, subject)
1996 ZEND_ARG_INFO(0, limit)
1997 ZEND_ARG_INFO(0, flags)
1998 ZEND_END_ARG_INFO()
1999
2000 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
2001 ZEND_ARG_INFO(0, str)
2002 ZEND_ARG_INFO(0, delim_char)
2003 ZEND_END_ARG_INFO()
2004
2005 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
2006 ZEND_ARG_INFO(0, regex)
2007 ZEND_ARG_INFO(0, input) /* array */
2008 ZEND_ARG_INFO(0, flags)
2009 ZEND_END_ARG_INFO()
2010
2011 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
2012 ZEND_END_ARG_INFO()
2013 /* }}} */
2014
2015 static const zend_function_entry pcre_functions[] = {
2016 PHP_FE(preg_match, arginfo_preg_match)
2017 PHP_FE(preg_match_all, arginfo_preg_match_all)
2018 PHP_FE(preg_replace, arginfo_preg_replace)
2019 PHP_FE(preg_replace_callback, arginfo_preg_replace_callback)
2020 PHP_FE(preg_filter, arginfo_preg_replace)
2021 PHP_FE(preg_split, arginfo_preg_split)
2022 PHP_FE(preg_quote, arginfo_preg_quote)
2023 PHP_FE(preg_grep, arginfo_preg_grep)
2024 PHP_FE(preg_last_error, arginfo_preg_last_error)
2025 PHP_FE_END
2026 };
2027
2028 zend_module_entry pcre_module_entry = {
2029 STANDARD_MODULE_HEADER,
2030 "pcre",
2031 pcre_functions,
2032 PHP_MINIT(pcre),
2033 PHP_MSHUTDOWN(pcre),
2034 NULL,
2035 NULL,
2036 PHP_MINFO(pcre),
2037 NO_VERSION_YET,
2038 PHP_MODULE_GLOBALS(pcre),
2039 PHP_GINIT(pcre),
2040 PHP_GSHUTDOWN(pcre),
2041 NULL,
2042 STANDARD_MODULE_PROPERTIES_EX
2043 };
2044
/* Only emitted when COMPILE_DL_PCRE is defined.
 * NOTE(review): presumably the shared-object build, where this provides the
 * module lookup entry point - confirm against the build configuration. */
#ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)
#endif

/* }}} */
2050
2051 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
2052
2053 /*
2054 * Local variables:
2055 * tab-width: 4
2056 * c-basic-offset: 4
2057 * End:
2058 * vim600: sw=4 ts=4 fdm=marker
2059 * vim<600: sw=4 ts=4
2060 */
2061