1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2018 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Andrei Zmievski <andrei@php.net> |
16 +----------------------------------------------------------------------+
17 */
18
19 /* $Id$ */
20
21 #include "php.h"
22 #include "php_ini.h"
23 #include "php_globals.h"
24 #include "php_pcre.h"
25 #include "ext/standard/info.h"
26 #include "ext/standard/basic_functions.h"
27 #include "zend_smart_str.h"
28
29 #if HAVE_PCRE || HAVE_BUNDLED_PCRE
30
31 #include "ext/standard/php_string.h"
32
/*
 * Result-ordering modes for global matching. The match code reads the
 * ordering from the low byte of the user-supplied flags (flags & 0xff).
 */
#define PREG_PATTERN_ORDER 1
#define PREG_SET_ORDER 2
/* Flag bit: report each capture as a (string, offset) pair. */
#define PREG_OFFSET_CAPTURE (1<<8)

/* Flag bits exported as PREG_SPLIT_* constants (their consumer is not in this part of the file). */
#define PREG_SPLIT_NO_EMPTY (1<<0)
#define PREG_SPLIT_DELIM_CAPTURE (1<<1)
#define PREG_SPLIT_OFFSET_CAPTURE (1<<2)

/* Internal option recorded in preg_options when the 'e' pattern modifier is seen. */
#define PREG_REPLACE_EVAL (1<<0)

/* Flag bit exported as PREG_GREP_INVERT (its consumer is not in this part of the file). */
#define PREG_GREP_INVERT (1<<0)

/* Number of compiled patterns kept in the cache before PCRE_CACHE_SIZE / 8 unreferenced entries are evicted. */
#define PCRE_CACHE_SIZE 4096

/* not fully functional workaround for libpcre < 8.0, see bug #70232 */
#ifndef PCRE_NOTEMPTY_ATSTART
# define PCRE_NOTEMPTY_ATSTART PCRE_NOTEMPTY
#endif

/*
 * Error codes stored in PCRE_G(error_code) and exported to userland as the
 * PREG_*_ERROR constants. The numeric values are part of the public
 * interface, so the order must not change.
 */
enum {
	PHP_PCRE_NO_ERROR = 0,
	PHP_PCRE_INTERNAL_ERROR,
	PHP_PCRE_BACKTRACK_LIMIT_ERROR,
	PHP_PCRE_RECURSION_LIMIT_ERROR,
	PHP_PCRE_BAD_UTF8_ERROR,
	PHP_PCRE_BAD_UTF8_OFFSET_ERROR,
	PHP_PCRE_JIT_STACKLIMIT_ERROR
};


/* Module globals: pattern cache, limits, last error code (and the jit switch when JIT is compiled in). */
PHPAPI ZEND_DECLARE_MODULE_GLOBALS(pcre)

#ifdef HAVE_PCRE_JIT_SUPPORT
/* Bounds passed to pcre_jit_stack_alloc() when the JIT stack is created. */
#define PCRE_JIT_STACK_MIN_SIZE (32 * 1024)
#define PCRE_JIT_STACK_MAX_SIZE (64 * 1024)
/* JIT stack, created on request start and released in the globals shutdown handler. */
ZEND_TLS pcre_jit_stack *jit_stack = NULL;
#endif
/*
 * Mutex helpers.
 *
 * In thread-safe (ZTS) builds a single mutex serialises the calls to
 * pcre_study() and pcre_jit_stack_alloc() made in this file. In non-ZTS
 * builds the helpers expand to nothing.
 *
 * Each helper is a single statement wrapped in do { } while (0) and must be
 * invoked with a trailing semicolon, so it is safe inside unbraced
 * if/else bodies.
 */
#if defined(ZTS)
static MUTEX_T pcre_mt = NULL;
#define php_pcre_mutex_alloc() \
	do { \
		if (!pcre_mt) { \
			pcre_mt = tsrm_mutex_alloc(); \
		} \
	} while (0)
#define php_pcre_mutex_free() \
	do { \
		if (pcre_mt) { \
			tsrm_mutex_free(pcre_mt); \
		} \
		pcre_mt = NULL; \
	} while (0)
#define php_pcre_mutex_lock() \
	do { \
		tsrm_mutex_lock(pcre_mt); \
	} while (0)
#define php_pcre_mutex_unlock() \
	do { \
		tsrm_mutex_unlock(pcre_mt); \
	} while (0)
#else
#define php_pcre_mutex_alloc() do { } while (0)
#define php_pcre_mutex_free() do { } while (0)
#define php_pcre_mutex_lock() do { } while (0)
#define php_pcre_mutex_unlock() do { } while (0)
#endif
82
pcre_handle_exec_error(int pcre_code)83 static void pcre_handle_exec_error(int pcre_code) /* {{{ */
84 {
85 int preg_code = 0;
86
87 switch (pcre_code) {
88 case PCRE_ERROR_MATCHLIMIT:
89 preg_code = PHP_PCRE_BACKTRACK_LIMIT_ERROR;
90 break;
91
92 case PCRE_ERROR_RECURSIONLIMIT:
93 preg_code = PHP_PCRE_RECURSION_LIMIT_ERROR;
94 break;
95
96 case PCRE_ERROR_BADUTF8:
97 preg_code = PHP_PCRE_BAD_UTF8_ERROR;
98 break;
99
100 case PCRE_ERROR_BADUTF8_OFFSET:
101 preg_code = PHP_PCRE_BAD_UTF8_OFFSET_ERROR;
102 break;
103
104 #ifdef HAVE_PCRE_JIT_SUPPORT
105 case PCRE_ERROR_JIT_STACKLIMIT:
106 preg_code = PHP_PCRE_JIT_STACKLIMIT_ERROR;
107 break;
108 #endif
109
110 default:
111 preg_code = PHP_PCRE_INTERNAL_ERROR;
112 break;
113 }
114
115 PCRE_G(error_code) = preg_code;
116 }
117 /* }}} */
118
/*
 * Destructor for entries of the compiled-pattern cache: releases the
 * compiled regex, its study data (if any), the locale tables (if any) and
 * finally the persistent entry itself.
 */
static void php_free_pcre_cache(zval *data) /* {{{ */
{
	pcre_cache_entry *entry = (pcre_cache_entry *) Z_PTR_P(data);

	if (entry == NULL) {
		return;
	}

	pcre_free(entry->re);

	if (entry->extra != NULL) {
		pcre_free_study(entry->extra);
	}

#if HAVE_SETLOCALE
	if (entry->tables != NULL) {
		pefree((void *)entry->tables, 1);
	}
#endif

	pefree(entry, 1);
}
/* }}} */
133
PHP_GINIT_FUNCTION(pcre)134 static PHP_GINIT_FUNCTION(pcre) /* {{{ */
135 {
136 zend_hash_init(&pcre_globals->pcre_cache, 0, NULL, php_free_pcre_cache, 1);
137 pcre_globals->backtrack_limit = 0;
138 pcre_globals->recursion_limit = 0;
139 pcre_globals->error_code = PHP_PCRE_NO_ERROR;
140 }
141 /* }}} */
142
PHP_GSHUTDOWN_FUNCTION(pcre)143 static PHP_GSHUTDOWN_FUNCTION(pcre) /* {{{ */
144 {
145 zend_hash_destroy(&pcre_globals->pcre_cache);
146
147 #ifdef HAVE_PCRE_JIT_SUPPORT
148 /* Stack may only be destroyed when no cached patterns
149 possibly associated with it do exist. */
150 if (jit_stack) {
151 pcre_jit_stack_free(jit_stack);
152 jit_stack = NULL;
153 }
154 #endif
155
156 }
157 /* }}} */
158
/*
 * INI settings of the extension. Each entry is bound to the field of the
 * same name in the module globals; all are registered with PHP_INI_ALL.
 *   pcre.backtrack_limit  default "1000000"
 *   pcre.recursion_limit  default "100000"
 *   pcre.jit              default "1" (only when JIT support is compiled in)
 */
PHP_INI_BEGIN()
	STD_PHP_INI_ENTRY("pcre.backtrack_limit", "1000000", PHP_INI_ALL, OnUpdateLong, backtrack_limit, zend_pcre_globals, pcre_globals)
	STD_PHP_INI_ENTRY("pcre.recursion_limit", "100000", PHP_INI_ALL, OnUpdateLong, recursion_limit, zend_pcre_globals, pcre_globals)
#ifdef HAVE_PCRE_JIT_SUPPORT
	STD_PHP_INI_ENTRY("pcre.jit", "1", PHP_INI_ALL, OnUpdateBool, jit, zend_pcre_globals, pcre_globals)
#endif
PHP_INI_END()
166
167
/* {{{ PHP_MINFO_FUNCTION(pcre)
 * Prints the extension's info table (support flag, library version, JIT
 * status) followed by its INI entries. */
static PHP_MINFO_FUNCTION(pcre)
{
	const char *jit_status;
#ifdef HAVE_PCRE_JIT_SUPPORT
	int jit_available = 0;
#endif

	php_info_print_table_start();
	php_info_print_table_row(2, "PCRE (Perl Compatible Regular Expressions) Support", "enabled");
	php_info_print_table_row(2, "PCRE Library Version", pcre_version());

#ifdef HAVE_PCRE_JIT_SUPPORT
	/* A non-zero result from pcre_config() means the query itself failed. */
	if (pcre_config(PCRE_CONFIG_JIT, &jit_available) != 0) {
		jit_status = "unknown";
	} else if (jit_available) {
		jit_status = "enabled";
	} else {
		jit_status = "disabled";
	}
#else
	jit_status = "not compiled in";
#endif
	php_info_print_table_row(2, "PCRE JIT Support", jit_status);

	php_info_print_table_end();

	DISPLAY_INI_ENTRIES();
}
/* }}} */
194
/* {{{ PHP_MINIT_FUNCTION(pcre)
 * Module startup: registers the INI entries, creates the mutex (a no-op in
 * non-ZTS builds) and exports the PREG_* flag / error constants plus
 * PCRE_VERSION. Always returns SUCCESS. */
static PHP_MINIT_FUNCTION(pcre)
{
	REGISTER_INI_ENTRIES();

	php_pcre_mutex_alloc();

	/* Matching / splitting / grep flags. */
	REGISTER_LONG_CONSTANT("PREG_PATTERN_ORDER", PREG_PATTERN_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SET_ORDER", PREG_SET_ORDER, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_OFFSET_CAPTURE", PREG_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_NO_EMPTY", PREG_SPLIT_NO_EMPTY, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_DELIM_CAPTURE", PREG_SPLIT_DELIM_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_SPLIT_OFFSET_CAPTURE", PREG_SPLIT_OFFSET_CAPTURE, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_GREP_INVERT", PREG_GREP_INVERT, CONST_CS | CONST_PERSISTENT);

	/* Error codes, i.e. the values stored in PCRE_G(error_code). */
	REGISTER_LONG_CONSTANT("PREG_NO_ERROR", PHP_PCRE_NO_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_INTERNAL_ERROR", PHP_PCRE_INTERNAL_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BACKTRACK_LIMIT_ERROR", PHP_PCRE_BACKTRACK_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_RECURSION_LIMIT_ERROR", PHP_PCRE_RECURSION_LIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_ERROR", PHP_PCRE_BAD_UTF8_ERROR, CONST_CS | CONST_PERSISTENT);
	REGISTER_LONG_CONSTANT("PREG_BAD_UTF8_OFFSET_ERROR", PHP_PCRE_BAD_UTF8_OFFSET_ERROR, CONST_CS | CONST_PERSISTENT);
	/* Registered unconditionally, even when JIT support is not compiled in. */
	REGISTER_LONG_CONSTANT("PREG_JIT_STACKLIMIT_ERROR", PHP_PCRE_JIT_STACKLIMIT_ERROR, CONST_CS | CONST_PERSISTENT);
	/* Version string as reported by the PCRE library via pcre_version(). */
	REGISTER_STRING_CONSTANT("PCRE_VERSION", (char *)pcre_version(), CONST_CS | CONST_PERSISTENT);

	return SUCCESS;
}
/* }}} */
222
/* {{{ PHP_MSHUTDOWN_FUNCTION(pcre)
 * Module shutdown: unregisters the INI entries and releases the mutex
 * created at module startup (a no-op in non-ZTS builds). Always returns
 * SUCCESS. */
static PHP_MSHUTDOWN_FUNCTION(pcre)
{
	UNREGISTER_INI_ENTRIES();

	php_pcre_mutex_free();

	return SUCCESS;
}
/* }}} */
233
#ifdef HAVE_PCRE_JIT_SUPPORT
/* {{{ PHP_RINIT_FUNCTION(pcre)
 * Request startup: creates the JIT stack the first time a request starts
 * with pcre.jit enabled. An existing stack is reused. Always returns
 * SUCCESS. */
static PHP_RINIT_FUNCTION(pcre)
{
	/* Nothing to do when JIT is switched off or the stack already exists. */
	if (!PCRE_G(jit) || jit_stack != NULL) {
		return SUCCESS;
	}

	php_pcre_mutex_lock();
	jit_stack = pcre_jit_stack_alloc(PCRE_JIT_STACK_MIN_SIZE, PCRE_JIT_STACK_MAX_SIZE);
	php_pcre_mutex_unlock();

	return SUCCESS;
}
/* }}} */
#endif
248
249 /* {{{ static pcre_clean_cache */
pcre_clean_cache(zval * data,void * arg)250 static int pcre_clean_cache(zval *data, void *arg)
251 {
252 pcre_cache_entry *pce = (pcre_cache_entry *) Z_PTR_P(data);
253 int *num_clean = (int *)arg;
254
255 if (*num_clean > 0 && !pce->refcount) {
256 (*num_clean)--;
257 return ZEND_HASH_APPLY_REMOVE;
258 } else {
259 return ZEND_HASH_APPLY_KEEP;
260 }
261 }
262 /* }}} */
263
264 /* {{{ static make_subpats_table */
make_subpats_table(int num_subpats,pcre_cache_entry * pce)265 static char **make_subpats_table(int num_subpats, pcre_cache_entry *pce)
266 {
267 pcre_extra *extra = pce->extra;
268 int name_cnt = pce->name_count, name_size, ni = 0;
269 int rc;
270 char *name_table;
271 unsigned short name_idx;
272 char **subpat_names;
273 int rc1, rc2;
274
275 rc1 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMETABLE, &name_table);
276 rc2 = pcre_fullinfo(pce->re, extra, PCRE_INFO_NAMEENTRYSIZE, &name_size);
277 rc = rc2 ? rc2 : rc1;
278 if (rc < 0) {
279 php_error_docref(NULL, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
280 return NULL;
281 }
282
283 subpat_names = (char **)ecalloc(num_subpats, sizeof(char *));
284 while (ni++ < name_cnt) {
285 name_idx = 0x100 * (unsigned char)name_table[0] + (unsigned char)name_table[1];
286 subpat_names[name_idx] = name_table + 2;
287 if (is_numeric_string(subpat_names[name_idx], strlen(subpat_names[name_idx]), NULL, NULL, 0) > 0) {
288 php_error_docref(NULL, E_WARNING, "Numeric named subpatterns are not allowed");
289 efree(subpat_names);
290 return NULL;
291 }
292 name_table += name_size;
293 }
294 return subpat_names;
295 }
296 /* }}} */
297
298 /* {{{ static calculate_unit_length */
299 /* Calculates the byte length of the next character. Assumes valid UTF-8 for PCRE_UTF8. */
calculate_unit_length(pcre_cache_entry * pce,char * start)300 static zend_always_inline int calculate_unit_length(pcre_cache_entry *pce, char *start)
301 {
302 int unit_len;
303
304 if (pce->compile_options & PCRE_UTF8) {
305 char *end = start;
306
307 /* skip continuation bytes */
308 while ((*++end & 0xC0) == 0x80);
309 unit_len = end - start;
310 } else {
311 unit_len = 1;
312 }
313 return unit_len;
314 }
315 /* }}} */
316
317 /* {{{ pcre_get_compiled_regex_cache
318 */
pcre_get_compiled_regex_cache(zend_string * regex)319 PHPAPI pcre_cache_entry* pcre_get_compiled_regex_cache(zend_string *regex)
320 {
321 pcre *re = NULL;
322 pcre_extra *extra;
323 int coptions = 0;
324 int soptions = 0;
325 const char *error;
326 int erroffset;
327 char delimiter;
328 char start_delimiter;
329 char end_delimiter;
330 char *p, *pp;
331 char *pattern;
332 int do_study = 0;
333 int poptions = 0;
334 unsigned const char *tables = NULL;
335 pcre_cache_entry *pce;
336 pcre_cache_entry new_entry;
337 int rc;
338 zend_string *key;
339
340 #if HAVE_SETLOCALE
341 if (BG(locale_string) &&
342 (ZSTR_LEN(BG(locale_string)) != 1 && ZSTR_VAL(BG(locale_string))[0] != 'C')) {
343 key = zend_string_alloc(ZSTR_LEN(regex) + ZSTR_LEN(BG(locale_string)) + 1, 0);
344 memcpy(ZSTR_VAL(key), ZSTR_VAL(BG(locale_string)), ZSTR_LEN(BG(locale_string)) + 1);
345 memcpy(ZSTR_VAL(key) + ZSTR_LEN(BG(locale_string)), ZSTR_VAL(regex), ZSTR_LEN(regex) + 1);
346 } else
347 #endif
348 {
349 key = regex;
350 }
351
352 /* Try to lookup the cached regex entry, and if successful, just pass
353 back the compiled pattern, otherwise go on and compile it. */
354 pce = zend_hash_find_ptr(&PCRE_G(pcre_cache), key);
355 if (pce) {
356 #if HAVE_SETLOCALE
357 if (key != regex) {
358 zend_string_release(key);
359 }
360 #endif
361 return pce;
362 }
363
364 p = ZSTR_VAL(regex);
365
366 /* Parse through the leading whitespace, and display a warning if we
367 get to the end without encountering a delimiter. */
368 while (isspace((int)*(unsigned char *)p)) p++;
369 if (*p == 0) {
370 #if HAVE_SETLOCALE
371 if (key != regex) {
372 zend_string_release(key);
373 }
374 #endif
375 php_error_docref(NULL, E_WARNING,
376 p < ZSTR_VAL(regex) + ZSTR_LEN(regex) ? "Null byte in regex" : "Empty regular expression");
377 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
378 return NULL;
379 }
380
381 /* Get the delimiter and display a warning if it is alphanumeric
382 or a backslash. */
383 delimiter = *p++;
384 if (isalnum((int)*(unsigned char *)&delimiter) || delimiter == '\\') {
385 #if HAVE_SETLOCALE
386 if (key != regex) {
387 zend_string_release(key);
388 }
389 #endif
390 php_error_docref(NULL,E_WARNING, "Delimiter must not be alphanumeric or backslash");
391 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
392 return NULL;
393 }
394
395 start_delimiter = delimiter;
396 if ((pp = strchr("([{< )]}> )]}>", delimiter)))
397 delimiter = pp[5];
398 end_delimiter = delimiter;
399
400 pp = p;
401
402 if (start_delimiter == end_delimiter) {
403 /* We need to iterate through the pattern, searching for the ending delimiter,
404 but skipping the backslashed delimiters. If the ending delimiter is not
405 found, display a warning. */
406 while (*pp != 0) {
407 if (*pp == '\\' && pp[1] != 0) pp++;
408 else if (*pp == delimiter)
409 break;
410 pp++;
411 }
412 } else {
413 /* We iterate through the pattern, searching for the matching ending
414 * delimiter. For each matching starting delimiter, we increment nesting
415 * level, and decrement it for each matching ending delimiter. If we
416 * reach the end of the pattern without matching, display a warning.
417 */
418 int brackets = 1; /* brackets nesting level */
419 while (*pp != 0) {
420 if (*pp == '\\' && pp[1] != 0) pp++;
421 else if (*pp == end_delimiter && --brackets <= 0)
422 break;
423 else if (*pp == start_delimiter)
424 brackets++;
425 pp++;
426 }
427 }
428
429 if (*pp == 0) {
430 #if HAVE_SETLOCALE
431 if (key != regex) {
432 zend_string_release(key);
433 }
434 #endif
435 if (pp < ZSTR_VAL(regex) + ZSTR_LEN(regex)) {
436 php_error_docref(NULL,E_WARNING, "Null byte in regex");
437 } else if (start_delimiter == end_delimiter) {
438 php_error_docref(NULL,E_WARNING, "No ending delimiter '%c' found", delimiter);
439 } else {
440 php_error_docref(NULL,E_WARNING, "No ending matching delimiter '%c' found", delimiter);
441 }
442 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
443 return NULL;
444 }
445
446 /* Make a copy of the actual pattern. */
447 pattern = estrndup(p, pp-p);
448
449 /* Move on to the options */
450 pp++;
451
452 /* Parse through the options, setting appropriate flags. Display
453 a warning if we encounter an unknown modifier. */
454 while (pp < ZSTR_VAL(regex) + ZSTR_LEN(regex)) {
455 switch (*pp++) {
456 /* Perl compatible options */
457 case 'i': coptions |= PCRE_CASELESS; break;
458 case 'm': coptions |= PCRE_MULTILINE; break;
459 case 's': coptions |= PCRE_DOTALL; break;
460 case 'x': coptions |= PCRE_EXTENDED; break;
461
462 /* PCRE specific options */
463 case 'A': coptions |= PCRE_ANCHORED; break;
464 case 'D': coptions |= PCRE_DOLLAR_ENDONLY;break;
465 case 'S': do_study = 1; break;
466 case 'U': coptions |= PCRE_UNGREEDY; break;
467 case 'X': coptions |= PCRE_EXTRA; break;
468 case 'u': coptions |= PCRE_UTF8;
469 /* In PCRE, by default, \d, \D, \s, \S, \w, and \W recognize only ASCII
470 characters, even in UTF-8 mode. However, this can be changed by setting
471 the PCRE_UCP option. */
472 #ifdef PCRE_UCP
473 coptions |= PCRE_UCP;
474 #endif
475 break;
476 case 'J': coptions |= PCRE_DUPNAMES; break;
477
478 /* Custom preg options */
479 case 'e': poptions |= PREG_REPLACE_EVAL; break;
480
481 case ' ':
482 case '\n':
483 break;
484
485 default:
486 if (pp[-1]) {
487 php_error_docref(NULL,E_WARNING, "Unknown modifier '%c'", pp[-1]);
488 } else {
489 php_error_docref(NULL,E_WARNING, "Null byte in regex");
490 }
491 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
492 efree(pattern);
493 #if HAVE_SETLOCALE
494 if (key != regex) {
495 zend_string_release(key);
496 }
497 #endif
498 return NULL;
499 }
500 }
501
502 #if HAVE_SETLOCALE
503 if (key != regex) {
504 tables = pcre_maketables();
505 }
506 #endif
507
508 /* Compile pattern and display a warning if compilation failed. */
509 re = pcre_compile(pattern,
510 coptions,
511 &error,
512 &erroffset,
513 tables);
514
515 if (re == NULL) {
516 #if HAVE_SETLOCALE
517 if (key != regex) {
518 zend_string_release(key);
519 }
520 #endif
521 php_error_docref(NULL,E_WARNING, "Compilation failed: %s at offset %d", error, erroffset);
522 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
523 efree(pattern);
524 if (tables) {
525 pefree((void*)tables, 1);
526 }
527 return NULL;
528 }
529
530 #ifdef HAVE_PCRE_JIT_SUPPORT
531 if (PCRE_G(jit)) {
532 /* Enable PCRE JIT compiler */
533 do_study = 1;
534 soptions |= PCRE_STUDY_JIT_COMPILE;
535 }
536 #endif
537
538 /* If study option was specified, study the pattern and
539 store the result in extra for passing to pcre_exec. */
540 if (do_study) {
541 php_pcre_mutex_lock();
542 extra = pcre_study(re, soptions, &error);
543 php_pcre_mutex_unlock();
544 if (extra) {
545 extra->flags |= PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
546 extra->match_limit = (unsigned long)PCRE_G(backtrack_limit);
547 extra->match_limit_recursion = (unsigned long)PCRE_G(recursion_limit);
548 #ifdef HAVE_PCRE_JIT_SUPPORT
549 if (PCRE_G(jit) && jit_stack) {
550 pcre_assign_jit_stack(extra, NULL, jit_stack);
551 }
552 #endif
553 }
554 if (error != NULL) {
555 php_error_docref(NULL, E_WARNING, "Error while studying pattern");
556 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
557 }
558 } else {
559 extra = NULL;
560 }
561
562 efree(pattern);
563
564 /*
565 * If we reached cache limit, clean out the items from the head of the list;
566 * these are supposedly the oldest ones (but not necessarily the least used
567 * ones).
568 */
569 if (!pce && zend_hash_num_elements(&PCRE_G(pcre_cache)) == PCRE_CACHE_SIZE) {
570 int num_clean = PCRE_CACHE_SIZE / 8;
571 zend_hash_apply_with_argument(&PCRE_G(pcre_cache), pcre_clean_cache, &num_clean);
572 }
573
574 /* Store the compiled pattern and extra info in the cache. */
575 new_entry.re = re;
576 new_entry.extra = extra;
577 new_entry.preg_options = poptions;
578 new_entry.compile_options = coptions;
579 #if HAVE_SETLOCALE
580 new_entry.locale = NULL;
581 new_entry.tables = tables;
582 #endif
583 new_entry.refcount = 0;
584
585 rc = pcre_fullinfo(re, extra, PCRE_INFO_CAPTURECOUNT, &new_entry.capture_count);
586 if (rc < 0) {
587 #if HAVE_SETLOCALE
588 if (key != regex) {
589 zend_string_release(key);
590 }
591 #endif
592 php_error_docref(NULL, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
593 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
594 return NULL;
595 }
596
597 rc = pcre_fullinfo(re, extra, PCRE_INFO_NAMECOUNT, &new_entry.name_count);
598 if (rc < 0) {
599 #if HAVE_SETLOCALE
600 if (key != regex) {
601 zend_string_release(key);
602 }
603 #endif
604 php_error_docref(NULL, E_WARNING, "Internal pcre_fullinfo() error %d", rc);
605 pcre_handle_exec_error(PCRE_ERROR_INTERNAL);
606 return NULL;
607 }
608
609 /*
610 * Interned strings are not duplicated when stored in HashTable,
611 * but all the interned strings created during HTTP request are removed
612 * at end of request. However PCRE_G(pcre_cache) must be consistent
613 * on the next request as well. So we disable usage of interned strings
614 * as hash keys especually for this table.
615 * See bug #63180
616 */
617 if (!ZSTR_IS_INTERNED(key) || !(GC_FLAGS(key) & IS_STR_PERMANENT)) {
618 pce = zend_hash_str_update_mem(&PCRE_G(pcre_cache),
619 ZSTR_VAL(key), ZSTR_LEN(key), &new_entry, sizeof(pcre_cache_entry));
620 #if HAVE_SETLOCALE
621 if (key != regex) {
622 zend_string_release(key);
623 }
624 #endif
625 } else {
626 pce = zend_hash_update_mem(&PCRE_G(pcre_cache), key, &new_entry, sizeof(pcre_cache_entry));
627 }
628
629 return pce;
630 }
631 /* }}} */
632
633 /* {{{ pcre_get_compiled_regex
634 */
pcre_get_compiled_regex(zend_string * regex,pcre_extra ** extra,int * preg_options)635 PHPAPI pcre* pcre_get_compiled_regex(zend_string *regex, pcre_extra **extra, int *preg_options)
636 {
637 pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
638
639 if (extra) {
640 *extra = pce ? pce->extra : NULL;
641 }
642 if (preg_options) {
643 *preg_options = pce ? pce->preg_options : 0;
644 }
645
646 return pce ? pce->re : NULL;
647 }
648 /* }}} */
649
650 /* {{{ pcre_get_compiled_regex_ex
651 */
pcre_get_compiled_regex_ex(zend_string * regex,pcre_extra ** extra,int * preg_options,int * compile_options)652 PHPAPI pcre* pcre_get_compiled_regex_ex(zend_string *regex, pcre_extra **extra, int *preg_options, int *compile_options)
653 {
654 pcre_cache_entry * pce = pcre_get_compiled_regex_cache(regex);
655
656 if (extra) {
657 *extra = pce ? pce->extra : NULL;
658 }
659 if (preg_options) {
660 *preg_options = pce ? pce->preg_options : 0;
661 }
662 if (compile_options) {
663 *compile_options = pce ? pce->compile_options : 0;
664 }
665
666 return pce ? pce->re : NULL;
667 }
668 /* }}} */
669
670 /* {{{ add_offset_pair */
add_offset_pair(zval * result,char * str,int len,int offset,char * name)671 static inline void add_offset_pair(zval *result, char *str, int len, int offset, char *name)
672 {
673 zval match_pair, tmp;
674
675 array_init_size(&match_pair, 2);
676
677 /* Add (match, offset) to the return value */
678 ZVAL_STRINGL(&tmp, str, len);
679 zend_hash_next_index_insert_new(Z_ARRVAL(match_pair), &tmp);
680 ZVAL_LONG(&tmp, offset);
681 zend_hash_next_index_insert_new(Z_ARRVAL(match_pair), &tmp);
682
683 if (name) {
684 Z_ADDREF(match_pair);
685 zend_hash_str_update(Z_ARRVAL_P(result), name, strlen(name), &match_pair);
686 }
687 zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair);
688 }
689 /* }}} */
690
/*
 * Shared entry point for the matching functions: parses
 * (pattern, subject [, &subpatterns [, flags [, offset]]]), fetches the
 * compiled pattern and delegates to php_pcre_match_impl(). `global`
 * selects global matching. Returns FALSE when argument parsing fails, the
 * subject length does not fit in an int, or the pattern cannot be compiled.
 */
static void php_do_pcre_match(INTERNAL_FUNCTION_PARAMETERS, int global) /* {{{ */
{
	zend_string *pattern;            /* Regular expression */
	zend_string *haystack;           /* String to match against */
	zval *captures = NULL;           /* Array for subpatterns */
	zend_long match_flags = 0;       /* Match control flags */
	zend_long offset = 0;            /* Where the search starts */
	pcre_cache_entry *entry;         /* Compiled regular expression */

	ZEND_PARSE_PARAMETERS_START(2, 5)
		Z_PARAM_STR(pattern)
		Z_PARAM_STR(haystack)
		Z_PARAM_OPTIONAL
		Z_PARAM_ZVAL_EX(captures, 0, 1)
		Z_PARAM_LONG(match_flags)
		Z_PARAM_LONG(offset)
	ZEND_PARSE_PARAMETERS_END_EX(RETURN_FALSE);

	/* The matcher takes the subject length as an int. */
	if (ZEND_SIZE_T_INT_OVFL(ZSTR_LEN(haystack))) {
		php_error_docref(NULL, E_WARNING, "Subject is too long");
		RETURN_FALSE;
	}

	/* Compile regex or get it from cache. */
	entry = pcre_get_compiled_regex_cache(pattern);
	if (entry == NULL) {
		RETURN_FALSE;
	}

	/* Hold a reference for the duration of the match so the cache cleaner
	   skips this entry. */
	entry->refcount++;
	php_pcre_match_impl(entry, ZSTR_VAL(haystack), (int)ZSTR_LEN(haystack), return_value, captures,
		global, ZEND_NUM_ARGS() >= 4, match_flags, offset);
	entry->refcount--;
}
/* }}} */
726
/* {{{ php_pcre_match_impl()
 *
 * Runs the compiled pattern `pce` against `subject` (subject_len bytes),
 * once or repeatedly when `global` is non-zero.
 *
 * pce          - compiled pattern / cache entry
 * subject      - subject buffer
 * subject_len  - length of the subject in bytes
 * return_value - receives the number of matches, or FALSE on error
 * subpats      - optional; when non-NULL its previous value is destroyed
 *                and replaced by an array that receives the captures
 * global       - non-zero: keep matching until the subject is exhausted
 * use_flags    - non-zero: honour `flags`, otherwise `flags` is ignored
 * flags        - low byte: result ordering (PREG_PATTERN_ORDER or
 *                PREG_SET_ORDER, global mode only); PREG_OFFSET_CAPTURE
 *                requests (string, offset) pairs
 * start_offset - byte offset where matching starts; negative values count
 *                from the end of the subject and are clamped to 0
 *
 * Note: when the flags are invalid the function emits a warning and returns
 * without assigning return_value.
 */
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
	zval *subpats, int global, int use_flags, zend_long flags, zend_long start_offset)
{
	zval			 result_set,		/* Holds a set of subpatterns after
										   a global match */
					*match_sets = NULL;	/* An array of sets of matches for each
										   subpattern after a global match */
	pcre_extra		*extra = pce->extra;/* Holds results of studying */
	pcre_extra		 extra_data;		/* Used locally for exec options */
	int				 exoptions = 0;		/* Execution options */
	int				 count = 0;			/* Count of matched subpatterns */
	int				*offsets;			/* Array of subpattern offsets */
	int				 num_subpats;		/* Number of captured subpatterns */
	int				 size_offsets;		/* Size of the offsets array */
	int				 matched;			/* Has anything matched */
	int				 g_notempty = 0;	/* If the match should not be empty */
	const char	   **stringlist;		/* Holds list of subpatterns */
	char 		   **subpat_names;		/* Array for named subpatterns */
	int				 i;
	int				 subpats_order;		/* Order of subpattern matches */
	int				 offset_capture;    /* Capture match offsets: yes/no */
	unsigned char   *mark = NULL;       /* Target for MARK name */
	zval            marks;      		/* Array of marks for PREG_PATTERN_ORDER */
	ALLOCA_FLAG(use_heap);

	/* marks is created lazily, only once a MARK is actually reported. */
	ZVAL_UNDEF(&marks);

	/* Overwrite the passed-in value for subpatterns with an empty array. */
	if (subpats != NULL) {
		zval_ptr_dtor(subpats);
		array_init(subpats);
	}

	subpats_order = global ? PREG_PATTERN_ORDER : 0;

	if (use_flags) {
		offset_capture = flags & PREG_OFFSET_CAPTURE;

		/*
		 * subpats_order is pre-set to pattern mode so we change it only if
		 * necessary.
		 */
		if (flags & 0xff) {
			subpats_order = flags & 0xff;
		}
		/* Ordering flags are only valid for global matching, and only the
		   two defined orderings are accepted there. */
		if ((global && (subpats_order < PREG_PATTERN_ORDER || subpats_order > PREG_SET_ORDER)) ||
			(!global && subpats_order != 0)) {
			php_error_docref(NULL, E_WARNING, "Invalid flags specified");
			return;
		}
	} else {
		offset_capture = 0;
	}

	/* Negative offset counts from the end of the string. */
	if (start_offset < 0) {
		start_offset = subject_len + start_offset;
		if (start_offset < 0) {
			start_offset = 0;
		}
	}

	/* Patterns without study data get a local pcre_extra so the limits
	   below can still be passed to pcre_exec(). */
	if (extra == NULL) {
		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
		extra = &extra_data;
	}
	/* Refresh the limits on every call from the current global settings. */
	extra->match_limit = (unsigned long)PCRE_G(backtrack_limit);
	extra->match_limit_recursion = (unsigned long)PCRE_G(recursion_limit);
#ifdef PCRE_EXTRA_MARK
	extra->mark = &mark;
	extra->flags |= PCRE_EXTRA_MARK;
#endif

	/* Calculate the size of the offsets array, and allocate memory for it. */
	num_subpats = pce->capture_count + 1;
	size_offsets = num_subpats * 3;

	/*
	 * Build a mapping from subpattern numbers to their names. We will
	 * allocate the table only if there are any named subpatterns.
	 */
	subpat_names = NULL;
	if (pce->name_count > 0) {
		subpat_names = make_subpats_table(num_subpats, pce);
		if (!subpat_names) {
			RETURN_FALSE;
		}
	}

	/* Small offset vectors go through do_alloca(), larger ones through
	   emalloc; the release code below must use the same size test. */
	if (size_offsets <= 32) {
		offsets = (int *)do_alloca(size_offsets * sizeof(int), use_heap);
	} else {
		offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
	}
	memset(offsets, 0, size_offsets*sizeof(int));
	/* Allocate match sets array and initialize the values. */
	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
		match_sets = (zval *)safe_emalloc(num_subpats, sizeof(zval), 0);
		for (i=0; i<num_subpats; i++) {
			array_init(&match_sets[i]);
		}
	}

	matched = 0;
	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	do {
		/* Execute the regular expression. */
		count = pcre_exec(pce->re, extra, subject, (int)subject_len, (int)start_offset,
						  exoptions|g_notempty, offsets, size_offsets);

		/* the string was already proved to be valid UTF-8 */
		exoptions |= PCRE_NO_UTF8_CHECK;

		/* Check for too many substrings condition. */
		if (count == 0) {
			php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
			count = size_offsets/3;
		}

		/* If something has matched */
		if (count > 0) {
			matched++;

			/* If subpatterns array has been passed, fill it in with values. */
			if (subpats != NULL) {
				/* Try to get the list of substrings and display a warning if failed. */
				if ((offsets[1] - offsets[0] < 0) || pcre_get_substring_list(subject, offsets, count, &stringlist) < 0) {
					if (subpat_names) {
						efree(subpat_names);
					}
					if (size_offsets <= 32) {
						free_alloca(offsets, use_heap);
					} else {
						efree(offsets);
					}
					/* NOTE(review): only the match_sets vector is freed here; the
					   arrays created inside it and `marks` are not destroyed on
					   this path -- confirm whether that is intended. */
					if (match_sets) efree(match_sets);
					php_error_docref(NULL, E_WARNING, "Get subpatterns list failed");
					RETURN_FALSE;
				}

				if (global) {	/* global pattern matching */
					if (subpats && subpats_order == PREG_PATTERN_ORDER) {
						/* For each subpattern, insert it into the appropriate array. */
						if (offset_capture) {
							for (i = 0; i < count; i++) {
								add_offset_pair(&match_sets[i], (char *)stringlist[i],
												offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
							}
						} else {
							for (i = 0; i < count; i++) {
								add_next_index_stringl(&match_sets[i], (char *)stringlist[i],
													   offsets[(i<<1)+1] - offsets[i<<1]);
							}
						}
						/* Add MARK, if available */
						if (mark) {
							if (Z_TYPE(marks) == IS_UNDEF) {
								array_init(&marks);
							}
							/* Indexed by the zero-based number of this match. */
							add_index_string(&marks, matched - 1, (char *) mark);
						}
						/*
						 * If the number of captured subpatterns on this run is
						 * less than the total possible number, pad the result
						 * arrays with empty strings.
						 */
						if (count < num_subpats) {
							/* i continues from `count`, where the loops above stopped. */
							for (; i < num_subpats; i++) {
								add_next_index_string(&match_sets[i], "");
							}
						}
					} else {
						/* Allocate the result set array */
						array_init_size(&result_set, count + (mark ? 1 : 0));

						/* Add all the subpatterns to it */
						if (subpat_names) {
							if (offset_capture) {
								for (i = 0; i < count; i++) {
									add_offset_pair(&result_set, (char *)stringlist[i],
													offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], subpat_names[i]);
								}
							} else {
								for (i = 0; i < count; i++) {
									if (subpat_names[i]) {
										add_assoc_stringl(&result_set, subpat_names[i], (char *)stringlist[i],
															   offsets[(i<<1)+1] - offsets[i<<1]);
									}
									add_next_index_stringl(&result_set, (char *)stringlist[i],
														   offsets[(i<<1)+1] - offsets[i<<1]);
								}
							}
						} else {
							if (offset_capture) {
								for (i = 0; i < count; i++) {
									add_offset_pair(&result_set, (char *)stringlist[i],
													offsets[(i<<1)+1] - offsets[i<<1], offsets[i<<1], NULL);
								}
							} else {
								for (i = 0; i < count; i++) {
									add_next_index_stringl(&result_set, (char *)stringlist[i],
														   offsets[(i<<1)+1] - offsets[i<<1]);
								}
							}
						}
						/* Add MARK, if available */
						if (mark) {
							add_assoc_string_ex(&result_set, "MARK", sizeof("MARK") - 1, (char *)mark);
						}
						/* And add it to the output array */
						zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set);
					}
				} else {			/* single pattern matching */
					/* For each subpattern, insert it into the subpatterns array. */
					if (subpat_names) {
						if (offset_capture) {
							for (i = 0; i < count; i++) {
								add_offset_pair(subpats, (char *)stringlist[i],
												offsets[(i<<1)+1] - offsets[i<<1],
												offsets[i<<1], subpat_names[i]);
							}
						} else {
							for (i = 0; i < count; i++) {
								if (subpat_names[i]) {
									add_assoc_stringl(subpats, subpat_names[i], (char *)stringlist[i],
													  offsets[(i<<1)+1] - offsets[i<<1]);
								}
								add_next_index_stringl(subpats, (char *)stringlist[i],
													   offsets[(i<<1)+1] - offsets[i<<1]);
							}
						}
					} else {
						if (offset_capture) {
							for (i = 0; i < count; i++) {
								add_offset_pair(subpats, (char *)stringlist[i],
												offsets[(i<<1)+1] - offsets[i<<1],
												offsets[i<<1], NULL);
							}
						} else {
							for (i = 0; i < count; i++) {
								add_next_index_stringl(subpats, (char *)stringlist[i],
													   offsets[(i<<1)+1] - offsets[i<<1]);
							}
						}
					}
					/* Add MARK, if available */
					if (mark) {
						add_assoc_string_ex(subpats, "MARK", sizeof("MARK") - 1, (char *)mark);
					}
				}

				pcre_free((void *) stringlist);
			}
		} else if (count == PCRE_ERROR_NOMATCH) {
			/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
			   this is not necessarily the end. We need to advance
			   the start offset, and continue. Fudge the offset values
			   to achieve this, unless we're already at the end of the string. */
			if (g_notempty != 0 && start_offset < subject_len) {
				int unit_len = calculate_unit_length(pce, subject + start_offset);

				offsets[0] = (int)start_offset;
				offsets[1] = (int)(start_offset + unit_len);
			} else
				break;
		} else {
			/* Any other negative result is an error: record it and stop. */
			pcre_handle_exec_error(count);
			break;
		}

		/* If we have matched an empty string, mimic what Perl's /g options does.
		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
		   the match again at the same point. If this fails (picked up above) we
		   advance to the next character. */
		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;

		/* Advance to the position right after the last full match */
		start_offset = offsets[1];
	} while (global);

	/* Add the match sets to the output array and clean up */
	if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
		if (subpat_names) {
			for (i = 0; i < num_subpats; i++) {
				/* Named groups are stored twice: under the name and under the
				   numeric index, hence the extra reference. */
				if (subpat_names[i]) {
					zend_hash_str_update(Z_ARRVAL_P(subpats), subpat_names[i],
									 strlen(subpat_names[i]), &match_sets[i]);
					Z_ADDREF(match_sets[i]);
				}
				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
			}
		} else {
			for (i = 0; i < num_subpats; i++) {
				zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
			}
		}
		efree(match_sets);

		if (Z_TYPE(marks) != IS_UNDEF) {
			add_assoc_zval(subpats, "MARK", &marks);
		}
	}

	/* Release the offset vector with the allocator that created it. */
	if (size_offsets <= 32) {
		free_alloca(offsets, use_heap);
	} else {
		efree(offsets);
	}
	if (subpat_names) {
		efree(subpat_names);
	}

	/* Did we encounter an error? */
	if (PCRE_G(error_code) == PHP_PCRE_NO_ERROR) {
		RETVAL_LONG(matched);
	} else {
		RETVAL_FALSE;
	}
}
/* }}} */
1049
/* {{{ proto int preg_match(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style regular expression match.

   Thin entry point: all userland arguments are forwarded untouched to
   php_do_pcre_match(); the trailing 0 is presumably the "global" switch
   (preg_match_all passes 1 in the same position) -- confirm against
   php_do_pcre_match(). */
static PHP_FUNCTION(preg_match)
{
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0);
}
/* }}} */
1057
/* {{{ proto int preg_match_all(string pattern, string subject [, array &subpatterns [, int flags [, int offset]]])
   Perform a Perl-style global regular expression match.

   Thin entry point: all userland arguments are forwarded untouched to
   php_do_pcre_match(); the trailing 1 is presumably the "global" switch
   (preg_match passes 0 in the same position) -- confirm against
   php_do_pcre_match(). */
static PHP_FUNCTION(preg_match_all)
{
	php_do_pcre_match(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1);
}
/* }}} */
1065
/* {{{ preg_get_backref
 *
 * Try to read a back-reference of the form <c>N, <c>NN, ${N} or ${NN} at *str,
 * where <c> is the introducer character the caller has already recognised
 * (this function does not verify it, apart from requiring '$' for the braced
 * form).
 *
 * On success returns 1, stores the one- or two-digit group number in *backref
 * and advances *str past the reference.  On failure returns 0 and leaves *str
 * untouched; note that *backref may already have been overwritten when the
 * failure is a missing closing brace.
 */
static int preg_get_backref(char **str, int *backref)
{
	char *start = *str;
	char *cursor;
	int braced;

	/* An introducer that is the last character cannot start a reference. */
	if (start[1] == '\0') {
		return 0;
	}

	/* Only "${" opens the braced form; everything else skips one character. */
	braced = (start[0] == '$' && start[1] == '{');
	cursor = start + (braced ? 2 : 1);

	/* First digit is mandatory. */
	if (cursor[0] < '0' || cursor[0] > '9') {
		return 0;
	}
	*backref = cursor[0] - '0';
	cursor++;

	/* Second digit is optional; a third digit is never consumed. */
	if (cursor[0] >= '0' && cursor[0] <= '9') {
		*backref = *backref * 10 + (cursor[0] - '0');
		cursor++;
	}

	/* The braced form must be closed right after the digits. */
	if (braced) {
		if (cursor[0] != '}') {
			return 0;
		}
		cursor++;
	}

	*str = cursor;
	return 1;
}
/* }}} */
1104
1105 /* {{{ preg_do_repl_func
1106 */
preg_do_repl_func(zval * function,char * subject,int * offsets,char ** subpat_names,int count,unsigned char * mark)1107 static zend_string *preg_do_repl_func(zval *function, char *subject, int *offsets, char **subpat_names, int count, unsigned char *mark)
1108 {
1109 zend_string *result_str;
1110 zval retval; /* Function return value */
1111 zval args[1]; /* Argument to pass to function */
1112 int i;
1113
1114 array_init_size(&args[0], count + (mark ? 1 : 0));
1115 if (subpat_names) {
1116 for (i = 0; i < count; i++) {
1117 if (subpat_names[i]) {
1118 add_assoc_stringl(&args[0], subpat_names[i], &subject[offsets[i<<1]] , offsets[(i<<1)+1] - offsets[i<<1]);
1119 }
1120 add_next_index_stringl(&args[0], &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1]);
1121 }
1122 } else {
1123 for (i = 0; i < count; i++) {
1124 add_next_index_stringl(&args[0], &subject[offsets[i<<1]], offsets[(i<<1)+1] - offsets[i<<1]);
1125 }
1126 }
1127 if (mark) {
1128 add_assoc_string(&args[0], "MARK", (char *) mark);
1129 }
1130
1131 if (call_user_function_ex(EG(function_table), NULL, function, &retval, 1, args, 0, NULL) == SUCCESS && Z_TYPE(retval) != IS_UNDEF) {
1132 result_str = zval_get_string(&retval);
1133 zval_ptr_dtor(&retval);
1134 } else {
1135 if (!EG(exception)) {
1136 php_error_docref(NULL, E_WARNING, "Unable to call custom replacement function");
1137 }
1138
1139 result_str = zend_string_init(&subject[offsets[0]], offsets[1] - offsets[0], 0);
1140 }
1141
1142 zval_ptr_dtor(&args[0]);
1143
1144 return result_str;
1145 }
1146 /* }}} */
1147
1148 /* {{{ php_pcre_replace
1149 */
php_pcre_replace(zend_string * regex,zend_string * subject_str,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int limit,int * replace_count)1150 PHPAPI zend_string *php_pcre_replace(zend_string *regex,
1151 zend_string *subject_str,
1152 char *subject, int subject_len,
1153 zval *replace_val, int is_callable_replace,
1154 int limit, int *replace_count)
1155 {
1156 pcre_cache_entry *pce; /* Compiled regular expression */
1157 zend_string *result; /* Function result */
1158
1159 /* Compile regex or get it from cache. */
1160 if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1161 return NULL;
1162 }
1163 pce->refcount++;
1164 result = php_pcre_replace_impl(pce, subject_str, subject, subject_len, replace_val,
1165 is_callable_replace, limit, replace_count);
1166 pce->refcount--;
1167
1168 return result;
1169 }
1170 /* }}} */
1171
1172 /* {{{ php_pcre_replace_impl() */
php_pcre_replace_impl(pcre_cache_entry * pce,zend_string * subject_str,char * subject,int subject_len,zval * replace_val,int is_callable_replace,int limit,int * replace_count)1173 PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *subject_str, char *subject, int subject_len, zval *replace_val, int is_callable_replace, int limit, int *replace_count)
1174 {
1175 pcre_extra *extra = pce->extra;/* Holds results of studying */
1176 pcre_extra extra_data; /* Used locally for exec options */
1177 int exoptions = 0; /* Execution options */
1178 int count = 0; /* Count of matched subpatterns */
1179 int *offsets; /* Array of subpattern offsets */
1180 char **subpat_names; /* Array for named subpatterns */
1181 int num_subpats; /* Number of captured subpatterns */
1182 int size_offsets; /* Size of the offsets array */
1183 size_t new_len; /* Length of needed storage */
1184 size_t alloc_len; /* Actual allocated length */
1185 int match_len; /* Length of the current match */
1186 int backref; /* Backreference number */
1187 int start_offset; /* Where the new search starts */
1188 int g_notempty=0; /* If the match should not be empty */
1189 char *replace=NULL, /* Replacement string */
1190 *walkbuf, /* Location of current replacement in the result */
1191 *walk, /* Used to walk the replacement string */
1192 *match, /* The current match */
1193 *piece, /* The current piece of subject */
1194 *replace_end=NULL, /* End of replacement string */
1195 walk_last; /* Last walked character */
1196 size_t result_len; /* Length of result */
1197 unsigned char *mark = NULL; /* Target for MARK name */
1198 zend_string *result; /* Result of replacement */
1199 zend_string *eval_result=NULL; /* Result of custom function */
1200
1201 ALLOCA_FLAG(use_heap);
1202
1203 if (extra == NULL) {
1204 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
1205 extra = &extra_data;
1206 }
1207
1208 extra->match_limit = (unsigned long)PCRE_G(backtrack_limit);
1209 extra->match_limit_recursion = (unsigned long)PCRE_G(recursion_limit);
1210
1211 if (UNEXPECTED(pce->preg_options & PREG_REPLACE_EVAL)) {
1212 php_error_docref(NULL, E_WARNING, "The /e modifier is no longer supported, use preg_replace_callback instead");
1213 return NULL;
1214 }
1215
1216 if (!is_callable_replace) {
1217 replace = Z_STRVAL_P(replace_val);
1218 replace_end = replace + Z_STRLEN_P(replace_val);
1219 }
1220
1221 /* Calculate the size of the offsets array, and allocate memory for it. */
1222 num_subpats = pce->capture_count + 1;
1223 size_offsets = num_subpats * 3;
1224 if (size_offsets <= 32) {
1225 offsets = (int *)do_alloca(size_offsets * sizeof(int), use_heap);
1226 } else {
1227 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
1228 }
1229
1230 /*
1231 * Build a mapping from subpattern numbers to their names. We will
1232 * allocate the table only if there are any named subpatterns.
1233 */
1234 subpat_names = NULL;
1235 if (UNEXPECTED(pce->name_count > 0)) {
1236 subpat_names = make_subpats_table(num_subpats, pce);
1237 if (!subpat_names) {
1238 if (size_offsets <= 32) {
1239 free_alloca(offsets, use_heap);
1240 } else {
1241 efree(offsets);
1242 }
1243 return NULL;
1244 }
1245 }
1246
1247 alloc_len = 0;
1248 result = NULL;
1249
1250 /* Initialize */
1251 match = NULL;
1252 start_offset = 0;
1253 result_len = 0;
1254 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
1255
1256 while (1) {
1257 #ifdef PCRE_EXTRA_MARK
1258 extra->mark = &mark;
1259 extra->flags |= PCRE_EXTRA_MARK;
1260 #endif
1261 /* Execute the regular expression. */
1262 count = pcre_exec(pce->re, extra, subject, subject_len, start_offset,
1263 exoptions|g_notempty, offsets, size_offsets);
1264
1265 /* the string was already proved to be valid UTF-8 */
1266 exoptions |= PCRE_NO_UTF8_CHECK;
1267
1268 /* Check for too many substrings condition. */
1269 if (UNEXPECTED(count == 0)) {
1270 php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
1271 count = size_offsets / 3;
1272 }
1273
1274 piece = subject + start_offset;
1275
1276 /* if (EXPECTED(count > 0 && (limit == -1 || limit > 0))) */
1277 if (EXPECTED(count > 0 && (offsets[1] - offsets[0] >= 0) && limit)) {
1278 if (UNEXPECTED(replace_count)) {
1279 ++*replace_count;
1280 }
1281
1282 /* Set the match location in subject */
1283 match = subject + offsets[0];
1284
1285 new_len = result_len + offsets[0] - start_offset; /* part before the match */
1286
1287 /* if (!is_callable_replace) */
1288 if (EXPECTED(replace)) {
1289 /* do regular substitution */
1290 walk = replace;
1291 walk_last = 0;
1292
1293 while (walk < replace_end) {
1294 if ('\\' == *walk || '$' == *walk) {
1295 if (walk_last == '\\') {
1296 walk++;
1297 walk_last = 0;
1298 continue;
1299 }
1300 if (preg_get_backref(&walk, &backref)) {
1301 if (backref < count)
1302 new_len += offsets[(backref<<1)+1] - offsets[backref<<1];
1303 continue;
1304 }
1305 }
1306 new_len++;
1307 walk++;
1308 walk_last = walk[-1];
1309 }
1310
1311 if (new_len >= alloc_len) {
1312 alloc_len = zend_safe_address_guarded(2, new_len, alloc_len);
1313 if (result == NULL) {
1314 result = zend_string_alloc(alloc_len, 0);
1315 } else {
1316 result = zend_string_extend(result, alloc_len, 0);
1317 }
1318 }
1319
1320 /* copy the part of the string before the match */
1321 memcpy(&ZSTR_VAL(result)[result_len], piece, match-piece);
1322 result_len += (match-piece);
1323
1324 /* copy replacement and backrefs */
1325 walkbuf = ZSTR_VAL(result) + result_len;
1326
1327 walk = replace;
1328 walk_last = 0;
1329 while (walk < replace_end) {
1330 if ('\\' == *walk || '$' == *walk) {
1331 if (walk_last == '\\') {
1332 *(walkbuf-1) = *walk++;
1333 walk_last = 0;
1334 continue;
1335 }
1336 if (preg_get_backref(&walk, &backref)) {
1337 if (backref < count) {
1338 match_len = offsets[(backref<<1)+1] - offsets[backref<<1];
1339 memcpy(walkbuf, subject + offsets[backref<<1], match_len);
1340 walkbuf += match_len;
1341 }
1342 continue;
1343 }
1344 }
1345 *walkbuf++ = *walk++;
1346 walk_last = walk[-1];
1347 }
1348 *walkbuf = '\0';
1349 /* increment the result length by how much we've added to the string */
1350 result_len += (walkbuf - (ZSTR_VAL(result) + result_len));
1351 } else {
1352 /* Use custom function to get replacement string and its length. */
1353 eval_result = preg_do_repl_func(replace_val, subject, offsets, subpat_names, count, mark);
1354 ZEND_ASSERT(eval_result);
1355 new_len = zend_safe_address_guarded(1, ZSTR_LEN(eval_result), new_len);
1356 if (new_len >= alloc_len) {
1357 alloc_len = zend_safe_address_guarded(2, new_len, alloc_len);
1358 if (result == NULL) {
1359 result = zend_string_alloc(alloc_len, 0);
1360 } else {
1361 result = zend_string_extend(result, alloc_len, 0);
1362 }
1363 }
1364 /* copy the part of the string before the match */
1365 memcpy(ZSTR_VAL(result) + result_len, piece, match-piece);
1366 result_len += (int)(match-piece);
1367
1368 /* copy replacement and backrefs */
1369 walkbuf = ZSTR_VAL(result) + result_len;
1370
1371 /* If using custom function, copy result to the buffer and clean up. */
1372 memcpy(walkbuf, ZSTR_VAL(eval_result), ZSTR_LEN(eval_result));
1373 result_len += (int)ZSTR_LEN(eval_result);
1374 zend_string_release(eval_result);
1375 }
1376
1377 if (EXPECTED(limit)) {
1378 limit--;
1379 }
1380 } else if (count == PCRE_ERROR_NOMATCH || UNEXPECTED(limit == 0)) {
1381 /* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
1382 this is not necessarily the end. We need to advance
1383 the start offset, and continue. Fudge the offset values
1384 to achieve this, unless we're already at the end of the string. */
1385 if (g_notempty != 0 && start_offset < subject_len) {
1386 int unit_len = calculate_unit_length(pce, piece);
1387
1388 offsets[0] = start_offset;
1389 offsets[1] = start_offset + unit_len;
1390 memcpy(ZSTR_VAL(result) + result_len, piece, unit_len);
1391 result_len += unit_len;
1392 } else {
1393 if (!result && subject_str) {
1394 result = zend_string_copy(subject_str);
1395 break;
1396 }
1397 new_len = result_len + subject_len - start_offset;
1398 if (new_len >= alloc_len) {
1399 alloc_len = new_len; /* now we know exactly how long it is */
1400 if (NULL != result) {
1401 result = zend_string_realloc(result, alloc_len, 0);
1402 } else {
1403 result = zend_string_alloc(alloc_len, 0);
1404 }
1405 }
1406 /* stick that last bit of string on our output */
1407 memcpy(ZSTR_VAL(result) + result_len, piece, subject_len - start_offset);
1408 result_len += subject_len - start_offset;
1409 ZSTR_VAL(result)[result_len] = '\0';
1410 ZSTR_LEN(result) = result_len;
1411 break;
1412 }
1413 } else {
1414 pcre_handle_exec_error(count);
1415 if (result) {
1416 zend_string_free(result);
1417 result = NULL;
1418 }
1419 break;
1420 }
1421
1422 /* If we have matched an empty string, mimic what Perl's /g options does.
1423 This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
1424 the match again at the same point. If this fails (picked up above) we
1425 advance to the next character. */
1426 g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;
1427
1428 /* Advance to the next piece. */
1429 start_offset = offsets[1];
1430 }
1431
1432 if (size_offsets <= 32) {
1433 free_alloca(offsets, use_heap);
1434 } else {
1435 efree(offsets);
1436 }
1437 if (UNEXPECTED(subpat_names)) {
1438 efree(subpat_names);
1439 }
1440
1441 return result;
1442 }
1443 /* }}} */
1444
1445 /* {{{ php_replace_in_subject
1446 */
php_replace_in_subject(zval * regex,zval * replace,zval * subject,int limit,int is_callable_replace,int * replace_count)1447 static zend_string *php_replace_in_subject(zval *regex, zval *replace, zval *subject, int limit, int is_callable_replace, int *replace_count)
1448 {
1449 zval *regex_entry,
1450 *replace_value,
1451 empty_replace;
1452 zend_string *result;
1453 uint32_t replace_idx;
1454 zend_string *subject_str = zval_get_string(subject);
1455
1456 /* FIXME: This might need to be changed to ZSTR_EMPTY_ALLOC(). Check if this zval could be dtor()'ed somehow */
1457 ZVAL_EMPTY_STRING(&empty_replace);
1458
1459 if (ZEND_SIZE_T_INT_OVFL(ZSTR_LEN(subject_str))) {
1460 php_error_docref(NULL, E_WARNING, "Subject is too long");
1461 return NULL;
1462 }
1463
1464 /* If regex is an array */
1465 if (Z_TYPE_P(regex) == IS_ARRAY) {
1466 replace_value = replace;
1467 replace_idx = 0;
1468
1469 /* For each entry in the regex array, get the entry */
1470 ZEND_HASH_FOREACH_VAL(Z_ARRVAL_P(regex), regex_entry) {
1471 zval replace_str;
1472 /* Make sure we're dealing with strings. */
1473 zend_string *regex_str = zval_get_string(regex_entry);
1474
1475 ZVAL_UNDEF(&replace_str);
1476 /* If replace is an array and not a callable construct */
1477 if (Z_TYPE_P(replace) == IS_ARRAY && !is_callable_replace) {
1478 /* Get current entry */
1479 while (replace_idx < Z_ARRVAL_P(replace)->nNumUsed) {
1480 if (Z_TYPE(Z_ARRVAL_P(replace)->arData[replace_idx].val) != IS_UNDEF) {
1481 ZVAL_COPY(&replace_str, &Z_ARRVAL_P(replace)->arData[replace_idx].val);
1482 break;
1483 }
1484 replace_idx++;
1485 }
1486 if (!Z_ISUNDEF(replace_str)) {
1487 if (!is_callable_replace) {
1488 convert_to_string(&replace_str);
1489 }
1490 replace_value = &replace_str;
1491 replace_idx++;
1492 } else {
1493 /* We've run out of replacement strings, so use an empty one */
1494 replace_value = &empty_replace;
1495 }
1496 }
1497
1498 /* Do the actual replacement and put the result back into subject_str
1499 for further replacements. */
1500 if ((result = php_pcre_replace(regex_str,
1501 subject_str,
1502 ZSTR_VAL(subject_str),
1503 (int)ZSTR_LEN(subject_str),
1504 replace_value,
1505 is_callable_replace,
1506 limit,
1507 replace_count)) != NULL) {
1508 zend_string_release(subject_str);
1509 subject_str = result;
1510 } else {
1511 zend_string_release(subject_str);
1512 zend_string_release(regex_str);
1513 zval_dtor(&replace_str);
1514 return NULL;
1515 }
1516
1517 zend_string_release(regex_str);
1518 zval_dtor(&replace_str);
1519 } ZEND_HASH_FOREACH_END();
1520
1521 return subject_str;
1522 } else {
1523 result = php_pcre_replace(Z_STR_P(regex),
1524 subject_str,
1525 ZSTR_VAL(subject_str),
1526 (int)ZSTR_LEN(subject_str),
1527 replace,
1528 is_callable_replace,
1529 limit,
1530 replace_count);
1531 zend_string_release(subject_str);
1532 return result;
1533 }
1534 }
1535 /* }}} */
1536
1537 /* {{{ preg_replace_impl
1538 */
preg_replace_impl(zval * return_value,zval * regex,zval * replace,zval * subject,zend_long limit_val,int is_callable_replace,int is_filter)1539 static int preg_replace_impl(zval *return_value, zval *regex, zval *replace, zval *subject, zend_long limit_val, int is_callable_replace, int is_filter)
1540 {
1541 zval *subject_entry;
1542 zend_string *result;
1543 zend_string *string_key;
1544 zend_ulong num_key;
1545 int replace_count = 0, old_replace_count;
1546
1547 if (Z_TYPE_P(replace) != IS_ARRAY && (Z_TYPE_P(replace) != IS_OBJECT || !is_callable_replace)) {
1548 convert_to_string_ex(replace);
1549 }
1550
1551 if (Z_TYPE_P(regex) != IS_ARRAY) {
1552 convert_to_string_ex(regex);
1553 }
1554
1555 /* if subject is an array */
1556 if (Z_TYPE_P(subject) == IS_ARRAY) {
1557 array_init_size(return_value, zend_hash_num_elements(Z_ARRVAL_P(subject)));
1558
1559 /* For each subject entry, convert it to string, then perform replacement
1560 and add the result to the return_value array. */
1561 ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(subject), num_key, string_key, subject_entry) {
1562 old_replace_count = replace_count;
1563 if ((result = php_replace_in_subject(regex, replace, subject_entry, limit_val, is_callable_replace, &replace_count)) != NULL) {
1564 if (!is_filter || replace_count > old_replace_count) {
1565 /* Add to return array */
1566 zval zv;
1567
1568 ZVAL_STR(&zv, result);
1569 if (string_key) {
1570 zend_hash_add_new(Z_ARRVAL_P(return_value), string_key, &zv);
1571 } else {
1572 zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, &zv);
1573 }
1574 } else {
1575 zend_string_release(result);
1576 }
1577 }
1578 } ZEND_HASH_FOREACH_END();
1579 } else {
1580 /* if subject is not an array */
1581 old_replace_count = replace_count;
1582 if ((result = php_replace_in_subject(regex, replace, subject, limit_val, is_callable_replace, &replace_count)) != NULL) {
1583 if (!is_filter || replace_count > old_replace_count) {
1584 RETVAL_STR(result);
1585 } else {
1586 zend_string_release(result);
1587 RETVAL_NULL();
1588 }
1589 } else {
1590 RETVAL_NULL();
1591 }
1592 }
1593
1594 return replace_count;
1595 }
1596 /* }}} */
1597
1598 /* {{{ proto mixed preg_replace(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1599 Perform Perl-style regular expression replacement. */
PHP_FUNCTION(preg_replace)1600 static PHP_FUNCTION(preg_replace)
1601 {
1602 zval *regex, *replace, *subject, *zcount = NULL;
1603 zend_long limit = -1;
1604 int replace_count;
1605
1606 /* Get function parameters and do error-checking. */
1607 ZEND_PARSE_PARAMETERS_START(3, 5)
1608 Z_PARAM_ZVAL(regex)
1609 Z_PARAM_ZVAL(replace)
1610 Z_PARAM_ZVAL(subject)
1611 Z_PARAM_OPTIONAL
1612 Z_PARAM_LONG(limit)
1613 Z_PARAM_ZVAL_EX(zcount, 0, 1)
1614 ZEND_PARSE_PARAMETERS_END();
1615
1616 if (Z_TYPE_P(replace) == IS_ARRAY && Z_TYPE_P(regex) != IS_ARRAY) {
1617 php_error_docref(NULL, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1618 RETURN_FALSE;
1619 }
1620
1621 replace_count = preg_replace_impl(return_value, regex, replace, subject, limit, 0, 0);
1622 if (zcount) {
1623 zval_ptr_dtor(zcount);
1624 ZVAL_LONG(zcount, replace_count);
1625 }
1626 }
1627 /* }}} */
1628
1629 /* {{{ proto mixed preg_replace_callback(mixed regex, mixed callback, mixed subject [, int limit [, int &count]])
1630 Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback)1631 static PHP_FUNCTION(preg_replace_callback)
1632 {
1633 zval *regex, *replace, *subject, *zcount = NULL;
1634 zend_long limit = -1;
1635 zend_string *callback_name;
1636 int replace_count;
1637
1638 /* Get function parameters and do error-checking. */
1639 ZEND_PARSE_PARAMETERS_START(3, 5)
1640 Z_PARAM_ZVAL(regex)
1641 Z_PARAM_ZVAL(replace)
1642 Z_PARAM_ZVAL(subject)
1643 Z_PARAM_OPTIONAL
1644 Z_PARAM_LONG(limit)
1645 Z_PARAM_ZVAL_EX(zcount, 0, 1)
1646 ZEND_PARSE_PARAMETERS_END();
1647
1648 if (!zend_is_callable(replace, 0, &callback_name)) {
1649 php_error_docref(NULL, E_WARNING, "Requires argument 2, '%s', to be a valid callback", ZSTR_VAL(callback_name));
1650 zend_string_release(callback_name);
1651 ZVAL_STR(return_value, zval_get_string(subject));
1652 return;
1653 }
1654 zend_string_release(callback_name);
1655
1656 replace_count = preg_replace_impl(return_value, regex, replace, subject, limit, 1, 0);
1657 if (zcount) {
1658 zval_ptr_dtor(zcount);
1659 ZVAL_LONG(zcount, replace_count);
1660 }
1661 }
1662 /* }}} */
1663
1664 /* {{{ proto mixed preg_replace_callback_array(array pattern, mixed subject [, int limit [, int &count]])
1665 Perform Perl-style regular expression replacement using replacement callback. */
PHP_FUNCTION(preg_replace_callback_array)1666 static PHP_FUNCTION(preg_replace_callback_array)
1667 {
1668 zval regex, zv, *replace, *subject, *pattern, *zcount = NULL;
1669 zend_long limit = -1;
1670 zend_string *str_idx;
1671 zend_string *callback_name;
1672 int replace_count = 0;
1673
1674 /* Get function parameters and do error-checking. */
1675 ZEND_PARSE_PARAMETERS_START(2, 4)
1676 Z_PARAM_ARRAY(pattern)
1677 Z_PARAM_ZVAL(subject)
1678 Z_PARAM_OPTIONAL
1679 Z_PARAM_LONG(limit)
1680 Z_PARAM_ZVAL_EX(zcount, 0, 1)
1681 ZEND_PARSE_PARAMETERS_END();
1682
1683 ZEND_HASH_FOREACH_STR_KEY_VAL(Z_ARRVAL_P(pattern), str_idx, replace) {
1684 if (str_idx) {
1685 ZVAL_STR_COPY(®ex, str_idx);
1686 } else {
1687 php_error_docref(NULL, E_WARNING, "Delimiter must not be alphanumeric or backslash");
1688 zval_ptr_dtor(return_value);
1689 RETURN_NULL();
1690 }
1691
1692 if (!zend_is_callable(replace, 0, &callback_name)) {
1693 php_error_docref(NULL, E_WARNING, "'%s' is not a valid callback", ZSTR_VAL(callback_name));
1694 zend_string_release(callback_name);
1695 zval_ptr_dtor(®ex);
1696 zval_ptr_dtor(return_value);
1697 ZVAL_COPY(return_value, subject);
1698 return;
1699 }
1700 zend_string_release(callback_name);
1701
1702 if (Z_ISNULL_P(return_value)) {
1703 replace_count += preg_replace_impl(&zv, ®ex, replace, subject, limit, 1, 0);
1704 } else {
1705 replace_count += preg_replace_impl(&zv, ®ex, replace, return_value, limit, 1, 0);
1706 zval_ptr_dtor(return_value);
1707 }
1708
1709 zval_ptr_dtor(®ex);
1710
1711 ZVAL_COPY_VALUE(return_value, &zv);
1712
1713 if (UNEXPECTED(EG(exception))) {
1714 zval_ptr_dtor(return_value);
1715 RETURN_NULL();
1716 }
1717 } ZEND_HASH_FOREACH_END();
1718
1719 if (zcount) {
1720 zval_ptr_dtor(zcount);
1721 ZVAL_LONG(zcount, replace_count);
1722 }
1723 }
1724 /* }}} */
1725
1726 /* {{{ proto mixed preg_filter(mixed regex, mixed replace, mixed subject [, int limit [, int &count]])
1727 Perform Perl-style regular expression replacement and only return matches. */
PHP_FUNCTION(preg_filter)1728 static PHP_FUNCTION(preg_filter)
1729 {
1730 zval *regex, *replace, *subject, *zcount = NULL;
1731 zend_long limit = -1;
1732 int replace_count;
1733
1734 /* Get function parameters and do error-checking. */
1735 ZEND_PARSE_PARAMETERS_START(3, 5)
1736 Z_PARAM_ZVAL(regex)
1737 Z_PARAM_ZVAL(replace)
1738 Z_PARAM_ZVAL(subject)
1739 Z_PARAM_OPTIONAL
1740 Z_PARAM_LONG(limit)
1741 Z_PARAM_ZVAL_EX(zcount, 0, 1)
1742 ZEND_PARSE_PARAMETERS_END();
1743
1744 if (Z_TYPE_P(replace) == IS_ARRAY && Z_TYPE_P(regex) != IS_ARRAY) {
1745 php_error_docref(NULL, E_WARNING, "Parameter mismatch, pattern is a string while replacement is an array");
1746 RETURN_FALSE;
1747 }
1748
1749 replace_count = preg_replace_impl(return_value, regex, replace, subject, limit, 0, 1);
1750 if (zcount) {
1751 zval_ptr_dtor(zcount);
1752 ZVAL_LONG(zcount, replace_count);
1753 }
1754 }
1755 /* }}} */
1756
1757 /* {{{ proto array preg_split(string pattern, string subject [, int limit [, int flags]])
1758 Split string into an array using a perl-style regular expression as a delimiter */
PHP_FUNCTION(preg_split)1759 static PHP_FUNCTION(preg_split)
1760 {
1761 zend_string *regex; /* Regular expression */
1762 zend_string *subject; /* String to match against */
1763 zend_long limit_val = -1;/* Integer value of limit */
1764 zend_long flags = 0; /* Match control flags */
1765 pcre_cache_entry *pce; /* Compiled regular expression */
1766
1767 /* Get function parameters and do error checking */
1768 ZEND_PARSE_PARAMETERS_START(2, 4)
1769 Z_PARAM_STR(regex)
1770 Z_PARAM_STR(subject)
1771 Z_PARAM_OPTIONAL
1772 Z_PARAM_LONG(limit_val)
1773 Z_PARAM_LONG(flags)
1774 ZEND_PARSE_PARAMETERS_END_EX(RETURN_FALSE);
1775
1776 if (ZEND_SIZE_T_INT_OVFL(ZSTR_LEN(subject))) {
1777 php_error_docref(NULL, E_WARNING, "Subject is too long");
1778 RETURN_FALSE;
1779 }
1780
1781 /* Compile regex or get it from cache. */
1782 if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
1783 RETURN_FALSE;
1784 }
1785
1786 pce->refcount++;
1787 php_pcre_split_impl(pce, ZSTR_VAL(subject), (int)ZSTR_LEN(subject), return_value, (int)limit_val, flags);
1788 pce->refcount--;
1789 }
1790 /* }}} */
1791
/* {{{ php_pcre_split_impl
 *
 * Split `subject` (of length `subject_len`) at every match of the compiled
 * pattern `pce` and store the pieces in `return_value`, which is initialised
 * as an array here.
 *
 *  - limit_val: 0 is treated like -1 (no limit).  Matching continues while
 *    the limit is -1 or greater than 1; the remainder of the subject is then
 *    appended as the final piece.
 *  - flags: PREG_SPLIT_NO_EMPTY skips empty pieces, PREG_SPLIT_DELIM_CAPTURE
 *    also adds the captured groups of each delimiter match, and
 *    PREG_SPLIT_OFFSET_CAPTURE stores pieces through add_offset_pair()
 *    together with their offset instead of as plain strings.
 *
 * On a pcre_exec() error pcre_handle_exec_error() is called and matching
 * stops; whatever was collected so far, plus the remainder, is still returned.
 */
PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, char *subject, int subject_len, zval *return_value,
	zend_long limit_val, zend_long flags)
{
	pcre_extra		*extra = pce->extra;/* Holds results of studying */
	pcre_extra		 extra_data;		/* Used locally for exec options */
	int				*offsets;			/* Array of subpattern offsets */
	int				 size_offsets;		/* Size of the offsets array */
	int				 exoptions = 0;		/* Execution options */
	int				 count = 0;			/* Count of matched subpatterns */
	int				 start_offset;		/* Where the new search starts */
	int				 next_offset;		/* End of the last delimiter match + 1 */
	int				 g_notempty = 0;	/* If the match should not be empty */
	char			*last_match;		/* Location of last match */
	int				 no_empty;			/* If NO_EMPTY flag is set */
	int				 delim_capture; 	/* If delimiters should be captured */
	int				 offset_capture;	/* If offsets should be captured */
	zval			 tmp;
	ALLOCA_FLAG(use_heap);

	no_empty = flags & PREG_SPLIT_NO_EMPTY;
	delim_capture = flags & PREG_SPLIT_DELIM_CAPTURE;
	offset_capture = flags & PREG_SPLIT_OFFSET_CAPTURE;

	/* A limit of 0 means "no limit", same as -1 */
	if (limit_val == 0) {
		limit_val = -1;
	}

	if (extra == NULL) {
		extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
		extra = &extra_data;
	}
	extra->match_limit = (unsigned long)PCRE_G(backtrack_limit);
	extra->match_limit_recursion = (unsigned long)PCRE_G(recursion_limit);
#ifdef PCRE_EXTRA_MARK
	/* Splitting never reports MARK names, so make sure the flag is off */
	extra->flags &= ~PCRE_EXTRA_MARK;
#endif

	/* Initialize return value */
	array_init(return_value);

	/* Calculate the size of the offsets array, and allocate memory for it. */
	size_offsets = (pce->capture_count + 1) * 3;
	if (size_offsets <= 32) {
		offsets = (int *)do_alloca(size_offsets * sizeof(int), use_heap);
	} else {
		offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
	}

	/* Start at the beginning of the string */
	start_offset = 0;
	next_offset = 0;
	last_match = subject;
	PCRE_G(error_code) = PHP_PCRE_NO_ERROR;

	/* Get next piece if no limit or limit not yet reached and something matched*/
	while ((limit_val == -1 || limit_val > 1)) {
		count = pcre_exec(pce->re, extra, subject,
						  subject_len, start_offset,
						  exoptions|g_notempty, offsets, size_offsets);

		/* the string was already proved to be valid UTF-8 */
		exoptions |= PCRE_NO_UTF8_CHECK;

		/* Check for too many substrings condition. */
		if (count == 0) {
			php_error_docref(NULL,E_NOTICE, "Matched, but too many substrings");
			count = size_offsets/3;
		}

		/* If something matched */
		if (count > 0 && (offsets[1] - offsets[0] >= 0)) {
			/* The piece runs from the end of the previous delimiter to the
			   start of this one; with NO_EMPTY an empty piece is skipped. */
			if (!no_empty || &subject[offsets[0]] != last_match) {

				if (offset_capture) {
					/* Add (match, offset) pair to the return value */
					add_offset_pair(return_value, last_match, (int)(&subject[offsets[0]]-last_match), next_offset, NULL);
				} else {
					/* Add the piece to the return value */
					ZVAL_STRINGL(&tmp, last_match, &subject[offsets[0]]-last_match);
					zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
				}

				/* One less left to do */
				if (limit_val != -1)
					limit_val--;
			}

			/* The next piece starts right after this delimiter */
			last_match = &subject[offsets[1]];
			next_offset = offsets[1];

			if (delim_capture) {
				int i, match_len;
				/* Group 0 is the whole delimiter, so start at group 1 */
				for (i = 1; i < count; i++) {
					match_len = offsets[(i<<1)+1] - offsets[i<<1];
					/* If we have matched a delimiter */
					if (!no_empty || match_len > 0) {
						if (offset_capture) {
							add_offset_pair(return_value, &subject[offsets[i<<1]], match_len, offsets[i<<1], NULL);
						} else {
							ZVAL_STRINGL(&tmp, &subject[offsets[i<<1]], match_len);
							zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
						}
					}
				}
			}
		} else if (count == PCRE_ERROR_NOMATCH) {
			/* If we previously set PCRE_NOTEMPTY_ATSTART after a null match,
			   this is not necessarily the end. We need to advance
			   the start offset, and continue. Fudge the offset values
			   to achieve this, unless we're already at the end of the string. */
			if (g_notempty != 0 && start_offset < subject_len) {
				offsets[0] = start_offset;
				offsets[1] = start_offset + calculate_unit_length(pce, subject + start_offset);
			} else {
				break;
			}
		} else {
			pcre_handle_exec_error(count);
			break;
		}

		/* If we have matched an empty string, mimic what Perl's /g options does.
		   This turns out to be rather cunning. First we set PCRE_NOTEMPTY_ATSTART and try
		   the match again at the same point. If this fails (picked up above) we
		   advance to the next character. */
		g_notempty = (offsets[1] == offsets[0])? PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED : 0;

		/* Advance to the position right after the last full match */
		start_offset = offsets[1];
	}


	start_offset = (int)(last_match - subject); /* the offset might have been incremented, but without further successful matches */

	/* Append whatever follows the last delimiter, unless it is empty and
	   NO_EMPTY was requested */
	if (!no_empty || start_offset < subject_len)
	{
		if (offset_capture) {
			/* Add the last (match, offset) pair to the return value */
			add_offset_pair(return_value, &subject[start_offset], subject_len - start_offset, start_offset, NULL);
		} else {
			/* Add the last piece to the return value */
			ZVAL_STRINGL(&tmp, last_match, subject + subject_len - last_match);
			zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
		}
	}


	/* Clean up */
	if (size_offsets <= 32) {
		free_alloca(offsets, use_heap);
	} else {
		efree(offsets);
	}
}
/* }}} */
1949
1950 /* {{{ proto string preg_quote(string str [, string delim_char])
1951 Quote regular expression characters plus an optional character */
PHP_FUNCTION(preg_quote)1952 static PHP_FUNCTION(preg_quote)
1953 {
1954 size_t in_str_len;
1955 char *in_str; /* Input string argument */
1956 char *in_str_end; /* End of the input string */
1957 size_t delim_len = 0;
1958 char *delim = NULL; /* Additional delimiter argument */
1959 zend_string *out_str; /* Output string with quoted characters */
1960 char *p, /* Iterator for input string */
1961 *q, /* Iterator for output string */
1962 delim_char=0, /* Delimiter character to be quoted */
1963 c; /* Current character */
1964 zend_bool quote_delim = 0; /* Whether to quote additional delim char */
1965
1966 /* Get the arguments and check for errors */
1967 ZEND_PARSE_PARAMETERS_START(1, 2)
1968 Z_PARAM_STRING(in_str, in_str_len)
1969 Z_PARAM_OPTIONAL
1970 Z_PARAM_STRING(delim, delim_len)
1971 ZEND_PARSE_PARAMETERS_END();
1972
1973 in_str_end = in_str + in_str_len;
1974
1975 /* Nothing to do if we got an empty string */
1976 if (in_str == in_str_end) {
1977 RETURN_EMPTY_STRING();
1978 }
1979
1980 if (delim && *delim) {
1981 delim_char = delim[0];
1982 quote_delim = 1;
1983 }
1984
1985 /* Allocate enough memory so that even if each character
1986 is quoted, we won't run out of room */
1987 out_str = zend_string_safe_alloc(4, in_str_len, 0, 0);
1988
1989 /* Go through the string and quote necessary characters */
1990 for (p = in_str, q = ZSTR_VAL(out_str); p != in_str_end; p++) {
1991 c = *p;
1992 switch(c) {
1993 case '.':
1994 case '\\':
1995 case '+':
1996 case '*':
1997 case '?':
1998 case '[':
1999 case '^':
2000 case ']':
2001 case '$':
2002 case '(':
2003 case ')':
2004 case '{':
2005 case '}':
2006 case '=':
2007 case '!':
2008 case '>':
2009 case '<':
2010 case '|':
2011 case ':':
2012 case '-':
2013 *q++ = '\\';
2014 *q++ = c;
2015 break;
2016
2017 case '\0':
2018 *q++ = '\\';
2019 *q++ = '0';
2020 *q++ = '0';
2021 *q++ = '0';
2022 break;
2023
2024 default:
2025 if (quote_delim && c == delim_char)
2026 *q++ = '\\';
2027 *q++ = c;
2028 break;
2029 }
2030 }
2031 *q = '\0';
2032
2033 /* Reallocate string and return it */
2034 out_str = zend_string_truncate(out_str, q - ZSTR_VAL(out_str), 0);
2035 RETURN_NEW_STR(out_str);
2036 }
2037 /* }}} */
2038
2039 /* {{{ proto array preg_grep(string regex, array input [, int flags])
2040 Searches array and returns entries which match regex */
PHP_FUNCTION(preg_grep)2041 static PHP_FUNCTION(preg_grep)
2042 {
2043 zend_string *regex; /* Regular expression */
2044 zval *input; /* Input array */
2045 zend_long flags = 0; /* Match control flags */
2046 pcre_cache_entry *pce; /* Compiled regular expression */
2047
2048 /* Get arguments and do error checking */
2049 ZEND_PARSE_PARAMETERS_START(2, 3)
2050 Z_PARAM_STR(regex)
2051 Z_PARAM_ARRAY(input)
2052 Z_PARAM_OPTIONAL
2053 Z_PARAM_LONG(flags)
2054 ZEND_PARSE_PARAMETERS_END();
2055
2056 /* Compile regex or get it from cache. */
2057 if ((pce = pcre_get_compiled_regex_cache(regex)) == NULL) {
2058 RETURN_FALSE;
2059 }
2060
2061 pce->refcount++;
2062 php_pcre_grep_impl(pce, input, return_value, flags);
2063 pce->refcount--;
2064 }
2065 /* }}} */
2066
php_pcre_grep_impl(pcre_cache_entry * pce,zval * input,zval * return_value,zend_long flags)2067 PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return_value, zend_long flags) /* {{{ */
2068 {
2069 zval *entry; /* An entry in the input array */
2070 pcre_extra *extra = pce->extra;/* Holds results of studying */
2071 pcre_extra extra_data; /* Used locally for exec options */
2072 int *offsets; /* Array of subpattern offsets */
2073 int size_offsets; /* Size of the offsets array */
2074 int count = 0; /* Count of matched subpatterns */
2075 zend_string *string_key;
2076 zend_ulong num_key;
2077 zend_bool invert; /* Whether to return non-matching
2078 entries */
2079 ALLOCA_FLAG(use_heap);
2080
2081 invert = flags & PREG_GREP_INVERT ? 1 : 0;
2082
2083 if (extra == NULL) {
2084 extra_data.flags = PCRE_EXTRA_MATCH_LIMIT | PCRE_EXTRA_MATCH_LIMIT_RECURSION;
2085 extra = &extra_data;
2086 }
2087 extra->match_limit = (unsigned long)PCRE_G(backtrack_limit);
2088 extra->match_limit_recursion = (unsigned long)PCRE_G(recursion_limit);
2089 #ifdef PCRE_EXTRA_MARK
2090 extra->flags &= ~PCRE_EXTRA_MARK;
2091 #endif
2092
2093 /* Calculate the size of the offsets array, and allocate memory for it. */
2094 size_offsets = (pce->capture_count + 1) * 3;
2095 if (size_offsets <= 32) {
2096 offsets = (int *)do_alloca(size_offsets * sizeof(int), use_heap);
2097 } else {
2098 offsets = (int *)safe_emalloc(size_offsets, sizeof(int), 0);
2099 }
2100
2101 /* Initialize return array */
2102 array_init(return_value);
2103
2104 PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
2105
2106 /* Go through the input array */
2107 ZEND_HASH_FOREACH_KEY_VAL(Z_ARRVAL_P(input), num_key, string_key, entry) {
2108 zend_string *subject_str = zval_get_string(entry);
2109
2110 /* Perform the match */
2111 count = pcre_exec(pce->re, extra, ZSTR_VAL(subject_str),
2112 (int)ZSTR_LEN(subject_str), 0,
2113 0, offsets, size_offsets);
2114
2115 /* Check for too many substrings condition. */
2116 if (count == 0) {
2117 php_error_docref(NULL, E_NOTICE, "Matched, but too many substrings");
2118 count = size_offsets/3;
2119 } else if (count < 0 && count != PCRE_ERROR_NOMATCH) {
2120 pcre_handle_exec_error(count);
2121 zend_string_release(subject_str);
2122 break;
2123 }
2124
2125 /* If the entry fits our requirements */
2126 if ((count > 0 && !invert) || (count == PCRE_ERROR_NOMATCH && invert)) {
2127 if (Z_REFCOUNTED_P(entry)) {
2128 Z_ADDREF_P(entry);
2129 }
2130
2131 /* Add to return array */
2132 if (string_key) {
2133 zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
2134 } else {
2135 zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
2136 }
2137 }
2138
2139 zend_string_release(subject_str);
2140 } ZEND_HASH_FOREACH_END();
2141
2142 /* Clean up */
2143 if (size_offsets <= 32) {
2144 free_alloca(offsets, use_heap);
2145 } else {
2146 efree(offsets);
2147 }
2148 }
2149 /* }}} */
2150
2151 /* {{{ proto int preg_last_error()
2152 Returns the error code of the last regexp execution. */
PHP_FUNCTION(preg_last_error)2153 static PHP_FUNCTION(preg_last_error)
2154 {
2155 ZEND_PARSE_PARAMETERS_START(0, 0)
2156 ZEND_PARSE_PARAMETERS_END();
2157
2158 RETURN_LONG(PCRE_G(error_code));
2159 }
2160 /* }}} */
2161
2162 /* {{{ module definition structures */
2163
2164 /* {{{ arginfo */
2165 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match, 0, 0, 2)
2166 ZEND_ARG_INFO(0, pattern)
2167 ZEND_ARG_INFO(0, subject)
2168 ZEND_ARG_INFO(1, subpatterns) /* array */
2169 ZEND_ARG_INFO(0, flags)
2170 ZEND_ARG_INFO(0, offset)
2171 ZEND_END_ARG_INFO()
2172
2173 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_match_all, 0, 0, 2)
2174 ZEND_ARG_INFO(0, pattern)
2175 ZEND_ARG_INFO(0, subject)
2176 ZEND_ARG_INFO(1, subpatterns) /* array */
2177 ZEND_ARG_INFO(0, flags)
2178 ZEND_ARG_INFO(0, offset)
2179 ZEND_END_ARG_INFO()
2180
2181 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace, 0, 0, 3)
2182 ZEND_ARG_INFO(0, regex)
2183 ZEND_ARG_INFO(0, replace)
2184 ZEND_ARG_INFO(0, subject)
2185 ZEND_ARG_INFO(0, limit)
2186 ZEND_ARG_INFO(1, count)
2187 ZEND_END_ARG_INFO()
2188
2189 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback, 0, 0, 3)
2190 ZEND_ARG_INFO(0, regex)
2191 ZEND_ARG_INFO(0, callback)
2192 ZEND_ARG_INFO(0, subject)
2193 ZEND_ARG_INFO(0, limit)
2194 ZEND_ARG_INFO(1, count)
2195 ZEND_END_ARG_INFO()
2196
2197 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_replace_callback_array, 0, 0, 2)
2198 ZEND_ARG_INFO(0, pattern)
2199 ZEND_ARG_INFO(0, subject)
2200 ZEND_ARG_INFO(0, limit)
2201 ZEND_ARG_INFO(1, count)
2202 ZEND_END_ARG_INFO()
2203
2204 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_split, 0, 0, 2)
2205 ZEND_ARG_INFO(0, pattern)
2206 ZEND_ARG_INFO(0, subject)
2207 ZEND_ARG_INFO(0, limit)
2208 ZEND_ARG_INFO(0, flags)
2209 ZEND_END_ARG_INFO()
2210
2211 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_quote, 0, 0, 1)
2212 ZEND_ARG_INFO(0, str)
2213 ZEND_ARG_INFO(0, delim_char)
2214 ZEND_END_ARG_INFO()
2215
2216 ZEND_BEGIN_ARG_INFO_EX(arginfo_preg_grep, 0, 0, 2)
2217 ZEND_ARG_INFO(0, regex)
2218 ZEND_ARG_INFO(0, input) /* array */
2219 ZEND_ARG_INFO(0, flags)
2220 ZEND_END_ARG_INFO()
2221
2222 ZEND_BEGIN_ARG_INFO(arginfo_preg_last_error, 0)
2223 ZEND_END_ARG_INFO()
2224 /* }}} */
2225
2226 static const zend_function_entry pcre_functions[] = {
2227 PHP_FE(preg_match, arginfo_preg_match)
2228 PHP_FE(preg_match_all, arginfo_preg_match_all)
2229 PHP_FE(preg_replace, arginfo_preg_replace)
2230 PHP_FE(preg_replace_callback, arginfo_preg_replace_callback)
2231 PHP_FE(preg_replace_callback_array, arginfo_preg_replace_callback_array)
2232 PHP_FE(preg_filter, arginfo_preg_replace)
2233 PHP_FE(preg_split, arginfo_preg_split)
2234 PHP_FE(preg_quote, arginfo_preg_quote)
2235 PHP_FE(preg_grep, arginfo_preg_grep)
2236 PHP_FE(preg_last_error, arginfo_preg_last_error)
2237 PHP_FE_END
2238 };
2239
2240 zend_module_entry pcre_module_entry = {
2241 STANDARD_MODULE_HEADER,
2242 "pcre",
2243 pcre_functions,
2244 PHP_MINIT(pcre),
2245 PHP_MSHUTDOWN(pcre),
2246 #ifdef HAVE_PCRE_JIT_SUPPORT
2247 PHP_RINIT(pcre),
2248 #else
2249 NULL,
2250 #endif
2251 NULL,
2252 PHP_MINFO(pcre),
2253 PHP_PCRE_VERSION,
2254 PHP_MODULE_GLOBALS(pcre),
2255 PHP_GINIT(pcre),
2256 PHP_GSHUTDOWN(pcre),
2257 NULL,
2258 STANDARD_MODULE_PROPERTIES_EX
2259 };
2260
/* Only when COMPILE_DL_PCRE is defined: expand ZEND_GET_MODULE for this
   extension (presumably the dynamically loadable build - confirm against
   the build configuration). */
#ifdef COMPILE_DL_PCRE
ZEND_GET_MODULE(pcre)
#endif
2264
2265 /* }}} */
2266
2267 #endif /* HAVE_PCRE || HAVE_BUNDLED_PCRE */
2268
2269 /*
2270 * Local variables:
2271 * tab-width: 4
2272 * c-basic-offset: 4
2273 * End:
2274 * vim600: sw=4 ts=4 fdm=marker
2275 * vim<600: sw=4 ts=4
2276 */
2277