1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Ed Batutis <ed@batutis.com> |
14 +----------------------------------------------------------------------+
15 */
16
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25
26 #include <unicode/utypes.h>
27 #if U_ICU_VERSION_MAJOR_NUM >= 49
28 #include <unicode/utf8.h>
29 #endif
30 #include <unicode/ucol.h>
31 #include <unicode/ustring.h>
32 #include <unicode/ubrk.h>
33
34 #include "ext/standard/php_string.h"
35
36 /* }}} */
37
/* Extraction modes understood by grapheme_extract(). The numeric values are
 * published as the GRAPHEME_EXTR_* constants and are also used to index the
 * grapheme_extract_iters[] dispatch table, so they must stay 0, 1, 2. */
#define GRAPHEME_EXTRACT_TYPE_COUNT		0
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES	1
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS	2

/* Inclusive bounds used to validate a caller-supplied extract type. */
#define GRAPHEME_EXTRACT_TYPE_MIN		GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX		GRAPHEME_EXTRACT_TYPE_MAXCHARS
43
44
45 /* {{{ grapheme_register_constants
46 * Register API constants
47 */
grapheme_register_constants(INIT_FUNC_ARGS)48 void grapheme_register_constants( INIT_FUNC_ARGS )
49 {
50 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
51 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
52 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
53 }
54 /* }}} */
55
56 /* {{{ proto size_t grapheme_strlen(string str)
57 Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)58 PHP_FUNCTION(grapheme_strlen)
59 {
60 char* string;
61 size_t string_len;
62 UChar* ustring = NULL;
63 int ustring_len = 0;
64 zend_long ret_len;
65 UErrorCode status;
66
67 if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
68 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
69 "grapheme_strlen: unable to parse input param", 0 );
70 RETURN_FALSE;
71 }
72
73 ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
74
75 if ( ret_len >= 0 )
76 RETURN_LONG(string_len);
77
78 /* convert the string to UTF-16. */
79 status = U_ZERO_ERROR;
80 intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
81
82 if ( U_FAILURE( status ) ) {
83 /* Set global error code. */
84 intl_error_set_code( NULL, status );
85
86 /* Set error messages. */
87 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
88 if (ustring) {
89 efree( ustring );
90 }
91 RETURN_NULL();
92 }
93
94 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
95
96 if (ustring) {
97 efree( ustring );
98 }
99
100 if (ret_len >= 0) {
101 RETVAL_LONG(ret_len);
102 } else {
103 RETVAL_FALSE;
104 }
105 }
106 /* }}} */
107
108 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
109 Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)110 PHP_FUNCTION(grapheme_strpos)
111 {
112 char *haystack, *needle;
113 size_t haystack_len, needle_len;
114 const char *found;
115 zend_long loffset = 0;
116 int32_t offset = 0;
117 size_t noffset = 0;
118 zend_long ret_pos;
119
120 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
121 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
122 "grapheme_strpos: unable to parse input param", 0 );
123 RETURN_FALSE;
124 }
125
126 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
128 RETURN_FALSE;
129 }
130
131 /* we checked that it will fit: */
132 offset = (int32_t) loffset;
133 noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
134
135 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
136
137 if (needle_len == 0) {
138 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
139 RETURN_FALSE;
140 }
141
142 if (offset >= 0) {
143 /* quick check to see if the string might be there
144 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
145 */
146 found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
147
148 /* if it isn't there the we are done */
149 if (!found) {
150 RETURN_FALSE;
151 }
152
153 /* if it is there, and if the haystack is ascii, we are all done */
154 if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
155 RETURN_LONG(found - haystack);
156 }
157 }
158
159 /* do utf16 part of the strpos */
160 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
161
162 if ( ret_pos >= 0 ) {
163 RETURN_LONG(ret_pos);
164 } else {
165 RETURN_FALSE;
166 }
167
168 }
169 /* }}} */
170
171 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
172 Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)173 PHP_FUNCTION(grapheme_stripos)
174 {
175 char *haystack, *needle;
176 size_t haystack_len, needle_len;
177 const char *found;
178 zend_long loffset = 0;
179 int32_t offset = 0;
180 zend_long ret_pos;
181 int is_ascii;
182
183 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
184 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
185 "grapheme_stripos: unable to parse input param", 0 );
186 RETURN_FALSE;
187 }
188
189 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
190 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
191 RETURN_FALSE;
192 }
193
194 /* we checked that it will fit: */
195 offset = (int32_t) loffset;
196
197 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
198
199 if (needle_len == 0) {
200 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
201 RETURN_FALSE;
202 }
203
204 is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
205
206 if ( is_ascii ) {
207 char *haystack_dup, *needle_dup;
208 int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
209 needle_dup = estrndup(needle, needle_len);
210 php_strtolower(needle_dup, needle_len);
211 haystack_dup = estrndup(haystack, haystack_len);
212 php_strtolower(haystack_dup, haystack_len);
213
214 found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
215
216 efree(haystack_dup);
217 efree(needle_dup);
218
219 if (found) {
220 RETURN_LONG(found - haystack_dup);
221 }
222
223 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
224 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
225 RETURN_FALSE;
226 }
227 }
228
229 /* do utf16 part of the strpos */
230 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
231
232 if ( ret_pos >= 0 ) {
233 RETURN_LONG(ret_pos);
234 } else {
235 RETURN_FALSE;
236 }
237
238 }
239 /* }}} */
240
241 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
242 Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)243 PHP_FUNCTION(grapheme_strrpos)
244 {
245 char *haystack, *needle;
246 size_t haystack_len, needle_len;
247 zend_long loffset = 0;
248 int32_t offset = 0;
249 zend_long ret_pos;
250 int is_ascii;
251
252 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
253 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
254 "grapheme_strrpos: unable to parse input param", 0 );
255 RETURN_FALSE;
256 }
257
258 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
259 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
260 RETURN_FALSE;
261 }
262
263 /* we checked that it will fit: */
264 offset = (int32_t) loffset;
265
266 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
267
268 if (needle_len == 0) {
269 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
270 RETURN_FALSE;
271 }
272
273 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
274
275 if ( is_ascii ) {
276
277 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
278
279 if ( ret_pos >= 0 ) {
280 RETURN_LONG(ret_pos);
281 }
282
283 /* if the needle was ascii too, we are done */
284
285 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
286 RETURN_FALSE;
287 }
288
289 /* else we need to continue via utf16 */
290 }
291
292 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
293
294 if ( ret_pos >= 0 ) {
295 RETURN_LONG(ret_pos);
296 } else {
297 RETURN_FALSE;
298 }
299
300
301 }
302 /* }}} */
303
304 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
305 Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)306 PHP_FUNCTION(grapheme_strripos)
307 {
308 char *haystack, *needle;
309 size_t haystack_len, needle_len;
310 zend_long loffset = 0;
311 int32_t offset = 0;
312 zend_long ret_pos;
313 int is_ascii;
314
315 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
316 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
317 "grapheme_strrpos: unable to parse input param", 0 );
318 RETURN_FALSE;
319 }
320
321 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
322 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
323 RETURN_FALSE;
324 }
325
326 /* we checked that it will fit: */
327 offset = (int32_t) loffset;
328
329 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
330
331 if (needle_len == 0) {
332 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
333 RETURN_FALSE;
334 }
335
336 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
337
338 if ( is_ascii ) {
339 char *needle_dup, *haystack_dup;
340
341 needle_dup = estrndup(needle, needle_len);
342 php_strtolower(needle_dup, needle_len);
343 haystack_dup = estrndup(haystack, haystack_len);
344 php_strtolower(haystack_dup, haystack_len);
345
346 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
347
348 efree(haystack_dup);
349 efree(needle_dup);
350
351 if ( ret_pos >= 0 ) {
352 RETURN_LONG(ret_pos);
353 }
354
355 /* if the needle was ascii too, we are done */
356
357 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
358 RETURN_FALSE;
359 }
360
361 /* else we need to continue via utf16 */
362 }
363
364 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
365
366 if ( ret_pos >= 0 ) {
367 RETURN_LONG(ret_pos);
368 } else {
369 RETURN_FALSE;
370 }
371
372
373 }
374 /* }}} */
375
376 /* {{{ proto string grapheme_substr(string str, int start [, int length])
377 Returns part of a string */
PHP_FUNCTION(grapheme_substr)378 PHP_FUNCTION(grapheme_substr)
379 {
380 char *str;
381 zend_string *u8_sub_str;
382 UChar *ustr;
383 size_t str_len;
384 int32_t ustr_len;
385 zend_long lstart = 0, length = 0;
386 int32_t start = 0;
387 int iter_val;
388 UErrorCode status;
389 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
390 UBreakIterator* bi = NULL;
391 int sub_str_start_pos, sub_str_end_pos;
392 int32_t (*iter_func)(UBreakIterator *);
393 zend_bool no_length = 1;
394
395 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
396 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
397 "grapheme_substr: unable to parse input param", 0 );
398 RETURN_FALSE;
399 }
400
401 if ( OUTSIDE_STRING(lstart, str_len)) {
402 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
403 RETURN_FALSE;
404 }
405
406 /* we checked that it will fit: */
407 start = (int32_t) lstart;
408
409 if(no_length) {
410 length = str_len;
411 }
412
413 if(length < INT32_MIN) {
414 length = INT32_MIN;
415 } else if(length > INT32_MAX) {
416 length = INT32_MAX;
417 }
418
419 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
420
421 if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
422 int32_t asub_str_len;
423 char *sub_str;
424 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
425
426 if ( NULL == sub_str ) {
427 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
428 RETURN_FALSE;
429 }
430
431 RETURN_STRINGL(sub_str, asub_str_len);
432 }
433
434 ustr = NULL;
435 ustr_len = 0;
436 status = U_ZERO_ERROR;
437 intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
438
439 if ( U_FAILURE( status ) ) {
440 /* Set global error code. */
441 intl_error_set_code( NULL, status );
442
443 /* Set error messages. */
444 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
445 if (ustr) {
446 efree( ustr );
447 }
448 RETURN_FALSE;
449 }
450
451 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
452
453 if( U_FAILURE(status) ) {
454 RETURN_FALSE;
455 }
456
457 ubrk_setText(bi, ustr, ustr_len, &status);
458
459 if ( start < 0 ) {
460 iter_func = ubrk_previous;
461 ubrk_last(bi);
462 iter_val = 1;
463 }
464 else {
465 iter_func = ubrk_next;
466 iter_val = -1;
467 }
468
469 sub_str_start_pos = 0;
470
471 while ( start ) {
472 sub_str_start_pos = iter_func(bi);
473
474 if ( UBRK_DONE == sub_str_start_pos ) {
475 break;
476 }
477
478 start += iter_val;
479 }
480
481 if ( 0 != start || sub_str_start_pos >= ustr_len ) {
482
483 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
484
485 if (ustr) {
486 efree(ustr);
487 }
488 ubrk_close(bi);
489 RETURN_FALSE;
490 }
491
492 /* OK to convert here since if str_len were big, convert above would fail */
493 if (length >= (int32_t)str_len) {
494
495 /* no length supplied or length is too big, return the rest of the string */
496
497 status = U_ZERO_ERROR;
498 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
499
500 if (ustr) {
501 efree( ustr );
502 }
503 ubrk_close( bi );
504
505 if ( !u8_sub_str ) {
506 /* Set global error code. */
507 intl_error_set_code( NULL, status );
508
509 /* Set error messages. */
510 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
511
512 RETURN_FALSE;
513 }
514
515 /* return the allocated string, not a duplicate */
516 RETVAL_NEW_STR(u8_sub_str);
517 return;
518 }
519
520 if(length == 0) {
521 /* empty length - we've validated start, we can return "" now */
522 if (ustr) {
523 efree(ustr);
524 }
525 ubrk_close(bi);
526 RETURN_EMPTY_STRING();
527 }
528
529 /* find the end point of the string to return */
530
531 if ( length < 0 ) {
532 iter_func = ubrk_previous;
533 ubrk_last(bi);
534 iter_val = 1;
535 }
536 else {
537 iter_func = ubrk_next;
538 iter_val = -1;
539 }
540
541 sub_str_end_pos = 0;
542
543 while ( length ) {
544 sub_str_end_pos = iter_func(bi);
545
546 if ( UBRK_DONE == sub_str_end_pos ) {
547 break;
548 }
549
550 length += iter_val;
551 }
552
553 ubrk_close(bi);
554
555 if ( UBRK_DONE == sub_str_end_pos) {
556 if(length < 0) {
557 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
558
559 efree(ustr);
560 RETURN_FALSE;
561 } else {
562 sub_str_end_pos = ustr_len;
563 }
564 }
565
566 if(sub_str_start_pos > sub_str_end_pos) {
567 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
568
569 efree(ustr);
570 RETURN_FALSE;
571 }
572
573 status = U_ZERO_ERROR;
574 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
575
576 efree( ustr );
577
578 if ( !u8_sub_str ) {
579 /* Set global error code. */
580 intl_error_set_code( NULL, status );
581
582 /* Set error messages. */
583 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
584
585 RETURN_FALSE;
586 }
587
588 /* return the allocated string, not a duplicate */
589 RETVAL_NEW_STR(u8_sub_str);
590 }
591 /* }}} */
592
593 /* {{{ strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)594 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
595 {
596 char *haystack, *needle;
597 const char *found;
598 size_t haystack_len, needle_len;
599 int32_t ret_pos, uchar_pos;
600 zend_bool part = 0;
601
602 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
603
604 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
605 "grapheme_strstr: unable to parse input param", 0 );
606
607 RETURN_FALSE;
608 }
609
610 if (needle_len == 0) {
611
612 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
613
614 RETURN_FALSE;
615 }
616
617
618 if ( !f_ignore_case ) {
619
620 /* ASCII optimization: quick check to see if the string might be there
621 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
622 */
623 found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
624
625 /* if it isn't there the we are done */
626 if ( !found ) {
627 RETURN_FALSE;
628 }
629
630 /* if it is there, and if the haystack is ascii, we are all done */
631 if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
632 size_t found_offset = found - haystack;
633
634 if (part) {
635 RETURN_STRINGL(haystack, found_offset);
636 } else {
637 RETURN_STRINGL(found, haystack_len - found_offset);
638 }
639 }
640
641 }
642
643 /* need to work in utf16 */
644 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
645
646 if ( ret_pos < 0 ) {
647 RETURN_FALSE;
648 }
649
650 /* uchar_pos is the 'nth' Unicode character position of the needle */
651
652 ret_pos = 0;
653 U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
654
655 if (part) {
656 RETURN_STRINGL(haystack, ret_pos);
657 } else {
658 RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
659 }
660
661 }
662 /* }}} */
663
664 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
665 Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)666 PHP_FUNCTION(grapheme_strstr)
667 {
668 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
669 }
670 /* }}} */
671
672 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
673 Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)674 PHP_FUNCTION(grapheme_stristr)
675 {
676 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
677 }
678 /* }}} */
679
680 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
681 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)682 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
683 {
684 int pos = 0;
685 int ret_pos = 0;
686 int break_pos, prev_break_pos;
687 int count = 0;
688
689 while ( 1 ) {
690 pos = ubrk_next(bi);
691
692 if ( UBRK_DONE == pos ) {
693 break;
694 }
695
696 for ( break_pos = ret_pos; break_pos < pos; ) {
697 count++;
698 prev_break_pos = break_pos;
699 U8_FWD_1(pstr, break_pos, str_len);
700
701 if ( prev_break_pos == break_pos ) {
702 /* something wrong - malformed utf8? */
703 csize = 0;
704 break;
705 }
706 }
707
708 /* if we are beyond our limit, then the loop is done */
709 if ( count > csize ) {
710 break;
711 }
712
713 ret_pos = break_pos;
714 }
715
716 return ret_pos;
717 }
718 /* }}} */
719
720 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
721 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)722 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
723 {
724 int pos = 0;
725 int ret_pos = 0;
726
727 while ( 1 ) {
728 pos = ubrk_next(bi);
729
730 if ( UBRK_DONE == pos ) {
731 break;
732 }
733
734 if ( pos > bsize ) {
735 break;
736 }
737
738 ret_pos = pos;
739 }
740
741 return ret_pos;
742 }
743 /* }}} */
744
745 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
746 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)747 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
748 {
749 int next_pos = 0;
750 int ret_pos = 0;
751
752 while ( size ) {
753 next_pos = ubrk_next(bi);
754
755 if ( UBRK_DONE == next_pos ) {
756 break;
757 }
758 ret_pos = next_pos;
759 size--;
760 }
761
762 return ret_pos;
763 }
764 /* }}} */
765
766 /* {{{ grapheme extract iter function pointer array */
767 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
768
769 static grapheme_extract_iter grapheme_extract_iters[] = {
770 &grapheme_extract_count_iter,
771 &grapheme_extract_bytecount_iter,
772 &grapheme_extract_charcount_iter,
773 };
774 /* }}} */
775
776 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
777 Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)778 PHP_FUNCTION(grapheme_extract)
779 {
780 char *str, *pstr;
781 UText ut = UTEXT_INITIALIZER;
782 size_t str_len;
783 zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
784 zend_long lstart = 0; /* starting position in str in bytes */
785 int32_t start = 0;
786 zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
787 UErrorCode status;
788 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
789 UBreakIterator* bi = NULL;
790 int ret_pos;
791 zval *next = NULL; /* return offset of next part of the string */
792
793 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
794 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
795 "grapheme_extract: unable to parse input param", 0 );
796 RETURN_FALSE;
797 }
798
799 if (lstart < 0) {
800 lstart += str_len;
801 }
802
803 if ( NULL != next ) {
804 if ( !Z_ISREF_P(next) ) {
805 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
806 "grapheme_extract: 'next' was not passed by reference", 0 );
807 RETURN_FALSE;
808 } else {
809 ZVAL_DEREF(next);
810 /* initialize next */
811 zval_ptr_dtor(next);
812 ZVAL_LONG(next, lstart);
813 }
814 }
815
816 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
817 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
818 "grapheme_extract: unknown extract type param", 0 );
819 RETURN_FALSE;
820 }
821
822 if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
823 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
824 RETURN_FALSE;
825 }
826
827 if ( size > INT32_MAX || size < 0) {
828 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
829 RETURN_FALSE;
830 }
831 if (size == 0) {
832 RETURN_EMPTY_STRING();
833 }
834
835 /* we checked that it will fit: */
836 start = (int32_t) lstart;
837
838 pstr = str + start;
839
840 /* just in case pstr points in the middle of a character, move forward to the start of the next char */
841 if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
842 char *str_end = str + str_len;
843
844 while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
845 pstr++;
846 if ( pstr >= str_end ) {
847 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
848 "grapheme_extract: invalid input string", 0 );
849
850 RETURN_FALSE;
851 }
852 }
853 }
854
855 str_len -= (pstr - str);
856
857 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
858 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
859 */
860
861 if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
862 size_t nsize = MIN(size, str_len);
863 if ( NULL != next ) {
864 ZVAL_LONG(next, start+nsize);
865 }
866 RETURN_STRINGL(pstr, nsize);
867 }
868
869 status = U_ZERO_ERROR;
870 utext_openUTF8(&ut, pstr, str_len, &status);
871
872 if ( U_FAILURE( status ) ) {
873 /* Set global error code. */
874 intl_error_set_code( NULL, status );
875
876 /* Set error messages. */
877 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
878
879 RETURN_FALSE;
880 }
881
882 bi = NULL;
883 status = U_ZERO_ERROR;
884 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
885
886 ubrk_setUText(bi, &ut, &status);
887 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
888 can't back up. So, we will not do anything. */
889
890 /* now we need to find the end of the chunk the user wants us to return */
891 /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
892 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
893
894 utext_close(&ut);
895 ubrk_close(bi);
896
897 if ( NULL != next ) {
898 ZVAL_LONG(next, start+ret_pos);
899 }
900
901 RETURN_STRINGL(((char *)pstr), ret_pos);
902 }
903
904 /* }}} */
905
906 /*
907 * Local variables:
908 * tab-width: 4
909 * c-basic-offset: 4
910 * End:
911 * vim600: fdm=marker
912 * vim: noet sw=4 ts=4
913 */
914