1 /*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | https://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Author: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29
30 /* }}} */
31
32 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)33 PHP_FUNCTION(grapheme_strlen)
34 {
35 char* string;
36 size_t string_len;
37 UChar* ustring = NULL;
38 int ustring_len = 0;
39 zend_long ret_len;
40 UErrorCode status;
41
42 if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
43 RETURN_THROWS();
44 }
45
46 ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
47
48 if ( ret_len >= 0 )
49 RETURN_LONG(string_len);
50
51 /* convert the string to UTF-16. */
52 status = U_ZERO_ERROR;
53 intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
54
55 if ( U_FAILURE( status ) ) {
56 /* Set global error code. */
57 intl_error_set_code( NULL, status );
58
59 /* Set error messages. */
60 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
61 if (ustring) {
62 efree( ustring );
63 }
64 RETURN_NULL();
65 }
66
67 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
68
69 if (ustring) {
70 efree( ustring );
71 }
72
73 if (ret_len >= 0) {
74 RETVAL_LONG(ret_len);
75 } else {
76 RETVAL_FALSE;
77 }
78 }
79 /* }}} */
80
81 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)82 PHP_FUNCTION(grapheme_strpos)
83 {
84 char *haystack, *needle;
85 size_t haystack_len, needle_len;
86 const char *found;
87 zend_long loffset = 0;
88 int32_t offset = 0;
89 size_t noffset = 0;
90 zend_long ret_pos;
91
92 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
93 RETURN_THROWS();
94 }
95
96 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
97 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
98 RETURN_THROWS();
99 }
100
101 /* we checked that it will fit: */
102 offset = (int32_t) loffset;
103 noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
104
105 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
106
107 if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
108 /* quick check to see if the string might be there
109 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
110 */
111 found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
112
113 /* if it isn't there the we are done */
114 if (found) {
115 RETURN_LONG(found - haystack);
116 }
117 RETURN_FALSE;
118 }
119
120 /* do utf16 part of the strpos */
121 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
122
123 if ( ret_pos >= 0 ) {
124 RETURN_LONG(ret_pos);
125 } else {
126 RETURN_FALSE;
127 }
128 }
129 /* }}} */
130
131 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)132 PHP_FUNCTION(grapheme_stripos)
133 {
134 char *haystack, *needle;
135 size_t haystack_len, needle_len;
136 const char *found;
137 zend_long loffset = 0;
138 int32_t offset = 0;
139 zend_long ret_pos;
140 int is_ascii;
141
142 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
143 RETURN_THROWS();
144 }
145
146 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
147 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
148 RETURN_THROWS();
149 }
150
151 /* we checked that it will fit: */
152 offset = (int32_t) loffset;
153
154 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
155
156 is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
157
158 if ( is_ascii ) {
159 char *haystack_dup, *needle_dup;
160 int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
161 needle_dup = estrndup(needle, needle_len);
162 zend_str_tolower(needle_dup, needle_len);
163 haystack_dup = estrndup(haystack, haystack_len);
164 zend_str_tolower(haystack_dup, haystack_len);
165
166 found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
167
168 efree(haystack_dup);
169 efree(needle_dup);
170
171 if (found) {
172 RETURN_LONG(found - haystack_dup);
173 }
174
175 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
176 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
177 RETURN_FALSE;
178 }
179 }
180
181 /* do utf16 part of the strpos */
182 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
183
184 if ( ret_pos >= 0 ) {
185 RETURN_LONG(ret_pos);
186 } else {
187 RETURN_FALSE;
188 }
189
190 }
191 /* }}} */
192
193 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)194 PHP_FUNCTION(grapheme_strrpos)
195 {
196 char *haystack, *needle;
197 size_t haystack_len, needle_len;
198 zend_long loffset = 0;
199 int32_t offset = 0;
200 zend_long ret_pos;
201 int is_ascii;
202
203 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
204 RETURN_THROWS();
205 }
206
207 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
208 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
209 RETURN_THROWS();
210 }
211
212 /* we checked that it will fit: */
213 offset = (int32_t) loffset;
214
215 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
216
217 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
218
219 if ( is_ascii ) {
220
221 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
222
223 if ( ret_pos >= 0 ) {
224 RETURN_LONG(ret_pos);
225 }
226
227 /* if the needle was ascii too, we are done */
228
229 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
230 RETURN_FALSE;
231 }
232
233 /* else we need to continue via utf16 */
234 }
235
236 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
237
238 if ( ret_pos >= 0 ) {
239 RETURN_LONG(ret_pos);
240 } else {
241 RETURN_FALSE;
242 }
243
244
245 }
246 /* }}} */
247
248 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)249 PHP_FUNCTION(grapheme_strripos)
250 {
251 char *haystack, *needle;
252 size_t haystack_len, needle_len;
253 zend_long loffset = 0;
254 int32_t offset = 0;
255 zend_long ret_pos;
256 int is_ascii;
257
258 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
259 RETURN_THROWS();
260 }
261
262 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
263 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
264 RETURN_THROWS();
265 }
266
267 /* we checked that it will fit: */
268 offset = (int32_t) loffset;
269
270 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
271
272 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
273
274 if ( is_ascii ) {
275 char *needle_dup, *haystack_dup;
276
277 needle_dup = estrndup(needle, needle_len);
278 zend_str_tolower(needle_dup, needle_len);
279 haystack_dup = estrndup(haystack, haystack_len);
280 zend_str_tolower(haystack_dup, haystack_len);
281
282 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
283
284 efree(haystack_dup);
285 efree(needle_dup);
286
287 if ( ret_pos >= 0 ) {
288 RETURN_LONG(ret_pos);
289 }
290
291 /* if the needle was ascii too, we are done */
292
293 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
294 RETURN_FALSE;
295 }
296
297 /* else we need to continue via utf16 */
298 }
299
300 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
301
302 if ( ret_pos >= 0 ) {
303 RETURN_LONG(ret_pos);
304 } else {
305 RETURN_FALSE;
306 }
307
308
309 }
310 /* }}} */
311
312 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)313 PHP_FUNCTION(grapheme_substr)
314 {
315 char *str;
316 zend_string *u8_sub_str;
317 UChar *ustr;
318 size_t str_len;
319 int32_t ustr_len;
320 zend_long lstart = 0, length = 0;
321 int32_t start = 0;
322 int iter_val;
323 UErrorCode status;
324 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
325 UBreakIterator* bi = NULL;
326 int sub_str_start_pos, sub_str_end_pos;
327 int32_t (*iter_func)(UBreakIterator *);
328 bool no_length = 1;
329
330 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
331 RETURN_THROWS();
332 }
333
334 if (lstart < INT32_MIN || lstart > INT32_MAX) {
335 zend_argument_value_error(2, "is too large");
336 RETURN_THROWS();
337 }
338
339 start = (int32_t) lstart;
340
341 if (no_length) {
342 length = str_len;
343 }
344
345 if (length < INT32_MIN || length > INT32_MAX) {
346 zend_argument_value_error(3, "is too large");
347 RETURN_THROWS();
348 }
349
350 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
351
352 if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
353 int32_t asub_str_len;
354 char *sub_str;
355 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
356
357 if ( NULL == sub_str ) {
358 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
359 RETURN_FALSE;
360 }
361
362 RETURN_STRINGL(sub_str, asub_str_len);
363 }
364
365 ustr = NULL;
366 ustr_len = 0;
367 status = U_ZERO_ERROR;
368 intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
369
370 if ( U_FAILURE( status ) ) {
371 /* Set global error code. */
372 intl_error_set_code( NULL, status );
373
374 /* Set error messages. */
375 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
376 if (ustr) {
377 efree( ustr );
378 }
379 RETURN_FALSE;
380 }
381
382 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
383
384 if( U_FAILURE(status) ) {
385 RETURN_FALSE;
386 }
387
388 ubrk_setText(bi, ustr, ustr_len, &status);
389
390 if ( start < 0 ) {
391 iter_func = ubrk_previous;
392 ubrk_last(bi);
393 iter_val = 1;
394 }
395 else {
396 iter_func = ubrk_next;
397 iter_val = -1;
398 }
399
400 sub_str_start_pos = 0;
401
402 while ( start ) {
403 sub_str_start_pos = iter_func(bi);
404
405 if ( UBRK_DONE == sub_str_start_pos ) {
406 break;
407 }
408
409 start += iter_val;
410 }
411
412 if (0 != start) {
413 if (start > 0) {
414 if (ustr) {
415 efree(ustr);
416 }
417 ubrk_close(bi);
418 RETURN_EMPTY_STRING();
419 }
420
421 sub_str_start_pos = 0;
422 ubrk_first(bi);
423 }
424
425 /* OK to convert here since if str_len were big, convert above would fail */
426 if (length >= (int32_t)str_len) {
427
428 /* no length supplied or length is too big, return the rest of the string */
429
430 status = U_ZERO_ERROR;
431 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
432
433 if (ustr) {
434 efree( ustr );
435 }
436 ubrk_close( bi );
437
438 if ( !u8_sub_str ) {
439 /* Set global error code. */
440 intl_error_set_code( NULL, status );
441
442 /* Set error messages. */
443 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
444
445 RETURN_FALSE;
446 }
447
448 /* return the allocated string, not a duplicate */
449 RETVAL_NEW_STR(u8_sub_str);
450 return;
451 }
452
453 if(length == 0) {
454 /* empty length - we've validated start, we can return "" now */
455 if (ustr) {
456 efree(ustr);
457 }
458 ubrk_close(bi);
459 RETURN_EMPTY_STRING();
460 }
461
462 /* find the end point of the string to return */
463
464 if ( length < 0 ) {
465 iter_func = ubrk_previous;
466 ubrk_last(bi);
467 iter_val = 1;
468 }
469 else {
470 iter_func = ubrk_next;
471 iter_val = -1;
472 }
473
474 sub_str_end_pos = 0;
475
476 while ( length ) {
477 sub_str_end_pos = iter_func(bi);
478
479 if ( UBRK_DONE == sub_str_end_pos ) {
480 break;
481 }
482
483 length += iter_val;
484 }
485
486 ubrk_close(bi);
487
488 if ( UBRK_DONE == sub_str_end_pos) {
489 if (length < 0) {
490 efree(ustr);
491 RETURN_EMPTY_STRING();
492 } else {
493 sub_str_end_pos = ustr_len;
494 }
495 }
496
497 if (sub_str_start_pos > sub_str_end_pos) {
498 efree(ustr);
499 RETURN_EMPTY_STRING();
500 }
501
502 status = U_ZERO_ERROR;
503 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
504
505 efree( ustr );
506
507 if ( !u8_sub_str ) {
508 /* Set global error code. */
509 intl_error_set_code( NULL, status );
510
511 /* Set error messages. */
512 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
513
514 RETURN_FALSE;
515 }
516
517 /* return the allocated string, not a duplicate */
518 RETVAL_NEW_STR(u8_sub_str);
519 }
520 /* }}} */
521
522 /* {{{ strstr_common_handler */
/*
 * Shared implementation of grapheme_strstr() and grapheme_stristr().
 *
 * Parses (haystack, needle [, before_needle]) and returns either the part of
 * haystack starting at the first match, or - when the optional boolean is
 * true - the part before it. Returns FALSE when the needle is not found.
 * f_ignore_case selects the case-insensitive search.
 */
static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
{
	char *haystack, *needle;
	size_t haystack_len, needle_len;
	int32_t byte_pos, uchar_pos;
	bool part = 0;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
		RETURN_THROWS();
	}

	if (!f_ignore_case) {
		/* Byte-level pre-check: an exact search that finds nothing settles the question. */
		const char *hit = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);

		if (hit == NULL) {
			RETURN_FALSE;
		}

		/* When the haystack passes the ASCII check the byte match is the final answer. */
		if (grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
			size_t prefix_len = hit - haystack;

			if (part) {
				RETVAL_STRINGL(haystack, prefix_len);
			} else {
				RETVAL_STRINGL(hit, haystack_len - prefix_len);
			}
			return;
		}
	}

	/* Otherwise the match has to be located in UTF-16. */
	byte_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /* last */);

	if (byte_pos < 0) {
		RETURN_FALSE;
	}

	/*
	 * uchar_pos is the 'nth' Unicode character position of the needle;
	 * advance that many code points through the UTF-8 haystack to obtain
	 * the matching byte offset.
	 */
	byte_pos = 0;
	U8_FWD_N(haystack, byte_pos, haystack_len, uchar_pos);

	if (part) {
		RETVAL_STRINGL(haystack, byte_pos);
	} else {
		RETVAL_STRINGL(haystack + byte_pos, haystack_len - byte_pos);
	}
}
577 /* }}} */
578
579 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)580 PHP_FUNCTION(grapheme_strstr)
581 {
582 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
583 }
584 /* }}} */
585
/* {{{ Finds first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stristr)587 PHP_FUNCTION(grapheme_stristr)
588 {
589 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
590 }
591 /* }}} */
592
593 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
594 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)595 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
596 {
597 int pos = 0;
598 int ret_pos = 0;
599 int break_pos, prev_break_pos;
600 int count = 0;
601
602 while ( 1 ) {
603 pos = ubrk_next(bi);
604
605 if ( UBRK_DONE == pos ) {
606 break;
607 }
608
609 for ( break_pos = ret_pos; break_pos < pos; ) {
610 count++;
611 prev_break_pos = break_pos;
612 U8_FWD_1(pstr, break_pos, str_len);
613
614 if ( prev_break_pos == break_pos ) {
615 /* something wrong - malformed utf8? */
616 csize = 0;
617 break;
618 }
619 }
620
621 /* if we are beyond our limit, then the loop is done */
622 if ( count > csize ) {
623 break;
624 }
625
626 ret_pos = break_pos;
627 }
628
629 return ret_pos;
630 }
631 /* }}} */
632
633 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
634 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)635 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
636 {
637 int pos = 0;
638 int ret_pos = 0;
639
640 while ( 1 ) {
641 pos = ubrk_next(bi);
642
643 if ( UBRK_DONE == pos ) {
644 break;
645 }
646
647 if ( pos > bsize ) {
648 break;
649 }
650
651 ret_pos = pos;
652 }
653
654 return ret_pos;
655 }
656 /* }}} */
657
658 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
659 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)660 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
661 {
662 int next_pos = 0;
663 int ret_pos = 0;
664
665 while ( size ) {
666 next_pos = ubrk_next(bi);
667
668 if ( UBRK_DONE == next_pos ) {
669 break;
670 }
671 ret_pos = next_pos;
672 size--;
673 }
674
675 return ret_pos;
676 }
677 /* }}} */
678
679 /* {{{ grapheme extract iter function pointer array */
680 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
681
682 static grapheme_extract_iter grapheme_extract_iters[] = {
683 &grapheme_extract_count_iter,
684 &grapheme_extract_bytecount_iter,
685 &grapheme_extract_charcount_iter,
686 };
687 /* }}} */
688
689 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)690 PHP_FUNCTION(grapheme_extract)
691 {
692 char *str, *pstr;
693 UText ut = UTEXT_INITIALIZER;
694 size_t str_len;
695 zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
696 zend_long lstart = 0; /* starting position in str in bytes */
697 int32_t start = 0;
698 zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
699 UErrorCode status;
700 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
701 UBreakIterator* bi = NULL;
702 int ret_pos;
703 zval *next = NULL; /* return offset of next part of the string */
704
705 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
706 RETURN_THROWS();
707 }
708
709 if (lstart < 0) {
710 lstart += str_len;
711 }
712
713 if ( NULL != next ) {
714 if ( !Z_ISREF_P(next) ) {
715 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
716 "grapheme_extract: 'next' was not passed by reference", 0 );
717 RETURN_FALSE;
718 } else {
719 ZVAL_DEREF(next);
720 /* initialize next */
721 zval_ptr_dtor(next);
722 ZVAL_LONG(next, lstart);
723 }
724 }
725
726 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
727 zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
728 RETURN_THROWS();
729 }
730
731 if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
732 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
733 RETURN_FALSE;
734 }
735
736 if (size < 0) {
737 zend_argument_value_error(2, "must be greater than or equal to 0");
738 RETURN_THROWS();
739 }
740
741 if (size > INT32_MAX) {
742 zend_argument_value_error(2, "is too large");
743 RETURN_THROWS();
744 }
745
746 if (size == 0) {
747 RETURN_EMPTY_STRING();
748 }
749
750 /* we checked that it will fit: */
751 start = (int32_t) lstart;
752
753 pstr = str + start;
754
755 /* just in case pstr points in the middle of a character, move forward to the start of the next char */
756 if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
757 char *str_end = str + str_len;
758
759 while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
760 pstr++;
761 if ( pstr >= str_end ) {
762 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
763 "grapheme_extract: invalid input string", 0 );
764
765 RETURN_FALSE;
766 }
767 }
768 }
769
770 str_len -= (pstr - str);
771
772 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
773 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
774 */
775
776 if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
777 size_t nsize = MIN(size, str_len);
778 if ( NULL != next ) {
779 ZVAL_LONG(next, start+nsize);
780 }
781 RETURN_STRINGL(pstr, nsize);
782 }
783
784 status = U_ZERO_ERROR;
785 utext_openUTF8(&ut, pstr, str_len, &status);
786
787 if ( U_FAILURE( status ) ) {
788 /* Set global error code. */
789 intl_error_set_code( NULL, status );
790
791 /* Set error messages. */
792 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
793
794 RETURN_FALSE;
795 }
796
797 bi = NULL;
798 status = U_ZERO_ERROR;
799 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
800
801 ubrk_setUText(bi, &ut, &status);
802 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
803 can't back up. So, we will not do anything. */
804
805 /* now we need to find the end of the chunk the user wants us to return */
806 /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
807 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
808
809 utext_close(&ut);
810 ubrk_close(bi);
811
812 if ( NULL != next ) {
813 ZVAL_LONG(next, start+ret_pos);
814 }
815
816 RETURN_STRINGL(((char *)pstr), ret_pos);
817 }
818
819 /* }}} */
820