1 /*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | https://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Author: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29
30 /* }}} */
31
32 #define GRAPHEME_EXTRACT_TYPE_COUNT 0
33 #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
34 #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
35 #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
36 #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
37
38
39 /* {{{ grapheme_register_constants
40 * Register API constants
41 */
grapheme_register_constants(INIT_FUNC_ARGS)42 void grapheme_register_constants( INIT_FUNC_ARGS )
43 {
44 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
45 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
46 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
47 }
48 /* }}} */
49
50 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)51 PHP_FUNCTION(grapheme_strlen)
52 {
53 char* string;
54 size_t string_len;
55 UChar* ustring = NULL;
56 int ustring_len = 0;
57 zend_long ret_len;
58 UErrorCode status;
59
60 if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
61 RETURN_THROWS();
62 }
63
64 ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
65
66 if ( ret_len >= 0 )
67 RETURN_LONG(string_len);
68
69 /* convert the string to UTF-16. */
70 status = U_ZERO_ERROR;
71 intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
72
73 if ( U_FAILURE( status ) ) {
74 /* Set global error code. */
75 intl_error_set_code( NULL, status );
76
77 /* Set error messages. */
78 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
79 if (ustring) {
80 efree( ustring );
81 }
82 RETURN_NULL();
83 }
84
85 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
86
87 if (ustring) {
88 efree( ustring );
89 }
90
91 if (ret_len >= 0) {
92 RETVAL_LONG(ret_len);
93 } else {
94 RETVAL_FALSE;
95 }
96 }
97 /* }}} */
98
99 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)100 PHP_FUNCTION(grapheme_strpos)
101 {
102 char *haystack, *needle;
103 size_t haystack_len, needle_len;
104 const char *found;
105 zend_long loffset = 0;
106 int32_t offset = 0;
107 size_t noffset = 0;
108 zend_long ret_pos;
109
110 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
111 RETURN_THROWS();
112 }
113
114 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
115 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
116 RETURN_THROWS();
117 }
118
119 /* we checked that it will fit: */
120 offset = (int32_t) loffset;
121 noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
122
123 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
124
125 if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
126 /* quick check to see if the string might be there
127 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
128 */
129 found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
130
131 /* if it isn't there the we are done */
132 if (found) {
133 RETURN_LONG(found - haystack);
134 }
135 RETURN_FALSE;
136 }
137
138 /* do utf16 part of the strpos */
139 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
140
141 if ( ret_pos >= 0 ) {
142 RETURN_LONG(ret_pos);
143 } else {
144 RETURN_FALSE;
145 }
146 }
147 /* }}} */
148
149 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)150 PHP_FUNCTION(grapheme_stripos)
151 {
152 char *haystack, *needle;
153 size_t haystack_len, needle_len;
154 const char *found;
155 zend_long loffset = 0;
156 int32_t offset = 0;
157 zend_long ret_pos;
158 int is_ascii;
159
160 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
161 RETURN_THROWS();
162 }
163
164 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
165 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
166 RETURN_THROWS();
167 }
168
169 /* we checked that it will fit: */
170 offset = (int32_t) loffset;
171
172 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
173
174 is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
175
176 if ( is_ascii ) {
177 char *haystack_dup, *needle_dup;
178 int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
179 needle_dup = estrndup(needle, needle_len);
180 zend_str_tolower(needle_dup, needle_len);
181 haystack_dup = estrndup(haystack, haystack_len);
182 zend_str_tolower(haystack_dup, haystack_len);
183
184 found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
185
186 efree(haystack_dup);
187 efree(needle_dup);
188
189 if (found) {
190 RETURN_LONG(found - haystack_dup);
191 }
192
193 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
194 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
195 RETURN_FALSE;
196 }
197 }
198
199 /* do utf16 part of the strpos */
200 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
201
202 if ( ret_pos >= 0 ) {
203 RETURN_LONG(ret_pos);
204 } else {
205 RETURN_FALSE;
206 }
207
208 }
209 /* }}} */
210
211 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)212 PHP_FUNCTION(grapheme_strrpos)
213 {
214 char *haystack, *needle;
215 size_t haystack_len, needle_len;
216 zend_long loffset = 0;
217 int32_t offset = 0;
218 zend_long ret_pos;
219 int is_ascii;
220
221 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
222 RETURN_THROWS();
223 }
224
225 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
226 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
227 RETURN_THROWS();
228 }
229
230 /* we checked that it will fit: */
231 offset = (int32_t) loffset;
232
233 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
234
235 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
236
237 if ( is_ascii ) {
238
239 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
240
241 if ( ret_pos >= 0 ) {
242 RETURN_LONG(ret_pos);
243 }
244
245 /* if the needle was ascii too, we are done */
246
247 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
248 RETURN_FALSE;
249 }
250
251 /* else we need to continue via utf16 */
252 }
253
254 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
255
256 if ( ret_pos >= 0 ) {
257 RETURN_LONG(ret_pos);
258 } else {
259 RETURN_FALSE;
260 }
261
262
263 }
264 /* }}} */
265
266 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)267 PHP_FUNCTION(grapheme_strripos)
268 {
269 char *haystack, *needle;
270 size_t haystack_len, needle_len;
271 zend_long loffset = 0;
272 int32_t offset = 0;
273 zend_long ret_pos;
274 int is_ascii;
275
276 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
277 RETURN_THROWS();
278 }
279
280 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
281 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
282 RETURN_THROWS();
283 }
284
285 /* we checked that it will fit: */
286 offset = (int32_t) loffset;
287
288 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
289
290 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
291
292 if ( is_ascii ) {
293 char *needle_dup, *haystack_dup;
294
295 needle_dup = estrndup(needle, needle_len);
296 zend_str_tolower(needle_dup, needle_len);
297 haystack_dup = estrndup(haystack, haystack_len);
298 zend_str_tolower(haystack_dup, haystack_len);
299
300 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
301
302 efree(haystack_dup);
303 efree(needle_dup);
304
305 if ( ret_pos >= 0 ) {
306 RETURN_LONG(ret_pos);
307 }
308
309 /* if the needle was ascii too, we are done */
310
311 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
312 RETURN_FALSE;
313 }
314
315 /* else we need to continue via utf16 */
316 }
317
318 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
319
320 if ( ret_pos >= 0 ) {
321 RETURN_LONG(ret_pos);
322 } else {
323 RETURN_FALSE;
324 }
325
326
327 }
328 /* }}} */
329
330 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)331 PHP_FUNCTION(grapheme_substr)
332 {
333 char *str;
334 zend_string *u8_sub_str;
335 UChar *ustr;
336 size_t str_len;
337 int32_t ustr_len;
338 zend_long lstart = 0, length = 0;
339 int32_t start = 0;
340 int iter_val;
341 UErrorCode status;
342 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
343 UBreakIterator* bi = NULL;
344 int sub_str_start_pos, sub_str_end_pos;
345 int32_t (*iter_func)(UBreakIterator *);
346 bool no_length = 1;
347
348 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
349 RETURN_THROWS();
350 }
351
352 if (lstart < INT32_MIN || lstart > INT32_MAX) {
353 zend_argument_value_error(2, "is too large");
354 RETURN_THROWS();
355 }
356
357 start = (int32_t) lstart;
358
359 if (no_length) {
360 length = str_len;
361 }
362
363 if (length < INT32_MIN || length > INT32_MAX) {
364 zend_argument_value_error(3, "is too large");
365 RETURN_THROWS();
366 }
367
368 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
369
370 if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
371 int32_t asub_str_len;
372 char *sub_str;
373 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
374
375 if ( NULL == sub_str ) {
376 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
377 RETURN_FALSE;
378 }
379
380 RETURN_STRINGL(sub_str, asub_str_len);
381 }
382
383 ustr = NULL;
384 ustr_len = 0;
385 status = U_ZERO_ERROR;
386 intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
387
388 if ( U_FAILURE( status ) ) {
389 /* Set global error code. */
390 intl_error_set_code( NULL, status );
391
392 /* Set error messages. */
393 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
394 if (ustr) {
395 efree( ustr );
396 }
397 RETURN_FALSE;
398 }
399
400 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
401
402 if( U_FAILURE(status) ) {
403 RETURN_FALSE;
404 }
405
406 ubrk_setText(bi, ustr, ustr_len, &status);
407
408 if ( start < 0 ) {
409 iter_func = ubrk_previous;
410 ubrk_last(bi);
411 iter_val = 1;
412 }
413 else {
414 iter_func = ubrk_next;
415 iter_val = -1;
416 }
417
418 sub_str_start_pos = 0;
419
420 while ( start ) {
421 sub_str_start_pos = iter_func(bi);
422
423 if ( UBRK_DONE == sub_str_start_pos ) {
424 break;
425 }
426
427 start += iter_val;
428 }
429
430 if (0 != start) {
431 if (start > 0) {
432 if (ustr) {
433 efree(ustr);
434 }
435 ubrk_close(bi);
436 RETURN_EMPTY_STRING();
437 }
438
439 sub_str_start_pos = 0;
440 ubrk_first(bi);
441 }
442
443 /* OK to convert here since if str_len were big, convert above would fail */
444 if (length >= (int32_t)str_len) {
445
446 /* no length supplied or length is too big, return the rest of the string */
447
448 status = U_ZERO_ERROR;
449 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
450
451 if (ustr) {
452 efree( ustr );
453 }
454 ubrk_close( bi );
455
456 if ( !u8_sub_str ) {
457 /* Set global error code. */
458 intl_error_set_code( NULL, status );
459
460 /* Set error messages. */
461 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
462
463 RETURN_FALSE;
464 }
465
466 /* return the allocated string, not a duplicate */
467 RETVAL_NEW_STR(u8_sub_str);
468 return;
469 }
470
471 if(length == 0) {
472 /* empty length - we've validated start, we can return "" now */
473 if (ustr) {
474 efree(ustr);
475 }
476 ubrk_close(bi);
477 RETURN_EMPTY_STRING();
478 }
479
480 /* find the end point of the string to return */
481
482 if ( length < 0 ) {
483 iter_func = ubrk_previous;
484 ubrk_last(bi);
485 iter_val = 1;
486 }
487 else {
488 iter_func = ubrk_next;
489 iter_val = -1;
490 }
491
492 sub_str_end_pos = 0;
493
494 while ( length ) {
495 sub_str_end_pos = iter_func(bi);
496
497 if ( UBRK_DONE == sub_str_end_pos ) {
498 break;
499 }
500
501 length += iter_val;
502 }
503
504 ubrk_close(bi);
505
506 if ( UBRK_DONE == sub_str_end_pos) {
507 if (length < 0) {
508 efree(ustr);
509 RETURN_EMPTY_STRING();
510 } else {
511 sub_str_end_pos = ustr_len;
512 }
513 }
514
515 if (sub_str_start_pos > sub_str_end_pos) {
516 efree(ustr);
517 RETURN_EMPTY_STRING();
518 }
519
520 status = U_ZERO_ERROR;
521 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
522
523 efree( ustr );
524
525 if ( !u8_sub_str ) {
526 /* Set global error code. */
527 intl_error_set_code( NULL, status );
528
529 /* Set error messages. */
530 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
531
532 RETURN_FALSE;
533 }
534
535 /* return the allocated string, not a duplicate */
536 RETVAL_NEW_STR(u8_sub_str);
537 }
538 /* }}} */
539
540 /* {{{ strstr_common_handler */
static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
{
	char *haystack, *needle;
	size_t haystack_len, needle_len;
	int32_t byte_pos, uchar_pos;
	bool part = 0;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
		RETURN_THROWS();
	}

	if (!f_ignore_case) {
		/* Case-sensitive shortcut: a byte-wise miss is conclusive. */
		const char *hit = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);

		if (hit == NULL) {
			RETURN_FALSE;
		}

		/* For an ASCII haystack the byte-wise hit is used as the result. */
		if (grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
			size_t prefix_len = hit - haystack;

			if (!part) {
				RETURN_STRINGL(hit, haystack_len - prefix_len);
			}
			RETURN_STRINGL(haystack, prefix_len);
		}
	}

	/* Otherwise search in UTF-16; uchar_pos receives the match position. */
	byte_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /* last */);

	if (byte_pos < 0) {
		RETURN_FALSE;
	}

	/* uchar_pos is the 'nth' Unicode character position of the needle:
	 * advance that many code points through the UTF-8 haystack to obtain
	 * the corresponding byte offset. */
	byte_pos = 0;
	U8_FWD_N(haystack, byte_pos, haystack_len, uchar_pos);

	if (!part) {
		RETURN_STRINGL(haystack + byte_pos, haystack_len - byte_pos);
	}
	RETURN_STRINGL(haystack, byte_pos);
}
595 /* }}} */
596
597 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)598 PHP_FUNCTION(grapheme_strstr)
599 {
600 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
601 }
602 /* }}} */
603
/* {{{ Finds first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stristr)605 PHP_FUNCTION(grapheme_stristr)
606 {
607 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
608 }
609 /* }}} */
610
611 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
612 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)613 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
614 {
615 int pos = 0;
616 int ret_pos = 0;
617 int break_pos, prev_break_pos;
618 int count = 0;
619
620 while ( 1 ) {
621 pos = ubrk_next(bi);
622
623 if ( UBRK_DONE == pos ) {
624 break;
625 }
626
627 for ( break_pos = ret_pos; break_pos < pos; ) {
628 count++;
629 prev_break_pos = break_pos;
630 U8_FWD_1(pstr, break_pos, str_len);
631
632 if ( prev_break_pos == break_pos ) {
633 /* something wrong - malformed utf8? */
634 csize = 0;
635 break;
636 }
637 }
638
639 /* if we are beyond our limit, then the loop is done */
640 if ( count > csize ) {
641 break;
642 }
643
644 ret_pos = break_pos;
645 }
646
647 return ret_pos;
648 }
649 /* }}} */
650
651 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
652 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)653 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
654 {
655 int pos = 0;
656 int ret_pos = 0;
657
658 while ( 1 ) {
659 pos = ubrk_next(bi);
660
661 if ( UBRK_DONE == pos ) {
662 break;
663 }
664
665 if ( pos > bsize ) {
666 break;
667 }
668
669 ret_pos = pos;
670 }
671
672 return ret_pos;
673 }
674 /* }}} */
675
676 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
677 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)678 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
679 {
680 int next_pos = 0;
681 int ret_pos = 0;
682
683 while ( size ) {
684 next_pos = ubrk_next(bi);
685
686 if ( UBRK_DONE == next_pos ) {
687 break;
688 }
689 ret_pos = next_pos;
690 size--;
691 }
692
693 return ret_pos;
694 }
695 /* }}} */
696
697 /* {{{ grapheme extract iter function pointer array */
698 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
699
700 static grapheme_extract_iter grapheme_extract_iters[] = {
701 &grapheme_extract_count_iter,
702 &grapheme_extract_bytecount_iter,
703 &grapheme_extract_charcount_iter,
704 };
705 /* }}} */
706
707 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)708 PHP_FUNCTION(grapheme_extract)
709 {
710 char *str, *pstr;
711 UText ut = UTEXT_INITIALIZER;
712 size_t str_len;
713 zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
714 zend_long lstart = 0; /* starting position in str in bytes */
715 int32_t start = 0;
716 zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
717 UErrorCode status;
718 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
719 UBreakIterator* bi = NULL;
720 int ret_pos;
721 zval *next = NULL; /* return offset of next part of the string */
722
723 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
724 RETURN_THROWS();
725 }
726
727 if (lstart < 0) {
728 lstart += str_len;
729 }
730
731 if ( NULL != next ) {
732 if ( !Z_ISREF_P(next) ) {
733 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
734 "grapheme_extract: 'next' was not passed by reference", 0 );
735 RETURN_FALSE;
736 } else {
737 ZVAL_DEREF(next);
738 /* initialize next */
739 zval_ptr_dtor(next);
740 ZVAL_LONG(next, lstart);
741 }
742 }
743
744 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
745 zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
746 RETURN_THROWS();
747 }
748
749 if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
750 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
751 RETURN_FALSE;
752 }
753
754 if (size < 0) {
755 zend_argument_value_error(2, "must be greater than or equal to 0");
756 RETURN_THROWS();
757 }
758
759 if (size > INT32_MAX) {
760 zend_argument_value_error(2, "is too large");
761 RETURN_THROWS();
762 }
763
764 if (size == 0) {
765 RETURN_EMPTY_STRING();
766 }
767
768 /* we checked that it will fit: */
769 start = (int32_t) lstart;
770
771 pstr = str + start;
772
773 /* just in case pstr points in the middle of a character, move forward to the start of the next char */
774 if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
775 char *str_end = str + str_len;
776
777 while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
778 pstr++;
779 if ( pstr >= str_end ) {
780 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
781 "grapheme_extract: invalid input string", 0 );
782
783 RETURN_FALSE;
784 }
785 }
786 }
787
788 str_len -= (pstr - str);
789
790 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
791 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
792 */
793
794 if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
795 size_t nsize = MIN(size, str_len);
796 if ( NULL != next ) {
797 ZVAL_LONG(next, start+nsize);
798 }
799 RETURN_STRINGL(pstr, nsize);
800 }
801
802 status = U_ZERO_ERROR;
803 utext_openUTF8(&ut, pstr, str_len, &status);
804
805 if ( U_FAILURE( status ) ) {
806 /* Set global error code. */
807 intl_error_set_code( NULL, status );
808
809 /* Set error messages. */
810 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
811
812 RETURN_FALSE;
813 }
814
815 bi = NULL;
816 status = U_ZERO_ERROR;
817 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
818
819 ubrk_setUText(bi, &ut, &status);
820 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
821 can't back up. So, we will not do anything. */
822
823 /* now we need to find the end of the chunk the user wants us to return */
824 /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
825 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
826
827 utext_close(&ut);
828 ubrk_close(bi);
829
830 if ( NULL != next ) {
831 ZVAL_LONG(next, start+ret_pos);
832 }
833
834 RETURN_STRINGL(((char *)pstr), ret_pos);
835 }
836
837 /* }}} */
838