1 /*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | https://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Author: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif
19
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29
30 /* }}} */
31
32 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)33 PHP_FUNCTION(grapheme_strlen)
34 {
35 char* string;
36 size_t string_len;
37 UChar* ustring = NULL;
38 int ustring_len = 0;
39 zend_long ret_len;
40 UErrorCode status;
41
42 ZEND_PARSE_PARAMETERS_START(1, 1)
43 Z_PARAM_STRING(string, string_len)
44 ZEND_PARSE_PARAMETERS_END();
45
46 ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
47
48 if ( ret_len >= 0 )
49 RETURN_LONG(string_len);
50
51 /* convert the string to UTF-16. */
52 status = U_ZERO_ERROR;
53 intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
54
55 if ( U_FAILURE( status ) ) {
56 /* Set global error code. */
57 intl_error_set_code( NULL, status );
58
59 /* Set error messages. */
60 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
61 if (ustring) {
62 efree( ustring );
63 }
64 RETURN_NULL();
65 }
66
67 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
68
69 if (ustring) {
70 efree( ustring );
71 }
72
73 if (ret_len >= 0) {
74 RETVAL_LONG(ret_len);
75 } else {
76 RETVAL_FALSE;
77 }
78 }
79 /* }}} */
80
81 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)82 PHP_FUNCTION(grapheme_strpos)
83 {
84 char *haystack, *needle;
85 size_t haystack_len, needle_len;
86 const char *found;
87 zend_long loffset = 0;
88 int32_t offset = 0;
89 size_t noffset = 0;
90 zend_long ret_pos;
91
92 ZEND_PARSE_PARAMETERS_START(2, 3)
93 Z_PARAM_STRING(haystack, haystack_len)
94 Z_PARAM_STRING(needle, needle_len)
95 Z_PARAM_OPTIONAL
96 Z_PARAM_LONG(loffset)
97 ZEND_PARSE_PARAMETERS_END();
98
99 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
100 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
101 RETURN_THROWS();
102 }
103
104 /* we checked that it will fit: */
105 offset = (int32_t) loffset;
106 noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
107
108 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
109
110 if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
111 /* quick check to see if the string might be there
112 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
113 */
114 found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
115
116 /* if it isn't there the we are done */
117 if (found) {
118 RETURN_LONG(found - haystack);
119 }
120 RETURN_FALSE;
121 }
122
123 /* do utf16 part of the strpos */
124 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
125
126 if ( ret_pos >= 0 ) {
127 RETURN_LONG(ret_pos);
128 } else {
129 RETURN_FALSE;
130 }
131 }
132 /* }}} */
133
134 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)135 PHP_FUNCTION(grapheme_stripos)
136 {
137 char *haystack, *needle;
138 size_t haystack_len, needle_len;
139 const char *found;
140 zend_long loffset = 0;
141 int32_t offset = 0;
142 zend_long ret_pos;
143 int is_ascii;
144
145 ZEND_PARSE_PARAMETERS_START(2, 3)
146 Z_PARAM_STRING(haystack, haystack_len)
147 Z_PARAM_STRING(needle, needle_len)
148 Z_PARAM_OPTIONAL
149 Z_PARAM_LONG(loffset)
150 ZEND_PARSE_PARAMETERS_END();
151
152 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
153 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
154 RETURN_THROWS();
155 }
156
157 /* we checked that it will fit: */
158 offset = (int32_t) loffset;
159
160 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
161
162 is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
163
164 if ( is_ascii ) {
165 char *haystack_dup, *needle_dup;
166 int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
167 needle_dup = estrndup(needle, needle_len);
168 zend_str_tolower(needle_dup, needle_len);
169 haystack_dup = estrndup(haystack, haystack_len);
170 zend_str_tolower(haystack_dup, haystack_len);
171
172 found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
173
174 efree(haystack_dup);
175 efree(needle_dup);
176
177 if (found) {
178 RETURN_LONG(found - haystack_dup);
179 }
180
181 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
182 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
183 RETURN_FALSE;
184 }
185 }
186
187 /* do utf16 part of the strpos */
188 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
189
190 if ( ret_pos >= 0 ) {
191 RETURN_LONG(ret_pos);
192 } else {
193 RETURN_FALSE;
194 }
195
196 }
197 /* }}} */
198
199 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)200 PHP_FUNCTION(grapheme_strrpos)
201 {
202 char *haystack, *needle;
203 size_t haystack_len, needle_len;
204 zend_long loffset = 0;
205 int32_t offset = 0;
206 zend_long ret_pos;
207 int is_ascii;
208
209 ZEND_PARSE_PARAMETERS_START(2, 3)
210 Z_PARAM_STRING(haystack, haystack_len)
211 Z_PARAM_STRING(needle, needle_len)
212 Z_PARAM_OPTIONAL
213 Z_PARAM_LONG(loffset)
214 ZEND_PARSE_PARAMETERS_END();
215
216 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
217 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
218 RETURN_THROWS();
219 }
220
221 /* we checked that it will fit: */
222 offset = (int32_t) loffset;
223
224 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
225
226 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
227
228 if ( is_ascii ) {
229
230 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
231
232 if ( ret_pos >= 0 ) {
233 RETURN_LONG(ret_pos);
234 }
235
236 /* if the needle was ascii too, we are done */
237
238 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
239 RETURN_FALSE;
240 }
241
242 /* else we need to continue via utf16 */
243 }
244
245 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
246
247 if ( ret_pos >= 0 ) {
248 RETURN_LONG(ret_pos);
249 } else {
250 RETURN_FALSE;
251 }
252
253
254 }
255 /* }}} */
256
257 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)258 PHP_FUNCTION(grapheme_strripos)
259 {
260 char *haystack, *needle;
261 size_t haystack_len, needle_len;
262 zend_long loffset = 0;
263 int32_t offset = 0;
264 zend_long ret_pos;
265 int is_ascii;
266
267 ZEND_PARSE_PARAMETERS_START(2, 3)
268 Z_PARAM_STRING(haystack, haystack_len)
269 Z_PARAM_STRING(needle, needle_len)
270 Z_PARAM_OPTIONAL
271 Z_PARAM_LONG(loffset)
272 ZEND_PARSE_PARAMETERS_END();
273
274 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
275 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
276 RETURN_THROWS();
277 }
278
279 /* we checked that it will fit: */
280 offset = (int32_t) loffset;
281
282 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
283
284 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
285
286 if ( is_ascii ) {
287 char *needle_dup, *haystack_dup;
288
289 needle_dup = estrndup(needle, needle_len);
290 zend_str_tolower(needle_dup, needle_len);
291 haystack_dup = estrndup(haystack, haystack_len);
292 zend_str_tolower(haystack_dup, haystack_len);
293
294 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
295
296 efree(haystack_dup);
297 efree(needle_dup);
298
299 if ( ret_pos >= 0 ) {
300 RETURN_LONG(ret_pos);
301 }
302
303 /* if the needle was ascii too, we are done */
304
305 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
306 RETURN_FALSE;
307 }
308
309 /* else we need to continue via utf16 */
310 }
311
312 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
313
314 if ( ret_pos >= 0 ) {
315 RETURN_LONG(ret_pos);
316 } else {
317 RETURN_FALSE;
318 }
319
320
321 }
322 /* }}} */
323
324 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)325 PHP_FUNCTION(grapheme_substr)
326 {
327 char *str;
328 zend_string *u8_sub_str;
329 UChar *ustr;
330 size_t str_len;
331 int32_t ustr_len;
332 zend_long lstart = 0, length = 0;
333 int32_t start = 0;
334 int iter_val;
335 UErrorCode status;
336 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
337 UBreakIterator* bi = NULL;
338 int sub_str_start_pos, sub_str_end_pos;
339 int32_t (*iter_func)(UBreakIterator *);
340 bool no_length = true;
341
342 ZEND_PARSE_PARAMETERS_START(2, 3)
343 Z_PARAM_STRING(str, str_len)
344 Z_PARAM_LONG(lstart)
345 Z_PARAM_OPTIONAL
346 Z_PARAM_LONG_OR_NULL(length, no_length)
347 ZEND_PARSE_PARAMETERS_END();
348
349 if (lstart < INT32_MIN || lstart > INT32_MAX) {
350 zend_argument_value_error(2, "is too large");
351 RETURN_THROWS();
352 }
353
354 start = (int32_t) lstart;
355
356 if (no_length) {
357 length = str_len;
358 }
359
360 if (length < INT32_MIN || length > INT32_MAX) {
361 zend_argument_value_error(3, "is too large");
362 RETURN_THROWS();
363 }
364
365 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
366
367 if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
368 int32_t asub_str_len;
369 char *sub_str;
370 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
371
372 if ( NULL == sub_str ) {
373 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
374 RETURN_FALSE;
375 }
376
377 RETURN_STRINGL(sub_str, asub_str_len);
378 }
379
380 ustr = NULL;
381 ustr_len = 0;
382 status = U_ZERO_ERROR;
383 intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
384
385 if ( U_FAILURE( status ) ) {
386 /* Set global error code. */
387 intl_error_set_code( NULL, status );
388
389 /* Set error messages. */
390 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
391 if (ustr) {
392 efree( ustr );
393 }
394 RETURN_FALSE;
395 }
396
397 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
398
399 if( U_FAILURE(status) ) {
400 RETURN_FALSE;
401 }
402
403 ubrk_setText(bi, ustr, ustr_len, &status);
404
405 if ( start < 0 ) {
406 iter_func = ubrk_previous;
407 ubrk_last(bi);
408 iter_val = 1;
409 }
410 else {
411 iter_func = ubrk_next;
412 iter_val = -1;
413 }
414
415 sub_str_start_pos = 0;
416
417 while ( start ) {
418 sub_str_start_pos = iter_func(bi);
419
420 if ( UBRK_DONE == sub_str_start_pos ) {
421 break;
422 }
423
424 start += iter_val;
425 }
426
427 if (0 != start) {
428 if (start > 0) {
429 if (ustr) {
430 efree(ustr);
431 }
432 ubrk_close(bi);
433 RETURN_EMPTY_STRING();
434 }
435
436 sub_str_start_pos = 0;
437 ubrk_first(bi);
438 }
439
440 /* OK to convert here since if str_len were big, convert above would fail */
441 if (length >= (int32_t)str_len) {
442
443 /* no length supplied or length is too big, return the rest of the string */
444
445 status = U_ZERO_ERROR;
446 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
447
448 if (ustr) {
449 efree( ustr );
450 }
451 ubrk_close( bi );
452
453 if ( !u8_sub_str ) {
454 /* Set global error code. */
455 intl_error_set_code( NULL, status );
456
457 /* Set error messages. */
458 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
459
460 RETURN_FALSE;
461 }
462
463 /* return the allocated string, not a duplicate */
464 RETVAL_NEW_STR(u8_sub_str);
465 return;
466 }
467
468 if(length == 0) {
469 /* empty length - we've validated start, we can return "" now */
470 if (ustr) {
471 efree(ustr);
472 }
473 ubrk_close(bi);
474 RETURN_EMPTY_STRING();
475 }
476
477 /* find the end point of the string to return */
478
479 if ( length < 0 ) {
480 iter_func = ubrk_previous;
481 ubrk_last(bi);
482 iter_val = 1;
483 }
484 else {
485 iter_func = ubrk_next;
486 iter_val = -1;
487 }
488
489 sub_str_end_pos = 0;
490
491 while ( length ) {
492 sub_str_end_pos = iter_func(bi);
493
494 if ( UBRK_DONE == sub_str_end_pos ) {
495 break;
496 }
497
498 length += iter_val;
499 }
500
501 ubrk_close(bi);
502
503 if ( UBRK_DONE == sub_str_end_pos) {
504 if (length < 0) {
505 efree(ustr);
506 RETURN_EMPTY_STRING();
507 } else {
508 sub_str_end_pos = ustr_len;
509 }
510 }
511
512 if (sub_str_start_pos > sub_str_end_pos) {
513 efree(ustr);
514 RETURN_EMPTY_STRING();
515 }
516
517 status = U_ZERO_ERROR;
518 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
519
520 efree( ustr );
521
522 if ( !u8_sub_str ) {
523 /* Set global error code. */
524 intl_error_set_code( NULL, status );
525
526 /* Set error messages. */
527 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
528
529 RETURN_FALSE;
530 }
531
532 /* return the allocated string, not a duplicate */
533 RETVAL_NEW_STR(u8_sub_str);
534 }
535 /* }}} */
536
537 /* {{{ strstr_common_handler */
static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
{
	char *haystack, *needle;
	size_t haystack_len, needle_len;
	int32_t byte_pos, uchar_pos;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 3)
		Z_PARAM_STRING(haystack, haystack_len)
		Z_PARAM_STRING(needle, needle_len)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
	ZEND_PARSE_PARAMETERS_END();

	if (!f_ignore_case) {
		/* Case-sensitive shortcut: a plain byte search settles the "not found"
		 * case for any input. */
		const char *hit = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);

		if (!hit) {
			RETURN_FALSE;
		}

		/* A byte-level hit is only taken at face value when the haystack
		 * passes the ASCII check. */
		if (grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
			size_t hit_offset = hit - haystack;

			if (!part) {
				/* everything from the match onwards */
				RETURN_STRINGL(hit, haystack_len - hit_offset);
			}
			/* everything before the match */
			RETURN_STRINGL(haystack, hit_offset);
		}
	}

	/* General case: search in UTF-16. */
	byte_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */);

	if (byte_pos < 0) {
		RETURN_FALSE;
	}

	/* uchar_pos is the 'nth' Unicode character position of the needle;
	 * translate it into a byte offset within the UTF-8 haystack. */
	byte_pos = 0;
	U8_FWD_N(haystack, byte_pos, haystack_len, uchar_pos);

	if (!part) {
		RETURN_STRINGL(haystack + byte_pos, haystack_len - byte_pos);
	}
	RETURN_STRINGL(haystack, byte_pos);
}
595 /* }}} */
596
597 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)598 PHP_FUNCTION(grapheme_strstr)
599 {
600 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
601 }
602 /* }}} */
603
/* {{{ Finds first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stristr)605 PHP_FUNCTION(grapheme_stristr)
606 {
607 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
608 }
609 /* }}} */
610
611 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
612 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)613 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
614 {
615 int pos = 0;
616 int ret_pos = 0;
617 int break_pos, prev_break_pos;
618 int count = 0;
619
620 while ( 1 ) {
621 pos = ubrk_next(bi);
622
623 if ( UBRK_DONE == pos ) {
624 break;
625 }
626
627 for ( break_pos = ret_pos; break_pos < pos; ) {
628 count++;
629 prev_break_pos = break_pos;
630 U8_FWD_1(pstr, break_pos, str_len);
631
632 if ( prev_break_pos == break_pos ) {
633 /* something wrong - malformed utf8? */
634 csize = 0;
635 break;
636 }
637 }
638
639 /* if we are beyond our limit, then the loop is done */
640 if ( count > csize ) {
641 break;
642 }
643
644 ret_pos = break_pos;
645 }
646
647 return ret_pos;
648 }
649 /* }}} */
650
651 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
652 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)653 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
654 {
655 int pos = 0;
656 int ret_pos = 0;
657
658 while ( 1 ) {
659 pos = ubrk_next(bi);
660
661 if ( UBRK_DONE == pos ) {
662 break;
663 }
664
665 if ( pos > bsize ) {
666 break;
667 }
668
669 ret_pos = pos;
670 }
671
672 return ret_pos;
673 }
674 /* }}} */
675
676 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
677 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)678 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
679 {
680 int next_pos = 0;
681 int ret_pos = 0;
682
683 while ( size ) {
684 next_pos = ubrk_next(bi);
685
686 if ( UBRK_DONE == next_pos ) {
687 break;
688 }
689 ret_pos = next_pos;
690 size--;
691 }
692
693 return ret_pos;
694 }
695 /* }}} */
696
697 /* {{{ grapheme extract iter function pointer array */
698 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
699
700 static const grapheme_extract_iter grapheme_extract_iters[] = {
701 &grapheme_extract_count_iter,
702 &grapheme_extract_bytecount_iter,
703 &grapheme_extract_charcount_iter,
704 };
705 /* }}} */
706
707 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)708 PHP_FUNCTION(grapheme_extract)
709 {
710 char *str, *pstr;
711 UText ut = UTEXT_INITIALIZER;
712 size_t str_len;
713 zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
714 zend_long lstart = 0; /* starting position in str in bytes */
715 int32_t start = 0;
716 zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
717 UErrorCode status;
718 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
719 UBreakIterator* bi = NULL;
720 int ret_pos;
721 zval *next = NULL; /* return offset of next part of the string */
722
723 ZEND_PARSE_PARAMETERS_START(2, 5)
724 Z_PARAM_STRING(str, str_len)
725 Z_PARAM_LONG(size)
726 Z_PARAM_OPTIONAL
727 Z_PARAM_LONG(extract_type)
728 Z_PARAM_LONG(lstart)
729 Z_PARAM_ZVAL(next)
730 ZEND_PARSE_PARAMETERS_END();
731
732 if (lstart < 0) {
733 lstart += str_len;
734 }
735
736 if ( NULL != next ) {
737 if ( !Z_ISREF_P(next) ) {
738 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
739 "grapheme_extract: 'next' was not passed by reference", 0 );
740 RETURN_FALSE;
741 } else {
742 ZVAL_DEREF(next);
743 /* initialize next */
744 zval_ptr_dtor(next);
745 ZVAL_LONG(next, lstart);
746 }
747 }
748
749 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
750 zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
751 RETURN_THROWS();
752 }
753
754 if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
755 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
756 RETURN_FALSE;
757 }
758
759 if (size < 0) {
760 zend_argument_value_error(2, "must be greater than or equal to 0");
761 RETURN_THROWS();
762 }
763
764 if (size > INT32_MAX) {
765 zend_argument_value_error(2, "is too large");
766 RETURN_THROWS();
767 }
768
769 if (size == 0) {
770 RETURN_EMPTY_STRING();
771 }
772
773 /* we checked that it will fit: */
774 start = (int32_t) lstart;
775
776 pstr = str + start;
777
778 /* just in case pstr points in the middle of a character, move forward to the start of the next char */
779 if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
780 char *str_end = str + str_len;
781
782 while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
783 pstr++;
784 if ( pstr >= str_end ) {
785 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
786 "grapheme_extract: invalid input string", 0 );
787
788 RETURN_FALSE;
789 }
790 }
791 }
792
793 str_len -= (pstr - str);
794
795 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
796 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
797 */
798
799 if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
800 size_t nsize = MIN(size, str_len);
801 if ( NULL != next ) {
802 ZVAL_LONG(next, start+nsize);
803 }
804 RETURN_STRINGL(pstr, nsize);
805 }
806
807 status = U_ZERO_ERROR;
808 utext_openUTF8(&ut, pstr, str_len, &status);
809
810 if ( U_FAILURE( status ) ) {
811 /* Set global error code. */
812 intl_error_set_code( NULL, status );
813
814 /* Set error messages. */
815 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
816
817 RETURN_FALSE;
818 }
819
820 bi = NULL;
821 status = U_ZERO_ERROR;
822 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
823
824 ubrk_setUText(bi, &ut, &status);
825 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
826 can't back up. So, we will not do anything. */
827
828 /* now we need to find the end of the chunk the user wants us to return */
829 /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
830 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
831
832 utext_close(&ut);
833 ubrk_close(bi);
834
835 if ( NULL != next ) {
836 ZVAL_LONG(next, start+ret_pos);
837 }
838
839 RETURN_STRINGL(((char *)pstr), ret_pos);
840 }
841
PHP_FUNCTION(grapheme_str_split)842 PHP_FUNCTION(grapheme_str_split)
843 {
844 char *pstr, *end;
845 zend_string *str;
846 zend_long split_len = 1;
847
848 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
849 UErrorCode ustatus = U_ZERO_ERROR;
850 int32_t pos, current, i, end_len = 0;
851 UBreakIterator* bi;
852 UText *ut = NULL;
853
854 ZEND_PARSE_PARAMETERS_START(1, 2)
855 Z_PARAM_STR(str)
856 Z_PARAM_OPTIONAL
857 Z_PARAM_LONG(split_len)
858 ZEND_PARSE_PARAMETERS_END();
859
860 if (split_len <= 0 || split_len > UINT_MAX / 4) {
861 zend_argument_value_error(2, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
862 RETURN_THROWS();
863 }
864
865 if (ZSTR_LEN(str) == 0) {
866 RETURN_EMPTY_ARRAY();
867 }
868
869 pstr = ZSTR_VAL(str);
870 ut = utext_openUTF8(ut, pstr, ZSTR_LEN(str), &ustatus);
871
872 if ( U_FAILURE( ustatus ) ) {
873 /* Set global error code. */
874 intl_error_set_code( NULL, ustatus );
875
876 /* Set error messages. */
877 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
878
879 RETURN_FALSE;
880 }
881
882 bi = NULL;
883 ustatus = U_ZERO_ERROR;
884 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &ustatus );
885
886 if( U_FAILURE(ustatus) ) {
887 RETURN_FALSE;
888 }
889
890 ubrk_setUText(bi, ut, &ustatus);
891
892 pos = 0;
893 array_init(return_value);
894
895 for (end = pstr, i = 0, current = 0; pos != UBRK_DONE;) {
896 end_len = pos - current;
897 pos = ubrk_next(bi);
898
899 if (i == split_len - 1) {
900 if ( pos != UBRK_DONE ) {
901 add_next_index_stringl(return_value, pstr, pos - current);
902 end = pstr + pos - current;
903 i = 0;
904 }
905 pstr += pos - current;
906 current = pos;
907 } else {
908 i += 1;
909 }
910 }
911
912 if (i != 0 && end_len != 0) {
913 add_next_index_stringl(return_value, end, end_len);
914 }
915
916 utext_close(ut);
917 ubrk_close(bi);
918 }
919
920 /* }}} */
921