1 /*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | http://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Author: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29
30 #include "ext/standard/php_string.h"
31
32 /* }}} */
33
34 #define GRAPHEME_EXTRACT_TYPE_COUNT 0
35 #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
36 #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
37 #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
38 #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
39
40
41 /* {{{ grapheme_register_constants
42 * Register API constants
43 */
grapheme_register_constants(INIT_FUNC_ARGS)44 void grapheme_register_constants( INIT_FUNC_ARGS )
45 {
46 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
47 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
48 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
49 }
50 /* }}} */
51
52 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)53 PHP_FUNCTION(grapheme_strlen)
54 {
55 char* string;
56 size_t string_len;
57 UChar* ustring = NULL;
58 int ustring_len = 0;
59 zend_long ret_len;
60 UErrorCode status;
61
62 if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
63 RETURN_THROWS();
64 }
65
66 ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
67
68 if ( ret_len >= 0 )
69 RETURN_LONG(string_len);
70
71 /* convert the string to UTF-16. */
72 status = U_ZERO_ERROR;
73 intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
74
75 if ( U_FAILURE( status ) ) {
76 /* Set global error code. */
77 intl_error_set_code( NULL, status );
78
79 /* Set error messages. */
80 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
81 if (ustring) {
82 efree( ustring );
83 }
84 RETURN_NULL();
85 }
86
87 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
88
89 if (ustring) {
90 efree( ustring );
91 }
92
93 if (ret_len >= 0) {
94 RETVAL_LONG(ret_len);
95 } else {
96 RETVAL_FALSE;
97 }
98 }
99 /* }}} */
100
101 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)102 PHP_FUNCTION(grapheme_strpos)
103 {
104 char *haystack, *needle;
105 size_t haystack_len, needle_len;
106 const char *found;
107 zend_long loffset = 0;
108 int32_t offset = 0;
109 size_t noffset = 0;
110 zend_long ret_pos;
111
112 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
113 RETURN_THROWS();
114 }
115
116 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
117 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
118 RETURN_THROWS();
119 }
120
121 /* we checked that it will fit: */
122 offset = (int32_t) loffset;
123 noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
124
125 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
126
127 if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
128 /* quick check to see if the string might be there
129 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
130 */
131 found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
132
133 /* if it isn't there the we are done */
134 if (found) {
135 RETURN_LONG(found - haystack);
136 }
137 RETURN_FALSE;
138 }
139
140 /* do utf16 part of the strpos */
141 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
142
143 if ( ret_pos >= 0 ) {
144 RETURN_LONG(ret_pos);
145 } else {
146 RETURN_FALSE;
147 }
148 }
149 /* }}} */
150
151 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)152 PHP_FUNCTION(grapheme_stripos)
153 {
154 char *haystack, *needle;
155 size_t haystack_len, needle_len;
156 const char *found;
157 zend_long loffset = 0;
158 int32_t offset = 0;
159 zend_long ret_pos;
160 int is_ascii;
161
162 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
163 RETURN_THROWS();
164 }
165
166 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
167 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
168 RETURN_THROWS();
169 }
170
171 /* we checked that it will fit: */
172 offset = (int32_t) loffset;
173
174 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
175
176 is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
177
178 if ( is_ascii ) {
179 char *haystack_dup, *needle_dup;
180 int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
181 needle_dup = estrndup(needle, needle_len);
182 php_strtolower(needle_dup, needle_len);
183 haystack_dup = estrndup(haystack, haystack_len);
184 php_strtolower(haystack_dup, haystack_len);
185
186 found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
187
188 efree(haystack_dup);
189 efree(needle_dup);
190
191 if (found) {
192 RETURN_LONG(found - haystack_dup);
193 }
194
195 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
196 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
197 RETURN_FALSE;
198 }
199 }
200
201 /* do utf16 part of the strpos */
202 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
203
204 if ( ret_pos >= 0 ) {
205 RETURN_LONG(ret_pos);
206 } else {
207 RETURN_FALSE;
208 }
209
210 }
211 /* }}} */
212
213 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)214 PHP_FUNCTION(grapheme_strrpos)
215 {
216 char *haystack, *needle;
217 size_t haystack_len, needle_len;
218 zend_long loffset = 0;
219 int32_t offset = 0;
220 zend_long ret_pos;
221 int is_ascii;
222
223 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
224 RETURN_THROWS();
225 }
226
227 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
228 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
229 RETURN_THROWS();
230 }
231
232 /* we checked that it will fit: */
233 offset = (int32_t) loffset;
234
235 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
236
237 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
238
239 if ( is_ascii ) {
240
241 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
242
243 if ( ret_pos >= 0 ) {
244 RETURN_LONG(ret_pos);
245 }
246
247 /* if the needle was ascii too, we are done */
248
249 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
250 RETURN_FALSE;
251 }
252
253 /* else we need to continue via utf16 */
254 }
255
256 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
257
258 if ( ret_pos >= 0 ) {
259 RETURN_LONG(ret_pos);
260 } else {
261 RETURN_FALSE;
262 }
263
264
265 }
266 /* }}} */
267
268 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)269 PHP_FUNCTION(grapheme_strripos)
270 {
271 char *haystack, *needle;
272 size_t haystack_len, needle_len;
273 zend_long loffset = 0;
274 int32_t offset = 0;
275 zend_long ret_pos;
276 int is_ascii;
277
278 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
279 RETURN_THROWS();
280 }
281
282 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
283 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
284 RETURN_THROWS();
285 }
286
287 /* we checked that it will fit: */
288 offset = (int32_t) loffset;
289
290 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
291
292 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
293
294 if ( is_ascii ) {
295 char *needle_dup, *haystack_dup;
296
297 needle_dup = estrndup(needle, needle_len);
298 php_strtolower(needle_dup, needle_len);
299 haystack_dup = estrndup(haystack, haystack_len);
300 php_strtolower(haystack_dup, haystack_len);
301
302 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
303
304 efree(haystack_dup);
305 efree(needle_dup);
306
307 if ( ret_pos >= 0 ) {
308 RETURN_LONG(ret_pos);
309 }
310
311 /* if the needle was ascii too, we are done */
312
313 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
314 RETURN_FALSE;
315 }
316
317 /* else we need to continue via utf16 */
318 }
319
320 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
321
322 if ( ret_pos >= 0 ) {
323 RETURN_LONG(ret_pos);
324 } else {
325 RETURN_FALSE;
326 }
327
328
329 }
330 /* }}} */
331
332 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)333 PHP_FUNCTION(grapheme_substr)
334 {
335 char *str;
336 zend_string *u8_sub_str;
337 UChar *ustr;
338 size_t str_len;
339 int32_t ustr_len;
340 zend_long lstart = 0, length = 0;
341 int32_t start = 0;
342 int iter_val;
343 UErrorCode status;
344 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
345 UBreakIterator* bi = NULL;
346 int sub_str_start_pos, sub_str_end_pos;
347 int32_t (*iter_func)(UBreakIterator *);
348 zend_bool no_length = 1;
349
350 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
351 RETURN_THROWS();
352 }
353
354 if (lstart < INT32_MIN || lstart > INT32_MAX) {
355 zend_argument_value_error(2, "is too large");
356 RETURN_THROWS();
357 }
358
359 start = (int32_t) lstart;
360
361 if (no_length) {
362 length = str_len;
363 }
364
365 if (length < INT32_MIN || length > INT32_MAX) {
366 zend_argument_value_error(3, "is too large");
367 RETURN_THROWS();
368 }
369
370 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
371
372 if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
373 int32_t asub_str_len;
374 char *sub_str;
375 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
376
377 if ( NULL == sub_str ) {
378 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
379 RETURN_FALSE;
380 }
381
382 RETURN_STRINGL(sub_str, asub_str_len);
383 }
384
385 ustr = NULL;
386 ustr_len = 0;
387 status = U_ZERO_ERROR;
388 intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
389
390 if ( U_FAILURE( status ) ) {
391 /* Set global error code. */
392 intl_error_set_code( NULL, status );
393
394 /* Set error messages. */
395 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
396 if (ustr) {
397 efree( ustr );
398 }
399 RETURN_FALSE;
400 }
401
402 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
403
404 if( U_FAILURE(status) ) {
405 RETURN_FALSE;
406 }
407
408 ubrk_setText(bi, ustr, ustr_len, &status);
409
410 if ( start < 0 ) {
411 iter_func = ubrk_previous;
412 ubrk_last(bi);
413 iter_val = 1;
414 }
415 else {
416 iter_func = ubrk_next;
417 iter_val = -1;
418 }
419
420 sub_str_start_pos = 0;
421
422 while ( start ) {
423 sub_str_start_pos = iter_func(bi);
424
425 if ( UBRK_DONE == sub_str_start_pos ) {
426 break;
427 }
428
429 start += iter_val;
430 }
431
432 if (0 != start) {
433 if (start > 0) {
434 if (ustr) {
435 efree(ustr);
436 }
437 ubrk_close(bi);
438 RETURN_EMPTY_STRING();
439 }
440
441 sub_str_start_pos = 0;
442 ubrk_first(bi);
443 }
444
445 /* OK to convert here since if str_len were big, convert above would fail */
446 if (length >= (int32_t)str_len) {
447
448 /* no length supplied or length is too big, return the rest of the string */
449
450 status = U_ZERO_ERROR;
451 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
452
453 if (ustr) {
454 efree( ustr );
455 }
456 ubrk_close( bi );
457
458 if ( !u8_sub_str ) {
459 /* Set global error code. */
460 intl_error_set_code( NULL, status );
461
462 /* Set error messages. */
463 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
464
465 RETURN_FALSE;
466 }
467
468 /* return the allocated string, not a duplicate */
469 RETVAL_NEW_STR(u8_sub_str);
470 return;
471 }
472
473 if(length == 0) {
474 /* empty length - we've validated start, we can return "" now */
475 if (ustr) {
476 efree(ustr);
477 }
478 ubrk_close(bi);
479 RETURN_EMPTY_STRING();
480 }
481
482 /* find the end point of the string to return */
483
484 if ( length < 0 ) {
485 iter_func = ubrk_previous;
486 ubrk_last(bi);
487 iter_val = 1;
488 }
489 else {
490 iter_func = ubrk_next;
491 iter_val = -1;
492 }
493
494 sub_str_end_pos = 0;
495
496 while ( length ) {
497 sub_str_end_pos = iter_func(bi);
498
499 if ( UBRK_DONE == sub_str_end_pos ) {
500 break;
501 }
502
503 length += iter_val;
504 }
505
506 ubrk_close(bi);
507
508 if ( UBRK_DONE == sub_str_end_pos) {
509 if (length < 0) {
510 efree(ustr);
511 RETURN_EMPTY_STRING();
512 } else {
513 sub_str_end_pos = ustr_len;
514 }
515 }
516
517 if (sub_str_start_pos > sub_str_end_pos) {
518 efree(ustr);
519 RETURN_EMPTY_STRING();
520 }
521
522 status = U_ZERO_ERROR;
523 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
524
525 efree( ustr );
526
527 if ( !u8_sub_str ) {
528 /* Set global error code. */
529 intl_error_set_code( NULL, status );
530
531 /* Set error messages. */
532 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
533
534 RETURN_FALSE;
535 }
536
537 /* return the allocated string, not a duplicate */
538 RETVAL_NEW_STR(u8_sub_str);
539 }
540 /* }}} */
541
542 /* {{{ strstr_common_handler */
/*
 * Shared implementation of grapheme_strstr() and grapheme_stristr().
 *
 * PHP arguments: haystack, needle and an optional boolean (default false).
 * When the boolean is true the part of haystack before the match is returned,
 * otherwise the part starting at the match. Returns FALSE when the needle is
 * not found.
 *
 * f_ignore_case  non-zero skips the byte-wise shortcut and is forwarded to
 *                grapheme_strpos_utf16().
 */
static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
{
	char *haystack, *needle;
	size_t haystack_len, needle_len;
	int32_t utf16_result, uchar_pos;
	int32_t byte_pos;
	zend_bool part = 0;

	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
		RETURN_THROWS();
	}

	if (!f_ignore_case) {
		/* Shortcut: a byte-wise miss means there is no match at all. */
		const char *match = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);

		if (match == NULL) {
			RETURN_FALSE;
		}

		/* A byte-wise hit is only final when the ASCII check accepts the haystack. */
		if (grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
			size_t prefix_len = match - haystack;

			if (!part) {
				RETURN_STRINGL(match, haystack_len - prefix_len);
			}
			RETURN_STRINGL(haystack, prefix_len);
		}
	}

	/* General case: search in UTF-16, first occurrence, from offset 0. */
	utf16_result = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );

	if (utf16_result < 0) {
		RETURN_FALSE;
	}

	/* uchar_pos is the 'nth' Unicode character position of the needle;
	 * advance that many characters through the UTF-8 haystack to obtain the
	 * corresponding byte offset. */
	byte_pos = 0;
	U8_FWD_N(haystack, byte_pos, haystack_len, uchar_pos);

	if (!part) {
		RETURN_STRINGL(haystack + byte_pos, haystack_len - byte_pos);
	}
	RETURN_STRINGL(haystack, byte_pos);
}
597 /* }}} */
598
599 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)600 PHP_FUNCTION(grapheme_strstr)
601 {
602 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
603 }
604 /* }}} */
605
/* {{{ Finds first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stristr)607 PHP_FUNCTION(grapheme_stristr)
608 {
609 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
610 }
611 /* }}} */
612
613 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
614 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)615 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
616 {
617 int pos = 0;
618 int ret_pos = 0;
619 int break_pos, prev_break_pos;
620 int count = 0;
621
622 while ( 1 ) {
623 pos = ubrk_next(bi);
624
625 if ( UBRK_DONE == pos ) {
626 break;
627 }
628
629 for ( break_pos = ret_pos; break_pos < pos; ) {
630 count++;
631 prev_break_pos = break_pos;
632 U8_FWD_1(pstr, break_pos, str_len);
633
634 if ( prev_break_pos == break_pos ) {
635 /* something wrong - malformed utf8? */
636 csize = 0;
637 break;
638 }
639 }
640
641 /* if we are beyond our limit, then the loop is done */
642 if ( count > csize ) {
643 break;
644 }
645
646 ret_pos = break_pos;
647 }
648
649 return ret_pos;
650 }
651 /* }}} */
652
653 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
654 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)655 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
656 {
657 int pos = 0;
658 int ret_pos = 0;
659
660 while ( 1 ) {
661 pos = ubrk_next(bi);
662
663 if ( UBRK_DONE == pos ) {
664 break;
665 }
666
667 if ( pos > bsize ) {
668 break;
669 }
670
671 ret_pos = pos;
672 }
673
674 return ret_pos;
675 }
676 /* }}} */
677
678 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
679 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)680 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
681 {
682 int next_pos = 0;
683 int ret_pos = 0;
684
685 while ( size ) {
686 next_pos = ubrk_next(bi);
687
688 if ( UBRK_DONE == next_pos ) {
689 break;
690 }
691 ret_pos = next_pos;
692 size--;
693 }
694
695 return ret_pos;
696 }
697 /* }}} */
698
699 /* {{{ grapheme extract iter function pointer array */
700 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
701
702 static grapheme_extract_iter grapheme_extract_iters[] = {
703 &grapheme_extract_count_iter,
704 &grapheme_extract_bytecount_iter,
705 &grapheme_extract_charcount_iter,
706 };
707 /* }}} */
708
709 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)710 PHP_FUNCTION(grapheme_extract)
711 {
712 char *str, *pstr;
713 UText ut = UTEXT_INITIALIZER;
714 size_t str_len;
715 zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
716 zend_long lstart = 0; /* starting position in str in bytes */
717 int32_t start = 0;
718 zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
719 UErrorCode status;
720 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
721 UBreakIterator* bi = NULL;
722 int ret_pos;
723 zval *next = NULL; /* return offset of next part of the string */
724
725 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
726 RETURN_THROWS();
727 }
728
729 if (lstart < 0) {
730 lstart += str_len;
731 }
732
733 if ( NULL != next ) {
734 if ( !Z_ISREF_P(next) ) {
735 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
736 "grapheme_extract: 'next' was not passed by reference", 0 );
737 RETURN_FALSE;
738 } else {
739 ZVAL_DEREF(next);
740 /* initialize next */
741 zval_ptr_dtor(next);
742 ZVAL_LONG(next, lstart);
743 }
744 }
745
746 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
747 zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
748 RETURN_THROWS();
749 }
750
751 if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
752 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
753 RETURN_FALSE;
754 }
755
756 if (size < 0) {
757 zend_argument_value_error(2, "must be greater than or equal to 0");
758 RETURN_THROWS();
759 }
760
761 if (size > INT32_MAX) {
762 zend_argument_value_error(2, "is too large");
763 RETURN_THROWS();
764 }
765
766 if (size == 0) {
767 RETURN_EMPTY_STRING();
768 }
769
770 /* we checked that it will fit: */
771 start = (int32_t) lstart;
772
773 pstr = str + start;
774
775 /* just in case pstr points in the middle of a character, move forward to the start of the next char */
776 if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
777 char *str_end = str + str_len;
778
779 while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
780 pstr++;
781 if ( pstr >= str_end ) {
782 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
783 "grapheme_extract: invalid input string", 0 );
784
785 RETURN_FALSE;
786 }
787 }
788 }
789
790 str_len -= (pstr - str);
791
792 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
793 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
794 */
795
796 if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
797 size_t nsize = MIN(size, str_len);
798 if ( NULL != next ) {
799 ZVAL_LONG(next, start+nsize);
800 }
801 RETURN_STRINGL(pstr, nsize);
802 }
803
804 status = U_ZERO_ERROR;
805 utext_openUTF8(&ut, pstr, str_len, &status);
806
807 if ( U_FAILURE( status ) ) {
808 /* Set global error code. */
809 intl_error_set_code( NULL, status );
810
811 /* Set error messages. */
812 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
813
814 RETURN_FALSE;
815 }
816
817 bi = NULL;
818 status = U_ZERO_ERROR;
819 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
820
821 ubrk_setUText(bi, &ut, &status);
822 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
823 can't back up. So, we will not do anything. */
824
825 /* now we need to find the end of the chunk the user wants us to return */
826 /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
827 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
828
829 utext_close(&ut);
830 ubrk_close(bi);
831
832 if ( NULL != next ) {
833 ZVAL_LONG(next, start+ret_pos);
834 }
835
836 RETURN_STRINGL(((char *)pstr), ret_pos);
837 }
838
839 /* }}} */
840