xref: /php-src/ext/intl/grapheme/grapheme_string.c (revision 44e8301c)
1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,      |
4    | that is bundled with this package in the file LICENSE, and is        |
5    | available through the world-wide-web at the following url:           |
6    | https://www.php.net/license/3_01.txt                                 |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to          |
9    | license@php.net so we can mail you a copy immediately.               |
10    +----------------------------------------------------------------------+
11    | Author: Ed Batutis <ed@batutis.com>								  |
12    +----------------------------------------------------------------------+
13  */
14 
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23 
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29 
30 /* }}} */
31 
32 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)33 PHP_FUNCTION(grapheme_strlen)
34 {
35 	char* string;
36 	size_t string_len;
37 	UChar* ustring = NULL;
38 	int ustring_len = 0;
39 	zend_long ret_len;
40 	UErrorCode status;
41 
42 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
43 		RETURN_THROWS();
44 	}
45 
46 	ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
47 
48 	if ( ret_len >= 0 )
49 		RETURN_LONG(string_len);
50 
51 	/* convert the string to UTF-16. */
52 	status = U_ZERO_ERROR;
53 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
54 
55 	if ( U_FAILURE( status ) ) {
56 		/* Set global error code. */
57 		intl_error_set_code( NULL, status );
58 
59 		/* Set error messages. */
60 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
61 		if (ustring) {
62 			efree( ustring );
63 		}
64 		RETURN_NULL();
65 	}
66 
67 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
68 
69 	if (ustring) {
70 		efree( ustring );
71 	}
72 
73 	if (ret_len >= 0) {
74 		RETVAL_LONG(ret_len);
75 	} else {
76 		RETVAL_FALSE;
77 	}
78 }
79 /* }}} */
80 
81 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)82 PHP_FUNCTION(grapheme_strpos)
83 {
84 	char *haystack, *needle;
85 	size_t haystack_len, needle_len;
86 	const char *found;
87 	zend_long loffset = 0;
88 	int32_t offset = 0;
89 	size_t noffset = 0;
90 	zend_long ret_pos;
91 
92 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
93 		RETURN_THROWS();
94 	}
95 
96 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
97 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
98 		RETURN_THROWS();
99 	}
100 
101 	/* we checked that it will fit: */
102 	offset = (int32_t) loffset;
103 	noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
104 
105 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
106 
107 	if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
108 		/* quick check to see if the string might be there
109 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
110 		*/
111 		found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
112 
113 		/* if it isn't there the we are done */
114 		if (found) {
115 			RETURN_LONG(found - haystack);
116 		}
117 		RETURN_FALSE;
118 	}
119 
120 	/* do utf16 part of the strpos */
121 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
122 
123 	if ( ret_pos >= 0 ) {
124 		RETURN_LONG(ret_pos);
125 	} else {
126 		RETURN_FALSE;
127 	}
128 }
129 /* }}} */
130 
131 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)132 PHP_FUNCTION(grapheme_stripos)
133 {
134 	char *haystack, *needle;
135 	size_t haystack_len, needle_len;
136 	const char *found;
137 	zend_long loffset = 0;
138 	int32_t offset = 0;
139 	zend_long ret_pos;
140 	int is_ascii;
141 
142 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
143 		RETURN_THROWS();
144 	}
145 
146 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
147 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
148 		RETURN_THROWS();
149 	}
150 
151 	/* we checked that it will fit: */
152 	offset = (int32_t) loffset;
153 
154 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
155 
156 	is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
157 
158 	if ( is_ascii ) {
159 		char *haystack_dup, *needle_dup;
160 		int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
161 		needle_dup = estrndup(needle, needle_len);
162 		zend_str_tolower(needle_dup, needle_len);
163 		haystack_dup = estrndup(haystack, haystack_len);
164 		zend_str_tolower(haystack_dup, haystack_len);
165 
166 		found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
167 
168 		efree(haystack_dup);
169 		efree(needle_dup);
170 
171 		if (found) {
172 			RETURN_LONG(found - haystack_dup);
173 		}
174 
175 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
176 		if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
177 			RETURN_FALSE;
178 		}
179 	}
180 
181 	/* do utf16 part of the strpos */
182 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
183 
184 	if ( ret_pos >= 0 ) {
185 		RETURN_LONG(ret_pos);
186 	} else {
187 		RETURN_FALSE;
188 	}
189 
190 }
191 /* }}} */
192 
193 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)194 PHP_FUNCTION(grapheme_strrpos)
195 {
196 	char *haystack, *needle;
197 	size_t haystack_len, needle_len;
198 	zend_long loffset = 0;
199 	int32_t offset = 0;
200 	zend_long ret_pos;
201 	int is_ascii;
202 
203 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
204 		RETURN_THROWS();
205 	}
206 
207 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
208 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
209 		RETURN_THROWS();
210 	}
211 
212 	/* we checked that it will fit: */
213 	offset = (int32_t) loffset;
214 
215 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
216 
217 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
218 
219 	if ( is_ascii ) {
220 
221 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
222 
223 		if ( ret_pos >= 0 ) {
224 			RETURN_LONG(ret_pos);
225 		}
226 
227 		/* if the needle was ascii too, we are done */
228 
229 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
230 			RETURN_FALSE;
231 		}
232 
233 		/* else we need to continue via utf16 */
234 	}
235 
236 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
237 
238 	if ( ret_pos >= 0 ) {
239 		RETURN_LONG(ret_pos);
240 	} else {
241 		RETURN_FALSE;
242 	}
243 
244 
245 }
246 /* }}} */
247 
248 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)249 PHP_FUNCTION(grapheme_strripos)
250 {
251 	char *haystack, *needle;
252 	size_t haystack_len, needle_len;
253 	zend_long loffset = 0;
254 	int32_t offset = 0;
255 	zend_long ret_pos;
256 	int is_ascii;
257 
258 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
259 		RETURN_THROWS();
260 	}
261 
262 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
263 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
264 		RETURN_THROWS();
265 	}
266 
267 	/* we checked that it will fit: */
268 	offset = (int32_t) loffset;
269 
270 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
271 
272 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
273 
274 	if ( is_ascii ) {
275 		char *needle_dup, *haystack_dup;
276 
277 		needle_dup = estrndup(needle, needle_len);
278 		zend_str_tolower(needle_dup, needle_len);
279 		haystack_dup = estrndup(haystack, haystack_len);
280 		zend_str_tolower(haystack_dup, haystack_len);
281 
282 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
283 
284 		efree(haystack_dup);
285 		efree(needle_dup);
286 
287 		if ( ret_pos >= 0 ) {
288 			RETURN_LONG(ret_pos);
289 		}
290 
291 		/* if the needle was ascii too, we are done */
292 
293 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
294 			RETURN_FALSE;
295 		}
296 
297 		/* else we need to continue via utf16 */
298 	}
299 
300 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
301 
302 	if ( ret_pos >= 0 ) {
303 		RETURN_LONG(ret_pos);
304 	} else {
305 		RETURN_FALSE;
306 	}
307 
308 
309 }
310 /* }}} */
311 
312 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)313 PHP_FUNCTION(grapheme_substr)
314 {
315 	char *str;
316 	zend_string *u8_sub_str;
317 	UChar *ustr;
318 	size_t str_len;
319 	int32_t ustr_len;
320 	zend_long lstart = 0, length = 0;
321 	int32_t start = 0;
322 	int iter_val;
323 	UErrorCode status;
324 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
325 	UBreakIterator* bi = NULL;
326 	int sub_str_start_pos, sub_str_end_pos;
327 	int32_t (*iter_func)(UBreakIterator *);
328 	bool no_length = 1;
329 
330 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
331 		RETURN_THROWS();
332 	}
333 
334 	if (lstart < INT32_MIN || lstart > INT32_MAX) {
335 		zend_argument_value_error(2, "is too large");
336 		RETURN_THROWS();
337 	}
338 
339 	start = (int32_t) lstart;
340 
341 	if (no_length) {
342 		length = str_len;
343 	}
344 
345 	if (length < INT32_MIN || length > INT32_MAX) {
346 		zend_argument_value_error(3, "is too large");
347 		RETURN_THROWS();
348 	}
349 
350 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
351 
352 	if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
353 		int32_t asub_str_len;
354 		char *sub_str;
355 		grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
356 
357 		if ( NULL == sub_str ) {
358 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
359 			RETURN_FALSE;
360 		}
361 
362 		RETURN_STRINGL(sub_str, asub_str_len);
363 	}
364 
365 	ustr = NULL;
366 	ustr_len = 0;
367 	status = U_ZERO_ERROR;
368 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
369 
370 	if ( U_FAILURE( status ) ) {
371 		/* Set global error code. */
372 		intl_error_set_code( NULL, status );
373 
374 		/* Set error messages. */
375 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
376 		if (ustr) {
377 			efree( ustr );
378 		}
379 		RETURN_FALSE;
380 	}
381 
382 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
383 
384 	if( U_FAILURE(status) ) {
385 		RETURN_FALSE;
386 	}
387 
388 	ubrk_setText(bi, ustr, ustr_len,	&status);
389 
390 	if ( start < 0 ) {
391 		iter_func = ubrk_previous;
392 		ubrk_last(bi);
393 		iter_val = 1;
394 	}
395 	else {
396 		iter_func = ubrk_next;
397 		iter_val = -1;
398 	}
399 
400 	sub_str_start_pos = 0;
401 
402 	while ( start ) {
403 		sub_str_start_pos = iter_func(bi);
404 
405 		if ( UBRK_DONE == sub_str_start_pos ) {
406 			break;
407 		}
408 
409 		start += iter_val;
410 	}
411 
412 	if (0 != start) {
413 		if (start > 0) {
414 			if (ustr) {
415 				efree(ustr);
416 			}
417 			ubrk_close(bi);
418 			RETURN_EMPTY_STRING();
419 		}
420 
421 		sub_str_start_pos = 0;
422 		ubrk_first(bi);
423 	}
424 
425 	/* OK to convert here since if str_len were big, convert above would fail */
426 	if (length >= (int32_t)str_len) {
427 
428 		/* no length supplied or length is too big, return the rest of the string */
429 
430 		status = U_ZERO_ERROR;
431 		u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
432 
433 		if (ustr) {
434 			efree( ustr );
435 		}
436 		ubrk_close( bi );
437 
438 		if ( !u8_sub_str ) {
439 			/* Set global error code. */
440 			intl_error_set_code( NULL, status );
441 
442 			/* Set error messages. */
443 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
444 
445 			RETURN_FALSE;
446 		}
447 
448 		/* return the allocated string, not a duplicate */
449 		RETVAL_NEW_STR(u8_sub_str);
450 		return;
451 	}
452 
453 	if(length == 0) {
454 		/* empty length - we've validated start, we can return "" now */
455 		if (ustr) {
456 			efree(ustr);
457 		}
458 		ubrk_close(bi);
459 		RETURN_EMPTY_STRING();
460 	}
461 
462 	/* find the end point of the string to return */
463 
464 	if ( length < 0 ) {
465 		iter_func = ubrk_previous;
466 		ubrk_last(bi);
467 		iter_val = 1;
468 	}
469 	else {
470 		iter_func = ubrk_next;
471 		iter_val = -1;
472 	}
473 
474 	sub_str_end_pos = 0;
475 
476 	while ( length ) {
477 		sub_str_end_pos = iter_func(bi);
478 
479 		if ( UBRK_DONE == sub_str_end_pos ) {
480 			break;
481 		}
482 
483 		length += iter_val;
484 	}
485 
486 	ubrk_close(bi);
487 
488 	if ( UBRK_DONE == sub_str_end_pos) {
489 		if (length < 0) {
490 			efree(ustr);
491 			RETURN_EMPTY_STRING();
492 		} else {
493 			sub_str_end_pos = ustr_len;
494 		}
495 	}
496 
497 	if (sub_str_start_pos > sub_str_end_pos) {
498 		efree(ustr);
499 		RETURN_EMPTY_STRING();
500 	}
501 
502 	status = U_ZERO_ERROR;
503 	u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
504 
505 	efree( ustr );
506 
507 	if ( !u8_sub_str ) {
508 		/* Set global error code. */
509 		intl_error_set_code( NULL, status );
510 
511 		/* Set error messages. */
512 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
513 
514 		RETURN_FALSE;
515 	}
516 
517 	 /* return the allocated string, not a duplicate */
518 	RETVAL_NEW_STR(u8_sub_str);
519 }
520 /* }}} */
521 
522 /* {{{	strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)523 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
524 {
525 	char *haystack, *needle;
526 	const char *found;
527 	size_t haystack_len, needle_len;
528 	int32_t ret_pos, uchar_pos;
529 	bool part = 0;
530 
531 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
532 		RETURN_THROWS();
533 	}
534 
535 	if ( !f_ignore_case ) {
536 
537 		/* ASCII optimization: quick check to see if the string might be there */
538 		found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
539 
540 		/* if it isn't there the we are done */
541 		if ( !found ) {
542 			RETURN_FALSE;
543 		}
544 
545 		/* if it is there, and if the haystack is ascii, we are all done */
546 		if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
547 			size_t found_offset = found - haystack;
548 
549 			if (part) {
550 				RETURN_STRINGL(haystack, found_offset);
551 			} else {
552 				RETURN_STRINGL(found, haystack_len - found_offset);
553 			}
554 		}
555 
556 	}
557 
558 	/* need to work in utf16 */
559 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
560 
561 	if ( ret_pos < 0 ) {
562 		RETURN_FALSE;
563 	}
564 
565 	/* uchar_pos is the 'nth' Unicode character position of the needle */
566 
567 	ret_pos = 0;
568 	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
569 
570 	if (part) {
571 		RETURN_STRINGL(haystack, ret_pos);
572 	} else {
573 		RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
574 	}
575 
576 }
577 /* }}} */
578 
579 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)580 PHP_FUNCTION(grapheme_strstr)
581 {
582 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
583 }
584 /* }}} */
585 
586 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)587 PHP_FUNCTION(grapheme_stristr)
588 {
589 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
590 }
591 /* }}} */
592 
593 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
594 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)595 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
596 {
597 	int pos = 0;
598 	int ret_pos = 0;
599 	int break_pos, prev_break_pos;
600 	int count = 0;
601 
602 	while ( 1 ) {
603 		pos = ubrk_next(bi);
604 
605 		if ( UBRK_DONE == pos ) {
606 			break;
607 		}
608 
609 		for ( break_pos = ret_pos; break_pos < pos; ) {
610 			count++;
611 			prev_break_pos = break_pos;
612 			U8_FWD_1(pstr, break_pos, str_len);
613 
614 			if ( prev_break_pos == break_pos ) {
615 				/* something wrong - malformed utf8? */
616 				csize = 0;
617 				break;
618 			}
619 		}
620 
621 		/* if we are beyond our limit, then the loop is done */
622 		if ( count > csize ) {
623 			break;
624 		}
625 
626 		ret_pos = break_pos;
627 	}
628 
629 	return ret_pos;
630 }
631 /* }}} */
632 
633 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
634 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)635 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
636 {
637 	int pos = 0;
638 	int ret_pos = 0;
639 
640 	while ( 1 ) {
641 		pos = ubrk_next(bi);
642 
643 		if ( UBRK_DONE == pos ) {
644 			break;
645 		}
646 
647 		if ( pos > bsize ) {
648 			break;
649 		}
650 
651 		ret_pos = pos;
652 	}
653 
654 	return ret_pos;
655 }
656 /* }}} */
657 
658 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
659 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)660 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
661 {
662 	int next_pos = 0;
663 	int ret_pos = 0;
664 
665 	while ( size ) {
666 		next_pos = ubrk_next(bi);
667 
668 		if ( UBRK_DONE == next_pos ) {
669 			break;
670 		}
671 		ret_pos = next_pos;
672 		size--;
673 	}
674 
675 	return ret_pos;
676 }
677 /* }}} */
678 
679 /* {{{ grapheme extract iter function pointer array */
680 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
681 
682 static const grapheme_extract_iter grapheme_extract_iters[] = {
683 	&grapheme_extract_count_iter,
684 	&grapheme_extract_bytecount_iter,
685 	&grapheme_extract_charcount_iter,
686 };
687 /* }}} */
688 
689 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)690 PHP_FUNCTION(grapheme_extract)
691 {
692 	char *str, *pstr;
693 	UText ut = UTEXT_INITIALIZER;
694 	size_t str_len;
695 	zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
696 	zend_long lstart = 0; /* starting position in str in bytes */
697 	int32_t start = 0;
698 	zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
699 	UErrorCode status;
700 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
701 	UBreakIterator* bi = NULL;
702 	int ret_pos;
703 	zval *next = NULL; /* return offset of next part of the string */
704 
705 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
706 		RETURN_THROWS();
707 	}
708 
709 	if (lstart < 0) {
710 		lstart += str_len;
711 	}
712 
713 	if ( NULL != next ) {
714 		if ( !Z_ISREF_P(next) ) {
715 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
716 				 "grapheme_extract: 'next' was not passed by reference", 0 );
717 			RETURN_FALSE;
718 		} else {
719 			ZVAL_DEREF(next);
720 			/* initialize next */
721 			zval_ptr_dtor(next);
722 			ZVAL_LONG(next, lstart);
723 		}
724 	}
725 
726 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
727 		zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
728 		RETURN_THROWS();
729 	}
730 
731 	if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
732 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
733 		RETURN_FALSE;
734 	}
735 
736 	if (size < 0) {
737 		zend_argument_value_error(2, "must be greater than or equal to 0");
738 		RETURN_THROWS();
739 	}
740 
741 	if (size > INT32_MAX) {
742 		zend_argument_value_error(2, "is too large");
743 		RETURN_THROWS();
744 	}
745 
746 	if (size == 0) {
747 		RETURN_EMPTY_STRING();
748 	}
749 
750 	/* we checked that it will fit: */
751 	start = (int32_t) lstart;
752 
753 	pstr = str + start;
754 
755 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
756 	if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
757 		char *str_end = str + str_len;
758 
759 		while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
760 			pstr++;
761 			if ( pstr >= str_end ) {
762 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
763 								"grapheme_extract: invalid input string", 0 );
764 
765 				RETURN_FALSE;
766 			}
767 		}
768 	}
769 
770 	str_len -= (pstr - str);
771 
772 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
773 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
774 	 */
775 
776 	if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
777 		size_t nsize = MIN(size, str_len);
778 		if ( NULL != next ) {
779 			ZVAL_LONG(next, start+nsize);
780 		}
781 		RETURN_STRINGL(pstr, nsize);
782 	}
783 
784 	status = U_ZERO_ERROR;
785 	utext_openUTF8(&ut, pstr, str_len, &status);
786 
787 	if ( U_FAILURE( status ) ) {
788 		/* Set global error code. */
789 		intl_error_set_code( NULL, status );
790 
791 		/* Set error messages. */
792 		intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
793 
794 		RETURN_FALSE;
795 	}
796 
797 	bi = NULL;
798 	status = U_ZERO_ERROR;
799 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
800 
801 	ubrk_setUText(bi, &ut, &status);
802 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
803 		can't back up. So, we will not do anything. */
804 
805 	/* now we need to find the end of the chunk the user wants us to return */
806 	/* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
807 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
808 
809 	utext_close(&ut);
810 	ubrk_close(bi);
811 
812 	if ( NULL != next ) {
813 		ZVAL_LONG(next, start+ret_pos);
814 	}
815 
816 	RETURN_STRINGL(((char *)pstr), ret_pos);
817 }
818 
PHP_FUNCTION(grapheme_str_split)819 PHP_FUNCTION(grapheme_str_split)
820 {
821 	char *pstr, *end;
822 	zend_string *str;
823 	zend_long split_len = 1;
824 
825 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
826 	UErrorCode ustatus = U_ZERO_ERROR;
827 	int32_t pos, current, i, end_len = 0;
828 	UBreakIterator* bi;
829 	UText *ut = NULL;
830 
831 	ZEND_PARSE_PARAMETERS_START(1, 2)
832 		Z_PARAM_STR(str)
833 		Z_PARAM_OPTIONAL
834 		Z_PARAM_LONG(split_len)
835 	ZEND_PARSE_PARAMETERS_END();
836 
837 	if (split_len <= 0 || split_len > UINT_MAX / 4) {
838 		zend_argument_value_error(2, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
839 		RETURN_THROWS();
840 	}
841 
842 	if (ZSTR_LEN(str) == 0) {
843 		RETURN_EMPTY_ARRAY();
844 	}
845 
846 	pstr = ZSTR_VAL(str);
847 	ut = utext_openUTF8(ut, pstr, ZSTR_LEN(str), &ustatus);
848 
849 	if ( U_FAILURE( ustatus ) ) {
850 		/* Set global error code. */
851 		intl_error_set_code( NULL, ustatus );
852 
853 		/* Set error messages. */
854 		intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
855 
856 		RETURN_FALSE;
857 	}
858 
859 	bi = NULL;
860 	ustatus = U_ZERO_ERROR;
861 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &ustatus );
862 
863 	if( U_FAILURE(ustatus) ) {
864 		RETURN_FALSE;
865 	}
866 
867 	ubrk_setUText(bi, ut, &ustatus);
868 
869 	pos = 0;
870 	array_init(return_value);
871 
872 	for (end = pstr, i = 0, current = 0; pos != UBRK_DONE;) {
873 		end_len = pos - current;
874 		pos = ubrk_next(bi);
875 
876 		if (i == split_len - 1) {
877 			if ( pos != UBRK_DONE ) {
878 				add_next_index_stringl(return_value, pstr, pos - current);
879 				end = pstr + pos - current;
880 				i = 0;
881 			}
882 			pstr += pos - current;
883 			current = pos;
884 		} else {
885 			i += 1;
886 		}
887 	}
888 
889 	if (i != 0 && end_len != 0) {
890 		add_next_index_stringl(return_value, end, end_len);
891 	}
892 
893 	utext_close(ut);
894 	ubrk_close(bi);
895 }
896 
897 /* }}} */
898