xref: /PHP-8.0/ext/intl/grapheme/grapheme_string.c (revision 7af24eae)
1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,	  |
4    | that is bundled with this package in the file LICENSE, and is		  |
5    | available through the world-wide-web at the following url:			  |
6    | http://www.php.net/license/3_01.txt								  |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to		  |
9    | license@php.net so we can mail you a copy immediately.				  |
10    +----------------------------------------------------------------------+
11    | Author: Ed Batutis <ed@batutis.com>								  |
12    +----------------------------------------------------------------------+
13  */
14 
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23 
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29 
30 #include "ext/standard/php_string.h"
31 
32 /* }}} */
33 
/* Extraction modes accepted by grapheme_extract(). The numeric values are
 * also used directly as indexes into the grapheme_extract_iters[] table, so
 * the two must stay in the same order. */
#define GRAPHEME_EXTRACT_TYPE_COUNT		0
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES	1
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS	2
/* Inclusive bounds used to range-check a caller-supplied extract type. */
#define GRAPHEME_EXTRACT_TYPE_MIN	GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX	GRAPHEME_EXTRACT_TYPE_MAXCHARS
39 
40 
41 /* {{{ grapheme_register_constants
42  * Register API constants
43  */
grapheme_register_constants(INIT_FUNC_ARGS)44 void grapheme_register_constants( INIT_FUNC_ARGS )
45 {
46 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
47 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
48 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
49 }
50 /* }}} */
51 
52 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)53 PHP_FUNCTION(grapheme_strlen)
54 {
55 	char* string;
56 	size_t string_len;
57 	UChar* ustring = NULL;
58 	int ustring_len = 0;
59 	zend_long ret_len;
60 	UErrorCode status;
61 
62 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
63 		RETURN_THROWS();
64 	}
65 
66 	ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
67 
68 	if ( ret_len >= 0 )
69 		RETURN_LONG(string_len);
70 
71 	/* convert the string to UTF-16. */
72 	status = U_ZERO_ERROR;
73 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
74 
75 	if ( U_FAILURE( status ) ) {
76 		/* Set global error code. */
77 		intl_error_set_code( NULL, status );
78 
79 		/* Set error messages. */
80 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
81 		if (ustring) {
82 			efree( ustring );
83 		}
84 		RETURN_NULL();
85 	}
86 
87 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
88 
89 	if (ustring) {
90 		efree( ustring );
91 	}
92 
93 	if (ret_len >= 0) {
94 		RETVAL_LONG(ret_len);
95 	} else {
96 		RETVAL_FALSE;
97 	}
98 }
99 /* }}} */
100 
101 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)102 PHP_FUNCTION(grapheme_strpos)
103 {
104 	char *haystack, *needle;
105 	size_t haystack_len, needle_len;
106 	const char *found;
107 	zend_long loffset = 0;
108 	int32_t offset = 0;
109 	size_t noffset = 0;
110 	zend_long ret_pos;
111 
112 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
113 		RETURN_THROWS();
114 	}
115 
116 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
117 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
118 		RETURN_THROWS();
119 	}
120 
121 	/* we checked that it will fit: */
122 	offset = (int32_t) loffset;
123 	noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
124 
125 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
126 
127 	if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
128 		/* quick check to see if the string might be there
129 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
130 		*/
131 		found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
132 
133 		/* if it isn't there the we are done */
134 		if (found) {
135 			RETURN_LONG(found - haystack);
136 		}
137 		RETURN_FALSE;
138 	}
139 
140 	/* do utf16 part of the strpos */
141 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
142 
143 	if ( ret_pos >= 0 ) {
144 		RETURN_LONG(ret_pos);
145 	} else {
146 		RETURN_FALSE;
147 	}
148 }
149 /* }}} */
150 
151 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)152 PHP_FUNCTION(grapheme_stripos)
153 {
154 	char *haystack, *needle;
155 	size_t haystack_len, needle_len;
156 	const char *found;
157 	zend_long loffset = 0;
158 	int32_t offset = 0;
159 	zend_long ret_pos;
160 	int is_ascii;
161 
162 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
163 		RETURN_THROWS();
164 	}
165 
166 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
167 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
168 		RETURN_THROWS();
169 	}
170 
171 	/* we checked that it will fit: */
172 	offset = (int32_t) loffset;
173 
174 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
175 
176 	is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
177 
178 	if ( is_ascii ) {
179 		char *haystack_dup, *needle_dup;
180 		int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
181 		needle_dup = estrndup(needle, needle_len);
182 		php_strtolower(needle_dup, needle_len);
183 		haystack_dup = estrndup(haystack, haystack_len);
184 		php_strtolower(haystack_dup, haystack_len);
185 
186 		found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
187 
188 		efree(haystack_dup);
189 		efree(needle_dup);
190 
191 		if (found) {
192 			RETURN_LONG(found - haystack_dup);
193 		}
194 
195 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
196 		if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
197 			RETURN_FALSE;
198 		}
199 	}
200 
201 	/* do utf16 part of the strpos */
202 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
203 
204 	if ( ret_pos >= 0 ) {
205 		RETURN_LONG(ret_pos);
206 	} else {
207 		RETURN_FALSE;
208 	}
209 
210 }
211 /* }}} */
212 
213 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)214 PHP_FUNCTION(grapheme_strrpos)
215 {
216 	char *haystack, *needle;
217 	size_t haystack_len, needle_len;
218 	zend_long loffset = 0;
219 	int32_t offset = 0;
220 	zend_long ret_pos;
221 	int is_ascii;
222 
223 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
224 		RETURN_THROWS();
225 	}
226 
227 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
228 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
229 		RETURN_THROWS();
230 	}
231 
232 	/* we checked that it will fit: */
233 	offset = (int32_t) loffset;
234 
235 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
236 
237 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
238 
239 	if ( is_ascii ) {
240 
241 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
242 
243 		if ( ret_pos >= 0 ) {
244 			RETURN_LONG(ret_pos);
245 		}
246 
247 		/* if the needle was ascii too, we are done */
248 
249 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
250 			RETURN_FALSE;
251 		}
252 
253 		/* else we need to continue via utf16 */
254 	}
255 
256 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
257 
258 	if ( ret_pos >= 0 ) {
259 		RETURN_LONG(ret_pos);
260 	} else {
261 		RETURN_FALSE;
262 	}
263 
264 
265 }
266 /* }}} */
267 
268 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)269 PHP_FUNCTION(grapheme_strripos)
270 {
271 	char *haystack, *needle;
272 	size_t haystack_len, needle_len;
273 	zend_long loffset = 0;
274 	int32_t offset = 0;
275 	zend_long ret_pos;
276 	int is_ascii;
277 
278 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
279 		RETURN_THROWS();
280 	}
281 
282 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
283 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
284 		RETURN_THROWS();
285 	}
286 
287 	/* we checked that it will fit: */
288 	offset = (int32_t) loffset;
289 
290 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
291 
292 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
293 
294 	if ( is_ascii ) {
295 		char *needle_dup, *haystack_dup;
296 
297 		needle_dup = estrndup(needle, needle_len);
298 		php_strtolower(needle_dup, needle_len);
299 		haystack_dup = estrndup(haystack, haystack_len);
300 		php_strtolower(haystack_dup, haystack_len);
301 
302 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
303 
304 		efree(haystack_dup);
305 		efree(needle_dup);
306 
307 		if ( ret_pos >= 0 ) {
308 			RETURN_LONG(ret_pos);
309 		}
310 
311 		/* if the needle was ascii too, we are done */
312 
313 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
314 			RETURN_FALSE;
315 		}
316 
317 		/* else we need to continue via utf16 */
318 	}
319 
320 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
321 
322 	if ( ret_pos >= 0 ) {
323 		RETURN_LONG(ret_pos);
324 	} else {
325 		RETURN_FALSE;
326 	}
327 
328 
329 }
330 /* }}} */
331 
332 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)333 PHP_FUNCTION(grapheme_substr)
334 {
335 	char *str;
336 	zend_string *u8_sub_str;
337 	UChar *ustr;
338 	size_t str_len;
339 	int32_t ustr_len;
340 	zend_long lstart = 0, length = 0;
341 	int32_t start = 0;
342 	int iter_val;
343 	UErrorCode status;
344 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
345 	UBreakIterator* bi = NULL;
346 	int sub_str_start_pos, sub_str_end_pos;
347 	int32_t (*iter_func)(UBreakIterator *);
348 	zend_bool no_length = 1;
349 
350 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
351 		RETURN_THROWS();
352 	}
353 
354 	if (lstart < INT32_MIN || lstart > INT32_MAX) {
355 		zend_argument_value_error(2, "is too large");
356 		RETURN_THROWS();
357 	}
358 
359 	start = (int32_t) lstart;
360 
361 	if (no_length) {
362 		length = str_len;
363 	}
364 
365 	if (length < INT32_MIN || length > INT32_MAX) {
366 		zend_argument_value_error(3, "is too large");
367 		RETURN_THROWS();
368 	}
369 
370 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
371 
372 	if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
373 		int32_t asub_str_len;
374 		char *sub_str;
375 		grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
376 
377 		if ( NULL == sub_str ) {
378 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
379 			RETURN_FALSE;
380 		}
381 
382 		RETURN_STRINGL(sub_str, asub_str_len);
383 	}
384 
385 	ustr = NULL;
386 	ustr_len = 0;
387 	status = U_ZERO_ERROR;
388 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
389 
390 	if ( U_FAILURE( status ) ) {
391 		/* Set global error code. */
392 		intl_error_set_code( NULL, status );
393 
394 		/* Set error messages. */
395 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
396 		if (ustr) {
397 			efree( ustr );
398 		}
399 		RETURN_FALSE;
400 	}
401 
402 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
403 
404 	if( U_FAILURE(status) ) {
405 		RETURN_FALSE;
406 	}
407 
408 	ubrk_setText(bi, ustr, ustr_len,	&status);
409 
410 	if ( start < 0 ) {
411 		iter_func = ubrk_previous;
412 		ubrk_last(bi);
413 		iter_val = 1;
414 	}
415 	else {
416 		iter_func = ubrk_next;
417 		iter_val = -1;
418 	}
419 
420 	sub_str_start_pos = 0;
421 
422 	while ( start ) {
423 		sub_str_start_pos = iter_func(bi);
424 
425 		if ( UBRK_DONE == sub_str_start_pos ) {
426 			break;
427 		}
428 
429 		start += iter_val;
430 	}
431 
432 	if (0 != start) {
433 		if (start > 0) {
434 			if (ustr) {
435 				efree(ustr);
436 			}
437 			ubrk_close(bi);
438 			RETURN_EMPTY_STRING();
439 		}
440 
441 		sub_str_start_pos = 0;
442 		ubrk_first(bi);
443 	}
444 
445 	/* OK to convert here since if str_len were big, convert above would fail */
446 	if (length >= (int32_t)str_len) {
447 
448 		/* no length supplied or length is too big, return the rest of the string */
449 
450 		status = U_ZERO_ERROR;
451 		u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
452 
453 		if (ustr) {
454 			efree( ustr );
455 		}
456 		ubrk_close( bi );
457 
458 		if ( !u8_sub_str ) {
459 			/* Set global error code. */
460 			intl_error_set_code( NULL, status );
461 
462 			/* Set error messages. */
463 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
464 
465 			RETURN_FALSE;
466 		}
467 
468 		/* return the allocated string, not a duplicate */
469 		RETVAL_NEW_STR(u8_sub_str);
470 		return;
471 	}
472 
473 	if(length == 0) {
474 		/* empty length - we've validated start, we can return "" now */
475 		if (ustr) {
476 			efree(ustr);
477 		}
478 		ubrk_close(bi);
479 		RETURN_EMPTY_STRING();
480 	}
481 
482 	/* find the end point of the string to return */
483 
484 	if ( length < 0 ) {
485 		iter_func = ubrk_previous;
486 		ubrk_last(bi);
487 		iter_val = 1;
488 	}
489 	else {
490 		iter_func = ubrk_next;
491 		iter_val = -1;
492 	}
493 
494 	sub_str_end_pos = 0;
495 
496 	while ( length ) {
497 		sub_str_end_pos = iter_func(bi);
498 
499 		if ( UBRK_DONE == sub_str_end_pos ) {
500 			break;
501 		}
502 
503 		length += iter_val;
504 	}
505 
506 	ubrk_close(bi);
507 
508 	if ( UBRK_DONE == sub_str_end_pos) {
509 		if (length < 0) {
510 			efree(ustr);
511 			RETURN_EMPTY_STRING();
512 		} else {
513 			sub_str_end_pos = ustr_len;
514 		}
515 	}
516 
517 	if (sub_str_start_pos > sub_str_end_pos) {
518 		efree(ustr);
519 		RETURN_EMPTY_STRING();
520 	}
521 
522 	status = U_ZERO_ERROR;
523 	u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
524 
525 	efree( ustr );
526 
527 	if ( !u8_sub_str ) {
528 		/* Set global error code. */
529 		intl_error_set_code( NULL, status );
530 
531 		/* Set error messages. */
532 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
533 
534 		RETURN_FALSE;
535 	}
536 
537 	 /* return the allocated string, not a duplicate */
538 	RETVAL_NEW_STR(u8_sub_str);
539 }
540 /* }}} */
541 
542 /* {{{	strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)543 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
544 {
545 	char *haystack, *needle;
546 	const char *found;
547 	size_t haystack_len, needle_len;
548 	int32_t ret_pos, uchar_pos;
549 	zend_bool part = 0;
550 
551 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
552 		RETURN_THROWS();
553 	}
554 
555 	if ( !f_ignore_case ) {
556 
557 		/* ASCII optimization: quick check to see if the string might be there */
558 		found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
559 
560 		/* if it isn't there the we are done */
561 		if ( !found ) {
562 			RETURN_FALSE;
563 		}
564 
565 		/* if it is there, and if the haystack is ascii, we are all done */
566 		if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
567 			size_t found_offset = found - haystack;
568 
569 			if (part) {
570 				RETURN_STRINGL(haystack, found_offset);
571 			} else {
572 				RETURN_STRINGL(found, haystack_len - found_offset);
573 			}
574 		}
575 
576 	}
577 
578 	/* need to work in utf16 */
579 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
580 
581 	if ( ret_pos < 0 ) {
582 		RETURN_FALSE;
583 	}
584 
585 	/* uchar_pos is the 'nth' Unicode character position of the needle */
586 
587 	ret_pos = 0;
588 	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
589 
590 	if (part) {
591 		RETURN_STRINGL(haystack, ret_pos);
592 	} else {
593 		RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
594 	}
595 
596 }
597 /* }}} */
598 
599 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)600 PHP_FUNCTION(grapheme_strstr)
601 {
602 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
603 }
604 /* }}} */
605 
606 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)607 PHP_FUNCTION(grapheme_stristr)
608 {
609 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
610 }
611 /* }}} */
612 
613 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
614 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)615 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
616 {
617 	int pos = 0;
618 	int ret_pos = 0;
619 	int break_pos, prev_break_pos;
620 	int count = 0;
621 
622 	while ( 1 ) {
623 		pos = ubrk_next(bi);
624 
625 		if ( UBRK_DONE == pos ) {
626 			break;
627 		}
628 
629 		for ( break_pos = ret_pos; break_pos < pos; ) {
630 			count++;
631 			prev_break_pos = break_pos;
632 			U8_FWD_1(pstr, break_pos, str_len);
633 
634 			if ( prev_break_pos == break_pos ) {
635 				/* something wrong - malformed utf8? */
636 				csize = 0;
637 				break;
638 			}
639 		}
640 
641 		/* if we are beyond our limit, then the loop is done */
642 		if ( count > csize ) {
643 			break;
644 		}
645 
646 		ret_pos = break_pos;
647 	}
648 
649 	return ret_pos;
650 }
651 /* }}} */
652 
653 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
654 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)655 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
656 {
657 	int pos = 0;
658 	int ret_pos = 0;
659 
660 	while ( 1 ) {
661 		pos = ubrk_next(bi);
662 
663 		if ( UBRK_DONE == pos ) {
664 			break;
665 		}
666 
667 		if ( pos > bsize ) {
668 			break;
669 		}
670 
671 		ret_pos = pos;
672 	}
673 
674 	return ret_pos;
675 }
676 /* }}} */
677 
678 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
679 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)680 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
681 {
682 	int next_pos = 0;
683 	int ret_pos = 0;
684 
685 	while ( size ) {
686 		next_pos = ubrk_next(bi);
687 
688 		if ( UBRK_DONE == next_pos ) {
689 			break;
690 		}
691 		ret_pos = next_pos;
692 		size--;
693 	}
694 
695 	return ret_pos;
696 }
697 /* }}} */
698 
699 /* {{{ grapheme extract iter function pointer array */
700 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
701 
702 static grapheme_extract_iter grapheme_extract_iters[] = {
703 	&grapheme_extract_count_iter,
704 	&grapheme_extract_bytecount_iter,
705 	&grapheme_extract_charcount_iter,
706 };
707 /* }}} */
708 
709 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)710 PHP_FUNCTION(grapheme_extract)
711 {
712 	char *str, *pstr;
713 	UText ut = UTEXT_INITIALIZER;
714 	size_t str_len;
715 	zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
716 	zend_long lstart = 0; /* starting position in str in bytes */
717 	int32_t start = 0;
718 	zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
719 	UErrorCode status;
720 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
721 	UBreakIterator* bi = NULL;
722 	int ret_pos;
723 	zval *next = NULL; /* return offset of next part of the string */
724 
725 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
726 		RETURN_THROWS();
727 	}
728 
729 	if (lstart < 0) {
730 		lstart += str_len;
731 	}
732 
733 	if ( NULL != next ) {
734 		if ( !Z_ISREF_P(next) ) {
735 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
736 				 "grapheme_extract: 'next' was not passed by reference", 0 );
737 			RETURN_FALSE;
738 		} else {
739 			ZVAL_DEREF(next);
740 			/* initialize next */
741 			zval_ptr_dtor(next);
742             ZVAL_LONG(next, lstart);
743 		}
744 	}
745 
746 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
747 		zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
748 		RETURN_THROWS();
749 	}
750 
751 	if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
752 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
753 		RETURN_FALSE;
754 	}
755 
756 	if (size < 0) {
757 		zend_argument_value_error(2, "must be greater than or equal to 0");
758 		RETURN_THROWS();
759 	}
760 
761 	if (size > INT32_MAX) {
762 		zend_argument_value_error(2, "is too large");
763 		RETURN_THROWS();
764 	}
765 
766 	if (size == 0) {
767 		RETURN_EMPTY_STRING();
768 	}
769 
770 	/* we checked that it will fit: */
771 	start = (int32_t) lstart;
772 
773 	pstr = str + start;
774 
775 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
776 	if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
777 		char *str_end = str + str_len;
778 
779 		while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
780 			pstr++;
781 			if ( pstr >= str_end ) {
782 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
783 								"grapheme_extract: invalid input string", 0 );
784 
785 				RETURN_FALSE;
786 			}
787 		}
788 	}
789 
790 	str_len -= (pstr - str);
791 
792 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
793 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
794 	 */
795 
796 	if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
797         size_t nsize = MIN(size, str_len);
798 		if ( NULL != next ) {
799 			ZVAL_LONG(next, start+nsize);
800 		}
801 		RETURN_STRINGL(pstr, nsize);
802 	}
803 
804 	status = U_ZERO_ERROR;
805 	utext_openUTF8(&ut, pstr, str_len, &status);
806 
807 	if ( U_FAILURE( status ) ) {
808 		/* Set global error code. */
809 		intl_error_set_code( NULL, status );
810 
811 		/* Set error messages. */
812 		intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
813 
814 		RETURN_FALSE;
815 	}
816 
817 	bi = NULL;
818 	status = U_ZERO_ERROR;
819 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
820 
821 	ubrk_setUText(bi, &ut, &status);
822 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
823 		can't back up. So, we will not do anything. */
824 
825 	/* now we need to find the end of the chunk the user wants us to return */
826 	/* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
827 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
828 
829 	utext_close(&ut);
830 	ubrk_close(bi);
831 
832 	if ( NULL != next ) {
833 		ZVAL_LONG(next, start+ret_pos);
834 	}
835 
836 	RETURN_STRINGL(((char *)pstr), ret_pos);
837 }
838 
839 /* }}} */
840