xref: /PHP-8.1/ext/intl/grapheme/grapheme_string.c (revision c96be7b8)
1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,      |
4    | that is bundled with this package in the file LICENSE, and is        |
5    | available through the world-wide-web at the following url:           |
6    | https://www.php.net/license/3_01.txt                                 |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to          |
9    | license@php.net so we can mail you a copy immediately.               |
10    +----------------------------------------------------------------------+
11    | Author: Ed Batutis <ed@batutis.com>								  |
12    +----------------------------------------------------------------------+
13  */
14 
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23 
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29 
30 /* }}} */
31 
/*
 * Extraction modes understood by grapheme_extract(). The numeric values are
 * also used as indexes into grapheme_extract_iters[], so they must stay
 * contiguous and start at zero.
 */
#define GRAPHEME_EXTRACT_TYPE_COUNT     0  /* "size" is a number of grapheme clusters */
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES  1  /* "size" is a maximum number of bytes */
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS  2  /* "size" is a maximum number of characters */

/* Inclusive bounds used to validate a caller-supplied extraction mode. */
#define GRAPHEME_EXTRACT_TYPE_MIN  GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX  GRAPHEME_EXTRACT_TYPE_MAXCHARS
37 
38 
39 /* {{{ grapheme_register_constants
40  * Register API constants
41  */
grapheme_register_constants(INIT_FUNC_ARGS)42 void grapheme_register_constants( INIT_FUNC_ARGS )
43 {
44 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
45 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
46 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
47 }
48 /* }}} */
49 
50 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)51 PHP_FUNCTION(grapheme_strlen)
52 {
53 	char* string;
54 	size_t string_len;
55 	UChar* ustring = NULL;
56 	int ustring_len = 0;
57 	zend_long ret_len;
58 	UErrorCode status;
59 
60 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
61 		RETURN_THROWS();
62 	}
63 
64 	ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
65 
66 	if ( ret_len >= 0 )
67 		RETURN_LONG(string_len);
68 
69 	/* convert the string to UTF-16. */
70 	status = U_ZERO_ERROR;
71 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
72 
73 	if ( U_FAILURE( status ) ) {
74 		/* Set global error code. */
75 		intl_error_set_code( NULL, status );
76 
77 		/* Set error messages. */
78 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
79 		if (ustring) {
80 			efree( ustring );
81 		}
82 		RETURN_NULL();
83 	}
84 
85 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
86 
87 	if (ustring) {
88 		efree( ustring );
89 	}
90 
91 	if (ret_len >= 0) {
92 		RETVAL_LONG(ret_len);
93 	} else {
94 		RETVAL_FALSE;
95 	}
96 }
97 /* }}} */
98 
99 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)100 PHP_FUNCTION(grapheme_strpos)
101 {
102 	char *haystack, *needle;
103 	size_t haystack_len, needle_len;
104 	const char *found;
105 	zend_long loffset = 0;
106 	int32_t offset = 0;
107 	size_t noffset = 0;
108 	zend_long ret_pos;
109 
110 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
111 		RETURN_THROWS();
112 	}
113 
114 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
115 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
116 		RETURN_THROWS();
117 	}
118 
119 	/* we checked that it will fit: */
120 	offset = (int32_t) loffset;
121 	noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
122 
123 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
124 
125 	if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
126 		/* quick check to see if the string might be there
127 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
128 		*/
129 		found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
130 
131 		/* if it isn't there the we are done */
132 		if (found) {
133 			RETURN_LONG(found - haystack);
134 		}
135 		RETURN_FALSE;
136 	}
137 
138 	/* do utf16 part of the strpos */
139 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
140 
141 	if ( ret_pos >= 0 ) {
142 		RETURN_LONG(ret_pos);
143 	} else {
144 		RETURN_FALSE;
145 	}
146 }
147 /* }}} */
148 
149 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)150 PHP_FUNCTION(grapheme_stripos)
151 {
152 	char *haystack, *needle;
153 	size_t haystack_len, needle_len;
154 	const char *found;
155 	zend_long loffset = 0;
156 	int32_t offset = 0;
157 	zend_long ret_pos;
158 	int is_ascii;
159 
160 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
161 		RETURN_THROWS();
162 	}
163 
164 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
165 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
166 		RETURN_THROWS();
167 	}
168 
169 	/* we checked that it will fit: */
170 	offset = (int32_t) loffset;
171 
172 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
173 
174 	is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
175 
176 	if ( is_ascii ) {
177 		char *haystack_dup, *needle_dup;
178 		int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
179 		needle_dup = estrndup(needle, needle_len);
180 		zend_str_tolower(needle_dup, needle_len);
181 		haystack_dup = estrndup(haystack, haystack_len);
182 		zend_str_tolower(haystack_dup, haystack_len);
183 
184 		found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
185 
186 		efree(haystack_dup);
187 		efree(needle_dup);
188 
189 		if (found) {
190 			RETURN_LONG(found - haystack_dup);
191 		}
192 
193 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
194 		if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
195 			RETURN_FALSE;
196 		}
197 	}
198 
199 	/* do utf16 part of the strpos */
200 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
201 
202 	if ( ret_pos >= 0 ) {
203 		RETURN_LONG(ret_pos);
204 	} else {
205 		RETURN_FALSE;
206 	}
207 
208 }
209 /* }}} */
210 
211 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)212 PHP_FUNCTION(grapheme_strrpos)
213 {
214 	char *haystack, *needle;
215 	size_t haystack_len, needle_len;
216 	zend_long loffset = 0;
217 	int32_t offset = 0;
218 	zend_long ret_pos;
219 	int is_ascii;
220 
221 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
222 		RETURN_THROWS();
223 	}
224 
225 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
226 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
227 		RETURN_THROWS();
228 	}
229 
230 	/* we checked that it will fit: */
231 	offset = (int32_t) loffset;
232 
233 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
234 
235 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
236 
237 	if ( is_ascii ) {
238 
239 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
240 
241 		if ( ret_pos >= 0 ) {
242 			RETURN_LONG(ret_pos);
243 		}
244 
245 		/* if the needle was ascii too, we are done */
246 
247 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
248 			RETURN_FALSE;
249 		}
250 
251 		/* else we need to continue via utf16 */
252 	}
253 
254 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
255 
256 	if ( ret_pos >= 0 ) {
257 		RETURN_LONG(ret_pos);
258 	} else {
259 		RETURN_FALSE;
260 	}
261 
262 
263 }
264 /* }}} */
265 
266 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)267 PHP_FUNCTION(grapheme_strripos)
268 {
269 	char *haystack, *needle;
270 	size_t haystack_len, needle_len;
271 	zend_long loffset = 0;
272 	int32_t offset = 0;
273 	zend_long ret_pos;
274 	int is_ascii;
275 
276 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
277 		RETURN_THROWS();
278 	}
279 
280 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
281 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
282 		RETURN_THROWS();
283 	}
284 
285 	/* we checked that it will fit: */
286 	offset = (int32_t) loffset;
287 
288 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
289 
290 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
291 
292 	if ( is_ascii ) {
293 		char *needle_dup, *haystack_dup;
294 
295 		needle_dup = estrndup(needle, needle_len);
296 		zend_str_tolower(needle_dup, needle_len);
297 		haystack_dup = estrndup(haystack, haystack_len);
298 		zend_str_tolower(haystack_dup, haystack_len);
299 
300 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
301 
302 		efree(haystack_dup);
303 		efree(needle_dup);
304 
305 		if ( ret_pos >= 0 ) {
306 			RETURN_LONG(ret_pos);
307 		}
308 
309 		/* if the needle was ascii too, we are done */
310 
311 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
312 			RETURN_FALSE;
313 		}
314 
315 		/* else we need to continue via utf16 */
316 	}
317 
318 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
319 
320 	if ( ret_pos >= 0 ) {
321 		RETURN_LONG(ret_pos);
322 	} else {
323 		RETURN_FALSE;
324 	}
325 
326 
327 }
328 /* }}} */
329 
330 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)331 PHP_FUNCTION(grapheme_substr)
332 {
333 	char *str;
334 	zend_string *u8_sub_str;
335 	UChar *ustr;
336 	size_t str_len;
337 	int32_t ustr_len;
338 	zend_long lstart = 0, length = 0;
339 	int32_t start = 0;
340 	int iter_val;
341 	UErrorCode status;
342 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
343 	UBreakIterator* bi = NULL;
344 	int sub_str_start_pos, sub_str_end_pos;
345 	int32_t (*iter_func)(UBreakIterator *);
346 	bool no_length = 1;
347 
348 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
349 		RETURN_THROWS();
350 	}
351 
352 	if (lstart < INT32_MIN || lstart > INT32_MAX) {
353 		zend_argument_value_error(2, "is too large");
354 		RETURN_THROWS();
355 	}
356 
357 	start = (int32_t) lstart;
358 
359 	if (no_length) {
360 		length = str_len;
361 	}
362 
363 	if (length < INT32_MIN || length > INT32_MAX) {
364 		zend_argument_value_error(3, "is too large");
365 		RETURN_THROWS();
366 	}
367 
368 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
369 
370 	if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
371 		int32_t asub_str_len;
372 		char *sub_str;
373 		grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
374 
375 		if ( NULL == sub_str ) {
376 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
377 			RETURN_FALSE;
378 		}
379 
380 		RETURN_STRINGL(sub_str, asub_str_len);
381 	}
382 
383 	ustr = NULL;
384 	ustr_len = 0;
385 	status = U_ZERO_ERROR;
386 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
387 
388 	if ( U_FAILURE( status ) ) {
389 		/* Set global error code. */
390 		intl_error_set_code( NULL, status );
391 
392 		/* Set error messages. */
393 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
394 		if (ustr) {
395 			efree( ustr );
396 		}
397 		RETURN_FALSE;
398 	}
399 
400 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
401 
402 	if( U_FAILURE(status) ) {
403 		RETURN_FALSE;
404 	}
405 
406 	ubrk_setText(bi, ustr, ustr_len,	&status);
407 
408 	if ( start < 0 ) {
409 		iter_func = ubrk_previous;
410 		ubrk_last(bi);
411 		iter_val = 1;
412 	}
413 	else {
414 		iter_func = ubrk_next;
415 		iter_val = -1;
416 	}
417 
418 	sub_str_start_pos = 0;
419 
420 	while ( start ) {
421 		sub_str_start_pos = iter_func(bi);
422 
423 		if ( UBRK_DONE == sub_str_start_pos ) {
424 			break;
425 		}
426 
427 		start += iter_val;
428 	}
429 
430 	if (0 != start) {
431 		if (start > 0) {
432 			if (ustr) {
433 				efree(ustr);
434 			}
435 			ubrk_close(bi);
436 			RETURN_EMPTY_STRING();
437 		}
438 
439 		sub_str_start_pos = 0;
440 		ubrk_first(bi);
441 	}
442 
443 	/* OK to convert here since if str_len were big, convert above would fail */
444 	if (length >= (int32_t)str_len) {
445 
446 		/* no length supplied or length is too big, return the rest of the string */
447 
448 		status = U_ZERO_ERROR;
449 		u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
450 
451 		if (ustr) {
452 			efree( ustr );
453 		}
454 		ubrk_close( bi );
455 
456 		if ( !u8_sub_str ) {
457 			/* Set global error code. */
458 			intl_error_set_code( NULL, status );
459 
460 			/* Set error messages. */
461 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
462 
463 			RETURN_FALSE;
464 		}
465 
466 		/* return the allocated string, not a duplicate */
467 		RETVAL_NEW_STR(u8_sub_str);
468 		return;
469 	}
470 
471 	if(length == 0) {
472 		/* empty length - we've validated start, we can return "" now */
473 		if (ustr) {
474 			efree(ustr);
475 		}
476 		ubrk_close(bi);
477 		RETURN_EMPTY_STRING();
478 	}
479 
480 	/* find the end point of the string to return */
481 
482 	if ( length < 0 ) {
483 		iter_func = ubrk_previous;
484 		ubrk_last(bi);
485 		iter_val = 1;
486 	}
487 	else {
488 		iter_func = ubrk_next;
489 		iter_val = -1;
490 	}
491 
492 	sub_str_end_pos = 0;
493 
494 	while ( length ) {
495 		sub_str_end_pos = iter_func(bi);
496 
497 		if ( UBRK_DONE == sub_str_end_pos ) {
498 			break;
499 		}
500 
501 		length += iter_val;
502 	}
503 
504 	ubrk_close(bi);
505 
506 	if ( UBRK_DONE == sub_str_end_pos) {
507 		if (length < 0) {
508 			efree(ustr);
509 			RETURN_EMPTY_STRING();
510 		} else {
511 			sub_str_end_pos = ustr_len;
512 		}
513 	}
514 
515 	if (sub_str_start_pos > sub_str_end_pos) {
516 		efree(ustr);
517 		RETURN_EMPTY_STRING();
518 	}
519 
520 	status = U_ZERO_ERROR;
521 	u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
522 
523 	efree( ustr );
524 
525 	if ( !u8_sub_str ) {
526 		/* Set global error code. */
527 		intl_error_set_code( NULL, status );
528 
529 		/* Set error messages. */
530 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
531 
532 		RETURN_FALSE;
533 	}
534 
535 	 /* return the allocated string, not a duplicate */
536 	RETVAL_NEW_STR(u8_sub_str);
537 }
538 /* }}} */
539 
540 /* {{{	strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)541 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
542 {
543 	char *haystack, *needle;
544 	const char *found;
545 	size_t haystack_len, needle_len;
546 	int32_t ret_pos, uchar_pos;
547 	bool part = 0;
548 
549 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
550 		RETURN_THROWS();
551 	}
552 
553 	if ( !f_ignore_case ) {
554 
555 		/* ASCII optimization: quick check to see if the string might be there */
556 		found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
557 
558 		/* if it isn't there the we are done */
559 		if ( !found ) {
560 			RETURN_FALSE;
561 		}
562 
563 		/* if it is there, and if the haystack is ascii, we are all done */
564 		if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
565 			size_t found_offset = found - haystack;
566 
567 			if (part) {
568 				RETURN_STRINGL(haystack, found_offset);
569 			} else {
570 				RETURN_STRINGL(found, haystack_len - found_offset);
571 			}
572 		}
573 
574 	}
575 
576 	/* need to work in utf16 */
577 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
578 
579 	if ( ret_pos < 0 ) {
580 		RETURN_FALSE;
581 	}
582 
583 	/* uchar_pos is the 'nth' Unicode character position of the needle */
584 
585 	ret_pos = 0;
586 	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
587 
588 	if (part) {
589 		RETURN_STRINGL(haystack, ret_pos);
590 	} else {
591 		RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
592 	}
593 
594 }
595 /* }}} */
596 
597 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)598 PHP_FUNCTION(grapheme_strstr)
599 {
600 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
601 }
602 /* }}} */
603 
604 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)605 PHP_FUNCTION(grapheme_stristr)
606 {
607 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
608 }
609 /* }}} */
610 
611 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
612 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)613 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
614 {
615 	int pos = 0;
616 	int ret_pos = 0;
617 	int break_pos, prev_break_pos;
618 	int count = 0;
619 
620 	while ( 1 ) {
621 		pos = ubrk_next(bi);
622 
623 		if ( UBRK_DONE == pos ) {
624 			break;
625 		}
626 
627 		for ( break_pos = ret_pos; break_pos < pos; ) {
628 			count++;
629 			prev_break_pos = break_pos;
630 			U8_FWD_1(pstr, break_pos, str_len);
631 
632 			if ( prev_break_pos == break_pos ) {
633 				/* something wrong - malformed utf8? */
634 				csize = 0;
635 				break;
636 			}
637 		}
638 
639 		/* if we are beyond our limit, then the loop is done */
640 		if ( count > csize ) {
641 			break;
642 		}
643 
644 		ret_pos = break_pos;
645 	}
646 
647 	return ret_pos;
648 }
649 /* }}} */
650 
651 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
652 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)653 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
654 {
655 	int pos = 0;
656 	int ret_pos = 0;
657 
658 	while ( 1 ) {
659 		pos = ubrk_next(bi);
660 
661 		if ( UBRK_DONE == pos ) {
662 			break;
663 		}
664 
665 		if ( pos > bsize ) {
666 			break;
667 		}
668 
669 		ret_pos = pos;
670 	}
671 
672 	return ret_pos;
673 }
674 /* }}} */
675 
676 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
677 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)678 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
679 {
680 	int next_pos = 0;
681 	int ret_pos = 0;
682 
683 	while ( size ) {
684 		next_pos = ubrk_next(bi);
685 
686 		if ( UBRK_DONE == next_pos ) {
687 			break;
688 		}
689 		ret_pos = next_pos;
690 		size--;
691 	}
692 
693 	return ret_pos;
694 }
695 /* }}} */
696 
697 /* {{{ grapheme extract iter function pointer array */
698 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
699 
700 static grapheme_extract_iter grapheme_extract_iters[] = {
701 	&grapheme_extract_count_iter,
702 	&grapheme_extract_bytecount_iter,
703 	&grapheme_extract_charcount_iter,
704 };
705 /* }}} */
706 
707 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)708 PHP_FUNCTION(grapheme_extract)
709 {
710 	char *str, *pstr;
711 	UText ut = UTEXT_INITIALIZER;
712 	size_t str_len;
713 	zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
714 	zend_long lstart = 0; /* starting position in str in bytes */
715 	int32_t start = 0;
716 	zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
717 	UErrorCode status;
718 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
719 	UBreakIterator* bi = NULL;
720 	int ret_pos;
721 	zval *next = NULL; /* return offset of next part of the string */
722 
723 	if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
724 		RETURN_THROWS();
725 	}
726 
727 	if (lstart < 0) {
728 		lstart += str_len;
729 	}
730 
731 	if ( NULL != next ) {
732 		if ( !Z_ISREF_P(next) ) {
733 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
734 				 "grapheme_extract: 'next' was not passed by reference", 0 );
735 			RETURN_FALSE;
736 		} else {
737 			ZVAL_DEREF(next);
738 			/* initialize next */
739 			zval_ptr_dtor(next);
740 			ZVAL_LONG(next, lstart);
741 		}
742 	}
743 
744 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
745 		zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
746 		RETURN_THROWS();
747 	}
748 
749 	if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
750 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
751 		RETURN_FALSE;
752 	}
753 
754 	if (size < 0) {
755 		zend_argument_value_error(2, "must be greater than or equal to 0");
756 		RETURN_THROWS();
757 	}
758 
759 	if (size > INT32_MAX) {
760 		zend_argument_value_error(2, "is too large");
761 		RETURN_THROWS();
762 	}
763 
764 	if (size == 0) {
765 		RETURN_EMPTY_STRING();
766 	}
767 
768 	/* we checked that it will fit: */
769 	start = (int32_t) lstart;
770 
771 	pstr = str + start;
772 
773 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
774 	if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
775 		char *str_end = str + str_len;
776 
777 		while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
778 			pstr++;
779 			if ( pstr >= str_end ) {
780 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
781 								"grapheme_extract: invalid input string", 0 );
782 
783 				RETURN_FALSE;
784 			}
785 		}
786 	}
787 
788 	str_len -= (pstr - str);
789 
790 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
791 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
792 	 */
793 
794 	if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
795 		size_t nsize = MIN(size, str_len);
796 		if ( NULL != next ) {
797 			ZVAL_LONG(next, start+nsize);
798 		}
799 		RETURN_STRINGL(pstr, nsize);
800 	}
801 
802 	status = U_ZERO_ERROR;
803 	utext_openUTF8(&ut, pstr, str_len, &status);
804 
805 	if ( U_FAILURE( status ) ) {
806 		/* Set global error code. */
807 		intl_error_set_code( NULL, status );
808 
809 		/* Set error messages. */
810 		intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
811 
812 		RETURN_FALSE;
813 	}
814 
815 	bi = NULL;
816 	status = U_ZERO_ERROR;
817 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
818 
819 	ubrk_setUText(bi, &ut, &status);
820 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
821 		can't back up. So, we will not do anything. */
822 
823 	/* now we need to find the end of the chunk the user wants us to return */
824 	/* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
825 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
826 
827 	utext_close(&ut);
828 	ubrk_close(bi);
829 
830 	if ( NULL != next ) {
831 		ZVAL_LONG(next, start+ret_pos);
832 	}
833 
834 	RETURN_STRINGL(((char *)pstr), ret_pos);
835 }
836 
837 /* }}} */
838