xref: /PHP-8.4/ext/intl/grapheme/grapheme_string.c (revision 11accb5c)
1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,      |
4    | that is bundled with this package in the file LICENSE, and is        |
5    | available through the world-wide-web at the following url:           |
6    | https://www.php.net/license/3_01.txt                                 |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to          |
9    | license@php.net so we can mail you a copy immediately.               |
10    +----------------------------------------------------------------------+
11    | Author: Ed Batutis <ed@batutis.com>								  |
12    +----------------------------------------------------------------------+
13  */
14 
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif
19 
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23 
24 #include <unicode/utypes.h>
25 #include <unicode/utf8.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29 
30 /* }}} */
31 
32 /* {{{ Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)33 PHP_FUNCTION(grapheme_strlen)
34 {
35 	char* string;
36 	size_t string_len;
37 	UChar* ustring = NULL;
38 	int ustring_len = 0;
39 	zend_long ret_len;
40 	UErrorCode status;
41 
42 	ZEND_PARSE_PARAMETERS_START(1, 1)
43 		Z_PARAM_STRING(string, string_len)
44 	ZEND_PARSE_PARAMETERS_END();
45 
46 	ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
47 
48 	if ( ret_len >= 0 )
49 		RETURN_LONG(string_len);
50 
51 	/* convert the string to UTF-16. */
52 	status = U_ZERO_ERROR;
53 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
54 
55 	if ( U_FAILURE( status ) ) {
56 		/* Set global error code. */
57 		intl_error_set_code( NULL, status );
58 
59 		/* Set error messages. */
60 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
61 		if (ustring) {
62 			efree( ustring );
63 		}
64 		RETURN_NULL();
65 	}
66 
67 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
68 
69 	if (ustring) {
70 		efree( ustring );
71 	}
72 
73 	if (ret_len >= 0) {
74 		RETVAL_LONG(ret_len);
75 	} else {
76 		RETVAL_FALSE;
77 	}
78 }
79 /* }}} */
80 
81 /* {{{ Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)82 PHP_FUNCTION(grapheme_strpos)
83 {
84 	char *haystack, *needle;
85 	size_t haystack_len, needle_len;
86 	const char *found;
87 	zend_long loffset = 0;
88 	int32_t offset = 0;
89 	size_t noffset = 0;
90 	zend_long ret_pos;
91 
92 	ZEND_PARSE_PARAMETERS_START(2, 3)
93 		Z_PARAM_STRING(haystack, haystack_len)
94 		Z_PARAM_STRING(needle, needle_len)
95 		Z_PARAM_OPTIONAL
96 		Z_PARAM_LONG(loffset)
97 	ZEND_PARSE_PARAMETERS_END();
98 
99 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
100 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
101 		RETURN_THROWS();
102 	}
103 
104 	/* we checked that it will fit: */
105 	offset = (int32_t) loffset;
106 	noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
107 
108 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
109 
110 	if (offset >= 0 && grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
111 		/* quick check to see if the string might be there
112 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
113 		*/
114 		found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
115 
116 		/* if it isn't there the we are done */
117 		if (found) {
118 			RETURN_LONG(found - haystack);
119 		}
120 		RETURN_FALSE;
121 	}
122 
123 	/* do utf16 part of the strpos */
124 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
125 
126 	if ( ret_pos >= 0 ) {
127 		RETURN_LONG(ret_pos);
128 	} else {
129 		RETURN_FALSE;
130 	}
131 }
132 /* }}} */
133 
134 /* {{{ Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)135 PHP_FUNCTION(grapheme_stripos)
136 {
137 	char *haystack, *needle;
138 	size_t haystack_len, needle_len;
139 	const char *found;
140 	zend_long loffset = 0;
141 	int32_t offset = 0;
142 	zend_long ret_pos;
143 	int is_ascii;
144 
145 	ZEND_PARSE_PARAMETERS_START(2, 3)
146 		Z_PARAM_STRING(haystack, haystack_len)
147 		Z_PARAM_STRING(needle, needle_len)
148 		Z_PARAM_OPTIONAL
149 		Z_PARAM_LONG(loffset)
150 	ZEND_PARSE_PARAMETERS_END();
151 
152 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
153 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
154 		RETURN_THROWS();
155 	}
156 
157 	/* we checked that it will fit: */
158 	offset = (int32_t) loffset;
159 
160 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
161 
162 	is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
163 
164 	if ( is_ascii ) {
165 		char *haystack_dup, *needle_dup;
166 		int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
167 		needle_dup = estrndup(needle, needle_len);
168 		zend_str_tolower(needle_dup, needle_len);
169 		haystack_dup = estrndup(haystack, haystack_len);
170 		zend_str_tolower(haystack_dup, haystack_len);
171 
172 		found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
173 
174 		efree(haystack_dup);
175 		efree(needle_dup);
176 
177 		if (found) {
178 			RETURN_LONG(found - haystack_dup);
179 		}
180 
181 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
182 		if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
183 			RETURN_FALSE;
184 		}
185 	}
186 
187 	/* do utf16 part of the strpos */
188 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
189 
190 	if ( ret_pos >= 0 ) {
191 		RETURN_LONG(ret_pos);
192 	} else {
193 		RETURN_FALSE;
194 	}
195 
196 }
197 /* }}} */
198 
199 /* {{{ Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)200 PHP_FUNCTION(grapheme_strrpos)
201 {
202 	char *haystack, *needle;
203 	size_t haystack_len, needle_len;
204 	zend_long loffset = 0;
205 	int32_t offset = 0;
206 	zend_long ret_pos;
207 	int is_ascii;
208 
209 	ZEND_PARSE_PARAMETERS_START(2, 3)
210 		Z_PARAM_STRING(haystack, haystack_len)
211 		Z_PARAM_STRING(needle, needle_len)
212 		Z_PARAM_OPTIONAL
213 		Z_PARAM_LONG(loffset)
214 	ZEND_PARSE_PARAMETERS_END();
215 
216 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
217 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
218 		RETURN_THROWS();
219 	}
220 
221 	/* we checked that it will fit: */
222 	offset = (int32_t) loffset;
223 
224 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
225 
226 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
227 
228 	if ( is_ascii ) {
229 
230 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
231 
232 		if ( ret_pos >= 0 ) {
233 			RETURN_LONG(ret_pos);
234 		}
235 
236 		/* if the needle was ascii too, we are done */
237 
238 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
239 			RETURN_FALSE;
240 		}
241 
242 		/* else we need to continue via utf16 */
243 	}
244 
245 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
246 
247 	if ( ret_pos >= 0 ) {
248 		RETURN_LONG(ret_pos);
249 	} else {
250 		RETURN_FALSE;
251 	}
252 
253 
254 }
255 /* }}} */
256 
257 /* {{{ Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)258 PHP_FUNCTION(grapheme_strripos)
259 {
260 	char *haystack, *needle;
261 	size_t haystack_len, needle_len;
262 	zend_long loffset = 0;
263 	int32_t offset = 0;
264 	zend_long ret_pos;
265 	int is_ascii;
266 
267 	ZEND_PARSE_PARAMETERS_START(2, 3)
268 		Z_PARAM_STRING(haystack, haystack_len)
269 		Z_PARAM_STRING(needle, needle_len)
270 		Z_PARAM_OPTIONAL
271 		Z_PARAM_LONG(loffset)
272 	ZEND_PARSE_PARAMETERS_END();
273 
274 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
275 		zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
276 		RETURN_THROWS();
277 	}
278 
279 	/* we checked that it will fit: */
280 	offset = (int32_t) loffset;
281 
282 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
283 
284 	is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
285 
286 	if ( is_ascii ) {
287 		char *needle_dup, *haystack_dup;
288 
289 		needle_dup = estrndup(needle, needle_len);
290 		zend_str_tolower(needle_dup, needle_len);
291 		haystack_dup = estrndup(haystack, haystack_len);
292 		zend_str_tolower(haystack_dup, haystack_len);
293 
294 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
295 
296 		efree(haystack_dup);
297 		efree(needle_dup);
298 
299 		if ( ret_pos >= 0 ) {
300 			RETURN_LONG(ret_pos);
301 		}
302 
303 		/* if the needle was ascii too, we are done */
304 
305 		if (  grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
306 			RETURN_FALSE;
307 		}
308 
309 		/* else we need to continue via utf16 */
310 	}
311 
312 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */);
313 
314 	if ( ret_pos >= 0 ) {
315 		RETURN_LONG(ret_pos);
316 	} else {
317 		RETURN_FALSE;
318 	}
319 
320 
321 }
322 /* }}} */
323 
324 /* {{{ Returns part of a string */
PHP_FUNCTION(grapheme_substr)325 PHP_FUNCTION(grapheme_substr)
326 {
327 	char *str;
328 	zend_string *u8_sub_str;
329 	UChar *ustr;
330 	size_t str_len;
331 	int32_t ustr_len;
332 	zend_long lstart = 0, length = 0;
333 	int32_t start = 0;
334 	int iter_val;
335 	UErrorCode status;
336 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
337 	UBreakIterator* bi = NULL;
338 	int sub_str_start_pos, sub_str_end_pos;
339 	int32_t (*iter_func)(UBreakIterator *);
340 	bool no_length = true;
341 
342 	ZEND_PARSE_PARAMETERS_START(2, 3)
343 		Z_PARAM_STRING(str, str_len)
344 		Z_PARAM_LONG(lstart)
345 		Z_PARAM_OPTIONAL
346 		Z_PARAM_LONG_OR_NULL(length, no_length)
347 	ZEND_PARSE_PARAMETERS_END();
348 
349 	if (lstart < INT32_MIN || lstart > INT32_MAX) {
350 		zend_argument_value_error(2, "is too large");
351 		RETURN_THROWS();
352 	}
353 
354 	start = (int32_t) lstart;
355 
356 	if (no_length) {
357 		length = str_len;
358 	}
359 
360 	if (length < INT32_MIN || length > INT32_MAX) {
361 		zend_argument_value_error(3, "is too large");
362 		RETURN_THROWS();
363 	}
364 
365 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
366 
367 	if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
368 		int32_t asub_str_len;
369 		char *sub_str;
370 		grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
371 
372 		if ( NULL == sub_str ) {
373 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
374 			RETURN_FALSE;
375 		}
376 
377 		RETURN_STRINGL(sub_str, asub_str_len);
378 	}
379 
380 	ustr = NULL;
381 	ustr_len = 0;
382 	status = U_ZERO_ERROR;
383 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
384 
385 	if ( U_FAILURE( status ) ) {
386 		/* Set global error code. */
387 		intl_error_set_code( NULL, status );
388 
389 		/* Set error messages. */
390 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
391 		if (ustr) {
392 			efree( ustr );
393 		}
394 		RETURN_FALSE;
395 	}
396 
397 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
398 
399 	if( U_FAILURE(status) ) {
400 		RETURN_FALSE;
401 	}
402 
403 	ubrk_setText(bi, ustr, ustr_len,	&status);
404 
405 	if ( start < 0 ) {
406 		iter_func = ubrk_previous;
407 		ubrk_last(bi);
408 		iter_val = 1;
409 	}
410 	else {
411 		iter_func = ubrk_next;
412 		iter_val = -1;
413 	}
414 
415 	sub_str_start_pos = 0;
416 
417 	while ( start ) {
418 		sub_str_start_pos = iter_func(bi);
419 
420 		if ( UBRK_DONE == sub_str_start_pos ) {
421 			break;
422 		}
423 
424 		start += iter_val;
425 	}
426 
427 	if (0 != start) {
428 		if (start > 0) {
429 			if (ustr) {
430 				efree(ustr);
431 			}
432 			ubrk_close(bi);
433 			RETURN_EMPTY_STRING();
434 		}
435 
436 		sub_str_start_pos = 0;
437 		ubrk_first(bi);
438 	}
439 
440 	/* OK to convert here since if str_len were big, convert above would fail */
441 	if (length >= (int32_t)str_len) {
442 
443 		/* no length supplied or length is too big, return the rest of the string */
444 
445 		status = U_ZERO_ERROR;
446 		u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
447 
448 		if (ustr) {
449 			efree( ustr );
450 		}
451 		ubrk_close( bi );
452 
453 		if ( !u8_sub_str ) {
454 			/* Set global error code. */
455 			intl_error_set_code( NULL, status );
456 
457 			/* Set error messages. */
458 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
459 
460 			RETURN_FALSE;
461 		}
462 
463 		/* return the allocated string, not a duplicate */
464 		RETVAL_NEW_STR(u8_sub_str);
465 		return;
466 	}
467 
468 	if(length == 0) {
469 		/* empty length - we've validated start, we can return "" now */
470 		if (ustr) {
471 			efree(ustr);
472 		}
473 		ubrk_close(bi);
474 		RETURN_EMPTY_STRING();
475 	}
476 
477 	/* find the end point of the string to return */
478 
479 	if ( length < 0 ) {
480 		iter_func = ubrk_previous;
481 		ubrk_last(bi);
482 		iter_val = 1;
483 	}
484 	else {
485 		iter_func = ubrk_next;
486 		iter_val = -1;
487 	}
488 
489 	sub_str_end_pos = 0;
490 
491 	while ( length ) {
492 		sub_str_end_pos = iter_func(bi);
493 
494 		if ( UBRK_DONE == sub_str_end_pos ) {
495 			break;
496 		}
497 
498 		length += iter_val;
499 	}
500 
501 	ubrk_close(bi);
502 
503 	if ( UBRK_DONE == sub_str_end_pos) {
504 		if (length < 0) {
505 			efree(ustr);
506 			RETURN_EMPTY_STRING();
507 		} else {
508 			sub_str_end_pos = ustr_len;
509 		}
510 	}
511 
512 	if (sub_str_start_pos > sub_str_end_pos) {
513 		efree(ustr);
514 		RETURN_EMPTY_STRING();
515 	}
516 
517 	status = U_ZERO_ERROR;
518 	u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
519 
520 	efree( ustr );
521 
522 	if ( !u8_sub_str ) {
523 		/* Set global error code. */
524 		intl_error_set_code( NULL, status );
525 
526 		/* Set error messages. */
527 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
528 
529 		RETURN_FALSE;
530 	}
531 
532 	 /* return the allocated string, not a duplicate */
533 	RETVAL_NEW_STR(u8_sub_str);
534 }
535 /* }}} */
536 
537 /* {{{	strstr_common_handler */
/*
 * Shared implementation of grapheme_strstr() and grapheme_stristr().
 *
 * Parses (string $haystack, string $needle, bool $beforeNeedle = false) and
 * returns either the part of the haystack before the first match (when the
 * third argument is true) or the part starting at the match. Returns FALSE
 * when the needle is not found. f_ignore_case selects case-insensitive
 * matching.
 */
static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
{
	char *haystack, *needle;
	size_t haystack_len, needle_len;
	int32_t match_pos, uchar_pos;
	int32_t byte_pos;
	bool part = false;

	ZEND_PARSE_PARAMETERS_START(2, 3)
		Z_PARAM_STRING(haystack, haystack_len)
		Z_PARAM_STRING(needle, needle_len)
		Z_PARAM_OPTIONAL
		Z_PARAM_BOOL(part)
	ZEND_PARSE_PARAMETERS_END();

	if (!f_ignore_case) {
		/* Case-sensitive shortcut: a raw byte search tells us right away
		 * whether the needle can be present at all. */
		const char *found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);

		if (!found) {
			RETURN_FALSE;
		}

		/* For an ASCII haystack the byte match is the final answer. */
		if (grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0) {
			size_t found_offset = found - haystack;

			if (part) {
				RETURN_STRINGL(haystack, found_offset);
			}
			RETURN_STRINGL(found, haystack_len - found_offset);
		}
	}

	/* General case: search in UTF-16 and get the match back as a count of
	 * Unicode characters in uchar_pos. */
	match_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /* last */);

	if (match_pos < 0) {
		RETURN_FALSE;
	}

	/* Translate the character count into a byte offset in the UTF-8 haystack. */
	byte_pos = 0;
	U8_FWD_N(haystack, byte_pos, haystack_len, uchar_pos);

	if (part) {
		RETURN_STRINGL(haystack, byte_pos);
	}
	RETURN_STRINGL(haystack + byte_pos, haystack_len - byte_pos);
}
595 /* }}} */
596 
597 /* {{{ Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)598 PHP_FUNCTION(grapheme_strstr)
599 {
600 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
601 }
602 /* }}} */
603 
/* {{{ Finds first occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_stristr)605 PHP_FUNCTION(grapheme_stristr)
606 {
607 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
608 }
609 /* }}} */
610 
611 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
612 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)613 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
614 {
615 	int pos = 0;
616 	int ret_pos = 0;
617 	int break_pos, prev_break_pos;
618 	int count = 0;
619 
620 	while ( 1 ) {
621 		pos = ubrk_next(bi);
622 
623 		if ( UBRK_DONE == pos ) {
624 			break;
625 		}
626 
627 		for ( break_pos = ret_pos; break_pos < pos; ) {
628 			count++;
629 			prev_break_pos = break_pos;
630 			U8_FWD_1(pstr, break_pos, str_len);
631 
632 			if ( prev_break_pos == break_pos ) {
633 				/* something wrong - malformed utf8? */
634 				csize = 0;
635 				break;
636 			}
637 		}
638 
639 		/* if we are beyond our limit, then the loop is done */
640 		if ( count > csize ) {
641 			break;
642 		}
643 
644 		ret_pos = break_pos;
645 	}
646 
647 	return ret_pos;
648 }
649 /* }}} */
650 
651 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
652 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)653 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
654 {
655 	int pos = 0;
656 	int ret_pos = 0;
657 
658 	while ( 1 ) {
659 		pos = ubrk_next(bi);
660 
661 		if ( UBRK_DONE == pos ) {
662 			break;
663 		}
664 
665 		if ( pos > bsize ) {
666 			break;
667 		}
668 
669 		ret_pos = pos;
670 	}
671 
672 	return ret_pos;
673 }
674 /* }}} */
675 
676 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
677 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)678 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
679 {
680 	int next_pos = 0;
681 	int ret_pos = 0;
682 
683 	while ( size ) {
684 		next_pos = ubrk_next(bi);
685 
686 		if ( UBRK_DONE == next_pos ) {
687 			break;
688 		}
689 		ret_pos = next_pos;
690 		size--;
691 	}
692 
693 	return ret_pos;
694 }
695 /* }}} */
696 
697 /* {{{ grapheme extract iter function pointer array */
698 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
699 
700 static const grapheme_extract_iter grapheme_extract_iters[] = {
701 	&grapheme_extract_count_iter,
702 	&grapheme_extract_bytecount_iter,
703 	&grapheme_extract_charcount_iter,
704 };
705 /* }}} */
706 
707 /* {{{ Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)708 PHP_FUNCTION(grapheme_extract)
709 {
710 	char *str, *pstr;
711 	UText ut = UTEXT_INITIALIZER;
712 	size_t str_len;
713 	zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
714 	zend_long lstart = 0; /* starting position in str in bytes */
715 	int32_t start = 0;
716 	zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
717 	UErrorCode status;
718 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
719 	UBreakIterator* bi = NULL;
720 	int ret_pos;
721 	zval *next = NULL; /* return offset of next part of the string */
722 
723 	ZEND_PARSE_PARAMETERS_START(2, 5)
724 		Z_PARAM_STRING(str, str_len)
725 		Z_PARAM_LONG(size)
726 		Z_PARAM_OPTIONAL
727 		Z_PARAM_LONG(extract_type)
728 		Z_PARAM_LONG(lstart)
729 		Z_PARAM_ZVAL(next)
730 	ZEND_PARSE_PARAMETERS_END();
731 
732 	if (lstart < 0) {
733 		lstart += str_len;
734 	}
735 
736 	if ( NULL != next ) {
737 		if ( !Z_ISREF_P(next) ) {
738 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
739 				 "grapheme_extract: 'next' was not passed by reference", 0 );
740 			RETURN_FALSE;
741 		} else {
742 			ZVAL_DEREF(next);
743 			/* initialize next */
744 			zval_ptr_dtor(next);
745 			ZVAL_LONG(next, lstart);
746 		}
747 	}
748 
749 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
750 		zend_argument_value_error(3, "must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS");
751 		RETURN_THROWS();
752 	}
753 
754 	if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
755 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
756 		RETURN_FALSE;
757 	}
758 
759 	if (size < 0) {
760 		zend_argument_value_error(2, "must be greater than or equal to 0");
761 		RETURN_THROWS();
762 	}
763 
764 	if (size > INT32_MAX) {
765 		zend_argument_value_error(2, "is too large");
766 		RETURN_THROWS();
767 	}
768 
769 	if (size == 0) {
770 		RETURN_EMPTY_STRING();
771 	}
772 
773 	/* we checked that it will fit: */
774 	start = (int32_t) lstart;
775 
776 	pstr = str + start;
777 
778 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
779 	if ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
780 		char *str_end = str + str_len;
781 
782 		while ( !U8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
783 			pstr++;
784 			if ( pstr >= str_end ) {
785 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
786 								"grapheme_extract: invalid input string", 0 );
787 
788 				RETURN_FALSE;
789 			}
790 		}
791 	}
792 
793 	str_len -= (pstr - str);
794 
795 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
796 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
797 	 */
798 
799 	if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
800 		size_t nsize = MIN(size, str_len);
801 		if ( NULL != next ) {
802 			ZVAL_LONG(next, start+nsize);
803 		}
804 		RETURN_STRINGL(pstr, nsize);
805 	}
806 
807 	status = U_ZERO_ERROR;
808 	utext_openUTF8(&ut, pstr, str_len, &status);
809 
810 	if ( U_FAILURE( status ) ) {
811 		/* Set global error code. */
812 		intl_error_set_code( NULL, status );
813 
814 		/* Set error messages. */
815 		intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
816 
817 		RETURN_FALSE;
818 	}
819 
820 	bi = NULL;
821 	status = U_ZERO_ERROR;
822 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
823 
824 	ubrk_setUText(bi, &ut, &status);
825 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
826 		can't back up. So, we will not do anything. */
827 
828 	/* now we need to find the end of the chunk the user wants us to return */
829 	/* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
830 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
831 
832 	utext_close(&ut);
833 	ubrk_close(bi);
834 
835 	if ( NULL != next ) {
836 		ZVAL_LONG(next, start+ret_pos);
837 	}
838 
839 	RETURN_STRINGL(((char *)pstr), ret_pos);
840 }
841 
PHP_FUNCTION(grapheme_str_split)842 PHP_FUNCTION(grapheme_str_split)
843 {
844 	char *pstr, *end;
845 	zend_string *str;
846 	zend_long split_len = 1;
847 
848 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
849 	UErrorCode ustatus = U_ZERO_ERROR;
850 	int32_t pos, current, i, end_len = 0;
851 	UBreakIterator* bi;
852 	UText *ut = NULL;
853 
854 	ZEND_PARSE_PARAMETERS_START(1, 2)
855 		Z_PARAM_STR(str)
856 		Z_PARAM_OPTIONAL
857 		Z_PARAM_LONG(split_len)
858 	ZEND_PARSE_PARAMETERS_END();
859 
860 	if (split_len <= 0 || split_len > UINT_MAX / 4) {
861 		zend_argument_value_error(2, "must be greater than 0 and less than or equal to %d", UINT_MAX / 4);
862 		RETURN_THROWS();
863 	}
864 
865 	if (ZSTR_LEN(str) == 0) {
866 		RETURN_EMPTY_ARRAY();
867 	}
868 
869 	pstr = ZSTR_VAL(str);
870 	ut = utext_openUTF8(ut, pstr, ZSTR_LEN(str), &ustatus);
871 
872 	if ( U_FAILURE( ustatus ) ) {
873 		/* Set global error code. */
874 		intl_error_set_code( NULL, ustatus );
875 
876 		/* Set error messages. */
877 		intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
878 
879 		RETURN_FALSE;
880 	}
881 
882 	bi = NULL;
883 	ustatus = U_ZERO_ERROR;
884 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &ustatus );
885 
886 	if( U_FAILURE(ustatus) ) {
887 		RETURN_FALSE;
888 	}
889 
890 	ubrk_setUText(bi, ut, &ustatus);
891 
892 	pos = 0;
893 	array_init(return_value);
894 
895 	for (end = pstr, i = 0, current = 0; pos != UBRK_DONE;) {
896 		end_len = pos - current;
897 		pos = ubrk_next(bi);
898 
899 		if (i == split_len - 1) {
900 			if ( pos != UBRK_DONE ) {
901 				add_next_index_stringl(return_value, pstr, pos - current);
902 				end = pstr + pos - current;
903 				i = 0;
904 			}
905 			pstr += pos - current;
906 			current = pos;
907 		} else {
908 			i += 1;
909 		}
910 	}
911 
912 	if (i != 0 && end_len != 0) {
913 		add_next_index_stringl(return_value, end, end_len);
914 	}
915 
916 	utext_close(ut);
917 	ubrk_close(bi);
918 }
919 
920 /* }}} */
921