xref: /PHP-5.4/ext/intl/grapheme/grapheme_string.c (revision 8aba119f)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Author: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25 
26 #include <unicode/utypes.h>
27 #include <unicode/ucol.h>
28 #include <unicode/ustring.h>
29 #include <unicode/ubrk.h>
30 
31 #include "ext/standard/php_string.h"
32 
33 /* }}} */
34 
/* Extraction modes understood by grapheme_extract(); each value is also the
 * index of the matching iterator in grapheme_extract_iters[]. */
#define GRAPHEME_EXTRACT_TYPE_COUNT     0   /* size counts grapheme clusters */
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES  1   /* size is a byte limit */
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS  2   /* size is a character limit */

/* Inclusive bounds used to validate a caller-supplied extract type. */
#define GRAPHEME_EXTRACT_TYPE_MIN       GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX       GRAPHEME_EXTRACT_TYPE_MAXCHARS
40 
41 
42 /* {{{ grapheme_register_constants
43  * Register API constants
44  */
grapheme_register_constants(INIT_FUNC_ARGS)45 void grapheme_register_constants( INIT_FUNC_ARGS )
46 {
47 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50 }
51 /* }}} */
52 
53 /* {{{ proto int grapheme_strlen(string str)
54    Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)55 PHP_FUNCTION(grapheme_strlen)
56 {
57 	unsigned char* string;
58 	int string_len;
59 	UChar* ustring = NULL;
60 	int ustring_len = 0;
61 	int ret_len;
62 	UErrorCode status;
63 
64 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65 
66 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67 			 "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68 
69 		RETURN_FALSE;
70 	}
71 
72 	ret_len = grapheme_ascii_check(string, string_len);
73 
74 	if ( ret_len >= 0 )
75 		RETURN_LONG(ret_len);
76 
77 	/* convert the string to UTF-16. */
78 	status = U_ZERO_ERROR;
79 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80 
81 	if ( U_FAILURE( status ) ) {
82 		/* Set global error code. */
83 		intl_error_set_code( NULL, status TSRMLS_CC );
84 
85 		/* Set error messages. */
86 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87 		if (ustring) {
88 			efree( ustring );
89 		}
90 		RETURN_NULL();
91 	}
92 
93 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
94 
95 	if (ustring) {
96 		efree( ustring );
97 	}
98 
99 	if (ret_len >= 0) {
100 		RETVAL_LONG(ret_len);
101 	} else {
102 		RETVAL_FALSE;
103 	}
104 }
105 /* }}} */
106 
107 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
108    Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)109 PHP_FUNCTION(grapheme_strpos)
110 {
111 	unsigned char *haystack, *needle;
112 	int haystack_len, needle_len;
113 	unsigned char *found;
114 	long loffset = 0;
115 	int32_t offset = 0;
116 	int ret_pos;
117 
118 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
119 
120 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
121 			 "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
122 
123 		RETURN_FALSE;
124 	}
125 
126 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127 
128 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
129 
130 		RETURN_FALSE;
131 	}
132 
133 	/* we checked that it will fit: */
134 	offset = (int32_t) loffset;
135 
136 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
137 
138 	if (needle_len == 0) {
139 
140 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
141 
142 		RETURN_FALSE;
143 	}
144 
145 
146 	/* quick check to see if the string might be there
147 	 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
148 	*/
149 	found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
150 
151 	/* if it isn't there the we are done */
152 	if (!found) {
153 		RETURN_FALSE;
154 	}
155 
156 	/* if it is there, and if the haystack is ascii, we are all done */
157 	if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
158 
159 		RETURN_LONG(found - haystack);
160 	}
161 
162 	/* do utf16 part of the strpos */
163 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
164 
165 	if ( ret_pos >= 0 ) {
166 		RETURN_LONG(ret_pos);
167 	} else {
168 		RETURN_FALSE;
169 	}
170 
171 }
172 /* }}} */
173 
174 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
175    Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)176 PHP_FUNCTION(grapheme_stripos)
177 {
178 	unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
179 	int haystack_len, needle_len;
180 	unsigned char *found;
181 	long loffset = 0;
182 	int32_t offset = 0;
183 	int ret_pos;
184 	int is_ascii;
185 
186 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
187 
188 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
189 			 "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
190 
191 		RETURN_FALSE;
192 	}
193 
194 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
195 
196 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
197 
198 		RETURN_FALSE;
199 	}
200 
201 	/* we checked that it will fit: */
202 	offset = (int32_t) loffset;
203 
204 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
205 
206 	if (needle_len == 0) {
207 
208 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
209 
210 		RETURN_FALSE;
211 	}
212 
213 
214 	is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
215 
216 	if ( is_ascii ) {
217 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
218 		php_strtolower((char *)needle_dup, needle_len);
219 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
220 		php_strtolower((char *)haystack_dup, haystack_len);
221 
222 		found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
223 
224 		efree(haystack_dup);
225 		efree(needle_dup);
226 
227 		if (found) {
228 			RETURN_LONG(found - haystack_dup);
229 		}
230 
231 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
232 		if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
233 			RETURN_FALSE;
234 		}
235 	}
236 
237 	/* do utf16 part of the strpos */
238 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
239 
240 	if ( ret_pos >= 0 ) {
241 		RETURN_LONG(ret_pos);
242 	} else {
243 		RETURN_FALSE;
244 	}
245 
246 }
247 /* }}} */
248 
249 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
250    Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)251 PHP_FUNCTION(grapheme_strrpos)
252 {
253 	unsigned char *haystack, *needle;
254 	int haystack_len, needle_len;
255 	long loffset = 0;
256 	int32_t offset = 0;
257 	int32_t ret_pos;
258 	int is_ascii;
259 
260 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
261 
262 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
263 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
264 
265 		RETURN_FALSE;
266 	}
267 
268 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
269 
270 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
271 
272 		RETURN_FALSE;
273 	}
274 
275 	/* we checked that it will fit: */
276 	offset = (int32_t) loffset;
277 
278 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
279 
280 	if (needle_len == 0) {
281 
282 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
283 
284 		RETURN_FALSE;
285 	}
286 
287 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
288 
289 	if ( is_ascii ) {
290 
291 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
292 
293 
294 		if ( ret_pos >= 0 ) {
295 			RETURN_LONG(ret_pos);
296 		}
297 
298 		/* if the needle was ascii too, we are done */
299 
300 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
301 			RETURN_FALSE;
302 		}
303 
304 		/* else we need to continue via utf16 */
305 	}
306 
307 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
308 
309 	if ( ret_pos >= 0 ) {
310 		RETURN_LONG(ret_pos);
311 	} else {
312 		RETURN_FALSE;
313 	}
314 
315 
316 }
317 /* }}} */
318 
319 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
320    Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)321 PHP_FUNCTION(grapheme_strripos)
322 {
323 	unsigned char *haystack, *needle;
324 	int haystack_len, needle_len;
325 	long loffset = 0;
326 	int32_t offset = 0;
327 	int32_t ret_pos;
328 	int is_ascii;
329 
330 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
331 
332 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
333 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
334 
335 		RETURN_FALSE;
336 	}
337 
338 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
339 
340 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
341 
342 		RETURN_FALSE;
343 	}
344 
345 	/* we checked that it will fit: */
346 	offset = (int32_t) loffset;
347 
348 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
349 
350 	if (needle_len == 0) {
351 
352 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
353 
354 		RETURN_FALSE;
355 	}
356 
357 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
358 
359 	if ( is_ascii ) {
360 		unsigned char *needle_dup, *haystack_dup;
361 
362 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
363 		php_strtolower((char *)needle_dup, needle_len);
364 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
365 		php_strtolower((char *)haystack_dup, haystack_len);
366 
367 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
368 
369 		efree(haystack_dup);
370 		efree(needle_dup);
371 
372 		if ( ret_pos >= 0 ) {
373 			RETURN_LONG(ret_pos);
374 		}
375 
376 		/* if the needle was ascii too, we are done */
377 
378 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
379 			RETURN_FALSE;
380 		}
381 
382 		/* else we need to continue via utf16 */
383 	}
384 
385 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
386 
387 	if ( ret_pos >= 0 ) {
388 		RETURN_LONG(ret_pos);
389 	} else {
390 		RETURN_FALSE;
391 	}
392 
393 
394 }
395 /* }}} */
396 
397 /* {{{ proto string grapheme_substr(string str, int start [, int length])
398    Returns part of a string */
PHP_FUNCTION(grapheme_substr)399 PHP_FUNCTION(grapheme_substr)
400 {
401 	unsigned char *str, *sub_str;
402 	UChar *ustr;
403 	int str_len, sub_str_len, ustr_len;
404 	long lstart = 0, length = 0;
405 	int32_t start = 0;
406 	int iter_val;
407 	UErrorCode status;
408 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
409 	UBreakIterator* bi = NULL;
410 	int sub_str_start_pos, sub_str_end_pos;
411 	int32_t (*iter_func)(UBreakIterator *);
412 
413 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
414 
415 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
416 			 "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
417 
418 		RETURN_FALSE;
419 	}
420 
421 	if ( OUTSIDE_STRING(lstart, str_len) ) {
422 
423 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
424 
425 		RETURN_FALSE;
426 	}
427 
428 	/* we checked that it will fit: */
429 	start = (int32_t) lstart;
430 
431 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
432 
433 	if ( grapheme_ascii_check(str, str_len) >= 0 ) {
434 		grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
435 
436 		if ( NULL == sub_str ) {
437 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
438 			RETURN_FALSE;
439 		}
440 
441 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
442 	}
443 
444 	ustr = NULL;
445 	ustr_len = 0;
446 	status = U_ZERO_ERROR;
447 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
448 
449 	if ( U_FAILURE( status ) ) {
450 		/* Set global error code. */
451 		intl_error_set_code( NULL, status TSRMLS_CC );
452 
453 		/* Set error messages. */
454 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
455 		if (ustr) {
456 			efree( ustr );
457 		}
458 		RETURN_FALSE;
459 	}
460 
461 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
462 
463 	if( U_FAILURE(status) ) {
464 		RETURN_FALSE;
465 	}
466 
467 	ubrk_setText(bi, ustr, ustr_len,	&status);
468 
469 	if ( start < 0 ) {
470 		iter_func = ubrk_previous;
471 		ubrk_last(bi);
472 		iter_val = 1;
473 	}
474 	else {
475 		iter_func = ubrk_next;
476 		iter_val = -1;
477 	}
478 
479 	sub_str_start_pos = 0;
480 
481 	while ( start ) {
482 		sub_str_start_pos = iter_func(bi);
483 
484 		if ( UBRK_DONE == sub_str_start_pos ) {
485 			break;
486 		}
487 
488 		start += iter_val;
489 	}
490 
491 	if ( 0 != start || sub_str_start_pos >= ustr_len ) {
492 
493 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
494 
495 		if (ustr) {
496 			efree(ustr);
497 		}
498 		ubrk_close(bi);
499 		RETURN_FALSE;
500 	}
501 
502 	if (ZEND_NUM_ARGS() <= 2) {
503 
504 		/* no length supplied, return the rest of the string */
505 
506 		sub_str = NULL;
507 		sub_str_len = 0;
508 		status = U_ZERO_ERROR;
509 		intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
510 
511 		if (ustr) {
512 			efree( ustr );
513 		}
514 		ubrk_close( bi );
515 
516 		if ( U_FAILURE( status ) ) {
517 			/* Set global error code. */
518 			intl_error_set_code( NULL, status TSRMLS_CC );
519 
520 			/* Set error messages. */
521 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
522 
523 			if (sub_str) {
524 				efree( sub_str );
525 			}
526 
527 			RETURN_FALSE;
528 		}
529 
530 		/* return the allocated string, not a duplicate */
531 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
532 	}
533 
534 	if(length == 0) {
535 		/* empty length - we've validated start, we can return "" now */
536 		if (ustr) {
537 			efree(ustr);
538 		}
539 		ubrk_close(bi);
540 		RETURN_EMPTY_STRING();
541 	}
542 
543 	/* find the end point of the string to return */
544 
545 	if ( length < 0 ) {
546 		iter_func = ubrk_previous;
547 		ubrk_last(bi);
548 		iter_val = 1;
549 	}
550 	else {
551 		iter_func = ubrk_next;
552 		iter_val = -1;
553 	}
554 
555 	sub_str_end_pos = 0;
556 
557 	while ( length ) {
558 		sub_str_end_pos = iter_func(bi);
559 
560 		if ( UBRK_DONE == sub_str_end_pos ) {
561 			break;
562 		}
563 
564 		length += iter_val;
565 	}
566 
567 	ubrk_close(bi);
568 
569 	if ( UBRK_DONE == sub_str_end_pos) {
570 		if(length < 0) {
571 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
572 
573 			efree(ustr);
574 			RETURN_FALSE;
575 		} else {
576 			sub_str_end_pos = ustr_len;
577 		}
578 	}
579 
580 	if(sub_str_start_pos > sub_str_end_pos) {
581 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
582 
583 		efree(ustr);
584 		RETURN_FALSE;
585 	}
586 
587 	sub_str = NULL;
588 	status = U_ZERO_ERROR;
589 	intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
590 
591 	efree( ustr );
592 
593 	if ( U_FAILURE( status ) ) {
594 		/* Set global error code. */
595 		intl_error_set_code( NULL, status TSRMLS_CC );
596 
597 		/* Set error messages. */
598 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
599 
600 		if ( NULL != sub_str )
601 			efree( sub_str );
602 
603 		RETURN_FALSE;
604 	}
605 
606 	 /* return the allocated string, not a duplicate */
607 	RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
608 
609 }
610 /* }}} */
611 
612 /* {{{	strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)613 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
614 {
615 	unsigned char *haystack, *needle, *found;
616 	int haystack_len, needle_len;
617 	int ret_pos, uchar_pos;
618 	zend_bool part = 0;
619 
620 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
621 
622 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
623 			 "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
624 
625 		RETURN_FALSE;
626 	}
627 
628 	if (needle_len == 0) {
629 
630 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
631 
632 		RETURN_FALSE;
633 	}
634 
635 
636 	if ( !f_ignore_case ) {
637 
638 		/* ASCII optimization: quick check to see if the string might be there
639 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
640 		*/
641 		found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
642 
643 		/* if it isn't there the we are done */
644 		if ( !found ) {
645 			RETURN_FALSE;
646 		}
647 
648 		/* if it is there, and if the haystack is ascii, we are all done */
649 		if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
650 			size_t found_offset = found - haystack;
651 
652 			if (part) {
653 				RETURN_STRINGL(((char *)haystack) , found_offset, 1);
654 			} else {
655 				RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
656 			}
657 		}
658 
659 	}
660 
661 	/* need to work in utf16 */
662 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
663 
664 	if ( ret_pos < 0 ) {
665 		RETURN_FALSE;
666 	}
667 
668 	/* uchar_pos is the 'nth' Unicode character position of the needle */
669 
670 	ret_pos = 0;
671 	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
672 
673 	if (part) {
674 		RETURN_STRINGL(((char *)haystack), ret_pos, 1);
675 	}
676 	else {
677 		RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
678 	}
679 
680 }
681 /* }}} */
682 
683 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
684    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)685 PHP_FUNCTION(grapheme_strstr)
686 {
687 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
688 }
689 /* }}} */
690 
691 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
692    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)693 PHP_FUNCTION(grapheme_stristr)
694 {
695 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
696 }
697 /* }}} */
698 
699 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
700 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)701 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
702 {
703 	int pos = 0, prev_pos = 0;
704 	int ret_pos = 0, prev_ret_pos = 0;
705 
706 	while ( 1 ) {
707 		pos = ubrk_next(bi);
708 
709 		if ( UBRK_DONE == pos ) {
710 			break;
711 		}
712 
713 		/* if we are beyond our limit, then the loop is done */
714 		if ( pos > csize ) {
715 			break;
716 		}
717 
718 		/* update our pointer in the original UTF-8 buffer by as many characters
719 		   as ubrk_next iterated over */
720 
721 		prev_ret_pos = ret_pos;
722 		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
723 
724 		if ( prev_ret_pos == ret_pos ) {
725 			/* something wrong - malformed utf8? */
726 			break;
727 		}
728 
729 		prev_pos = pos;
730 	}
731 
732 	return ret_pos;
733 }
734 /* }}} */
735 
736 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
737 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)738 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
739 {
740 	int pos = 0, prev_pos = 0;
741 	int ret_pos = 0, prev_ret_pos = 0;
742 
743 	while ( 1 ) {
744 		pos = ubrk_next(bi);
745 
746 		if ( UBRK_DONE == pos ) {
747 			break;
748 		}
749 
750 		prev_ret_pos = ret_pos;
751 		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
752 
753 		if ( ret_pos > bsize ) {
754 			ret_pos = prev_ret_pos;
755 			break;
756 		}
757 
758 		if ( prev_ret_pos == ret_pos ) {
759 			/* something wrong - malformed utf8? */
760 			break;
761 		}
762 
763 		prev_pos = pos;
764 	}
765 
766 	return ret_pos;
767 }
768 /* }}} */
769 
770 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
771 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)772 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
773 {
774 	int pos = 0, next_pos = 0;
775 	int ret_pos = 0;
776 
777 	while ( size ) {
778 		next_pos = ubrk_next(bi);
779 
780 		if ( UBRK_DONE == next_pos ) {
781 			break;
782 		}
783 		pos = next_pos;
784 		size--;
785 	}
786 
787 	/* pos is one past the last UChar - and represent the number of code units to
788 		advance in the utf-8 buffer
789 	*/
790 
791 	U8_FWD_N(pstr, ret_pos, str_len, pos);
792 
793 	return ret_pos;
794 }
795 /* }}} */
796 
797 /* {{{ grapheme extract iter function pointer array */
798 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
799 
800 static grapheme_extract_iter grapheme_extract_iters[] = {
801 	&grapheme_extract_count_iter,
802 	&grapheme_extract_bytecount_iter,
803 	&grapheme_extract_charcount_iter,
804 };
805 /* }}} */
806 
807 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
808 	Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)809 PHP_FUNCTION(grapheme_extract)
810 {
811 	unsigned char *str, *pstr;
812 	UChar *ustr;
813 	int str_len, ustr_len;
814 	long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
815 	long lstart = 0; /* starting position in str in bytes */
816 	int32_t start = 0;
817 	long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
818 	UErrorCode status;
819 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
820 	UBreakIterator* bi = NULL;
821 	int ret_pos;
822 	zval *next = NULL; /* return offset of next part of the string */
823 
824 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
825 
826 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
827 			 "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
828 
829 		RETURN_FALSE;
830 	}
831 
832 	if ( NULL != next ) {
833 		if ( !PZVAL_IS_REF(next) ) {
834 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
835 				 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
836 
837 			RETURN_FALSE;
838 		}
839 		else {
840 			/* initialize next */
841 			zval_dtor(next);
842             ZVAL_LONG(next, lstart);
843 		}
844 	}
845 
846 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
847 
848 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
849 			 "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
850 
851 		RETURN_FALSE;
852 	}
853 
854 	if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
855 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
856 		RETURN_FALSE;
857 	}
858 
859 	if ( size > INT32_MAX || size < 0) {
860 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
861 		RETURN_FALSE;
862 	}
863 	if (size == 0) {
864 		RETURN_EMPTY_STRING();
865 	}
866 
867 	/* we checked that it will fit: */
868 	start = (int32_t) lstart;
869 
870 	pstr = str + start;
871 
872 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
873 	if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
874 		unsigned char *str_end = str + str_len;
875 
876 		while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
877 			pstr++;
878 			if ( pstr >= str_end ) {
879 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
880 								"grapheme_extract: invalid input string", 0 TSRMLS_CC );
881 
882 				RETURN_FALSE;
883 			}
884 		}
885 	}
886 
887 	str_len -= (pstr - str);
888 
889 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
890 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
891 	 */
892 
893 	if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
894         long nsize = ( size < str_len ? size : str_len );
895 		if ( NULL != next ) {
896 			ZVAL_LONG(next, start+nsize);
897 		}
898 		RETURN_STRINGL(((char *)pstr), nsize, 1);
899 	}
900 
901 	/* convert the strings to UTF-16. */
902 	ustr = NULL;
903 	ustr_len = 0;
904 	status = U_ZERO_ERROR;
905 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
906 
907 	if ( U_FAILURE( status ) ) {
908 		/* Set global error code. */
909 		intl_error_set_code( NULL, status TSRMLS_CC );
910 
911 		/* Set error messages. */
912 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
913 
914 		if ( NULL != ustr )
915 			efree( ustr );
916 
917 		RETURN_FALSE;
918 	}
919 
920 	bi = NULL;
921 	status = U_ZERO_ERROR;
922 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
923 
924 	ubrk_setText(bi, ustr, ustr_len, &status);
925 
926 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
927 		can't back up. So, we will not do anything. */
928 
929 	/* now we need to find the end of the chunk the user wants us to return */
930 
931 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
932 
933 	if (ustr) {
934 		efree(ustr);
935 	}
936 	ubrk_close(bi);
937 
938 	if ( NULL != next ) {
939 		ZVAL_LONG(next, start+ret_pos);
940 	}
941 
942 	RETURN_STRINGL(((char *)pstr), ret_pos, 1);
943 }
944 
945 /* }}} */
946 
947 /*
948  * Local variables:
949  * tab-width: 4
950  * c-basic-offset: 4
951  * End:
952  * vim600: fdm=marker
953  * vim: noet sw=4 ts=4
954  */
955 
956