xref: /PHP-5.6/ext/intl/grapheme/grapheme_string.c (revision c8778eb2)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Author: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25 
26 #include <unicode/utypes.h>
27 #include <unicode/ucol.h>
28 #include <unicode/ustring.h>
29 #include <unicode/ubrk.h>
30 
31 #include "ext/standard/php_string.h"
32 
33 /* }}} */
34 
/*
 * Extract modes understood by grapheme_extract(). The numeric values are
 * exported as the GRAPHEME_EXTR_* constants and are also used directly as
 * indexes into grapheme_extract_iters[], so they must stay contiguous.
 */
#define GRAPHEME_EXTRACT_TYPE_COUNT		0
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES	1
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS	2
/* Inclusive bounds used to validate a caller-supplied extract type. */
#define GRAPHEME_EXTRACT_TYPE_MIN	GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX	GRAPHEME_EXTRACT_TYPE_MAXCHARS
40 
41 
42 /* {{{ grapheme_register_constants
43  * Register API constants
44  */
grapheme_register_constants(INIT_FUNC_ARGS)45 void grapheme_register_constants( INIT_FUNC_ARGS )
46 {
47 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50 }
51 /* }}} */
52 
53 /* {{{ proto int grapheme_strlen(string str)
54    Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)55 PHP_FUNCTION(grapheme_strlen)
56 {
57 	unsigned char* string;
58 	int string_len;
59 	UChar* ustring = NULL;
60 	int ustring_len = 0;
61 	int ret_len;
62 	UErrorCode status;
63 
64 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65 
66 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67 			 "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68 
69 		RETURN_FALSE;
70 	}
71 
72 	ret_len = grapheme_ascii_check(string, string_len);
73 
74 	if ( ret_len >= 0 )
75 		RETURN_LONG(ret_len);
76 
77 	/* convert the string to UTF-16. */
78 	status = U_ZERO_ERROR;
79 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80 
81 	if ( U_FAILURE( status ) ) {
82 		/* Set global error code. */
83 		intl_error_set_code( NULL, status TSRMLS_CC );
84 
85 		/* Set error messages. */
86 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87 		if (ustring) {
88 			efree( ustring );
89 		}
90 		RETURN_NULL();
91 	}
92 
93 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
94 
95 	if (ustring) {
96 		efree( ustring );
97 	}
98 
99 	if (ret_len >= 0) {
100 		RETVAL_LONG(ret_len);
101 	} else {
102 		RETVAL_FALSE;
103 	}
104 }
105 /* }}} */
106 
107 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
108    Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)109 PHP_FUNCTION(grapheme_strpos)
110 {
111 	unsigned char *haystack, *needle;
112 	int haystack_len, needle_len;
113 	unsigned char *found;
114 	long loffset = 0;
115 	int32_t offset = 0, noffset = 0;
116 	int ret_pos;
117 
118 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
119 
120 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
121 			 "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
122 
123 		RETURN_FALSE;
124 	}
125 
126 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127 
128 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
129 
130 		RETURN_FALSE;
131 	}
132 
133 	/* we checked that it will fit: */
134 	offset = (int32_t) loffset;
135 	noffset = offset >= 0 ? offset : haystack_len + offset;
136 
137 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
138 
139 	if (needle_len == 0) {
140 
141 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
142 
143 		RETURN_FALSE;
144 	}
145 
146 
147 	/* quick check to see if the string might be there
148 	 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
149 	*/
150 	found = (unsigned char *)php_memnstr((char *)haystack + noffset, (char *)needle, needle_len, (char *)haystack + haystack_len);
151 
152 	/* if it isn't there the we are done */
153 	if (!found) {
154 		RETURN_FALSE;
155 	}
156 
157 	/* if it is there, and if the haystack is ascii, we are all done */
158 	if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
159 
160 		RETURN_LONG(found - haystack);
161 	}
162 
163 	/* do utf16 part of the strpos */
164 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
165 
166 	if ( ret_pos >= 0 ) {
167 		RETURN_LONG(ret_pos);
168 	} else {
169 		RETURN_FALSE;
170 	}
171 
172 }
173 /* }}} */
174 
175 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
176    Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)177 PHP_FUNCTION(grapheme_stripos)
178 {
179 	unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
180 	int haystack_len, needle_len;
181 	unsigned char *found;
182 	long loffset = 0;
183 	int32_t offset = 0;
184 	int ret_pos;
185 	int is_ascii;
186 
187 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
188 
189 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
190 			 "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
191 
192 		RETURN_FALSE;
193 	}
194 
195 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
196 
197 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
198 
199 		RETURN_FALSE;
200 	}
201 
202 	/* we checked that it will fit: */
203 	offset = (int32_t) loffset;
204 
205 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
206 
207 	if (needle_len == 0) {
208 
209 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
210 
211 		RETURN_FALSE;
212 	}
213 
214 
215 	is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
216 
217 	if ( is_ascii ) {
218 		int32_t noffset = offset >= 0 ? offset : haystack_len + offset;
219 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
220 		php_strtolower((char *)needle_dup, needle_len);
221 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
222 		php_strtolower((char *)haystack_dup, haystack_len);
223 
224 		found = (unsigned char*) php_memnstr((char *)haystack_dup + noffset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
225 
226 		efree(haystack_dup);
227 		efree(needle_dup);
228 
229 		if (found) {
230 			RETURN_LONG(found - haystack_dup);
231 		}
232 
233 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
234 		if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
235 			RETURN_FALSE;
236 		}
237 	}
238 
239 	/* do utf16 part of the strpos */
240 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
241 
242 	if ( ret_pos >= 0 ) {
243 		RETURN_LONG(ret_pos);
244 	} else {
245 		RETURN_FALSE;
246 	}
247 
248 }
249 /* }}} */
250 
251 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
252    Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)253 PHP_FUNCTION(grapheme_strrpos)
254 {
255 	unsigned char *haystack, *needle;
256 	int haystack_len, needle_len;
257 	long loffset = 0;
258 	int32_t offset = 0;
259 	int32_t ret_pos;
260 	int is_ascii;
261 
262 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
263 
264 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
265 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
266 
267 		RETURN_FALSE;
268 	}
269 
270 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
271 
272 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
273 
274 		RETURN_FALSE;
275 	}
276 
277 	/* we checked that it will fit: */
278 	offset = (int32_t) loffset;
279 
280 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
281 
282 	if (needle_len == 0) {
283 
284 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
285 
286 		RETURN_FALSE;
287 	}
288 
289 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
290 
291 	if ( is_ascii ) {
292 
293 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
294 
295 
296 		if ( ret_pos >= 0 ) {
297 			RETURN_LONG(ret_pos);
298 		}
299 
300 		/* if the needle was ascii too, we are done */
301 
302 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
303 			RETURN_FALSE;
304 		}
305 
306 		/* else we need to continue via utf16 */
307 	}
308 
309 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
310 
311 	if ( ret_pos >= 0 ) {
312 		RETURN_LONG(ret_pos);
313 	} else {
314 		RETURN_FALSE;
315 	}
316 
317 
318 }
319 /* }}} */
320 
321 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
322    Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)323 PHP_FUNCTION(grapheme_strripos)
324 {
325 	unsigned char *haystack, *needle;
326 	int haystack_len, needle_len;
327 	long loffset = 0;
328 	int32_t offset = 0;
329 	int32_t ret_pos;
330 	int is_ascii;
331 
332 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
333 
334 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
335 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
336 
337 		RETURN_FALSE;
338 	}
339 
340 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
341 
342 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
343 
344 		RETURN_FALSE;
345 	}
346 
347 	/* we checked that it will fit: */
348 	offset = (int32_t) loffset;
349 
350 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
351 
352 	if (needle_len == 0) {
353 
354 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
355 
356 		RETURN_FALSE;
357 	}
358 
359 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
360 
361 	if ( is_ascii ) {
362 		unsigned char *needle_dup, *haystack_dup;
363 
364 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
365 		php_strtolower((char *)needle_dup, needle_len);
366 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
367 		php_strtolower((char *)haystack_dup, haystack_len);
368 
369 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
370 
371 		efree(haystack_dup);
372 		efree(needle_dup);
373 
374 		if ( ret_pos >= 0 ) {
375 			RETURN_LONG(ret_pos);
376 		}
377 
378 		/* if the needle was ascii too, we are done */
379 
380 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
381 			RETURN_FALSE;
382 		}
383 
384 		/* else we need to continue via utf16 */
385 	}
386 
387 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
388 
389 	if ( ret_pos >= 0 ) {
390 		RETURN_LONG(ret_pos);
391 	} else {
392 		RETURN_FALSE;
393 	}
394 
395 
396 }
397 /* }}} */
398 
399 /* {{{ proto string grapheme_substr(string str, int start [, int length])
400    Returns part of a string */
PHP_FUNCTION(grapheme_substr)401 PHP_FUNCTION(grapheme_substr)
402 {
403 	unsigned char *str, *sub_str;
404 	UChar *ustr;
405 	int str_len, sub_str_len, ustr_len;
406 	long lstart = 0, length = 0;
407 	int32_t start = 0;
408 	int iter_val;
409 	UErrorCode status;
410 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
411 	UBreakIterator* bi = NULL;
412 	int sub_str_start_pos, sub_str_end_pos;
413 	int32_t (*iter_func)(UBreakIterator *);
414 
415 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
416 
417 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
418 			 "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
419 
420 		RETURN_FALSE;
421 	}
422 
423 	if ( OUTSIDE_STRING(lstart, str_len) ) {
424 
425 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
426 
427 		RETURN_FALSE;
428 	}
429 
430 	/* we checked that it will fit: */
431 	start = (int32_t) lstart;
432 
433 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
434 
435 	if ( grapheme_ascii_check(str, str_len) >= 0 ) {
436 		grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
437 
438 		if ( NULL == sub_str ) {
439 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
440 			RETURN_FALSE;
441 		}
442 
443 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
444 	}
445 
446 	ustr = NULL;
447 	ustr_len = 0;
448 	status = U_ZERO_ERROR;
449 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
450 
451 	if ( U_FAILURE( status ) ) {
452 		/* Set global error code. */
453 		intl_error_set_code( NULL, status TSRMLS_CC );
454 
455 		/* Set error messages. */
456 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
457 		if (ustr) {
458 			efree( ustr );
459 		}
460 		RETURN_FALSE;
461 	}
462 
463 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
464 
465 	if( U_FAILURE(status) ) {
466 		RETURN_FALSE;
467 	}
468 
469 	ubrk_setText(bi, ustr, ustr_len,	&status);
470 
471 	if ( start < 0 ) {
472 		iter_func = ubrk_previous;
473 		ubrk_last(bi);
474 		iter_val = 1;
475 	}
476 	else {
477 		iter_func = ubrk_next;
478 		iter_val = -1;
479 	}
480 
481 	sub_str_start_pos = 0;
482 
483 	while ( start ) {
484 		sub_str_start_pos = iter_func(bi);
485 
486 		if ( UBRK_DONE == sub_str_start_pos ) {
487 			break;
488 		}
489 
490 		start += iter_val;
491 	}
492 
493 	if ( 0 != start || sub_str_start_pos >= ustr_len ) {
494 
495 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
496 
497 		if (ustr) {
498 			efree(ustr);
499 		}
500 		ubrk_close(bi);
501 		RETURN_FALSE;
502 	}
503 
504 	if (ZEND_NUM_ARGS() <= 2) {
505 
506 		/* no length supplied, return the rest of the string */
507 
508 		sub_str = NULL;
509 		sub_str_len = 0;
510 		status = U_ZERO_ERROR;
511 		intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
512 
513 		if (ustr) {
514 			efree( ustr );
515 		}
516 		ubrk_close( bi );
517 
518 		if ( U_FAILURE( status ) ) {
519 			/* Set global error code. */
520 			intl_error_set_code( NULL, status TSRMLS_CC );
521 
522 			/* Set error messages. */
523 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
524 
525 			if (sub_str) {
526 				efree( sub_str );
527 			}
528 
529 			RETURN_FALSE;
530 		}
531 
532 		/* return the allocated string, not a duplicate */
533 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
534 	}
535 
536 	if(length == 0) {
537 		/* empty length - we've validated start, we can return "" now */
538 		if (ustr) {
539 			efree(ustr);
540 		}
541 		ubrk_close(bi);
542 		RETURN_EMPTY_STRING();
543 	}
544 
545 	/* find the end point of the string to return */
546 
547 	if ( length < 0 ) {
548 		iter_func = ubrk_previous;
549 		ubrk_last(bi);
550 		iter_val = 1;
551 	}
552 	else {
553 		iter_func = ubrk_next;
554 		iter_val = -1;
555 	}
556 
557 	sub_str_end_pos = 0;
558 
559 	while ( length ) {
560 		sub_str_end_pos = iter_func(bi);
561 
562 		if ( UBRK_DONE == sub_str_end_pos ) {
563 			break;
564 		}
565 
566 		length += iter_val;
567 	}
568 
569 	ubrk_close(bi);
570 
571 	if ( UBRK_DONE == sub_str_end_pos) {
572 		if(length < 0) {
573 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
574 
575 			efree(ustr);
576 			RETURN_FALSE;
577 		} else {
578 			sub_str_end_pos = ustr_len;
579 		}
580 	}
581 
582 	if(sub_str_start_pos > sub_str_end_pos) {
583 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
584 
585 		efree(ustr);
586 		RETURN_FALSE;
587 	}
588 
589 	sub_str = NULL;
590 	status = U_ZERO_ERROR;
591 	intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
592 
593 	efree( ustr );
594 
595 	if ( U_FAILURE( status ) ) {
596 		/* Set global error code. */
597 		intl_error_set_code( NULL, status TSRMLS_CC );
598 
599 		/* Set error messages. */
600 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
601 
602 		if ( NULL != sub_str )
603 			efree( sub_str );
604 
605 		RETURN_FALSE;
606 	}
607 
608 	 /* return the allocated string, not a duplicate */
609 	RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
610 
611 }
612 /* }}} */
613 
614 /* {{{	strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)615 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
616 {
617 	unsigned char *haystack, *needle, *found;
618 	int haystack_len, needle_len;
619 	int ret_pos, uchar_pos;
620 	zend_bool part = 0;
621 
622 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
623 
624 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
625 			 "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
626 
627 		RETURN_FALSE;
628 	}
629 
630 	if (needle_len == 0) {
631 
632 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
633 
634 		RETURN_FALSE;
635 	}
636 
637 
638 	if ( !f_ignore_case ) {
639 
640 		/* ASCII optimization: quick check to see if the string might be there
641 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
642 		*/
643 		found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
644 
645 		/* if it isn't there the we are done */
646 		if ( !found ) {
647 			RETURN_FALSE;
648 		}
649 
650 		/* if it is there, and if the haystack is ascii, we are all done */
651 		if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
652 			size_t found_offset = found - haystack;
653 
654 			if (part) {
655 				RETURN_STRINGL(((char *)haystack) , found_offset, 1);
656 			} else {
657 				RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
658 			}
659 		}
660 
661 	}
662 
663 	/* need to work in utf16 */
664 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
665 
666 	if ( ret_pos < 0 ) {
667 		RETURN_FALSE;
668 	}
669 
670 	/* uchar_pos is the 'nth' Unicode character position of the needle */
671 
672 	ret_pos = 0;
673 	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
674 
675 	if (part) {
676 		RETURN_STRINGL(((char *)haystack), ret_pos, 1);
677 	}
678 	else {
679 		RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
680 	}
681 
682 }
683 /* }}} */
684 
685 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
686    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)687 PHP_FUNCTION(grapheme_strstr)
688 {
689 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
690 }
691 /* }}} */
692 
693 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
694    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)695 PHP_FUNCTION(grapheme_stristr)
696 {
697 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
698 }
699 /* }}} */
700 
701 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
702 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)703 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
704 {
705 	int pos = 0;
706 	int ret_pos = 0;
707 	int break_pos, prev_break_pos;
708 	int count = 0;
709 
710 	while ( 1 ) {
711 		pos = ubrk_next(bi);
712 
713 		if ( UBRK_DONE == pos ) {
714 			break;
715 		}
716 
717 		for ( break_pos = ret_pos; break_pos < pos; ) {
718 			count++;
719 			prev_break_pos = break_pos;
720 			U8_FWD_1(pstr, break_pos, str_len);
721 
722 			if ( prev_break_pos == break_pos ) {
723 				/* something wrong - malformed utf8? */
724 				csize = 0;
725 				break;
726 			}
727 		}
728 
729 		/* if we are beyond our limit, then the loop is done */
730 		if ( count > csize ) {
731 			break;
732 		}
733 
734 		ret_pos = break_pos;
735 	}
736 
737 	return ret_pos;
738 }
739 /* }}} */
740 
741 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
742 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)743 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
744 {
745 	int pos = 0;
746 	int ret_pos = 0;
747 
748 	while ( 1 ) {
749 		pos = ubrk_next(bi);
750 
751 		if ( UBRK_DONE == pos ) {
752 			break;
753 		}
754 
755 		if ( pos > bsize ) {
756 			break;
757 		}
758 
759 		ret_pos = pos;
760 	}
761 
762 	return ret_pos;
763 }
764 /* }}} */
765 
766 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
767 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)768 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
769 {
770 	int next_pos = 0;
771 	int ret_pos = 0;
772 
773 	while ( size ) {
774 		next_pos = ubrk_next(bi);
775 
776 		if ( UBRK_DONE == next_pos ) {
777 			break;
778 		}
779 		ret_pos = next_pos;
780 		size--;
781 	}
782 
783 	return ret_pos;
784 }
785 /* }}} */
786 
787 /* {{{ grapheme extract iter function pointer array */
788 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
789 
790 static grapheme_extract_iter grapheme_extract_iters[] = {
791 	&grapheme_extract_count_iter,
792 	&grapheme_extract_bytecount_iter,
793 	&grapheme_extract_charcount_iter,
794 };
795 /* }}} */
796 
797 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
798 	Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)799 PHP_FUNCTION(grapheme_extract)
800 {
801 	char *str, *pstr;
802 	UText ut = UTEXT_INITIALIZER;
803 	int str_len;
804 	long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
805 	long lstart = 0; /* starting position in str in bytes */
806 	int32_t start = 0;
807 	long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
808 	UErrorCode status;
809 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
810 	UBreakIterator* bi = NULL;
811 	int ret_pos;
812 	zval *next = NULL; /* return offset of next part of the string */
813 
814 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
815 
816 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
817 			 "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
818 
819 		RETURN_FALSE;
820 	}
821 
822 	if ( NULL != next ) {
823 		if ( !PZVAL_IS_REF(next) ) {
824 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
825 				 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
826 
827 			RETURN_FALSE;
828 		}
829 		else {
830 			/* initialize next */
831 			zval_dtor(next);
832             ZVAL_LONG(next, lstart);
833 		}
834 	}
835 
836 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
837 
838 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
839 			 "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
840 
841 		RETURN_FALSE;
842 	}
843 
844 	if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
845 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
846 		RETURN_FALSE;
847 	}
848 
849 	if ( size > INT32_MAX || size < 0) {
850 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
851 		RETURN_FALSE;
852 	}
853 	if (size == 0) {
854 		RETURN_EMPTY_STRING();
855 	}
856 
857 	/* we checked that it will fit: */
858 	start = (int32_t) lstart;
859 
860 	pstr = str + start;
861 
862 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
863 	if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
864 		unsigned char *str_end = str + str_len;
865 
866 		while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
867 			pstr++;
868 			if ( pstr >= str_end ) {
869 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
870 								"grapheme_extract: invalid input string", 0 TSRMLS_CC );
871 
872 				RETURN_FALSE;
873 			}
874 		}
875 	}
876 
877 	str_len -= (pstr - str);
878 
879 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
880 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
881 	 */
882 
883 	if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
884         long nsize = ( size < str_len ? size : str_len );
885 		if ( NULL != next ) {
886 			ZVAL_LONG(next, start+nsize);
887 		}
888 		RETURN_STRINGL(((char *)pstr), nsize, 1);
889 	}
890 
891 	status = U_ZERO_ERROR;
892 	utext_openUTF8(&ut, pstr, str_len, &status);
893 
894 	if ( U_FAILURE( status ) ) {
895 		/* Set global error code. */
896 		intl_error_set_code( NULL, status TSRMLS_CC );
897 
898 		/* Set error messages. */
899 		intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC );
900 
901 		RETURN_FALSE;
902 	}
903 
904 	bi = NULL;
905 	status = U_ZERO_ERROR;
906 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
907 
908 	ubrk_setUText(bi, &ut, &status);
909 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
910 		can't back up. So, we will not do anything. */
911 
912 	/* now we need to find the end of the chunk the user wants us to return */
913 
914 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
915 
916 	utext_close(&ut);
917 	ubrk_close(bi);
918 
919 	if ( NULL != next ) {
920 		ZVAL_LONG(next, start+ret_pos);
921 	}
922 
923 	RETURN_STRINGL(((char *)pstr), ret_pos, 1);
924 }
925 
926 /* }}} */
927 
928 /*
929  * Local variables:
930  * tab-width: 4
931  * c-basic-offset: 4
932  * End:
933  * vim600: fdm=marker
934  * vim: noet sw=4 ts=4
935  */
936 
937