xref: /PHP-5.5/ext/intl/grapheme/grapheme_string.c (revision fd968974)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Author: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25 
26 #include <unicode/utypes.h>
27 #include <unicode/ucol.h>
28 #include <unicode/ustring.h>
29 #include <unicode/ubrk.h>
30 
31 #include "ext/standard/php_string.h"
32 
33 /* }}} */
34 
/* Extraction modes accepted by grapheme_extract() as its extract_type argument.
 * The numeric values double as indexes into the grapheme_extract_iters table,
 * so they must stay dense and zero-based. */
#define GRAPHEME_EXTRACT_TYPE_COUNT		0
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES	1
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS	2

/* Inclusive bounds used to validate a caller-supplied extract type. */
#define GRAPHEME_EXTRACT_TYPE_MIN	GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX	GRAPHEME_EXTRACT_TYPE_MAXCHARS
40 
41 
42 /* {{{ grapheme_register_constants
43  * Register API constants
44  */
grapheme_register_constants(INIT_FUNC_ARGS)45 void grapheme_register_constants( INIT_FUNC_ARGS )
46 {
47 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50 }
51 /* }}} */
52 
53 /* {{{ proto int grapheme_strlen(string str)
54    Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)55 PHP_FUNCTION(grapheme_strlen)
56 {
57 	unsigned char* string;
58 	int string_len;
59 	UChar* ustring = NULL;
60 	int ustring_len = 0;
61 	int ret_len;
62 	UErrorCode status;
63 
64 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65 
66 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67 			 "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68 
69 		RETURN_FALSE;
70 	}
71 
72 	ret_len = grapheme_ascii_check(string, string_len);
73 
74 	if ( ret_len >= 0 )
75 		RETURN_LONG(ret_len);
76 
77 	/* convert the string to UTF-16. */
78 	status = U_ZERO_ERROR;
79 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80 
81 	if ( U_FAILURE( status ) ) {
82 		/* Set global error code. */
83 		intl_error_set_code( NULL, status TSRMLS_CC );
84 
85 		/* Set error messages. */
86 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87 		if (ustring) {
88 			efree( ustring );
89 		}
90 		RETURN_NULL();
91 	}
92 
93 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
94 
95 	if (ustring) {
96 		efree( ustring );
97 	}
98 
99 	if (ret_len >= 0) {
100 		RETVAL_LONG(ret_len);
101 	} else {
102 		RETVAL_FALSE;
103 	}
104 }
105 /* }}} */
106 
107 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
108    Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)109 PHP_FUNCTION(grapheme_strpos)
110 {
111 	unsigned char *haystack, *needle;
112 	int haystack_len, needle_len;
113 	unsigned char *found;
114 	long loffset = 0;
115 	int32_t offset = 0, noffset = 0;
116 	int ret_pos;
117 
118 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
119 
120 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
121 			 "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
122 
123 		RETURN_FALSE;
124 	}
125 
126 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127 
128 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
129 
130 		RETURN_FALSE;
131 	}
132 
133 	/* we checked that it will fit: */
134 	offset = (int32_t) loffset;
135 	noffset = offset >= 0 ? offset : haystack_len + offset;
136 
137 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
138 
139 	if (needle_len == 0) {
140 
141 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
142 
143 		RETURN_FALSE;
144 	}
145 
146 
147 	/* quick check to see if the string might be there
148 	 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
149 	*/
150 	found = (unsigned char *)php_memnstr((char *)haystack + noffset, (char *)needle, needle_len, (char *)haystack + haystack_len);
151 
152 	/* if it isn't there the we are done */
153 	if (!found) {
154 		RETURN_FALSE;
155 	}
156 
157 	/* if it is there, and if the haystack is ascii, we are all done */
158 	if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
159 
160 		RETURN_LONG(found - haystack);
161 	}
162 
163 	/* do utf16 part of the strpos */
164 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
165 
166 	if ( ret_pos >= 0 ) {
167 		RETURN_LONG(ret_pos);
168 	} else {
169 		RETURN_FALSE;
170 	}
171 
172 }
173 /* }}} */
174 
175 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
176    Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)177 PHP_FUNCTION(grapheme_stripos)
178 {
179 	unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
180 	int haystack_len, needle_len;
181 	unsigned char *found;
182 	long loffset = 0;
183 	int32_t offset = 0;
184 	int ret_pos;
185 	int is_ascii;
186 
187 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
188 
189 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
190 			 "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
191 
192 		RETURN_FALSE;
193 	}
194 
195 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
196 
197 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
198 
199 		RETURN_FALSE;
200 	}
201 
202 	/* we checked that it will fit: */
203 	offset = (int32_t) loffset;
204 
205 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
206 
207 	if (needle_len == 0) {
208 
209 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
210 
211 		RETURN_FALSE;
212 	}
213 
214 
215 	is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
216 
217 	if ( is_ascii ) {
218 		int32_t noffset = offset >= 0 ? offset : haystack_len + offset;
219 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
220 		php_strtolower((char *)needle_dup, needle_len);
221 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
222 		php_strtolower((char *)haystack_dup, haystack_len);
223 
224 		found = (unsigned char*) php_memnstr((char *)haystack_dup + noffset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
225 
226 		efree(haystack_dup);
227 		efree(needle_dup);
228 
229 		if (found) {
230 			RETURN_LONG(found - haystack_dup);
231 		}
232 
233 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
234 		if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
235 			RETURN_FALSE;
236 		}
237 	}
238 
239 	/* do utf16 part of the strpos */
240 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
241 
242 	if ( ret_pos >= 0 ) {
243 		RETURN_LONG(ret_pos);
244 	} else {
245 		RETURN_FALSE;
246 	}
247 
248 }
249 /* }}} */
250 
251 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
252    Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)253 PHP_FUNCTION(grapheme_strrpos)
254 {
255 	unsigned char *haystack, *needle;
256 	int haystack_len, needle_len;
257 	long loffset = 0;
258 	int32_t offset = 0;
259 	int32_t ret_pos;
260 	int is_ascii;
261 
262 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
263 
264 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
265 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
266 
267 		RETURN_FALSE;
268 	}
269 
270 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
271 
272 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
273 
274 		RETURN_FALSE;
275 	}
276 
277 	/* we checked that it will fit: */
278 	offset = (int32_t) loffset;
279 
280 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
281 
282 	if (needle_len == 0) {
283 
284 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
285 
286 		RETURN_FALSE;
287 	}
288 
289 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
290 
291 	if ( is_ascii ) {
292 
293 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
294 
295 
296 		if ( ret_pos >= 0 ) {
297 			RETURN_LONG(ret_pos);
298 		}
299 
300 		/* if the needle was ascii too, we are done */
301 
302 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
303 			RETURN_FALSE;
304 		}
305 
306 		/* else we need to continue via utf16 */
307 	}
308 
309 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
310 
311 	if ( ret_pos >= 0 ) {
312 		RETURN_LONG(ret_pos);
313 	} else {
314 		RETURN_FALSE;
315 	}
316 
317 
318 }
319 /* }}} */
320 
321 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
322    Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)323 PHP_FUNCTION(grapheme_strripos)
324 {
325 	unsigned char *haystack, *needle;
326 	int haystack_len, needle_len;
327 	long loffset = 0;
328 	int32_t offset = 0;
329 	int32_t ret_pos;
330 	int is_ascii;
331 
332 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
333 
334 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
335 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
336 
337 		RETURN_FALSE;
338 	}
339 
340 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
341 
342 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
343 
344 		RETURN_FALSE;
345 	}
346 
347 	/* we checked that it will fit: */
348 	offset = (int32_t) loffset;
349 
350 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
351 
352 	if (needle_len == 0) {
353 
354 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
355 
356 		RETURN_FALSE;
357 	}
358 
359 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
360 
361 	if ( is_ascii ) {
362 		unsigned char *needle_dup, *haystack_dup;
363 
364 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
365 		php_strtolower((char *)needle_dup, needle_len);
366 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
367 		php_strtolower((char *)haystack_dup, haystack_len);
368 
369 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
370 
371 		efree(haystack_dup);
372 		efree(needle_dup);
373 
374 		if ( ret_pos >= 0 ) {
375 			RETURN_LONG(ret_pos);
376 		}
377 
378 		/* if the needle was ascii too, we are done */
379 
380 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
381 			RETURN_FALSE;
382 		}
383 
384 		/* else we need to continue via utf16 */
385 	}
386 
387 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL,  1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
388 
389 	if ( ret_pos >= 0 ) {
390 		RETURN_LONG(ret_pos);
391 	} else {
392 		RETURN_FALSE;
393 	}
394 
395 
396 }
397 /* }}} */
398 
399 /* {{{ proto string grapheme_substr(string str, int start [, int length])
400    Returns part of a string */
PHP_FUNCTION(grapheme_substr)401 PHP_FUNCTION(grapheme_substr)
402 {
403 	unsigned char *str, *sub_str;
404 	UChar *ustr;
405 	int str_len, sub_str_len, ustr_len;
406 	long lstart = 0, length = 0;
407 	int32_t start = 0;
408 	int iter_val;
409 	UErrorCode status;
410 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
411 	UBreakIterator* bi = NULL;
412 	int sub_str_start_pos, sub_str_end_pos;
413 	int32_t (*iter_func)(UBreakIterator *);
414 
415 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
416 
417 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
418 			 "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
419 
420 		RETURN_FALSE;
421 	}
422 
423 	if ( OUTSIDE_STRING(lstart, str_len) ) {
424 
425 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
426 
427 		RETURN_FALSE;
428 	}
429 
430 	/* we checked that it will fit: */
431 	start = (int32_t) lstart;
432 
433 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
434 
435 	if ( grapheme_ascii_check(str, str_len) >= 0 ) {
436 		grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
437 
438 		if ( NULL == sub_str ) {
439 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
440 			RETURN_FALSE;
441 		}
442 
443 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
444 	}
445 
446 	ustr = NULL;
447 	ustr_len = 0;
448 	status = U_ZERO_ERROR;
449 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
450 
451 	if ( U_FAILURE( status ) ) {
452 		/* Set global error code. */
453 		intl_error_set_code( NULL, status TSRMLS_CC );
454 
455 		/* Set error messages. */
456 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
457 		if (ustr) {
458 			efree( ustr );
459 		}
460 		RETURN_FALSE;
461 	}
462 
463 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
464 
465 	if( U_FAILURE(status) ) {
466 		RETURN_FALSE;
467 	}
468 
469 	ubrk_setText(bi, ustr, ustr_len,	&status);
470 
471 	if ( start < 0 ) {
472 		iter_func = ubrk_previous;
473 		ubrk_last(bi);
474 		iter_val = 1;
475 	}
476 	else {
477 		iter_func = ubrk_next;
478 		iter_val = -1;
479 	}
480 
481 	sub_str_start_pos = 0;
482 
483 	while ( start ) {
484 		sub_str_start_pos = iter_func(bi);
485 
486 		if ( UBRK_DONE == sub_str_start_pos ) {
487 			break;
488 		}
489 
490 		start += iter_val;
491 	}
492 
493 	if ( 0 != start || sub_str_start_pos >= ustr_len ) {
494 
495 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
496 
497 		if (ustr) {
498 			efree(ustr);
499 		}
500 		ubrk_close(bi);
501 		RETURN_FALSE;
502 	}
503 
504 	if (ZEND_NUM_ARGS() <= 2) {
505 
506 		/* no length supplied, return the rest of the string */
507 
508 		sub_str = NULL;
509 		sub_str_len = 0;
510 		status = U_ZERO_ERROR;
511 		intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
512 
513 		if (ustr) {
514 			efree( ustr );
515 		}
516 		ubrk_close( bi );
517 
518 		if ( U_FAILURE( status ) ) {
519 			/* Set global error code. */
520 			intl_error_set_code( NULL, status TSRMLS_CC );
521 
522 			/* Set error messages. */
523 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
524 
525 			if (sub_str) {
526 				efree( sub_str );
527 			}
528 
529 			RETURN_FALSE;
530 		}
531 
532 		/* return the allocated string, not a duplicate */
533 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
534 	}
535 
536 	if(length == 0) {
537 		/* empty length - we've validated start, we can return "" now */
538 		if (ustr) {
539 			efree(ustr);
540 		}
541 		ubrk_close(bi);
542 		RETURN_EMPTY_STRING();
543 	}
544 
545 	/* find the end point of the string to return */
546 
547 	if ( length < 0 ) {
548 		iter_func = ubrk_previous;
549 		ubrk_last(bi);
550 		iter_val = 1;
551 	}
552 	else {
553 		iter_func = ubrk_next;
554 		iter_val = -1;
555 	}
556 
557 	sub_str_end_pos = 0;
558 
559 	while ( length ) {
560 		sub_str_end_pos = iter_func(bi);
561 
562 		if ( UBRK_DONE == sub_str_end_pos ) {
563 			break;
564 		}
565 
566 		length += iter_val;
567 	}
568 
569 	ubrk_close(bi);
570 
571 	if ( UBRK_DONE == sub_str_end_pos) {
572 		if(length < 0) {
573 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
574 
575 			efree(ustr);
576 			RETURN_FALSE;
577 		} else {
578 			sub_str_end_pos = ustr_len;
579 		}
580 	}
581 
582 	if(sub_str_start_pos > sub_str_end_pos) {
583 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
584 
585 		efree(ustr);
586 		RETURN_FALSE;
587 	}
588 
589 	sub_str = NULL;
590 	status = U_ZERO_ERROR;
591 	intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
592 
593 	efree( ustr );
594 
595 	if ( U_FAILURE( status ) ) {
596 		/* Set global error code. */
597 		intl_error_set_code( NULL, status TSRMLS_CC );
598 
599 		/* Set error messages. */
600 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
601 
602 		if ( NULL != sub_str )
603 			efree( sub_str );
604 
605 		RETURN_FALSE;
606 	}
607 
608 	 /* return the allocated string, not a duplicate */
609 	RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
610 
611 }
612 /* }}} */
613 
614 /* {{{	strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)615 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
616 {
617 	unsigned char *haystack, *needle, *found;
618 	int haystack_len, needle_len;
619 	int ret_pos, uchar_pos;
620 	zend_bool part = 0;
621 
622 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
623 
624 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
625 			 "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
626 
627 		RETURN_FALSE;
628 	}
629 
630 	if (needle_len == 0) {
631 
632 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
633 
634 		RETURN_FALSE;
635 	}
636 
637 
638 	if ( !f_ignore_case ) {
639 
640 		/* ASCII optimization: quick check to see if the string might be there
641 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
642 		*/
643 		found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
644 
645 		/* if it isn't there the we are done */
646 		if ( !found ) {
647 			RETURN_FALSE;
648 		}
649 
650 		/* if it is there, and if the haystack is ascii, we are all done */
651 		if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
652 			size_t found_offset = found - haystack;
653 
654 			if (part) {
655 				RETURN_STRINGL(((char *)haystack) , found_offset, 1);
656 			} else {
657 				RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
658 			}
659 		}
660 
661 	}
662 
663 	/* need to work in utf16 */
664 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
665 
666 	if ( ret_pos < 0 ) {
667 		RETURN_FALSE;
668 	}
669 
670 	/* uchar_pos is the 'nth' Unicode character position of the needle */
671 
672 	ret_pos = 0;
673 	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
674 
675 	if (part) {
676 		RETURN_STRINGL(((char *)haystack), ret_pos, 1);
677 	}
678 	else {
679 		RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
680 	}
681 
682 }
683 /* }}} */
684 
685 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
686    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)687 PHP_FUNCTION(grapheme_strstr)
688 {
689 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
690 }
691 /* }}} */
692 
693 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
694    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)695 PHP_FUNCTION(grapheme_stristr)
696 {
697 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
698 }
699 /* }}} */
700 
701 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
702 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)703 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
704 {
705 	int pos = 0, prev_pos = 0;
706 	int ret_pos = 0, prev_ret_pos = 0;
707 
708 	while ( 1 ) {
709 		pos = ubrk_next(bi);
710 
711 		if ( UBRK_DONE == pos ) {
712 			break;
713 		}
714 
715 		/* if we are beyond our limit, then the loop is done */
716 		if ( pos > csize ) {
717 			break;
718 		}
719 
720 		/* update our pointer in the original UTF-8 buffer by as many characters
721 		   as ubrk_next iterated over */
722 
723 		prev_ret_pos = ret_pos;
724 		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
725 
726 		if ( prev_ret_pos == ret_pos ) {
727 			/* something wrong - malformed utf8? */
728 			break;
729 		}
730 
731 		prev_pos = pos;
732 	}
733 
734 	return ret_pos;
735 }
736 /* }}} */
737 
738 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
739 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)740 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
741 {
742 	int pos = 0, prev_pos = 0;
743 	int ret_pos = 0, prev_ret_pos = 0;
744 
745 	while ( 1 ) {
746 		pos = ubrk_next(bi);
747 
748 		if ( UBRK_DONE == pos ) {
749 			break;
750 		}
751 
752 		prev_ret_pos = ret_pos;
753 		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
754 
755 		if ( ret_pos > bsize ) {
756 			ret_pos = prev_ret_pos;
757 			break;
758 		}
759 
760 		if ( prev_ret_pos == ret_pos ) {
761 			/* something wrong - malformed utf8? */
762 			break;
763 		}
764 
765 		prev_pos = pos;
766 	}
767 
768 	return ret_pos;
769 }
770 /* }}} */
771 
772 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
773 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)774 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
775 {
776 	int pos = 0, next_pos = 0;
777 	int ret_pos = 0;
778 
779 	while ( size ) {
780 		next_pos = ubrk_next(bi);
781 
782 		if ( UBRK_DONE == next_pos ) {
783 			break;
784 		}
785 		pos = next_pos;
786 		size--;
787 	}
788 
789 	/* pos is one past the last UChar - and represent the number of code units to
790 		advance in the utf-8 buffer
791 	*/
792 
793 	U8_FWD_N(pstr, ret_pos, str_len, pos);
794 
795 	return ret_pos;
796 }
797 /* }}} */
798 
799 /* {{{ grapheme extract iter function pointer array */
800 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
801 
802 static grapheme_extract_iter grapheme_extract_iters[] = {
803 	&grapheme_extract_count_iter,
804 	&grapheme_extract_bytecount_iter,
805 	&grapheme_extract_charcount_iter,
806 };
807 /* }}} */
808 
809 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
810 	Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)811 PHP_FUNCTION(grapheme_extract)
812 {
813 	unsigned char *str, *pstr;
814 	UChar *ustr;
815 	int str_len, ustr_len;
816 	long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
817 	long lstart = 0; /* starting position in str in bytes */
818 	int32_t start = 0;
819 	long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
820 	UErrorCode status;
821 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
822 	UBreakIterator* bi = NULL;
823 	int ret_pos;
824 	zval *next = NULL; /* return offset of next part of the string */
825 
826 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
827 
828 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
829 			 "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
830 
831 		RETURN_FALSE;
832 	}
833 
834 	if ( NULL != next ) {
835 		if ( !PZVAL_IS_REF(next) ) {
836 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
837 				 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
838 
839 			RETURN_FALSE;
840 		}
841 		else {
842 			/* initialize next */
843 			zval_dtor(next);
844             ZVAL_LONG(next, lstart);
845 		}
846 	}
847 
848 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
849 
850 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
851 			 "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
852 
853 		RETURN_FALSE;
854 	}
855 
856 	if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
857 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
858 		RETURN_FALSE;
859 	}
860 
861 	if ( size > INT32_MAX || size < 0) {
862 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
863 		RETURN_FALSE;
864 	}
865 	if (size == 0) {
866 		RETURN_EMPTY_STRING();
867 	}
868 
869 	/* we checked that it will fit: */
870 	start = (int32_t) lstart;
871 
872 	pstr = str + start;
873 
874 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
875 	if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
876 		unsigned char *str_end = str + str_len;
877 
878 		while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
879 			pstr++;
880 			if ( pstr >= str_end ) {
881 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
882 								"grapheme_extract: invalid input string", 0 TSRMLS_CC );
883 
884 				RETURN_FALSE;
885 			}
886 		}
887 	}
888 
889 	str_len -= (pstr - str);
890 
891 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
892 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
893 	 */
894 
895 	if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
896         long nsize = ( size < str_len ? size : str_len );
897 		if ( NULL != next ) {
898 			ZVAL_LONG(next, start+nsize);
899 		}
900 		RETURN_STRINGL(((char *)pstr), nsize, 1);
901 	}
902 
903 	/* convert the strings to UTF-16. */
904 	ustr = NULL;
905 	ustr_len = 0;
906 	status = U_ZERO_ERROR;
907 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
908 
909 	if ( U_FAILURE( status ) ) {
910 		/* Set global error code. */
911 		intl_error_set_code( NULL, status TSRMLS_CC );
912 
913 		/* Set error messages. */
914 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
915 
916 		if ( NULL != ustr )
917 			efree( ustr );
918 
919 		RETURN_FALSE;
920 	}
921 
922 	bi = NULL;
923 	status = U_ZERO_ERROR;
924 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
925 
926 	ubrk_setText(bi, ustr, ustr_len, &status);
927 
928 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
929 		can't back up. So, we will not do anything. */
930 
931 	/* now we need to find the end of the chunk the user wants us to return */
932 
933 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
934 
935 	if (ustr) {
936 		efree(ustr);
937 	}
938 	ubrk_close(bi);
939 
940 	if ( NULL != next ) {
941 		ZVAL_LONG(next, start+ret_pos);
942 	}
943 
944 	RETURN_STRINGL(((char *)pstr), ret_pos, 1);
945 }
946 
947 /* }}} */
948 
949 /*
950  * Local variables:
951  * tab-width: 4
952  * c-basic-offset: 4
953  * End:
954  * vim600: fdm=marker
955  * vim: noet sw=4 ts=4
956  */
957 
958