xref: /PHP-5.3/ext/intl/grapheme/grapheme_string.c (revision 9762609c)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Author: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25 
26 #include <unicode/utypes.h>
27 #include <unicode/ucol.h>
28 #include <unicode/ustring.h>
29 #include <unicode/ubrk.h>
30 
31 #include "ext/standard/php_string.h"
32 
33 /* }}} */
34 
/* Extraction modes understood by grapheme_extract(); they are exported to
 * userland as the GRAPHEME_EXTR_* constants and index the iterator table. */
#define GRAPHEME_EXTRACT_TYPE_COUNT		0
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES	1
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS	2

/* Inclusive bounds used to validate a caller-supplied extraction mode. */
#define GRAPHEME_EXTRACT_TYPE_MIN	GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX	GRAPHEME_EXTRACT_TYPE_MAXCHARS
40 
41 
42 /* {{{ grapheme_register_constants
43  * Register API constants
44  */
grapheme_register_constants(INIT_FUNC_ARGS)45 void grapheme_register_constants( INIT_FUNC_ARGS )
46 {
47 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49 	REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50 }
51 /* }}} */
52 
53 /* {{{ proto int grapheme_strlen(string str)
54    Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)55 PHP_FUNCTION(grapheme_strlen)
56 {
57 	unsigned char* string;
58 	int string_len;
59 	UChar* ustring = NULL;
60 	int ustring_len = 0;
61 	int ret_len;
62 	UErrorCode status;
63 
64 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65 
66 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67 			 "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68 
69 		RETURN_FALSE;
70 	}
71 
72 	ret_len = grapheme_ascii_check(string, string_len);
73 
74 	if ( ret_len >= 0 )
75 		RETURN_LONG(ret_len);
76 
77 	/* convert the string to UTF-16. */
78 	status = U_ZERO_ERROR;
79 	intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80 
81 	if ( U_FAILURE( status ) ) {
82 		/* Set global error code. */
83 		intl_error_set_code( NULL, status TSRMLS_CC );
84 
85 		/* Set error messages. */
86 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87 		if (ustring) {
88 			efree( ustring );
89 		}
90 		RETURN_NULL();
91 	}
92 
93 	ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
94 
95 	if (ustring) {
96 		efree( ustring );
97 	}
98 
99 	if (ret_len >= 0) {
100 		RETVAL_LONG(ret_len);
101 	} else {
102 		RETVAL_FALSE;
103 	}
104 }
105 /* }}} */
106 
107 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
108    Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)109 PHP_FUNCTION(grapheme_strpos)
110 {
111 	unsigned char *haystack, *needle;
112 	int haystack_len, needle_len;
113 	unsigned char *found;
114 	long loffset = 0;
115 	int32_t offset = 0;
116 	int ret_pos, uchar_pos;
117 
118 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
119 
120 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
121 			 "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
122 
123 		RETURN_FALSE;
124 	}
125 
126 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127 
128 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
129 
130 		RETURN_FALSE;
131 	}
132 
133 	/* we checked that it will fit: */
134 	offset = (int32_t) loffset;
135 
136 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
137 
138 	if (needle_len == 0) {
139 
140 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
141 
142 		RETURN_FALSE;
143 	}
144 
145 
146 	/* quick check to see if the string might be there
147 	 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
148 	*/
149 	found = (unsigned char *)php_memnstr((char *)haystack + offset, (char *)needle, needle_len, (char *)haystack + haystack_len);
150 
151 	/* if it isn't there the we are done */
152 	if (!found) {
153 		RETURN_FALSE;
154 	}
155 
156 	/* if it is there, and if the haystack is ascii, we are all done */
157 	if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
158 
159 		RETURN_LONG(found - haystack);
160 	}
161 
162 	/* do utf16 part of the strpos */
163 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 0 /* fIgnoreCase */ TSRMLS_CC );
164 
165 	if ( ret_pos >= 0 ) {
166 		RETURN_LONG(ret_pos + offset);
167 	} else {
168 		RETURN_FALSE;
169 	}
170 
171 }
172 /* }}} */
173 
174 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
175    Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)176 PHP_FUNCTION(grapheme_stripos)
177 {
178 	unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
179 	int haystack_len, needle_len;
180 	unsigned char *found;
181 	long loffset = 0;
182 	int32_t offset = 0;
183 	int ret_pos, uchar_pos;
184 	int is_ascii;
185 
186 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
187 
188 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
189 			 "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
190 
191 		RETURN_FALSE;
192 	}
193 
194 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
195 
196 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
197 
198 		RETURN_FALSE;
199 	}
200 
201 	/* we checked that it will fit: */
202 	offset = (int32_t) loffset;
203 
204 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
205 
206 	if (needle_len == 0) {
207 
208 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
209 
210 		RETURN_FALSE;
211 	}
212 
213 
214 	is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
215 
216 	if ( is_ascii ) {
217 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
218 		php_strtolower((char *)needle_dup, needle_len);
219 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
220 		php_strtolower((char *)haystack_dup, haystack_len);
221 
222 		found = (unsigned char*) php_memnstr((char *)haystack_dup + offset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
223 
224 		efree(haystack_dup);
225 		efree(needle_dup);
226 
227 		if (found) {
228 			RETURN_LONG(found - haystack_dup);
229 		}
230 
231 		/* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
232 		if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
233 			RETURN_FALSE;
234 		}
235 	}
236 
237 	/* do utf16 part of the strpos */
238 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, &uchar_pos, 1 /* fIgnoreCase */ TSRMLS_CC );
239 
240 	if ( ret_pos >= 0 ) {
241 		RETURN_LONG(ret_pos + offset);
242 	} else {
243 		RETURN_FALSE;
244 	}
245 
246 }
247 /* }}} */
248 
249 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
250    Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)251 PHP_FUNCTION(grapheme_strrpos)
252 {
253 	unsigned char *haystack, *needle;
254 	int haystack_len, needle_len;
255 	long loffset = 0;
256 	int32_t offset = 0;
257 	int32_t ret_pos;
258 	int is_ascii;
259 
260 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
261 
262 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
263 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
264 
265 		RETURN_FALSE;
266 	}
267 
268 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
269 
270 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
271 
272 		RETURN_FALSE;
273 	}
274 
275 	/* we checked that it will fit: */
276 	offset = (int32_t) loffset;
277 
278 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
279 
280 	if (needle_len == 0) {
281 
282 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
283 
284 		RETURN_FALSE;
285 	}
286 
287 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
288 
289 	if ( is_ascii ) {
290 
291 		ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
292 
293 
294 		if ( ret_pos >= 0 ) {
295 			RETURN_LONG(ret_pos);
296 		}
297 
298 		/* if the needle was ascii too, we are done */
299 
300 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
301 			RETURN_FALSE;
302 		}
303 
304 		/* else we need to continue via utf16 */
305 	}
306 
307 	ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 0 /* f_ignore_case */ TSRMLS_CC);
308 
309 	if ( ret_pos >= 0 ) {
310 		RETURN_LONG(ret_pos);
311 	} else {
312 		RETURN_FALSE;
313 	}
314 
315 
316 }
317 /* }}} */
318 
319 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
320    Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)321 PHP_FUNCTION(grapheme_strripos)
322 {
323 	unsigned char *haystack, *needle;
324 	int haystack_len, needle_len;
325 	long loffset = 0;
326 	int32_t offset = 0;
327 	int32_t ret_pos;
328 	int is_ascii;
329 
330 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
331 
332 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
333 			 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
334 
335 		RETURN_FALSE;
336 	}
337 
338 	if ( OUTSIDE_STRING(loffset, haystack_len) ) {
339 
340 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
341 
342 		RETURN_FALSE;
343 	}
344 
345 	/* we checked that it will fit: */
346 	offset = (int32_t) loffset;
347 
348 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
349 
350 	if (needle_len == 0) {
351 
352 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
353 
354 		RETURN_FALSE;
355 	}
356 
357 	is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
358 
359 	if ( is_ascii ) {
360 		unsigned char *needle_dup, *haystack_dup;
361 
362 		needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
363 		php_strtolower((char *)needle_dup, needle_len);
364 		haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
365 		php_strtolower((char *)haystack_dup, haystack_len);
366 
367 		ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
368 
369 		efree(haystack_dup);
370 		efree(needle_dup);
371 
372 		if ( ret_pos >= 0 ) {
373 			RETURN_LONG(ret_pos);
374 		}
375 
376 		/* if the needle was ascii too, we are done */
377 
378 		if (  grapheme_ascii_check(needle, needle_len) >= 0 ) {
379 			RETURN_FALSE;
380 		}
381 
382 		/* else we need to continue via utf16 */
383 	}
384 
385 	ret_pos = grapheme_strrpos_utf16(haystack, haystack_len, needle, needle_len, offset, 1 /* f_ignore_case */ TSRMLS_CC);
386 
387 	if ( ret_pos >= 0 ) {
388 		RETURN_LONG(ret_pos);
389 	} else {
390 		RETURN_FALSE;
391 	}
392 
393 
394 }
395 /* }}} */
396 
397 /* {{{ proto string grapheme_substr(string str, int start [, int length])
398    Returns part of a string */
PHP_FUNCTION(grapheme_substr)399 PHP_FUNCTION(grapheme_substr)
400 {
401 	unsigned char *str, *sub_str;
402 	UChar *ustr;
403 	int str_len, sub_str_len, ustr_len;
404 	long lstart = 0, length = 0;
405 	int32_t start = 0;
406 	int iter_val;
407 	UErrorCode status;
408 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
409 	UBreakIterator* bi = NULL;
410 	int sub_str_start_pos, sub_str_end_pos;
411 	int32_t (*iter_func)(UBreakIterator *);
412 
413 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
414 
415 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
416 			 "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
417 
418 		RETURN_FALSE;
419 	}
420 
421 	if ( OUTSIDE_STRING(lstart, str_len) ) {
422 
423 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
424 
425 		RETURN_FALSE;
426 	}
427 
428 	/* we checked that it will fit: */
429 	start = (int32_t) lstart;
430 
431 	/* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
432 
433 	if ( grapheme_ascii_check(str, str_len) >= 0 ) {
434 		grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
435 
436 		if ( NULL == sub_str ) {
437 			RETURN_FALSE;
438 		}
439 
440 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
441 	}
442 
443 	ustr = NULL;
444 	ustr_len = 0;
445 	status = U_ZERO_ERROR;
446 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
447 
448 	if ( U_FAILURE( status ) ) {
449 		/* Set global error code. */
450 		intl_error_set_code( NULL, status TSRMLS_CC );
451 
452 		/* Set error messages. */
453 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
454 		if (ustr) {
455 			efree( ustr );
456 		}
457 		RETURN_FALSE;
458 	}
459 
460 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
461 
462 	if( U_FAILURE(status) ) {
463 		RETURN_FALSE;
464 	}
465 
466 	ubrk_setText(bi, ustr, ustr_len,	&status);
467 
468 	if ( start < 0 ) {
469 		iter_func = ubrk_previous;
470 		ubrk_last(bi);
471 		iter_val = 1;
472 	}
473 	else {
474 		iter_func = ubrk_next;
475 		iter_val = -1;
476 	}
477 
478 	sub_str_start_pos = 0;
479 
480 	while ( start ) {
481 		sub_str_start_pos = iter_func(bi);
482 
483 		if ( UBRK_DONE == sub_str_start_pos ) {
484 			break;
485 		}
486 
487 		start += iter_val;
488 	}
489 
490 	if ( 0 != start || sub_str_start_pos >= ustr_len ) {
491 
492 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
493 
494 		if (ustr) {
495 			efree(ustr);
496 		}
497 		ubrk_close(bi);
498 		RETURN_FALSE;
499 	}
500 
501 	if (ZEND_NUM_ARGS() <= 2) {
502 
503 		/* no length supplied, return the rest of the string */
504 
505 		sub_str = NULL;
506 		sub_str_len = 0;
507 		status = U_ZERO_ERROR;
508 		intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
509 
510 		if (ustr) {
511 			efree( ustr );
512 		}
513 		ubrk_close( bi );
514 
515 		if ( U_FAILURE( status ) ) {
516 			/* Set global error code. */
517 			intl_error_set_code( NULL, status TSRMLS_CC );
518 
519 			/* Set error messages. */
520 			intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
521 
522 			if (sub_str) {
523 				efree( sub_str );
524 			}
525 
526 			RETURN_FALSE;
527 		}
528 
529 		/* return the allocated string, not a duplicate */
530 		RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
531 	}
532 
533 	/* find the end point of the string to return */
534 
535 	if ( length < 0 ) {
536 		iter_func = ubrk_previous;
537 		ubrk_last(bi);
538 		iter_val = 1;
539 	}
540 	else {
541 		iter_func = ubrk_next;
542 		iter_val = -1;
543 	}
544 
545 	sub_str_end_pos = 0;
546 
547 	while ( length ) {
548 		sub_str_end_pos = iter_func(bi);
549 
550 		if ( UBRK_DONE == sub_str_end_pos ) {
551 			break;
552 		}
553 
554 		length += iter_val;
555 	}
556 
557 	if ( UBRK_DONE == sub_str_end_pos && length < 0) {
558 
559 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
560 
561 		efree(ustr);
562 		ubrk_close(bi);
563 		RETURN_FALSE;
564 	}
565 
566 	sub_str = NULL;
567 	status = U_ZERO_ERROR;
568 	intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
569 
570 	efree( ustr );
571 	ubrk_close( bi );
572 
573 	if ( U_FAILURE( status ) ) {
574 		/* Set global error code. */
575 		intl_error_set_code( NULL, status TSRMLS_CC );
576 
577 		/* Set error messages. */
578 		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
579 
580 		if ( NULL != sub_str )
581 			efree( sub_str );
582 
583 		RETURN_FALSE;
584 	}
585 
586 	 /* return the allocated string, not a duplicate */
587 	RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
588 
589 }
590 /* }}} */
591 
592 /* {{{	strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)593 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
594 {
595 	unsigned char *haystack, *needle, *found;
596 	int haystack_len, needle_len;
597 	int ret_pos, uchar_pos;
598 	zend_bool part = 0;
599 
600 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
601 
602 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
603 			 "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
604 
605 		RETURN_FALSE;
606 	}
607 
608 	if (needle_len == 0) {
609 
610 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
611 
612 		RETURN_FALSE;
613 	}
614 
615 
616 	if ( !f_ignore_case ) {
617 
618 		/* ASCII optimization: quick check to see if the string might be there
619 		 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
620 		*/
621 		found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
622 
623 		/* if it isn't there the we are done */
624 		if ( !found ) {
625 			RETURN_FALSE;
626 		}
627 
628 		/* if it is there, and if the haystack is ascii, we are all done */
629 		if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
630 			size_t found_offset = found - haystack;
631 
632 			if (part) {
633 				RETURN_STRINGL(((char *)haystack) , found_offset, 1);
634 			} else {
635 				RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
636 			}
637 		}
638 
639 	}
640 
641 	/* need to work in utf16 */
642 	ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case TSRMLS_CC );
643 
644 	if ( ret_pos < 0 ) {
645 		RETURN_FALSE;
646 	}
647 
648 	/* uchar_pos is the 'nth' Unicode character position of the needle */
649 
650 	ret_pos = 0;
651 	U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
652 
653 	if (part) {
654 		RETURN_STRINGL(((char *)haystack), ret_pos, 1);
655 	}
656 	else {
657 		RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
658 	}
659 
660 }
661 /* }}} */
662 
663 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
664    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)665 PHP_FUNCTION(grapheme_strstr)
666 {
667 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
668 }
669 /* }}} */
670 
671 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
672    Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)673 PHP_FUNCTION(grapheme_stristr)
674 {
675 	strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
676 }
677 /* }}} */
678 
679 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
680 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)681 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
682 {
683 	int pos = 0, prev_pos = 0;
684 	int ret_pos = 0, prev_ret_pos = 0;
685 
686 	while ( 1 ) {
687 		pos = ubrk_next(bi);
688 
689 		if ( UBRK_DONE == pos ) {
690 			break;
691 		}
692 
693 		/* if we are beyond our limit, then the loop is done */
694 		if ( pos > csize ) {
695 			break;
696 		}
697 
698 		/* update our pointer in the original UTF-8 buffer by as many characters
699 		   as ubrk_next iterated over */
700 
701 		prev_ret_pos = ret_pos;
702 		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
703 
704 		if ( prev_ret_pos == ret_pos ) {
705 			/* something wrong - malformed utf8? */
706 			break;
707 		}
708 
709 		prev_pos = pos;
710 	}
711 
712 	return ret_pos;
713 }
714 /* }}} */
715 
716 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
717 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)718 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
719 {
720 	int pos = 0, prev_pos = 0;
721 	int ret_pos = 0, prev_ret_pos = 0;
722 
723 	while ( 1 ) {
724 		pos = ubrk_next(bi);
725 
726 		if ( UBRK_DONE == pos ) {
727 			break;
728 		}
729 
730 		prev_ret_pos = ret_pos;
731 		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);
732 
733 		if ( ret_pos > bsize ) {
734 			ret_pos = prev_ret_pos;
735 			break;
736 		}
737 
738 		if ( prev_ret_pos == ret_pos ) {
739 			/* something wrong - malformed utf8? */
740 			break;
741 		}
742 
743 		prev_pos = pos;
744 	}
745 
746 	return ret_pos;
747 }
748 /* }}} */
749 
750 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
751 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)752 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
753 {
754 	int pos = 0, next_pos = 0;
755 	int ret_pos = 0;
756 
757 	while ( size ) {
758 		next_pos = ubrk_next(bi);
759 
760 		if ( UBRK_DONE == next_pos ) {
761 			break;
762 		}
763 		pos = next_pos;
764 		size--;
765 	}
766 
767 	/* pos is one past the last UChar - and represent the number of code units to
768 		advance in the utf-8 buffer
769 	*/
770 
771 	U8_FWD_N(pstr, ret_pos, str_len, pos);
772 
773 	return ret_pos;
774 }
775 /* }}} */
776 
777 /* {{{ grapheme extract iter function pointer array */
778 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
779 
780 static grapheme_extract_iter grapheme_extract_iters[] = {
781 	&grapheme_extract_count_iter,
782 	&grapheme_extract_bytecount_iter,
783 	&grapheme_extract_charcount_iter,
784 };
785 /* }}} */
786 
787 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
788 	Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)789 PHP_FUNCTION(grapheme_extract)
790 {
791 	unsigned char *str, *pstr;
792 	UChar *ustr;
793 	int str_len, ustr_len;
794 	long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
795 	long lstart = 0; /* starting position in str in bytes */
796 	int32_t start = 0;
797 	long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
798 	UErrorCode status;
799 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
800 	UBreakIterator* bi = NULL;
801 	int ret_pos;
802 	zval *next = NULL; /* return offset of next part of the string */
803 
804 	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
805 
806 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
807 			 "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
808 
809 		RETURN_FALSE;
810 	}
811 
812 	if ( NULL != next ) {
813 		if ( !PZVAL_IS_REF(next) ) {
814 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
815 				 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
816 
817 			RETURN_FALSE;
818 		}
819 		else {
820 			/* initialize next */
821 			zval_dtor(next);
822             ZVAL_LONG(next, lstart);
823 		}
824 	}
825 
826 	if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
827 
828 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
829 			 "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
830 
831 		RETURN_FALSE;
832 	}
833 
834 	if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
835 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
836 		RETURN_FALSE;
837 	}
838 
839 	if ( size > INT32_MAX || size < 0) {
840 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
841 		RETURN_FALSE;
842 	}
843 	if (size == 0) {
844 		RETURN_EMPTY_STRING();
845 	}
846 
847 	/* we checked that it will fit: */
848 	start = (int32_t) lstart;
849 
850 	pstr = str + start;
851 
852 	/* just in case pstr points in the middle of a character, move forward to the start of the next char */
853 	if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
854 		unsigned char *str_end = str + str_len;
855 
856 		while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
857 			pstr++;
858 			if ( pstr >= str_end ) {
859 				intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
860 								"grapheme_extract: invalid input string", 0 TSRMLS_CC );
861 
862 				RETURN_FALSE;
863 			}
864 		}
865 	}
866 
867 	str_len -= (pstr - str);
868 
869 	/* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
870 		(size + 1 because the size-th character might be the beginning of a grapheme cluster)
871 	 */
872 
873 	if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
874         long nsize = ( size < str_len ? size : str_len );
875 		if ( NULL != next ) {
876 			ZVAL_LONG(next, start+nsize);
877 		}
878 		RETURN_STRINGL(((char *)pstr), nsize, 1);
879 	}
880 
881 	/* convert the strings to UTF-16. */
882 	ustr = NULL;
883 	ustr_len = 0;
884 	status = U_ZERO_ERROR;
885 	intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)pstr, str_len, &status );
886 
887 	if ( U_FAILURE( status ) ) {
888 		/* Set global error code. */
889 		intl_error_set_code( NULL, status TSRMLS_CC );
890 
891 		/* Set error messages. */
892 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
893 
894 		if ( NULL != ustr )
895 			efree( ustr );
896 
897 		RETURN_FALSE;
898 	}
899 
900 	bi = NULL;
901 	status = U_ZERO_ERROR;
902 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
903 
904 	ubrk_setText(bi, ustr, ustr_len, &status);
905 
906 	/* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
907 		can't back up. So, we will not do anything. */
908 
909 	/* now we need to find the end of the chunk the user wants us to return */
910 
911 	ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
912 
913 	if (ustr) {
914 		efree(ustr);
915 	}
916 	ubrk_close(bi);
917 
918 	if ( NULL != next ) {
919 		ZVAL_LONG(next, start+ret_pos);
920 	}
921 
922 	RETURN_STRINGL(((char *)pstr), ret_pos, 1);
923 }
924 
925 /* }}} */
926 
927 /*
928  * Local variables:
929  * tab-width: 4
930  * c-basic-offset: 4
931  * End:
932  * vim600: fdm=marker
933  * vim: noet sw=4 ts=4
934  */
935 
936