1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Ed Batutis <ed@batutis.com> |
14 +----------------------------------------------------------------------+
15 */
16
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25
26 #include <unicode/utypes.h>
27 #include <unicode/ucol.h>
28 #include <unicode/ustring.h>
29 #include <unicode/ubrk.h>
30
31 #include "ext/standard/php_string.h"
32
33 /* }}} */
34
35 #define GRAPHEME_EXTRACT_TYPE_COUNT 0
36 #define GRAPHEME_EXTRACT_TYPE_MAXBYTES 1
37 #define GRAPHEME_EXTRACT_TYPE_MAXCHARS 2
38 #define GRAPHEME_EXTRACT_TYPE_MIN GRAPHEME_EXTRACT_TYPE_COUNT
39 #define GRAPHEME_EXTRACT_TYPE_MAX GRAPHEME_EXTRACT_TYPE_MAXCHARS
40
41
42 /* {{{ grapheme_register_constants
43 * Register API constants
44 */
grapheme_register_constants(INIT_FUNC_ARGS)45 void grapheme_register_constants( INIT_FUNC_ARGS )
46 {
47 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50 }
51 /* }}} */
52
53 /* {{{ proto size_t grapheme_strlen(string str)
54 Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)55 PHP_FUNCTION(grapheme_strlen)
56 {
57 char* string;
58 size_t string_len;
59 UChar* ustring = NULL;
60 int ustring_len = 0;
61 zend_long ret_len;
62 UErrorCode status;
63
64 if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &string, &string_len) == FAILURE) {
65 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
66 "grapheme_strlen: unable to parse input param", 0 );
67 RETURN_FALSE;
68 }
69
70 ret_len = grapheme_ascii_check((unsigned char *)string, string_len);
71
72 if ( ret_len >= 0 )
73 RETURN_LONG(string_len);
74
75 /* convert the string to UTF-16. */
76 status = U_ZERO_ERROR;
77 intl_convert_utf8_to_utf16(&ustring, &ustring_len, string, string_len, &status );
78
79 if ( U_FAILURE( status ) ) {
80 /* Set global error code. */
81 intl_error_set_code( NULL, status );
82
83 /* Set error messages. */
84 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
85 if (ustring) {
86 efree( ustring );
87 }
88 RETURN_NULL();
89 }
90
91 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 );
92
93 if (ustring) {
94 efree( ustring );
95 }
96
97 if (ret_len >= 0) {
98 RETVAL_LONG(ret_len);
99 } else {
100 RETVAL_FALSE;
101 }
102 }
103 /* }}} */
104
105 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
106 Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)107 PHP_FUNCTION(grapheme_strpos)
108 {
109 char *haystack, *needle;
110 size_t haystack_len, needle_len;
111 const char *found;
112 zend_long loffset = 0;
113 int32_t offset = 0;
114 size_t noffset = 0;
115 zend_long ret_pos;
116
117 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
118 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
119 "grapheme_strpos: unable to parse input param", 0 );
120 RETURN_FALSE;
121 }
122
123 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
124 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
125 RETURN_FALSE;
126 }
127
128 /* we checked that it will fit: */
129 offset = (int32_t) loffset;
130 noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
131
132 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
133
134 if (needle_len == 0) {
135 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
136 RETURN_FALSE;
137 }
138
139 if (offset >= 0) {
140 /* quick check to see if the string might be there
141 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
142 */
143 found = php_memnstr(haystack + noffset, needle, needle_len, haystack + haystack_len);
144
145 /* if it isn't there the we are done */
146 if (!found) {
147 RETURN_FALSE;
148 }
149
150 /* if it is there, and if the haystack is ascii, we are all done */
151 if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
152 RETURN_LONG(found - haystack);
153 }
154 }
155
156 /* do utf16 part of the strpos */
157 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ );
158
159 if ( ret_pos >= 0 ) {
160 RETURN_LONG(ret_pos);
161 } else {
162 RETURN_FALSE;
163 }
164
165 }
166 /* }}} */
167
168 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
169 Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)170 PHP_FUNCTION(grapheme_stripos)
171 {
172 char *haystack, *needle, *haystack_dup, *needle_dup;
173 size_t haystack_len, needle_len;
174 const char *found;
175 zend_long loffset = 0;
176 int32_t offset = 0;
177 zend_long ret_pos;
178 int is_ascii;
179
180 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
181 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
182 "grapheme_stripos: unable to parse input param", 0 );
183 RETURN_FALSE;
184 }
185
186 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
187 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 );
188 RETURN_FALSE;
189 }
190
191 /* we checked that it will fit: */
192 offset = (int32_t) loffset;
193
194 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
195
196 if (needle_len == 0) {
197 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 );
198 RETURN_FALSE;
199 }
200
201 is_ascii = ( grapheme_ascii_check((unsigned char*)haystack, haystack_len) >= 0 );
202
203 if ( is_ascii ) {
204 int32_t noffset = offset >= 0 ? offset : (int32_t)haystack_len + offset;
205 needle_dup = estrndup(needle, needle_len);
206 php_strtolower(needle_dup, needle_len);
207 haystack_dup = estrndup(haystack, haystack_len);
208 php_strtolower(haystack_dup, haystack_len);
209
210 found = php_memnstr(haystack_dup + noffset, needle_dup, needle_len, haystack_dup + haystack_len);
211
212 efree(haystack_dup);
213 efree(needle_dup);
214
215 if (found) {
216 RETURN_LONG(found - haystack_dup);
217 }
218
219 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
220 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
221 RETURN_FALSE;
222 }
223 }
224
225 /* do utf16 part of the strpos */
226 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ );
227
228 if ( ret_pos >= 0 ) {
229 RETURN_LONG(ret_pos);
230 } else {
231 RETURN_FALSE;
232 }
233
234 }
235 /* }}} */
236
237 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
238 Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)239 PHP_FUNCTION(grapheme_strrpos)
240 {
241 char *haystack, *needle;
242 size_t haystack_len, needle_len;
243 zend_long loffset = 0;
244 int32_t offset = 0;
245 zend_long ret_pos;
246 int is_ascii;
247
248 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
249 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
250 "grapheme_strrpos: unable to parse input param", 0 );
251 RETURN_FALSE;
252 }
253
254 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
255 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
256 RETURN_FALSE;
257 }
258
259 /* we checked that it will fit: */
260 offset = (int32_t) loffset;
261
262 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
263
264 if (needle_len == 0) {
265 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
266 RETURN_FALSE;
267 }
268
269 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
270
271 if ( is_ascii ) {
272
273 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
274
275 if ( ret_pos >= 0 ) {
276 RETURN_LONG(ret_pos);
277 }
278
279 /* if the needle was ascii too, we are done */
280
281 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
282 RETURN_FALSE;
283 }
284
285 /* else we need to continue via utf16 */
286 }
287
288 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */);
289
290 if ( ret_pos >= 0 ) {
291 RETURN_LONG(ret_pos);
292 } else {
293 RETURN_FALSE;
294 }
295
296
297 }
298 /* }}} */
299
300 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
301 Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)302 PHP_FUNCTION(grapheme_strripos)
303 {
304 char *haystack, *needle;
305 size_t haystack_len, needle_len;
306 zend_long loffset = 0;
307 int32_t offset = 0;
308 zend_long ret_pos;
309 int is_ascii;
310
311 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|l", &haystack, &haystack_len, &needle, &needle_len, &loffset) == FAILURE) {
312 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
313 "grapheme_strrpos: unable to parse input param", 0 );
314 RETURN_FALSE;
315 }
316
317 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
318 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 );
319 RETURN_FALSE;
320 }
321
322 /* we checked that it will fit: */
323 offset = (int32_t) loffset;
324
325 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
326
327 if (needle_len == 0) {
328 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
329 RETURN_FALSE;
330 }
331
332 is_ascii = grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0;
333
334 if ( is_ascii ) {
335 char *needle_dup, *haystack_dup;
336
337 needle_dup = estrndup(needle, needle_len);
338 php_strtolower(needle_dup, needle_len);
339 haystack_dup = estrndup(haystack, haystack_len);
340 php_strtolower(haystack_dup, haystack_len);
341
342 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
343
344 efree(haystack_dup);
345 efree(needle_dup);
346
347 if ( ret_pos >= 0 ) {
348 RETURN_LONG(ret_pos);
349 }
350
351 /* if the needle was ascii too, we are done */
352
353 if ( grapheme_ascii_check((unsigned char *)needle, needle_len) >= 0 ) {
354 RETURN_FALSE;
355 }
356
357 /* else we need to continue via utf16 */
358 }
359
360 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */);
361
362 if ( ret_pos >= 0 ) {
363 RETURN_LONG(ret_pos);
364 } else {
365 RETURN_FALSE;
366 }
367
368
369 }
370 /* }}} */
371
372 /* {{{ proto string grapheme_substr(string str, int start [, int length])
373 Returns part of a string */
PHP_FUNCTION(grapheme_substr)374 PHP_FUNCTION(grapheme_substr)
375 {
376 char *str;
377 zend_string *u8_sub_str;
378 UChar *ustr;
379 size_t str_len;
380 int32_t ustr_len;
381 zend_long lstart = 0, length = 0;
382 int32_t start = 0;
383 int iter_val;
384 UErrorCode status;
385 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
386 UBreakIterator* bi = NULL;
387 int sub_str_start_pos, sub_str_end_pos;
388 int32_t (*iter_func)(UBreakIterator *);
389 zend_bool no_length = 1;
390
391 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|l!", &str, &str_len, &lstart, &length, &no_length) == FAILURE) {
392 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
393 "grapheme_substr: unable to parse input param", 0 );
394 RETURN_FALSE;
395 }
396
397 if ( OUTSIDE_STRING(lstart, str_len)) {
398 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
399 RETURN_FALSE;
400 }
401
402 /* we checked that it will fit: */
403 start = (int32_t) lstart;
404
405 if(no_length) {
406 length = str_len;
407 }
408
409 if(length < INT32_MIN) {
410 length = INT32_MIN;
411 } else if(length > INT32_MAX) {
412 length = INT32_MAX;
413 }
414
415 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
416
417 if ( grapheme_ascii_check((unsigned char *)str, str_len) >= 0 ) {
418 int32_t asub_str_len;
419 char *sub_str;
420 grapheme_substr_ascii(str, str_len, start, (int32_t)length, &sub_str, &asub_str_len);
421
422 if ( NULL == sub_str ) {
423 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 );
424 RETURN_FALSE;
425 }
426
427 RETURN_STRINGL(sub_str, asub_str_len);
428 }
429
430 ustr = NULL;
431 ustr_len = 0;
432 status = U_ZERO_ERROR;
433 intl_convert_utf8_to_utf16(&ustr, &ustr_len, str, str_len, &status);
434
435 if ( U_FAILURE( status ) ) {
436 /* Set global error code. */
437 intl_error_set_code( NULL, status );
438
439 /* Set error messages. */
440 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
441 if (ustr) {
442 efree( ustr );
443 }
444 RETURN_FALSE;
445 }
446
447 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
448
449 if( U_FAILURE(status) ) {
450 RETURN_FALSE;
451 }
452
453 ubrk_setText(bi, ustr, ustr_len, &status);
454
455 if ( start < 0 ) {
456 iter_func = ubrk_previous;
457 ubrk_last(bi);
458 iter_val = 1;
459 }
460 else {
461 iter_func = ubrk_next;
462 iter_val = -1;
463 }
464
465 sub_str_start_pos = 0;
466
467 while ( start ) {
468 sub_str_start_pos = iter_func(bi);
469
470 if ( UBRK_DONE == sub_str_start_pos ) {
471 break;
472 }
473
474 start += iter_val;
475 }
476
477 if ( 0 != start || sub_str_start_pos >= ustr_len ) {
478
479 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 );
480
481 if (ustr) {
482 efree(ustr);
483 }
484 ubrk_close(bi);
485 RETURN_FALSE;
486 }
487
488 /* OK to convert here since if str_len were big, convert above would fail */
489 if (length >= (int32_t)str_len) {
490
491 /* no length supplied or length is too big, return the rest of the string */
492
493 status = U_ZERO_ERROR;
494 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
495
496 if (ustr) {
497 efree( ustr );
498 }
499 ubrk_close( bi );
500
501 if ( !u8_sub_str ) {
502 /* Set global error code. */
503 intl_error_set_code( NULL, status );
504
505 /* Set error messages. */
506 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
507
508 RETURN_FALSE;
509 }
510
511 /* return the allocated string, not a duplicate */
512 RETVAL_NEW_STR(u8_sub_str);
513 return;
514 }
515
516 if(length == 0) {
517 /* empty length - we've validated start, we can return "" now */
518 if (ustr) {
519 efree(ustr);
520 }
521 ubrk_close(bi);
522 RETURN_EMPTY_STRING();
523 }
524
525 /* find the end point of the string to return */
526
527 if ( length < 0 ) {
528 iter_func = ubrk_previous;
529 ubrk_last(bi);
530 iter_val = 1;
531 }
532 else {
533 iter_func = ubrk_next;
534 iter_val = -1;
535 }
536
537 sub_str_end_pos = 0;
538
539 while ( length ) {
540 sub_str_end_pos = iter_func(bi);
541
542 if ( UBRK_DONE == sub_str_end_pos ) {
543 break;
544 }
545
546 length += iter_val;
547 }
548
549 ubrk_close(bi);
550
551 if ( UBRK_DONE == sub_str_end_pos) {
552 if(length < 0) {
553 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 );
554
555 efree(ustr);
556 RETURN_FALSE;
557 } else {
558 sub_str_end_pos = ustr_len;
559 }
560 }
561
562 if(sub_str_start_pos > sub_str_end_pos) {
563 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 );
564
565 efree(ustr);
566 RETURN_FALSE;
567 }
568
569 status = U_ZERO_ERROR;
570 u8_sub_str = intl_convert_utf16_to_utf8(ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
571
572 efree( ustr );
573
574 if ( !u8_sub_str ) {
575 /* Set global error code. */
576 intl_error_set_code( NULL, status );
577
578 /* Set error messages. */
579 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 );
580
581 RETURN_FALSE;
582 }
583
584 /* return the allocated string, not a duplicate */
585 RETVAL_NEW_STR(u8_sub_str);
586 }
587 /* }}} */
588
589 /* {{{ strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)590 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
591 {
592 char *haystack, *needle;
593 const char *found;
594 size_t haystack_len, needle_len;
595 int32_t ret_pos, uchar_pos;
596 zend_bool part = 0;
597
598 if (zend_parse_parameters(ZEND_NUM_ARGS(), "ss|b", &haystack, &haystack_len, &needle, &needle_len, &part) == FAILURE) {
599
600 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
601 "grapheme_strstr: unable to parse input param", 0 );
602
603 RETURN_FALSE;
604 }
605
606 if (needle_len == 0) {
607
608 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 );
609
610 RETURN_FALSE;
611 }
612
613
614 if ( !f_ignore_case ) {
615
616 /* ASCII optimization: quick check to see if the string might be there
617 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
618 */
619 found = php_memnstr(haystack, needle, needle_len, haystack + haystack_len);
620
621 /* if it isn't there the we are done */
622 if ( !found ) {
623 RETURN_FALSE;
624 }
625
626 /* if it is there, and if the haystack is ascii, we are all done */
627 if ( grapheme_ascii_check((unsigned char *)haystack, haystack_len) >= 0 ) {
628 size_t found_offset = found - haystack;
629
630 if (part) {
631 RETURN_STRINGL(haystack, found_offset);
632 } else {
633 RETURN_STRINGL(found, haystack_len - found_offset);
634 }
635 }
636
637 }
638
639 /* need to work in utf16 */
640 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ );
641
642 if ( ret_pos < 0 ) {
643 RETURN_FALSE;
644 }
645
646 /* uchar_pos is the 'nth' Unicode character position of the needle */
647
648 ret_pos = 0;
649 U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
650
651 if (part) {
652 RETURN_STRINGL(haystack, ret_pos);
653 } else {
654 RETURN_STRINGL(haystack + ret_pos, haystack_len - ret_pos);
655 }
656
657 }
658 /* }}} */
659
660 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
661 Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)662 PHP_FUNCTION(grapheme_strstr)
663 {
664 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
665 }
666 /* }}} */
667
668 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
669 Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)670 PHP_FUNCTION(grapheme_stristr)
671 {
672 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
673 }
674 /* }}} */
675
676 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
677 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)678 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
679 {
680 int pos = 0;
681 int ret_pos = 0;
682 int break_pos, prev_break_pos;
683 int count = 0;
684
685 while ( 1 ) {
686 pos = ubrk_next(bi);
687
688 if ( UBRK_DONE == pos ) {
689 break;
690 }
691
692 for ( break_pos = ret_pos; break_pos < pos; ) {
693 count++;
694 prev_break_pos = break_pos;
695 U8_FWD_1(pstr, break_pos, str_len);
696
697 if ( prev_break_pos == break_pos ) {
698 /* something wrong - malformed utf8? */
699 csize = 0;
700 break;
701 }
702 }
703
704 /* if we are beyond our limit, then the loop is done */
705 if ( count > csize ) {
706 break;
707 }
708
709 ret_pos = break_pos;
710 }
711
712 return ret_pos;
713 }
714 /* }}} */
715
716 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
717 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)718 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
719 {
720 int pos = 0;
721 int ret_pos = 0;
722
723 while ( 1 ) {
724 pos = ubrk_next(bi);
725
726 if ( UBRK_DONE == pos ) {
727 break;
728 }
729
730 if ( pos > bsize ) {
731 break;
732 }
733
734 ret_pos = pos;
735 }
736
737 return ret_pos;
738 }
739 /* }}} */
740
741 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
742 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)743 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
744 {
745 int next_pos = 0;
746 int ret_pos = 0;
747
748 while ( size ) {
749 next_pos = ubrk_next(bi);
750
751 if ( UBRK_DONE == next_pos ) {
752 break;
753 }
754 ret_pos = next_pos;
755 size--;
756 }
757
758 return ret_pos;
759 }
760 /* }}} */
761
762 /* {{{ grapheme extract iter function pointer array */
763 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
764
765 static grapheme_extract_iter grapheme_extract_iters[] = {
766 &grapheme_extract_count_iter,
767 &grapheme_extract_bytecount_iter,
768 &grapheme_extract_charcount_iter,
769 };
770 /* }}} */
771
772 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
773 Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)774 PHP_FUNCTION(grapheme_extract)
775 {
776 char *str, *pstr;
777 UText ut = UTEXT_INITIALIZER;
778 size_t str_len;
779 zend_long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
780 zend_long lstart = 0; /* starting position in str in bytes */
781 int32_t start = 0;
782 zend_long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
783 UErrorCode status;
784 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
785 UBreakIterator* bi = NULL;
786 int ret_pos;
787 zval *next = NULL; /* return offset of next part of the string */
788
789 if (zend_parse_parameters(ZEND_NUM_ARGS(), "sl|llz", &str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
790 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
791 "grapheme_extract: unable to parse input param", 0 );
792 RETURN_FALSE;
793 }
794
795 if (lstart < 0) {
796 lstart += str_len;
797 }
798
799 if ( NULL != next ) {
800 if ( !Z_ISREF_P(next) ) {
801 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
802 "grapheme_extract: 'next' was not passed by reference", 0 );
803 RETURN_FALSE;
804 } else {
805 ZVAL_DEREF(next);
806 /* initialize next */
807 SEPARATE_ZVAL_NOREF(next);
808 zval_dtor(next);
809 ZVAL_LONG(next, lstart);
810 }
811 }
812
813 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
814 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
815 "grapheme_extract: unknown extract type param", 0 );
816 RETURN_FALSE;
817 }
818
819 if ( lstart > INT32_MAX || lstart < 0 || (size_t)lstart >= str_len ) {
820 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 );
821 RETURN_FALSE;
822 }
823
824 if ( size > INT32_MAX || size < 0) {
825 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 );
826 RETURN_FALSE;
827 }
828 if (size == 0) {
829 RETURN_EMPTY_STRING();
830 }
831
832 /* we checked that it will fit: */
833 start = (int32_t) lstart;
834
835 pstr = str + start;
836
837 /* just in case pstr points in the middle of a character, move forward to the start of the next char */
838 if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
839 char *str_end = str + str_len;
840
841 while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
842 pstr++;
843 if ( pstr >= str_end ) {
844 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
845 "grapheme_extract: invalid input string", 0 );
846
847 RETURN_FALSE;
848 }
849 }
850 }
851
852 str_len -= (pstr - str);
853
854 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
855 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
856 */
857
858 if ( -1 != grapheme_ascii_check((unsigned char *)pstr, MIN(size + 1, str_len)) ) {
859 size_t nsize = MIN(size, str_len);
860 if ( NULL != next ) {
861 ZVAL_LONG(next, start+nsize);
862 }
863 RETURN_STRINGL(pstr, nsize);
864 }
865
866 status = U_ZERO_ERROR;
867 utext_openUTF8(&ut, pstr, str_len, &status);
868
869 if ( U_FAILURE( status ) ) {
870 /* Set global error code. */
871 intl_error_set_code( NULL, status );
872
873 /* Set error messages. */
874 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 );
875
876 RETURN_FALSE;
877 }
878
879 bi = NULL;
880 status = U_ZERO_ERROR;
881 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
882
883 ubrk_setUText(bi, &ut, &status);
884 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
885 can't back up. So, we will not do anything. */
886
887 /* now we need to find the end of the chunk the user wants us to return */
888 /* it's ok to convert str_len to in32_t since if it were too big intl_convert_utf8_to_utf16 above would fail */
889 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, (unsigned char *)pstr, (int32_t)str_len);
890
891 utext_close(&ut);
892 ubrk_close(bi);
893
894 if ( NULL != next ) {
895 ZVAL_LONG(next, start+ret_pos);
896 }
897
898 RETURN_STRINGL(((char *)pstr), ret_pos);
899 }
900
901 /* }}} */
902
903 /*
904 * Local variables:
905 * tab-width: 4
906 * c-basic-offset: 4
907 * End:
908 * vim600: fdm=marker
909 * vim: noet sw=4 ts=4
910 */
911