1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Ed Batutis <ed@batutis.com> |
14 +----------------------------------------------------------------------+
15 */
16
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25
26 #include <unicode/utypes.h>
27 #include <unicode/ucol.h>
28 #include <unicode/ustring.h>
29 #include <unicode/ubrk.h>
30
31 #include "ext/standard/php_string.h"
32
33 /* }}} */
34
/* Extraction modes accepted by grapheme_extract(); each value is also the
 * index of the matching iterator in grapheme_extract_iters[]. */
#define GRAPHEME_EXTRACT_TYPE_COUNT		0	/* size = number of grapheme clusters */
#define GRAPHEME_EXTRACT_TYPE_MAXBYTES	1	/* size = maximum number of bytes */
#define GRAPHEME_EXTRACT_TYPE_MAXCHARS	2	/* size = maximum number of characters */

/* Inclusive bounds used to validate a caller-supplied extract type. */
#define GRAPHEME_EXTRACT_TYPE_MIN	GRAPHEME_EXTRACT_TYPE_COUNT
#define GRAPHEME_EXTRACT_TYPE_MAX	GRAPHEME_EXTRACT_TYPE_MAXCHARS
40
41
42 /* {{{ grapheme_register_constants
43 * Register API constants
44 */
grapheme_register_constants(INIT_FUNC_ARGS)45 void grapheme_register_constants( INIT_FUNC_ARGS )
46 {
47 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_COUNT", GRAPHEME_EXTRACT_TYPE_COUNT, CONST_CS | CONST_PERSISTENT);
48 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXBYTES", GRAPHEME_EXTRACT_TYPE_MAXBYTES, CONST_CS | CONST_PERSISTENT);
49 REGISTER_LONG_CONSTANT("GRAPHEME_EXTR_MAXCHARS", GRAPHEME_EXTRACT_TYPE_MAXCHARS, CONST_CS | CONST_PERSISTENT);
50 }
51 /* }}} */
52
53 /* {{{ proto int grapheme_strlen(string str)
54 Get number of graphemes in a string */
PHP_FUNCTION(grapheme_strlen)55 PHP_FUNCTION(grapheme_strlen)
56 {
57 unsigned char* string;
58 int string_len;
59 UChar* ustring = NULL;
60 int ustring_len = 0;
61 int ret_len;
62 UErrorCode status;
63
64 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", (char **)&string, &string_len) == FAILURE) {
65
66 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
67 "grapheme_strlen: unable to parse input param", 0 TSRMLS_CC );
68
69 RETURN_FALSE;
70 }
71
72 ret_len = grapheme_ascii_check(string, string_len);
73
74 if ( ret_len >= 0 )
75 RETURN_LONG(ret_len);
76
77 /* convert the string to UTF-16. */
78 status = U_ZERO_ERROR;
79 intl_convert_utf8_to_utf16(&ustring, &ustring_len, (char*) string, string_len, &status );
80
81 if ( U_FAILURE( status ) ) {
82 /* Set global error code. */
83 intl_error_set_code( NULL, status TSRMLS_CC );
84
85 /* Set error messages. */
86 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
87 if (ustring) {
88 efree( ustring );
89 }
90 RETURN_NULL();
91 }
92
93 ret_len = grapheme_split_string(ustring, ustring_len, NULL, 0 TSRMLS_CC );
94
95 if (ustring) {
96 efree( ustring );
97 }
98
99 if (ret_len >= 0) {
100 RETVAL_LONG(ret_len);
101 } else {
102 RETVAL_FALSE;
103 }
104 }
105 /* }}} */
106
107 /* {{{ proto int grapheme_strpos(string haystack, string needle [, int offset ])
108 Find position of first occurrence of a string within another */
PHP_FUNCTION(grapheme_strpos)109 PHP_FUNCTION(grapheme_strpos)
110 {
111 unsigned char *haystack, *needle;
112 int haystack_len, needle_len;
113 unsigned char *found;
114 long loffset = 0;
115 int32_t offset = 0, noffset = 0;
116 int ret_pos;
117
118 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
119
120 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
121 "grapheme_strpos: unable to parse input param", 0 TSRMLS_CC );
122
123 RETURN_FALSE;
124 }
125
126 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
127
128 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
129
130 RETURN_FALSE;
131 }
132
133 /* we checked that it will fit: */
134 offset = (int32_t) loffset;
135 noffset = offset >= 0 ? offset : haystack_len + offset;
136
137 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
138
139 if (needle_len == 0) {
140
141 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
142
143 RETURN_FALSE;
144 }
145
146
147 /* quick check to see if the string might be there
148 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
149 */
150 found = (unsigned char *)php_memnstr((char *)haystack + noffset, (char *)needle, needle_len, (char *)haystack + haystack_len);
151
152 /* if it isn't there the we are done */
153 if (!found) {
154 RETURN_FALSE;
155 }
156
157 /* if it is there, and if the haystack is ascii, we are all done */
158 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
159
160 RETURN_LONG(found - haystack);
161 }
162
163 /* do utf16 part of the strpos */
164 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* fIgnoreCase */, 0 /* last */ TSRMLS_CC );
165
166 if ( ret_pos >= 0 ) {
167 RETURN_LONG(ret_pos);
168 } else {
169 RETURN_FALSE;
170 }
171
172 }
173 /* }}} */
174
175 /* {{{ proto int grapheme_stripos(string haystack, string needle [, int offset ])
176 Find position of first occurrence of a string within another, ignoring case differences */
PHP_FUNCTION(grapheme_stripos)177 PHP_FUNCTION(grapheme_stripos)
178 {
179 unsigned char *haystack, *needle, *haystack_dup, *needle_dup;
180 int haystack_len, needle_len;
181 unsigned char *found;
182 long loffset = 0;
183 int32_t offset = 0;
184 int ret_pos;
185 int is_ascii;
186
187 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
188
189 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
190 "grapheme_stripos: unable to parse input param", 0 TSRMLS_CC );
191
192 RETURN_FALSE;
193 }
194
195 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
196
197 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Offset not contained in string", 1 TSRMLS_CC );
198
199 RETURN_FALSE;
200 }
201
202 /* we checked that it will fit: */
203 offset = (int32_t) loffset;
204
205 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
206
207 if (needle_len == 0) {
208
209 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_stripos: Empty delimiter", 1 TSRMLS_CC );
210
211 RETURN_FALSE;
212 }
213
214
215 is_ascii = ( grapheme_ascii_check(haystack, haystack_len) >= 0 );
216
217 if ( is_ascii ) {
218 int32_t noffset = offset >= 0 ? offset : haystack_len + offset;
219 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
220 php_strtolower((char *)needle_dup, needle_len);
221 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
222 php_strtolower((char *)haystack_dup, haystack_len);
223
224 found = (unsigned char*) php_memnstr((char *)haystack_dup + noffset, (char *)needle_dup, needle_len, (char *)haystack_dup + haystack_len);
225
226 efree(haystack_dup);
227 efree(needle_dup);
228
229 if (found) {
230 RETURN_LONG(found - haystack_dup);
231 }
232
233 /* if needle was ascii too, we are all done, otherwise we need to try using Unicode to see what we get */
234 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
235 RETURN_FALSE;
236 }
237 }
238
239 /* do utf16 part of the strpos */
240 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* fIgnoreCase */, 0 /*last */ TSRMLS_CC );
241
242 if ( ret_pos >= 0 ) {
243 RETURN_LONG(ret_pos);
244 } else {
245 RETURN_FALSE;
246 }
247
248 }
249 /* }}} */
250
251 /* {{{ proto int grapheme_strrpos(string haystack, string needle [, int offset])
252 Find position of last occurrence of a string within another */
PHP_FUNCTION(grapheme_strrpos)253 PHP_FUNCTION(grapheme_strrpos)
254 {
255 unsigned char *haystack, *needle;
256 int haystack_len, needle_len;
257 long loffset = 0;
258 int32_t offset = 0;
259 int32_t ret_pos;
260 int is_ascii;
261
262 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
263
264 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
265 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
266
267 RETURN_FALSE;
268 }
269
270 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
271
272 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
273
274 RETURN_FALSE;
275 }
276
277 /* we checked that it will fit: */
278 offset = (int32_t) loffset;
279
280 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
281
282 if (needle_len == 0) {
283
284 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
285
286 RETURN_FALSE;
287 }
288
289 is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
290
291 if ( is_ascii ) {
292
293 ret_pos = grapheme_strrpos_ascii(haystack, haystack_len, needle, needle_len, offset);
294
295
296 if ( ret_pos >= 0 ) {
297 RETURN_LONG(ret_pos);
298 }
299
300 /* if the needle was ascii too, we are done */
301
302 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
303 RETURN_FALSE;
304 }
305
306 /* else we need to continue via utf16 */
307 }
308
309 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 0 /* f_ignore_case */, 1/* last */ TSRMLS_CC);
310
311 if ( ret_pos >= 0 ) {
312 RETURN_LONG(ret_pos);
313 } else {
314 RETURN_FALSE;
315 }
316
317
318 }
319 /* }}} */
320
321 /* {{{ proto int grapheme_strripos(string haystack, string needle [, int offset])
322 Find position of last occurrence of a string within another, ignoring case */
PHP_FUNCTION(grapheme_strripos)323 PHP_FUNCTION(grapheme_strripos)
324 {
325 unsigned char *haystack, *needle;
326 int haystack_len, needle_len;
327 long loffset = 0;
328 int32_t offset = 0;
329 int32_t ret_pos;
330 int is_ascii;
331
332 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|l", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &loffset) == FAILURE) {
333
334 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
335 "grapheme_strrpos: unable to parse input param", 0 TSRMLS_CC );
336
337 RETURN_FALSE;
338 }
339
340 if ( OUTSIDE_STRING(loffset, haystack_len) ) {
341
342 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
343
344 RETURN_FALSE;
345 }
346
347 /* we checked that it will fit: */
348 offset = (int32_t) loffset;
349
350 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
351
352 if (needle_len == 0) {
353
354 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
355
356 RETURN_FALSE;
357 }
358
359 is_ascii = grapheme_ascii_check(haystack, haystack_len) >= 0;
360
361 if ( is_ascii ) {
362 unsigned char *needle_dup, *haystack_dup;
363
364 needle_dup = (unsigned char *)estrndup((char *)needle, needle_len);
365 php_strtolower((char *)needle_dup, needle_len);
366 haystack_dup = (unsigned char *)estrndup((char *)haystack, haystack_len);
367 php_strtolower((char *)haystack_dup, haystack_len);
368
369 ret_pos = grapheme_strrpos_ascii(haystack_dup, haystack_len, needle_dup, needle_len, offset);
370
371 efree(haystack_dup);
372 efree(needle_dup);
373
374 if ( ret_pos >= 0 ) {
375 RETURN_LONG(ret_pos);
376 }
377
378 /* if the needle was ascii too, we are done */
379
380 if ( grapheme_ascii_check(needle, needle_len) >= 0 ) {
381 RETURN_FALSE;
382 }
383
384 /* else we need to continue via utf16 */
385 }
386
387 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, offset, NULL, 1 /* f_ignore_case */, 1 /*last */ TSRMLS_CC);
388
389 if ( ret_pos >= 0 ) {
390 RETURN_LONG(ret_pos);
391 } else {
392 RETURN_FALSE;
393 }
394
395
396 }
397 /* }}} */
398
399 /* {{{ proto string grapheme_substr(string str, int start [, int length])
400 Returns part of a string */
PHP_FUNCTION(grapheme_substr)401 PHP_FUNCTION(grapheme_substr)
402 {
403 unsigned char *str, *sub_str;
404 UChar *ustr;
405 int str_len, sub_str_len, ustr_len;
406 long lstart = 0, length = 0;
407 int32_t start = 0;
408 int iter_val;
409 UErrorCode status;
410 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
411 UBreakIterator* bi = NULL;
412 int sub_str_start_pos, sub_str_end_pos;
413 int32_t (*iter_func)(UBreakIterator *);
414
415 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|l", (char **)&str, &str_len, &lstart, &length) == FAILURE) {
416
417 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
418 "grapheme_substr: unable to parse input param", 0 TSRMLS_CC );
419
420 RETURN_FALSE;
421 }
422
423 if ( OUTSIDE_STRING(lstart, str_len) ) {
424
425 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
426
427 RETURN_FALSE;
428 }
429
430 /* we checked that it will fit: */
431 start = (int32_t) lstart;
432
433 /* the offset is 'grapheme count offset' so it still might be invalid - we'll check it later */
434
435 if ( grapheme_ascii_check(str, str_len) >= 0 ) {
436 grapheme_substr_ascii((char *)str, str_len, start, length, ZEND_NUM_ARGS(), (char **) &sub_str, &sub_str_len);
437
438 if ( NULL == sub_str ) {
439 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: invalid parameters", 1 TSRMLS_CC );
440 RETURN_FALSE;
441 }
442
443 RETURN_STRINGL(((char *)sub_str), sub_str_len, 1);
444 }
445
446 ustr = NULL;
447 ustr_len = 0;
448 status = U_ZERO_ERROR;
449 intl_convert_utf8_to_utf16(&ustr, &ustr_len, (char *)str, str_len, &status);
450
451 if ( U_FAILURE( status ) ) {
452 /* Set global error code. */
453 intl_error_set_code( NULL, status TSRMLS_CC );
454
455 /* Set error messages. */
456 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
457 if (ustr) {
458 efree( ustr );
459 }
460 RETURN_FALSE;
461 }
462
463 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
464
465 if( U_FAILURE(status) ) {
466 RETURN_FALSE;
467 }
468
469 ubrk_setText(bi, ustr, ustr_len, &status);
470
471 if ( start < 0 ) {
472 iter_func = ubrk_previous;
473 ubrk_last(bi);
474 iter_val = 1;
475 }
476 else {
477 iter_func = ubrk_next;
478 iter_val = -1;
479 }
480
481 sub_str_start_pos = 0;
482
483 while ( start ) {
484 sub_str_start_pos = iter_func(bi);
485
486 if ( UBRK_DONE == sub_str_start_pos ) {
487 break;
488 }
489
490 start += iter_val;
491 }
492
493 if ( 0 != start || sub_str_start_pos >= ustr_len ) {
494
495 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: start not contained in string", 1 TSRMLS_CC );
496
497 if (ustr) {
498 efree(ustr);
499 }
500 ubrk_close(bi);
501 RETURN_FALSE;
502 }
503
504 if (ZEND_NUM_ARGS() <= 2) {
505
506 /* no length supplied, return the rest of the string */
507
508 sub_str = NULL;
509 sub_str_len = 0;
510 status = U_ZERO_ERROR;
511 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ustr_len - sub_str_start_pos, &status);
512
513 if (ustr) {
514 efree( ustr );
515 }
516 ubrk_close( bi );
517
518 if ( U_FAILURE( status ) ) {
519 /* Set global error code. */
520 intl_error_set_code( NULL, status TSRMLS_CC );
521
522 /* Set error messages. */
523 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
524
525 if (sub_str) {
526 efree( sub_str );
527 }
528
529 RETURN_FALSE;
530 }
531
532 /* return the allocated string, not a duplicate */
533 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
534 }
535
536 if(length == 0) {
537 /* empty length - we've validated start, we can return "" now */
538 if (ustr) {
539 efree(ustr);
540 }
541 ubrk_close(bi);
542 RETURN_EMPTY_STRING();
543 }
544
545 /* find the end point of the string to return */
546
547 if ( length < 0 ) {
548 iter_func = ubrk_previous;
549 ubrk_last(bi);
550 iter_val = 1;
551 }
552 else {
553 iter_func = ubrk_next;
554 iter_val = -1;
555 }
556
557 sub_str_end_pos = 0;
558
559 while ( length ) {
560 sub_str_end_pos = iter_func(bi);
561
562 if ( UBRK_DONE == sub_str_end_pos ) {
563 break;
564 }
565
566 length += iter_val;
567 }
568
569 ubrk_close(bi);
570
571 if ( UBRK_DONE == sub_str_end_pos) {
572 if(length < 0) {
573 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length not contained in string", 1 TSRMLS_CC );
574
575 efree(ustr);
576 RETURN_FALSE;
577 } else {
578 sub_str_end_pos = ustr_len;
579 }
580 }
581
582 if(sub_str_start_pos > sub_str_end_pos) {
583 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_substr: length is beyond start", 1 TSRMLS_CC );
584
585 efree(ustr);
586 RETURN_FALSE;
587 }
588
589 sub_str = NULL;
590 status = U_ZERO_ERROR;
591 intl_convert_utf16_to_utf8((char **)&sub_str, &sub_str_len, ustr + sub_str_start_pos, ( sub_str_end_pos - sub_str_start_pos ), &status);
592
593 efree( ustr );
594
595 if ( U_FAILURE( status ) ) {
596 /* Set global error code. */
597 intl_error_set_code( NULL, status TSRMLS_CC );
598
599 /* Set error messages. */
600 intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
601
602 if ( NULL != sub_str )
603 efree( sub_str );
604
605 RETURN_FALSE;
606 }
607
608 /* return the allocated string, not a duplicate */
609 RETURN_STRINGL(((char *)sub_str), sub_str_len, 0);
610
611 }
612 /* }}} */
613
614 /* {{{ strstr_common_handler */
strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS,int f_ignore_case)615 static void strstr_common_handler(INTERNAL_FUNCTION_PARAMETERS, int f_ignore_case)
616 {
617 unsigned char *haystack, *needle, *found;
618 int haystack_len, needle_len;
619 int ret_pos, uchar_pos;
620 zend_bool part = 0;
621
622 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "ss|b", (char **)&haystack, &haystack_len, (char **)&needle, &needle_len, &part) == FAILURE) {
623
624 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
625 "grapheme_strstr: unable to parse input param", 0 TSRMLS_CC );
626
627 RETURN_FALSE;
628 }
629
630 if (needle_len == 0) {
631
632 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Empty delimiter", 1 TSRMLS_CC );
633
634 RETURN_FALSE;
635 }
636
637
638 if ( !f_ignore_case ) {
639
640 /* ASCII optimization: quick check to see if the string might be there
641 * I realize that 'offset' is 'grapheme count offset' but will work in spite of that
642 */
643 found = (unsigned char *)php_memnstr((char *)haystack, (char *)needle, needle_len, (char *)haystack + haystack_len);
644
645 /* if it isn't there the we are done */
646 if ( !found ) {
647 RETURN_FALSE;
648 }
649
650 /* if it is there, and if the haystack is ascii, we are all done */
651 if ( grapheme_ascii_check(haystack, haystack_len) >= 0 ) {
652 size_t found_offset = found - haystack;
653
654 if (part) {
655 RETURN_STRINGL(((char *)haystack) , found_offset, 1);
656 } else {
657 RETURN_STRINGL(((char *)found), haystack_len - found_offset, 1);
658 }
659 }
660
661 }
662
663 /* need to work in utf16 */
664 ret_pos = grapheme_strpos_utf16(haystack, haystack_len, needle, needle_len, 0, &uchar_pos, f_ignore_case, 0 /*last */ TSRMLS_CC );
665
666 if ( ret_pos < 0 ) {
667 RETURN_FALSE;
668 }
669
670 /* uchar_pos is the 'nth' Unicode character position of the needle */
671
672 ret_pos = 0;
673 U8_FWD_N(haystack, ret_pos, haystack_len, uchar_pos);
674
675 if (part) {
676 RETURN_STRINGL(((char *)haystack), ret_pos, 1);
677 }
678 else {
679 RETURN_STRINGL(((char *)haystack) + ret_pos, haystack_len - ret_pos, 1);
680 }
681
682 }
683 /* }}} */
684
685 /* {{{ proto string grapheme_strstr(string haystack, string needle[, bool part])
686 Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_strstr)687 PHP_FUNCTION(grapheme_strstr)
688 {
689 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 0 /* f_ignore_case */);
690 }
691 /* }}} */
692
693 /* {{{ proto string grapheme_stristr(string haystack, string needle[, bool part])
694 Finds first occurrence of a string within another */
PHP_FUNCTION(grapheme_stristr)695 PHP_FUNCTION(grapheme_stristr)
696 {
697 strstr_common_handler(INTERNAL_FUNCTION_PARAM_PASSTHRU, 1 /* f_ignore_case */);
698 }
699 /* }}} */
700
701 /* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
702 static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator * bi,int32_t csize,unsigned char * pstr,int32_t str_len)703 grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
704 {
705 int pos = 0;
706 int ret_pos = 0;
707 int break_pos, prev_break_pos;
708 int count = 0;
709
710 while ( 1 ) {
711 pos = ubrk_next(bi);
712
713 if ( UBRK_DONE == pos ) {
714 break;
715 }
716
717 for ( break_pos = ret_pos; break_pos < pos; ) {
718 count++;
719 prev_break_pos = break_pos;
720 U8_FWD_1(pstr, break_pos, str_len);
721
722 if ( prev_break_pos == break_pos ) {
723 /* something wrong - malformed utf8? */
724 csize = 0;
725 break;
726 }
727 }
728
729 /* if we are beyond our limit, then the loop is done */
730 if ( count > csize ) {
731 break;
732 }
733
734 ret_pos = break_pos;
735 }
736
737 return ret_pos;
738 }
739 /* }}} */
740
741 /* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
742 static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator * bi,int32_t bsize,unsigned char * pstr,int32_t str_len)743 grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
744 {
745 int pos = 0;
746 int ret_pos = 0;
747
748 while ( 1 ) {
749 pos = ubrk_next(bi);
750
751 if ( UBRK_DONE == pos ) {
752 break;
753 }
754
755 if ( pos > bsize ) {
756 break;
757 }
758
759 ret_pos = pos;
760 }
761
762 return ret_pos;
763 }
764 /* }}} */
765
766 /* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
767 static inline int32_t
grapheme_extract_count_iter(UBreakIterator * bi,int32_t size,unsigned char * pstr,int32_t str_len)768 grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
769 {
770 int next_pos = 0;
771 int ret_pos = 0;
772
773 while ( size ) {
774 next_pos = ubrk_next(bi);
775
776 if ( UBRK_DONE == next_pos ) {
777 break;
778 }
779 ret_pos = next_pos;
780 size--;
781 }
782
783 return ret_pos;
784 }
785 /* }}} */
786
787 /* {{{ grapheme extract iter function pointer array */
788 typedef int32_t (*grapheme_extract_iter)(UBreakIterator * /*bi*/, int32_t /*size*/, unsigned char * /*pstr*/, int32_t /*str_len*/);
789
790 static grapheme_extract_iter grapheme_extract_iters[] = {
791 &grapheme_extract_count_iter,
792 &grapheme_extract_bytecount_iter,
793 &grapheme_extract_charcount_iter,
794 };
795 /* }}} */
796
797 /* {{{ proto string grapheme_extract(string str, int size[, int extract_type[, int start[, int next]]])
798 Function to extract a sequence of default grapheme clusters */
PHP_FUNCTION(grapheme_extract)799 PHP_FUNCTION(grapheme_extract)
800 {
801 char *str, *pstr;
802 UText ut = UTEXT_INITIALIZER;
803 int str_len;
804 long size; /* maximum number of grapheme clusters, bytes, or characters (based on extract_type) to return */
805 long lstart = 0; /* starting position in str in bytes */
806 int32_t start = 0;
807 long extract_type = GRAPHEME_EXTRACT_TYPE_COUNT;
808 UErrorCode status;
809 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
810 UBreakIterator* bi = NULL;
811 int ret_pos;
812 zval *next = NULL; /* return offset of next part of the string */
813
814 if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sl|llz", (char **)&str, &str_len, &size, &extract_type, &lstart, &next) == FAILURE) {
815
816 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
817 "grapheme_extract: unable to parse input param", 0 TSRMLS_CC );
818
819 RETURN_FALSE;
820 }
821
822 if ( NULL != next ) {
823 if ( !PZVAL_IS_REF(next) ) {
824 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
825 "grapheme_extract: 'next' was not passed by reference", 0 TSRMLS_CC );
826
827 RETURN_FALSE;
828 }
829 else {
830 /* initialize next */
831 zval_dtor(next);
832 ZVAL_LONG(next, lstart);
833 }
834 }
835
836 if ( extract_type < GRAPHEME_EXTRACT_TYPE_MIN || extract_type > GRAPHEME_EXTRACT_TYPE_MAX ) {
837
838 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
839 "grapheme_extract: unknown extract type param", 0 TSRMLS_CC );
840
841 RETURN_FALSE;
842 }
843
844 if ( lstart > INT32_MAX || lstart < 0 || lstart >= str_len ) {
845 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: start not contained in string", 0 TSRMLS_CC );
846 RETURN_FALSE;
847 }
848
849 if ( size > INT32_MAX || size < 0) {
850 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_extract: size is invalid", 0 TSRMLS_CC );
851 RETURN_FALSE;
852 }
853 if (size == 0) {
854 RETURN_EMPTY_STRING();
855 }
856
857 /* we checked that it will fit: */
858 start = (int32_t) lstart;
859
860 pstr = str + start;
861
862 /* just in case pstr points in the middle of a character, move forward to the start of the next char */
863 if ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
864 unsigned char *str_end = str + str_len;
865
866 while ( !UTF8_IS_SINGLE(*pstr) && !U8_IS_LEAD(*pstr) ) {
867 pstr++;
868 if ( pstr >= str_end ) {
869 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
870 "grapheme_extract: invalid input string", 0 TSRMLS_CC );
871
872 RETURN_FALSE;
873 }
874 }
875 }
876
877 str_len -= (pstr - str);
878
879 /* if the string is all ASCII up to size+1 - or str_len whichever is first - then we are done.
880 (size + 1 because the size-th character might be the beginning of a grapheme cluster)
881 */
882
883 if ( -1 != grapheme_ascii_check(pstr, size + 1 < str_len ? size + 1 : str_len ) ) {
884 long nsize = ( size < str_len ? size : str_len );
885 if ( NULL != next ) {
886 ZVAL_LONG(next, start+nsize);
887 }
888 RETURN_STRINGL(((char *)pstr), nsize, 1);
889 }
890
891 status = U_ZERO_ERROR;
892 utext_openUTF8(&ut, pstr, str_len, &status);
893
894 if ( U_FAILURE( status ) ) {
895 /* Set global error code. */
896 intl_error_set_code( NULL, status TSRMLS_CC );
897
898 /* Set error messages. */
899 intl_error_set_custom_msg( NULL, "Error opening UTF-8 text", 0 TSRMLS_CC );
900
901 RETURN_FALSE;
902 }
903
904 bi = NULL;
905 status = U_ZERO_ERROR;
906 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
907
908 ubrk_setUText(bi, &ut, &status);
909 /* if the caller put us in the middle of a grapheme, we can't detect it in all cases since we
910 can't back up. So, we will not do anything. */
911
912 /* now we need to find the end of the chunk the user wants us to return */
913
914 ret_pos = (*grapheme_extract_iters[extract_type])(bi, size, pstr, str_len);
915
916 utext_close(&ut);
917 ubrk_close(bi);
918
919 if ( NULL != next ) {
920 ZVAL_LONG(next, start+ret_pos);
921 }
922
923 RETURN_STRINGL(((char *)pstr), ret_pos, 1);
924 }
925
926 /* }}} */
927
928 /*
929 * Local variables:
930 * tab-width: 4
931 * c-basic-offset: 4
932 * End:
933 * vim600: fdm=marker
934 * vim: noet sw=4 ts=4
935 */
936
937