1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Ed Batutis <ed@batutis.com> |
14 +----------------------------------------------------------------------+
15 */
16
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25 #include "intl_common.h"
26
27 #include <unicode/utypes.h>
28 #include <unicode/ucol.h>
29 #include <unicode/ustring.h>
30 #include <unicode/ubrk.h>
31
32 #include "ext/standard/php_string.h"
33
ZEND_EXTERN_MODULE_GLOBALS(intl)34 ZEND_EXTERN_MODULE_GLOBALS( intl )
35
36 /* }}} */
37
38 /* {{{ grapheme_close_global_iterator - clean up */
39 void
40 grapheme_close_global_iterator( TSRMLS_D )
41 {
42 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
43
44 if ( NULL != global_break_iterator ) {
45 ubrk_close(global_break_iterator);
46 }
47 }
48 /* }}} */
49
50 /* {{{ grapheme_intl_case_fold: convert string to lowercase */
51 void
grapheme_intl_case_fold(UChar ** ptr_to_free,UChar ** str,int32_t * str_len,UErrorCode * pstatus)52 grapheme_intl_case_fold(UChar** ptr_to_free, UChar **str, int32_t *str_len, UErrorCode *pstatus )
53 {
54 UChar *dest;
55 int32_t dest_len, size_required;
56
57 /* allocate a destination string that is a bit larger than the src, hoping that is enough */
58 dest_len = (*str_len) + ( *str_len / 10 );
59 dest = (UChar*) eumalloc(dest_len);
60
61 *pstatus = U_ZERO_ERROR;
62 size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
63
64 dest_len = size_required;
65
66 if ( U_BUFFER_OVERFLOW_ERROR == *pstatus ) {
67
68 dest = (UChar*) eurealloc(dest, dest_len);
69
70 *pstatus = U_ZERO_ERROR;
71 size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
72 }
73
74 if ( U_FAILURE(*pstatus) ) {
75 return;
76 }
77
78 if ( NULL != ptr_to_free) {
79 efree(*ptr_to_free);
80 *ptr_to_free = dest;
81 }
82
83 *str = dest;
84 *str_len = dest_len;
85
86 return;
87 }
88 /* }}} */
89
/* {{{ grapheme_substr_ascii f='from' - starting point, l='length'
 *
 * Computes a byte-wise substring of str without copying: on success *sub_str
 * points into str and *sub_str_len holds its length. When the requested range
 * is not usable *sub_str is NULL and *sub_str_len is left untouched.
 * The length l is only honoured when argc > 2; otherwise the substring runs
 * to the end of the string. Negative f / l count from the end of the string.
 */
void
grapheme_substr_ascii(char *str, int str_len, int f, int l, int argc, char **sub_str, int *sub_str_len)
{
	/* NULL is what the caller sees on every early return below */
	*sub_str = NULL;

	if (argc <= 2) {
		/* no length argument was supplied: run to the end of the string */
		l = str_len;
	} else if (l < 0) {
		/* asked to drop more characters than the string holds */
		if (-l > str_len) {
			return;
		}
	} else if (l > str_len) {
		l = str_len;
	}

	/* start position outside the string, counted from either end */
	if (f < 0) {
		if (-f > str_len) {
			return;
		}
	} else if (f > str_len) {
		return;
	}

	/* a negative length must not reach back past the start position */
	if (l < 0 && (l + str_len - f) < 0) {
		return;
	}

	/* a negative "from" is relative to the end of the string */
	if (f < 0) {
		f += str_len;
		if (f < 0) {
			f = 0;
		}
	}

	/* a negative "length" means: stop that many chars before the end */
	if (l < 0) {
		l += str_len - f;
		if (l < 0) {
			l = 0;
		}
	}

	if (f >= str_len) {
		return;
	}

	/* never hand out more than what is left after the start position */
	if ((f + l) > str_len) {
		l = str_len - f;
	}

	*sub_str_len = l;
	*sub_str = str + f;
}
148 /* }}} */
149
150 /* {{{ grapheme_strrpos_utf16 - strrpos using utf16 */
151 int
grapheme_strrpos_utf16(unsigned char * haystack,int32_t haystack_len,unsigned char * needle,int32_t needle_len,int32_t offset,int f_ignore_case TSRMLS_DC)152 grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC)
153 {
154 UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle;
155 int32_t uhaystack_len, uneedle_len;
156 UErrorCode status;
157 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
158 UBreakIterator* bi = NULL;
159 int ret_pos, pos;
160
161 /* convert the strings to UTF-16. */
162 uhaystack = NULL;
163 uhaystack_len = 0;
164 status = U_ZERO_ERROR;
165 intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
166
167 if ( U_FAILURE( status ) ) {
168 /* Set global error code. */
169 intl_error_set_code( NULL, status TSRMLS_CC );
170
171 /* Set error messages. */
172 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
173 if (uhaystack) {
174 efree( uhaystack );
175 }
176 return -1;
177 }
178
179 if ( f_ignore_case ) {
180 grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status );
181 }
182
183 /* get a pointer to the haystack taking into account the offset */
184 bi = NULL;
185 status = U_ZERO_ERROR;
186 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
187
188 puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
189
190 if ( NULL == puhaystack ) {
191 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
192 if (uhaystack) {
193 efree( uhaystack );
194 }
195 ubrk_close (bi);
196 return -1;
197 }
198
199 uneedle = NULL;
200 uneedle_len = 0;
201 status = U_ZERO_ERROR;
202 intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
203
204 if ( U_FAILURE( status ) ) {
205 /* Set global error code. */
206 intl_error_set_code( NULL, status TSRMLS_CC );
207
208 /* Set error messages. */
209 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
210 if (uhaystack) {
211 efree( uhaystack );
212 }
213 if (uneedle) {
214 efree( uneedle );
215 }
216 ubrk_close (bi);
217 return -1;
218 }
219
220 if ( f_ignore_case ) {
221 grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
222 }
223
224 ret_pos = -1; /* -1 represents 'not found' */
225
226 /* back up until there's needle_len characters to compare */
227
228 uhaystack_end = uhaystack + uhaystack_len;
229 pos = ubrk_last(bi);
230 puhaystack = uhaystack + pos;
231
232 while ( uhaystack_end - puhaystack < uneedle_len ) {
233
234 pos = ubrk_previous(bi);
235
236 if ( UBRK_DONE == pos ) {
237 break;
238 }
239
240 puhaystack = uhaystack + pos;
241 }
242
243 /* is there enough haystack left to hold the needle? */
244 if ( ( uhaystack_end - puhaystack ) < uneedle_len ) {
245 /* not enough, not found */
246 goto exit;
247 }
248
249 while ( UBRK_DONE != pos ) {
250
251 if (!u_memcmp(uneedle, puhaystack, uneedle_len)) { /* needle_len - 1 in zend memnstr? */
252
253 /* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */
254
255 if ( ubrk_isBoundary(bi, pos + uneedle_len) ) {
256
257 /* found it, get grapheme count offset */
258 ret_pos = grapheme_count_graphemes(bi, uhaystack, pos);
259 break;
260 }
261
262 /* set position back */
263 ubrk_isBoundary(bi, pos);
264 }
265
266 pos = ubrk_previous(bi);
267 puhaystack = uhaystack + pos;
268 }
269
270 exit:
271 if (uhaystack) {
272 efree( uhaystack );
273 }
274 if (uneedle) {
275 efree( uneedle );
276 }
277 ubrk_close (bi);
278
279 return ret_pos;
280 }
281
282 /* }}} */
283
284 /* {{{ grapheme_strpos_utf16 - strrpos using utf16*/
285 int
grapheme_strpos_utf16(unsigned char * haystack,int32_t haystack_len,unsigned char * needle,int32_t needle_len,int32_t offset,int32_t * puchar_pos,int f_ignore_case TSRMLS_DC)286 grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case TSRMLS_DC)
287 {
288 UChar *uhaystack, *puhaystack, *uneedle;
289 int32_t uhaystack_len, uneedle_len;
290 int ret_pos;
291 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
292 UBreakIterator* bi;
293 UErrorCode status;
294
295 *puchar_pos = -1;
296
297 /* convert the strings to UTF-16. */
298
299 uhaystack = NULL;
300 uhaystack_len = 0;
301 status = U_ZERO_ERROR;
302 intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
303
304 if ( U_FAILURE( status ) ) {
305 /* Set global error code. */
306 intl_error_set_code( NULL, status TSRMLS_CC );
307
308 /* Set error messages. */
309 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
310 if (uhaystack) {
311 efree( uhaystack );
312 }
313 return -1;
314 }
315
316 /* get a pointer to the haystack taking into account the offset */
317 bi = NULL;
318 status = U_ZERO_ERROR;
319 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
320
321 puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
322 uhaystack_len = (uhaystack_len - ( puhaystack - uhaystack));
323
324 if ( NULL == puhaystack ) {
325
326 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
327 if (uhaystack) {
328 efree( uhaystack );
329 }
330 ubrk_close (bi);
331
332 return -1;
333 }
334
335 if ( f_ignore_case ) {
336 grapheme_intl_case_fold(&uhaystack, &puhaystack, &uhaystack_len, &status );
337 }
338
339 uneedle = NULL;
340 uneedle_len = 0;
341 status = U_ZERO_ERROR;
342 intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
343
344 if ( U_FAILURE( status ) ) {
345 /* Set global error code. */
346 intl_error_set_code( NULL, status TSRMLS_CC );
347
348 /* Set error messages. */
349 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
350 if (uhaystack) {
351 efree( uhaystack );
352 }
353 if (uneedle) {
354 efree( uneedle );
355 }
356 ubrk_close (bi);
357
358 return -1;
359 }
360
361 if ( f_ignore_case ) {
362 grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
363 }
364
365 ret_pos = grapheme_memnstr_grapheme(bi, puhaystack, uneedle, uneedle_len, puhaystack + uhaystack_len );
366
367 *puchar_pos = ubrk_current(bi);
368
369 if (uhaystack) {
370 efree( uhaystack );
371 }
372 if (uneedle) {
373 efree( uneedle );
374 }
375 ubrk_close (bi);
376
377 return ret_pos;
378 }
379
380 /* }}} */
381
/* {{{ grapheme_ascii_check: ASCII check
 *
 * Returns len when every one of the first len bytes at day is <= 0x7f,
 * and -1 as soon as a byte above 0x7f is seen. A negative len is rejected
 * with -1 without touching the buffer.
 */
int grapheme_ascii_check(const unsigned char *day, int32_t len)
{
	int32_t i;

	/* a negative count would otherwise walk far past the end of the buffer */
	if ( len < 0 ) {
		return -1;
	}

	for ( i = 0; i < len; i++ ) {
		if ( day[i] > 0x7f ) {
			return -1;
		}
	}

	return len;
}
393
394 /* }}} */
395
396 /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
grapheme_split_string(const UChar * text,int32_t text_length,int boundary_array[],int boundary_array_len TSRMLS_DC)397 int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
398 {
399 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
400 UErrorCode status = U_ZERO_ERROR;
401 int ret_len, pos;
402 UBreakIterator* bi;
403
404 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
405
406 if( U_FAILURE(status) ) {
407 return -1;
408 }
409
410 ubrk_setText(bi, text, text_length, &status);
411
412 pos = 0;
413
414 for ( ret_len = 0; pos != UBRK_DONE; ) {
415
416 pos = ubrk_next(bi);
417
418 if ( pos != UBRK_DONE ) {
419
420 if ( NULL != boundary_array && ret_len < boundary_array_len ) {
421 boundary_array[ret_len] = pos;
422 }
423
424 ret_len++;
425 }
426 }
427
428 ubrk_close(bi);
429
430 return ret_len;
431 }
432 /* }}} */
433
434 /* {{{ grapheme_count_graphemes */
435 int32_t
grapheme_count_graphemes(UBreakIterator * bi,UChar * string,int32_t string_len)436 grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
437 {
438 int ret_len = 0;
439 int pos = 0;
440 UErrorCode status = U_ZERO_ERROR;
441
442 ubrk_setText(bi, string, string_len, &status);
443
444 do {
445
446 pos = ubrk_next(bi);
447
448 if ( UBRK_DONE != pos ) {
449 ret_len++;
450 }
451
452 } while ( UBRK_DONE != pos );
453
454 return ret_len;
455 }
456 /* }}} */
457
458 /* {{{ grapheme_memnstr_grapheme: find needle in haystack using grapheme boundaries */
459 int32_t
grapheme_memnstr_grapheme(UBreakIterator * bi,UChar * haystack,UChar * needle,int32_t needle_len,UChar * end)460 grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end)
461 {
462 UChar *p = haystack;
463 UChar ne = needle[needle_len-1];
464 UErrorCode status;
465 int32_t grapheme_offset;
466
467 end -= needle_len;
468
469 while (p <= end) {
470
471 if ((p = u_memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {
472
473 if (!u_memcmp(needle, p, needle_len - 1)) { /* needle_len - 1 works because if needle_len is 1, we've already tested the char */
474
475 /* does the grapheme end here? */
476
477 status = U_ZERO_ERROR;
478 ubrk_setText (bi, haystack, (end - haystack) + needle_len, &status);
479
480 if ( ubrk_isBoundary (bi, (p - haystack) + needle_len) ) {
481
482 /* found it, get grapheme count offset */
483 grapheme_offset = grapheme_count_graphemes(bi, haystack, (p - haystack));
484
485 return grapheme_offset;
486 }
487 }
488 }
489
490 if (p == NULL) {
491 return -1;
492 }
493
494 p++;
495 }
496
497 return -1;
498 }
499
500 /* }}} */
501
/* {{{ grapheme_memrchr_grapheme: find the last occurrence of a byte in a buffer
 *
 * Scans the n bytes at s from the back and returns a pointer to the last
 * byte equal to (unsigned char)c, or NULL when there is none or n <= 0.
 *
 * Defined without `inline`: under C99 and later a plain `inline` definition
 * does not by itself provide an external symbol for other callers to link to.
 */
void *grapheme_memrchr_grapheme(const void *s, int c, int32_t n)
{
	const unsigned char *bytes = (const unsigned char *)s;
	const unsigned char wanted = (unsigned char)c;
	int32_t i;

	/* count down by index so that no pointer below the buffer is ever formed */
	for (i = n; i > 0; i--) {
		if (bytes[i - 1] == wanted) {
			return (void *)(bytes + (i - 1));
		}
	}

	return NULL;
}
519 /* }}} */
520
521 /* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
522 UChar *
grapheme_get_haystack_offset(UBreakIterator * bi,UChar * uhaystack,int32_t uhaystack_len,int32_t offset)523 grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset)
524 {
525 UErrorCode status;
526 int32_t pos;
527 int32_t (*iter_op)(UBreakIterator* bi);
528 int iter_incr;
529
530 if ( NULL != bi ) {
531 status = U_ZERO_ERROR;
532 ubrk_setText (bi, uhaystack, uhaystack_len, &status);
533 }
534
535 if ( 0 == offset ) {
536 return uhaystack;
537 }
538
539 if ( offset < 0 ) {
540 iter_op = ubrk_previous;
541 ubrk_last(bi); /* one past the end */
542 iter_incr = 1;
543 }
544 else {
545 iter_op = ubrk_next;
546 iter_incr = -1;
547 }
548
549 pos = 0;
550
551 while ( pos != UBRK_DONE && offset != 0 ) {
552
553 pos = iter_op(bi);
554
555 if ( UBRK_DONE != pos ) {
556 offset += iter_incr;
557 }
558 }
559
560 if ( offset != 0 ) {
561 return NULL;
562 }
563
564 return uhaystack + pos;
565 }
566 /* }}} */
567
/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c
 *
 * Byte-wise reverse search: returns the index (from the start of haystack)
 * of the last occurrence of needle inside the search window, or -1 when
 * there is none.
 *
 * offset >= 0: matches may start anywhere from index `offset` onwards.
 * offset <  0: the highest start index considered is haystack_len + offset,
 *              or haystack_len - needle_len when the needle is longer than
 *              -offset.
 *
 * The scan works on indexes rather than pointers so that no pointer outside
 * the haystack is formed when the window is empty or the scan reaches the
 * start of the buffer.
 */
int32_t
grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset)
{
	int32_t first, last, i;

	if (offset >= 0) {
		first = offset;
		last = haystack_len - needle_len;
	} else {
		first = 0;
		if (needle_len > -offset) {
			last = haystack_len - needle_len;
		} else {
			last = haystack_len + offset;
		}
	}

	if (needle_len == 1) {
		/* Single character search can shortcut memcmps */
		for (i = last; i >= first; i--) {
			if (haystack[i] == *needle) {
				return i;
			}
		}
		return -1;
	}

	for (i = last; i >= first; i--) {
		if (memcmp(haystack + i, needle, needle_len) == 0) {
			return i;
		}
	}

	return -1;
}
606
607 /* }}} */
608
609 /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
610 UBreakIterator*
grapheme_get_break_iterator(void * stack_buffer,UErrorCode * status TSRMLS_DC)611 grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC )
612 {
613 int32_t buffer_size;
614
615 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
616
617 if ( NULL == global_break_iterator ) {
618
619 global_break_iterator = ubrk_open(UBRK_CHARACTER,
620 NULL, /* icu default locale - locale has no effect on this iterator */
621 NULL, /* text not set in global iterator */
622 0, /* text length = 0 */
623 status);
624
625 INTL_G(grapheme_iterator) = global_break_iterator;
626 }
627
628 buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
629
630 return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
631 }
632 /* }}} */
633
634 /*
635 * Local variables:
636 * tab-width: 4
637 * c-basic-offset: 4
638 * End:
639 * vim600: fdm=marker
640 * vim: noet sw=4 ts=4
641 */
642
643