xref: /PHP-5.3/ext/intl/grapheme/grapheme_util.c (revision 9762609c)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | http://www.php.net/license/3_01.txt                                  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Ed Batutis <ed@batutis.com>                                  |
14    +----------------------------------------------------------------------+
15  */
16 
17 /* {{{ includes */
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <php.h>
23 #include "grapheme.h"
24 #include "grapheme_util.h"
25 #include "intl_common.h"
26 
27 #include <unicode/utypes.h>
28 #include <unicode/ucol.h>
29 #include <unicode/ustring.h>
30 #include <unicode/ubrk.h>
31 
32 #include "ext/standard/php_string.h"
33 
ZEND_EXTERN_MODULE_GLOBALS(intl)34 ZEND_EXTERN_MODULE_GLOBALS( intl )
35 
36 /* }}} */
37 
38 /* {{{ grapheme_close_global_iterator - clean up */
39 void
40 grapheme_close_global_iterator( TSRMLS_D )
41 {
42 	UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
43 
44 	if ( NULL != global_break_iterator ) {
45 		ubrk_close(global_break_iterator);
46 	}
47 }
48 /* }}} */
49 
50 /* {{{ grapheme_intl_case_fold: convert string to lowercase */
51 void
grapheme_intl_case_fold(UChar ** ptr_to_free,UChar ** str,int32_t * str_len,UErrorCode * pstatus)52 grapheme_intl_case_fold(UChar** ptr_to_free, UChar **str, int32_t *str_len, UErrorCode *pstatus )
53 {
54     UChar *dest;
55     int32_t dest_len, size_required;
56 
57     /* allocate a destination string that is a bit larger than the src, hoping that is enough */
58     dest_len = (*str_len) + ( *str_len / 10 );
59     dest = (UChar*) eumalloc(dest_len);
60 
61     *pstatus = U_ZERO_ERROR;
62     size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
63 
64     dest_len = size_required;
65 
66     if ( U_BUFFER_OVERFLOW_ERROR == *pstatus ) {
67 
68         dest = (UChar*) eurealloc(dest, dest_len);
69 
70         *pstatus = U_ZERO_ERROR;
71         size_required = u_strFoldCase(dest, dest_len, *str, *str_len, U_FOLD_CASE_DEFAULT, pstatus);
72     }
73 
74     if ( U_FAILURE(*pstatus) ) {
75         return;
76     }
77 
78     if ( NULL != ptr_to_free) {
79         efree(*ptr_to_free);
80         *ptr_to_free = dest;
81     }
82 
83     *str = dest;
84     *str_len = dest_len;
85 
86     return;
87 }
88 /* }}} */
89 
/* {{{ grapheme_substr_ascii f='from' - starting point, l='length'
 *
 * Computes a byte-wise substring of str without copying.
 *
 * str, str_len  source buffer and its length in bytes
 * f             start position; negative values count from the end
 * l             requested length; negative values stop that many bytes
 *               before the end; ignored unless argc > 2
 * argc          number of arguments the caller received (length is only
 *               honoured when argc > 2)
 * sub_str       out: pointer into str, or NULL when the request is out of range
 * sub_str_len   out: length of the substring; written only on success
 */
void
grapheme_substr_ascii(char *str, int str_len, int f, int l, int argc, char **sub_str, int *sub_str_len)
{
    int start = f;
    int count;

    /* a NULL *sub_str is how "no such substring" is reported */
    *sub_str = NULL;

    if (argc <= 2) {
        /* no explicit length: take everything up to the end */
        count = str_len;
    } else if (l < 0 && -l > str_len) {
        return;
    } else {
        count = (l > str_len) ? str_len : l;
    }

    if (start > str_len) {
        return;
    }
    if (start < 0 && -start > str_len) {
        return;
    }

    if (count < 0 && (count + str_len - start) < 0) {
        return;
    }

    /* a negative start is relative to the end of the string */
    if (start < 0) {
        start += str_len;
        if (start < 0) {
            start = 0;
        }
    }

    /* a negative length means "stop that many bytes before the end" */
    if (count < 0) {
        count += str_len - start;
        if (count < 0) {
            count = 0;
        }
    }

    if (start >= str_len) {
        return;
    }

    /* clamp so the substring never runs past the end of the buffer */
    if (start + count > str_len) {
        count = str_len - start;
    }

    *sub_str = str + start;
    *sub_str_len = count;
}
148 /* }}} */
149 
150 /* {{{ grapheme_strrpos_utf16 - strrpos using utf16 */
151 int
grapheme_strrpos_utf16(unsigned char * haystack,int32_t haystack_len,unsigned char * needle,int32_t needle_len,int32_t offset,int f_ignore_case TSRMLS_DC)152 grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC)
153 {
154     UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle;
155     int32_t uhaystack_len, uneedle_len;
156     UErrorCode status;
157     unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
158     UBreakIterator* bi = NULL;
159     int ret_pos, pos;
160 
161     /* convert the strings to UTF-16. */
162     uhaystack = NULL;
163     uhaystack_len = 0;
164     status = U_ZERO_ERROR;
165     intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
166 
167     if ( U_FAILURE( status ) ) {
168         /* Set global error code. */
169         intl_error_set_code( NULL, status TSRMLS_CC );
170 
171         /* Set error messages. */
172         intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
173         if (uhaystack) {
174 			efree( uhaystack );
175 		}
176         return -1;
177     }
178 
179     if ( f_ignore_case ) {
180         grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status );
181     }
182 
183     /* get a pointer to the haystack taking into account the offset */
184     bi = NULL;
185     status = U_ZERO_ERROR;
186     bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
187 
188     puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
189 
190     if ( NULL == puhaystack ) {
191         intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
192         if (uhaystack) {
193 			efree( uhaystack );
194 		}
195         ubrk_close (bi);
196         return -1;
197     }
198 
199     uneedle = NULL;
200     uneedle_len = 0;
201     status = U_ZERO_ERROR;
202     intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
203 
204     if ( U_FAILURE( status ) ) {
205         /* Set global error code. */
206         intl_error_set_code( NULL, status TSRMLS_CC );
207 
208         /* Set error messages. */
209         intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
210         if (uhaystack) {
211 			efree( uhaystack );
212 		}
213 		if (uneedle) {
214 			efree( uneedle );
215 		}
216         ubrk_close (bi);
217         return -1;
218     }
219 
220     if ( f_ignore_case ) {
221         grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
222     }
223 
224     ret_pos = -1;   /* -1 represents 'not found' */
225 
226     /* back up until there's needle_len characters to compare */
227 
228     uhaystack_end = uhaystack + uhaystack_len;
229     pos = ubrk_last(bi);
230     puhaystack = uhaystack + pos;
231 
232     while ( uhaystack_end - puhaystack < uneedle_len ) {
233 
234         pos = ubrk_previous(bi);
235 
236         if ( UBRK_DONE == pos ) {
237             break;
238         }
239 
240         puhaystack = uhaystack + pos;
241     }
242 
243     /* is there enough haystack left to hold the needle? */
244     if ( ( uhaystack_end - puhaystack ) < uneedle_len ) {
245         /* not enough, not found */
246         goto exit;
247     }
248 
249     while ( UBRK_DONE != pos ) {
250 
251         if (!u_memcmp(uneedle, puhaystack, uneedle_len)) {  /* needle_len - 1 in zend memnstr? */
252 
253             /* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */
254 
255             if ( ubrk_isBoundary(bi, pos + uneedle_len) ) {
256 
257                 /* found it, get grapheme count offset */
258                 ret_pos = grapheme_count_graphemes(bi, uhaystack, pos);
259                 break;
260             }
261 
262             /* set position back */
263             ubrk_isBoundary(bi, pos);
264         }
265 
266         pos = ubrk_previous(bi);
267         puhaystack = uhaystack + pos;
268     }
269 
270 exit:
271 	if (uhaystack) {
272 		efree( uhaystack );
273 	}
274 	if (uneedle) {
275 		efree( uneedle );
276 	}
277     ubrk_close (bi);
278 
279     return ret_pos;
280 }
281 
282 /* }}} */
283 
284 /* {{{ grapheme_strpos_utf16 - strrpos using utf16*/
285 int
grapheme_strpos_utf16(unsigned char * haystack,int32_t haystack_len,unsigned char * needle,int32_t needle_len,int32_t offset,int32_t * puchar_pos,int f_ignore_case TSRMLS_DC)286 grapheme_strpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case TSRMLS_DC)
287 {
288 	UChar *uhaystack, *puhaystack, *uneedle;
289 	int32_t uhaystack_len, uneedle_len;
290 	int ret_pos;
291 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
292 	UBreakIterator* bi;
293 	UErrorCode status;
294 
295 	*puchar_pos = -1;
296 
297 	/* convert the strings to UTF-16. */
298 
299 	uhaystack = NULL;
300 	uhaystack_len = 0;
301 	status = U_ZERO_ERROR;
302 	intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );
303 
304 	if ( U_FAILURE( status ) ) {
305 		/* Set global error code. */
306 		intl_error_set_code( NULL, status TSRMLS_CC );
307 
308 		/* Set error messages. */
309 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
310 		if (uhaystack) {
311 			efree( uhaystack );
312 		}
313 		return -1;
314 	}
315 
316 	/* get a pointer to the haystack taking into account the offset */
317 	bi = NULL;
318 	status = U_ZERO_ERROR;
319 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );
320 
321 	puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);
322 	uhaystack_len = (uhaystack_len - ( puhaystack - uhaystack));
323 
324 	if ( NULL == puhaystack ) {
325 
326 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
327 		if (uhaystack) {
328 			efree( uhaystack );
329 		}
330 		ubrk_close (bi);
331 
332 		return -1;
333 	}
334 
335 	if ( f_ignore_case ) {
336 		grapheme_intl_case_fold(&uhaystack, &puhaystack, &uhaystack_len, &status );
337 	}
338 
339 	uneedle = NULL;
340 	uneedle_len = 0;
341 	status = U_ZERO_ERROR;
342 	intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );
343 
344 	if ( U_FAILURE( status ) ) {
345 		/* Set global error code. */
346 		intl_error_set_code( NULL, status TSRMLS_CC );
347 
348 		/* Set error messages. */
349 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
350 		if (uhaystack) {
351 			efree( uhaystack );
352 		}
353 		if (uneedle) {
354 			efree( uneedle );
355 		}
356 		ubrk_close (bi);
357 
358 		return -1;
359 	}
360 
361 	if ( f_ignore_case ) {
362 		grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
363 	}
364 
365 	ret_pos = grapheme_memnstr_grapheme(bi, puhaystack, uneedle, uneedle_len, puhaystack + uhaystack_len );
366 
367 	*puchar_pos = ubrk_current(bi);
368 
369 	if (uhaystack) {
370 		efree( uhaystack );
371 	}
372 	if (uneedle) {
373 		efree( uneedle );
374 	}
375 	ubrk_close (bi);
376 
377 	return ret_pos;
378 }
379 
380 /* }}} */
381 
/* {{{ grapheme_ascii_check: ASCII check
 *
 * Scans len bytes starting at day. Returns len when every byte is in the
 * 7-bit range (<= 0x7f), or -1 as soon as a byte above 0x7f is seen.
 */
int grapheme_ascii_check(const unsigned char *day, int32_t len)
{
	const unsigned char *cursor = day;
	int32_t remaining;

	for ( remaining = len; remaining != 0; remaining-- ) {
		if ( *cursor++ > 0x7f ) {
			return -1;
		}
	}

	return len;
}
393 
394 /* }}} */
395 
396 /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
grapheme_split_string(const UChar * text,int32_t text_length,int boundary_array[],int boundary_array_len TSRMLS_DC)397 int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
398 {
399 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
400 	UErrorCode		status = U_ZERO_ERROR;
401 	int ret_len, pos;
402 	UBreakIterator* bi;
403 
404 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );
405 
406 	if( U_FAILURE(status) ) {
407 		return -1;
408 	}
409 
410 	ubrk_setText(bi, text, text_length,	&status);
411 
412 	pos = 0;
413 
414 	for ( ret_len = 0; pos != UBRK_DONE; ) {
415 
416 		pos = ubrk_next(bi);
417 
418 		if ( pos != UBRK_DONE ) {
419 
420 			if ( NULL != boundary_array && ret_len < boundary_array_len ) {
421 				boundary_array[ret_len] = pos;
422 			}
423 
424 			ret_len++;
425 		}
426 	}
427 
428 	ubrk_close(bi);
429 
430 	return ret_len;
431 }
432 /* }}} */
433 
434 /* {{{ grapheme_count_graphemes */
435 int32_t
grapheme_count_graphemes(UBreakIterator * bi,UChar * string,int32_t string_len)436 grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
437 {
438 	int ret_len = 0;
439 	int pos = 0;
440 	UErrorCode		status = U_ZERO_ERROR;
441 
442 	ubrk_setText(bi, string, string_len, &status);
443 
444 	do {
445 
446 		pos = ubrk_next(bi);
447 
448 		if ( UBRK_DONE != pos ) {
449 			ret_len++;
450 		}
451 
452 	} while ( UBRK_DONE != pos );
453 
454 	return ret_len;
455 }
456 /* }}} */
457 
458 /* {{{ grapheme_memnstr_grapheme: find needle in haystack using grapheme boundaries */
459 int32_t
grapheme_memnstr_grapheme(UBreakIterator * bi,UChar * haystack,UChar * needle,int32_t needle_len,UChar * end)460 grapheme_memnstr_grapheme(UBreakIterator *bi, UChar *haystack, UChar *needle, int32_t needle_len, UChar *end)
461 {
462 	UChar *p = haystack;
463 	UChar ne = needle[needle_len-1];
464 	UErrorCode status;
465 	int32_t grapheme_offset;
466 
467 	end -= needle_len;
468 
469 	while (p <= end) {
470 
471 		if ((p = u_memchr(p, *needle, (end-p+1))) && ne == p[needle_len-1]) {
472 
473 			if (!u_memcmp(needle, p, needle_len - 1)) {  /* needle_len - 1 works because if needle_len is 1, we've already tested the char */
474 
475 				/* does the grapheme end here? */
476 
477 				status = U_ZERO_ERROR;
478 				ubrk_setText (bi, haystack, (end - haystack) + needle_len, &status);
479 
480 				if ( ubrk_isBoundary (bi, (p - haystack) + needle_len) ) {
481 
482 					/* found it, get grapheme count offset */
483 					grapheme_offset = grapheme_count_graphemes(bi, haystack, (p - haystack));
484 
485 					return grapheme_offset;
486 				}
487 			}
488 		}
489 
490 		if (p == NULL) {
491 			return -1;
492 		}
493 
494 		p++;
495 	}
496 
497 	return -1;
498 }
499 
500 /* }}} */
501 
/* {{{ grapheme_memrchr_grapheme: find the last occurrence of a byte in a buffer
 *
 * s   buffer to scan
 * c   byte value to look for (converted to unsigned char)
 * n   number of bytes in the buffer
 *
 * Returns a pointer to the last byte equal to c within the first n bytes of
 * s, or NULL when n <= 0 or no such byte exists.
 *
 * The definition deliberately carries no bare `inline`: under C99/C11 rules a
 * plain inline definition provides no external symbol for other translation
 * units to link against.
 */
void *grapheme_memrchr_grapheme(const void *s, int c, int32_t n)
{
	const unsigned char *base = (const unsigned char *)s;
	const unsigned char wanted = (unsigned char)c;
	int32_t i;

	if (n <= 0) {
		return NULL;
	}

	/* walk backwards by index so no pointer below the buffer start is formed */
	for (i = n; i > 0; i--) {
		if (base[i - 1] == wanted) {
			return (void *)(base + (i - 1));
		}
	}

	return NULL;
}
519 /* }}} */
520 
521 /* {{{ 	grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
522 UChar *
grapheme_get_haystack_offset(UBreakIterator * bi,UChar * uhaystack,int32_t uhaystack_len,int32_t offset)523 grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset)
524 {
525 	UErrorCode		status;
526 	int32_t pos;
527 	int32_t (*iter_op)(UBreakIterator* bi);
528 	int iter_incr;
529 
530 	if ( NULL != bi ) {
531 		status = U_ZERO_ERROR;
532 		ubrk_setText (bi, uhaystack, uhaystack_len, &status);
533 	}
534 
535 	if ( 0 == offset ) {
536 		return uhaystack;
537 	}
538 
539 	if ( offset < 0 ) {
540 		iter_op = ubrk_previous;
541 		ubrk_last(bi); /* one past the end */
542 		iter_incr = 1;
543 	}
544 	else {
545 		iter_op = ubrk_next;
546 		iter_incr = -1;
547 	}
548 
549 	pos = 0;
550 
551 	while ( pos != UBRK_DONE && offset != 0 ) {
552 
553 		pos = iter_op(bi);
554 
555 		if ( UBRK_DONE != pos ) {
556 			offset += iter_incr;
557 		}
558 	}
559 
560 	if ( offset != 0 ) {
561 		return NULL;
562 	}
563 
564 	return uhaystack + pos;
565 }
566 /* }}} */
567 
/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c
 *
 * Byte-wise search for the last occurrence of needle in haystack.
 *
 * offset >= 0  matches must start at or after byte `offset`
 * offset <  0  the search starts -offset bytes from the end of the haystack
 *              (or at the last position where the needle still fits, when
 *              the needle is longer than -offset) and moves backwards
 *
 * Returns the byte index of the match from the start of haystack, or -1
 * when there is none.
 */
int32_t
grapheme_strrpos_ascii(unsigned char *haystack, int32_t haystack_len, unsigned char *needle, int32_t needle_len, int32_t offset)
{
	int32_t lowest;   /* first index at which a match may start */
	int32_t idx;      /* candidate start index, scanned downwards */

	if (offset >= 0) {
		lowest = offset;
		idx = haystack_len - needle_len;
	} else {
		lowest = 0;
		idx = (needle_len > -offset) ? (haystack_len - needle_len)
		                             : (haystack_len + offset);
	}

	if (needle_len == 1) {
		/* Single character search can shortcut memcmps */
		for ( ; idx >= lowest; idx--) {
			if (haystack[idx] == needle[0]) {
				return idx;
			}
		}
		return -1;
	}

	for ( ; idx >= lowest; idx--) {
		if (0 == memcmp(haystack + idx, needle, needle_len)) {
			return idx;
		}
	}

	return -1;
}
606 
607 /* }}} */
608 
609 /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
610 UBreakIterator*
grapheme_get_break_iterator(void * stack_buffer,UErrorCode * status TSRMLS_DC)611 grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status TSRMLS_DC )
612 {
613 	int32_t buffer_size;
614 
615 	UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
616 
617 	if ( NULL == global_break_iterator ) {
618 
619 		global_break_iterator = ubrk_open(UBRK_CHARACTER,
620 											NULL,	/* icu default locale - locale has no effect on this iterator */
621 											NULL,	/* text not set in global iterator */
622 											0,		/* text length = 0 */
623 											status);
624 
625 		INTL_G(grapheme_iterator) = global_break_iterator;
626 	}
627 
628 	buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
629 
630 	return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
631 }
632 /* }}} */
633 
634 /*
635  * Local variables:
636  * tab-width: 4
637  * c-basic-offset: 4
638  * End:
639  * vim600: fdm=marker
640  * vim: noet sw=4 ts=4
641  */
642 
643