xref: /php-src/ext/intl/grapheme/grapheme_util.c (revision b22d2bf5)
1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,      |
4    | that is bundled with this package in the file LICENSE, and is        |
5    | available through the world-wide-web at the following url:           |
6    | https://www.php.net/license/3_01.txt                                 |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to          |
9    | license@php.net so we can mail you a copy immediately.               |
10    +----------------------------------------------------------------------+
11    | Author: Ed Batutis <ed@batutis.com>                                  |
12    +----------------------------------------------------------------------+
13  */
14 
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23 #include "intl_common.h"
24 
25 #include <unicode/utypes.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29 #include <unicode/usearch.h>
30 
31 #include "ext/standard/php_string.h"
32 
ZEND_EXTERN_MODULE_GLOBALS(intl)33 ZEND_EXTERN_MODULE_GLOBALS( intl )
34 
35 /* }}} */
36 
37 /* {{{ grapheme_close_global_iterator - clean up */
38 void
39 grapheme_close_global_iterator( void )
40 {
41 	UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
42 
43 	if ( NULL != global_break_iterator ) {
44 		ubrk_close(global_break_iterator);
45 	}
46 }
47 /* }}} */
48 
/* {{{ grapheme_substr_ascii f='from' - starting point, l='length'
 *
 * Computes a byte-wise substring window over str without copying.
 *
 * str          - start of the input buffer (never dereferenced here)
 * str_len      - length of the input in bytes
 * f            - start position; a negative value counts back from the end
 * l            - length; a negative value stops that many bytes before the end
 * sub_str      - receives a pointer into str, or NULL when str_len exceeds
 *                INT32_MAX
 * sub_str_len  - receives the clamped length of the window; 0 when *sub_str
 *                is NULL
 */
void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len)
{
	int32_t str_len2;

	/* Give both outputs a defined value before any early return. */
	*sub_str = NULL;
	*sub_str_len = 0;

	if (str_len > INT32_MAX) {
		/* We cannot return long strings from ICU functions, so we won't here too */
		return;
	}

	/* Narrow only after the range check; signed arithmetic below avoids
	 * signed/unsigned comparison problems. */
	str_len2 = (int32_t)str_len;

	/* if "from" position is negative, count start position from the end
	 * of the string; either way clamp it into [0, str_len2]
	 */
	if (f < 0) {
		f = str_len2 + f;
		if (f < 0) {
			f = 0;
		}
	} else if (f > str_len2) {
		f = str_len2;
	}

	/* if "length" position is negative, set it to the length
	 * needed to stop that many chars from the end of the string;
	 * either way clamp it into [0, str_len2 - f]
	 */
	if (l < 0) {
		l = (str_len2 - f) + l;
		if (l < 0) {
			l = 0;
		}
	} else if (l > str_len2 - f) {
		l = str_len2 - f;
	}

	*sub_str = str + f;
	*sub_str_len = l;
}
/* }}} */
88 
/* On a failed ICU status: record the code and message in the global intl
 * error, set the enclosing function's `ret_pos` to -1 and jump to its
 * `finish` label. The expansion site must provide both `ret_pos` and
 * `finish`. Wrapped in do { } while (0) so the macro behaves as a single
 * statement, including inside an unbraced if/else.
 */
#define STRPOS_CHECK_STATUS(status, error) \
	do { \
		if ( U_FAILURE( (status) ) ) { \
			intl_error_set_code( NULL, (status) ); \
			intl_error_set_custom_msg( NULL, (error), 0 ); \
			ret_pos = -1; \
			goto finish; \
		} \
	} while (0)
96 
97 
98 /* {{{ grapheme_strpos_utf16 - strrpos using utf16*/
grapheme_strpos_utf16(char * haystack,size_t haystack_len,char * needle,size_t needle_len,int32_t offset,int32_t * puchar_pos,int f_ignore_case,int last)99 int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
100 {
101 	UChar *uhaystack = NULL, *uneedle = NULL;
102 	int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
103 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
104 	UBreakIterator* bi = NULL;
105 	UErrorCode status;
106 	UStringSearch* src = NULL;
107 
108 	if(puchar_pos) {
109 		*puchar_pos = -1;
110 	}
111 	/* convert the strings to UTF-16. */
112 
113 	status = U_ZERO_ERROR;
114 	intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
115 	STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
116 
117 	status = U_ZERO_ERROR;
118 	intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
119 	STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");
120 
121 	/* get a pointer to the haystack taking into account the offset */
122 	status = U_ZERO_ERROR;
123 	bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
124 	STRPOS_CHECK_STATUS(status, "Failed to get iterator");
125 	status = U_ZERO_ERROR;
126 	ubrk_setText(bi, uhaystack, uhaystack_len, &status);
127 	STRPOS_CHECK_STATUS(status, "Failed to set up iterator");
128 
129 	if (uneedle_len == 0) {
130 		offset_pos = grapheme_get_haystack_offset(bi, offset);
131 		if (offset_pos == -1) {
132 			zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
133 			ret_pos = -1;
134 			goto finish;
135 		}
136 		ret_pos = last && offset >= 0 ? uhaystack_len : offset_pos;
137 		goto finish;
138 	}
139 
140 	status = U_ZERO_ERROR;
141 	src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
142 	STRPOS_CHECK_STATUS(status, "Error creating search object");
143 
144 	if(f_ignore_case) {
145 		UCollator *coll = usearch_getCollator(src);
146 		status = U_ZERO_ERROR;
147 		ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
148 		STRPOS_CHECK_STATUS(status, "Error setting collation strength");
149 		usearch_reset(src);
150 	}
151 
152 	if(offset != 0) {
153 		offset_pos = grapheme_get_haystack_offset(bi, offset);
154 		if (offset_pos == -1) {
155 			zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
156 			ret_pos = -1;
157 			goto finish;
158 		}
159 		status = U_ZERO_ERROR;
160 		usearch_setOffset(src, last ? 0 : offset_pos, &status);
161 		STRPOS_CHECK_STATUS(status, "Invalid search offset");
162 	}
163 
164 
165 	if(last) {
166 		if (offset >= 0) {
167 			char_pos = usearch_last(src, &status);
168 			if(char_pos < offset_pos) {
169 				/* last one is beyond our start offset */
170 				char_pos = USEARCH_DONE;
171 			}
172 		} else {
173 			/* searching backwards is broken, so we search forwards, albeit it's less efficient */
174 			int32_t prev_pos = USEARCH_DONE;
175 			do {
176 				char_pos = usearch_next(src, &status);
177 				if (char_pos == USEARCH_DONE || char_pos > offset_pos) {
178 					char_pos = prev_pos;
179 					break;
180 				}
181 				prev_pos = char_pos;
182 			} while(1);
183 		}
184 	} else {
185 		char_pos = usearch_next(src, &status);
186 	}
187 	STRPOS_CHECK_STATUS(status, "Error looking up string");
188 	if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
189 		ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
190 		if(puchar_pos) {
191 			*puchar_pos = char_pos;
192 		}
193 	} else {
194 		ret_pos = -1;
195 	}
196 
197 finish:
198 	if (uhaystack) {
199 		efree( uhaystack );
200 	}
201 	if (uneedle) {
202 		efree( uneedle );
203 	}
204 	if (bi) {
205 		ubrk_close (bi);
206 	}
207 	if (src) {
208 		usearch_close (src);
209 	}
210 
211 	return ret_pos;
212 }
213 
214 /* }}} */
215 
/* {{{ grapheme_ascii_check: ASCII check
 *
 * Returns len when all len bytes starting at day are <= 0x7f and no "\r\n"
 * pair occurs among them; returns -1 otherwise.
 *
 * Only the first len bytes are read: a '\r' in the final position is not
 * paired with whatever byte follows the buffer.
 */
zend_long grapheme_ascii_check(const unsigned char *day, size_t len)
{
	const unsigned char *cur = day;
	const unsigned char *end = day + len;

	for ( ; cur < end; cur++ ) {
		if ( *cur > 0x7f ) {
			return -1;
		}
		/* look ahead for '\n' only while the next byte is still inside the buffer */
		if ( *cur == '\r' && cur + 1 < end && cur[1] == '\n' ) {
			return -1;
		}
	}

	/* return the length in the function's own return type, not a narrower int */
	return (zend_long)len;
}

/* }}} */
229 
230 /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
grapheme_split_string(const UChar * text,int32_t text_length,int boundary_array[],int boundary_array_len)231 int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len )
232 {
233 	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
234 	UErrorCode		status = U_ZERO_ERROR;
235 	int ret_len, pos;
236 	UBreakIterator* bi;
237 
238 	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
239 
240 	if( U_FAILURE(status) ) {
241 		return -1;
242 	}
243 
244 	ubrk_setText(bi, text, text_length,	&status);
245 
246 	pos = 0;
247 
248 	for ( ret_len = 0; pos != UBRK_DONE; ) {
249 
250 		pos = ubrk_next(bi);
251 
252 		if ( pos != UBRK_DONE ) {
253 
254 			if ( NULL != boundary_array && ret_len < boundary_array_len ) {
255 				boundary_array[ret_len] = pos;
256 			}
257 
258 			ret_len++;
259 		}
260 	}
261 
262 	ubrk_close(bi);
263 
264 	return ret_len;
265 }
266 /* }}} */
267 
268 /* {{{ grapheme_count_graphemes */
grapheme_count_graphemes(UBreakIterator * bi,UChar * string,int32_t string_len)269 int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
270 {
271 	int ret_len = 0;
272 	int pos = 0;
273 	UErrorCode		status = U_ZERO_ERROR;
274 
275 	ubrk_setText(bi, string, string_len, &status);
276 
277 	do {
278 
279 		pos = ubrk_next(bi);
280 
281 		if ( UBRK_DONE != pos ) {
282 			ret_len++;
283 		}
284 
285 	} while ( UBRK_DONE != pos );
286 
287 	return ret_len;
288 }
289 /* }}} */
290 
291 
292 /* {{{ 	grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
grapheme_get_haystack_offset(UBreakIterator * bi,int32_t offset)293 int32_t grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
294 {
295 	int32_t pos;
296 	int32_t (*iter_op)(UBreakIterator* bi);
297 	int iter_incr;
298 
299 	if ( 0 == offset ) {
300 		return 0;
301 	}
302 
303 	if ( offset < 0 ) {
304 		iter_op = ubrk_previous;
305 		ubrk_last(bi); /* one past the end */
306 		iter_incr = 1;
307 	}
308 	else {
309 		iter_op = ubrk_next;
310 		iter_incr = -1;
311 	}
312 
313 	pos = 0;
314 
315 	while ( pos != UBRK_DONE && offset != 0 ) {
316 
317 		pos = iter_op(bi);
318 
319 		if ( UBRK_DONE != pos ) {
320 			offset += iter_incr;
321 		}
322 	}
323 
324 	if ( offset != 0 ) {
325 		return -1;
326 	}
327 
328 	return pos;
329 }
330 /* }}} */
331 
/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c
 *
 * Byte-wise search for the last occurrence of needle in haystack.
 *
 * offset >= 0: only matches starting at or after byte `offset` count.
 * offset <  0: a match may start no later than |offset| bytes before the end
 *              of the haystack, unless the needle is longer than |offset|,
 *              in which case any start position where it still fits counts.
 *
 * Returns the byte index of the match within haystack, or -1 when there is
 * none. All bounds are checked on indices, so no pointer outside the
 * haystack is ever formed, even for a needle longer than the haystack or an
 * offset beyond it.
 */
zend_long grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset)
{
	size_t first = 0; /* lowest start index that may match */
	size_t last;      /* highest start index that may match */
	size_t i;

	if (needle_len > haystack_len) {
		/* the needle cannot fit anywhere */
		return -1;
	}
	last = haystack_len - needle_len;

	if (offset >= 0) {
		first = (size_t)offset;
	} else {
		/* magnitude of the negative offset, computed in unsigned arithmetic
		 * so that INT32_MIN does not overflow */
		size_t back = (size_t)0 - (size_t)offset;

		if (needle_len <= back) {
			if (back > haystack_len) {
				/* the window ends before the haystack begins */
				return -1;
			}
			last = haystack_len - back;
		}
	}

	if (first > last) {
		return -1;
	}

	if (needle_len == 1) {
		/* Single character search can shortcut memcmps */
		for (i = last + 1; i-- > first; ) {
			if (haystack[i] == *needle) {
				return (zend_long)i;
			}
		}
		return -1;
	}

	for (i = last + 1; i-- > first; ) {
		if (memcmp(haystack + i, needle, needle_len) == 0) {
			return (zend_long)i;
		}
	}

	return -1;
}

/* }}} */
371 
372 /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
grapheme_get_break_iterator(void * stack_buffer,UErrorCode * status)373 UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status )
374 {
375 	UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
376 
377 	if ( NULL == global_break_iterator ) {
378 
379 		global_break_iterator = ubrk_open(UBRK_CHARACTER,
380 											NULL,	/* icu default locale - locale has no effect on this iterator */
381 											NULL,	/* text not set in global iterator */
382 											0,		/* text length = 0 */
383 											status);
384 
385 		INTL_G(grapheme_iterator) = global_break_iterator;
386 	}
387 
388 #if U_ICU_VERSION_MAJOR_NUM >= 69
389 	return ubrk_clone(global_break_iterator, status);
390 #else
391 	int32_t buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
392 
393 	return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
394 #endif
395 }
396 /* }}} */
397