1 /*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | http://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Author: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23 #include "intl_common.h"
24
25 #include <unicode/utypes.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29 #include <unicode/usearch.h>
30
31 #include "ext/standard/php_string.h"
32
ZEND_EXTERN_MODULE_GLOBALS(intl)33 ZEND_EXTERN_MODULE_GLOBALS( intl )
34
35 /* }}} */
36
37 /* {{{ grapheme_close_global_iterator - clean up */
38 void
39 grapheme_close_global_iterator( void )
40 {
41 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
42
43 if ( NULL != global_break_iterator ) {
44 ubrk_close(global_break_iterator);
45 }
46 }
47 /* }}} */
48
/* {{{ grapheme_substr_ascii f='from' - starting point, l='length'
 *
 * Computes a byte-wise substring window over str without copying.
 *
 *   f  start position; a negative value counts back from the end of the
 *      string. The resulting start is clamped into [0, str_len].
 *   l  length; a negative value means "stop that many bytes before the
 *      end". The resulting length is clamped into [0, str_len - start].
 *
 * On return *sub_str points into str and *sub_str_len holds the window
 * length. If str_len does not fit in an int32_t, *sub_str is set to NULL
 * and *sub_str_len is left untouched.
 */
void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len)
{
	int32_t total, start, available, count;

	*sub_str = NULL;

	if (str_len > INT32_MAX) {
		/* We can not return long strings from ICU functions, so we won't here too */
		return;
	}

	/* work in signed arithmetic from here on to avoid signed/unsigned mixups */
	total = (int32_t)str_len;

	/* resolve the start position */
	if (f >= 0) {
		start = (f > total) ? total : f;
	} else {
		start = total + f;
		if (start < 0) {
			start = 0;
		}
	}

	available = total - start;

	/* resolve the length */
	if (l >= 0) {
		count = (l > available) ? available : l;
	} else {
		count = available + l;
		if (count < 0) {
			count = 0;
		}
	}

	*sub_str = str + start;
	*sub_str_len = count;
}
/* }}} */
88
/*
 * STRPOS_CHECK_STATUS - bail out on an ICU failure status.
 *
 * If (status) is a failure code, records it together with the message
 * (error) via the intl error API, sets the enclosing function's `ret_pos`
 * to -1 and jumps to its `finish` label. The enclosing function must
 * therefore declare `ret_pos` and provide a `finish:` label.
 *
 * Wrapped in do { } while (0) so that the macro expands to exactly one
 * statement and is safe inside un-braced if/else bodies; invoke it with a
 * trailing semicolon.
 */
#define STRPOS_CHECK_STATUS(status, error) \
	do { \
		if ( U_FAILURE( (status) ) ) { \
			intl_error_set_code( NULL, (status) ); \
			intl_error_set_custom_msg( NULL, (error), 0 ); \
			ret_pos = -1; \
			goto finish; \
		} \
	} while (0)
96
97
98 /* {{{ grapheme_strpos_utf16 - strrpos using utf16*/
grapheme_strpos_utf16(char * haystack,size_t haystack_len,char * needle,size_t needle_len,int32_t offset,int32_t * puchar_pos,int f_ignore_case,int last)99 int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
100 {
101 UChar *uhaystack = NULL, *uneedle = NULL;
102 int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
103 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
104 UBreakIterator* bi = NULL;
105 UErrorCode status;
106 UStringSearch* src = NULL;
107
108 if(puchar_pos) {
109 *puchar_pos = -1;
110 }
111 /* convert the strings to UTF-16. */
112
113 status = U_ZERO_ERROR;
114 intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
115 STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
116
117 status = U_ZERO_ERROR;
118 intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
119 STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");
120
121 /* get a pointer to the haystack taking into account the offset */
122 status = U_ZERO_ERROR;
123 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
124 STRPOS_CHECK_STATUS(status, "Failed to get iterator");
125 status = U_ZERO_ERROR;
126 ubrk_setText(bi, uhaystack, uhaystack_len, &status);
127 STRPOS_CHECK_STATUS(status, "Failed to set up iterator");
128
129 if (uneedle_len == 0) {
130 offset_pos = grapheme_get_haystack_offset(bi, offset);
131 if (offset_pos == -1) {
132 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
133 ret_pos = -1;
134 goto finish;
135 }
136 ret_pos = last && offset >= 0 ? uhaystack_len : offset_pos;
137 goto finish;
138 }
139
140 status = U_ZERO_ERROR;
141 src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
142 STRPOS_CHECK_STATUS(status, "Error creating search object");
143
144 if(f_ignore_case) {
145 UCollator *coll = usearch_getCollator(src);
146 status = U_ZERO_ERROR;
147 ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
148 STRPOS_CHECK_STATUS(status, "Error setting collation strength");
149 usearch_reset(src);
150 }
151
152 if(offset != 0) {
153 offset_pos = grapheme_get_haystack_offset(bi, offset);
154 if (offset_pos == -1) {
155 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
156 ret_pos = -1;
157 goto finish;
158 }
159 status = U_ZERO_ERROR;
160 usearch_setOffset(src, last ? 0 : offset_pos, &status);
161 STRPOS_CHECK_STATUS(status, "Invalid search offset");
162 }
163
164
165 if(last) {
166 if (offset >= 0) {
167 char_pos = usearch_last(src, &status);
168 if(char_pos < offset_pos) {
169 /* last one is beyond our start offset */
170 char_pos = USEARCH_DONE;
171 }
172 } else {
173 /* searching backwards is broken, so we search forwards, albeit it's less efficient */
174 int32_t prev_pos = USEARCH_DONE;
175 do {
176 char_pos = usearch_next(src, &status);
177 if (char_pos == USEARCH_DONE || char_pos > offset_pos) {
178 char_pos = prev_pos;
179 break;
180 }
181 prev_pos = char_pos;
182 } while(1);
183 }
184 } else {
185 char_pos = usearch_next(src, &status);
186 }
187 STRPOS_CHECK_STATUS(status, "Error looking up string");
188 if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
189 ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
190 if(puchar_pos) {
191 *puchar_pos = char_pos;
192 }
193 } else {
194 ret_pos = -1;
195 }
196
197 finish:
198 if (uhaystack) {
199 efree( uhaystack );
200 }
201 if (uneedle) {
202 efree( uneedle );
203 }
204 if (bi) {
205 ubrk_close (bi);
206 }
207 if (src) {
208 usearch_close (src);
209 }
210
211 return ret_pos;
212 }
213
214 /* }}} */
215
/* {{{ grapheme_ascii_check: ASCII check
 *
 * Scans the first len bytes of day.
 *
 * Returns len when every byte is <= 0x7f and the buffer contains no
 * "\r\n" pair; returns -1 otherwise.
 *
 * Only bytes inside [day, day + len) are inspected: the look-ahead for the
 * '\n' of a CRLF pair is bounds-checked, so the buffer does not need to be
 * NUL-terminated. The length is returned as zend_long without passing
 * through a narrower int.
 */
zend_long grapheme_ascii_check(const unsigned char *day, size_t len)
{
	size_t i;

	for ( i = 0; i < len; i++ ) {
		if ( day[i] > 0x7f ) {
			return -1;
		}
		/* CR immediately followed by LF, both within the buffer */
		if ( day[i] == '\r' && i + 1 < len && day[i + 1] == '\n' ) {
			return -1;
		}
	}

	return (zend_long)len;
}

/* }}} */
229
230 /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
grapheme_split_string(const UChar * text,int32_t text_length,int boundary_array[],int boundary_array_len)231 int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len )
232 {
233 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
234 UErrorCode status = U_ZERO_ERROR;
235 int ret_len, pos;
236 UBreakIterator* bi;
237
238 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
239
240 if( U_FAILURE(status) ) {
241 return -1;
242 }
243
244 ubrk_setText(bi, text, text_length, &status);
245
246 pos = 0;
247
248 for ( ret_len = 0; pos != UBRK_DONE; ) {
249
250 pos = ubrk_next(bi);
251
252 if ( pos != UBRK_DONE ) {
253
254 if ( NULL != boundary_array && ret_len < boundary_array_len ) {
255 boundary_array[ret_len] = pos;
256 }
257
258 ret_len++;
259 }
260 }
261
262 ubrk_close(bi);
263
264 return ret_len;
265 }
266 /* }}} */
267
268 /* {{{ grapheme_count_graphemes */
grapheme_count_graphemes(UBreakIterator * bi,UChar * string,int32_t string_len)269 int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
270 {
271 int ret_len = 0;
272 int pos = 0;
273 UErrorCode status = U_ZERO_ERROR;
274
275 ubrk_setText(bi, string, string_len, &status);
276
277 do {
278
279 pos = ubrk_next(bi);
280
281 if ( UBRK_DONE != pos ) {
282 ret_len++;
283 }
284
285 } while ( UBRK_DONE != pos );
286
287 return ret_len;
288 }
289 /* }}} */
290
291
292 /* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
grapheme_get_haystack_offset(UBreakIterator * bi,int32_t offset)293 int32_t grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
294 {
295 int32_t pos;
296 int32_t (*iter_op)(UBreakIterator* bi);
297 int iter_incr;
298
299 if ( 0 == offset ) {
300 return 0;
301 }
302
303 if ( offset < 0 ) {
304 iter_op = ubrk_previous;
305 ubrk_last(bi); /* one past the end */
306 iter_incr = 1;
307 }
308 else {
309 iter_op = ubrk_next;
310 iter_incr = -1;
311 }
312
313 pos = 0;
314
315 while ( pos != UBRK_DONE && offset != 0 ) {
316
317 pos = iter_op(bi);
318
319 if ( UBRK_DONE != pos ) {
320 offset += iter_incr;
321 }
322 }
323
324 if ( offset != 0 ) {
325 return -1;
326 }
327
328 return pos;
329 }
330 /* }}} */
331
/* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c
 *
 * Byte-wise search for the last occurrence of needle in haystack.
 *
 *   offset >= 0  only matches starting at or after byte `offset` count
 *   offset <  0  only matches starting at or before byte
 *                haystack_len + offset count; when the needle is longer
 *                than -offset the limit is haystack_len - needle_len
 *
 * Returns the byte index of the match within haystack, or -1 if there is
 * none. A needle longer than the haystack, or an offset pointing outside
 * the haystack, yields -1 without forming any out-of-range pointer.
 */
zend_long
grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset)
{
	size_t first, last; /* inclusive range of candidate start indices */
	size_t i;

	if (needle_len > haystack_len) {
		return -1;
	}

	if (offset >= 0) {
		if ((size_t)offset > haystack_len) {
			return -1;
		}
		first = (size_t)offset;
		last = haystack_len - needle_len;
	} else {
		/* negate in 64 bits so that INT32_MIN does not overflow */
		size_t back = (size_t)(-(int64_t)offset);

		if (back > haystack_len) {
			return -1;
		}
		first = 0;
		if (needle_len > back) {
			last = haystack_len - needle_len;
		} else {
			last = haystack_len - back;
		}
	}

	/* walk candidates from last down to first (inclusive) */
	for (i = last + 1; i-- > first; ) {
		if (memcmp(haystack + i, needle, needle_len) == 0) {
			return (zend_long)i;
		}
	}

	return -1;
}

/* }}} */
372
373 /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
grapheme_get_break_iterator(void * stack_buffer,UErrorCode * status)374 UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status )
375 {
376 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
377
378 if ( NULL == global_break_iterator ) {
379
380 global_break_iterator = ubrk_open(UBRK_CHARACTER,
381 NULL, /* icu default locale - locale has no effect on this iterator */
382 NULL, /* text not set in global iterator */
383 0, /* text length = 0 */
384 status);
385
386 INTL_G(grapheme_iterator) = global_break_iterator;
387 }
388
389 #if U_ICU_VERSION_MAJOR_NUM >= 69
390 return ubrk_clone(global_break_iterator, status);
391 #else
392 int32_t buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
393
394 return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
395 #endif
396 }
397 /* }}} */
398