1 /*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | https://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Author: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15 /* {{{ includes */
16 #ifdef HAVE_CONFIG_H
17 #include <config.h>
18 #endif
19
20 #include <php.h>
21 #include "grapheme.h"
22 #include "grapheme_util.h"
23 #include "intl_common.h"
24
25 #include <unicode/utypes.h>
26 #include <unicode/ucol.h>
27 #include <unicode/ustring.h>
28 #include <unicode/ubrk.h>
29 #include <unicode/usearch.h>
30
ZEND_EXTERN_MODULE_GLOBALS(intl)31 ZEND_EXTERN_MODULE_GLOBALS( intl )
32
33 /* }}} */
34
35 /* {{{ grapheme_close_global_iterator - clean up */
36 void
37 grapheme_close_global_iterator( void )
38 {
39 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
40
41 if ( NULL != global_break_iterator ) {
42 ubrk_close(global_break_iterator);
43 }
44 }
45 /* }}} */
46
47 /* {{{ grapheme_substr_ascii f='from' - starting point, l='length' */
/* Compute a byte-wise substring view of an ASCII string.
 *
 * f is the start position and l the length; a negative f counts from the end
 * of the string and a negative l stops that many bytes before the end.
 * Both values are clamped to the string bounds.
 *
 * On success *sub_str points into str (no copy is made) and *sub_str_len
 * holds the clamped length. If str_len does not fit in an int32_t,
 * *sub_str is set to NULL and *sub_str_len is left untouched.
 */
void grapheme_substr_ascii(char *str, size_t str_len, int32_t f, int32_t l, char **sub_str, int32_t *sub_str_len)
{
	/* signed copy of the length so the arithmetic below stays signed */
	int32_t total = (int32_t)str_len;
	int32_t start = f;
	int32_t count = l;
	int32_t avail;

	*sub_str = NULL;

	/* ICU functions cannot return strings this long, so neither do we */
	if (str_len > INT32_MAX) {
		return;
	}

	/* resolve the start position, clamping it into [0, total] */
	if (start < 0) {
		start += total;
		if (start < 0) {
			start = 0;
		}
	} else if (start > total) {
		start = total;
	}

	/* bytes available from the start position to the end of the string */
	avail = total - start;

	/* resolve the length, clamping it into [0, avail] */
	if (count < 0) {
		count += avail;
		if (count < 0) {
			count = 0;
		}
	} else if (count > avail) {
		count = avail;
	}

	*sub_str = str + start;
	*sub_str_len = count;
}
85 /* }}} */
86
/* Abort the enclosing function when an ICU call has failed: record the error
 * code and message, set ret_pos to -1 and jump to the "finish" label.
 * The enclosing function must declare ret_pos and provide that label.
 * Wrapped in do { } while (0) so the macro expands to a single statement and
 * stays correct inside an unbraced if/else. Invoke it with a trailing ';'.
 */
#define STRPOS_CHECK_STATUS(status, error) \
	do { \
		if ( U_FAILURE( (status) ) ) { \
			intl_error_set_code( NULL, (status) ); \
			intl_error_set_custom_msg( NULL, (error), 0 ); \
			ret_pos = -1; \
			goto finish; \
		} \
	} while (0)
94
95
96 /* {{{ grapheme_strpos_utf16 - strrpos using utf16*/
grapheme_strpos_utf16(char * haystack,size_t haystack_len,char * needle,size_t needle_len,int32_t offset,int32_t * puchar_pos,int f_ignore_case,int last)97 int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last)
98 {
99 UChar *uhaystack = NULL, *uneedle = NULL;
100 int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
101 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
102 UBreakIterator* bi = NULL;
103 UErrorCode status;
104 UStringSearch* src = NULL;
105
106 if(puchar_pos) {
107 *puchar_pos = -1;
108 }
109 /* convert the strings to UTF-16. */
110
111 status = U_ZERO_ERROR;
112 intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
113 STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");
114
115 status = U_ZERO_ERROR;
116 intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
117 STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");
118
119 /* get a pointer to the haystack taking into account the offset */
120 status = U_ZERO_ERROR;
121 bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
122 STRPOS_CHECK_STATUS(status, "Failed to get iterator");
123 status = U_ZERO_ERROR;
124 ubrk_setText(bi, uhaystack, uhaystack_len, &status);
125 STRPOS_CHECK_STATUS(status, "Failed to set up iterator");
126
127 if (uneedle_len == 0) {
128 offset_pos = grapheme_get_haystack_offset(bi, offset);
129 if (offset_pos == -1) {
130 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
131 ret_pos = -1;
132 goto finish;
133 }
134 ret_pos = last && offset >= 0 ? uhaystack_len : offset_pos;
135 goto finish;
136 }
137
138 status = U_ZERO_ERROR;
139 src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
140 STRPOS_CHECK_STATUS(status, "Error creating search object");
141
142 if(f_ignore_case) {
143 UCollator *coll = usearch_getCollator(src);
144 status = U_ZERO_ERROR;
145 ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
146 STRPOS_CHECK_STATUS(status, "Error setting collation strength");
147 usearch_reset(src);
148 }
149
150 if(offset != 0) {
151 offset_pos = grapheme_get_haystack_offset(bi, offset);
152 if (offset_pos == -1) {
153 zend_argument_value_error(3, "must be contained in argument #1 ($haystack)");
154 ret_pos = -1;
155 goto finish;
156 }
157 status = U_ZERO_ERROR;
158 usearch_setOffset(src, last ? 0 : offset_pos, &status);
159 STRPOS_CHECK_STATUS(status, "Invalid search offset");
160 }
161
162
163 if(last) {
164 if (offset >= 0) {
165 char_pos = usearch_last(src, &status);
166 if(char_pos < offset_pos) {
167 /* last one is beyond our start offset */
168 char_pos = USEARCH_DONE;
169 }
170 } else {
171 /* searching backwards is broken, so we search forwards, albeit it's less efficient */
172 int32_t prev_pos = USEARCH_DONE;
173 do {
174 char_pos = usearch_next(src, &status);
175 if (char_pos == USEARCH_DONE || char_pos > offset_pos) {
176 char_pos = prev_pos;
177 break;
178 }
179 prev_pos = char_pos;
180 } while(1);
181 }
182 } else {
183 char_pos = usearch_next(src, &status);
184 }
185 STRPOS_CHECK_STATUS(status, "Error looking up string");
186 if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
187 ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
188 if(puchar_pos) {
189 *puchar_pos = char_pos;
190 }
191 } else {
192 ret_pos = -1;
193 }
194
195 finish:
196 if (uhaystack) {
197 efree( uhaystack );
198 }
199 if (uneedle) {
200 efree( uneedle );
201 }
202 if (bi) {
203 ubrk_close (bi);
204 }
205 if (src) {
206 usearch_close (src);
207 }
208
209 return ret_pos;
210 }
211
212 /* }}} */
213
214 /* {{{ grapheme_ascii_check: ASCII check */
grapheme_ascii_check(const unsigned char * day,size_t len)215 zend_long grapheme_ascii_check(const unsigned char *day, size_t len)
216 {
217 int ret_len = len;
218 while ( len-- ) {
219 if ( *day++ > 0x7f || (*day == '\n' && *(day - 1) == '\r') )
220 return -1;
221 }
222
223 return ret_len;
224 }
225
226 /* }}} */
227
228 /* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
grapheme_split_string(const UChar * text,int32_t text_length,int boundary_array[],int boundary_array_len)229 int32_t grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len )
230 {
231 unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
232 UErrorCode status = U_ZERO_ERROR;
233 int ret_len, pos;
234 UBreakIterator* bi;
235
236 bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status );
237
238 if( U_FAILURE(status) ) {
239 return -1;
240 }
241
242 ubrk_setText(bi, text, text_length, &status);
243
244 pos = 0;
245
246 for ( ret_len = 0; pos != UBRK_DONE; ) {
247
248 pos = ubrk_next(bi);
249
250 if ( pos != UBRK_DONE ) {
251
252 if ( NULL != boundary_array && ret_len < boundary_array_len ) {
253 boundary_array[ret_len] = pos;
254 }
255
256 ret_len++;
257 }
258 }
259
260 ubrk_close(bi);
261
262 return ret_len;
263 }
264 /* }}} */
265
266 /* {{{ grapheme_count_graphemes */
/* Point bi at the first string_len UTF-16 units of string and count how many
 * times the iterator can advance before it reports UBRK_DONE.
 * The status of ubrk_setText is not inspected. */
int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
{
	UErrorCode err = U_ZERO_ERROR;
	int n_graphemes = 0;

	ubrk_setText(bi, string, string_len, &err);

	/* every successful step forward is one more grapheme */
	while ( ubrk_next(bi) != UBRK_DONE ) {
		n_graphemes++;
	}

	return n_graphemes;
}
287 /* }}} */
288
289
290 /* {{{ grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
/* Step the break iterator |offset| boundaries and report where it lands.
 *
 * A positive offset steps forward with ubrk_next() from the iterator's
 * current position; a negative offset first moves to the end with
 * ubrk_last() and steps backward with ubrk_previous().
 *
 * Returns 0 for offset 0 (the iterator is not touched), the boundary
 * position reached otherwise, or -1 when the iterator ran out of boundaries
 * before the requested number of steps was taken.
 */
int32_t grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
{
	int32_t boundary = 0;
	int32_t remaining = offset;
	int backwards;

	if ( remaining == 0 ) {
		return 0;
	}

	backwards = ( remaining < 0 );

	if ( backwards ) {
		ubrk_last(bi); /* one past the end */
	}

	while ( remaining != 0 ) {
		boundary = backwards ? ubrk_previous(bi) : ubrk_next(bi);

		if ( boundary == UBRK_DONE ) {
			/* ran off the text before consuming the whole offset */
			return -1;
		}

		/* move the step counter one closer to zero */
		remaining += backwards ? 1 : -1;
	}

	return boundary;
}
328 /* }}} */
329
330 /* {{{ grapheme_strrpos_ascii: borrowed from the php ext/standard/string.c */
grapheme_strrpos_ascii(char * haystack,size_t haystack_len,char * needle,size_t needle_len,int32_t offset)331 zend_long grapheme_strrpos_ascii(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset)
332 {
333 char *p, *e;
334
335 if (offset >= 0) {
336 p = haystack + offset;
337 e = haystack + haystack_len - needle_len;
338 } else {
339 p = haystack;
340 if (needle_len > (size_t)-offset) {
341 e = haystack + haystack_len - needle_len;
342 } else {
343 e = haystack + haystack_len + offset;
344 }
345 }
346
347 if (needle_len == 1) {
348 /* Single character search can shortcut memcmps */
349 while (e >= p) {
350 if (*e == *needle) {
351 return (e - p + (offset > 0 ? offset : 0));
352 }
353 e--;
354 }
355 return -1;
356 }
357
358 while (e >= p) {
359 if (memcmp(e, needle, needle_len) == 0) {
360 return (e - p + (offset > 0 ? offset : 0));
361 }
362 e--;
363 }
364
365 return -1;
366 }
367
368 /* }}} */
369
370 /* {{{ grapheme_get_break_iterator: get a clone of the global character break iterator */
grapheme_get_break_iterator(void * stack_buffer,UErrorCode * status)371 UBreakIterator* grapheme_get_break_iterator(void *stack_buffer, UErrorCode *status )
372 {
373 UBreakIterator *global_break_iterator = INTL_G( grapheme_iterator );
374
375 if ( NULL == global_break_iterator ) {
376
377 global_break_iterator = ubrk_open(UBRK_CHARACTER,
378 NULL, /* icu default locale - locale has no effect on this iterator */
379 NULL, /* text not set in global iterator */
380 0, /* text length = 0 */
381 status);
382
383 INTL_G(grapheme_iterator) = global_break_iterator;
384 }
385
386 #if U_ICU_VERSION_MAJOR_NUM >= 69
387 return ubrk_clone(global_break_iterator, status);
388 #else
389 int32_t buffer_size = U_BRK_SAFECLONE_BUFFERSIZE;
390
391 return ubrk_safeClone(global_break_iterator, stack_buffer, &buffer_size, status);
392 #endif
393 }
394 /* }}} */
395