1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Ed Batutis <ed@batutis.com> |
14 +----------------------------------------------------------------------+
15 */
16
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include "php_intl.h"
22 #if U_ICU_VERSION_MAJOR_NUM < 56
23 #include "unicode/unorm.h"
24 #else
25 #include <unicode/unorm2.h>
26 #endif
27 #include "normalizer.h"
28 #include "normalizer_class.h"
29 #include "normalizer_normalize.h"
30 #include "intl_convert.h"
31 #include <unicode/utf8.h>
32
33
34 #if U_ICU_VERSION_MAJOR_NUM >= 56
intl_get_normalizer(zend_long form,UErrorCode * err)35 static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
36 {/*{{{*/
37 switch (form)
38 {
39 case NORMALIZER_FORM_C:
40 return unorm2_getNFCInstance(err);
41 break;
42 case NORMALIZER_FORM_D:
43 return unorm2_getNFDInstance(err);
44 break;
45 case NORMALIZER_FORM_KC:
46 return unorm2_getNFKCInstance(err);
47 break;
48 case NORMALIZER_FORM_KD:
49 return unorm2_getNFKDInstance(err);
50 break;
51 case NORMALIZER_FORM_KC_CF:
52 return unorm2_getNFKCCasefoldInstance(err);
53 break;
54 }
55
56 *err = U_ILLEGAL_ARGUMENT_ERROR;
57 return NULL;
58 }/*}}}*/
59
/* {{{ intl_normalize
 * Normalize a UTF-16 string through the UNormalizer2 API (ICU >= 56).
 *
 * form:    one of the NORMALIZER_* constants.
 * src:     input, src_len UTF-16 code units long.
 * dst:     output buffer with a capacity of dst_len UTF-16 code units.
 * err:     receives the ICU status of the operation.
 *
 * For NORMALIZER_NONE the input is copied verbatim (after raising
 * E_DEPRECATED) and the function follows the usual ICU string-output
 * convention:
 *   - dst_len >  src_len: copy, NUL-terminate, U_ZERO_ERROR;
 *   - dst_len == src_len: copy, no terminator, U_STRING_NOT_TERMINATED_WARNING;
 *   - dst_len <  src_len: nothing copied, U_BUFFER_OVERFLOW_ERROR.
 * In all three cases the return value is src_len, i.e. the length of the
 * (would-be) result, so callers can size a retry buffer from it.
 *
 * For every other form the result of unorm2_normalize() is returned, or -1
 * with *err set when no normalizer could be obtained for form.
 */
static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
{/*{{{*/
	const UNormalizer2 *norm;

	/* Mimic the behavior of ICU < 56. */
	if (UNEXPECTED(NORMALIZER_NONE == form)) {
		/* FIXME This is a noop which should be removed somewhen after PHP 7.3.*/
		zend_error(E_DEPRECATED, "Normalizer::NONE is obsolete with ICU 56 and above and will be removed in later PHP versions");

		if (dst_len < src_len) {
			/* Report the required length, not -1, so that a caller's
			 * "needed > capacity" retry logic can act on it. */
			*err = U_BUFFER_OVERFLOW_ERROR;
			return src_len;
		}

		memmove(dst, src, sizeof(UChar) * src_len);

		if (dst_len > src_len) {
			dst[src_len] = 0;
			*err = U_ZERO_ERROR;
		} else {
			/* Exactly full: writing a terminator would land one unit past
			 * the stated capacity, so signal the missing NUL instead. */
			*err = U_STRING_NOT_TERMINATED_WARNING;
		}

		return src_len;
	}

	norm = intl_get_normalizer(form, err);
	if (U_FAILURE(*err)) {
		return -1;
	}

	return unorm2_normalize(norm, src, src_len, dst, dst_len, err);
}/*}}}*/
87
intl_is_normalized(zend_long form,const UChar * uinput,int32_t uinput_len,UErrorCode * err)88 static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
89 {/*{{{*/
90 const UNormalizer2 *norm = intl_get_normalizer(form, err);
91
92 if(U_FAILURE(*err)) {
93 return FALSE;
94 }
95
96 return unorm2_isNormalized(norm, uinput, uinput_len, err);
97 }/*}}}*/
98 #endif
99
/* {{{ proto string Normalizer::normalize( string $input [, int $form = FORM_C] )
 * Normalize a string. }}} */
/* {{{ proto string normalizer_normalize( string $input [, int $form = FORM_C] )
 * Normalize a string.
 */
PHP_FUNCTION(normalizer_normalize)105 PHP_FUNCTION( normalizer_normalize )
106 {
107 char* input = NULL;
108 /* form is optional, defaults to FORM_C */
109 zend_long form = NORMALIZER_DEFAULT;
110 size_t input_len = 0;
111
112 UChar* uinput = NULL;
113 int32_t uinput_len = 0;
114 int expansion_factor = 1;
115 UErrorCode status = U_ZERO_ERROR;
116
117 UChar* uret_buf = NULL;
118 int32_t uret_len = 0;
119
120 zend_string* u8str;
121
122 int32_t size_needed;
123
124 intl_error_reset( NULL );
125
126 /* Parse parameters. */
127 if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
128 &input, &input_len, &form ) == FAILURE )
129 {
130 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
131 "normalizer_normalize: unable to parse input params", 0 );
132
133 RETURN_FALSE;
134 }
135
136 expansion_factor = 1;
137
138 switch(form) {
139 case NORMALIZER_NONE:
140 break;
141 case NORMALIZER_FORM_D:
142 expansion_factor = 3;
143 break;
144 case NORMALIZER_FORM_KD:
145 expansion_factor = 3;
146 break;
147 case NORMALIZER_FORM_C:
148 case NORMALIZER_FORM_KC:
149 #if U_ICU_VERSION_MAJOR_NUM >= 56
150 case NORMALIZER_FORM_KC_CF:
151 #endif
152 break;
153 default:
154 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
155 "normalizer_normalize: illegal normalization form", 0 );
156 RETURN_FALSE;
157 }
158
159 /*
160 * Normalize string (converting it to UTF-16 first).
161 */
162
163 /* First convert the string to UTF-16. */
164 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
165
166 if( U_FAILURE( status ) )
167 {
168 /* Set global error code. */
169 intl_error_set_code( NULL, status );
170
171 /* Set error messages. */
172 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
173 if (uinput) {
174 efree( uinput );
175 }
176 RETURN_FALSE;
177 }
178
179
180 /* Allocate memory for the destination buffer for normalization */
181 uret_len = uinput_len * expansion_factor;
182 uret_buf = eumalloc( uret_len + 1 );
183
184 /* normalize */
185 #if U_ICU_VERSION_MAJOR_NUM < 56
186 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
187 #else
188 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
189 #endif
190
191 /* Bail out if an unexpected error occurred.
192 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
193 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
194 */
195 if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
196 efree( uret_buf );
197 efree( uinput );
198 RETURN_NULL();
199 }
200
201 if ( size_needed > uret_len ) {
202 /* realloc does not seem to work properly - memory is corrupted
203 * uret_buf = eurealloc(uret_buf, size_needed + 1);
204 */
205 efree( uret_buf );
206 uret_buf = eumalloc( size_needed + 1 );
207 uret_len = size_needed;
208
209 status = U_ZERO_ERROR;
210
211 /* try normalize again */
212 #if U_ICU_VERSION_MAJOR_NUM < 56
213 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
214 #else
215 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
216 #endif
217
218 /* Bail out if an unexpected error occurred. */
219 if( U_FAILURE(status) ) {
220 /* Set error messages. */
221 intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
222 efree( uret_buf );
223 efree( uinput );
224 RETURN_FALSE;
225 }
226 }
227
228 efree( uinput );
229
230 /* the buffer we actually used */
231 uret_len = size_needed;
232
233 /* Convert normalized string from UTF-16 to UTF-8. */
234 u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
235 efree( uret_buf );
236 if( !u8str )
237 {
238 intl_error_set( NULL, status,
239 "normalizer_normalize: error converting normalized text UTF-8", 0 );
240 RETURN_FALSE;
241 }
242
243 /* Return it. */
244 RETVAL_NEW_STR( u8str );
245 }
246 /* }}} */
247
/* {{{ proto bool Normalizer::isNormalized( string $input [, int $form = FORM_C] )
 * Test if a string is in a given normalization form. }}} */
/* {{{ proto bool normalizer_is_normalized( string $input [, int $form = FORM_C] )
 * Test if a string is in a given normalization form.
 */
PHP_FUNCTION(normalizer_is_normalized)253 PHP_FUNCTION( normalizer_is_normalized )
254 {
255 char* input = NULL;
256 /* form is optional, defaults to FORM_C */
257 zend_long form = NORMALIZER_DEFAULT;
258 size_t input_len = 0;
259
260 UChar* uinput = NULL;
261 int uinput_len = 0;
262 UErrorCode status = U_ZERO_ERROR;
263
264 UBool uret = FALSE;
265
266 intl_error_reset( NULL );
267
268 /* Parse parameters. */
269 if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
270 &input, &input_len, &form) == FAILURE )
271 {
272 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
273 "normalizer_is_normalized: unable to parse input params", 0 );
274
275 RETURN_FALSE;
276 }
277
278 switch(form) {
279 /* case NORMALIZER_NONE: not allowed - doesn't make sense */
280
281 case NORMALIZER_FORM_D:
282 case NORMALIZER_FORM_KD:
283 case NORMALIZER_FORM_C:
284 case NORMALIZER_FORM_KC:
285 #if U_ICU_VERSION_MAJOR_NUM >= 56
286 case NORMALIZER_FORM_KC_CF:
287 #endif
288 break;
289 default:
290 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
291 "normalizer_normalize: illegal normalization form", 0 );
292 RETURN_FALSE;
293 }
294
295
296 /*
297 * Test normalization of string (converting it to UTF-16 first).
298 */
299
300 /* First convert the string to UTF-16. */
301 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
302
303 if( U_FAILURE( status ) )
304 {
305 /* Set global error code. */
306 intl_error_set_code( NULL, status );
307
308 /* Set error messages. */
309 intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
310 if (uinput) {
311 efree( uinput );
312 }
313 RETURN_FALSE;
314 }
315
316
317 /* test string */
318 #if U_ICU_VERSION_MAJOR_NUM < 56
319 uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
320 #else
321 uret = intl_is_normalized(form, uinput, uinput_len, &status);
322 #endif
323
324 efree( uinput );
325
326 /* Bail out if an unexpected error occurred. */
327 if( U_FAILURE(status) ) {
328 /* Set error messages. */
329 intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
330 RETURN_FALSE;
331 }
332
333 if ( uret )
334 RETURN_TRUE;
335
336 RETURN_FALSE;
337 }
338 /* }}} */
339
/* {{{ proto string|null Normalizer::getRawDecomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point. }}} */
/* {{{ proto string|null normalizer_get_raw_decomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point.
 */
345 #if U_ICU_VERSION_MAJOR_NUM >= 56
PHP_FUNCTION(normalizer_get_raw_decomposition)346 PHP_FUNCTION( normalizer_get_raw_decomposition )
347 {
348 char* input = NULL;
349 size_t input_length = 0;
350
351 UChar32 codepoint = -1;
352 int32_t offset = 0;
353
354 UErrorCode status = U_ZERO_ERROR;
355 const UNormalizer2 *norm;
356 UChar decomposition[32];
357 int32_t decomposition_length;
358
359 zend_long form = NORMALIZER_DEFAULT;
360
361 intl_error_reset(NULL);
362
363 if ((zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &input, &input_length, &form) == FAILURE)) {
364 return;
365 }
366
367 norm = intl_get_normalizer(form, &status);
368
369 U8_NEXT(input, offset, input_length, codepoint);
370 if ((size_t)offset != input_length) {
371 intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
372 intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
373 return;
374 }
375
376 if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
377 intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
378 intl_error_set_custom_msg(NULL, "Code point out of range", 0);
379 return;
380 }
381
382 decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition, 32, &status);
383 if (decomposition_length == -1) {
384 RETURN_NULL();
385 }
386
387 RETVAL_NEW_STR(intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status));
388 }
389 #endif
390 /* }}} */
391