1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Ed Batutis <ed@batutis.com> |
14 +----------------------------------------------------------------------+
15 */
16
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include "php_intl.h"
22 #if U_ICU_VERSION_MAJOR_NUM < 56
23 #include "unicode/unorm.h"
24 #else
25 #include <unicode/unorm2.h>
26 #endif
27 #include "normalizer.h"
28 #include "normalizer_class.h"
29 #include "normalizer_normalize.h"
30 #include "intl_convert.h"
31 #if U_ICU_VERSION_MAJOR_NUM >= 49
32 #include <unicode/utf8.h>
33 #endif
34
35
36 #if U_ICU_VERSION_MAJOR_NUM >= 56
intl_get_normalizer(zend_long form,UErrorCode * err)37 static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
38 {/*{{{*/
39 switch (form)
40 {
41 case NORMALIZER_FORM_C:
42 return unorm2_getNFCInstance(err);
43 break;
44 case NORMALIZER_FORM_D:
45 return unorm2_getNFDInstance(err);
46 break;
47 case NORMALIZER_FORM_KC:
48 return unorm2_getNFKCInstance(err);
49 break;
50 case NORMALIZER_FORM_KD:
51 return unorm2_getNFKDInstance(err);
52 break;
53 case NORMALIZER_FORM_KC_CF:
54 return unorm2_getNFKCCasefoldInstance(err);
55 break;
56 }
57
58 *err = U_ILLEGAL_ARGUMENT_ERROR;
59 return NULL;
60 }/*}}}*/
61
/*
 * Normalize `src` (src_len UTF-16 code units) into `dst` using form `form`.
 *
 * dst_len is the capacity of `dst` in code units.
 *
 * NORMALIZER_NONE is handled locally as a plain copy, following the ICU
 * buffer conventions:
 *   - dst too small:        *err = U_BUFFER_OVERFLOW_ERROR, returns the
 *                           required length so the caller can retry;
 *   - dst exactly full:     copy without terminator,
 *                           *err = U_STRING_NOT_TERMINATED_WARNING;
 *   - dst has spare room:   copy plus NUL terminator, *err = U_ZERO_ERROR.
 *
 * For every other form the work is delegated to unorm2_normalize() and its
 * result is returned unchanged. Returns -1 when no normalizer can be obtained
 * for `form` (*err then holds the failure code).
 */
static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
{/*{{{*/
	const UNormalizer2 *norm;

	/* Mimic the behavior of ICU < 56. */
	if (UNEXPECTED(NORMALIZER_NONE == form)) {
		/* FIXME This is a noop which should be removed somewhen after PHP 7.3.*/
		zend_error(E_DEPRECATED, "Normalizer::NONE is obsolete with ICU 56 and above and will be removed in later PHP versions");

		if (src_len > dst_len) {
			/* Report the size needed rather than -1: callers compare the
			 * return value against their buffer length to decide to retry. */
			*err = U_BUFFER_OVERFLOW_ERROR;
			return src_len;
		}

		memmove(dst, src, sizeof(UChar) * src_len);

		if (src_len < dst_len) {
			dst[src_len] = '\0';
			*err = U_ZERO_ERROR;
		} else {
			/* dst[src_len] would lie outside the dst_len-sized buffer. */
			*err = U_STRING_NOT_TERMINATED_WARNING;
		}

		return src_len;
	}

	norm = intl_get_normalizer(form, err);
	if (U_FAILURE(*err)) {
		return -1;
	}

	return unorm2_normalize(norm, src, src_len, dst, dst_len, err);
}/*}}}*/
89
intl_is_normalized(zend_long form,const UChar * uinput,int32_t uinput_len,UErrorCode * err)90 static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
91 {/*{{{*/
92 const UNormalizer2 *norm = intl_get_normalizer(form, err);
93
94 if(U_FAILURE(*err)) {
95 return FALSE;
96 }
97
98 return unorm2_isNormalized(norm, uinput, uinput_len, err);
99 }/*}}}*/
100 #endif
101
/* {{{ proto string Normalizer::normalize( string $input [, int $form = FORM_C] )
 * Normalize a string. }}} */
/* {{{ proto string normalizer_normalize( string $input [, int $form = FORM_C] )
 * Normalize a string.
 */
PHP_FUNCTION(normalizer_normalize)107 PHP_FUNCTION( normalizer_normalize )
108 {
109 char* input = NULL;
110 /* form is optional, defaults to FORM_C */
111 zend_long form = NORMALIZER_DEFAULT;
112 size_t input_len = 0;
113
114 UChar* uinput = NULL;
115 int32_t uinput_len = 0;
116 int expansion_factor = 1;
117 UErrorCode status = U_ZERO_ERROR;
118
119 UChar* uret_buf = NULL;
120 int32_t uret_len = 0;
121
122 zend_string* u8str;
123
124 int32_t size_needed;
125
126 intl_error_reset( NULL );
127
128 /* Parse parameters. */
129 if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
130 &input, &input_len, &form ) == FAILURE )
131 {
132 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
133 "normalizer_normalize: unable to parse input params", 0 );
134
135 RETURN_FALSE;
136 }
137
138 expansion_factor = 1;
139
140 switch(form) {
141 case NORMALIZER_NONE:
142 break;
143 case NORMALIZER_FORM_D:
144 expansion_factor = 3;
145 break;
146 case NORMALIZER_FORM_KD:
147 expansion_factor = 3;
148 break;
149 case NORMALIZER_FORM_C:
150 case NORMALIZER_FORM_KC:
151 #if U_ICU_VERSION_MAJOR_NUM >= 56
152 case NORMALIZER_FORM_KC_CF:
153 #endif
154 break;
155 default:
156 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
157 "normalizer_normalize: illegal normalization form", 0 );
158 RETURN_FALSE;
159 }
160
161 /*
162 * Normalize string (converting it to UTF-16 first).
163 */
164
165 /* First convert the string to UTF-16. */
166 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
167
168 if( U_FAILURE( status ) )
169 {
170 /* Set global error code. */
171 intl_error_set_code( NULL, status );
172
173 /* Set error messages. */
174 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
175 if (uinput) {
176 efree( uinput );
177 }
178 RETURN_FALSE;
179 }
180
181
182 /* Allocate memory for the destination buffer for normalization */
183 uret_len = uinput_len * expansion_factor;
184 uret_buf = eumalloc( uret_len + 1 );
185
186 /* normalize */
187 #if U_ICU_VERSION_MAJOR_NUM < 56
188 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
189 #else
190 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
191 #endif
192
193 /* Bail out if an unexpected error occurred.
194 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
195 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
196 */
197 if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
198 efree( uret_buf );
199 efree( uinput );
200 RETURN_NULL();
201 }
202
203 if ( size_needed > uret_len ) {
204 /* realloc does not seem to work properly - memory is corrupted
205 * uret_buf = eurealloc(uret_buf, size_needed + 1);
206 */
207 efree( uret_buf );
208 uret_buf = eumalloc( size_needed + 1 );
209 uret_len = size_needed;
210
211 status = U_ZERO_ERROR;
212
213 /* try normalize again */
214 #if U_ICU_VERSION_MAJOR_NUM < 56
215 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
216 #else
217 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
218 #endif
219
220 /* Bail out if an unexpected error occurred. */
221 if( U_FAILURE(status) ) {
222 /* Set error messages. */
223 intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
224 efree( uret_buf );
225 efree( uinput );
226 RETURN_FALSE;
227 }
228 }
229
230 efree( uinput );
231
232 /* the buffer we actually used */
233 uret_len = size_needed;
234
235 /* Convert normalized string from UTF-16 to UTF-8. */
236 u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
237 efree( uret_buf );
238 if( !u8str )
239 {
240 intl_error_set( NULL, status,
241 "normalizer_normalize: error converting normalized text UTF-8", 0 );
242 RETURN_FALSE;
243 }
244
245 /* Return it. */
246 RETVAL_NEW_STR( u8str );
247 }
248 /* }}} */
249
/* {{{ proto bool Normalizer::isNormalized( string $input [, int $form = FORM_C] )
 * Test if a string is in a given normalization form. }}} */
/* {{{ proto bool normalizer_is_normalized( string $input [, int $form = FORM_C] )
 * Test if a string is in a given normalization form.
 */
PHP_FUNCTION(normalizer_is_normalized)255 PHP_FUNCTION( normalizer_is_normalized )
256 {
257 char* input = NULL;
258 /* form is optional, defaults to FORM_C */
259 zend_long form = NORMALIZER_DEFAULT;
260 size_t input_len = 0;
261
262 UChar* uinput = NULL;
263 int uinput_len = 0;
264 UErrorCode status = U_ZERO_ERROR;
265
266 UBool uret = FALSE;
267
268 intl_error_reset( NULL );
269
270 /* Parse parameters. */
271 if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
272 &input, &input_len, &form) == FAILURE )
273 {
274 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
275 "normalizer_is_normalized: unable to parse input params", 0 );
276
277 RETURN_FALSE;
278 }
279
280 switch(form) {
281 /* case NORMALIZER_NONE: not allowed - doesn't make sense */
282
283 case NORMALIZER_FORM_D:
284 case NORMALIZER_FORM_KD:
285 case NORMALIZER_FORM_C:
286 case NORMALIZER_FORM_KC:
287 #if U_ICU_VERSION_MAJOR_NUM >= 56
288 case NORMALIZER_FORM_KC_CF:
289 #endif
290 break;
291 default:
292 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
293 "normalizer_normalize: illegal normalization form", 0 );
294 RETURN_FALSE;
295 }
296
297
298 /*
299 * Test normalization of string (converting it to UTF-16 first).
300 */
301
302 /* First convert the string to UTF-16. */
303 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
304
305 if( U_FAILURE( status ) )
306 {
307 /* Set global error code. */
308 intl_error_set_code( NULL, status );
309
310 /* Set error messages. */
311 intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
312 if (uinput) {
313 efree( uinput );
314 }
315 RETURN_FALSE;
316 }
317
318
319 /* test string */
320 #if U_ICU_VERSION_MAJOR_NUM < 56
321 uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
322 #else
323 uret = intl_is_normalized(form, uinput, uinput_len, &status);
324 #endif
325
326 efree( uinput );
327
328 /* Bail out if an unexpected error occurred. */
329 if( U_FAILURE(status) ) {
330 /* Set error messages. */
331 intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
332 RETURN_FALSE;
333 }
334
335 if ( uret )
336 RETURN_TRUE;
337
338 RETURN_FALSE;
339 }
340 /* }}} */
341
/* {{{ proto string|null Normalizer::getRawDecomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point. }}} */
/* {{{ proto string|null normalizer_get_raw_decomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point.
 */
347 #if U_ICU_VERSION_MAJOR_NUM >= 56
PHP_FUNCTION(normalizer_get_raw_decomposition)348 PHP_FUNCTION( normalizer_get_raw_decomposition )
349 {
350 char* input = NULL;
351 size_t input_length = 0;
352
353 UChar32 codepoint = -1;
354 int32_t offset = 0;
355
356 UErrorCode status = U_ZERO_ERROR;
357 const UNormalizer2 *norm;
358 UChar decomposition[32];
359 int32_t decomposition_length;
360
361 zend_long form = NORMALIZER_DEFAULT;
362
363 intl_error_reset(NULL);
364
365 if ((zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &input, &input_length, &form) == FAILURE)) {
366 return;
367 }
368
369 norm = intl_get_normalizer(form, &status);
370
371 U8_NEXT(input, offset, input_length, codepoint);
372 if ((size_t)offset != input_length) {
373 intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
374 intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
375 return;
376 }
377
378 if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
379 intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
380 intl_error_set_custom_msg(NULL, "Code point out of range", 0);
381 return;
382 }
383
384 decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition, 32, &status);
385 if (decomposition_length == -1) {
386 RETURN_NULL();
387 }
388
389 RETVAL_NEW_STR(intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status));
390 }
391 #endif
392 /* }}} */
393
394 /*
395 * Local variables:
396 * tab-width: 4
397 * c-basic-offset: 4
398 * End:
399 * vim600: noet sw=4 ts=4 fdm=marker
400 * vim<600: noet sw=4 ts=4
401 */
402