1 /*
2 +----------------------------------------------------------------------+
3 | This source file is subject to version 3.01 of the PHP license, |
4 | that is bundled with this package in the file LICENSE, and is |
5 | available through the world-wide-web at the following url: |
6 | http://www.php.net/license/3_01.txt |
7 | If you did not receive a copy of the PHP license and are unable to |
8 | obtain it through the world-wide-web, please send a note to |
9 | license@php.net so we can mail you a copy immediately. |
10 +----------------------------------------------------------------------+
11 | Authors: Ed Batutis <ed@batutis.com> |
12 +----------------------------------------------------------------------+
13 */
14
15 #ifdef HAVE_CONFIG_H
16 #include "config.h"
17 #endif
18
19 #include "php_intl.h"
20 #if U_ICU_VERSION_MAJOR_NUM < 56
21 #include "unicode/unorm.h"
22 #else
23 #include <unicode/unorm2.h>
24 #endif
25 #include "normalizer.h"
26 #include "normalizer_class.h"
27 #include "intl_convert.h"
28 #include <unicode/utf8.h>
29
30
31 #if U_ICU_VERSION_MAJOR_NUM >= 56
intl_get_normalizer(zend_long form,UErrorCode * err)32 static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
33 {/*{{{*/
34 switch (form)
35 {
36 case NORMALIZER_FORM_C:
37 return unorm2_getNFCInstance(err);
38 break;
39 case NORMALIZER_FORM_D:
40 return unorm2_getNFDInstance(err);
41 break;
42 case NORMALIZER_FORM_KC:
43 return unorm2_getNFKCInstance(err);
44 break;
45 case NORMALIZER_FORM_KD:
46 return unorm2_getNFKDInstance(err);
47 break;
48 case NORMALIZER_FORM_KC_CF:
49 return unorm2_getNFKCCasefoldInstance(err);
50 break;
51 }
52
53 *err = U_ILLEGAL_ARGUMENT_ERROR;
54 return NULL;
55 }/*}}}*/
56
/*
 * Normalize a UTF-16 buffer with the normalizer selected by `form`.
 *
 * src/src_len - input text and its length, passed through to
 *               unorm2_normalize().
 * dst/dst_len - destination buffer and its capacity, passed through likewise.
 * err         - in/out ICU status.
 *
 * Returns -1 if the normalizer lookup left *err in a failure state,
 * otherwise the value returned by unorm2_normalize().
 */
static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
{/*{{{*/
	const UNormalizer2 *normalizer = intl_get_normalizer(form, err);

	return U_FAILURE(*err)
		? -1
		: unorm2_normalize(normalizer, src, src_len, dst, dst_len, err);
}/*}}}*/
66
intl_is_normalized(zend_long form,const UChar * uinput,int32_t uinput_len,UErrorCode * err)67 static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
68 {/*{{{*/
69 const UNormalizer2 *norm = intl_get_normalizer(form, err);
70
71 if(U_FAILURE(*err)) {
72 return false;
73 }
74
75 return unorm2_isNormalized(norm, uinput, uinput_len, err);
76 }/*}}}*/
77 #endif
78
79 /* {{{ Normalize a string. */
PHP_FUNCTION(normalizer_normalize)80 PHP_FUNCTION( normalizer_normalize )
81 {
82 char* input = NULL;
83 /* form is optional, defaults to FORM_C */
84 zend_long form = NORMALIZER_DEFAULT;
85 size_t input_len = 0;
86
87 UChar* uinput = NULL;
88 int32_t uinput_len = 0;
89 int expansion_factor = 1;
90 UErrorCode status = U_ZERO_ERROR;
91
92 UChar* uret_buf = NULL;
93 int32_t uret_len = 0;
94
95 zend_string* u8str;
96
97 int32_t size_needed;
98
99 intl_error_reset( NULL );
100
101 /* Parse parameters. */
102 if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
103 &input, &input_len, &form ) == FAILURE )
104 {
105 RETURN_THROWS();
106 }
107
108 expansion_factor = 1;
109
110 switch(form) {
111 case NORMALIZER_FORM_D:
112 expansion_factor = 3;
113 break;
114 case NORMALIZER_FORM_KD:
115 expansion_factor = 3;
116 break;
117 case NORMALIZER_FORM_C:
118 case NORMALIZER_FORM_KC:
119 #if U_ICU_VERSION_MAJOR_NUM >= 56
120 case NORMALIZER_FORM_KC_CF:
121 #endif
122 break;
123 default:
124 zend_argument_value_error(2, "must be a a valid normalization form");
125 RETURN_THROWS();
126 }
127
128 /*
129 * Normalize string (converting it to UTF-16 first).
130 */
131
132 /* First convert the string to UTF-16. */
133 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
134
135 if( U_FAILURE( status ) )
136 {
137 /* Set global error code. */
138 intl_error_set_code( NULL, status );
139
140 /* Set error messages. */
141 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
142 if (uinput) {
143 efree( uinput );
144 }
145 RETURN_FALSE;
146 }
147
148
149 /* Allocate memory for the destination buffer for normalization */
150 uret_len = uinput_len * expansion_factor;
151 uret_buf = eumalloc( uret_len + 1 );
152
153 /* normalize */
154 #if U_ICU_VERSION_MAJOR_NUM < 56
155 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
156 #else
157 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
158 #endif
159
160 /* Bail out if an unexpected error occurred.
161 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
162 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
163 */
164 if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
165 efree( uret_buf );
166 efree( uinput );
167 RETURN_NULL();
168 }
169
170 if ( size_needed > uret_len ) {
171 /* realloc does not seem to work properly - memory is corrupted
172 * uret_buf = eurealloc(uret_buf, size_needed + 1);
173 */
174 efree( uret_buf );
175 uret_buf = eumalloc( size_needed + 1 );
176 uret_len = size_needed;
177
178 status = U_ZERO_ERROR;
179
180 /* try normalize again */
181 #if U_ICU_VERSION_MAJOR_NUM < 56
182 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
183 #else
184 size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
185 #endif
186
187 /* Bail out if an unexpected error occurred. */
188 if( U_FAILURE(status) ) {
189 /* Set error messages. */
190 intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
191 efree( uret_buf );
192 efree( uinput );
193 RETURN_FALSE;
194 }
195 }
196
197 efree( uinput );
198
199 /* the buffer we actually used */
200 uret_len = size_needed;
201
202 /* Convert normalized string from UTF-16 to UTF-8. */
203 u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
204 efree( uret_buf );
205 if( !u8str )
206 {
207 intl_error_set( NULL, status,
208 "normalizer_normalize: error converting normalized text UTF-8", 0 );
209 RETURN_FALSE;
210 }
211
212 /* Return it. */
213 RETVAL_NEW_STR( u8str );
214 }
215 /* }}} */
216
217 /* {{{ Test if a string is in a given normalization form. */
PHP_FUNCTION(normalizer_is_normalized)218 PHP_FUNCTION( normalizer_is_normalized )
219 {
220 char* input = NULL;
221 /* form is optional, defaults to FORM_C */
222 zend_long form = NORMALIZER_DEFAULT;
223 size_t input_len = 0;
224
225 UChar* uinput = NULL;
226 int uinput_len = 0;
227 UErrorCode status = U_ZERO_ERROR;
228
229 UBool uret = false;
230
231 intl_error_reset( NULL );
232
233 /* Parse parameters. */
234 if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
235 &input, &input_len, &form) == FAILURE )
236 {
237 RETURN_THROWS();
238 }
239
240 switch(form) {
241 case NORMALIZER_FORM_D:
242 case NORMALIZER_FORM_KD:
243 case NORMALIZER_FORM_C:
244 case NORMALIZER_FORM_KC:
245 #if U_ICU_VERSION_MAJOR_NUM >= 56
246 case NORMALIZER_FORM_KC_CF:
247 #endif
248 break;
249 default:
250 zend_argument_value_error(2, "must be a a valid normalization form");
251 RETURN_THROWS();
252 }
253
254
255 /*
256 * Test normalization of string (converting it to UTF-16 first).
257 */
258
259 /* First convert the string to UTF-16. */
260 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
261
262 if( U_FAILURE( status ) )
263 {
264 /* Set global error code. */
265 intl_error_set_code( NULL, status );
266
267 /* Set error messages. */
268 intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
269 if (uinput) {
270 efree( uinput );
271 }
272 RETURN_FALSE;
273 }
274
275
276 /* test string */
277 #if U_ICU_VERSION_MAJOR_NUM < 56
278 uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
279 #else
280 uret = intl_is_normalized(form, uinput, uinput_len, &status);
281 #endif
282
283 efree( uinput );
284
285 /* Bail out if an unexpected error occurred. */
286 if( U_FAILURE(status) ) {
287 /* Set error messages. */
288 intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
289 RETURN_FALSE;
290 }
291
292 if ( uret )
293 RETURN_TRUE;
294
295 RETURN_FALSE;
296 }
297 /* }}} */
298
299 /* {{{ Returns the Decomposition_Mapping property for the given UTF-8 encoded code point. */
300 #if U_ICU_VERSION_MAJOR_NUM >= 56
PHP_FUNCTION(normalizer_get_raw_decomposition)301 PHP_FUNCTION( normalizer_get_raw_decomposition )
302 {
303 char* input = NULL;
304 size_t input_length = 0;
305
306 UChar32 codepoint = -1;
307 int32_t offset = 0;
308
309 UErrorCode status = U_ZERO_ERROR;
310 const UNormalizer2 *norm;
311 UChar decomposition[32];
312 int32_t decomposition_length;
313
314 zend_long form = NORMALIZER_DEFAULT;
315
316 intl_error_reset(NULL);
317
318 if ((zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &input, &input_length, &form) == FAILURE)) {
319 RETURN_THROWS();
320 }
321
322 norm = intl_get_normalizer(form, &status);
323
324 U8_NEXT(input, offset, input_length, codepoint);
325 if ((size_t)offset != input_length) {
326 intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
327 intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
328 return;
329 }
330
331 if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
332 intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
333 intl_error_set_custom_msg(NULL, "Code point out of range", 0);
334 return;
335 }
336
337 decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition, 32, &status);
338 if (decomposition_length == -1) {
339 RETURN_NULL();
340 }
341
342 RETVAL_NEW_STR(intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status));
343 }
344 #endif
345 /* }}} */
346