1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Authors: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php_intl.h"
22 #if U_ICU_VERSION_MAJOR_NUM < 56
23 #include "unicode/unorm.h"
24 #else
25 #include <unicode/unorm2.h>
26 #endif
27 #include "normalizer.h"
28 #include "normalizer_class.h"
29 #include "normalizer_normalize.h"
30 #include "intl_convert.h"
31 #if U_ICU_VERSION_MAJOR_NUM >= 49
32 #include <unicode/utf8.h>
33 #endif
34 
35 
36 #if U_ICU_VERSION_MAJOR_NUM >= 56
intl_get_normalizer(zend_long form,UErrorCode * err)37 static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
38 {/*{{{*/
39 	switch (form)
40 	{
41 		case NORMALIZER_FORM_C:
42 			return unorm2_getNFCInstance(err);
43 			break;
44 		case NORMALIZER_FORM_D:
45 			return unorm2_getNFDInstance(err);
46 			break;
47 		case NORMALIZER_FORM_KC:
48 			return unorm2_getNFKCInstance(err);
49 			break;
50 		case NORMALIZER_FORM_KD:
51 			return unorm2_getNFKDInstance(err);
52 			break;
53 		case NORMALIZER_FORM_KC_CF:
54 			return unorm2_getNFKCCasefoldInstance(err);
55 			break;
56 	}
57 
58 	*err = U_ILLEGAL_ARGUMENT_ERROR;
59 	return NULL;
60 }/*}}}*/
61 
/*
 * Normalize src_len UTF-16 code units from src into dst.
 *
 * form:    NORMALIZER_NONE or any form accepted by intl_get_normalizer().
 * dst_len: capacity of dst in UChar units.
 * err:     receives the ICU status of the operation.
 *
 * Returns the number of code units the result occupies (or would occupy when
 * *err is U_BUFFER_OVERFLOW_ERROR, so that the caller can enlarge dst and
 * retry), or -1 when no normalizer could be obtained for form.
 */
static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
{/*{{{*/
	const UNormalizer2 *norm;

	/* Mimic the behavior of ICU < 56. */
	if (UNEXPECTED(NORMALIZER_NONE == form)) {
		/* FIXME This is a noop which should be removed sometime after PHP 7.3.*/
		zend_error(E_DEPRECATED, "Normalizer::NONE is obsolete with ICU 56 and above and will be removed in later PHP versions");

		if (src_len > dst_len) {
			/* Report the required length rather than -1: normalizer_normalize()
			 * compares the return value against its buffer size to decide
			 * whether to reallocate and try again. */
			*err = U_BUFFER_OVERFLOW_ERROR;
			return src_len;
		}

		memmove(dst, src, sizeof(UChar) * src_len);

		if (src_len < dst_len) {
			dst[src_len] = 0;
			*err = U_ZERO_ERROR;
		} else {
			/* Exact fit: there is no room for a terminator inside the stated
			 * capacity, so do not write past it; signal it the way ICU does. */
			*err = U_STRING_NOT_TERMINATED_WARNING;
		}

		return src_len;
	}

	norm = intl_get_normalizer(form, err);
	if(U_FAILURE(*err)) {
		return -1;
	}

	return unorm2_normalize(norm, src, src_len, dst, dst_len, err);
}/*}}}*/
89 
intl_is_normalized(zend_long form,const UChar * uinput,int32_t uinput_len,UErrorCode * err)90 static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
91 {/*{{{*/
92 	const UNormalizer2 *norm = intl_get_normalizer(form, err);
93 
94 	if(U_FAILURE(*err)) {
95 		return FALSE;
96 	}
97 
98 	return unorm2_isNormalized(norm, uinput, uinput_len, err);
99 }/*}}}*/
100 #endif
101 
/* {{{ proto string Normalizer::normalize( string $input [, int $form = FORM_C] )
 * Normalize a string. }}} */
/* {{{ proto string normalizer_normalize( string $input [, int $form = FORM_C] )
 * Normalize a string.
 */
PHP_FUNCTION(normalizer_normalize)107 PHP_FUNCTION( normalizer_normalize )
108 {
109 	char*			input = NULL;
110 	/* form is optional, defaults to FORM_C */
111 	zend_long	    form = NORMALIZER_DEFAULT;
112 	size_t			input_len = 0;
113 
114 	UChar*			uinput = NULL;
115 	int32_t		    uinput_len = 0;
116 	int			    expansion_factor = 1;
117 	UErrorCode		status = U_ZERO_ERROR;
118 
119 	UChar*			uret_buf = NULL;
120 	int32_t			uret_len = 0;
121 
122 	zend_string*    u8str;
123 
124 	int32_t			size_needed;
125 
126 	intl_error_reset( NULL );
127 
128 	/* Parse parameters. */
129 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
130 				&input, &input_len, &form ) == FAILURE )
131 	{
132 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
133 						 "normalizer_normalize: unable to parse input params", 0 );
134 
135 		RETURN_FALSE;
136 	}
137 
138 	expansion_factor = 1;
139 
140 	switch(form) {
141 		case NORMALIZER_NONE:
142 			break;
143 		case NORMALIZER_FORM_D:
144 			expansion_factor = 3;
145 			break;
146 		case NORMALIZER_FORM_KD:
147 			expansion_factor = 3;
148 			break;
149 		case NORMALIZER_FORM_C:
150 		case NORMALIZER_FORM_KC:
151 #if U_ICU_VERSION_MAJOR_NUM >= 56
152 		case NORMALIZER_FORM_KC_CF:
153 #endif
154 			break;
155 		default:
156 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
157 						"normalizer_normalize: illegal normalization form", 0 );
158 			RETURN_FALSE;
159 	}
160 
161 	/*
162 	 * Normalize string (converting it to UTF-16 first).
163 	 */
164 
165 	/* First convert the string to UTF-16. */
166 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
167 
168 	if( U_FAILURE( status ) )
169 	{
170 		/* Set global error code. */
171 		intl_error_set_code( NULL, status );
172 
173 		/* Set error messages. */
174 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
175 		if (uinput) {
176 			efree( uinput );
177 		}
178 		RETURN_FALSE;
179 	}
180 
181 
182 	/* Allocate memory for the destination buffer for normalization */
183 	uret_len = uinput_len * expansion_factor;
184 	uret_buf = eumalloc( uret_len + 1 );
185 
186 	/* normalize */
187 #if U_ICU_VERSION_MAJOR_NUM < 56
188 	size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
189 #else
190 	size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
191 #endif
192 
193 	/* Bail out if an unexpected error occurred.
194 	 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
195 	 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
196 	 */
197 	if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
198 		efree( uret_buf );
199 		efree( uinput );
200 		RETURN_NULL();
201 	}
202 
203 	if ( size_needed > uret_len ) {
204 		/* realloc does not seem to work properly - memory is corrupted
205 		 * uret_buf =  eurealloc(uret_buf, size_needed + 1);
206 		 */
207 		efree( uret_buf );
208 		uret_buf = eumalloc( size_needed + 1 );
209 		uret_len = size_needed;
210 
211 		status = U_ZERO_ERROR;
212 
213 		/* try normalize again */
214 #if U_ICU_VERSION_MAJOR_NUM < 56
215 		size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
216 #else
217 		size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
218 #endif
219 
220 		/* Bail out if an unexpected error occurred. */
221 		if( U_FAILURE(status)  ) {
222 			/* Set error messages. */
223 			intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
224 			efree( uret_buf );
225 			efree( uinput );
226 			RETURN_FALSE;
227 		}
228 	}
229 
230 	efree( uinput );
231 
232 	/* the buffer we actually used */
233 	uret_len = size_needed;
234 
235 	/* Convert normalized string from UTF-16 to UTF-8. */
236 	u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
237 	efree( uret_buf );
238 	if( !u8str )
239 	{
240 		intl_error_set( NULL, status,
241 				"normalizer_normalize: error converting normalized text UTF-8", 0 );
242 		RETURN_FALSE;
243 	}
244 
245 	/* Return it. */
246 	RETVAL_NEW_STR( u8str );
247 }
248 /* }}} */
249 
/* {{{ proto bool Normalizer::isNormalized( string $input [, int $form = FORM_C] )
 * Test if a string is in a given normalization form. }}} */
/* {{{ proto bool normalizer_is_normalized( string $input [, int $form = FORM_C] )
 * Test if a string is in a given normalization form.
 */
PHP_FUNCTION(normalizer_is_normalized)255 PHP_FUNCTION( normalizer_is_normalized )
256 {
257 	char*	 	input = NULL;
258 	/* form is optional, defaults to FORM_C */
259 	zend_long		form = NORMALIZER_DEFAULT;
260 	size_t		input_len = 0;
261 
262 	UChar*	 	uinput = NULL;
263 	int		uinput_len = 0;
264 	UErrorCode	status = U_ZERO_ERROR;
265 
266 	UBool		uret = FALSE;
267 
268 	intl_error_reset( NULL );
269 
270 	/* Parse parameters. */
271 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
272 				&input, &input_len, &form) == FAILURE )
273 	{
274 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
275 				"normalizer_is_normalized: unable to parse input params", 0 );
276 
277 		RETURN_FALSE;
278 	}
279 
280 	switch(form) {
281 		/* case NORMALIZER_NONE: not allowed - doesn't make sense */
282 
283 		case NORMALIZER_FORM_D:
284 		case NORMALIZER_FORM_KD:
285 		case NORMALIZER_FORM_C:
286 		case NORMALIZER_FORM_KC:
287 #if U_ICU_VERSION_MAJOR_NUM >= 56
288 		case NORMALIZER_FORM_KC_CF:
289 #endif
290 			break;
291 		default:
292 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
293 						"normalizer_normalize: illegal normalization form", 0 );
294 			RETURN_FALSE;
295 	}
296 
297 
298 	/*
299 	 * Test normalization of string (converting it to UTF-16 first).
300 	 */
301 
302 	/* First convert the string to UTF-16. */
303 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
304 
305 	if( U_FAILURE( status ) )
306 	{
307 		/* Set global error code. */
308 		intl_error_set_code( NULL, status );
309 
310 		/* Set error messages. */
311 		intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
312 		if (uinput) {
313 			efree( uinput );
314 		}
315 		RETURN_FALSE;
316 	}
317 
318 
319 	/* test string */
320 #if U_ICU_VERSION_MAJOR_NUM < 56
321 	uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
322 #else
323 	uret = intl_is_normalized(form, uinput, uinput_len, &status);
324 #endif
325 
326 	efree( uinput );
327 
328 	/* Bail out if an unexpected error occurred. */
329 	if( U_FAILURE(status)  ) {
330 		/* Set error messages. */
331 		intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
332 		RETURN_FALSE;
333 	}
334 
335 	if ( uret )
336 		RETURN_TRUE;
337 
338 	RETURN_FALSE;
339 }
340 /* }}} */
341 
/* {{{ proto string|null Normalizer::getRawDecomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point. }}} */
/* {{{ proto string|null normalizer_get_raw_decomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point.
 */
347 #if U_ICU_VERSION_MAJOR_NUM >= 56
PHP_FUNCTION(normalizer_get_raw_decomposition)348 PHP_FUNCTION( normalizer_get_raw_decomposition )
349 {
350 	char* input = NULL;
351 	size_t input_length = 0;
352 
353 	UChar32 codepoint = -1;
354 	int32_t offset = 0;
355 
356     UErrorCode status = U_ZERO_ERROR;
357     const UNormalizer2 *norm;
358     UChar decomposition[32];
359     int32_t decomposition_length;
360 
361 	zend_long form = NORMALIZER_DEFAULT;
362 
363 	intl_error_reset(NULL);
364 
365 	if ((zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &input, &input_length, &form) == FAILURE)) {
366 		return;
367 	}
368 
369 	norm = intl_get_normalizer(form, &status);
370 
371 	U8_NEXT(input, offset, input_length, codepoint);
372 	if ((size_t)offset != input_length) {
373 		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
374 		intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
375 		return;
376 	}
377 
378 	if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
379 		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
380 		intl_error_set_custom_msg(NULL, "Code point out of range", 0);
381 		return;
382 	}
383 
384 	decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition, 32, &status);
385 	if (decomposition_length == -1) {
386 		RETURN_NULL();
387 	}
388 
389 	RETVAL_NEW_STR(intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status));
390 }
391 #endif
392 /* }}} */
393 
394 /*
395  * Local variables:
396  * tab-width: 4
397  * c-basic-offset: 4
398  * End:
399  * vim600: noet sw=4 ts=4 fdm=marker
400  * vim<600: noet sw=4 ts=4
401  */
402