1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Authors: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php_intl.h"
22 #if U_ICU_VERSION_MAJOR_NUM < 56
23 #include "unicode/unorm.h"
24 #else
25 #include <unicode/unorm2.h>
26 #endif
27 #include "normalizer.h"
28 #include "normalizer_class.h"
29 #include "normalizer_normalize.h"
30 #include "intl_convert.h"
31 #include <unicode/utf8.h>
32 
33 
34 #if U_ICU_VERSION_MAJOR_NUM >= 56
intl_get_normalizer(zend_long form,UErrorCode * err)35 static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
36 {/*{{{*/
37 	switch (form)
38 	{
39 		case NORMALIZER_FORM_C:
40 			return unorm2_getNFCInstance(err);
41 			break;
42 		case NORMALIZER_FORM_D:
43 			return unorm2_getNFDInstance(err);
44 			break;
45 		case NORMALIZER_FORM_KC:
46 			return unorm2_getNFKCInstance(err);
47 			break;
48 		case NORMALIZER_FORM_KD:
49 			return unorm2_getNFKDInstance(err);
50 			break;
51 		case NORMALIZER_FORM_KC_CF:
52 			return unorm2_getNFKCCasefoldInstance(err);
53 			break;
54 	}
55 
56 	*err = U_ILLEGAL_ARGUMENT_ERROR;
57 	return NULL;
58 }/*}}}*/
59 
/* Normalize src (src_len UTF-16 code units) into dst using the given form.
 *
 * Returns the value of unorm2_normalize() for the real normalization forms,
 * src_len for the NORMALIZER_NONE pass-through, or -1 with *err set when the
 * form is not supported or the pass-through buffer is too small.
 *
 * NOTE(review): the NORMALIZER_NONE branch stores a terminator at dst[src_len]
 * even when dst_len == src_len, so dst apparently has to provide room for
 * dst_len + 1 code units - confirm against callers. */
static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
{/*{{{*/
	const UNormalizer2 *normalizer;

	/* Mimic the behavior of ICU < 56. */
	if (UNEXPECTED(NORMALIZER_NONE == form)) {
		/* FIXME This is a noop which should be removed somewhen after PHP 7.3.*/
		zend_error(E_DEPRECATED, "Normalizer::NONE is obsolete with ICU 56 and above and will be removed in later PHP versions");

		/* Pass-through: reject a destination that cannot hold the input. */
		if (dst_len < src_len) {
			*err = U_BUFFER_OVERFLOW_ERROR;
			return -1;
		}

		/* Plain copy of the input followed by a terminator. */
		memmove(dst, src, sizeof(UChar) * src_len);
		dst[src_len] = '\0';
		*err = U_ZERO_ERROR;
		return src_len;
	}

	normalizer = intl_get_normalizer(form, err);
	if (U_FAILURE(*err)) {
		return -1;
	}

	return unorm2_normalize(normalizer, src, src_len, dst, dst_len, err);
}/*}}}*/
87 
intl_is_normalized(zend_long form,const UChar * uinput,int32_t uinput_len,UErrorCode * err)88 static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
89 {/*{{{*/
90 	const UNormalizer2 *norm = intl_get_normalizer(form, err);
91 
92 	if(U_FAILURE(*err)) {
93 		return FALSE;
94 	}
95 
96 	return unorm2_isNormalized(norm, uinput, uinput_len, err);
97 }/*}}}*/
98 #endif
99 
100 /* {{{ proto string Normalizer::normalize( string $input [, string $form = FORM_C] )
101  * Normalize a string. }}} */
102 /* {{{ proto string normalizer_normalize( string $input [, string $form = FORM_C] )
103  * Normalize a string.
104  */
PHP_FUNCTION(normalizer_normalize)105 PHP_FUNCTION( normalizer_normalize )
106 {
107 	char*			input = NULL;
108 	/* form is optional, defaults to FORM_C */
109 	zend_long	    form = NORMALIZER_DEFAULT;
110 	size_t			input_len = 0;
111 
112 	UChar*			uinput = NULL;
113 	int32_t		    uinput_len = 0;
114 	int			    expansion_factor = 1;
115 	UErrorCode		status = U_ZERO_ERROR;
116 
117 	UChar*			uret_buf = NULL;
118 	int32_t			uret_len = 0;
119 
120 	zend_string*    u8str;
121 
122 	int32_t			size_needed;
123 
124 	intl_error_reset( NULL );
125 
126 	/* Parse parameters. */
127 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
128 				&input, &input_len, &form ) == FAILURE )
129 	{
130 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
131 						 "normalizer_normalize: unable to parse input params", 0 );
132 
133 		RETURN_FALSE;
134 	}
135 
136 	expansion_factor = 1;
137 
138 	switch(form) {
139 		case NORMALIZER_NONE:
140 			break;
141 		case NORMALIZER_FORM_D:
142 			expansion_factor = 3;
143 			break;
144 		case NORMALIZER_FORM_KD:
145 			expansion_factor = 3;
146 			break;
147 		case NORMALIZER_FORM_C:
148 		case NORMALIZER_FORM_KC:
149 #if U_ICU_VERSION_MAJOR_NUM >= 56
150 		case NORMALIZER_FORM_KC_CF:
151 #endif
152 			break;
153 		default:
154 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
155 						"normalizer_normalize: illegal normalization form", 0 );
156 			RETURN_FALSE;
157 	}
158 
159 	/*
160 	 * Normalize string (converting it to UTF-16 first).
161 	 */
162 
163 	/* First convert the string to UTF-16. */
164 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
165 
166 	if( U_FAILURE( status ) )
167 	{
168 		/* Set global error code. */
169 		intl_error_set_code( NULL, status );
170 
171 		/* Set error messages. */
172 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
173 		if (uinput) {
174 			efree( uinput );
175 		}
176 		RETURN_FALSE;
177 	}
178 
179 
180 	/* Allocate memory for the destination buffer for normalization */
181 	uret_len = uinput_len * expansion_factor;
182 	uret_buf = eumalloc( uret_len + 1 );
183 
184 	/* normalize */
185 #if U_ICU_VERSION_MAJOR_NUM < 56
186 	size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
187 #else
188 	size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
189 #endif
190 
191 	/* Bail out if an unexpected error occurred.
192 	 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
193 	 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
194 	 */
195 	if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
196 		efree( uret_buf );
197 		efree( uinput );
198 		RETURN_NULL();
199 	}
200 
201 	if ( size_needed > uret_len ) {
202 		/* realloc does not seem to work properly - memory is corrupted
203 		 * uret_buf =  eurealloc(uret_buf, size_needed + 1);
204 		 */
205 		efree( uret_buf );
206 		uret_buf = eumalloc( size_needed + 1 );
207 		uret_len = size_needed;
208 
209 		status = U_ZERO_ERROR;
210 
211 		/* try normalize again */
212 #if U_ICU_VERSION_MAJOR_NUM < 56
213 		size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
214 #else
215 		size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
216 #endif
217 
218 		/* Bail out if an unexpected error occurred. */
219 		if( U_FAILURE(status)  ) {
220 			/* Set error messages. */
221 			intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
222 			efree( uret_buf );
223 			efree( uinput );
224 			RETURN_FALSE;
225 		}
226 	}
227 
228 	efree( uinput );
229 
230 	/* the buffer we actually used */
231 	uret_len = size_needed;
232 
233 	/* Convert normalized string from UTF-16 to UTF-8. */
234 	u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
235 	efree( uret_buf );
236 	if( !u8str )
237 	{
238 		intl_error_set( NULL, status,
239 				"normalizer_normalize: error converting normalized text UTF-8", 0 );
240 		RETURN_FALSE;
241 	}
242 
243 	/* Return it. */
244 	RETVAL_NEW_STR( u8str );
245 }
246 /* }}} */
247 
248 /* {{{ proto bool Normalizer::isNormalized( string $input [, string $form = FORM_C] )
249  * Test if a string is in a given normalization form. }}} */
250 /* {{{ proto bool normalizer_is_normalized( string $input [, string $form = FORM_C] )
251  * Test if a string is in a given normalization form.
252  */
PHP_FUNCTION(normalizer_is_normalized)253 PHP_FUNCTION( normalizer_is_normalized )
254 {
255 	char*	 	input = NULL;
256 	/* form is optional, defaults to FORM_C */
257 	zend_long		form = NORMALIZER_DEFAULT;
258 	size_t		input_len = 0;
259 
260 	UChar*	 	uinput = NULL;
261 	int		uinput_len = 0;
262 	UErrorCode	status = U_ZERO_ERROR;
263 
264 	UBool		uret = FALSE;
265 
266 	intl_error_reset( NULL );
267 
268 	/* Parse parameters. */
269 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
270 				&input, &input_len, &form) == FAILURE )
271 	{
272 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
273 				"normalizer_is_normalized: unable to parse input params", 0 );
274 
275 		RETURN_FALSE;
276 	}
277 
278 	switch(form) {
279 		/* case NORMALIZER_NONE: not allowed - doesn't make sense */
280 
281 		case NORMALIZER_FORM_D:
282 		case NORMALIZER_FORM_KD:
283 		case NORMALIZER_FORM_C:
284 		case NORMALIZER_FORM_KC:
285 #if U_ICU_VERSION_MAJOR_NUM >= 56
286 		case NORMALIZER_FORM_KC_CF:
287 #endif
288 			break;
289 		default:
290 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
291 						"normalizer_normalize: illegal normalization form", 0 );
292 			RETURN_FALSE;
293 	}
294 
295 
296 	/*
297 	 * Test normalization of string (converting it to UTF-16 first).
298 	 */
299 
300 	/* First convert the string to UTF-16. */
301 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
302 
303 	if( U_FAILURE( status ) )
304 	{
305 		/* Set global error code. */
306 		intl_error_set_code( NULL, status );
307 
308 		/* Set error messages. */
309 		intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
310 		if (uinput) {
311 			efree( uinput );
312 		}
313 		RETURN_FALSE;
314 	}
315 
316 
317 	/* test string */
318 #if U_ICU_VERSION_MAJOR_NUM < 56
319 	uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
320 #else
321 	uret = intl_is_normalized(form, uinput, uinput_len, &status);
322 #endif
323 
324 	efree( uinput );
325 
326 	/* Bail out if an unexpected error occurred. */
327 	if( U_FAILURE(status)  ) {
328 		/* Set error messages. */
329 		intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
330 		RETURN_FALSE;
331 	}
332 
333 	if ( uret )
334 		RETURN_TRUE;
335 
336 	RETURN_FALSE;
337 }
338 /* }}} */
339 
/* {{{ proto string|null Normalizer::getRawDecomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point. }}} */
/* {{{ proto string|null normalizer_get_raw_decomposition( string $input [, int $form = FORM_C] )
 * Returns the Decomposition_Mapping property for the given UTF-8 encoded code point.
 *
 * $input must consist of exactly one UTF-8 encoded code point. Returns the
 * raw decomposition as a UTF-8 string, or NULL when the code point has no
 * decomposition. NULL is also returned, with the global intl error set, when
 * the input is not a single valid code point, when no normalizer exists for
 * $form or when ICU reports an error.
 */
#if U_ICU_VERSION_MAJOR_NUM >= 56
PHP_FUNCTION( normalizer_get_raw_decomposition )
{
	char* input = NULL;
	size_t input_length = 0;

	UChar32 codepoint = -1;
	int32_t offset = 0;

	UErrorCode status = U_ZERO_ERROR;
	const UNormalizer2 *norm;
	UChar decomposition[32];
	int32_t decomposition_length;

	zend_long form = NORMALIZER_DEFAULT;

	intl_error_reset(NULL);

	if ((zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &input, &input_length, &form) == FAILURE)) {
		return;
	}

	norm = intl_get_normalizer(form, &status);
	if (U_FAILURE(status)) {
		/* Without this check a NULL normalizer would be handed to ICU and the
		 * failure would silently turn into an empty result. */
		intl_error_set_code(NULL, status);
		intl_error_set_custom_msg(NULL, "Could not get a normalizer for the given normalization form", 0);
		return;
	}

	/* Decode the first code point; offset then tells how many bytes it used. */
	U8_NEXT(input, offset, input_length, codepoint);
	if ((size_t)offset != input_length) {
		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
		intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
		return;
	}

	if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
		intl_error_set_custom_msg(NULL, "Code point out of range", 0);
		return;
	}

	decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition, 32, &status);
	if (U_FAILURE(status)) {
		/* On failure the returned length must not be used to read the buffer. */
		intl_error_set_code(NULL, status);
		intl_error_set_custom_msg(NULL, "Error getting the raw decomposition", 0);
		return;
	}
	if (decomposition_length == -1) {
		/* The code point has no decomposition mapping. */
		RETURN_NULL();
	}

	RETVAL_NEW_STR(intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status));
}
#endif
390 /* }}} */
391