1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,	  |
4    | that is bundled with this package in the file LICENSE, and is		  |
5    | available through the world-wide-web at the following url:			  |
6    | https://www.php.net/license/3_01.txt                                 |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to		  |
9    | license@php.net so we can mail you a copy immediately.				  |
10    +----------------------------------------------------------------------+
11    | Authors: Ed Batutis <ed@batutis.com>								  |
12    +----------------------------------------------------------------------+
13  */
14 
15 #ifdef HAVE_CONFIG_H
16 #include "config.h"
17 #endif
18 
19 #include "php_intl.h"
20 #if U_ICU_VERSION_MAJOR_NUM < 56
21 #include "unicode/unorm.h"
22 #else
23 #include <unicode/unorm2.h>
24 #endif
25 #include "normalizer.h"
26 #include "normalizer_class.h"
27 #include "intl_convert.h"
28 #include <unicode/utf8.h>
29 
30 
31 #if U_ICU_VERSION_MAJOR_NUM >= 56
intl_get_normalizer(zend_long form,UErrorCode * err)32 static const UNormalizer2 *intl_get_normalizer(zend_long form, UErrorCode *err)
33 {/*{{{*/
34 	switch (form)
35 	{
36 		case NORMALIZER_FORM_C:
37 			return unorm2_getNFCInstance(err);
38 			break;
39 		case NORMALIZER_FORM_D:
40 			return unorm2_getNFDInstance(err);
41 			break;
42 		case NORMALIZER_FORM_KC:
43 			return unorm2_getNFKCInstance(err);
44 			break;
45 		case NORMALIZER_FORM_KD:
46 			return unorm2_getNFKDInstance(err);
47 			break;
48 		case NORMALIZER_FORM_KC_CF:
49 			return unorm2_getNFKCCasefoldInstance(err);
50 			break;
51 	}
52 
53 	*err = U_ILLEGAL_ARGUMENT_ERROR;
54 	return NULL;
55 }/*}}}*/
56 
/* Normalize the UTF-16 buffer src into dst using the normalizer selected by
 * form.
 *
 * Returns -1 when no normalizer could be obtained (*err then holds the
 * failure); otherwise returns the value of unorm2_normalize(), with *err
 * updated by that call.
 */
static int32_t intl_normalize(zend_long form, const UChar *src, int32_t src_len, UChar *dst, int32_t dst_len, UErrorCode *err)
{/*{{{*/
	const UNormalizer2 *normalizer = intl_get_normalizer(form, err);

	if (!U_FAILURE(*err)) {
		return unorm2_normalize(normalizer, src, src_len, dst, dst_len, err);
	}

	return -1;
}/*}}}*/
66 
intl_is_normalized(zend_long form,const UChar * uinput,int32_t uinput_len,UErrorCode * err)67 static UBool intl_is_normalized(zend_long form, const UChar *uinput, int32_t uinput_len, UErrorCode *err)
68 {/*{{{*/
69 	const UNormalizer2 *norm = intl_get_normalizer(form, err);
70 
71 	if(U_FAILURE(*err)) {
72 		return false;
73 	}
74 
75 	return unorm2_isNormalized(norm, uinput, uinput_len, err);
76 }/*}}}*/
77 #endif
78 
79 /* {{{ Normalize a string. */
PHP_FUNCTION(normalizer_normalize)80 PHP_FUNCTION( normalizer_normalize )
81 {
82 	char*			input = NULL;
83 	/* form is optional, defaults to FORM_C */
84 	zend_long	    form = NORMALIZER_DEFAULT;
85 	size_t			input_len = 0;
86 
87 	UChar*			uinput = NULL;
88 	int32_t		    uinput_len = 0;
89 	int			    expansion_factor = 1;
90 	UErrorCode		status = U_ZERO_ERROR;
91 
92 	UChar*			uret_buf = NULL;
93 	int32_t			uret_len = 0;
94 
95 	zend_string*    u8str;
96 
97 	int32_t			size_needed;
98 
99 	intl_error_reset( NULL );
100 
101 	/* Parse parameters. */
102 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
103 				&input, &input_len, &form ) == FAILURE )
104 	{
105 		RETURN_THROWS();
106 	}
107 
108 	expansion_factor = 1;
109 
110 	switch(form) {
111 		case NORMALIZER_FORM_D:
112 			expansion_factor = 3;
113 			break;
114 		case NORMALIZER_FORM_KD:
115 			expansion_factor = 3;
116 			break;
117 		case NORMALIZER_FORM_C:
118 		case NORMALIZER_FORM_KC:
119 #if U_ICU_VERSION_MAJOR_NUM >= 56
120 		case NORMALIZER_FORM_KC_CF:
121 #endif
122 			break;
123 		default:
124 			zend_argument_value_error(2, "must be a a valid normalization form");
125 			RETURN_THROWS();
126 	}
127 
128 	/*
129 	 * Normalize string (converting it to UTF-16 first).
130 	 */
131 
132 	/* First convert the string to UTF-16. */
133 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
134 
135 	if( U_FAILURE( status ) )
136 	{
137 		/* Set global error code. */
138 		intl_error_set_code( NULL, status );
139 
140 		/* Set error messages. */
141 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
142 		if (uinput) {
143 			efree( uinput );
144 		}
145 		RETURN_FALSE;
146 	}
147 
148 
149 	/* Allocate memory for the destination buffer for normalization */
150 	uret_len = uinput_len * expansion_factor;
151 	uret_buf = eumalloc( uret_len + 1 );
152 
153 	/* normalize */
154 #if U_ICU_VERSION_MAJOR_NUM < 56
155 	size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
156 #else
157 	size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
158 #endif
159 
160 	/* Bail out if an unexpected error occurred.
161 	 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
162 	 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
163 	 */
164 	if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
165 		intl_error_set_custom_msg( NULL, "Error normalizing string", 0 );
166 		efree( uret_buf );
167 		efree( uinput );
168 		RETURN_FALSE;
169 	}
170 
171 	if ( size_needed > uret_len ) {
172 		/* realloc does not seem to work properly - memory is corrupted
173 		 * uret_buf =  eurealloc(uret_buf, size_needed + 1);
174 		 */
175 		efree( uret_buf );
176 		uret_buf = eumalloc( size_needed + 1 );
177 		uret_len = size_needed;
178 
179 		status = U_ZERO_ERROR;
180 
181 		/* try normalize again */
182 #if U_ICU_VERSION_MAJOR_NUM < 56
183 		size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
184 #else
185 		size_needed = intl_normalize(form, uinput, uinput_len, uret_buf, uret_len, &status);
186 #endif
187 
188 		/* Bail out if an unexpected error occurred. */
189 		if( U_FAILURE(status)  ) {
190 			/* Set error messages. */
191 			intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
192 			efree( uret_buf );
193 			efree( uinput );
194 			RETURN_FALSE;
195 		}
196 	}
197 
198 	efree( uinput );
199 
200 	/* the buffer we actually used */
201 	uret_len = size_needed;
202 
203 	/* Convert normalized string from UTF-16 to UTF-8. */
204 	u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
205 	efree( uret_buf );
206 	if( !u8str )
207 	{
208 		intl_error_set( NULL, status,
209 				"normalizer_normalize: error converting normalized text UTF-8", 0 );
210 		RETURN_FALSE;
211 	}
212 
213 	/* Return it. */
214 	RETVAL_NEW_STR( u8str );
215 }
216 /* }}} */
217 
218 /* {{{ Test if a string is in a given normalization form. */
PHP_FUNCTION(normalizer_is_normalized)219 PHP_FUNCTION( normalizer_is_normalized )
220 {
221 	char*	 	input = NULL;
222 	/* form is optional, defaults to FORM_C */
223 	zend_long		form = NORMALIZER_DEFAULT;
224 	size_t		input_len = 0;
225 
226 	UChar*	 	uinput = NULL;
227 	int		uinput_len = 0;
228 	UErrorCode	status = U_ZERO_ERROR;
229 
230 	UBool		uret = false;
231 
232 	intl_error_reset( NULL );
233 
234 	/* Parse parameters. */
235 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
236 				&input, &input_len, &form) == FAILURE )
237 	{
238 		RETURN_THROWS();
239 	}
240 
241 	switch(form) {
242 		case NORMALIZER_FORM_D:
243 		case NORMALIZER_FORM_KD:
244 		case NORMALIZER_FORM_C:
245 		case NORMALIZER_FORM_KC:
246 #if U_ICU_VERSION_MAJOR_NUM >= 56
247 		case NORMALIZER_FORM_KC_CF:
248 #endif
249 			break;
250 		default:
251 			zend_argument_value_error(2, "must be a a valid normalization form");
252 			RETURN_THROWS();
253 	}
254 
255 
256 	/*
257 	 * Test normalization of string (converting it to UTF-16 first).
258 	 */
259 
260 	/* First convert the string to UTF-16. */
261 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
262 
263 	if( U_FAILURE( status ) )
264 	{
265 		/* Set global error code. */
266 		intl_error_set_code( NULL, status );
267 
268 		/* Set error messages. */
269 		intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
270 		if (uinput) {
271 			efree( uinput );
272 		}
273 		RETURN_FALSE;
274 	}
275 
276 
277 	/* test string */
278 #if U_ICU_VERSION_MAJOR_NUM < 56
279 	uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
280 #else
281 	uret = intl_is_normalized(form, uinput, uinput_len, &status);
282 #endif
283 
284 	efree( uinput );
285 
286 	/* Bail out if an unexpected error occurred. */
287 	if( U_FAILURE(status)  ) {
288 		/* Set error messages. */
289 		intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
290 		RETURN_FALSE;
291 	}
292 
293 	if ( uret )
294 		RETURN_TRUE;
295 
296 	RETURN_FALSE;
297 }
298 /* }}} */
299 
/* {{{ Returns the Decomposition_Mapping property for the given UTF-8 encoded code point.
 *
 * Arguments (parsed as "s|l"):
 *   - a UTF-8 string that must contain exactly one code point;
 *   - optional normalization form, defaulting to NORMALIZER_DEFAULT.
 *
 * Returns the raw decomposition as a UTF-8 string, or null when ICU reports
 * no decomposition for the code point. null is also returned, with the intl
 * error state set, when the input is not exactly one code point, when no
 * normalizer can be obtained for the form, or when ICU or the UTF-8
 * conversion fails.
 */
#if U_ICU_VERSION_MAJOR_NUM >= 56
PHP_FUNCTION( normalizer_get_raw_decomposition )
{
	char* input = NULL;
	size_t input_length = 0;

	UChar32 codepoint = -1;
	int32_t offset = 0;

	UErrorCode status = U_ZERO_ERROR;
	const UNormalizer2 *norm;
	UChar decomposition[32];
	int32_t decomposition_length;
	zend_string *u8str;

	zend_long form = NORMALIZER_DEFAULT;

	intl_error_reset(NULL);

	if ((zend_parse_parameters(ZEND_NUM_ARGS(), "s|l", &input, &input_length, &form) == FAILURE)) {
		RETURN_THROWS();
	}

	/* Without this check a failed lookup would leave norm NULL and status
	 * failed, and the function would go on to return an empty string. */
	norm = intl_get_normalizer(form, &status);
	if (U_FAILURE(status)) {
		intl_error_set_code(NULL, status);
		intl_error_set_custom_msg(NULL, "Could not obtain a normalizer for the given form", 0);
		RETURN_NULL();
	}

	/* Decode the first code point; the input is valid only if that consumes
	 * the whole string. */
	U8_NEXT(input, offset, input_length, codepoint);
	if ((size_t)offset != input_length) {
		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
		intl_error_set_custom_msg(NULL, "Input string must be exactly one UTF-8 encoded code point long.", 0);
		RETURN_NULL();
	}

	if ((codepoint < UCHAR_MIN_VALUE) || (codepoint > UCHAR_MAX_VALUE)) {
		intl_error_set_code(NULL, U_ILLEGAL_ARGUMENT_ERROR);
		intl_error_set_custom_msg(NULL, "Code point out of range", 0);
		RETURN_NULL();
	}

	decomposition_length = unorm2_getRawDecomposition(norm, codepoint, decomposition,
			(int32_t)(sizeof(decomposition) / sizeof(decomposition[0])), &status);
	if (U_FAILURE(status)) {
		intl_error_set_code(NULL, status);
		intl_error_set_custom_msg(NULL, "Error getting the raw decomposition", 0);
		RETURN_NULL();
	}

	/* A negative length means the code point has no decomposition. */
	if (decomposition_length < 0) {
		RETURN_NULL();
	}

	/* Guard against a failed conversion before handing the string to
	 * RETVAL_NEW_STR. */
	u8str = intl_convert_utf16_to_utf8(decomposition, decomposition_length, &status);
	if (!u8str) {
		intl_error_set_code(NULL, status);
		intl_error_set_custom_msg(NULL, "Error converting decomposition to UTF-8", 0);
		RETURN_NULL();
	}

	RETVAL_NEW_STR(u8str);
}
#endif
/* }}} */
347