1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Authors: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php_intl.h"
22 #include "unicode/unorm.h"
23 #include "normalizer.h"
24 #include "normalizer_class.h"
25 #include "normalizer_normalize.h"
26 #include "intl_convert.h"
27 
28 /* {{{ proto string Normalizer::normalize( string $input [, string $form = FORM_C] )
29  * Normalize a string. }}} */
30 /* {{{ proto string normalizer_normalize( string $input [, string $form = FORM_C] )
31  * Normalize a string.
32  */
PHP_FUNCTION(normalizer_normalize)33 PHP_FUNCTION( normalizer_normalize )
34 {
35 	char*			input = NULL;
36 	/* form is optional, defaults to FORM_C */
37 	zend_long	    form = NORMALIZER_DEFAULT;
38 	size_t			input_len = 0;
39 
40 	UChar*			uinput = NULL;
41 	int32_t		    uinput_len = 0;
42 	int			    expansion_factor = 1;
43 	UErrorCode		status = U_ZERO_ERROR;
44 
45 	UChar*			uret_buf = NULL;
46 	int32_t			uret_len = 0;
47 
48 	zend_string*    u8str;
49 
50 	int32_t			size_needed;
51 
52 	intl_error_reset( NULL );
53 
54 	/* Parse parameters. */
55 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
56 				&input, &input_len, &form ) == FAILURE )
57 	{
58 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
59 						 "normalizer_normalize: unable to parse input params", 0 );
60 
61 		RETURN_FALSE;
62 	}
63 
64 	expansion_factor = 1;
65 
66 	switch(form) {
67 		case NORMALIZER_NONE:
68 			break;
69 		case NORMALIZER_FORM_D:
70 			expansion_factor = 3;
71 			break;
72 		case NORMALIZER_FORM_KD:
73 			expansion_factor = 3;
74 			break;
75 		case NORMALIZER_FORM_C:
76 		case NORMALIZER_FORM_KC:
77 			break;
78 		default:
79 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
80 						"normalizer_normalize: illegal normalization form", 0 );
81 			RETURN_FALSE;
82 	}
83 
84 	/*
85 	 * Normalize string (converting it to UTF-16 first).
86 	 */
87 
88 	/* First convert the string to UTF-16. */
89 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
90 
91 	if( U_FAILURE( status ) )
92 	{
93 		/* Set global error code. */
94 		intl_error_set_code( NULL, status );
95 
96 		/* Set error messages. */
97 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 );
98 		if (uinput) {
99 			efree( uinput );
100 		}
101 		RETURN_FALSE;
102 	}
103 
104 
105 	/* Allocate memory for the destination buffer for normalization */
106 	uret_len = uinput_len * expansion_factor;
107 	uret_buf = eumalloc( uret_len + 1 );
108 
109 	/* normalize */
110 	size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
111 
112 	/* Bail out if an unexpected error occurred.
113 	 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
114 	 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
115 	 */
116 	if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
117 		efree( uret_buf );
118 		efree( uinput );
119 		RETURN_NULL();
120 	}
121 
122 	if ( size_needed > uret_len ) {
123 		/* realloc does not seem to work properly - memory is corrupted
124 		 * uret_buf =  eurealloc(uret_buf, size_needed + 1);
125 		 */
126 		efree( uret_buf );
127 		uret_buf = eumalloc( size_needed + 1 );
128 		uret_len = size_needed;
129 
130 		status = U_ZERO_ERROR;
131 
132 		/* try normalize again */
133 		size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
134 
135 		/* Bail out if an unexpected error occurred. */
136 		if( U_FAILURE(status)  ) {
137 			/* Set error messages. */
138 			intl_error_set_custom_msg( NULL,"Error normalizing string", 0 );
139 			efree( uret_buf );
140 			efree( uinput );
141 			RETURN_FALSE;
142 		}
143 	}
144 
145 	efree( uinput );
146 
147 	/* the buffer we actually used */
148 	uret_len = size_needed;
149 
150 	/* Convert normalized string from UTF-16 to UTF-8. */
151 	u8str = intl_convert_utf16_to_utf8(uret_buf, uret_len, &status );
152 	efree( uret_buf );
153 	if( !u8str )
154 	{
155 		intl_error_set( NULL, status,
156 				"normalizer_normalize: error converting normalized text UTF-8", 0 );
157 		RETURN_FALSE;
158 	}
159 
160 	/* Return it. */
161 	RETVAL_NEW_STR( u8str );
162 }
163 /* }}} */
164 
165 /* {{{ proto bool Normalizer::isNormalized( string $input [, string $form = FORM_C] )
166  * Test if a string is in a given normalization form. }}} */
167 /* {{{ proto bool normalizer_is_normalize( string $input [, string $form = FORM_C] )
168  * Test if a string is in a given normalization form.
169  */
PHP_FUNCTION(normalizer_is_normalized)170 PHP_FUNCTION( normalizer_is_normalized )
171 {
172 	char*	 	input = NULL;
173 	/* form is optional, defaults to FORM_C */
174 	zend_long		form = NORMALIZER_DEFAULT;
175 	size_t		input_len = 0;
176 
177 	UChar*	 	uinput = NULL;
178 	int		uinput_len = 0;
179 	UErrorCode	status = U_ZERO_ERROR;
180 
181 	UBool		uret = FALSE;
182 
183 	intl_error_reset( NULL );
184 
185 	/* Parse parameters. */
186 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "s|l",
187 				&input, &input_len, &form) == FAILURE )
188 	{
189 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
190 				"normalizer_is_normalized: unable to parse input params", 0 );
191 
192 		RETURN_FALSE;
193 	}
194 
195 	switch(form) {
196 		/* case NORMALIZER_NONE: not allowed - doesn't make sense */
197 
198 		case NORMALIZER_FORM_D:
199 		case NORMALIZER_FORM_KD:
200 		case NORMALIZER_FORM_C:
201 		case NORMALIZER_FORM_KC:
202 			break;
203 		default:
204 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
205 						"normalizer_normalize: illegal normalization form", 0 );
206 			RETURN_FALSE;
207 	}
208 
209 
210 	/*
211 	 * Test normalization of string (converting it to UTF-16 first).
212 	 */
213 
214 	/* First convert the string to UTF-16. */
215 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
216 
217 	if( U_FAILURE( status ) )
218 	{
219 		/* Set global error code. */
220 		intl_error_set_code( NULL, status );
221 
222 		/* Set error messages. */
223 		intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 );
224 		if (uinput) {
225 			efree( uinput );
226 		}
227 		RETURN_FALSE;
228 	}
229 
230 
231 	/* test string */
232 	uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
233 
234 	efree( uinput );
235 
236 	/* Bail out if an unexpected error occurred. */
237 	if( U_FAILURE(status)  ) {
238 		/* Set error messages. */
239 		intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 );
240 		RETURN_FALSE;
241 	}
242 
243 	if ( uret )
244 		RETURN_TRUE;
245 
246 	RETURN_FALSE;
247 }
248 /* }}} */
249 
250 /*
251  * Local variables:
252  * tab-width: 4
253  * c-basic-offset: 4
254  * End:
255  * vim600: noet sw=4 ts=4 fdm=marker
256  * vim<600: noet sw=4 ts=4
257  */
258