1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5														  |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,	  |
6    | that is bundled with this package in the file LICENSE, and is		  |
7    | available through the world-wide-web at the following url:			  |
8    | http://www.php.net/license/3_01.txt								  |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to		  |
11    | license@php.net so we can mail you a copy immediately.				  |
12    +----------------------------------------------------------------------+
13    | Authors: Ed Batutis <ed@batutis.com>								  |
14    +----------------------------------------------------------------------+
15  */
16 
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20 
21 #include "php_intl.h"
22 #include "unicode/unorm.h"
23 #include "normalizer.h"
24 #include "normalizer_class.h"
25 #include "normalizer_normalize.h"
26 #include "intl_convert.h"
27 
28 /* {{{ proto string Normalizer::normalize( string $input [, string $form = FORM_C] )
29  * Normalize a string. }}} */
30 /* {{{ proto string normalizer_normalize( string $input [, string $form = FORM_C] )
31  * Normalize a string.
32  */
PHP_FUNCTION(normalizer_normalize)33 PHP_FUNCTION( normalizer_normalize )
34 {
35 	char*			input = NULL;
36 	/* form is optional, defaults to FORM_C */
37 	long			form = NORMALIZER_DEFAULT;
38 	int			input_len = 0;
39 
40 	UChar*			uinput = NULL;
41 	int			uinput_len = 0;
42 	int			expansion_factor = 1;
43 	UErrorCode		status = U_ZERO_ERROR;
44 
45 	UChar*			uret_buf = NULL;
46 	int			uret_len = 0;
47 
48 	char*			ret_buf = NULL;
49 	int32_t			ret_len = 0;
50 
51 	int32_t			size_needed;
52 
53 	intl_error_reset( NULL TSRMLS_CC );
54 
55 	/* Parse parameters. */
56 	if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
57 				&input, &input_len, &form ) == FAILURE )
58 	{
59 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
60 						 "normalizer_normalize: unable to parse input params", 0 TSRMLS_CC );
61 
62 		RETURN_FALSE;
63 	}
64 
65 	expansion_factor = 1;
66 
67 	switch(form) {
68 		case NORMALIZER_NONE:
69 			break;
70 		case NORMALIZER_FORM_D:
71 			expansion_factor = 3;
72 			break;
73 		case NORMALIZER_FORM_KD:
74 			expansion_factor = 3;
75 			break;
76 		case NORMALIZER_FORM_C:
77 		case NORMALIZER_FORM_KC:
78 			break;
79 		default:
80 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
81 						"normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
82 			RETURN_FALSE;
83 	}
84 
85 	/*
86 	 * Normalize string (converting it to UTF-16 first).
87 	 */
88 
89 	/* First convert the string to UTF-16. */
90 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
91 
92 	if( U_FAILURE( status ) )
93 	{
94 		/* Set global error code. */
95 		intl_error_set_code( NULL, status TSRMLS_CC );
96 
97 		/* Set error messages. */
98 		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
99 		if (uinput) {
100 			efree( uinput );
101 		}
102 		RETURN_FALSE;
103 	}
104 
105 
106 	/* Allocate memory for the destination buffer for normalization */
107 	uret_len = uinput_len * expansion_factor;
108 	uret_buf = eumalloc( uret_len + 1 );
109 
110 	/* normalize */
111 	size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
112 
113 	/* Bail out if an unexpected error occured.
114 	 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
115 	 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
116 	 */
117 	if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
118 		efree( uret_buf );
119 		efree( uinput );
120 		RETURN_NULL();
121 	}
122 
123 	if ( size_needed > uret_len ) {
124 		/* realloc does not seem to work properly - memory is corrupted
125 		 * uret_buf =  eurealloc(uret_buf, size_needed + 1);
126 		 */
127 		efree( uret_buf );
128 		uret_buf = eumalloc( size_needed + 1 );
129 		uret_len = size_needed;
130 
131 		status = U_ZERO_ERROR;
132 
133 		/* try normalize again */
134 		size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
135 
136 		/* Bail out if an unexpected error occured. */
137 		if( U_FAILURE(status)  ) {
138 			/* Set error messages. */
139 			intl_error_set_custom_msg( NULL,"Error normalizing string", 0 TSRMLS_CC );
140 			efree( uret_buf );
141 			efree( uinput );
142 			RETURN_FALSE;
143 		}
144 	}
145 
146 	efree( uinput );
147 
148 	/* the buffer we actually used */
149 	uret_len = size_needed;
150 
151 	/* Convert normalized string from UTF-16 to UTF-8. */
152 	intl_convert_utf16_to_utf8( &ret_buf, &ret_len, uret_buf, uret_len, &status );
153 	efree( uret_buf );
154 	if( U_FAILURE( status ) )
155 	{
156 		intl_error_set( NULL, status,
157 				"normalizer_normalize: error converting normalized text UTF-8", 0 TSRMLS_CC );
158 		RETURN_FALSE;
159 	}
160 
161 	/* Return it. */
162 	RETVAL_STRINGL( ret_buf, ret_len, FALSE );
163 }
164 /* }}} */
165 
166 /* {{{ proto bool Normalizer::isNormalized( string $input [, string $form = FORM_C] )
167  * Test if a string is in a given normalization form. }}} */
168 /* {{{ proto bool normalizer_is_normalize( string $input [, string $form = FORM_C] )
169  * Test if a string is in a given normalization form.
170  */
PHP_FUNCTION(normalizer_is_normalized)171 PHP_FUNCTION( normalizer_is_normalized )
172 {
173 	char*	 	input = NULL;
174 	/* form is optional, defaults to FORM_C */
175 	long		form = NORMALIZER_DEFAULT;
176 	int		input_len = 0;
177 
178 	UChar*	 	uinput = NULL;
179 	int		uinput_len = 0;
180 	UErrorCode	status = U_ZERO_ERROR;
181 
182 	UBool		uret = FALSE;
183 
184 	intl_error_reset( NULL TSRMLS_CC );
185 
186 	/* Parse parameters. */
187 	if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
188 				&input, &input_len, &form) == FAILURE )
189 	{
190 		intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
191 				"normalizer_is_normalized: unable to parse input params", 0 TSRMLS_CC );
192 
193 		RETURN_FALSE;
194 	}
195 
196 	switch(form) {
197 		/* case NORMALIZER_NONE: not allowed - doesn't make sense */
198 
199 		case NORMALIZER_FORM_D:
200 		case NORMALIZER_FORM_KD:
201 		case NORMALIZER_FORM_C:
202 		case NORMALIZER_FORM_KC:
203 			break;
204 		default:
205 			intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
206 						"normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
207 			RETURN_FALSE;
208 	}
209 
210 
211 	/*
212 	 * Test normalization of string (converting it to UTF-16 first).
213 	 */
214 
215 	/* First convert the string to UTF-16. */
216 	intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
217 
218 	if( U_FAILURE( status ) )
219 	{
220 		/* Set global error code. */
221 		intl_error_set_code( NULL, status TSRMLS_CC );
222 
223 		/* Set error messages. */
224 		intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 TSRMLS_CC );
225 		if (uinput) {
226 			efree( uinput );
227 		}
228 		RETURN_FALSE;
229 	}
230 
231 
232 	/* test string */
233 	uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
234 
235 	efree( uinput );
236 
237 	/* Bail out if an unexpected error occured. */
238 	if( U_FAILURE(status)  ) {
239 		/* Set error messages. */
240 		intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 TSRMLS_CC );
241 		RETURN_FALSE;
242 	}
243 
244 	if ( uret )
245 		RETURN_TRUE;
246 
247 	RETURN_FALSE;
248 }
249 /* }}} */
250 
251 /*
252  * Local variables:
253  * tab-width: 4
254  * c-basic-offset: 4
255  * End:
256  * vim600: noet sw=4 ts=4 fdm=marker
257  * vim<600: noet sw=4 ts=4
258  */
259