1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Authors: Ed Batutis <ed@batutis.com> |
14 +----------------------------------------------------------------------+
15 */
16
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include "php_intl.h"
22 #include "unicode/unorm.h"
23 #include "normalizer.h"
24 #include "normalizer_class.h"
25 #include "normalizer_normalize.h"
26 #include "intl_convert.h"
27
28 /* {{{ proto string Normalizer::normalize( string $input [, string $form = FORM_C] )
29 * Normalize a string. }}} */
30 /* {{{ proto string normalizer_normalize( string $input [, string $form = FORM_C] )
31 * Normalize a string.
32 */
PHP_FUNCTION(normalizer_normalize)33 PHP_FUNCTION( normalizer_normalize )
34 {
35 char* input = NULL;
36 /* form is optional, defaults to FORM_C */
37 long form = NORMALIZER_DEFAULT;
38 int input_len = 0;
39
40 UChar* uinput = NULL;
41 int uinput_len = 0;
42 int expansion_factor = 1;
43 UErrorCode status = U_ZERO_ERROR;
44
45 UChar* uret_buf = NULL;
46 int uret_len = 0;
47
48 char* ret_buf = NULL;
49 int32_t ret_len = 0;
50
51 int32_t size_needed;
52
53 intl_error_reset( NULL TSRMLS_CC );
54
55 /* Parse parameters. */
56 if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
57 &input, &input_len, &form ) == FAILURE )
58 {
59 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
60 "normalizer_normalize: unable to parse input params", 0 TSRMLS_CC );
61
62 RETURN_FALSE;
63 }
64
65 expansion_factor = 1;
66
67 switch(form) {
68 case NORMALIZER_NONE:
69 break;
70 case NORMALIZER_FORM_D:
71 expansion_factor = 3;
72 break;
73 case NORMALIZER_FORM_KD:
74 expansion_factor = 3;
75 break;
76 case NORMALIZER_FORM_C:
77 case NORMALIZER_FORM_KC:
78 break;
79 default:
80 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
81 "normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
82 RETURN_FALSE;
83 }
84
85 /*
86 * Normalize string (converting it to UTF-16 first).
87 */
88
89 /* First convert the string to UTF-16. */
90 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
91
92 if( U_FAILURE( status ) )
93 {
94 /* Set global error code. */
95 intl_error_set_code( NULL, status TSRMLS_CC );
96
97 /* Set error messages. */
98 intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
99 if (uinput) {
100 efree( uinput );
101 }
102 RETURN_FALSE;
103 }
104
105
106 /* Allocate memory for the destination buffer for normalization */
107 uret_len = uinput_len * expansion_factor;
108 uret_buf = eumalloc( uret_len + 1 );
109
110 /* normalize */
111 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
112
113 /* Bail out if an unexpected error occured.
114 * (U_BUFFER_OVERFLOW_ERROR means that *target buffer is not large enough).
115 * (U_STRING_NOT_TERMINATED_WARNING usually means that the input string is empty).
116 */
117 if( U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR && status != U_STRING_NOT_TERMINATED_WARNING ) {
118 efree( uret_buf );
119 efree( uinput );
120 RETURN_NULL();
121 }
122
123 if ( size_needed > uret_len ) {
124 /* realloc does not seem to work properly - memory is corrupted
125 * uret_buf = eurealloc(uret_buf, size_needed + 1);
126 */
127 efree( uret_buf );
128 uret_buf = eumalloc( size_needed + 1 );
129 uret_len = size_needed;
130
131 status = U_ZERO_ERROR;
132
133 /* try normalize again */
134 size_needed = unorm_normalize( uinput, uinput_len, form, (int32_t) 0 /* options */, uret_buf, uret_len, &status);
135
136 /* Bail out if an unexpected error occured. */
137 if( U_FAILURE(status) ) {
138 /* Set error messages. */
139 intl_error_set_custom_msg( NULL,"Error normalizing string", 0 TSRMLS_CC );
140 efree( uret_buf );
141 efree( uinput );
142 RETURN_FALSE;
143 }
144 }
145
146 efree( uinput );
147
148 /* the buffer we actually used */
149 uret_len = size_needed;
150
151 /* Convert normalized string from UTF-16 to UTF-8. */
152 intl_convert_utf16_to_utf8( &ret_buf, &ret_len, uret_buf, uret_len, &status );
153 efree( uret_buf );
154 if( U_FAILURE( status ) )
155 {
156 intl_error_set( NULL, status,
157 "normalizer_normalize: error converting normalized text UTF-8", 0 TSRMLS_CC );
158 RETURN_FALSE;
159 }
160
161 /* Return it. */
162 RETVAL_STRINGL( ret_buf, ret_len, FALSE );
163 }
164 /* }}} */
165
166 /* {{{ proto bool Normalizer::isNormalized( string $input [, string $form = FORM_C] )
167 * Test if a string is in a given normalization form. }}} */
168 /* {{{ proto bool normalizer_is_normalize( string $input [, string $form = FORM_C] )
169 * Test if a string is in a given normalization form.
170 */
PHP_FUNCTION(normalizer_is_normalized)171 PHP_FUNCTION( normalizer_is_normalized )
172 {
173 char* input = NULL;
174 /* form is optional, defaults to FORM_C */
175 long form = NORMALIZER_DEFAULT;
176 int input_len = 0;
177
178 UChar* uinput = NULL;
179 int uinput_len = 0;
180 UErrorCode status = U_ZERO_ERROR;
181
182 UBool uret = FALSE;
183
184 intl_error_reset( NULL TSRMLS_CC );
185
186 /* Parse parameters. */
187 if( zend_parse_method_parameters( ZEND_NUM_ARGS() TSRMLS_CC, getThis(), "s|l",
188 &input, &input_len, &form) == FAILURE )
189 {
190 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
191 "normalizer_is_normalized: unable to parse input params", 0 TSRMLS_CC );
192
193 RETURN_FALSE;
194 }
195
196 switch(form) {
197 /* case NORMALIZER_NONE: not allowed - doesn't make sense */
198
199 case NORMALIZER_FORM_D:
200 case NORMALIZER_FORM_KD:
201 case NORMALIZER_FORM_C:
202 case NORMALIZER_FORM_KC:
203 break;
204 default:
205 intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR,
206 "normalizer_normalize: illegal normalization form", 0 TSRMLS_CC );
207 RETURN_FALSE;
208 }
209
210
211 /*
212 * Test normalization of string (converting it to UTF-16 first).
213 */
214
215 /* First convert the string to UTF-16. */
216 intl_convert_utf8_to_utf16(&uinput, &uinput_len, input, input_len, &status );
217
218 if( U_FAILURE( status ) )
219 {
220 /* Set global error code. */
221 intl_error_set_code( NULL, status TSRMLS_CC );
222
223 /* Set error messages. */
224 intl_error_set_custom_msg( NULL, "Error converting string to UTF-16.", 0 TSRMLS_CC );
225 if (uinput) {
226 efree( uinput );
227 }
228 RETURN_FALSE;
229 }
230
231
232 /* test string */
233 uret = unorm_isNormalizedWithOptions( uinput, uinput_len, form, (int32_t) 0 /* options */, &status);
234
235 efree( uinput );
236
237 /* Bail out if an unexpected error occured. */
238 if( U_FAILURE(status) ) {
239 /* Set error messages. */
240 intl_error_set_custom_msg( NULL,"Error testing if string is the given normalization form.", 0 TSRMLS_CC );
241 RETURN_FALSE;
242 }
243
244 if ( uret )
245 RETURN_TRUE;
246
247 RETURN_FALSE;
248 }
249 /* }}} */
250
251 /*
252 * Local variables:
253 * tab-width: 4
254 * c-basic-offset: 4
255 * End:
256 * vim600: noet sw=4 ts=4 fdm=marker
257 * vim<600: noet sw=4 ts=4
258 */
259