xref: /PHP-5.4/ext/intl/idn/idn.c (revision d06c4e90)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 2009 The PHP Group                                     |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Pierre A. Joye <pierre@php.net>                              |
16    |         Gustavo Lopes  <cataphract@php.net>                          |
17    +----------------------------------------------------------------------+
18  */
19 /* $Id$ */
20 
21 /* {{{ includes */
22 #ifdef HAVE_CONFIG_H
23 #include "config.h"
24 #endif
25 
26 #include <php.h>
27 
28 #include <unicode/uidna.h>
29 #include <unicode/ustring.h>
30 #include "ext/standard/php_string.h"
31 
32 #include "intl_error.h"
33 #include "intl_convert.h"
34 /* }}} */
35 
/* The presence of UIDNA_INFO_INITIALIZER is used as the feature test for the
 * UTS #46 API; every UTS #46 specific part of this file is guarded by
 * HAVE_46_API. */
#ifdef UIDNA_INFO_INITIALIZER
#define HAVE_46_API 1 /* has UTS#46 API (introduced in ICU 4.6) */
#endif

/* IDNA variants accepted by the "variant" argument of idn_to_ascii() and
 * idn_to_utf8(). Exposed to userland as the INTL_IDNA_VARIANT_* constants. */
enum {
	INTL_IDN_VARIANT_2003  = 0,
	INTL_IDN_VARIANT_UTS46 = 1
};
44 
45 /* {{{ grapheme_register_constants
46  * Register API constants
47  */
idn_register_constants(INIT_FUNC_ARGS)48 void idn_register_constants( INIT_FUNC_ARGS )
49 {
50 	/* OPTIONS */
51 
52 	/* Option to prohibit processing of unassigned codepoints in the input and
53 	   do not check if the input conforms to STD-3 ASCII rules. */
54 	REGISTER_LONG_CONSTANT("IDNA_DEFAULT", UIDNA_DEFAULT, CONST_CS | CONST_PERSISTENT);
55 
56 	/* Option to allow processing of unassigned codepoints in the input */
57 	REGISTER_LONG_CONSTANT("IDNA_ALLOW_UNASSIGNED", UIDNA_ALLOW_UNASSIGNED, CONST_CS | CONST_PERSISTENT);
58 
59 	/* Option to check if input conforms to STD-3 ASCII rules */
60 	REGISTER_LONG_CONSTANT("IDNA_USE_STD3_RULES", UIDNA_USE_STD3_RULES, CONST_CS | CONST_PERSISTENT);
61 
62 #ifdef HAVE_46_API
63 
64 	/* Option to check for whether the input conforms to the BiDi rules.
65 	 * Ignored by the IDNA2003 implementation. (IDNA2003 always performs a BiDi check.) */
66 	REGISTER_LONG_CONSTANT("IDNA_CHECK_BIDI", UIDNA_CHECK_BIDI, CONST_CS | CONST_PERSISTENT);
67 
68 	/* Option to check for whether the input conforms to the CONTEXTJ rules.
69 	 * Ignored by the IDNA2003 implementation. (The CONTEXTJ check is new in IDNA2008.) */
70 	REGISTER_LONG_CONSTANT("IDNA_CHECK_CONTEXTJ", UIDNA_CHECK_CONTEXTJ, CONST_CS | CONST_PERSISTENT);
71 
72 	/* Option for nontransitional processing in ToASCII().
73 	 * By default, ToASCII() uses transitional processing.
74 	 * Ignored by the IDNA2003 implementation. */
75 	REGISTER_LONG_CONSTANT("IDNA_NONTRANSITIONAL_TO_ASCII", UIDNA_NONTRANSITIONAL_TO_ASCII, CONST_CS | CONST_PERSISTENT);
76 
77 	/* Option for nontransitional processing in ToUnicode().
78 	 * By default, ToUnicode() uses transitional processing.
79 	 * Ignored by the IDNA2003 implementation. */
80 	REGISTER_LONG_CONSTANT("IDNA_NONTRANSITIONAL_TO_UNICODE", UIDNA_NONTRANSITIONAL_TO_UNICODE, CONST_CS | CONST_PERSISTENT);
81 #endif
82 
83 	/* VARIANTS */
84 	REGISTER_LONG_CONSTANT("INTL_IDNA_VARIANT_2003", INTL_IDN_VARIANT_2003, CONST_CS | CONST_PERSISTENT);
85 #ifdef HAVE_46_API
86 	REGISTER_LONG_CONSTANT("INTL_IDNA_VARIANT_UTS46", INTL_IDN_VARIANT_UTS46, CONST_CS | CONST_PERSISTENT);
87 #endif
88 
89 #ifdef HAVE_46_API
90 	/* PINFO ERROR CODES */
91 	REGISTER_LONG_CONSTANT("IDNA_ERROR_EMPTY_LABEL", UIDNA_ERROR_EMPTY_LABEL, CONST_CS | CONST_PERSISTENT);
92 	REGISTER_LONG_CONSTANT("IDNA_ERROR_LABEL_TOO_LONG", UIDNA_ERROR_LABEL_TOO_LONG, CONST_CS | CONST_PERSISTENT);
93 	REGISTER_LONG_CONSTANT("IDNA_ERROR_DOMAIN_NAME_TOO_LONG", UIDNA_ERROR_DOMAIN_NAME_TOO_LONG, CONST_CS | CONST_PERSISTENT);
94 	REGISTER_LONG_CONSTANT("IDNA_ERROR_LEADING_HYPHEN", UIDNA_ERROR_LEADING_HYPHEN, CONST_CS | CONST_PERSISTENT);
95 	REGISTER_LONG_CONSTANT("IDNA_ERROR_TRAILING_HYPHEN", UIDNA_ERROR_TRAILING_HYPHEN, CONST_CS | CONST_PERSISTENT);
96 	REGISTER_LONG_CONSTANT("IDNA_ERROR_HYPHEN_3_4", UIDNA_ERROR_HYPHEN_3_4, CONST_CS | CONST_PERSISTENT);
97 	REGISTER_LONG_CONSTANT("IDNA_ERROR_LEADING_COMBINING_MARK", UIDNA_ERROR_LEADING_COMBINING_MARK, CONST_CS | CONST_PERSISTENT);
98 	REGISTER_LONG_CONSTANT("IDNA_ERROR_DISALLOWED", UIDNA_ERROR_DISALLOWED, CONST_CS | CONST_PERSISTENT);
99 	REGISTER_LONG_CONSTANT("IDNA_ERROR_PUNYCODE", UIDNA_ERROR_PUNYCODE, CONST_CS | CONST_PERSISTENT);
100 	REGISTER_LONG_CONSTANT("IDNA_ERROR_LABEL_HAS_DOT", UIDNA_ERROR_LABEL_HAS_DOT, CONST_CS | CONST_PERSISTENT);
101 	REGISTER_LONG_CONSTANT("IDNA_ERROR_INVALID_ACE_LABEL", UIDNA_ERROR_INVALID_ACE_LABEL, CONST_CS | CONST_PERSISTENT);
102 	REGISTER_LONG_CONSTANT("IDNA_ERROR_BIDI", UIDNA_ERROR_BIDI, CONST_CS | CONST_PERSISTENT);
103 	REGISTER_LONG_CONSTANT("IDNA_ERROR_CONTEXTJ", UIDNA_ERROR_CONTEXTJ, CONST_CS | CONST_PERSISTENT);
104 #endif
105 }
106 /* }}} */
107 
/* Conversion direction passed as "mode" to the shared implementation
 * helpers: towards ASCII (idn_to_ascii) or towards UTF-8 (idn_to_utf8). */
enum {
	INTL_IDN_TO_ASCII = 0,
	INTL_IDN_TO_UTF8  = 1
};
112 
113 /* like INTL_CHECK_STATUS, but as a function and varying the name of the func */
php_intl_idn_check_status(UErrorCode err,const char * msg,int mode TSRMLS_DC)114 static int php_intl_idn_check_status(UErrorCode err, const char *msg, int mode TSRMLS_DC)
115 {
116 	intl_error_set_code(NULL, err TSRMLS_CC);
117 	if (U_FAILURE(err)) {
118 		char *buff;
119 		spprintf(&buff, 0, "%s: %s",
120 			mode == INTL_IDN_TO_ASCII ? "idn_to_ascii" : "idn_to_utf8",
121 			msg);
122 		intl_error_set_custom_msg(NULL, buff, 1 TSRMLS_CC);
123 		efree(buff);
124 		return FAILURE;
125 	}
126 
127 	return SUCCESS;
128 }
129 
/* Report an invalid-argument condition for the given mode: records
 * U_ILLEGAL_ARGUMENT_ERROR together with msg through
 * php_intl_idn_check_status(). The status result is irrelevant here, since
 * U_ILLEGAL_ARGUMENT_ERROR is always a failure. */
static inline void php_intl_bad_args(const char *msg, int mode TSRMLS_DC)
{
	(void)php_intl_idn_check_status(U_ILLEGAL_ARGUMENT_ERROR, msg, mode TSRMLS_CC);
}
134 
#ifdef HAVE_46_API
/* UTS #46 implementation shared by idn_to_ascii() and idn_to_utf8().
 *
 * domain/domain_len  UTF-8 input name.
 * option             UIDNA_* option bit set handed to uidna_openUTS46().
 * mode               INTL_IDN_TO_ASCII or INTL_IDN_TO_UTF8.
 * idna_info          optional array (already initialised by the caller) that
 *                    receives "result", "isTransitionalDifferent" and
 *                    "errors".
 *
 * return_value is set to the converted string when ICU reports no IDNA
 * errors, and to FALSE otherwise (including ICU API failures, in which case
 * the global intl error is set and idna_info is left untouched).
 */
static void php_intl_idn_to_46(INTERNAL_FUNCTION_PARAMETERS,
		const char *domain, int domain_len, uint32_t option, int mode, zval *idna_info)
{
	UErrorCode	  status = U_ZERO_ERROR;
	UIDNA		  *uts46;
	int32_t		  len;
	int32_t		  buffer_capac;
	char		  *buffer;
	UIDNAInfo	  info = UIDNA_INFO_INITIALIZER;
	int			  buffer_used = 0;

	uts46 = uidna_openUTS46(option, &status);
	if (php_intl_idn_check_status(status, "failed to open UIDNA instance",
			mode TSRMLS_CC) == FAILURE) {
		RETURN_FALSE;
	}

	/* No ASCII domain name may exceed 255 bytes. The UTF-8 form of such a name
	 * can be considerably longer, because every decoded code point may take
	 * up to 4 bytes, so the Unicode direction gets a larger buffer. */
	if (mode == INTL_IDN_TO_ASCII) {
		buffer_capac = 255;
	} else {
		buffer_capac = 255 * 4;
	}
	buffer = emalloc(buffer_capac);

	if (mode == INTL_IDN_TO_ASCII) {
		len = uidna_nameToASCII_UTF8(uts46, domain, (int32_t)domain_len,
				buffer, buffer_capac, &info, &status);
	} else {
		len = uidna_nameToUnicodeUTF8(uts46, domain, (int32_t)domain_len,
				buffer, buffer_capac, &info, &status);
	}
	if (php_intl_idn_check_status(status, "failed to convert name",
			mode TSRMLS_CC) == FAILURE) {
		uidna_close(uts46);
		efree(buffer);
		RETURN_FALSE;
	}
	/* The terminating NUL written below needs one spare byte. Treat a result
	 * that leaves no room for it as an ordinary conversion failure instead of
	 * aborting the whole script, and release everything acquired so far. */
	if (len < 0 || len >= buffer_capac) {
		php_intl_idn_check_status(U_BUFFER_OVERFLOW_ERROR,
				"ICU returned an unexpected length", mode TSRMLS_CC);
		uidna_close(uts46);
		efree(buffer);
		RETURN_FALSE;
	}

	buffer[len] = '\0';

	if (info.errors == 0) {
		/* return_value takes ownership of buffer (no duplication) */
		RETVAL_STRINGL(buffer, len, 0);
		buffer_used = 1;
	} else {
		RETVAL_FALSE;
	}

	if (idna_info) {
		if (buffer_used) { /* used in return_value then */
			/* share the return value zval with the info array */
			zval_addref_p(return_value);
			add_assoc_zval_ex(idna_info, "result", sizeof("result"), return_value);
		} else {
			/* the (partial) result is still reported, owned by a new zval */
			zval *zv;
			ALLOC_INIT_ZVAL(zv);
			ZVAL_STRINGL(zv, buffer, len, 0);
			buffer_used = 1;
			add_assoc_zval_ex(idna_info, "result", sizeof("result"), zv);
		}
		add_assoc_bool_ex(idna_info, "isTransitionalDifferent",
				sizeof("isTransitionalDifferent"), info.isTransitionalDifferent);
		add_assoc_long_ex(idna_info, "errors", sizeof("errors"), (long)info.errors);
	}

	/* nobody took ownership of the buffer: release it here */
	if (!buffer_used) {
		efree(buffer);
	}

	uidna_close(uts46);
}
#endif
203 
/* IDNA2003 implementation shared by idn_to_ascii() and idn_to_utf8().
 *
 * The UTF-8 input is converted to UTF-16, run through uidna_IDNToASCII() or
 * uidna_IDNToUnicode() depending on mode, and the result is converted back
 * to UTF-8.
 *
 * return_value is set to the converted string on success. On any failure the
 * global intl error is set and return_value is FALSE.
 */
static void php_intl_idn_to(INTERNAL_FUNCTION_PARAMETERS,
		const char *domain, int domain_len, uint32_t option, int mode)
{
	UChar* ustring = NULL;
	int ustring_len = 0;
	UErrorCode status;
	char     *converted_utf8 = NULL;
	int32_t   converted_utf8_len = 0;
	UChar     converted[MAXPATHLEN];
	int32_t   converted_ret_len;
	UParseError parse_error;

	/* convert the string to UTF-16. */
	status = U_ZERO_ERROR;
	intl_convert_utf8_to_utf16(&ustring, &ustring_len, domain, domain_len, &status);

	if (U_FAILURE(status)) {
		intl_error_set_code(NULL, status TSRMLS_CC);

		/* Set error messages. */
		intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
		if (ustring) {
			efree(ustring);
		}
		RETURN_FALSE;
	}

	status = U_ZERO_ERROR;
	if (mode == INTL_IDN_TO_ASCII) {
		converted_ret_len = uidna_IDNToASCII(ustring, ustring_len, converted, MAXPATHLEN, (int32_t)option, &parse_error, &status);
	} else {
		converted_ret_len = uidna_IDNToUnicode(ustring, ustring_len, converted, MAXPATHLEN, (int32_t)option, &parse_error, &status);
	}
	efree(ustring);

	if (U_FAILURE(status)) {
		/* name the function that was actually called in the message */
		intl_error_set( NULL, status,
			mode == INTL_IDN_TO_ASCII
				? "idn_to_ascii: cannot convert to ASCII"
				: "idn_to_utf8: cannot convert to Unicode",
			0 TSRMLS_CC );
		RETURN_FALSE;
	}

	status = U_ZERO_ERROR;
	intl_convert_utf16_to_utf8(&converted_utf8, &converted_utf8_len, converted, converted_ret_len, &status);

	if (U_FAILURE(status)) {
		/* Set global error code. */
		intl_error_set_code(NULL, status TSRMLS_CC);

		/* Set error messages. */
		intl_error_set_custom_msg( NULL, "Error converting output string to UTF-8", 0 TSRMLS_CC );
		/* The output pointer is only meaningful if the converter stored
		 * something in it; never free an unset pointer. */
		if (converted_utf8) {
			efree(converted_utf8);
		}
		RETURN_FALSE;
	}

	/* return the allocated string, not a duplicate */
	RETURN_STRINGL(((char *)converted_utf8), converted_utf8_len, 0);
}
261 
/* Common entry point of idn_to_ascii() and idn_to_utf8().
 *
 * Parses (string domain[, int options[, int variant[, array &idna_info]]]),
 * validates the variant and the domain length, and dispatches to the
 * IDNA2003 or the UTS #46 implementation.
 *
 * return_value is NULL when the parameters cannot be parsed, FALSE when
 * validation fails, and whatever the selected implementation produces
 * otherwise. The global intl error is reset first and set on each failure.
 */
static void php_intl_idn_handoff(INTERNAL_FUNCTION_PARAMETERS, int mode)
{
	char *domain;
	int domain_len;
	long option = 0,
		 variant = INTL_IDN_VARIANT_2003;
	zval *idna_info = NULL;

	intl_error_reset(NULL TSRMLS_CC);

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s|llz",
			&domain, &domain_len, &option, &variant, &idna_info) == FAILURE) {
		php_intl_bad_args("bad arguments", mode TSRMLS_CC);
		RETURN_NULL(); /* don't set FALSE because that's not the way it was before... */
	}

#ifdef HAVE_46_API
	if (variant != INTL_IDN_VARIANT_2003 && variant != INTL_IDN_VARIANT_UTS46) {
		php_intl_bad_args("invalid variant, must be one of {"
			"INTL_IDNA_VARIANT_2003, INTL_IDNA_VARIANT_UTS46}", mode TSRMLS_CC);
		RETURN_FALSE;
	}
#else
	if (variant != INTL_IDN_VARIANT_2003) {
		/* refer to the constant by its registered userland name */
		php_intl_bad_args("invalid variant, PHP was compiled against "
			"an old version of ICU and only supports INTL_IDNA_VARIANT_2003",
			mode TSRMLS_CC);
		RETURN_FALSE;
	}
#endif

	if (domain_len < 1) {
		php_intl_bad_args("empty domain name", mode TSRMLS_CC);
		RETURN_FALSE;
	}
	/* the implementations pass the length to ICU as int32_t */
	if (domain_len > INT32_MAX - 1) {
		php_intl_bad_args("domain name too large", mode TSRMLS_CC);
		RETURN_FALSE;
	}
	/* don't check options; it wasn't checked before */

	if (idna_info != NULL) {
		if (variant == INTL_IDN_VARIANT_2003) {
			php_error_docref0(NULL TSRMLS_CC, E_NOTICE,
				"4 arguments were provided, but INTL_IDNA_VARIANT_2003 only "
				"takes 3 - extra argument ignored");
		} else {
			/* discard the previous value and hand an empty array to the
			 * UTS #46 implementation to fill in */
			zval_dtor(idna_info);
			array_init(idna_info);
		}
	}

	if (variant == INTL_IDN_VARIANT_2003) {
		php_intl_idn_to(INTERNAL_FUNCTION_PARAM_PASSTHRU,
				domain, domain_len, (uint32_t)option, mode);
	}
#ifdef HAVE_46_API
	else {
		php_intl_idn_to_46(INTERNAL_FUNCTION_PARAM_PASSTHRU, domain, domain_len,
				(uint32_t)option, mode, idna_info);
	}
#endif
}
325 
326 /* {{{ proto int idn_to_ascii(string domain[, int options[, int variant[, array &idna_info]]])
327    Converts an Unicode domain to ASCII representation, as defined in the IDNA RFC */
PHP_FUNCTION(idn_to_ascii)328 PHP_FUNCTION(idn_to_ascii)
329 {
330 	php_intl_idn_handoff(INTERNAL_FUNCTION_PARAM_PASSTHRU, INTL_IDN_TO_ASCII);
331 }
332 /* }}} */
333 
334 
335 /* {{{ proto int idn_to_utf8(string domain[, int options[, int variant[, array &idna_info]]])
336    Converts an ASCII representation of the domain to Unicode (UTF-8), as defined in the IDNA RFC */
PHP_FUNCTION(idn_to_utf8)337 PHP_FUNCTION(idn_to_utf8)
338 {
339 	php_intl_idn_handoff(INTERNAL_FUNCTION_PARAM_PASSTHRU, INTL_IDN_TO_UTF8);
340 }
341 /* }}} */
342 
343 
344 /*
345  * Local variables:
346  * tab-width: 4
347  * c-basic-offset: 4
348  * End:
349  * vim600: fdm=marker
350  * vim: noet sw=4 ts=4
351  */
352