xref: /PHP-5.5/ext/mbstring/php_unicode.c (revision 73c1be26)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 5                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2015 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Wez Furlong (wez@thebrainroom.com)                           |
16    +----------------------------------------------------------------------+
17 
18 	Based on code from ucdata-2.5, which has the following Copyright:
19 
20 	Copyright 2001 Computing Research Labs, New Mexico State University
21 
22 	Permission is hereby granted, free of charge, to any person obtaining a
23 	copy of this software and associated documentation files (the "Software"),
24 	to deal in the Software without restriction, including without limitation
25 	the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 	and/or sell copies of the Software, and to permit persons to whom the
27 	Software is furnished to do so, subject to the following conditions:
28 
29 	The above copyright notice and this permission notice shall be included in
30 	all copies or substantial portions of the Software.
31 */
32 
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36 
37 #include "php.h"
38 #include "php_ini.h"
39 
40 #if HAVE_MBSTRING
41 
42 /* include case folding data generated from the official UnicodeData.txt file */
43 #include "mbstring.h"
44 #include "php_unicode.h"
45 #include "unicode_data.h"
46 
47 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48 
/*
 * Single-bit masks indexed by bit position: masks32[i] has only bit i set,
 * for i in 0..31.  php_unicode_is_prop() uses the table to test which
 * property bits are present in a caller-supplied mask.
 *
 * The table is never written to, so it is declared const.
 */
static const unsigned long masks32[32] = {
    0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL,
    0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL,
    0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL,
    0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL,
    0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL,
    0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL,
    0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
    0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL
};
60 
61 
prop_lookup(unsigned long code,unsigned long n)62 static int prop_lookup(unsigned long code, unsigned long n)
63 {
64 	long l, r, m;
65 
66 	/*
67 	 * There is an extra node on the end of the offsets to allow this routine
68 	 * to work right.  If the index is 0xffff, then there are no nodes for the
69 	 * property.
70 	 */
71 	if ((l = _ucprop_offsets[n]) == 0xffff)
72 		return 0;
73 
74 	/*
75 	 * Locate the next offset that is not 0xffff.  The sentinel at the end of
76 	 * the array is the max index value.
77 	 */
78 	for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
79 		;
80 
81 	r = _ucprop_offsets[n + m] - 1;
82 
83 	while (l <= r) {
84 		/*
85 		 * Determine a "mid" point and adjust to make sure the mid point is at
86 		 * the beginning of a range pair.
87 		 */
88 		m = (l + r) >> 1;
89 		m -= (m & 1);
90 		if (code > _ucprop_ranges[m + 1])
91 			l = m + 2;
92 		else if (code < _ucprop_ranges[m])
93 			r = m - 2;
94 		else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
95 			return 1;
96 	}
97 	return 0;
98 
99 }
100 
php_unicode_is_prop(unsigned long code,unsigned long mask1,unsigned long mask2)101 MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
102 		unsigned long mask2)
103 {
104 	unsigned long i;
105 
106 	if (mask1 == 0 && mask2 == 0)
107 		return 0;
108 
109 	for (i = 0; mask1 && i < 32; i++) {
110 		if ((mask1 & masks32[i]) && prop_lookup(code, i))
111 			return 1;
112 	}
113 
114 	for (i = 32; mask2 && i < _ucprop_size; i++) {
115 		if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
116 			return 1;
117 	}
118 
119 	return 0;
120 }
121 
case_lookup(unsigned long code,long l,long r,int field)122 static unsigned long case_lookup(unsigned long code, long l, long r, int field)
123 {
124 	long m;
125 
126 	/*
127 	 * Do the binary search.
128 	 */
129 	while (l <= r) {
130 		/*
131 		 * Determine a "mid" point and adjust to make sure the mid point is at
132 		 * the beginning of a case mapping triple.
133 		 */
134 		m = (l + r) >> 1;
135 		m -= (m % 3);
136 		if (code > _uccase_map[m])
137 			l = m + 3;
138 		else if (code < _uccase_map[m])
139 			r = m - 3;
140 		else if (code == _uccase_map[m])
141 			return _uccase_map[m + field];
142 	}
143 
144 	return code;
145 }
146 
php_turkish_toupper(unsigned long code,long l,long r,int field)147 MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
148 {
149 	if (code == 0x0069L) {
150 		return 0x0130L;
151 	}
152 	return case_lookup(code, l, r, field);
153 }
154 
php_turkish_tolower(unsigned long code,long l,long r,int field)155 MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
156 {
157 	if (code == 0x0049L) {
158 		return 0x0131L;
159 	}
160 	return case_lookup(code, l, r, field);
161 }
162 
php_unicode_toupper(unsigned long code,enum mbfl_no_encoding enc TSRMLS_DC)163 MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
164 {
165 	int field;
166 	long l, r;
167 
168 	if (php_unicode_is_upper(code))
169 		return code;
170 
171 	if (php_unicode_is_lower(code)) {
172 		/*
173 		 * The character is lower case.
174 		 */
175 		field = 2;
176 		l = _uccase_len[0];
177 		r = (l + _uccase_len[1]) - 3;
178 
179 		if (enc == mbfl_no_encoding_8859_9) {
180 			return php_turkish_toupper(code, l, r, field);
181 		}
182 
183 	} else {
184 		/*
185 		 * The character is title case.
186 		 */
187 		field = 1;
188 		l = _uccase_len[0] + _uccase_len[1];
189 		r = _uccase_size - 3;
190 	}
191 	return case_lookup(code, l, r, field);
192 }
193 
php_unicode_tolower(unsigned long code,enum mbfl_no_encoding enc TSRMLS_DC)194 MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
195 {
196 	int field;
197 	long l, r;
198 
199 	if (php_unicode_is_lower(code))
200 		return code;
201 
202 	if (php_unicode_is_upper(code)) {
203 		/*
204 		 * The character is upper case.
205 		 */
206 		field = 1;
207 		l = 0;
208 		r = _uccase_len[0] - 3;
209 
210 		if (enc == mbfl_no_encoding_8859_9) {
211 			return php_turkish_tolower(code, l, r, field);
212 		}
213 
214 	} else {
215 		/*
216 		 * The character is title case.
217 		 */
218 		field = 2;
219 		l = _uccase_len[0] + _uccase_len[1];
220 		r = _uccase_size - 3;
221 	}
222 	return case_lookup(code, l, r, field);
223 }
224 
php_unicode_totitle(unsigned long code,enum mbfl_no_encoding enc TSRMLS_DC)225 MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
226 {
227 	int field;
228 	long l, r;
229 
230 	if (php_unicode_is_title(code))
231 		return code;
232 
233 	/*
234 	 * The offset will always be the same for converting to title case.
235 	 */
236 	field = 2;
237 
238 	if (php_unicode_is_upper(code)) {
239 		/*
240 		 * The character is upper case.
241 		 */
242 		l = 0;
243 		r = _uccase_len[0] - 3;
244 	} else {
245 		/*
246 		 * The character is lower case.
247 		 */
248 		l = _uccase_len[0];
249 		r = (l + _uccase_len[1]) - 3;
250 	}
251 	return case_lookup(code, l, r, field);
252 
253 }
254 
255 
/*
 * Read four bytes at `ptr` as a big-endian (most significant byte first)
 * unsigned 32-bit value.
 *
 * Each byte is widened to unsigned int BEFORE shifting.  Without that, the
 * byte is promoted to signed int, and shifting a value >= 0x80 left by 24 is
 * undefined behaviour; the negative result would also sign-extend when
 * passed on as an unsigned long on platforms where long is 64 bits wide.
 *
 * Note: `ptr` is evaluated four times; do not pass an expression with side
 * effects.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned int)((unsigned char*)(ptr))[0] << 24) |\
	((unsigned int)((unsigned char*)(ptr))[1] << 16) |\
	((unsigned int)((unsigned char*)(ptr))[2] <<  8) |\
	((unsigned int)((unsigned char*)(ptr))[3]      ) )

/*
 * Store the low 32 bits of `val` at `ptr` as four bytes, most significant
 * byte first.
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement (safe in an unbraced if/else); terminate it with a semicolon.
 * `val` and `ptr` are each evaluated exactly once, `val` first.
 */
#define UINT32_TO_BE_ARY(ptr,val) do { \
	unsigned int be_ary_val_ = (unsigned int)(val); \
	unsigned char *be_ary_dst_ = (unsigned char*)(ptr); \
	be_ary_dst_[0] = (unsigned char)((be_ary_val_ >> 24) & 0xff); \
	be_ary_dst_[1] = (unsigned char)((be_ary_val_ >> 16) & 0xff); \
	be_ary_dst_[2] = (unsigned char)((be_ary_val_ >>  8) & 0xff); \
	be_ary_dst_[3] = (unsigned char)((be_ary_val_      ) & 0xff); \
} while (0)
269 
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const char * src_encoding TSRMLS_DC)270 MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
271 		const char *src_encoding TSRMLS_DC)
272 {
273 	char *unicode, *newstr;
274 	size_t unicode_len;
275 	unsigned char *unicode_ptr;
276 	size_t i;
277 	enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
278 
279 	if (_src_encoding == mbfl_no_encoding_invalid) {
280 		php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
281 		return NULL;
282 	}
283 
284 	unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len TSRMLS_CC);
285 	if (unicode == NULL)
286 		return NULL;
287 
288 	unicode_ptr = (unsigned char *)unicode;
289 
290 	switch(case_mode) {
291 		case PHP_UNICODE_CASE_UPPER:
292 			for (i = 0; i < unicode_len; i+=4) {
293 				UINT32_TO_BE_ARY(&unicode_ptr[i],
294 					php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
295 			}
296 			break;
297 
298 		case PHP_UNICODE_CASE_LOWER:
299 			for (i = 0; i < unicode_len; i+=4) {
300 				UINT32_TO_BE_ARY(&unicode_ptr[i],
301 					php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
302 			}
303 			break;
304 
305 		case PHP_UNICODE_CASE_TITLE: {
306 			int mode = 0;
307 
308 			for (i = 0; i < unicode_len; i+=4) {
309 				int res = php_unicode_is_prop(
310 					BE_ARY_TO_UINT32(&unicode_ptr[i]),
311 					UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
312 				if (mode) {
313 					if (res) {
314 						UINT32_TO_BE_ARY(&unicode_ptr[i],
315 							php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
316 					} else {
317 						mode = 0;
318 					}
319 				} else {
320 					if (res) {
321 						mode = 1;
322 						UINT32_TO_BE_ARY(&unicode_ptr[i],
323 							php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
324 					}
325 				}
326 			}
327 		} break;
328 
329 	}
330 
331 	newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len TSRMLS_CC);
332 	efree(unicode);
333 
334 	return newstr;
335 }
336 
337 
338 #endif /* HAVE_MBSTRING */
339 
340 /*
341  * Local variables:
342  * tab-width: 4
343  * c-basic-offset: 4
344  * End:
345  * vim600: sw=4 ts=4 fdm=marker
346  * vim<600: sw=4 ts=4
347  */
348