xref: /PHP-7.2/ext/mbstring/php_unicode.c (revision 7a7ec01a)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) 1997-2018 The PHP Group                                |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Wez Furlong (wez@thebrainroom.com)                           |
16    +----------------------------------------------------------------------+
17 
18 	Based on code from ucdata-2.5, which has the following Copyright:
19 
20 	Copyright 2001 Computing Research Labs, New Mexico State University
21 
22 	Permission is hereby granted, free of charge, to any person obtaining a
23 	copy of this software and associated documentation files (the "Software"),
24 	to deal in the Software without restriction, including without limitation
25 	the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 	and/or sell copies of the Software, and to permit persons to whom the
27 	Software is furnished to do so, subject to the following conditions:
28 
29 	The above copyright notice and this permission notice shall be included in
30 	all copies or substantial portions of the Software.
31 */
32 
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36 
37 #include "php.h"
38 #include "php_ini.h"
39 
40 #if HAVE_MBSTRING
41 
42 /* include case folding data generated from the official UnicodeData.txt file */
43 #include "mbstring.h"
44 #include "php_unicode.h"
45 #include "unicode_data.h"
46 
47 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48 
/*
 * Single-bit lookup table: masks32[i] has only bit i set, for i = 0..31.
 * php_unicode_is_prop() indexes it to test individual bits of its property
 * masks.  The table is read-only, so it is declared const.
 */
static const unsigned long masks32[32] = {
    0x00000001, 0x00000002, 0x00000004, 0x00000008, 0x00000010, 0x00000020,
    0x00000040, 0x00000080, 0x00000100, 0x00000200, 0x00000400, 0x00000800,
    0x00001000, 0x00002000, 0x00004000, 0x00008000, 0x00010000, 0x00020000,
    0x00040000, 0x00080000, 0x00100000, 0x00200000, 0x00400000, 0x00800000,
    0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, 0x20000000,
    0x40000000, 0x80000000
};
60 
61 
prop_lookup(unsigned long code,unsigned long n)62 static int prop_lookup(unsigned long code, unsigned long n)
63 {
64 	long l, r, m;
65 
66 	/*
67 	 * There is an extra node on the end of the offsets to allow this routine
68 	 * to work right.  If the index is 0xffff, then there are no nodes for the
69 	 * property.
70 	 */
71 	if ((l = _ucprop_offsets[n]) == 0xffff)
72 		return 0;
73 
74 	/*
75 	 * Locate the next offset that is not 0xffff.  The sentinel at the end of
76 	 * the array is the max index value.
77 	 */
78 	for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
79 		;
80 
81 	r = _ucprop_offsets[n + m] - 1;
82 
83 	while (l <= r) {
84 		/*
85 		 * Determine a "mid" point and adjust to make sure the mid point is at
86 		 * the beginning of a range pair.
87 		 */
88 		m = (l + r) >> 1;
89 		m -= (m & 1);
90 		if (code > _ucprop_ranges[m + 1])
91 			l = m + 2;
92 		else if (code < _ucprop_ranges[m])
93 			r = m - 2;
94 		else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
95 			return 1;
96 	}
97 	return 0;
98 
99 }
100 
php_unicode_is_prop(unsigned long code,unsigned long mask1,unsigned long mask2)101 MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
102 		unsigned long mask2)
103 {
104 	unsigned long i;
105 
106 	if (mask1 == 0 && mask2 == 0)
107 		return 0;
108 
109 	for (i = 0; mask1 && i < 32; i++) {
110 		if ((mask1 & masks32[i]) && prop_lookup(code, i))
111 			return 1;
112 	}
113 
114 	for (i = 32; mask2 && i < _ucprop_size; i++) {
115 		if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
116 			return 1;
117 	}
118 
119 	return 0;
120 }
121 
case_lookup(unsigned long code,long l,long r,int field)122 static unsigned long case_lookup(unsigned long code, long l, long r, int field)
123 {
124 	long m;
125 	const unsigned int *tmp;
126 
127 	/*
128 	 * Do the binary search.
129 	 */
130 	while (l <= r) {
131 		/*
132 		 * Determine a "mid" point and adjust to make sure the mid point is at
133 		 * the beginning of a case mapping triple.
134 		 */
135 		m = (l + r) >> 1;
136 		tmp = &_uccase_map[m*3];
137 		if (code > *tmp)
138 			l = m + 1;
139 		else if (code < *tmp)
140 			r = m - 1;
141 		else if (code == *tmp)
142 			return tmp[field];
143 	}
144 
145 	return code;
146 }
147 
php_turkish_toupper(unsigned long code,long l,long r,int field)148 MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
149 {
150 	if (code == 0x0069L) {
151 		return 0x0130L;
152 	}
153 	return case_lookup(code, l, r, field);
154 }
155 
php_turkish_tolower(unsigned long code,long l,long r,int field)156 MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
157 {
158 	if (code == 0x0049L) {
159 		return 0x0131L;
160 	}
161 	return case_lookup(code, l, r, field);
162 }
163 
php_unicode_toupper(unsigned long code,enum mbfl_no_encoding enc)164 MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc)
165 {
166 	int field;
167 	long l, r;
168 
169 	if (php_unicode_is_upper(code))
170 		return code;
171 
172 	if (php_unicode_is_lower(code)) {
173 		/*
174 		 * The character is lower case.
175 		 */
176 		field = 1;
177 		l = _uccase_len[0];
178 		r = (l + _uccase_len[1]) - 1;
179 
180 		if (enc == mbfl_no_encoding_8859_9) {
181 			return php_turkish_toupper(code, l, r, field);
182 		}
183 
184 	} else {
185 		/*
186 		 * The character is title case.
187 		 */
188 		field = 1;
189 		l = _uccase_len[0] + _uccase_len[1];
190 		r = _uccase_size - 1;
191 	}
192 	return case_lookup(code, l, r, field);
193 }
194 
php_unicode_tolower(unsigned long code,enum mbfl_no_encoding enc)195 MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc)
196 {
197 	int field;
198 	long l, r;
199 
200 	if (php_unicode_is_lower(code))
201 		return code;
202 
203 	if (php_unicode_is_upper(code)) {
204 		/*
205 		 * The character is upper case.
206 		 */
207 		field = 1;
208 		l = 0;
209 		r = _uccase_len[0] - 1;
210 
211 		if (enc == mbfl_no_encoding_8859_9) {
212 			return php_turkish_tolower(code, l, r, field);
213 		}
214 
215 	} else {
216 		/*
217 		 * The character is title case.
218 		 */
219 		field = 2;
220 		l = _uccase_len[0] + _uccase_len[1];
221 		r = _uccase_size - 1;
222 	}
223 	return case_lookup(code, l, r, field);
224 }
225 
php_unicode_totitle(unsigned long code,enum mbfl_no_encoding enc)226 MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc)
227 {
228 	int field;
229 	long l, r;
230 
231 	if (php_unicode_is_title(code))
232 		return code;
233 
234 	/*
235 	 * The offset will always be the same for converting to title case.
236 	 */
237 	field = 2;
238 
239 	if (php_unicode_is_upper(code)) {
240 		/*
241 		 * The character is upper case.
242 		 */
243 		l = 0;
244 		r = _uccase_len[0] - 1;
245 	} else {
246 		/*
247 		 * The character is lower case.
248 		 */
249 		l = _uccase_len[0];
250 		r = (l + _uccase_len[1]) - 1;
251 	}
252 	return case_lookup(code, l, r, field);
253 
254 }
255 
256 
/*
 * Read the four bytes at `ptr` as one big-endian 32-bit unsigned value.
 *
 * Each byte is widened to unsigned int before it is shifted.  Without the
 * cast the byte is promoted to (signed) int, so a leading byte >= 0x80
 * would be shifted into the sign bit, which is undefined behaviour, and the
 * resulting negative int would sign-extend when converted to a wider
 * unsigned type such as a 64-bit unsigned long.
 *
 * Note: `ptr` is evaluated four times; do not pass an expression with side
 * effects.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned int)((const unsigned char*)(ptr))[0]<<24) |\
	((unsigned int)((const unsigned char*)(ptr))[1]<<16) |\
	((unsigned int)((const unsigned char*)(ptr))[2]<< 8) |\
	((unsigned int)((const unsigned char*)(ptr))[3]    ) )
262 
/*
 * Store the low 32 bits of `val` at `ptr` as four big-endian bytes.
 *
 * The body is wrapped in do { ... } while (0) so that the macro behaves as
 * a single statement and can be used safely in an unbraced if/else.
 * `val` is evaluated exactly once, into a temporary whose name is unlikely
 * to clash with identifiers used in the argument.  `ptr` is evaluated four
 * times; do not pass an expression with side effects.
 */
#define UINT32_TO_BE_ARY(ptr,val) do { \
	unsigned int be_ary_val_ = (val); \
	((unsigned char*)(ptr))[0] = (be_ary_val_>>24) & 0xff;\
	((unsigned char*)(ptr))[1] = (be_ary_val_>>16) & 0xff;\
	((unsigned char*)(ptr))[2] = (be_ary_val_>> 8) & 0xff;\
	((unsigned char*)(ptr))[3] = (be_ary_val_    ) & 0xff;\
} while (0)
270 
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const char * src_encoding)271 MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
272 		const char *src_encoding)
273 {
274 	char *unicode, *newstr;
275 	size_t unicode_len;
276 	unsigned char *unicode_ptr;
277 	size_t i;
278 	enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
279 
280 	if (_src_encoding == mbfl_no_encoding_invalid) {
281 		php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
282 		return NULL;
283 	}
284 
285 	unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len);
286 	if (unicode == NULL)
287 		return NULL;
288 
289 	unicode_ptr = (unsigned char *)unicode;
290 
291 	switch(case_mode) {
292 		case PHP_UNICODE_CASE_UPPER:
293 			for (i = 0; i < unicode_len; i+=4) {
294 				UINT32_TO_BE_ARY(&unicode_ptr[i],
295 					php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
296 			}
297 			break;
298 
299 		case PHP_UNICODE_CASE_LOWER:
300 			for (i = 0; i < unicode_len; i+=4) {
301 				UINT32_TO_BE_ARY(&unicode_ptr[i],
302 					php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
303 			}
304 			break;
305 
306 		case PHP_UNICODE_CASE_TITLE: {
307 			int mode = 0;
308 
309 			for (i = 0; i < unicode_len; i+=4) {
310 				int res = php_unicode_is_prop(
311 					BE_ARY_TO_UINT32(&unicode_ptr[i]),
312 					UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
313 				if (mode) {
314 					if (res) {
315 						UINT32_TO_BE_ARY(&unicode_ptr[i],
316 							php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
317 					} else {
318 						mode = 0;
319 					}
320 				} else {
321 					if (res) {
322 						mode = 1;
323 						UINT32_TO_BE_ARY(&unicode_ptr[i],
324 							php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
325 					}
326 				}
327 			}
328 		} break;
329 
330 	}
331 
332 	newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len);
333 	efree(unicode);
334 
335 	return newstr;
336 }
337 
338 
339 #endif /* HAVE_MBSTRING */
340 
341 /*
342  * Local variables:
343  * tab-width: 4
344  * c-basic-offset: 4
345  * End:
346  * vim600: sw=4 ts=4 fdm=marker
347  * vim<600: sw=4 ts=4
348  */
349