1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 5 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2014 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Wez Furlong (wez@thebrainroom.com) |
16 +----------------------------------------------------------------------+
17
18 Based on code from ucdata-2.5, which has the following Copyright:
19
20 Copyright 2001 Computing Research Labs, New Mexico State University
21
22 Permission is hereby granted, free of charge, to any person obtaining a
23 copy of this software and associated documentation files (the "Software"),
24 to deal in the Software without restriction, including without limitation
25 the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 and/or sell copies of the Software, and to permit persons to whom the
27 Software is furnished to do so, subject to the following conditions:
28
29 The above copyright notice and this permission notice shall be included in
30 all copies or substantial portions of the Software.
31 */
32
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36
37 #include "php.h"
38 #include "php_ini.h"
39
40 #if HAVE_MBSTRING
41
42 /* include case folding data generated from the official UnicodeData.txt file */
43 #include "mbstring.h"
44 #include "php_unicode.h"
45 #include "unicode_data.h"
46
47 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48
/*
 * Single-bit masks: masks32[i] has only bit i set, for i in 0..31.
 * The table is never modified, so it is declared const.
 */
static const unsigned long masks32[32] = {
	0x00000001, 0x00000002, 0x00000004, 0x00000008,
	0x00000010, 0x00000020, 0x00000040, 0x00000080,
	0x00000100, 0x00000200, 0x00000400, 0x00000800,
	0x00001000, 0x00002000, 0x00004000, 0x00008000,
	0x00010000, 0x00020000, 0x00040000, 0x00080000,
	0x00100000, 0x00200000, 0x00400000, 0x00800000,
	0x01000000, 0x02000000, 0x04000000, 0x08000000,
	0x10000000, 0x20000000, 0x40000000, 0x80000000
};
60
61
prop_lookup(unsigned long code,unsigned long n)62 static int prop_lookup(unsigned long code, unsigned long n)
63 {
64 long l, r, m;
65
66 /*
67 * There is an extra node on the end of the offsets to allow this routine
68 * to work right. If the index is 0xffff, then there are no nodes for the
69 * property.
70 */
71 if ((l = _ucprop_offsets[n]) == 0xffff)
72 return 0;
73
74 /*
75 * Locate the next offset that is not 0xffff. The sentinel at the end of
76 * the array is the max index value.
77 */
78 for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
79 ;
80
81 r = _ucprop_offsets[n + m] - 1;
82
83 while (l <= r) {
84 /*
85 * Determine a "mid" point and adjust to make sure the mid point is at
86 * the beginning of a range pair.
87 */
88 m = (l + r) >> 1;
89 m -= (m & 1);
90 if (code > _ucprop_ranges[m + 1])
91 l = m + 2;
92 else if (code < _ucprop_ranges[m])
93 r = m - 2;
94 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
95 return 1;
96 }
97 return 0;
98
99 }
100
php_unicode_is_prop(unsigned long code,unsigned long mask1,unsigned long mask2)101 MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
102 unsigned long mask2)
103 {
104 unsigned long i;
105
106 if (mask1 == 0 && mask2 == 0)
107 return 0;
108
109 for (i = 0; mask1 && i < 32; i++) {
110 if ((mask1 & masks32[i]) && prop_lookup(code, i))
111 return 1;
112 }
113
114 for (i = 32; mask2 && i < _ucprop_size; i++) {
115 if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
116 return 1;
117 }
118
119 return 0;
120 }
121
case_lookup(unsigned long code,long l,long r,int field)122 static unsigned long case_lookup(unsigned long code, long l, long r, int field)
123 {
124 long m;
125
126 /*
127 * Do the binary search.
128 */
129 while (l <= r) {
130 /*
131 * Determine a "mid" point and adjust to make sure the mid point is at
132 * the beginning of a case mapping triple.
133 */
134 m = (l + r) >> 1;
135 m -= (m % 3);
136 if (code > _uccase_map[m])
137 l = m + 3;
138 else if (code < _uccase_map[m])
139 r = m - 3;
140 else if (code == _uccase_map[m])
141 return _uccase_map[m + field];
142 }
143
144 return code;
145 }
146
php_turkish_toupper(unsigned long code,long l,long r,int field)147 MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
148 {
149 if (code == 0x0069L) {
150 return 0x0130L;
151 }
152 return case_lookup(code, l, r, field);
153 }
154
php_turkish_tolower(unsigned long code,long l,long r,int field)155 MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
156 {
157 if (code == 0x0049L) {
158 return 0x0131L;
159 }
160 return case_lookup(code, l, r, field);
161 }
162
php_unicode_toupper(unsigned long code,enum mbfl_no_encoding enc TSRMLS_DC)163 MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
164 {
165 int field;
166 long l, r;
167
168 if (php_unicode_is_upper(code))
169 return code;
170
171 if (php_unicode_is_lower(code)) {
172 /*
173 * The character is lower case.
174 */
175 field = 2;
176 l = _uccase_len[0];
177 r = (l + _uccase_len[1]) - 3;
178
179 if (enc == mbfl_no_encoding_8859_9) {
180 return php_turkish_toupper(code, l, r, field);
181 }
182
183 } else {
184 /*
185 * The character is title case.
186 */
187 field = 1;
188 l = _uccase_len[0] + _uccase_len[1];
189 r = _uccase_size - 3;
190 }
191 return case_lookup(code, l, r, field);
192 }
193
php_unicode_tolower(unsigned long code,enum mbfl_no_encoding enc TSRMLS_DC)194 MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
195 {
196 int field;
197 long l, r;
198
199 if (php_unicode_is_lower(code))
200 return code;
201
202 if (php_unicode_is_upper(code)) {
203 /*
204 * The character is upper case.
205 */
206 field = 1;
207 l = 0;
208 r = _uccase_len[0] - 3;
209
210 if (enc == mbfl_no_encoding_8859_9) {
211 return php_turkish_tolower(code, l, r, field);
212 }
213
214 } else {
215 /*
216 * The character is title case.
217 */
218 field = 2;
219 l = _uccase_len[0] + _uccase_len[1];
220 r = _uccase_size - 3;
221 }
222 return case_lookup(code, l, r, field);
223 }
224
php_unicode_totitle(unsigned long code,enum mbfl_no_encoding enc TSRMLS_DC)225 MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc TSRMLS_DC)
226 {
227 int field;
228 long l, r;
229
230 if (php_unicode_is_title(code))
231 return code;
232
233 /*
234 * The offset will always be the same for converting to title case.
235 */
236 field = 2;
237
238 if (php_unicode_is_upper(code)) {
239 /*
240 * The character is upper case.
241 */
242 l = 0;
243 r = _uccase_len[0] - 3;
244 } else {
245 /*
246 * The character is lower case.
247 */
248 l = _uccase_len[0];
249 r = (l + _uccase_len[1]) - 3;
250 }
251 return case_lookup(code, l, r, field);
252
253 }
254
255
/*
 * Assemble the four bytes at `ptr`, most significant byte first, into one
 * unsigned value in the range 0 .. 0xffffffff.
 *
 * Each byte is widened to unsigned long before it is shifted. Without the
 * cast an unsigned char promotes to (signed) int, so shifting a lead byte
 * >= 0x80 left by 24 overflows int and the negative result sign-extends
 * when it is converted to a wider unsigned long.
 *
 * `ptr` is evaluated four times; do not pass an expression with side
 * effects.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned long)((unsigned char*)(ptr))[0]<<24) |\
	((unsigned long)((unsigned char*)(ptr))[1]<<16) |\
	((unsigned long)((unsigned char*)(ptr))[2]<< 8) |\
	((unsigned long)((unsigned char*)(ptr))[3]) )
261
/*
 * Store the low 32 bits of `val` into the four bytes at `ptr`, most
 * significant byte first.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe in an unbraced if/else); the call site supplies the trailing
 * semicolon. `val` is evaluated exactly once, `ptr` four times.
 */
#define UINT32_TO_BE_ARY(ptr,val) do { \
	unsigned long be_val_ = (val); \
	((unsigned char*)(ptr))[0] = (unsigned char)((be_val_>>24) & 0xff); \
	((unsigned char*)(ptr))[1] = (unsigned char)((be_val_>>16) & 0xff); \
	((unsigned char*)(ptr))[2] = (unsigned char)((be_val_>> 8) & 0xff); \
	((unsigned char*)(ptr))[3] = (unsigned char)((be_val_    ) & 0xff); \
} while (0)
269
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const char * src_encoding TSRMLS_DC)270 MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
271 const char *src_encoding TSRMLS_DC)
272 {
273 char *unicode, *newstr;
274 size_t unicode_len;
275 unsigned char *unicode_ptr;
276 size_t i;
277 enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
278
279 if (_src_encoding == mbfl_no_encoding_invalid) {
280 php_error_docref(NULL TSRMLS_CC, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
281 return NULL;
282 }
283
284 unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len TSRMLS_CC);
285 if (unicode == NULL)
286 return NULL;
287
288 unicode_ptr = (unsigned char *)unicode;
289
290 switch(case_mode) {
291 case PHP_UNICODE_CASE_UPPER:
292 for (i = 0; i < unicode_len; i+=4) {
293 UINT32_TO_BE_ARY(&unicode_ptr[i],
294 php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
295 }
296 break;
297
298 case PHP_UNICODE_CASE_LOWER:
299 for (i = 0; i < unicode_len; i+=4) {
300 UINT32_TO_BE_ARY(&unicode_ptr[i],
301 php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
302 }
303 break;
304
305 case PHP_UNICODE_CASE_TITLE: {
306 int mode = 0;
307
308 for (i = 0; i < unicode_len; i+=4) {
309 int res = php_unicode_is_prop(
310 BE_ARY_TO_UINT32(&unicode_ptr[i]),
311 UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
312 if (mode) {
313 if (res) {
314 UINT32_TO_BE_ARY(&unicode_ptr[i],
315 php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
316 } else {
317 mode = 0;
318 }
319 } else {
320 if (res) {
321 mode = 1;
322 UINT32_TO_BE_ARY(&unicode_ptr[i],
323 php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding TSRMLS_CC));
324 }
325 }
326 }
327 } break;
328
329 }
330
331 newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len TSRMLS_CC);
332 efree(unicode);
333
334 return newstr;
335 }
336
337
338 #endif /* HAVE_MBSTRING */
339
340 /*
341 * Local variables:
342 * tab-width: 4
343 * c-basic-offset: 4
344 * End:
345 * vim600: sw=4 ts=4 fdm=marker
346 * vim<600: sw=4 ts=4
347 */
348