1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2018 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Wez Furlong (wez@thebrainroom.com) |
16 +----------------------------------------------------------------------+
17
18 Based on code from ucdata-2.5, which has the following Copyright:
19
20 Copyright 2001 Computing Research Labs, New Mexico State University
21
22 Permission is hereby granted, free of charge, to any person obtaining a
23 copy of this software and associated documentation files (the "Software"),
24 to deal in the Software without restriction, including without limitation
25 the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 and/or sell copies of the Software, and to permit persons to whom the
27 Software is furnished to do so, subject to the following conditions:
28
29 The above copyright notice and this permission notice shall be included in
30 all copies or substantial portions of the Software.
31 */
32
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36
37 #include "php.h"
38 #include "php_ini.h"
39
40 #if HAVE_MBSTRING
41
42 /* include case folding data generated from the official UnicodeData.txt file */
43 #include "mbstring.h"
44 #include "php_unicode.h"
45 #include "unicode_data.h"
46
47 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
48
/*
 * Single-bit masks indexed by bit position: masks32[i] has only bit i set,
 * for i in 0..31.  php_unicode_is_prop() uses the table to test individual
 * bits of its property masks.
 *
 * The table is never written to, so it is const-qualified.
 */
static const unsigned long masks32[32] = {
	0x00000001, 0x00000002, 0x00000004, 0x00000008,
	0x00000010, 0x00000020, 0x00000040, 0x00000080,
	0x00000100, 0x00000200, 0x00000400, 0x00000800,
	0x00001000, 0x00002000, 0x00004000, 0x00008000,
	0x00010000, 0x00020000, 0x00040000, 0x00080000,
	0x00100000, 0x00200000, 0x00400000, 0x00800000,
	0x01000000, 0x02000000, 0x04000000, 0x08000000,
	0x10000000, 0x20000000, 0x40000000, 0x80000000
};
60
61
prop_lookup(unsigned long code,unsigned long n)62 static int prop_lookup(unsigned long code, unsigned long n)
63 {
64 long l, r, m;
65
66 /*
67 * There is an extra node on the end of the offsets to allow this routine
68 * to work right. If the index is 0xffff, then there are no nodes for the
69 * property.
70 */
71 if ((l = _ucprop_offsets[n]) == 0xffff)
72 return 0;
73
74 /*
75 * Locate the next offset that is not 0xffff. The sentinel at the end of
76 * the array is the max index value.
77 */
78 for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
79 ;
80
81 r = _ucprop_offsets[n + m] - 1;
82
83 while (l <= r) {
84 /*
85 * Determine a "mid" point and adjust to make sure the mid point is at
86 * the beginning of a range pair.
87 */
88 m = (l + r) >> 1;
89 m -= (m & 1);
90 if (code > _ucprop_ranges[m + 1])
91 l = m + 2;
92 else if (code < _ucprop_ranges[m])
93 r = m - 2;
94 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
95 return 1;
96 }
97 return 0;
98
99 }
100
php_unicode_is_prop(unsigned long code,unsigned long mask1,unsigned long mask2)101 MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
102 unsigned long mask2)
103 {
104 unsigned long i;
105
106 if (mask1 == 0 && mask2 == 0)
107 return 0;
108
109 for (i = 0; mask1 && i < 32; i++) {
110 if ((mask1 & masks32[i]) && prop_lookup(code, i))
111 return 1;
112 }
113
114 for (i = 32; mask2 && i < _ucprop_size; i++) {
115 if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
116 return 1;
117 }
118
119 return 0;
120 }
121
case_lookup(unsigned long code,long l,long r,int field)122 static unsigned long case_lookup(unsigned long code, long l, long r, int field)
123 {
124 long m;
125 const unsigned int *tmp;
126
127 /*
128 * Do the binary search.
129 */
130 while (l <= r) {
131 /*
132 * Determine a "mid" point and adjust to make sure the mid point is at
133 * the beginning of a case mapping triple.
134 */
135 m = (l + r) >> 1;
136 tmp = &_uccase_map[m*3];
137 if (code > *tmp)
138 l = m + 1;
139 else if (code < *tmp)
140 r = m - 1;
141 else if (code == *tmp)
142 return tmp[field];
143 }
144
145 return code;
146 }
147
php_turkish_toupper(unsigned long code,long l,long r,int field)148 MBSTRING_API unsigned long php_turkish_toupper(unsigned long code, long l, long r, int field)
149 {
150 if (code == 0x0069L) {
151 return 0x0130L;
152 }
153 return case_lookup(code, l, r, field);
154 }
155
php_turkish_tolower(unsigned long code,long l,long r,int field)156 MBSTRING_API unsigned long php_turkish_tolower(unsigned long code, long l, long r, int field)
157 {
158 if (code == 0x0049L) {
159 return 0x0131L;
160 }
161 return case_lookup(code, l, r, field);
162 }
163
php_unicode_toupper(unsigned long code,enum mbfl_no_encoding enc)164 MBSTRING_API unsigned long php_unicode_toupper(unsigned long code, enum mbfl_no_encoding enc)
165 {
166 int field;
167 long l, r;
168
169 if (php_unicode_is_upper(code))
170 return code;
171
172 if (php_unicode_is_lower(code)) {
173 /*
174 * The character is lower case.
175 */
176 field = 1;
177 l = _uccase_len[0];
178 r = (l + _uccase_len[1]) - 1;
179
180 if (enc == mbfl_no_encoding_8859_9) {
181 return php_turkish_toupper(code, l, r, field);
182 }
183
184 } else {
185 /*
186 * The character is title case.
187 */
188 field = 1;
189 l = _uccase_len[0] + _uccase_len[1];
190 r = _uccase_size - 1;
191 }
192 return case_lookup(code, l, r, field);
193 }
194
php_unicode_tolower(unsigned long code,enum mbfl_no_encoding enc)195 MBSTRING_API unsigned long php_unicode_tolower(unsigned long code, enum mbfl_no_encoding enc)
196 {
197 int field;
198 long l, r;
199
200 if (php_unicode_is_lower(code))
201 return code;
202
203 if (php_unicode_is_upper(code)) {
204 /*
205 * The character is upper case.
206 */
207 field = 1;
208 l = 0;
209 r = _uccase_len[0] - 1;
210
211 if (enc == mbfl_no_encoding_8859_9) {
212 return php_turkish_tolower(code, l, r, field);
213 }
214
215 } else {
216 /*
217 * The character is title case.
218 */
219 field = 2;
220 l = _uccase_len[0] + _uccase_len[1];
221 r = _uccase_size - 1;
222 }
223 return case_lookup(code, l, r, field);
224 }
225
php_unicode_totitle(unsigned long code,enum mbfl_no_encoding enc)226 MBSTRING_API unsigned long php_unicode_totitle(unsigned long code, enum mbfl_no_encoding enc)
227 {
228 int field;
229 long l, r;
230
231 if (php_unicode_is_title(code))
232 return code;
233
234 /*
235 * The offset will always be the same for converting to title case.
236 */
237 field = 2;
238
239 if (php_unicode_is_upper(code)) {
240 /*
241 * The character is upper case.
242 */
243 l = 0;
244 r = _uccase_len[0] - 1;
245 } else {
246 /*
247 * The character is lower case.
248 */
249 l = _uccase_len[0];
250 r = (l + _uccase_len[1]) - 1;
251 }
252 return case_lookup(code, l, r, field);
253
254 }
255
256
/*
 * Read the four bytes starting at `ptr` as a big-endian (most significant
 * byte first) 32-bit value.  The result has type unsigned int.
 *
 * Every byte is converted to unsigned int before it is shifted.  Shifting the
 * integer-promoted (signed int) byte instead overflows for a leading byte
 * >= 0x80, which is undefined behaviour and in practice produces a negative
 * value that sign-extends when widened to unsigned long.
 *
 * `ptr` is evaluated four times, so it must not have side effects.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned int)((const unsigned char*)(ptr))[0] << 24) |\
	((unsigned int)((const unsigned char*)(ptr))[1] << 16) |\
	((unsigned int)((const unsigned char*)(ptr))[2] <<  8) |\
	((unsigned int)((const unsigned char*)(ptr))[3]      ) )
262
/*
 * Store the low 32 bits of `val` into the four bytes starting at `ptr`, most
 * significant byte first.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves like a
 * single statement: "if (x) UINT32_TO_BE_ARY(p, v); else ..." compiles, which
 * a bare { } block followed by ';' does not.  `val` and `ptr` are each
 * evaluated exactly once, `val` first.
 */
#define UINT32_TO_BE_ARY(ptr,val) do { \
	unsigned int be_val_ = (val); \
	unsigned char *be_dst_ = (unsigned char*)(ptr); \
	be_dst_[0] = (be_val_ >> 24) & 0xff; \
	be_dst_[1] = (be_val_ >> 16) & 0xff; \
	be_dst_[2] = (be_val_ >>  8) & 0xff; \
	be_dst_[3] = (be_val_      ) & 0xff; \
} while (0)
270
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const char * src_encoding)271 MBSTRING_API char *php_unicode_convert_case(int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
272 const char *src_encoding)
273 {
274 char *unicode, *newstr;
275 size_t unicode_len;
276 unsigned char *unicode_ptr;
277 size_t i;
278 enum mbfl_no_encoding _src_encoding = mbfl_name2no_encoding(src_encoding);
279
280 if (_src_encoding == mbfl_no_encoding_invalid) {
281 php_error_docref(NULL, E_WARNING, "Unknown encoding \"%s\"", src_encoding);
282 return NULL;
283 }
284
285 unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len);
286 if (unicode == NULL)
287 return NULL;
288
289 unicode_ptr = (unsigned char *)unicode;
290
291 switch(case_mode) {
292 case PHP_UNICODE_CASE_UPPER:
293 for (i = 0; i < unicode_len; i+=4) {
294 UINT32_TO_BE_ARY(&unicode_ptr[i],
295 php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
296 }
297 break;
298
299 case PHP_UNICODE_CASE_LOWER:
300 for (i = 0; i < unicode_len; i+=4) {
301 UINT32_TO_BE_ARY(&unicode_ptr[i],
302 php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
303 }
304 break;
305
306 case PHP_UNICODE_CASE_TITLE: {
307 int mode = 0;
308
309 for (i = 0; i < unicode_len; i+=4) {
310 int res = php_unicode_is_prop(
311 BE_ARY_TO_UINT32(&unicode_ptr[i]),
312 UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT|UC_PO|UC_OS, 0);
313 if (mode) {
314 if (res) {
315 UINT32_TO_BE_ARY(&unicode_ptr[i],
316 php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
317 } else {
318 mode = 0;
319 }
320 } else {
321 if (res) {
322 mode = 1;
323 UINT32_TO_BE_ARY(&unicode_ptr[i],
324 php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i]), _src_encoding));
325 }
326 }
327 }
328 } break;
329
330 }
331
332 newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len);
333 efree(unicode);
334
335 return newstr;
336 }
337
338
339 #endif /* HAVE_MBSTRING */
340
341 /*
342 * Local variables:
343 * tab-width: 4
344 * c-basic-offset: 4
345 * End:
346 * vim600: sw=4 ts=4 fdm=marker
347 * vim<600: sw=4 ts=4
348 */
349