1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | http://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Wez Furlong (wez@thebrainroom.com) |
14 +----------------------------------------------------------------------+
15
16 Based on code from ucdata-2.5, which has the following Copyright:
17
18 Copyright 2001 Computing Research Labs, New Mexico State University
19
20 Permission is hereby granted, free of charge, to any person obtaining a
21 copy of this software and associated documentation files (the "Software"),
22 to deal in the Software without restriction, including without limitation
23 the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 and/or sell copies of the Software, and to permit persons to whom the
25 Software is furnished to do so, subject to the following conditions:
26
27 The above copyright notice and this permission notice shall be included in
28 all copies or substantial portions of the Software.
29 */
30
31 #include "php.h"
32
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37 #include "libmbfl/mbfl/mbfilter_wchar.h"
38
ZEND_EXTERN_MODULE_GLOBALS(mbstring)39 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
40
41 static int prop_lookup(unsigned long code, unsigned long n)
42 {
43 long l, r, m;
44
45 /*
46 * There is an extra node on the end of the offsets to allow this routine
47 * to work right. If the index is 0xffff, then there are no nodes for the
48 * property.
49 */
50 if ((l = _ucprop_offsets[n]) == 0xffff)
51 return 0;
52
53 /*
54 * Locate the next offset that is not 0xffff. The sentinel at the end of
55 * the array is the max index value.
56 */
57 for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
58 ;
59
60 r = _ucprop_offsets[n + m] - 1;
61
62 while (l <= r) {
63 /*
64 * Determine a "mid" point and adjust to make sure the mid point is at
65 * the beginning of a range pair.
66 */
67 m = (l + r) >> 1;
68 m -= (m & 1);
69 if (code > _ucprop_ranges[m + 1])
70 l = m + 2;
71 else if (code < _ucprop_ranges[m])
72 r = m - 2;
73 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
74 return 1;
75 }
76 return 0;
77
78 }
79
php_unicode_is_prop1(unsigned long code,int prop)80 MBSTRING_API int php_unicode_is_prop1(unsigned long code, int prop)
81 {
82 return prop_lookup(code, prop);
83 }
84
php_unicode_is_prop(unsigned long code,...)85 MBSTRING_API int php_unicode_is_prop(unsigned long code, ...)
86 {
87 int result = 0;
88 va_list va;
89 va_start(va, code);
90
91 while (1) {
92 int prop = va_arg(va, int);
93 if (prop < 0) {
94 break;
95 }
96
97 if (prop_lookup(code, prop)) {
98 result = 1;
99 break;
100 }
101 }
102
103 va_end(va);
104 return result;
105 }
106
/* Hash used by mph_lookup(): XORs the seed `d` into `x`, folds the upper
 * 16 bits into the lower ones and multiplies by a fixed constant. The
 * arithmetic is unsigned, so the product wraps. */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	unsigned mixed = x ^ d;

	mixed ^= mixed >> 16;
	return mixed * 0x45d9f3b;
}
112
/* Returned by mph_lookup() when the table has no entry for the code point. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Look up `code` in a hashed key/value table.
 *
 * The first hash (seed 0) selects an entry of g_table. A value <= 0 there is
 * the negated slot number; a positive value is the seed for a second hash
 * that yields the slot. `table` stores one (key, value) pair per slot at
 * table[2*slot] and table[2*slot + 1].
 *
 * Returns the stored value when the key in the slot equals `code`, and
 * CODE_NOT_FOUND otherwise.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	const short disp = g_table[mph_hash(0, code) % g_table_size];
	const unsigned slot = (disp > 0)
		? mph_hash(disp, code) % table_size
		: (unsigned) -disp;
	const unsigned *entry = &table[2 * slot];

	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
134
/* Look up code point `cp` in the case-mapping table named by `kind` (this
 * file uses upper, lower, title and fold). Evaluates to the table value, or
 * CODE_NOT_FOUND when there is no entry. */
#define CASE_LOOKUP(cp, kind) \
	mph_lookup((cp), \
		_uccase_##kind##_g, _uccase_##kind##_g_size, \
		_uccase_##kind##_table, _uccase_##kind##_table_size)
138
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)139 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
140 {
141 if (code < 0x80) {
142 /* Fast path for ASCII */
143 if (code >= 0x61 && code <= 0x7A) {
144 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
145 return 0x130;
146 }
147 return code - 0x20;
148 }
149 return code;
150 } else {
151 unsigned new_code = CASE_LOOKUP(code, upper);
152 if (new_code != CODE_NOT_FOUND) {
153 return new_code;
154 }
155 return code;
156 }
157 }
158
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)159 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
160 {
161 if (code < 0x80) {
162 /* Fast path for ASCII */
163 if (code >= 0x41 && code <= 0x5A) {
164 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
165 return 0x0131L;
166 }
167 return code + 0x20;
168 }
169 return code;
170 } else {
171 unsigned new_code = CASE_LOOKUP(code, lower);
172 if (new_code != CODE_NOT_FOUND) {
173 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
174 return 0x69;
175 }
176 return new_code;
177 }
178 return code;
179 }
180 }
181
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)182 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
183 {
184 unsigned new_code = CASE_LOOKUP(code, title);
185 if (new_code != CODE_NOT_FOUND) {
186 return new_code;
187 }
188
189 /* No dedicated title-case variant, use to-upper instead */
190 return php_unicode_toupper_raw(code, enc);
191 }
192
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)193 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
194 {
195 if (code < 0x80) {
196 /* Fast path for ASCII */
197 if (code >= 0x41 && code <= 0x5A) {
198 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
199 return 0x131;
200 }
201 return code + 0x20;
202 }
203 return code;
204 } else {
205 unsigned new_code = CASE_LOOKUP(code, fold);
206 if (new_code != CODE_NOT_FOUND) {
207 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
208 return 0x69;
209 }
210 return new_code;
211 }
212 return code;
213 }
214 }
215
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)216 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
217 code = php_unicode_tolower_raw(code, enc);
218 if (UNEXPECTED(code > 0xffffff)) {
219 return _uccase_extra_table[code & 0xffffff];
220 }
221 return code;
222 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)223 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
224 code = php_unicode_toupper_raw(code, enc);
225 if (UNEXPECTED(code > 0xffffff)) {
226 return _uccase_extra_table[code & 0xffffff];
227 }
228 return code;
229 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)230 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
231 code = php_unicode_totitle_raw(code, enc);
232 if (UNEXPECTED(code > 0xffffff)) {
233 return _uccase_extra_table[code & 0xffffff];
234 }
235 return code;
236 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)237 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
238 code = php_unicode_tofold_raw(code, enc);
239 if (UNEXPECTED(code > 0xffffff)) {
240 return _uccase_extra_table[code & 0xffffff];
241 }
242 return code;
243 }
244
/*
 * Lower-case `code` and pass the result to `next_filter`, which may receive
 * more than one code point.
 *
 * A raw result above 0xffffff encodes an expansion: the top 8 bits are the
 * number of code points and the low 24 bits index _uccase_extra_table. The
 * expansion starts one entry after that index. The filter's return values
 * are ignored.
 */
static inline void php_unicode_tolower_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_tolower_raw(code, enc);
	const unsigned *seq;
	unsigned count, i;

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
258
/*
 * Upper-case `code` and pass the result to `next_filter`, which may receive
 * more than one code point.
 *
 * A raw result above 0xffffff encodes an expansion: the top 8 bits are the
 * number of code points and the low 24 bits index _uccase_extra_table. The
 * expansion starts one entry after that index. The filter's return values
 * are ignored.
 */
static inline void php_unicode_toupper_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_toupper_raw(code, enc);
	const unsigned *seq;
	unsigned count, i;

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
272
/*
 * Title-case `code` and pass the result to `next_filter`, which may receive
 * more than one code point.
 *
 * A raw result above 0xffffff encodes an expansion: the top 8 bits are the
 * number of code points and the low 24 bits index _uccase_extra_table. The
 * expansion starts one entry after that index. The filter's return values
 * are ignored.
 */
static inline void php_unicode_totitle_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_totitle_raw(code, enc);
	const unsigned *seq;
	unsigned count, i;

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
286
/*
 * Case-fold `code` and pass the result to `next_filter`, which may receive
 * more than one code point.
 *
 * A raw result above 0xffffff encodes an expansion: the top 8 bits are the
 * number of code points and the low 24 bits index _uccase_extra_table. The
 * expansion starts one entry after that index. The filter's return values
 * are ignored.
 */
static inline void php_unicode_tofold_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_tofold_raw(code, enc);
	const unsigned *seq;
	unsigned count, i;

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
300
/* State handed to convert_case_filter() through its void_data argument. */
struct convert_case_data {
	/* Filter that receives the case-converted code points. */
	mbfl_convert_filter *next_filter;
	/* Encoding id of the source string; forwarded to the case-mapping
	 * helpers, which special-case mbfl_no_encoding_8859_9. */
	enum mbfl_no_encoding no_encoding;
	/* One of the PHP_UNICODE_CASE_* modes; selects the conversion applied. */
	int case_mode;
	/* Title-case state: while zero the next code point is title-cased,
	 * while non-zero it is lower-cased. Starts at 0 and is set to
	 * php_unicode_is_cased(c) after every code point that is not
	 * case-ignorable. */
	int title_mode;
};
307
convert_case_filter(int c,void * void_data)308 static int convert_case_filter(int c, void *void_data)
309 {
310 struct convert_case_data *data = (struct convert_case_data *) void_data;
311 unsigned code;
312
313 /* Handle invalid characters early, as we assign special meaning to
314 * codepoints above 0xffffff. */
315 if (UNEXPECTED((unsigned) c > 0xffffff)) {
316 (*data->next_filter->filter_function)(c, data->next_filter);
317 return 0;
318 }
319
320 switch (data->case_mode) {
321 case PHP_UNICODE_CASE_UPPER_SIMPLE:
322 code = php_unicode_toupper_simple(c, data->no_encoding);
323 (data->next_filter->filter_function)(code, data->next_filter);
324 break;
325
326 case PHP_UNICODE_CASE_UPPER:
327 php_unicode_toupper_full(c, data->no_encoding, data->next_filter);
328 break;
329
330 case PHP_UNICODE_CASE_LOWER_SIMPLE:
331 code = php_unicode_tolower_simple(c, data->no_encoding);
332 (data->next_filter->filter_function)(code, data->next_filter);
333 break;
334
335 case PHP_UNICODE_CASE_LOWER:
336 php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
337 break;
338
339 case PHP_UNICODE_CASE_FOLD:
340 php_unicode_tofold_full(c, data->no_encoding, data->next_filter);
341 break;
342
343 case PHP_UNICODE_CASE_FOLD_SIMPLE:
344 code = php_unicode_tofold_simple(c, data->no_encoding);
345 (data->next_filter->filter_function)(code, data->next_filter);
346 break;
347
348 case PHP_UNICODE_CASE_TITLE_SIMPLE:
349 case PHP_UNICODE_CASE_TITLE:
350 {
351 if (data->title_mode) {
352 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
353 code = php_unicode_tolower_simple(c, data->no_encoding);
354 (data->next_filter->filter_function)(code, data->next_filter);
355 } else {
356 php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
357 }
358 } else {
359 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
360 code = php_unicode_totitle_simple(c, data->no_encoding);
361 (data->next_filter->filter_function)(code, data->next_filter);
362 } else {
363 php_unicode_totitle_full(c, data->no_encoding, data->next_filter);
364 }
365 }
366 if (!php_unicode_is_case_ignorable(c)) {
367 data->title_mode = php_unicode_is_cased(c);
368 }
369 break;
370 }
371 EMPTY_SWITCH_DEFAULT_CASE()
372 }
373
374 return 0;
375 }
376
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)377 MBSTRING_API char *php_unicode_convert_case(
378 int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
379 const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
380 {
381 struct convert_case_data data;
382 mbfl_convert_filter *from_wchar, *to_wchar;
383 mbfl_string result, *result_ptr;
384
385 mbfl_memory_device device;
386 mbfl_memory_device_init(&device, srclen + 1, 0);
387
388 /* encoding -> wchar filter */
389 to_wchar = mbfl_convert_filter_new(src_encoding,
390 &mbfl_encoding_wchar, convert_case_filter, NULL, &data);
391 if (to_wchar == NULL) {
392 mbfl_memory_device_clear(&device);
393 return NULL;
394 }
395
396 /* wchar -> encoding filter */
397 from_wchar = mbfl_convert_filter_new(
398 &mbfl_encoding_wchar, src_encoding,
399 mbfl_memory_device_output, NULL, &device);
400 if (from_wchar == NULL) {
401 mbfl_convert_filter_delete(to_wchar);
402 mbfl_memory_device_clear(&device);
403 return NULL;
404 }
405
406 to_wchar->illegal_mode = illegal_mode;
407 to_wchar->illegal_substchar = illegal_substchar;
408 from_wchar->illegal_mode = illegal_mode;
409 from_wchar->illegal_substchar = illegal_substchar;
410
411 data.next_filter = from_wchar;
412 data.no_encoding = src_encoding->no_encoding;
413 data.case_mode = case_mode;
414 data.title_mode = 0;
415
416 {
417 /* feed data */
418 const unsigned char *p = (const unsigned char *) srcstr;
419 size_t n = srclen;
420 while (n > 0) {
421 if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
422 break;
423 }
424 n--;
425 }
426 }
427
428 mbfl_convert_filter_flush(to_wchar);
429 mbfl_convert_filter_flush(from_wchar);
430 result_ptr = mbfl_memory_device_result(&device, &result);
431 mbfl_convert_filter_delete(to_wchar);
432 mbfl_convert_filter_delete(from_wchar);
433
434 if (!result_ptr) {
435 return NULL;
436 }
437
438 *ret_len = result.len;
439 return (char *) result.val;
440 }
441