1 /*
2 +----------------------------------------------------------------------+
3 | PHP Version 7 |
4 +----------------------------------------------------------------------+
5 | Copyright (c) 1997-2018 The PHP Group |
6 +----------------------------------------------------------------------+
7 | This source file is subject to version 3.01 of the PHP license, |
8 | that is bundled with this package in the file LICENSE, and is |
9 | available through the world-wide-web at the following url: |
10 | http://www.php.net/license/3_01.txt |
11 | If you did not receive a copy of the PHP license and are unable to |
12 | obtain it through the world-wide-web, please send a note to |
13 | license@php.net so we can mail you a copy immediately. |
14 +----------------------------------------------------------------------+
15 | Author: Wez Furlong (wez@thebrainroom.com) |
16 +----------------------------------------------------------------------+
17
18 Based on code from ucdata-2.5, which has the following Copyright:
19
20 Copyright 2001 Computing Research Labs, New Mexico State University
21
22 Permission is hereby granted, free of charge, to any person obtaining a
23 copy of this software and associated documentation files (the "Software"),
24 to deal in the Software without restriction, including without limitation
25 the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 and/or sell copies of the Software, and to permit persons to whom the
27 Software is furnished to do so, subject to the following conditions:
28
29 The above copyright notice and this permission notice shall be included in
30 all copies or substantial portions of the Software.
31 */
32
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36
37 #include "php.h"
38 #include "php_ini.h"
39
40 #if HAVE_MBSTRING
41
42 /* include case folding data generated from the official UnicodeData.txt file */
43 #include "mbstring.h"
44 #include "php_unicode.h"
45 #include "unicode_data.h"
46 #include "libmbfl/mbfl/mbfilter_wchar.h"
47
ZEND_EXTERN_MODULE_GLOBALS(mbstring)48 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
49
50 static int prop_lookup(unsigned long code, unsigned long n)
51 {
52 long l, r, m;
53
54 /*
55 * There is an extra node on the end of the offsets to allow this routine
56 * to work right. If the index is 0xffff, then there are no nodes for the
57 * property.
58 */
59 if ((l = _ucprop_offsets[n]) == 0xffff)
60 return 0;
61
62 /*
63 * Locate the next offset that is not 0xffff. The sentinel at the end of
64 * the array is the max index value.
65 */
66 for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
67 ;
68
69 r = _ucprop_offsets[n + m] - 1;
70
71 while (l <= r) {
72 /*
73 * Determine a "mid" point and adjust to make sure the mid point is at
74 * the beginning of a range pair.
75 */
76 m = (l + r) >> 1;
77 m -= (m & 1);
78 if (code > _ucprop_ranges[m + 1])
79 l = m + 2;
80 else if (code < _ucprop_ranges[m])
81 r = m - 2;
82 else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
83 return 1;
84 }
85 return 0;
86
87 }
88
php_unicode_is_prop1(unsigned long code,int prop)89 MBSTRING_API int php_unicode_is_prop1(unsigned long code, int prop)
90 {
91 return prop_lookup(code, prop);
92 }
93
php_unicode_is_prop(unsigned long code,...)94 MBSTRING_API int php_unicode_is_prop(unsigned long code, ...)
95 {
96 int result = 0;
97 va_list va;
98 va_start(va, code);
99
100 while (1) {
101 int prop = va_arg(va, int);
102 if (prop < 0) {
103 break;
104 }
105
106 if (prop_lookup(code, prop)) {
107 result = 1;
108 break;
109 }
110 }
111
112 va_end(va);
113 return result;
114 }
115
/*
 * Hash function used by the case-mapping lookup tables.
 *
 * `d` is mixed into the key `x` with XOR; the upper 16 bits are then folded
 * into the lower ones and the result is multiplied by a fixed odd constant
 * (unsigned arithmetic, so the product wraps).
 */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	const unsigned seeded = x ^ d;

	return (seeded ^ (seeded >> 16)) * 0x45d9f3b;
}
121
/* Returned by mph_lookup() when the key is not present in the table. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Look up `code` in a two-level hash table.
 *
 * g_table / g_table_size: first-level table, indexed by mph_hash(0, code).
 *     A value <= 0 is the negated slot number; a positive value is the
 *     seed for a second mph_hash() call that selects the slot.
 * table / table_size: second-level table of (key, value) pairs stored
 *     flat, i.e. slot i occupies table[2*i] and table[2*i + 1].
 *
 * Returns the value stored with `code`, or CODE_NOT_FOUND when the selected
 * slot holds a different key.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	const short seed = g_table[mph_hash(0, code) % g_table_size];
	const unsigned slot = (seed <= 0)
		? (unsigned) -seed
		: mph_hash(seed, code) % table_size;
	const unsigned *pair = &table[2 * slot];

	/* The slot always exists; only a key match makes it a hit. */
	return (pair[0] == code) ? pair[1] : CODE_NOT_FOUND;
}
143
/*
 * Run mph_lookup() against the generated tables of one case mapping.
 * `kind` is pasted into the table names, e.g. CASE_LOOKUP(cp, upper) uses
 * _uccase_upper_g, _uccase_upper_g_size, _uccase_upper_table and
 * _uccase_upper_table_size.
 */
#define CASE_LOOKUP(cp, kind) \
	mph_lookup((cp), \
		_uccase_##kind##_g, _uccase_##kind##_g_size, \
		_uccase_##kind##_table, _uccase_##kind##_table_size)
147
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)148 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
149 {
150 if (code < 0x80) {
151 /* Fast path for ASCII */
152 if (code >= 0x61 && code <= 0x7A) {
153 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
154 return 0x130;
155 }
156 return code - 0x20;
157 }
158 return code;
159 } else {
160 unsigned new_code = CASE_LOOKUP(code, upper);
161 if (new_code != CODE_NOT_FOUND) {
162 return new_code;
163 }
164 return code;
165 }
166 }
167
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)168 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
169 {
170 if (code < 0x80) {
171 /* Fast path for ASCII */
172 if (code >= 0x41 && code <= 0x5A) {
173 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
174 return 0x0131L;
175 }
176 return code + 0x20;
177 }
178 return code;
179 } else {
180 unsigned new_code = CASE_LOOKUP(code, lower);
181 if (new_code != CODE_NOT_FOUND) {
182 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
183 return 0x69;
184 }
185 return new_code;
186 }
187 return code;
188 }
189 }
190
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)191 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
192 {
193 unsigned new_code = CASE_LOOKUP(code, title);
194 if (new_code != CODE_NOT_FOUND) {
195 return new_code;
196 }
197
198 /* No dedicated title-case variant, use to-upper instead */
199 return php_unicode_toupper_raw(code, enc);
200 }
201
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)202 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
203 {
204 if (code < 0x80) {
205 /* Fast path for ASCII */
206 if (code >= 0x41 && code <= 0x5A) {
207 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
208 return 0x131;
209 }
210 return code + 0x20;
211 }
212 return code;
213 } else {
214 unsigned new_code = CASE_LOOKUP(code, fold);
215 if (new_code != CODE_NOT_FOUND) {
216 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
217 return 0x69;
218 }
219 return new_code;
220 }
221 return code;
222 }
223 }
224
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)225 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
226 code = php_unicode_tolower_raw(code, enc);
227 if (UNEXPECTED(code > 0xffffff)) {
228 return _uccase_extra_table[code & 0xffffff];
229 }
230 return code;
231 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)232 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
233 code = php_unicode_toupper_raw(code, enc);
234 if (UNEXPECTED(code > 0xffffff)) {
235 return _uccase_extra_table[code & 0xffffff];
236 }
237 return code;
238 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)239 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
240 code = php_unicode_totitle_raw(code, enc);
241 if (UNEXPECTED(code > 0xffffff)) {
242 return _uccase_extra_table[code & 0xffffff];
243 }
244 return code;
245 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)246 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
247 code = php_unicode_tofold_raw(code, enc);
248 if (UNEXPECTED(code > 0xffffff)) {
249 return _uccase_extra_table[code & 0xffffff];
250 }
251 return code;
252 }
253
php_unicode_tolower_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)254 static inline unsigned php_unicode_tolower_full(
255 unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
256 code = php_unicode_tolower_raw(code, enc);
257 if (UNEXPECTED(code > 0xffffff)) {
258 unsigned len = code >> 24;
259 const unsigned *p = &_uccase_extra_table[code & 0xffffff];
260 memcpy(out, p + 1, len * sizeof(unsigned));
261 return len;
262 }
263 *out = code;
264 return 1;
265 }
php_unicode_toupper_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)266 static inline unsigned php_unicode_toupper_full(
267 unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
268 code = php_unicode_toupper_raw(code, enc);
269 if (UNEXPECTED(code > 0xffffff)) {
270 unsigned len = code >> 24;
271 const unsigned *p = &_uccase_extra_table[code & 0xffffff];
272 memcpy(out, p + 1, len * sizeof(unsigned));
273 return len;
274 }
275 *out = code;
276 return 1;
277 }
php_unicode_totitle_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)278 static inline unsigned php_unicode_totitle_full(
279 unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
280 code = php_unicode_totitle_raw(code, enc);
281 if (UNEXPECTED(code > 0xffffff)) {
282 unsigned len = code >> 24;
283 const unsigned *p = &_uccase_extra_table[code & 0xffffff];
284 memcpy(out, p + 1, len * sizeof(unsigned));
285 return len;
286 }
287 *out = code;
288 return 1;
289 }
php_unicode_tofold_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)290 static inline unsigned php_unicode_tofold_full(
291 unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
292 code = php_unicode_tofold_raw(code, enc);
293 if (UNEXPECTED(code > 0xffffff)) {
294 unsigned len = code >> 24;
295 const unsigned *p = &_uccase_extra_table[code & 0xffffff];
296 memcpy(out, p + 1, len * sizeof(unsigned));
297 return len;
298 }
299 *out = code;
300 return 1;
301 }
302
/* State handed to convert_case_filter() through its void* argument. */
struct convert_case_data {
	/* Filter that receives every converted codepoint. */
	mbfl_convert_filter *next_filter;
	/* Encoding id of the source string; selects the ISO-8859-9 special cases. */
	enum mbfl_no_encoding no_encoding;
	/* One of the PHP_UNICODE_CASE_* modes. */
	int case_mode;
	/* Title-case state: starts at 0; when non-zero, the title modes
	 * lower-case the next codepoint instead of title-casing it. Updated
	 * from php_unicode_is_cased() after each codepoint that is not
	 * case-ignorable. */
	int title_mode;
};
309
convert_case_filter(int c,void * void_data)310 static int convert_case_filter(int c, void *void_data)
311 {
312 struct convert_case_data *data = (struct convert_case_data *) void_data;
313 unsigned out[3];
314 unsigned len, i;
315
316 /* Handle invalid characters early, as we assign special meaning to
317 * codepoints above 0xffffff. */
318 if (UNEXPECTED((unsigned) c > 0xffffff)) {
319 (*data->next_filter->filter_function)(c, data->next_filter);
320 return 0;
321 }
322
323 switch (data->case_mode) {
324 case PHP_UNICODE_CASE_UPPER_SIMPLE:
325 out[0] = php_unicode_toupper_simple(c, data->no_encoding);
326 len = 1;
327 break;
328
329 case PHP_UNICODE_CASE_UPPER:
330 len = php_unicode_toupper_full(c, data->no_encoding, out);
331 break;
332
333 case PHP_UNICODE_CASE_LOWER_SIMPLE:
334 out[0] = php_unicode_tolower_simple(c, data->no_encoding);
335 len = 1;
336 break;
337
338 case PHP_UNICODE_CASE_LOWER:
339 len = php_unicode_tolower_full(c, data->no_encoding, out);
340 break;
341
342 case PHP_UNICODE_CASE_FOLD:
343 len = php_unicode_tofold_full(c, data->no_encoding, out);
344 break;
345
346 case PHP_UNICODE_CASE_FOLD_SIMPLE:
347 out[0] = php_unicode_tofold_simple(c, data->no_encoding);
348 len = 1;
349 break;
350
351 case PHP_UNICODE_CASE_TITLE_SIMPLE:
352 case PHP_UNICODE_CASE_TITLE:
353 {
354 if (data->title_mode) {
355 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
356 out[0] = php_unicode_tolower_simple(c, data->no_encoding);
357 len = 1;
358 } else {
359 len = php_unicode_tolower_full(c, data->no_encoding, out);
360 }
361 } else {
362 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
363 out[0] = php_unicode_totitle_simple(c, data->no_encoding);
364 len = 1;
365 } else {
366 len = php_unicode_totitle_full(c, data->no_encoding, out);
367 }
368 }
369 if (!php_unicode_is_case_ignorable(c)) {
370 data->title_mode = php_unicode_is_cased(c);
371 }
372 break;
373 }
374 default:
375 assert(0);
376 break;
377 }
378
379 for (i = 0; i < len; i++) {
380 (*data->next_filter->filter_function)(out[i], data->next_filter);
381 }
382 return 0;
383 }
384
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)385 MBSTRING_API char *php_unicode_convert_case(
386 int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
387 const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
388 {
389 struct convert_case_data data;
390 mbfl_convert_filter *from_wchar, *to_wchar;
391 mbfl_string result, *result_ptr;
392
393 mbfl_memory_device device;
394 mbfl_memory_device_init(&device, srclen + 1, 0);
395
396 /* encoding -> wchar filter */
397 to_wchar = mbfl_convert_filter_new(src_encoding,
398 &mbfl_encoding_wchar, convert_case_filter, NULL, &data);
399 if (to_wchar == NULL) {
400 mbfl_memory_device_clear(&device);
401 return NULL;
402 }
403
404 /* wchar -> encoding filter */
405 from_wchar = mbfl_convert_filter_new(
406 &mbfl_encoding_wchar, src_encoding,
407 mbfl_memory_device_output, NULL, &device);
408 if (from_wchar == NULL) {
409 mbfl_convert_filter_delete(to_wchar);
410 mbfl_memory_device_clear(&device);
411 return NULL;
412 }
413
414 to_wchar->illegal_mode = illegal_mode;
415 to_wchar->illegal_substchar = illegal_substchar;
416 from_wchar->illegal_mode = illegal_mode;
417 from_wchar->illegal_substchar = illegal_substchar;
418
419 data.next_filter = from_wchar;
420 data.no_encoding = src_encoding->no_encoding;
421 data.case_mode = case_mode;
422 data.title_mode = 0;
423
424 {
425 /* feed data */
426 const unsigned char *p = (const unsigned char *) srcstr;
427 size_t n = srclen;
428 while (n > 0) {
429 if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
430 break;
431 }
432 n--;
433 }
434 }
435
436 mbfl_convert_filter_flush(to_wchar);
437 mbfl_convert_filter_flush(from_wchar);
438 result_ptr = mbfl_memory_device_result(&device, &result);
439 mbfl_convert_filter_delete(to_wchar);
440 mbfl_convert_filter_delete(from_wchar);
441
442 if (!result_ptr) {
443 return NULL;
444 }
445
446 *ret_len = result.len;
447 return (char *) result.val;
448 }
449
450
451 #endif /* HAVE_MBSTRING */
452
453 /*
454 * Local variables:
455 * tab-width: 4
456 * c-basic-offset: 4
457 * End:
458 * vim600: sw=4 ts=4 fdm=marker
459 * vim<600: sw=4 ts=4
460 */
461