1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Wez Furlong (wez@thebrainroom.com) |
14 +----------------------------------------------------------------------+
15
16 Based on code from ucdata-2.5, which has the following Copyright:
17
18 Copyright 2001 Computing Research Labs, New Mexico State University
19
20 Permission is hereby granted, free of charge, to any person obtaining a
21 copy of this software and associated documentation files (the "Software"),
22 to deal in the Software without restriction, including without limitation
23 the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 and/or sell copies of the Software, and to permit persons to whom the
25 Software is furnished to do so, subject to the following conditions:
26
27 The above copyright notice and this permission notice shall be included in
28 all copies or substantial portions of the Software.
29 */
30
31 #include "php.h"
32
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37 #include "libmbfl/mbfl/mbfilter_wchar.h"
38
ZEND_EXTERN_MODULE_GLOBALS(mbstring)39 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
40
41 static bool prop_lookup(unsigned long code, unsigned long n)
42 {
43 long l = _ucprop_offsets[n];
44 long r = _ucprop_offsets[n + 1] - 1;
45 while (l <= r) {
46 /*
47 * Determine a "mid" point and adjust to make sure the mid point is at
48 * the beginning of a range pair.
49 */
50 long m = (l + r) >> 1;
51 m -= (m & 1);
52 if (code > _ucprop_ranges[m + 1])
53 l = m + 2;
54 else if (code < _ucprop_ranges[m])
55 r = m - 2;
56 else
57 return true;
58 }
59 return false;
60
61 }
62
php_unicode_is_prop1(unsigned long code,int prop)63 MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
64 {
65 return prop_lookup(code, prop);
66 }
67
php_unicode_is_prop(unsigned long code,...)68 MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...)
69 {
70 bool result = false;
71 va_list va;
72 va_start(va, code);
73
74 while (1) {
75 int prop = va_arg(va, int);
76 if (prop < 0) {
77 break;
78 }
79
80 if (prop_lookup(code, prop)) {
81 result = true;
82 break;
83 }
84 }
85
86 va_end(va);
87 return result;
88 }
89
/*
 * Hash step used by mph_lookup(): mixes the seed `d` into the key `x`, folds
 * the upper 16 bits onto the lower ones and multiplies by a fixed constant.
 * All arithmetic is unsigned and wraps modulo UINT_MAX + 1.
 */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	const unsigned seeded = x ^ d;
	return (seeded ^ (seeded >> 16)) * 0x45d9f3b;
}
95
/* Sentinel returned by mph_lookup() when the key is not present. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Two-level hash lookup.
 *
 * g_table / g_table_size: first-level table, indexed by mph_hash(0, code).
 *     An entry <= 0 gives the final slot directly (as its negation); an
 *     entry > 0 is used as the seed for a second hash that selects the slot.
 * table / table_size: second-level table laid out as (key, value) pairs, so
 *     slot i occupies table[2*i] and table[2*i + 1].
 *
 * Returns the value stored next to `code`, or CODE_NOT_FOUND when the
 * selected slot holds a different key.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	const short g = g_table[mph_hash(0, code) % g_table_size];
	const unsigned slot = (g > 0) ? mph_hash(g, code) % table_size : (unsigned) -g;
	const unsigned key_pos = 2 * slot;

	return (table[key_pos] == code) ? table[key_pos + 1] : CODE_NOT_FOUND;
}
117
/*
 * Look up `code` in the case-mapping tables for `type`. `type` is pasted into
 * the _uccase_<type>_g / _uccase_<type>_table identifiers (this file uses
 * upper, lower, title and fold). Evaluates to CODE_NOT_FOUND on a miss.
 */
#define CASE_LOOKUP(code, type) \
	mph_lookup((code), \
		_uccase_##type##_g, _uccase_##type##_g_size, \
		_uccase_##type##_table, _uccase_##type##_table_size)
121
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)122 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
123 {
124 if (code < 0x80) {
125 /* Fast path for ASCII */
126 if (code >= 0x61 && code <= 0x7A) {
127 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
128 return 0x130;
129 }
130 return code - 0x20;
131 }
132 return code;
133 } else {
134 unsigned new_code = CASE_LOOKUP(code, upper);
135 if (new_code != CODE_NOT_FOUND) {
136 return new_code;
137 }
138 return code;
139 }
140 }
141
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)142 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
143 {
144 if (code < 0x80) {
145 /* Fast path for ASCII */
146 if (code >= 0x41 && code <= 0x5A) {
147 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
148 return 0x0131L;
149 }
150 return code + 0x20;
151 }
152 return code;
153 } else {
154 unsigned new_code = CASE_LOOKUP(code, lower);
155 if (new_code != CODE_NOT_FOUND) {
156 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
157 return 0x69;
158 }
159 return new_code;
160 }
161 return code;
162 }
163 }
164
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)165 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
166 {
167 unsigned new_code = CASE_LOOKUP(code, title);
168 if (new_code != CODE_NOT_FOUND) {
169 return new_code;
170 }
171
172 /* No dedicated title-case variant, use to-upper instead */
173 return php_unicode_toupper_raw(code, enc);
174 }
175
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)176 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
177 {
178 if (code < 0x80) {
179 /* Fast path for ASCII */
180 if (code >= 0x41 && code <= 0x5A) {
181 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
182 return 0x131;
183 }
184 return code + 0x20;
185 }
186 return code;
187 } else {
188 unsigned new_code = CASE_LOOKUP(code, fold);
189 if (new_code != CODE_NOT_FOUND) {
190 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
191 return 0x69;
192 }
193 return new_code;
194 }
195 return code;
196 }
197 }
198
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)199 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
200 code = php_unicode_tolower_raw(code, enc);
201 if (UNEXPECTED(code > 0xffffff)) {
202 return _uccase_extra_table[code & 0xffffff];
203 }
204 return code;
205 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)206 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
207 code = php_unicode_toupper_raw(code, enc);
208 if (UNEXPECTED(code > 0xffffff)) {
209 return _uccase_extra_table[code & 0xffffff];
210 }
211 return code;
212 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)213 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
214 code = php_unicode_totitle_raw(code, enc);
215 if (UNEXPECTED(code > 0xffffff)) {
216 return _uccase_extra_table[code & 0xffffff];
217 }
218 return code;
219 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)220 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
221 code = php_unicode_tofold_raw(code, enc);
222 if (UNEXPECTED(code > 0xffffff)) {
223 return _uccase_extra_table[code & 0xffffff];
224 }
225 return code;
226 }
227
/*
 * Full lower-case mapping: sends one or more codepoints to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits hold its
 * length and the low 24 bits index _uccase_extra_table. The sequence starts
 * one element after that index; the element at the index itself is skipped.
 */
static inline void php_unicode_tolower_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter *next_filter) {
	const unsigned raw = php_unicode_tolower_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	const unsigned count = raw >> 24;
	const unsigned *seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (unsigned i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
241
/*
 * Full upper-case mapping: sends one or more codepoints to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits hold its
 * length and the low 24 bits index _uccase_extra_table. The sequence starts
 * one element after that index; the element at the index itself is skipped.
 */
static inline void php_unicode_toupper_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter *next_filter) {
	const unsigned raw = php_unicode_toupper_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	const unsigned count = raw >> 24;
	const unsigned *seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (unsigned i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
255
/*
 * Full title-case mapping: sends one or more codepoints to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits hold its
 * length and the low 24 bits index _uccase_extra_table. The sequence starts
 * one element after that index; the element at the index itself is skipped.
 */
static inline void php_unicode_totitle_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter *next_filter) {
	const unsigned raw = php_unicode_totitle_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	const unsigned count = raw >> 24;
	const unsigned *seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (unsigned i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
269
/*
 * Full case-fold mapping: sends one or more codepoints to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits hold its
 * length and the low 24 bits index _uccase_extra_table. The sequence starts
 * one element after that index; the element at the index itself is skipped.
 */
static inline void php_unicode_tofold_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter *next_filter) {
	const unsigned raw = php_unicode_tofold_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	const unsigned count = raw >> 24;
	const unsigned *seq = &_uccase_extra_table[(raw & 0xffffff) + 1];
	for (unsigned i = 0; i < count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
283
/* State handed to convert_case_filter() through its void* argument. */
struct convert_case_data {
	/* Filter that receives every converted codepoint */
	mbfl_convert_filter *next_filter;
	/* Encoding id forwarded to the case-mapping helpers, which special-case
	 * mbfl_no_encoding_8859_9 */
	enum mbfl_no_encoding no_encoding;
	/* One of the PHP_UNICODE_CASE_* modes; selects the branch taken in
	 * convert_case_filter() */
	int case_mode;
	/* Title-case state: zero means the next character is title-cased,
	 * non-zero means it is lower-cased. Initialised to 0 and updated after
	 * every character that is not case-ignorable. */
	int title_mode;
};
290
convert_case_filter(int c,void * void_data)291 static int convert_case_filter(int c, void *void_data)
292 {
293 struct convert_case_data *data = (struct convert_case_data *) void_data;
294 unsigned code;
295
296 /* Handle invalid characters early, as we assign special meaning to
297 * codepoints above 0xffffff. */
298 if (UNEXPECTED((unsigned) c > 0xffffff)) {
299 (*data->next_filter->filter_function)(c, data->next_filter);
300 return 0;
301 }
302
303 switch (data->case_mode) {
304 case PHP_UNICODE_CASE_UPPER_SIMPLE:
305 code = php_unicode_toupper_simple(c, data->no_encoding);
306 (data->next_filter->filter_function)(code, data->next_filter);
307 break;
308
309 case PHP_UNICODE_CASE_UPPER:
310 php_unicode_toupper_full(c, data->no_encoding, data->next_filter);
311 break;
312
313 case PHP_UNICODE_CASE_LOWER_SIMPLE:
314 code = php_unicode_tolower_simple(c, data->no_encoding);
315 (data->next_filter->filter_function)(code, data->next_filter);
316 break;
317
318 case PHP_UNICODE_CASE_LOWER:
319 php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
320 break;
321
322 case PHP_UNICODE_CASE_FOLD:
323 php_unicode_tofold_full(c, data->no_encoding, data->next_filter);
324 break;
325
326 case PHP_UNICODE_CASE_FOLD_SIMPLE:
327 code = php_unicode_tofold_simple(c, data->no_encoding);
328 (data->next_filter->filter_function)(code, data->next_filter);
329 break;
330
331 case PHP_UNICODE_CASE_TITLE_SIMPLE:
332 case PHP_UNICODE_CASE_TITLE:
333 {
334 if (data->title_mode) {
335 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
336 code = php_unicode_tolower_simple(c, data->no_encoding);
337 (data->next_filter->filter_function)(code, data->next_filter);
338 } else {
339 php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
340 }
341 } else {
342 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
343 code = php_unicode_totitle_simple(c, data->no_encoding);
344 (data->next_filter->filter_function)(code, data->next_filter);
345 } else {
346 php_unicode_totitle_full(c, data->no_encoding, data->next_filter);
347 }
348 }
349 if (!php_unicode_is_case_ignorable(c)) {
350 data->title_mode = php_unicode_is_cased(c);
351 }
352 break;
353 }
354 EMPTY_SWITCH_DEFAULT_CASE()
355 }
356
357 return 0;
358 }
359
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)360 MBSTRING_API char *php_unicode_convert_case(
361 int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
362 const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
363 {
364 struct convert_case_data data;
365 mbfl_convert_filter *from_wchar, *to_wchar;
366 mbfl_string result, *result_ptr;
367
368 mbfl_memory_device device;
369 mbfl_memory_device_init(&device, srclen + 1, 0);
370
371 /* encoding -> wchar filter */
372 to_wchar = mbfl_convert_filter_new(src_encoding,
373 &mbfl_encoding_wchar, convert_case_filter, NULL, &data);
374 if (to_wchar == NULL) {
375 mbfl_memory_device_clear(&device);
376 return NULL;
377 }
378
379 /* wchar -> encoding filter */
380 from_wchar = mbfl_convert_filter_new(
381 &mbfl_encoding_wchar, src_encoding,
382 mbfl_memory_device_output, NULL, &device);
383 if (from_wchar == NULL) {
384 mbfl_convert_filter_delete(to_wchar);
385 mbfl_memory_device_clear(&device);
386 return NULL;
387 }
388
389 to_wchar->illegal_mode = illegal_mode;
390 to_wchar->illegal_substchar = illegal_substchar;
391 from_wchar->illegal_mode = illegal_mode;
392 from_wchar->illegal_substchar = illegal_substchar;
393
394 data.next_filter = from_wchar;
395 data.no_encoding = src_encoding->no_encoding;
396 data.case_mode = case_mode;
397 data.title_mode = 0;
398
399 {
400 /* feed data */
401 const unsigned char *p = (const unsigned char *) srcstr;
402 size_t n = srclen;
403 while (n > 0) {
404 if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
405 break;
406 }
407 n--;
408 }
409 }
410
411 mbfl_convert_filter_flush(to_wchar);
412 mbfl_convert_filter_flush(from_wchar);
413 result_ptr = mbfl_memory_device_result(&device, &result);
414 mbfl_convert_filter_delete(to_wchar);
415 mbfl_convert_filter_delete(from_wchar);
416
417 if (!result_ptr) {
418 return NULL;
419 }
420
421 *ret_len = result.len;
422 return (char *) result.val;
423 }
424