1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Wez Furlong (wez@thebrainroom.com) |
14 +----------------------------------------------------------------------+
15
16 Based on code from ucdata-2.5, which has the following Copyright:
17
18 Copyright 2001 Computing Research Labs, New Mexico State University
19
20 Permission is hereby granted, free of charge, to any person obtaining a
21 copy of this software and associated documentation files (the "Software"),
22 to deal in the Software without restriction, including without limitation
23 the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 and/or sell copies of the Software, and to permit persons to whom the
25 Software is furnished to do so, subject to the following conditions:
26
27 The above copyright notice and this permission notice shall be included in
28 all copies or substantial portions of the Software.
29 */
30
31 #include "php.h"
32
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37 #include "libmbfl/mbfl/mbfilter_wchar.h"
38
ZEND_EXTERN_MODULE_GLOBALS(mbstring)39 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
40
41 static bool prop_lookup(unsigned long code, unsigned long n)
42 {
43 long l = _ucprop_offsets[n];
44 long r = _ucprop_offsets[n + 1] - 1;
45 while (l <= r) {
46 /*
47 * Determine a "mid" point and adjust to make sure the mid point is at
48 * the beginning of a range pair.
49 */
50 long m = (l + r) >> 1;
51 m -= (m & 1);
52 if (code > _ucprop_ranges[m + 1])
53 l = m + 2;
54 else if (code < _ucprop_ranges[m])
55 r = m - 2;
56 else
57 return true;
58 }
59 return false;
60
61 }
62
php_unicode_is_prop1(unsigned long code,int prop)63 MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
64 {
65 return prop_lookup(code, prop);
66 }
67
php_unicode_is_prop(unsigned long code,...)68 MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...)
69 {
70 bool result = false;
71 va_list va;
72 va_start(va, code);
73
74 while (1) {
75 int prop = va_arg(va, int);
76 if (prop < 0) {
77 break;
78 }
79
80 if (prop_lookup(code, prop)) {
81 result = true;
82 break;
83 }
84 }
85
86 va_end(va);
87 return result;
88 }
89
/*
 * Hash function used by mph_lookup().
 *
 * d: seed/displacement value XORed into the key.
 * x: key to hash.
 *
 * Returns the seeded key mixed with its own upper bits and multiplied by a
 * fixed constant (unsigned arithmetic, so the product wraps).
 */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	unsigned seeded = x ^ d;

	return (seeded ^ (seeded >> 16)) * 0x45d9f3b;
}
95
/* Sentinel returned by mph_lookup() when the key has no entry. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Look up `code` in a hash table described by two arrays.
 *
 * code:         key to look up.
 * g_table:      first-level table, indexed by mph_hash(0, code) % g_table_size.
 *               An entry <= 0 is the negated slot number; a positive entry is
 *               used as the seed for a second hash.
 * g_table_size: number of entries in g_table.
 * table:        flat array of (key, value) pairs; slot i occupies table[2*i]
 *               and table[2*i + 1].
 * table_size:   modulus applied to the second hash to obtain the slot.
 *
 * Returns the stored value if the key in the selected slot equals `code`,
 * CODE_NOT_FOUND otherwise.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	short g = g_table[mph_hash(0, code) % g_table_size];
	unsigned slot = (g <= 0)
		? (unsigned) -g
		: mph_hash(g, code) % table_size;
	const unsigned *entry = &table[2 * slot];

	/* The slot is only a candidate: confirm that it really holds our key. */
	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
117
/*
 * Run mph_lookup() for `code` against the tables of one case-mapping kind.
 * `type` is pasted into the table names, so CASE_LOOKUP(c, upper) uses
 * _uccase_upper_g, _uccase_upper_g_size, _uccase_upper_table and
 * _uccase_upper_table_size.
 */
#define CASE_LOOKUP(code, type) \
	mph_lookup( \
		(code), \
		_uccase_##type##_g, _uccase_##type##_g_size, \
		_uccase_##type##_table, _uccase_##type##_table_size)
121
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)122 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
123 {
124 /* After the ASCII characters, the first codepoint with an uppercase version
125 * is 0xB5 (MICRO SIGN) */
126 if (code < 0xB5) {
127 /* Fast path for ASCII */
128 if (code >= 0x61 && code <= 0x7A) {
129 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
130 return 0x130;
131 }
132 return code - 0x20;
133 }
134 return code;
135 } else {
136 unsigned new_code = CASE_LOOKUP(code, upper);
137 if (new_code != CODE_NOT_FOUND) {
138 return new_code;
139 }
140 return code;
141 }
142 }
143
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)144 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
145 {
146 /* After the ASCII characters, the first codepoint with a lowercase version
147 * is 0xC0 (LATIN CAPITAL LETTER A WITH GRAVE) */
148 if (code < 0xC0) {
149 /* Fast path for ASCII */
150 if (code >= 0x41 && code <= 0x5A) {
151 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
152 return 0x0131L;
153 }
154 return code + 0x20;
155 }
156 return code;
157 } else {
158 unsigned new_code = CASE_LOOKUP(code, lower);
159 if (new_code != CODE_NOT_FOUND) {
160 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
161 return 0x69;
162 }
163 return new_code;
164 }
165 return code;
166 }
167 }
168
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)169 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
170 {
171 unsigned new_code = CASE_LOOKUP(code, title);
172 if (new_code != CODE_NOT_FOUND) {
173 return new_code;
174 }
175
176 /* No dedicated title-case variant, use to-upper instead */
177 return php_unicode_toupper_raw(code, enc);
178 }
179
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)180 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
181 {
182 if (code < 0x80) {
183 /* Fast path for ASCII */
184 if (code >= 0x41 && code <= 0x5A) {
185 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
186 return 0x131;
187 }
188 return code + 0x20;
189 }
190 return code;
191 } else {
192 unsigned new_code = CASE_LOOKUP(code, fold);
193 if (new_code != CODE_NOT_FOUND) {
194 if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
195 return 0x69;
196 }
197 return new_code;
198 }
199 return code;
200 }
201 }
202
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)203 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
204 code = php_unicode_tolower_raw(code, enc);
205 if (UNEXPECTED(code > 0xffffff)) {
206 return _uccase_extra_table[code & 0xffffff];
207 }
208 return code;
209 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)210 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
211 code = php_unicode_toupper_raw(code, enc);
212 if (UNEXPECTED(code > 0xffffff)) {
213 return _uccase_extra_table[code & 0xffffff];
214 }
215 return code;
216 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)217 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
218 code = php_unicode_totitle_raw(code, enc);
219 if (UNEXPECTED(code > 0xffffff)) {
220 return _uccase_extra_table[code & 0xffffff];
221 }
222 return code;
223 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)224 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
225 code = php_unicode_tofold_raw(code, enc);
226 if (UNEXPECTED(code > 0xffffff)) {
227 return _uccase_extra_table[code & 0xffffff];
228 }
229 return code;
230 }
231
/*
 * Lower-case a codepoint and push the result to `next_filter`.
 *
 * A raw result of at most 0xffffff is emitted as a single value. A larger
 * result is decoded: the top 8 bits give the number of values to emit and the
 * low 24 bits index _uccase_extra_table; the values emitted are the entries
 * that FOLLOW the indexed one.
 */
static inline void php_unicode_tolower_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_tolower_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned count = raw >> 24;
		unsigned i;

		/* seq[0] is skipped on purpose; emission starts at seq[1]. */
		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
245
/*
 * Upper-case a codepoint and push the result to `next_filter`.
 *
 * A raw result of at most 0xffffff is emitted as a single value. A larger
 * result is decoded: the top 8 bits give the number of values to emit and the
 * low 24 bits index _uccase_extra_table; the values emitted are the entries
 * that FOLLOW the indexed one.
 */
static inline void php_unicode_toupper_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_toupper_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned count = raw >> 24;
		unsigned i;

		/* seq[0] is skipped on purpose; emission starts at seq[1]. */
		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
259
/*
 * Title-case a codepoint and push the result to `next_filter`.
 *
 * A raw result of at most 0xffffff is emitted as a single value. A larger
 * result is decoded: the top 8 bits give the number of values to emit and the
 * low 24 bits index _uccase_extra_table; the values emitted are the entries
 * that FOLLOW the indexed one.
 */
static inline void php_unicode_totitle_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_totitle_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned count = raw >> 24;
		unsigned i;

		/* seq[0] is skipped on purpose; emission starts at seq[1]. */
		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
273
/*
 * Case-fold a codepoint and push the result to `next_filter`.
 *
 * A raw result of at most 0xffffff is emitted as a single value. A larger
 * result is decoded: the top 8 bits give the number of values to emit and the
 * low 24 bits index _uccase_extra_table; the values emitted are the entries
 * that FOLLOW the indexed one.
 */
static inline void php_unicode_tofold_full(unsigned code, enum mbfl_no_encoding enc,
		mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_tofold_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned count = raw >> 24;
		unsigned i;

		/* seq[0] is skipped on purpose; emission starts at seq[1]. */
		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
287
288 struct convert_case_data {
289 mbfl_convert_filter *next_filter;
290 enum mbfl_no_encoding no_encoding;
291 int case_mode;
292 int title_mode;
293 };
294
convert_case_filter(int c,void * void_data)295 static int convert_case_filter(int c, void *void_data)
296 {
297 struct convert_case_data *data = (struct convert_case_data *) void_data;
298 unsigned code;
299
300 /* Handle invalid characters early, as we assign special meaning to
301 * codepoints above 0xffffff. */
302 if (UNEXPECTED((unsigned) c > 0xffffff)) {
303 (*data->next_filter->filter_function)(c, data->next_filter);
304 return 0;
305 }
306
307 switch (data->case_mode) {
308 case PHP_UNICODE_CASE_UPPER_SIMPLE:
309 code = php_unicode_toupper_simple(c, data->no_encoding);
310 (data->next_filter->filter_function)(code, data->next_filter);
311 break;
312
313 case PHP_UNICODE_CASE_UPPER:
314 php_unicode_toupper_full(c, data->no_encoding, data->next_filter);
315 break;
316
317 case PHP_UNICODE_CASE_LOWER_SIMPLE:
318 code = php_unicode_tolower_simple(c, data->no_encoding);
319 (data->next_filter->filter_function)(code, data->next_filter);
320 break;
321
322 case PHP_UNICODE_CASE_LOWER:
323 php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
324 break;
325
326 case PHP_UNICODE_CASE_FOLD:
327 php_unicode_tofold_full(c, data->no_encoding, data->next_filter);
328 break;
329
330 case PHP_UNICODE_CASE_FOLD_SIMPLE:
331 code = php_unicode_tofold_simple(c, data->no_encoding);
332 (data->next_filter->filter_function)(code, data->next_filter);
333 break;
334
335 case PHP_UNICODE_CASE_TITLE_SIMPLE:
336 case PHP_UNICODE_CASE_TITLE:
337 {
338 if (data->title_mode) {
339 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
340 code = php_unicode_tolower_simple(c, data->no_encoding);
341 (data->next_filter->filter_function)(code, data->next_filter);
342 } else {
343 php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
344 }
345 } else {
346 if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
347 code = php_unicode_totitle_simple(c, data->no_encoding);
348 (data->next_filter->filter_function)(code, data->next_filter);
349 } else {
350 php_unicode_totitle_full(c, data->no_encoding, data->next_filter);
351 }
352 }
353 if (!php_unicode_is_case_ignorable(c)) {
354 data->title_mode = php_unicode_is_cased(c);
355 }
356 break;
357 }
358 EMPTY_SWITCH_DEFAULT_CASE()
359 }
360
361 return 0;
362 }
363
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)364 MBSTRING_API char *php_unicode_convert_case(
365 int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
366 const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
367 {
368 struct convert_case_data data;
369 mbfl_convert_filter *from_wchar, *to_wchar;
370 mbfl_string result;
371
372 mbfl_memory_device device;
373 mbfl_memory_device_init(&device, srclen + 1, 0);
374
375 /* encoding -> wchar filter */
376 to_wchar = mbfl_convert_filter_new(src_encoding,
377 &mbfl_encoding_wchar, convert_case_filter, NULL, &data);
378 if (to_wchar == NULL) {
379 mbfl_memory_device_clear(&device);
380 return NULL;
381 }
382
383 /* wchar -> encoding filter */
384 from_wchar = mbfl_convert_filter_new(
385 &mbfl_encoding_wchar, src_encoding,
386 mbfl_memory_device_output, NULL, &device);
387 if (from_wchar == NULL) {
388 mbfl_convert_filter_delete(to_wchar);
389 mbfl_memory_device_clear(&device);
390 return NULL;
391 }
392
393 to_wchar->illegal_mode = illegal_mode;
394 to_wchar->illegal_substchar = illegal_substchar;
395 from_wchar->illegal_mode = illegal_mode;
396 from_wchar->illegal_substchar = illegal_substchar;
397
398 data.next_filter = from_wchar;
399 data.no_encoding = src_encoding->no_encoding;
400 data.case_mode = case_mode;
401 data.title_mode = 0;
402
403 {
404 /* feed data */
405 const unsigned char *p = (const unsigned char *) srcstr;
406 size_t n = srclen;
407 while (n > 0) {
408 if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
409 break;
410 }
411 n--;
412 }
413 }
414
415 mbfl_convert_filter_flush(to_wchar);
416 mbfl_convert_filter_flush(from_wchar);
417 mbfl_memory_device_result(&device, &result);
418 mbfl_convert_filter_delete(to_wchar);
419 mbfl_convert_filter_delete(from_wchar);
420
421 *ret_len = result.len;
422 return (char *) result.val;
423 }
424