1 /*
2 +----------------------------------------------------------------------+
3 | Copyright (c) The PHP Group |
4 +----------------------------------------------------------------------+
5 | This source file is subject to version 3.01 of the PHP license, |
6 | that is bundled with this package in the file LICENSE, and is |
7 | available through the world-wide-web at the following url: |
8 | https://www.php.net/license/3_01.txt |
9 | If you did not receive a copy of the PHP license and are unable to |
10 | obtain it through the world-wide-web, please send a note to |
11 | license@php.net so we can mail you a copy immediately. |
12 +----------------------------------------------------------------------+
13 | Author: Wez Furlong (wez@thebrainroom.com) |
14 +----------------------------------------------------------------------+
15
16 Based on code from ucdata-2.5, which has the following Copyright:
17
18 Copyright 2001 Computing Research Labs, New Mexico State University
19
20 Permission is hereby granted, free of charge, to any person obtaining a
21 copy of this software and associated documentation files (the "Software"),
22 to deal in the Software without restriction, including without limitation
23 the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 and/or sell copies of the Software, and to permit persons to whom the
25 Software is furnished to do so, subject to the following conditions:
26
27 The above copyright notice and this permission notice shall be included in
28 all copies or substantial portions of the Software.
29 */
30
31 #include "php.h"
32
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37
38 extern const mbfl_encoding mbfl_encoding_8859_9;
39
ZEND_EXTERN_MODULE_GLOBALS(mbstring)40 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
41
42 static bool prop_lookup(unsigned long code, unsigned long n)
43 {
44 long l = _ucprop_offsets[n];
45 long r = _ucprop_offsets[n + 1] - 1;
46 while (l <= r) {
47 /*
48 * Determine a "mid" point and adjust to make sure the mid point is at
49 * the beginning of a range pair.
50 */
51 long m = (l + r) >> 1;
52 m -= (m & 1);
53 if (code > _ucprop_ranges[m + 1])
54 l = m + 2;
55 else if (code < _ucprop_ranges[m])
56 r = m - 2;
57 else
58 return true;
59 }
60 return false;
61
62 }
63
php_unicode_is_prop1(unsigned long code,int prop)64 MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
65 {
66 return prop_lookup(code, prop);
67 }
68
php_unicode_is_prop(unsigned long code,...)69 MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...)
70 {
71 bool result = false;
72 va_list va;
73 va_start(va, code);
74
75 while (1) {
76 int prop = va_arg(va, int);
77 if (prop < 0) {
78 break;
79 }
80
81 if (prop_lookup(code, prop)) {
82 result = true;
83 break;
84 }
85 }
86
87 va_end(va);
88 return result;
89 }
90
/* Integer hash used by mph_lookup(): `x` is the key, `d` is a seed that is XORed into
 * the key before mixing. Arithmetic is done in `unsigned`, so the multiplication wraps. */
static inline unsigned mph_hash(unsigned d, unsigned x)
{
	const unsigned seeded = x ^ d;

	return (seeded ^ (seeded >> 16)) * 0x45d9f3b;
}
96
/* Value returned by mph_lookup() when `code` has no entry in the table */
#define CODE_NOT_FOUND ((unsigned) -1)

/* Two-level hash lookup of `code`.
 *
 * `g_table` (g_table_size entries) is indexed by the seed-0 hash of `code`. The value
 * found there selects a slot in `table`:
 *   - zero or negative: the negated value is the slot index itself;
 *   - positive: the value is the seed for a second hash, reduced modulo `table_size`.
 * `table` is laid out as (key, value) pairs, so slot i occupies table[2*i] and
 * table[2*i + 1].
 *
 * Returns the value stored for `code`, or CODE_NOT_FOUND if the selected slot holds a
 * different key. */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	const short g = g_table[mph_hash(0, code) % g_table_size];
	const unsigned slot = (g <= 0) ? (unsigned) -g : mph_hash(g, code) % table_size;
	const unsigned *entry = &table[2*slot];

	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
118
/* Look up `code` in the case-mapping tables named after `type`; the macro pastes `type`
 * into the _uccase_<type>_g / _uccase_<type>_table identifiers and their sizes.
 * Evaluates to the mph_lookup() result, i.e. CODE_NOT_FOUND when there is no entry. */
#define CASE_LOOKUP(code, type) \
	mph_lookup((code), \
		_uccase_##type##_g, _uccase_##type##_g_size, \
		_uccase_##type##_table, _uccase_##type##_table_size)
122
php_unicode_toupper_raw(unsigned code,const mbfl_encoding * enc)123 static unsigned php_unicode_toupper_raw(unsigned code, const mbfl_encoding *enc)
124 {
125 /* After the ASCII characters, the first codepoint with an uppercase version
126 * is 0xB5 (MICRO SIGN) */
127 if (code < 0xB5) {
128 /* Fast path for ASCII */
129 if (code >= 0x61 && code <= 0x7A) {
130 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x69)) {
131 return 0x130;
132 }
133 return code - 0x20;
134 }
135 return code;
136 } else {
137 unsigned new_code = CASE_LOOKUP(code, upper);
138 if (new_code != CODE_NOT_FOUND) {
139 return new_code;
140 }
141 return code;
142 }
143 }
144
php_unicode_tolower_raw(unsigned code,const mbfl_encoding * enc)145 static unsigned php_unicode_tolower_raw(unsigned code, const mbfl_encoding *enc)
146 {
147 /* After the ASCII characters, the first codepoint with a lowercase version
148 * is 0xC0 (LATIN CAPITAL LETTER A WITH GRAVE) */
149 if (code < 0xC0) {
150 /* Fast path for ASCII */
151 if (code >= 0x41 && code <= 0x5A) {
152 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x0049L)) {
153 return 0x0131L;
154 }
155 return code + 0x20;
156 }
157 return code;
158 } else {
159 unsigned new_code = CASE_LOOKUP(code, lower);
160 if (new_code != CODE_NOT_FOUND) {
161 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x130)) {
162 return 0x69;
163 }
164 return new_code;
165 }
166 return code;
167 }
168 }
169
php_unicode_totitle_raw(unsigned code,const mbfl_encoding * enc)170 static unsigned php_unicode_totitle_raw(unsigned code, const mbfl_encoding *enc)
171 {
172 unsigned new_code = CASE_LOOKUP(code, title);
173 if (new_code != CODE_NOT_FOUND) {
174 return new_code;
175 }
176
177 /* No dedicated title-case variant, use to-upper instead */
178 return php_unicode_toupper_raw(code, enc);
179 }
180
php_unicode_tofold_raw(unsigned code,const mbfl_encoding * enc)181 static unsigned php_unicode_tofold_raw(unsigned code, const mbfl_encoding *enc)
182 {
183 if (code < 0x80) {
184 /* Fast path for ASCII */
185 if (code >= 0x41 && code <= 0x5A) {
186 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x49)) {
187 return 0x131;
188 }
189 return code + 0x20;
190 }
191 return code;
192 } else {
193 unsigned new_code = CASE_LOOKUP(code, fold);
194 if (new_code != CODE_NOT_FOUND) {
195 if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x130)) {
196 return 0x69;
197 }
198 return new_code;
199 }
200 return code;
201 }
202 }
203
php_unicode_tolower_simple(unsigned code,const mbfl_encoding * enc)204 static inline unsigned php_unicode_tolower_simple(unsigned code, const mbfl_encoding *enc) {
205 code = php_unicode_tolower_raw(code, enc);
206 if (UNEXPECTED(code > 0xffffff)) {
207 return _uccase_extra_table[code & 0xffffff];
208 }
209 return code;
210 }
php_unicode_toupper_simple(unsigned code,const mbfl_encoding * enc)211 static inline unsigned php_unicode_toupper_simple(unsigned code, const mbfl_encoding *enc) {
212 code = php_unicode_toupper_raw(code, enc);
213 if (UNEXPECTED(code > 0xffffff)) {
214 return _uccase_extra_table[code & 0xffffff];
215 }
216 return code;
217 }
php_unicode_totitle_simple(unsigned code,const mbfl_encoding * enc)218 static inline unsigned php_unicode_totitle_simple(unsigned code, const mbfl_encoding *enc) {
219 code = php_unicode_totitle_raw(code, enc);
220 if (UNEXPECTED(code > 0xffffff)) {
221 return _uccase_extra_table[code & 0xffffff];
222 }
223 return code;
224 }
php_unicode_tofold_simple(unsigned code,const mbfl_encoding * enc)225 static inline unsigned php_unicode_tofold_simple(unsigned code, const mbfl_encoding *enc) {
226 code = php_unicode_tofold_raw(code, enc);
227 if (UNEXPECTED(code > 0xffffff)) {
228 return _uccase_extra_table[code & 0xffffff];
229 }
230 return code;
231 }
232
emit_special_casing_sequence(uint32_t w,uint32_t * out)233 static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)
234 {
235 unsigned int len = w >> 24;
236 const unsigned int *p = &_uccase_extra_table[w & 0xFFFFFF];
237 while (len--) {
238 *out++ = *++p;
239 }
240 return out;
241 }
242
243 /* Used when determining whether special casing rules should be applied to Greek letter sigma */
scan_ahead_for_cased_letter(unsigned char * in,size_t in_len,unsigned int state,const mbfl_encoding * encoding)244 static bool scan_ahead_for_cased_letter(unsigned char *in, size_t in_len, unsigned int state, const mbfl_encoding *encoding)
245 {
246 uint32_t wchar_buf[64];
247
248 while (in_len) {
249 size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 64, &state);
250 ZEND_ASSERT(out_len <= 64);
251 for (unsigned int i = 0; i < out_len; i++) {
252 uint32_t w = wchar_buf[i];
253 if (php_unicode_is_cased(w)) {
254 return true;
255 }
256 if (!php_unicode_is_case_ignorable(w)) {
257 return false;
258 }
259 }
260 }
261
262 return false;
263 }
264
/* Used when determining whether special casing rules should be applied to Greek letter sigma
 *
 * Walks backwards over the codepoints in [begin, end), starting with the one just before
 * `end`. Case-ignorable codepoints are skipped, and the first codepoint that is not
 * case-ignorable decides the result.
 * Returns true if a cased codepoint is reached first; false if some other codepoint is
 * reached first, if the range is exhausted, if `end` is NULL, or if `end` is not after
 * `begin`. */
static bool scan_back_for_cased_letter(uint32_t *begin, uint32_t *end)
{
	if (end == NULL) {
		return false;
	}

	/* Compare before decrementing: a pointer before `begin` must never be formed, since
	 * `begin` may be the first element of the buffer and stepping below the start of an
	 * array is undefined behavior */
	while (end > begin) {
		uint32_t w = *--end;
		if (php_unicode_is_cased(w)) {
			return true;
		}
		if (!php_unicode_is_case_ignorable(w)) {
			return false;
		}
	}

	return false;
}
281
php_unicode_convert_case(php_case_mode case_mode,const char * srcstr,size_t in_len,const mbfl_encoding * src_encoding,const mbfl_encoding * dst_encoding,int illegal_mode,uint32_t illegal_substchar)282 MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
283 {
284 /* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased
285 * See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */
286 uint32_t wchar_buf[64], converted_buf[192];
287 unsigned int state = 0, title_mode = 0;
288 unsigned char *in = (unsigned char*)srcstr;
289 /* In rare cases, we need to scan backwards through the previously converted codepoints to see
290 * if special conversion rules should be used for the Greek letter sigma */
291 uint32_t *converted_end = NULL;
292
293 mb_convert_buf buf;
294 mb_convert_buf_init(&buf, in_len + 1, illegal_substchar, illegal_mode);
295
296 while (in_len) {
297 size_t out_len = src_encoding->to_wchar(&in, &in_len, wchar_buf, 64, &state);
298 ZEND_ASSERT(out_len <= 64);
299 uint32_t *p = converted_buf;
300
301 /* In all cases, handle invalid characters early, as we assign special meaning to codepoints > 0xFFFFFF */
302 switch (case_mode) {
303 case PHP_UNICODE_CASE_UPPER_SIMPLE:
304 for (size_t i = 0; i < out_len; i++) {
305 uint32_t w = wchar_buf[i];
306 *p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_toupper_simple(w, src_encoding);
307 }
308 break;
309
310 case PHP_UNICODE_CASE_LOWER_SIMPLE:
311 for (size_t i = 0; i < out_len; i++) {
312 uint32_t w = wchar_buf[i];
313 *p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_tolower_simple(w, src_encoding);
314 }
315 break;
316
317 case PHP_UNICODE_CASE_FOLD_SIMPLE:
318 for (size_t i = 0; i < out_len; i++) {
319 uint32_t w = wchar_buf[i];
320 *p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_tofold_simple(w, src_encoding);
321 }
322 break;
323
324 case PHP_UNICODE_CASE_TITLE_SIMPLE:
325 for (size_t i = 0; i < out_len; i++) {
326 uint32_t w = wchar_buf[i];
327 if (UNEXPECTED(w > 0xFFFFFF)) {
328 *p++ = w;
329 continue;
330 }
331 *p++ = title_mode ? php_unicode_tolower_simple(w, src_encoding) : php_unicode_totitle_simple(w, src_encoding);
332 if (!php_unicode_is_case_ignorable(w)) {
333 title_mode = php_unicode_is_cased(w);
334 }
335 }
336 break;
337
338 case PHP_UNICODE_CASE_UPPER:
339 for (size_t i = 0; i < out_len; i++) {
340 uint32_t w = wchar_buf[i];
341 if (UNEXPECTED(w > 0xFFFFFF)) {
342 *p++ = w;
343 continue;
344 }
345 w = php_unicode_toupper_raw(w, src_encoding);
346 if (UNEXPECTED(w > 0xFFFFFF)) {
347 p = emit_special_casing_sequence(w, p);
348 } else {
349 *p++ = w;
350 }
351 }
352 break;
353
354 case PHP_UNICODE_CASE_LOWER:
355 for (size_t i = 0; i < out_len; i++) {
356 uint32_t w = wchar_buf[i];
357 if (UNEXPECTED(w > 0xFFFFFF)) {
358 *p++ = w;
359 continue;
360 }
361 if (w == 0x3A3) {
362 /* For Greek capital letter sigma, there is a special casing rule;
363 * if it is the last letter in a word, it should be downcased to U+03C2
364 * (GREEK SMALL LETTER FINAL SIGMA)
365 * Specifically, we need to check if this codepoint is preceded by any
366 * number of case-ignorable codepoints, preceded by a cased letter, AND
367 * is NOT followed by any number of case-ignorable codepoints followed
368 * by a cased letter.
369 * Ref: http://www.unicode.org/reports/tr21/tr21-5.html
370 * Ref: https://unicode.org/Public/UNIDATA/SpecialCasing.txt
371 *
372 * While the special casing rules say we should scan backwards through "any number"
373 * of case-ignorable codepoints, that is a great implementation burden
374 * It would basically mean we need to keep all the codepoints in a big buffer
375 * during this conversion operation, but we don't want to do that (to reduce the
376 * amount of temporary scratch memory used)
377 * Hence, we only scan back through the codepoints in wchar_buf, and if we hit the
378 * beginning of the buffer, whatever codepoints have not yet been overwritten in
379 * the latter part of converted_buf */
380 int j = i - 1;
381 while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
382 j--;
383 }
384 if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
385 /* Now scan ahead to look for a cased letter */
386 j = i + 1;
387 while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
388 j++;
389 }
390 /* If we hit the end of wchar_buf, convert more of the input string into
391 * codepoints and continue scanning */
392 if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
393 *p++ = 0x3C2;
394 continue;
395 }
396 }
397 }
398 w = php_unicode_tolower_raw(w, src_encoding);
399 if (UNEXPECTED(w > 0xFFFFFF)) {
400 p = emit_special_casing_sequence(w, p);
401 } else {
402 *p++ = w;
403 }
404 }
405 break;
406
407 case PHP_UNICODE_CASE_FOLD:
408 for (size_t i = 0; i < out_len; i++) {
409 uint32_t w = wchar_buf[i];
410 if (UNEXPECTED(w > 0xFFFFFF)) {
411 *p++ = w;
412 continue;
413 }
414 w = php_unicode_tofold_raw(w, src_encoding);
415 if (UNEXPECTED(w > 0xFFFFFF)) {
416 p = emit_special_casing_sequence(w, p);
417 } else {
418 *p++ = w;
419 }
420 }
421 break;
422
423 case PHP_UNICODE_CASE_TITLE:
424 for (size_t i = 0; i < out_len; i++) {
425 uint32_t w = wchar_buf[i];
426 if (UNEXPECTED(w > 0xFFFFFF)) {
427 *p++ = w;
428 continue;
429 }
430 uint32_t w2;
431 if (title_mode) {
432 if (w == 0x3A3) {
433 int j = i - 1;
434 while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
435 j--;
436 }
437 if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
438 j = i + 1;
439 while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
440 j++;
441 }
442 if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
443 *p++ = 0x3C2;
444 goto set_title_mode;
445 }
446 }
447 }
448 w2 = php_unicode_tolower_raw(w, src_encoding);
449 } else {
450 w2 = php_unicode_totitle_raw(w, src_encoding);
451 }
452 if (UNEXPECTED(w2 > 0xFFFFFF)) {
453 p = emit_special_casing_sequence(w2, p);
454 } else {
455 *p++ = w2;
456 }
457 set_title_mode:
458 if (!php_unicode_is_case_ignorable(w)) {
459 title_mode = php_unicode_is_cased(w);
460 }
461 }
462 break;
463
464 EMPTY_SWITCH_DEFAULT_CASE()
465 }
466
467 converted_end = p;
468 ZEND_ASSERT(p - converted_buf <= 192);
469 dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
470 }
471
472 return mb_convert_buf_result(&buf, dst_encoding);
473 }
474