xref: /php-src/ext/mbstring/php_unicode.c (revision a9035863)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Wez Furlong (wez@thebrainroom.com)                           |
14    +----------------------------------------------------------------------+
15 
16 	Based on code from ucdata-2.5, which has the following Copyright:
17 
18 	Copyright 2001 Computing Research Labs, New Mexico State University
19 
20 	Permission is hereby granted, free of charge, to any person obtaining a
21 	copy of this software and associated documentation files (the "Software"),
22 	to deal in the Software without restriction, including without limitation
23 	the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 	and/or sell copies of the Software, and to permit persons to whom the
25 	Software is furnished to do so, subject to the following conditions:
26 
27 	The above copyright notice and this permission notice shall be included in
28 	all copies or substantial portions of the Software.
29 */
30 
31 #include "php.h"
32 
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37 
38 extern const mbfl_encoding mbfl_encoding_8859_9;
39 
ZEND_EXTERN_MODULE_GLOBALS(mbstring)40 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
41 
42 static bool prop_lookup(unsigned long code, unsigned long n)
43 {
44 	long l = _ucprop_offsets[n];
45 	long r = _ucprop_offsets[n + 1] - 1;
46 	while (l <= r) {
47 		/*
48 		 * Determine a "mid" point and adjust to make sure the mid point is at
49 		 * the beginning of a range pair.
50 		 */
51 		long m = (l + r) >> 1;
52 		m -= (m & 1);
53 		if (code > _ucprop_ranges[m + 1])
54 			l = m + 2;
55 		else if (code < _ucprop_ranges[m])
56 			r = m - 2;
57 		else
58 			return true;
59 	}
60 	return false;
61 
62 }
63 
php_unicode_is_prop1(unsigned long code,int prop)64 MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
65 {
66 	return prop_lookup(code, prop);
67 }
68 
php_unicode_is_prop(unsigned long code,...)69 MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...)
70 {
71 	bool result = false;
72 	va_list va;
73 	va_start(va, code);
74 
75 	while (1) {
76 		int prop = va_arg(va, int);
77 		if (prop < 0) {
78 			break;
79 		}
80 
81 		if (prop_lookup(code, prop)) {
82 			result = true;
83 			break;
84 		}
85 	}
86 
87 	va_end(va);
88 	return result;
89 }
90 
/* Hash step used by mph_lookup(): XOR the key `x` with the seed `d`, fold the
 * upper bits onto the lower ones, then multiply by a fixed odd constant. */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	const unsigned seeded = x ^ d;

	return (seeded ^ (seeded >> 16)) * 0x45d9f3b;
}
96 
/* Sentinel returned by mph_lookup() when the key has no entry in the table */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Look up `code` in a two-level hashed table.
 *
 * g_table / g_table_size: first-level table, indexed by the seed-0 hash of
 *                         `code` modulo g_table_size.
 * table / table_size:     second-level table, stored as flat (key, value)
 *                         pairs, so slot i occupies table[2*i] and
 *                         table[2*i + 1].
 *
 * Returns the value stored next to `code`, or CODE_NOT_FOUND when the slot
 * that `code` hashes to holds a different key.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	const short displacement = g_table[mph_hash(0, code) % g_table_size];

	/* A non-positive first-level entry is the negated slot number itself;
	 * a positive one is the seed for a second hash that selects the slot. */
	const unsigned slot = (displacement > 0)
		? mph_hash(displacement, code) % table_size
		: (unsigned) -displacement;

	const unsigned *entry = &table[2 * slot];

	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
118 
/* Expands to an mph_lookup() call for `code` against the _uccase_<type>_g and
 * _uccase_<type>_table arrays (and their *_size companions) of the given
 * mapping type, e.g. upper, lower, title or fold. */
#define CASE_LOOKUP(code, type) \
	mph_lookup((code), \
			_uccase_##type##_g, _uccase_##type##_g_size, \
			_uccase_##type##_table, _uccase_##type##_table_size)
122 
php_unicode_toupper_raw(unsigned code,const mbfl_encoding * enc)123 static unsigned php_unicode_toupper_raw(unsigned code, const mbfl_encoding *enc)
124 {
125 	/* After the ASCII characters, the first codepoint with an uppercase version
126 	 * is 0xB5 (MICRO SIGN) */
127 	if (code < 0xB5) {
128 		/* Fast path for ASCII */
129 		if (code >= 0x61 && code <= 0x7A) {
130 			if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x69)) {
131 				return 0x130;
132 			}
133 			return code - 0x20;
134 		}
135 		return code;
136 	} else {
137 		unsigned new_code = CASE_LOOKUP(code, upper);
138 		if (new_code != CODE_NOT_FOUND) {
139 			return new_code;
140 		}
141 		return code;
142 	}
143 }
144 
php_unicode_tolower_raw(unsigned code,const mbfl_encoding * enc)145 static unsigned php_unicode_tolower_raw(unsigned code, const mbfl_encoding *enc)
146 {
147 	/* After the ASCII characters, the first codepoint with a lowercase version
148 	 * is 0xC0 (LATIN CAPITAL LETTER A WITH GRAVE) */
149 	if (code < 0xC0) {
150 		/* Fast path for ASCII */
151 		if (code >= 0x41 && code <= 0x5A) {
152 			if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x0049L)) {
153 				return 0x0131L;
154 			}
155 			return code + 0x20;
156 		}
157 		return code;
158 	} else {
159 		unsigned new_code = CASE_LOOKUP(code, lower);
160 		if (new_code != CODE_NOT_FOUND) {
161 			if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x130)) {
162 				return 0x69;
163 			}
164 			return new_code;
165 		}
166 		return code;
167 	}
168 }
169 
php_unicode_totitle_raw(unsigned code,const mbfl_encoding * enc)170 static unsigned php_unicode_totitle_raw(unsigned code, const mbfl_encoding *enc)
171 {
172 	unsigned new_code = CASE_LOOKUP(code, title);
173 	if (new_code != CODE_NOT_FOUND) {
174 		return new_code;
175 	}
176 
177 	/* No dedicated title-case variant, use to-upper instead */
178 	return php_unicode_toupper_raw(code, enc);
179 }
180 
php_unicode_tofold_raw(unsigned code,const mbfl_encoding * enc)181 static unsigned php_unicode_tofold_raw(unsigned code, const mbfl_encoding *enc)
182 {
183 	if (code < 0x80) {
184 		/* Fast path for ASCII */
185 		if (code >= 0x41 && code <= 0x5A) {
186 			if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x49)) {
187 				return 0x131;
188 			}
189 			return code + 0x20;
190 		}
191 		return code;
192 	} else {
193 		unsigned new_code = CASE_LOOKUP(code, fold);
194 		if (new_code != CODE_NOT_FOUND) {
195 			if (UNEXPECTED(enc == &mbfl_encoding_8859_9 && code == 0x130)) {
196 				return 0x69;
197 			}
198 			return new_code;
199 		}
200 		return code;
201 	}
202 }
203 
php_unicode_tolower_simple(unsigned code,const mbfl_encoding * enc)204 static inline unsigned php_unicode_tolower_simple(unsigned code, const mbfl_encoding *enc) {
205 	code = php_unicode_tolower_raw(code, enc);
206 	if (UNEXPECTED(code > 0xffffff)) {
207 		return _uccase_extra_table[code & 0xffffff];
208 	}
209 	return code;
210 }
php_unicode_toupper_simple(unsigned code,const mbfl_encoding * enc)211 static inline unsigned php_unicode_toupper_simple(unsigned code, const mbfl_encoding *enc) {
212 	code = php_unicode_toupper_raw(code, enc);
213 	if (UNEXPECTED(code > 0xffffff)) {
214 		return _uccase_extra_table[code & 0xffffff];
215 	}
216 	return code;
217 }
php_unicode_totitle_simple(unsigned code,const mbfl_encoding * enc)218 static inline unsigned php_unicode_totitle_simple(unsigned code, const mbfl_encoding *enc) {
219 	code = php_unicode_totitle_raw(code, enc);
220 	if (UNEXPECTED(code > 0xffffff)) {
221 		return _uccase_extra_table[code & 0xffffff];
222 	}
223 	return code;
224 }
php_unicode_tofold_simple(unsigned code,const mbfl_encoding * enc)225 static inline unsigned php_unicode_tofold_simple(unsigned code, const mbfl_encoding *enc) {
226 	code = php_unicode_tofold_raw(code, enc);
227 	if (UNEXPECTED(code > 0xffffff)) {
228 		return _uccase_extra_table[code & 0xffffff];
229 	}
230 	return code;
231 }
232 
emit_special_casing_sequence(uint32_t w,uint32_t * out)233 static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out)
234 {
235 	unsigned int len = w >> 24;
236 	const unsigned int *p = &_uccase_extra_table[w & 0xFFFFFF];
237 	while (len--) {
238 		*out++ = *++p;
239 	}
240 	return out;
241 }
242 
243 /* Used when determining whether special casing rules should be applied to Greek letter sigma */
scan_ahead_for_cased_letter(unsigned char * in,size_t in_len,unsigned int state,const mbfl_encoding * encoding)244 static bool scan_ahead_for_cased_letter(unsigned char *in, size_t in_len, unsigned int state, const mbfl_encoding *encoding)
245 {
246 	uint32_t wchar_buf[64];
247 
248 	while (in_len) {
249 		size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 64, &state);
250 		ZEND_ASSERT(out_len <= 64);
251 		for (unsigned int i = 0; i < out_len; i++) {
252 			uint32_t w = wchar_buf[i];
253 			if (php_unicode_is_cased(w)) {
254 				return true;
255 			}
256 			if (!php_unicode_is_case_ignorable(w)) {
257 				return false;
258 			}
259 		}
260 	}
261 
262 	return false;
263 }
264 
/*
 * Used when determining whether special casing rules should be applied to
 * Greek letter sigma.
 *
 * Walks the codepoints in [begin, end) from last to first. Returns true if a
 * cased codepoint is reached after skipping only case-ignorable ones; returns
 * false at the first codepoint that is neither, when the range is exhausted
 * or empty (end <= begin), or when `end` is NULL.
 */
static bool scan_back_for_cased_letter(uint32_t *begin, uint32_t *end)
{
	if (end == NULL) {
		return false;
	}

	/* Compare before decrementing: `end` must never be moved below `begin`,
	 * since `begin` may be the first element of the underlying array and a
	 * pointer before the start of an array is undefined behaviour. */
	while (end > begin) {
		uint32_t w = *--end;
		if (php_unicode_is_cased(w)) {
			return true;
		}
		if (!php_unicode_is_case_ignorable(w)) {
			return false;
		}
	}
	return false;
}
281 
php_unicode_convert_case(php_case_mode case_mode,const char * srcstr,size_t in_len,const mbfl_encoding * src_encoding,const mbfl_encoding * dst_encoding,int illegal_mode,uint32_t illegal_substchar)282 MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar)
283 {
284 	/* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased
285 	 * See http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt */
286 	uint32_t wchar_buf[64], converted_buf[192];
287 	unsigned int state = 0, title_mode = 0;
288 	unsigned char *in = (unsigned char*)srcstr;
289 	/* In rare cases, we need to scan backwards through the previously converted codepoints to see
290 	 * if special conversion rules should be used for the Greek letter sigma */
291 	uint32_t *converted_end = NULL;
292 
293 	mb_convert_buf buf;
294 	mb_convert_buf_init(&buf, in_len + 1, illegal_substchar, illegal_mode);
295 
296 	while (in_len) {
297 		size_t out_len = src_encoding->to_wchar(&in, &in_len, wchar_buf, 64, &state);
298 		ZEND_ASSERT(out_len <= 64);
299 		uint32_t *p = converted_buf;
300 
301 		/* In all cases, handle invalid characters early, as we assign special meaning to codepoints > 0xFFFFFF */
302 		switch (case_mode) {
303 		case PHP_UNICODE_CASE_UPPER_SIMPLE:
304 			for (int i = 0; i < out_len; i++) {
305 				uint32_t w = wchar_buf[i];
306 				*p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_toupper_simple(w, src_encoding);
307 			}
308 			break;
309 
310 		case PHP_UNICODE_CASE_LOWER_SIMPLE:
311 			for (int i = 0; i < out_len; i++) {
312 				uint32_t w = wchar_buf[i];
313 				*p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_tolower_simple(w, src_encoding);
314 			}
315 			break;
316 
317 		case PHP_UNICODE_CASE_FOLD_SIMPLE:
318 			for (int i = 0; i < out_len; i++) {
319 				uint32_t w = wchar_buf[i];
320 				*p++ = (UNEXPECTED(w > 0xFFFFFF)) ? w : php_unicode_tofold_simple(w, src_encoding);
321 			}
322 			break;
323 
324 		case PHP_UNICODE_CASE_TITLE_SIMPLE:
325 			for (int i = 0; i < out_len; i++) {
326 				uint32_t w = wchar_buf[i];
327 				if (UNEXPECTED(w > 0xFFFFFF)) {
328 					*p++ = w;
329 					continue;
330 				}
331 				*p++ = title_mode ? php_unicode_tolower_simple(w, src_encoding) : php_unicode_totitle_simple(w, src_encoding);
332 				if (!php_unicode_is_case_ignorable(w)) {
333 					title_mode = php_unicode_is_cased(w);
334 				}
335 			}
336 			break;
337 
338 		case PHP_UNICODE_CASE_UPPER:
339 			for (int i = 0; i < out_len; i++) {
340 				uint32_t w = wchar_buf[i];
341 				if (UNEXPECTED(w > 0xFFFFFF)) {
342 					*p++ = w;
343 					continue;
344 				}
345 				w = php_unicode_toupper_raw(w, src_encoding);
346 				if (UNEXPECTED(w > 0xFFFFFF)) {
347 					p = emit_special_casing_sequence(w, p);
348 				} else {
349 					*p++ = w;
350 				}
351 			}
352 			break;
353 
354 		case PHP_UNICODE_CASE_LOWER:
355 			for (int i = 0; i < out_len; i++) {
356 				uint32_t w = wchar_buf[i];
357 				if (UNEXPECTED(w > 0xFFFFFF)) {
358 					*p++ = w;
359 					continue;
360 				}
361 				if (w == 0x3A3) {
362 					/* For Greek capital letter sigma, there is a special casing rule;
363 					 * if it is the last letter in a word, it should be downcased to U+03C2
364 					 * (GREEK SMALL LETTER FINAL SIGMA)
365 					 * Specifically, we need to check if this codepoint is preceded by any
366 					 * number of case-ignorable codepoints, preceded by a cased letter, AND
367 					 * is NOT followed by any number of case-ignorable codepoints followed
368 					 * by a cased letter.
369 					 * Ref: http://www.unicode.org/reports/tr21/tr21-5.html
370 					 * Ref: https://unicode.org/Public/UNIDATA/SpecialCasing.txt
371 					 *
372 					 * While the special casing rules say we should scan backwards through "any number"
373 					 * of case-ignorable codepoints, that is a great implementation burden
374 					 * It would basically mean we need to keep all the codepoints in a big buffer
375 					 * during this conversion operation, but we don't want to do that (to reduce the
376 					 * amount of temporary scratch memory used)
377 					 * Hence, we only scan back through the codepoints in wchar_buf, and if we hit the
378 					 * beginning of the buffer, whatever codepoints have not yet been overwritten in
379 					 * the latter part of converted_buf */
380 					int j = i - 1;
381 					while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
382 						j--;
383 					}
384 					if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
385 						/* Now scan ahead to look for a cased letter */
386 						j = i + 1;
387 						while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
388 							j++;
389 						}
390 						/* If we hit the end of wchar_buf, convert more of the input string into
391 						 * codepoints and continue scanning */
392 						if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
393 							*p++ = 0x3C2;
394 							continue;
395 						}
396 					}
397 				}
398 				w = php_unicode_tolower_raw(w, src_encoding);
399 				if (UNEXPECTED(w > 0xFFFFFF)) {
400 					p = emit_special_casing_sequence(w, p);
401 				} else {
402 					*p++ = w;
403 				}
404 			}
405 			break;
406 
407 		case PHP_UNICODE_CASE_FOLD:
408 			for (int i = 0; i < out_len; i++) {
409 				uint32_t w = wchar_buf[i];
410 				if (UNEXPECTED(w > 0xFFFFFF)) {
411 					*p++ = w;
412 					continue;
413 				}
414 				w = php_unicode_tofold_raw(w, src_encoding);
415 				if (UNEXPECTED(w > 0xFFFFFF)) {
416 					p = emit_special_casing_sequence(w, p);
417 				} else {
418 					*p++ = w;
419 				}
420 			}
421 			break;
422 
423 		case PHP_UNICODE_CASE_TITLE:
424 			for (int i = 0; i < out_len; i++) {
425 				uint32_t w = wchar_buf[i];
426 				if (UNEXPECTED(w > 0xFFFFFF)) {
427 					*p++ = w;
428 					continue;
429 				}
430 				uint32_t w2;
431 				if (title_mode) {
432 					if (w == 0x3A3) {
433 						int j = i - 1;
434 						while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
435 							j--;
436 						}
437 						if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
438 							j = i + 1;
439 							while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
440 								j++;
441 							}
442 							if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
443 								*p++ = 0x3C2;
444 								goto set_title_mode;
445 							}
446 						}
447 					}
448 					w2 = php_unicode_tolower_raw(w, src_encoding);
449 				} else {
450 					w2 = php_unicode_totitle_raw(w, src_encoding);
451 				}
452 				if (UNEXPECTED(w2 > 0xFFFFFF)) {
453 					p = emit_special_casing_sequence(w2, p);
454 				} else {
455 					*p++ = w2;
456 				}
457 set_title_mode:
458 				if (!php_unicode_is_case_ignorable(w)) {
459 					title_mode = php_unicode_is_cased(w);
460 				}
461 			}
462 			break;
463 
464 			EMPTY_SWITCH_DEFAULT_CASE()
465 		}
466 
467 		converted_end = p;
468 		ZEND_ASSERT(p - converted_buf <= 192);
469 		dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len);
470 	}
471 
472 	return mb_convert_buf_result(&buf, dst_encoding);
473 }
474