xref: /PHP-7.4/ext/mbstring/php_unicode.c (revision 1fdffd1c)
1 /*
2    +----------------------------------------------------------------------+
3    | PHP Version 7                                                        |
4    +----------------------------------------------------------------------+
5    | Copyright (c) The PHP Group                                          |
6    +----------------------------------------------------------------------+
7    | This source file is subject to version 3.01 of the PHP license,      |
8    | that is bundled with this package in the file LICENSE, and is        |
9    | available through the world-wide-web at the following url:           |
10    | http://www.php.net/license/3_01.txt                                  |
11    | If you did not receive a copy of the PHP license and are unable to   |
12    | obtain it through the world-wide-web, please send a note to          |
13    | license@php.net so we can mail you a copy immediately.               |
14    +----------------------------------------------------------------------+
15    | Author: Wez Furlong (wez@thebrainroom.com)                           |
16    +----------------------------------------------------------------------+
17 
18 	Based on code from ucdata-2.5, which has the following Copyright:
19 
20 	Copyright 2001 Computing Research Labs, New Mexico State University
21 
22 	Permission is hereby granted, free of charge, to any person obtaining a
23 	copy of this software and associated documentation files (the "Software"),
24 	to deal in the Software without restriction, including without limitation
25 	the rights to use, copy, modify, merge, publish, distribute, sublicense,
26 	and/or sell copies of the Software, and to permit persons to whom the
27 	Software is furnished to do so, subject to the following conditions:
28 
29 	The above copyright notice and this permission notice shall be included in
30 	all copies or substantial portions of the Software.
31 */
32 
33 #ifdef HAVE_CONFIG_H
34 #include "config.h"
35 #endif
36 
37 #include "php.h"
38 #include "php_ini.h"
39 
40 #if HAVE_MBSTRING
41 
42 /* include case folding data generated from the official UnicodeData.txt file */
43 #include "mbstring.h"
44 #include "php_unicode.h"
45 #include "unicode_data.h"
46 #include "libmbfl/mbfl/mbfilter_wchar.h"
47 
ZEND_EXTERN_MODULE_GLOBALS(mbstring)48 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
49 
50 static int prop_lookup(unsigned long code, unsigned long n)
51 {
52 	long l, r, m;
53 
54 	/*
55 	 * There is an extra node on the end of the offsets to allow this routine
56 	 * to work right.  If the index is 0xffff, then there are no nodes for the
57 	 * property.
58 	 */
59 	if ((l = _ucprop_offsets[n]) == 0xffff)
60 		return 0;
61 
62 	/*
63 	 * Locate the next offset that is not 0xffff.  The sentinel at the end of
64 	 * the array is the max index value.
65 	 */
66 	for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
67 		;
68 
69 	r = _ucprop_offsets[n + m] - 1;
70 
71 	while (l <= r) {
72 		/*
73 		 * Determine a "mid" point and adjust to make sure the mid point is at
74 		 * the beginning of a range pair.
75 		 */
76 		m = (l + r) >> 1;
77 		m -= (m & 1);
78 		if (code > _ucprop_ranges[m + 1])
79 			l = m + 2;
80 		else if (code < _ucprop_ranges[m])
81 			r = m - 2;
82 		else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
83 			return 1;
84 	}
85 	return 0;
86 
87 }
88 
php_unicode_is_prop1(unsigned long code,int prop)89 MBSTRING_API int php_unicode_is_prop1(unsigned long code, int prop)
90 {
91 	return prop_lookup(code, prop);
92 }
93 
php_unicode_is_prop(unsigned long code,...)94 MBSTRING_API int php_unicode_is_prop(unsigned long code, ...)
95 {
96 	int result = 0;
97 	va_list va;
98 	va_start(va, code);
99 
100 	while (1) {
101 		int prop = va_arg(va, int);
102 		if (prop < 0) {
103 			break;
104 		}
105 
106 		if (prop_lookup(code, prop)) {
107 			result = 1;
108 			break;
109 		}
110 	}
111 
112 	va_end(va);
113 	return result;
114 }
115 
/* Hash function used by the case-mapping lookup tables: mixes the key `x`
 * with the seed/displacement `d`, folds the high half into the low half and
 * multiplies by a fixed odd constant (unsigned wrap-around is intended). */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	unsigned mixed = x ^ d;

	mixed ^= mixed >> 16;
	return mixed * 0x45d9f3b;
}
121 
/* Returned by mph_lookup() when the table holds no entry for a code point. */
#define CODE_NOT_FOUND ((unsigned) -1)

/* Look up `code` in a two-level hash table.
 *
 * g_table / g_table_size: first-level table, indexed by mph_hash(0, code).
 *   A value <= 0 encodes the slot directly (as its negation); a positive
 *   value is used as the seed of a second mph_hash() round.
 * table / table_size: second-level table of (key, value) pairs, so slot i
 *   occupies table[2*i] and table[2*i + 1].
 *
 * Returns the value stored for `code`, or CODE_NOT_FOUND if the selected
 * slot holds a different key. */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	const short displacement = g_table[mph_hash(0, code) % g_table_size];
	const unsigned slot = (displacement > 0)
		? mph_hash(displacement, code) % table_size
		: (unsigned) -displacement;
	const unsigned *entry = &table[2 * slot];

	/* The hash always yields some slot; verify that the key really matches. */
	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
143 
/* Look up `code` in the case-mapping tables for `type`; the type name is
 * token-pasted into the _uccase_<type>_g / _uccase_<type>_table identifiers.
 * Evaluates to the mapped value, or CODE_NOT_FOUND if there is no entry. */
#define CASE_LOOKUP(code, type) \
	mph_lookup( \
			code, \
			_uccase_##type##_g, _uccase_##type##_g_size, \
			_uccase_##type##_table, _uccase_##type##_table_size)
147 
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)148 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
149 {
150 	if (code < 0x80) {
151 		/* Fast path for ASCII */
152 		if (code >= 0x61 && code <= 0x7A) {
153 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
154 				return 0x130;
155 			}
156 			return code - 0x20;
157 		}
158 		return code;
159 	} else {
160 		unsigned new_code = CASE_LOOKUP(code, upper);
161 		if (new_code != CODE_NOT_FOUND) {
162 			return new_code;
163 		}
164 		return code;
165 	}
166 }
167 
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)168 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
169 {
170 	if (code < 0x80) {
171 		/* Fast path for ASCII */
172 		if (code >= 0x41 && code <= 0x5A) {
173 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
174 				return 0x0131L;
175 			}
176 			return code + 0x20;
177 		}
178 		return code;
179 	} else {
180 		unsigned new_code = CASE_LOOKUP(code, lower);
181 		if (new_code != CODE_NOT_FOUND) {
182 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
183 				return 0x69;
184 			}
185 			return new_code;
186 		}
187 		return code;
188 	}
189 }
190 
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)191 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
192 {
193 	unsigned new_code = CASE_LOOKUP(code, title);
194 	if (new_code != CODE_NOT_FOUND) {
195 		return new_code;
196 	}
197 
198 	/* No dedicated title-case variant, use to-upper instead */
199 	return php_unicode_toupper_raw(code, enc);
200 }
201 
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)202 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
203 {
204 	if (code < 0x80) {
205 		/* Fast path for ASCII */
206 		if (code >= 0x41 && code <= 0x5A) {
207 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
208 				return 0x131;
209 			}
210 			return code + 0x20;
211 		}
212 		return code;
213 	} else {
214 		unsigned new_code = CASE_LOOKUP(code, fold);
215 		if (new_code != CODE_NOT_FOUND) {
216 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
217 				return 0x69;
218 			}
219 			return new_code;
220 		}
221 		return code;
222 	}
223 }
224 
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)225 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
226 	code = php_unicode_tolower_raw(code, enc);
227 	if (UNEXPECTED(code > 0xffffff)) {
228 		return _uccase_extra_table[code & 0xffffff];
229 	}
230 	return code;
231 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)232 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
233 	code = php_unicode_toupper_raw(code, enc);
234 	if (UNEXPECTED(code > 0xffffff)) {
235 		return _uccase_extra_table[code & 0xffffff];
236 	}
237 	return code;
238 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)239 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
240 	code = php_unicode_totitle_raw(code, enc);
241 	if (UNEXPECTED(code > 0xffffff)) {
242 		return _uccase_extra_table[code & 0xffffff];
243 	}
244 	return code;
245 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)246 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
247 	code = php_unicode_tofold_raw(code, enc);
248 	if (UNEXPECTED(code > 0xffffff)) {
249 		return _uccase_extra_table[code & 0xffffff];
250 	}
251 	return code;
252 }
253 
php_unicode_tolower_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)254 static inline unsigned php_unicode_tolower_full(
255 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
256 	code = php_unicode_tolower_raw(code, enc);
257 	if (UNEXPECTED(code > 0xffffff)) {
258 		unsigned len = code >> 24;
259 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
260 		memcpy(out, p + 1, len * sizeof(unsigned));
261 		return len;
262 	}
263 	*out = code;
264 	return 1;
265 }
php_unicode_toupper_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)266 static inline unsigned php_unicode_toupper_full(
267 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
268 	code = php_unicode_toupper_raw(code, enc);
269 	if (UNEXPECTED(code > 0xffffff)) {
270 		unsigned len = code >> 24;
271 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
272 		memcpy(out, p + 1, len * sizeof(unsigned));
273 		return len;
274 	}
275 	*out = code;
276 	return 1;
277 }
php_unicode_totitle_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)278 static inline unsigned php_unicode_totitle_full(
279 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
280 	code = php_unicode_totitle_raw(code, enc);
281 	if (UNEXPECTED(code > 0xffffff)) {
282 		unsigned len = code >> 24;
283 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
284 		memcpy(out, p + 1, len * sizeof(unsigned));
285 		return len;
286 	}
287 	*out = code;
288 	return 1;
289 }
php_unicode_tofold_full(unsigned code,enum mbfl_no_encoding enc,unsigned * out)290 static inline unsigned php_unicode_tofold_full(
291 		unsigned code, enum mbfl_no_encoding enc, unsigned *out) {
292 	code = php_unicode_tofold_raw(code, enc);
293 	if (UNEXPECTED(code > 0xffffff)) {
294 		unsigned len = code >> 24;
295 		const unsigned *p = &_uccase_extra_table[code & 0xffffff];
296 		memcpy(out, p + 1, len * sizeof(unsigned));
297 		return len;
298 	}
299 	*out = code;
300 	return 1;
301 }
302 
303 struct convert_case_data {
304 	mbfl_convert_filter *next_filter;
305 	enum mbfl_no_encoding no_encoding;
306 	int case_mode;
307 	int title_mode;
308 };
309 
convert_case_filter(int c,void * void_data)310 static int convert_case_filter(int c, void *void_data)
311 {
312 	struct convert_case_data *data = (struct convert_case_data *) void_data;
313 	unsigned out[3];
314 	unsigned len, i;
315 
316 	/* Handle invalid characters early, as we assign special meaning to
317 	 * codepoints above 0xffffff. */
318 	if (UNEXPECTED((unsigned) c > 0xffffff)) {
319 		(*data->next_filter->filter_function)(c, data->next_filter);
320 		return 0;
321 	}
322 
323 	switch (data->case_mode) {
324 		case PHP_UNICODE_CASE_UPPER_SIMPLE:
325 			out[0] = php_unicode_toupper_simple(c, data->no_encoding);
326 			len = 1;
327 			break;
328 
329 		case PHP_UNICODE_CASE_UPPER:
330 			len = php_unicode_toupper_full(c, data->no_encoding, out);
331 			break;
332 
333 		case PHP_UNICODE_CASE_LOWER_SIMPLE:
334 			out[0] = php_unicode_tolower_simple(c, data->no_encoding);
335 			len = 1;
336 			break;
337 
338 		case PHP_UNICODE_CASE_LOWER:
339 			len = php_unicode_tolower_full(c, data->no_encoding, out);
340 			break;
341 
342 		case PHP_UNICODE_CASE_FOLD:
343 			len = php_unicode_tofold_full(c, data->no_encoding, out);
344 			break;
345 
346 		case PHP_UNICODE_CASE_FOLD_SIMPLE:
347 			out[0] = php_unicode_tofold_simple(c, data->no_encoding);
348 			len = 1;
349 			break;
350 
351 		case PHP_UNICODE_CASE_TITLE_SIMPLE:
352 		case PHP_UNICODE_CASE_TITLE:
353 		{
354 			if (data->title_mode) {
355 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
356 					out[0] = php_unicode_tolower_simple(c, data->no_encoding);
357 					len = 1;
358 				} else {
359 					len = php_unicode_tolower_full(c, data->no_encoding, out);
360 				}
361 			} else {
362 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
363 					out[0] = php_unicode_totitle_simple(c, data->no_encoding);
364 					len = 1;
365 				} else {
366 					len = php_unicode_totitle_full(c, data->no_encoding, out);
367 				}
368 			}
369 			if (!php_unicode_is_case_ignorable(c)) {
370 				data->title_mode = php_unicode_is_cased(c);
371 			}
372 			break;
373 		}
374 		EMPTY_SWITCH_DEFAULT_CASE()
375 	}
376 
377 	for (i = 0; i < len; i++) {
378 		(*data->next_filter->filter_function)(out[i], data->next_filter);
379 	}
380 	return 0;
381 }
382 
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)383 MBSTRING_API char *php_unicode_convert_case(
384 		int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
385 		const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
386 {
387 	struct convert_case_data data;
388 	mbfl_convert_filter *from_wchar, *to_wchar;
389 	mbfl_string result, *result_ptr;
390 
391 	mbfl_memory_device device;
392 	mbfl_memory_device_init(&device, srclen + 1, 0);
393 
394 	/* encoding -> wchar filter */
395 	to_wchar = mbfl_convert_filter_new(src_encoding,
396 			&mbfl_encoding_wchar, convert_case_filter, NULL, &data);
397 	if (to_wchar == NULL) {
398 		mbfl_memory_device_clear(&device);
399 		return NULL;
400 	}
401 
402 	/* wchar -> encoding filter */
403 	from_wchar = mbfl_convert_filter_new(
404 			&mbfl_encoding_wchar, src_encoding,
405 			mbfl_memory_device_output, NULL, &device);
406 	if (from_wchar == NULL) {
407 		mbfl_convert_filter_delete(to_wchar);
408 		mbfl_memory_device_clear(&device);
409 		return NULL;
410 	}
411 
412 	to_wchar->illegal_mode = illegal_mode;
413 	to_wchar->illegal_substchar = illegal_substchar;
414 	from_wchar->illegal_mode = illegal_mode;
415 	from_wchar->illegal_substchar = illegal_substchar;
416 
417 	data.next_filter = from_wchar;
418 	data.no_encoding = src_encoding->no_encoding;
419 	data.case_mode = case_mode;
420 	data.title_mode = 0;
421 
422 	{
423 		/* feed data */
424 		const unsigned char *p = (const unsigned char *) srcstr;
425 		size_t n = srclen;
426 		while (n > 0) {
427 			if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
428 				break;
429 			}
430 			n--;
431 		}
432 	}
433 
434 	mbfl_convert_filter_flush(to_wchar);
435 	mbfl_convert_filter_flush(from_wchar);
436 	result_ptr = mbfl_memory_device_result(&device, &result);
437 	mbfl_convert_filter_delete(to_wchar);
438 	mbfl_convert_filter_delete(from_wchar);
439 
440 	if (!result_ptr) {
441 		return NULL;
442 	}
443 
444 	*ret_len = result.len;
445 	return (char *) result.val;
446 }
447 
448 
449 #endif /* HAVE_MBSTRING */
450