/* xref: /PHP-8.0/ext/mbstring/php_unicode.c (revision 7eddcabe) */
/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Author: Wez Furlong (wez@thebrainroom.com)                           |
   +----------------------------------------------------------------------+

	Based on code from ucdata-2.5, which has the following Copyright:

	Copyright 2001 Computing Research Labs, New Mexico State University

	Permission is hereby granted, free of charge, to any person obtaining a
	copy of this software and associated documentation files (the "Software"),
	to deal in the Software without restriction, including without limitation
	the rights to use, copy, modify, merge, publish, distribute, sublicense,
	and/or sell copies of the Software, and to permit persons to whom the
	Software is furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in
	all copies or substantial portions of the Software.
*/
30 
31 #include "php.h"
32 
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37 #include "libmbfl/mbfl/mbfilter_wchar.h"
38 
ZEND_EXTERN_MODULE_GLOBALS(mbstring)39 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
40 
41 static int prop_lookup(unsigned long code, unsigned long n)
42 {
43 	long l, r, m;
44 
45 	/*
46 	 * There is an extra node on the end of the offsets to allow this routine
47 	 * to work right.  If the index is 0xffff, then there are no nodes for the
48 	 * property.
49 	 */
50 	if ((l = _ucprop_offsets[n]) == 0xffff)
51 		return 0;
52 
53 	/*
54 	 * Locate the next offset that is not 0xffff.  The sentinel at the end of
55 	 * the array is the max index value.
56 	 */
57 	for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
58 		;
59 
60 	r = _ucprop_offsets[n + m] - 1;
61 
62 	while (l <= r) {
63 		/*
64 		 * Determine a "mid" point and adjust to make sure the mid point is at
65 		 * the beginning of a range pair.
66 		 */
67 		m = (l + r) >> 1;
68 		m -= (m & 1);
69 		if (code > _ucprop_ranges[m + 1])
70 			l = m + 2;
71 		else if (code < _ucprop_ranges[m])
72 			r = m - 2;
73 		else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
74 			return 1;
75 	}
76 	return 0;
77 
78 }
79 
php_unicode_is_prop1(unsigned long code,int prop)80 MBSTRING_API int php_unicode_is_prop1(unsigned long code, int prop)
81 {
82 	return prop_lookup(code, prop);
83 }
84 
php_unicode_is_prop(unsigned long code,...)85 MBSTRING_API int php_unicode_is_prop(unsigned long code, ...)
86 {
87 	int result = 0;
88 	va_list va;
89 	va_start(va, code);
90 
91 	while (1) {
92 		int prop = va_arg(va, int);
93 		if (prop < 0) {
94 			break;
95 		}
96 
97 		if (prop_lookup(code, prop)) {
98 			result = 1;
99 			break;
100 		}
101 	}
102 
103 	va_end(va);
104 	return result;
105 }
106 
/*
 * Hash function used by the perfect-hash case-mapping lookup.
 *
 * Mixes the seed `d` into `x`, folds the upper 16 bits into the lower
 * ones and multiplies by a fixed constant (unsigned wrap-around applies).
 */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	unsigned mixed = x ^ d;

	return (mixed ^ (mixed >> 16)) * 0x45d9f3b;
}
112 
/* Returned by mph_lookup() when the code point has no entry in the table. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Look up `code` in a two-level perfect hash table.
 *
 * g_table / g_table_size: first-level table, indexed by mph_hash(0, code).
 *   An entry <= 0 encodes the slot directly as its negation; a positive
 *   entry is used as the seed for a second hash that selects the slot.
 * table / table_size: slots stored as (key, value) pairs, i.e. slot i
 *   occupies table[2*i] and table[2*i + 1].
 *
 * Returns the value stored for `code`, or CODE_NOT_FOUND when the key in
 * the selected slot differs from `code`.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	short seed = g_table[mph_hash(0, code) % g_table_size];
	unsigned slot = (seed <= 0)
		? (unsigned) -seed
		: mph_hash(seed, code) % table_size;
	const unsigned *entry = &table[2 * slot];

	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
134 
/*
 * Look up `code` in the generated case-mapping tables for `type`
 * (the token is pasted into the _uccase_<type>_* table names).
 * Evaluates to the mph_lookup() result.
 */
#define CASE_LOOKUP(code, type) \
	mph_lookup( \
		code, \
		_uccase_##type##_g, _uccase_##type##_g_size, \
		_uccase_##type##_table, _uccase_##type##_table_size)
138 
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)139 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
140 {
141 	if (code < 0x80) {
142 		/* Fast path for ASCII */
143 		if (code >= 0x61 && code <= 0x7A) {
144 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
145 				return 0x130;
146 			}
147 			return code - 0x20;
148 		}
149 		return code;
150 	} else {
151 		unsigned new_code = CASE_LOOKUP(code, upper);
152 		if (new_code != CODE_NOT_FOUND) {
153 			return new_code;
154 		}
155 		return code;
156 	}
157 }
158 
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)159 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
160 {
161 	if (code < 0x80) {
162 		/* Fast path for ASCII */
163 		if (code >= 0x41 && code <= 0x5A) {
164 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
165 				return 0x0131L;
166 			}
167 			return code + 0x20;
168 		}
169 		return code;
170 	} else {
171 		unsigned new_code = CASE_LOOKUP(code, lower);
172 		if (new_code != CODE_NOT_FOUND) {
173 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
174 				return 0x69;
175 			}
176 			return new_code;
177 		}
178 		return code;
179 	}
180 }
181 
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)182 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
183 {
184 	unsigned new_code = CASE_LOOKUP(code, title);
185 	if (new_code != CODE_NOT_FOUND) {
186 		return new_code;
187 	}
188 
189 	/* No dedicated title-case variant, use to-upper instead */
190 	return php_unicode_toupper_raw(code, enc);
191 }
192 
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)193 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
194 {
195 	if (code < 0x80) {
196 		/* Fast path for ASCII */
197 		if (code >= 0x41 && code <= 0x5A) {
198 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
199 				return 0x131;
200 			}
201 			return code + 0x20;
202 		}
203 		return code;
204 	} else {
205 		unsigned new_code = CASE_LOOKUP(code, fold);
206 		if (new_code != CODE_NOT_FOUND) {
207 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
208 				return 0x69;
209 			}
210 			return new_code;
211 		}
212 		return code;
213 	}
214 }
215 
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)216 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
217 	code = php_unicode_tolower_raw(code, enc);
218 	if (UNEXPECTED(code > 0xffffff)) {
219 		return _uccase_extra_table[code & 0xffffff];
220 	}
221 	return code;
222 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)223 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
224 	code = php_unicode_toupper_raw(code, enc);
225 	if (UNEXPECTED(code > 0xffffff)) {
226 		return _uccase_extra_table[code & 0xffffff];
227 	}
228 	return code;
229 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)230 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
231 	code = php_unicode_totitle_raw(code, enc);
232 	if (UNEXPECTED(code > 0xffffff)) {
233 		return _uccase_extra_table[code & 0xffffff];
234 	}
235 	return code;
236 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)237 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
238 	code = php_unicode_tofold_raw(code, enc);
239 	if (UNEXPECTED(code > 0xffffff)) {
240 		return _uccase_extra_table[code & 0xffffff];
241 	}
242 	return code;
243 }
244 
/*
 * Lower-case `code` and feed the result to `next_filter`.
 *
 * A raw result above 0xffffff is a reference into _uccase_extra_table:
 * the top 8 bits give the number of code points to emit and the low
 * 24 bits the index of the entry that precedes them. Otherwise the
 * single mapped code point is emitted.
 */
static inline void php_unicode_tolower_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned mapped = php_unicode_tolower_raw(code, enc);
	const unsigned *seq;
	unsigned remaining;

	if (UNEXPECTED(mapped > 0xffffff)) {
		seq = &_uccase_extra_table[mapped & 0xffffff];
		for (remaining = mapped >> 24; remaining > 0; remaining--) {
			/* Skip the indexed entry itself; output starts right after it */
			seq++;
			(next_filter->filter_function)(*seq, next_filter);
		}
		return;
	}

	(next_filter->filter_function)(mapped, next_filter);
}
258 
/*
 * Upper-case `code` and feed the result to `next_filter`.
 *
 * A raw result above 0xffffff is a reference into _uccase_extra_table:
 * the top 8 bits give the number of code points to emit and the low
 * 24 bits the index of the entry that precedes them. Otherwise the
 * single mapped code point is emitted.
 */
static inline void php_unicode_toupper_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned mapped = php_unicode_toupper_raw(code, enc);
	const unsigned *seq;
	unsigned remaining;

	if (UNEXPECTED(mapped > 0xffffff)) {
		seq = &_uccase_extra_table[mapped & 0xffffff];
		for (remaining = mapped >> 24; remaining > 0; remaining--) {
			/* Skip the indexed entry itself; output starts right after it */
			seq++;
			(next_filter->filter_function)(*seq, next_filter);
		}
		return;
	}

	(next_filter->filter_function)(mapped, next_filter);
}
272 
/*
 * Title-case `code` and feed the result to `next_filter`.
 *
 * A raw result above 0xffffff is a reference into _uccase_extra_table:
 * the top 8 bits give the number of code points to emit and the low
 * 24 bits the index of the entry that precedes them. Otherwise the
 * single mapped code point is emitted.
 */
static inline void php_unicode_totitle_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned mapped = php_unicode_totitle_raw(code, enc);
	const unsigned *seq;
	unsigned remaining;

	if (UNEXPECTED(mapped > 0xffffff)) {
		seq = &_uccase_extra_table[mapped & 0xffffff];
		for (remaining = mapped >> 24; remaining > 0; remaining--) {
			/* Skip the indexed entry itself; output starts right after it */
			seq++;
			(next_filter->filter_function)(*seq, next_filter);
		}
		return;
	}

	(next_filter->filter_function)(mapped, next_filter);
}
286 
/*
 * Case-fold `code` and feed the result to `next_filter`.
 *
 * A raw result above 0xffffff is a reference into _uccase_extra_table:
 * the top 8 bits give the number of code points to emit and the low
 * 24 bits the index of the entry that precedes them. Otherwise the
 * single mapped code point is emitted.
 */
static inline void php_unicode_tofold_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned mapped = php_unicode_tofold_raw(code, enc);
	const unsigned *seq;
	unsigned remaining;

	if (UNEXPECTED(mapped > 0xffffff)) {
		seq = &_uccase_extra_table[mapped & 0xffffff];
		for (remaining = mapped >> 24; remaining > 0; remaining--) {
			/* Skip the indexed entry itself; output starts right after it */
			seq++;
			(next_filter->filter_function)(*seq, next_filter);
		}
		return;
	}

	(next_filter->filter_function)(mapped, next_filter);
}
300 
301 struct convert_case_data {
302 	mbfl_convert_filter *next_filter;
303 	enum mbfl_no_encoding no_encoding;
304 	int case_mode;
305 	int title_mode;
306 };
307 
convert_case_filter(int c,void * void_data)308 static int convert_case_filter(int c, void *void_data)
309 {
310 	struct convert_case_data *data = (struct convert_case_data *) void_data;
311 	unsigned code;
312 
313 	/* Handle invalid characters early, as we assign special meaning to
314 	 * codepoints above 0xffffff. */
315 	if (UNEXPECTED((unsigned) c > 0xffffff)) {
316 		(*data->next_filter->filter_function)(c, data->next_filter);
317 		return 0;
318 	}
319 
320 	switch (data->case_mode) {
321 		case PHP_UNICODE_CASE_UPPER_SIMPLE:
322 			code = php_unicode_toupper_simple(c, data->no_encoding);
323 			(data->next_filter->filter_function)(code, data->next_filter);
324 			break;
325 
326 		case PHP_UNICODE_CASE_UPPER:
327 			php_unicode_toupper_full(c, data->no_encoding, data->next_filter);
328 			break;
329 
330 		case PHP_UNICODE_CASE_LOWER_SIMPLE:
331 			code = php_unicode_tolower_simple(c, data->no_encoding);
332 			(data->next_filter->filter_function)(code, data->next_filter);
333 			break;
334 
335 		case PHP_UNICODE_CASE_LOWER:
336 			php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
337 			break;
338 
339 		case PHP_UNICODE_CASE_FOLD:
340 			php_unicode_tofold_full(c, data->no_encoding, data->next_filter);
341 			break;
342 
343 		case PHP_UNICODE_CASE_FOLD_SIMPLE:
344 			code = php_unicode_tofold_simple(c, data->no_encoding);
345 			(data->next_filter->filter_function)(code, data->next_filter);
346 			break;
347 
348 		case PHP_UNICODE_CASE_TITLE_SIMPLE:
349 		case PHP_UNICODE_CASE_TITLE:
350 		{
351 			if (data->title_mode) {
352 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
353 					code = php_unicode_tolower_simple(c, data->no_encoding);
354 					(data->next_filter->filter_function)(code, data->next_filter);
355 				} else {
356 					php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
357 				}
358 			} else {
359 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
360 					code = php_unicode_totitle_simple(c, data->no_encoding);
361 					(data->next_filter->filter_function)(code, data->next_filter);
362 				} else {
363 					php_unicode_totitle_full(c, data->no_encoding, data->next_filter);
364 				}
365 			}
366 			if (!php_unicode_is_case_ignorable(c)) {
367 				data->title_mode = php_unicode_is_cased(c);
368 			}
369 			break;
370 		}
371 		EMPTY_SWITCH_DEFAULT_CASE()
372 	}
373 
374 	return 0;
375 }
376 
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)377 MBSTRING_API char *php_unicode_convert_case(
378 		int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
379 		const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
380 {
381 	struct convert_case_data data;
382 	mbfl_convert_filter *from_wchar, *to_wchar;
383 	mbfl_string result, *result_ptr;
384 
385 	mbfl_memory_device device;
386 	mbfl_memory_device_init(&device, srclen + 1, 0);
387 
388 	/* encoding -> wchar filter */
389 	to_wchar = mbfl_convert_filter_new(src_encoding,
390 			&mbfl_encoding_wchar, convert_case_filter, NULL, &data);
391 	if (to_wchar == NULL) {
392 		mbfl_memory_device_clear(&device);
393 		return NULL;
394 	}
395 
396 	/* wchar -> encoding filter */
397 	from_wchar = mbfl_convert_filter_new(
398 			&mbfl_encoding_wchar, src_encoding,
399 			mbfl_memory_device_output, NULL, &device);
400 	if (from_wchar == NULL) {
401 		mbfl_convert_filter_delete(to_wchar);
402 		mbfl_memory_device_clear(&device);
403 		return NULL;
404 	}
405 
406 	to_wchar->illegal_mode = illegal_mode;
407 	to_wchar->illegal_substchar = illegal_substchar;
408 	from_wchar->illegal_mode = illegal_mode;
409 	from_wchar->illegal_substchar = illegal_substchar;
410 
411 	data.next_filter = from_wchar;
412 	data.no_encoding = src_encoding->no_encoding;
413 	data.case_mode = case_mode;
414 	data.title_mode = 0;
415 
416 	{
417 		/* feed data */
418 		const unsigned char *p = (const unsigned char *) srcstr;
419 		size_t n = srclen;
420 		while (n > 0) {
421 			if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
422 				break;
423 			}
424 			n--;
425 		}
426 	}
427 
428 	mbfl_convert_filter_flush(to_wchar);
429 	mbfl_convert_filter_flush(from_wchar);
430 	result_ptr = mbfl_memory_device_result(&device, &result);
431 	mbfl_convert_filter_delete(to_wchar);
432 	mbfl_convert_filter_delete(from_wchar);
433 
434 	if (!result_ptr) {
435 		return NULL;
436 	}
437 
438 	*ret_len = result.len;
439 	return (char *) result.val;
440 }
441