xref: /PHP-8.1/ext/mbstring/php_unicode.c (revision d2073179)
/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
   | Author: Wez Furlong (wez@thebrainroom.com)                           |
   +----------------------------------------------------------------------+

	Based on code from ucdata-2.5, which has the following Copyright:

	Copyright 2001 Computing Research Labs, New Mexico State University

	Permission is hereby granted, free of charge, to any person obtaining a
	copy of this software and associated documentation files (the "Software"),
	to deal in the Software without restriction, including without limitation
	the rights to use, copy, modify, merge, publish, distribute, sublicense,
	and/or sell copies of the Software, and to permit persons to whom the
	Software is furnished to do so, subject to the following conditions:

	The above copyright notice and this permission notice shall be included in
	all copies or substantial portions of the Software.
*/
30 
31 #include "php.h"
32 
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37 #include "libmbfl/mbfl/mbfilter_wchar.h"
38 
ZEND_EXTERN_MODULE_GLOBALS(mbstring)39 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
40 
41 static bool prop_lookup(unsigned long code, unsigned long n)
42 {
43 	long l = _ucprop_offsets[n];
44 	long r = _ucprop_offsets[n + 1] - 1;
45 	while (l <= r) {
46 		/*
47 		 * Determine a "mid" point and adjust to make sure the mid point is at
48 		 * the beginning of a range pair.
49 		 */
50 		long m = (l + r) >> 1;
51 		m -= (m & 1);
52 		if (code > _ucprop_ranges[m + 1])
53 			l = m + 2;
54 		else if (code < _ucprop_ranges[m])
55 			r = m - 2;
56 		else
57 			return true;
58 	}
59 	return false;
60 
61 }
62 
php_unicode_is_prop1(unsigned long code,int prop)63 MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
64 {
65 	return prop_lookup(code, prop);
66 }
67 
php_unicode_is_prop(unsigned long code,...)68 MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...)
69 {
70 	bool result = false;
71 	va_list va;
72 	va_start(va, code);
73 
74 	while (1) {
75 		int prop = va_arg(va, int);
76 		if (prop < 0) {
77 			break;
78 		}
79 
80 		if (prop_lookup(code, prop)) {
81 			result = true;
82 			break;
83 		}
84 	}
85 
86 	va_end(va);
87 	return result;
88 }
89 
/*
 * Integer mixing step used by mph_lookup().
 *
 * `d` is a seed/displacement that is XOR-ed into the key `x`; the result is
 * then mixed by folding its upper half into the lower half and multiplying
 * by a fixed odd constant. All arithmetic is unsigned and wraps.
 */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	unsigned seeded = x ^ d;
	return ((seeded >> 16) ^ seeded) * 0x45d9f3b;
}
95 
/* Sentinel returned by mph_lookup() when `code` has no entry in the table. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Look up `code` in a two-level hash table (presumably a minimal perfect
 * hash, going by the mph_ prefix).
 *
 *   g_table / g_table_size  first-level table, selected by mph_hash(0, code).
 *                           A value <= 0 is the negated final slot index; a
 *                           positive value is the seed for a second hash.
 *   table / table_size      second-level table of (key, value) pairs stored
 *                           flat: table[2*i] is the key, table[2*i + 1] the
 *                           value. table_size counts pairs.
 *
 * Both sizes must be non-zero (they are used as modulo divisors).
 *
 * Returns the stored value if the key in the selected slot equals `code`,
 * CODE_NOT_FOUND otherwise.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	short g = g_table[mph_hash(0, code) % g_table_size];
	unsigned idx = (g <= 0) ? (unsigned) -g : mph_hash(g, code) % table_size;
	const unsigned *entry = &table[2 * idx];

	/* The hash always lands on some slot; the key check rejects non-members. */
	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
117 
/*
 * Look up `code` in the generated case-mapping table selected by `type`
 * (this file uses upper, lower, title and fold). Token pasting picks the
 * matching _uccase_<type>_g / _uccase_<type>_table arrays and their sizes.
 * Evaluates to the mapped value, or CODE_NOT_FOUND if there is no mapping.
 */
#define CASE_LOOKUP(code, type) \
	mph_lookup((code), _uccase_##type##_g, _uccase_##type##_g_size, \
			_uccase_##type##_table, _uccase_##type##_table_size)
121 
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)122 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
123 {
124 	if (code < 0x80) {
125 		/* Fast path for ASCII */
126 		if (code >= 0x61 && code <= 0x7A) {
127 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
128 				return 0x130;
129 			}
130 			return code - 0x20;
131 		}
132 		return code;
133 	} else {
134 		unsigned new_code = CASE_LOOKUP(code, upper);
135 		if (new_code != CODE_NOT_FOUND) {
136 			return new_code;
137 		}
138 		return code;
139 	}
140 }
141 
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)142 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
143 {
144 	if (code < 0x80) {
145 		/* Fast path for ASCII */
146 		if (code >= 0x41 && code <= 0x5A) {
147 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
148 				return 0x0131L;
149 			}
150 			return code + 0x20;
151 		}
152 		return code;
153 	} else {
154 		unsigned new_code = CASE_LOOKUP(code, lower);
155 		if (new_code != CODE_NOT_FOUND) {
156 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
157 				return 0x69;
158 			}
159 			return new_code;
160 		}
161 		return code;
162 	}
163 }
164 
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)165 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
166 {
167 	unsigned new_code = CASE_LOOKUP(code, title);
168 	if (new_code != CODE_NOT_FOUND) {
169 		return new_code;
170 	}
171 
172 	/* No dedicated title-case variant, use to-upper instead */
173 	return php_unicode_toupper_raw(code, enc);
174 }
175 
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)176 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
177 {
178 	if (code < 0x80) {
179 		/* Fast path for ASCII */
180 		if (code >= 0x41 && code <= 0x5A) {
181 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
182 				return 0x131;
183 			}
184 			return code + 0x20;
185 		}
186 		return code;
187 	} else {
188 		unsigned new_code = CASE_LOOKUP(code, fold);
189 		if (new_code != CODE_NOT_FOUND) {
190 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
191 				return 0x69;
192 			}
193 			return new_code;
194 		}
195 		return code;
196 	}
197 }
198 
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)199 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
200 	code = php_unicode_tolower_raw(code, enc);
201 	if (UNEXPECTED(code > 0xffffff)) {
202 		return _uccase_extra_table[code & 0xffffff];
203 	}
204 	return code;
205 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)206 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
207 	code = php_unicode_toupper_raw(code, enc);
208 	if (UNEXPECTED(code > 0xffffff)) {
209 		return _uccase_extra_table[code & 0xffffff];
210 	}
211 	return code;
212 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)213 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
214 	code = php_unicode_totitle_raw(code, enc);
215 	if (UNEXPECTED(code > 0xffffff)) {
216 		return _uccase_extra_table[code & 0xffffff];
217 	}
218 	return code;
219 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)220 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
221 	code = php_unicode_tofold_raw(code, enc);
222 	if (UNEXPECTED(code > 0xffffff)) {
223 		return _uccase_extra_table[code & 0xffffff];
224 	}
225 	return code;
226 }
227 
/*
 * Full lower-casing: one input code point may produce several output code
 * points, each of which is passed to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes an expansion: the top 8 bits hold the
 * number of code points to emit and the low 24 bits index
 * _uccase_extra_table. The element at the index itself is skipped (the
 * *_simple variant returns that one); the following `count` elements are
 * emitted in order. Any other raw value is emitted as a single code point.
 */
static inline void php_unicode_tolower_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_tolower_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		unsigned count = raw >> 24;
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned i;

		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
241 
/*
 * Full upper-casing: one input code point may produce several output code
 * points, each of which is passed to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes an expansion: the top 8 bits hold the
 * number of code points to emit and the low 24 bits index
 * _uccase_extra_table. The element at the index itself is skipped (the
 * *_simple variant returns that one); the following `count` elements are
 * emitted in order. Any other raw value is emitted as a single code point.
 */
static inline void php_unicode_toupper_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_toupper_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		unsigned count = raw >> 24;
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned i;

		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
255 
/*
 * Full title-casing: one input code point may produce several output code
 * points, each of which is passed to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes an expansion: the top 8 bits hold the
 * number of code points to emit and the low 24 bits index
 * _uccase_extra_table. The element at the index itself is skipped (the
 * *_simple variant returns that one); the following `count` elements are
 * emitted in order. Any other raw value is emitted as a single code point.
 */
static inline void php_unicode_totitle_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_totitle_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		unsigned count = raw >> 24;
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned i;

		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
269 
/*
 * Full case folding: one input code point may produce several output code
 * points, each of which is passed to `next_filter`.
 *
 * A raw mapping above 0xffffff encodes an expansion: the top 8 bits hold the
 * number of code points to emit and the low 24 bits index
 * _uccase_extra_table. The element at the index itself is skipped (the
 * *_simple variant returns that one); the following `count` elements are
 * emitted in order. Any other raw value is emitted as a single code point.
 */
static inline void php_unicode_tofold_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	unsigned raw = php_unicode_tofold_raw(code, enc);

	if (UNEXPECTED(raw > 0xffffff)) {
		unsigned count = raw >> 24;
		const unsigned *seq = &_uccase_extra_table[raw & 0xffffff];
		unsigned i;

		for (i = 1; i <= count; i++) {
			(next_filter->filter_function)(seq[i], next_filter);
		}
		return;
	}

	(next_filter->filter_function)(raw, next_filter);
}
283 
284 struct convert_case_data {
285 	mbfl_convert_filter *next_filter;
286 	enum mbfl_no_encoding no_encoding;
287 	int case_mode;
288 	int title_mode;
289 };
290 
convert_case_filter(int c,void * void_data)291 static int convert_case_filter(int c, void *void_data)
292 {
293 	struct convert_case_data *data = (struct convert_case_data *) void_data;
294 	unsigned code;
295 
296 	/* Handle invalid characters early, as we assign special meaning to
297 	 * codepoints above 0xffffff. */
298 	if (UNEXPECTED((unsigned) c > 0xffffff)) {
299 		(*data->next_filter->filter_function)(c, data->next_filter);
300 		return 0;
301 	}
302 
303 	switch (data->case_mode) {
304 		case PHP_UNICODE_CASE_UPPER_SIMPLE:
305 			code = php_unicode_toupper_simple(c, data->no_encoding);
306 			(data->next_filter->filter_function)(code, data->next_filter);
307 			break;
308 
309 		case PHP_UNICODE_CASE_UPPER:
310 			php_unicode_toupper_full(c, data->no_encoding, data->next_filter);
311 			break;
312 
313 		case PHP_UNICODE_CASE_LOWER_SIMPLE:
314 			code = php_unicode_tolower_simple(c, data->no_encoding);
315 			(data->next_filter->filter_function)(code, data->next_filter);
316 			break;
317 
318 		case PHP_UNICODE_CASE_LOWER:
319 			php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
320 			break;
321 
322 		case PHP_UNICODE_CASE_FOLD:
323 			php_unicode_tofold_full(c, data->no_encoding, data->next_filter);
324 			break;
325 
326 		case PHP_UNICODE_CASE_FOLD_SIMPLE:
327 			code = php_unicode_tofold_simple(c, data->no_encoding);
328 			(data->next_filter->filter_function)(code, data->next_filter);
329 			break;
330 
331 		case PHP_UNICODE_CASE_TITLE_SIMPLE:
332 		case PHP_UNICODE_CASE_TITLE:
333 		{
334 			if (data->title_mode) {
335 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
336 					code = php_unicode_tolower_simple(c, data->no_encoding);
337 					(data->next_filter->filter_function)(code, data->next_filter);
338 				} else {
339 					php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
340 				}
341 			} else {
342 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
343 					code = php_unicode_totitle_simple(c, data->no_encoding);
344 					(data->next_filter->filter_function)(code, data->next_filter);
345 				} else {
346 					php_unicode_totitle_full(c, data->no_encoding, data->next_filter);
347 				}
348 			}
349 			if (!php_unicode_is_case_ignorable(c)) {
350 				data->title_mode = php_unicode_is_cased(c);
351 			}
352 			break;
353 		}
354 		EMPTY_SWITCH_DEFAULT_CASE()
355 	}
356 
357 	return 0;
358 }
359 
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)360 MBSTRING_API char *php_unicode_convert_case(
361 		int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
362 		const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
363 {
364 	struct convert_case_data data;
365 	mbfl_convert_filter *from_wchar, *to_wchar;
366 	mbfl_string result, *result_ptr;
367 
368 	mbfl_memory_device device;
369 	mbfl_memory_device_init(&device, srclen + 1, 0);
370 
371 	/* encoding -> wchar filter */
372 	to_wchar = mbfl_convert_filter_new(src_encoding,
373 			&mbfl_encoding_wchar, convert_case_filter, NULL, &data);
374 	if (to_wchar == NULL) {
375 		mbfl_memory_device_clear(&device);
376 		return NULL;
377 	}
378 
379 	/* wchar -> encoding filter */
380 	from_wchar = mbfl_convert_filter_new(
381 			&mbfl_encoding_wchar, src_encoding,
382 			mbfl_memory_device_output, NULL, &device);
383 	if (from_wchar == NULL) {
384 		mbfl_convert_filter_delete(to_wchar);
385 		mbfl_memory_device_clear(&device);
386 		return NULL;
387 	}
388 
389 	to_wchar->illegal_mode = illegal_mode;
390 	to_wchar->illegal_substchar = illegal_substchar;
391 	from_wchar->illegal_mode = illegal_mode;
392 	from_wchar->illegal_substchar = illegal_substchar;
393 
394 	data.next_filter = from_wchar;
395 	data.no_encoding = src_encoding->no_encoding;
396 	data.case_mode = case_mode;
397 	data.title_mode = 0;
398 
399 	{
400 		/* feed data */
401 		const unsigned char *p = (const unsigned char *) srcstr;
402 		size_t n = srclen;
403 		while (n > 0) {
404 			if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
405 				break;
406 			}
407 			n--;
408 		}
409 	}
410 
411 	mbfl_convert_filter_flush(to_wchar);
412 	mbfl_convert_filter_flush(from_wchar);
413 	result_ptr = mbfl_memory_device_result(&device, &result);
414 	mbfl_convert_filter_delete(to_wchar);
415 	mbfl_convert_filter_delete(from_wchar);
416 
417 	if (!result_ptr) {
418 		return NULL;
419 	}
420 
421 	*ret_len = result.len;
422 	return (char *) result.val;
423 }
424