xref: /PHP-8.2/ext/mbstring/php_unicode.c (revision 4e51810f)
1 /*
2    +----------------------------------------------------------------------+
3    | Copyright (c) The PHP Group                                          |
4    +----------------------------------------------------------------------+
5    | This source file is subject to version 3.01 of the PHP license,      |
6    | that is bundled with this package in the file LICENSE, and is        |
7    | available through the world-wide-web at the following url:           |
8    | https://www.php.net/license/3_01.txt                                 |
9    | If you did not receive a copy of the PHP license and are unable to   |
10    | obtain it through the world-wide-web, please send a note to          |
11    | license@php.net so we can mail you a copy immediately.               |
12    +----------------------------------------------------------------------+
13    | Author: Wez Furlong (wez@thebrainroom.com)                           |
14    +----------------------------------------------------------------------+
15 
16 	Based on code from ucdata-2.5, which has the following Copyright:
17 
18 	Copyright 2001 Computing Research Labs, New Mexico State University
19 
20 	Permission is hereby granted, free of charge, to any person obtaining a
21 	copy of this software and associated documentation files (the "Software"),
22 	to deal in the Software without restriction, including without limitation
23 	the rights to use, copy, modify, merge, publish, distribute, sublicense,
24 	and/or sell copies of the Software, and to permit persons to whom the
25 	Software is furnished to do so, subject to the following conditions:
26 
27 	The above copyright notice and this permission notice shall be included in
28 	all copies or substantial portions of the Software.
29 */
30 
31 #include "php.h"
32 
33 /* include case folding data generated from the official UnicodeData.txt file */
34 #include "mbstring.h"
35 #include "php_unicode.h"
36 #include "unicode_data.h"
37 #include "libmbfl/mbfl/mbfilter_wchar.h"
38 
ZEND_EXTERN_MODULE_GLOBALS(mbstring)39 ZEND_EXTERN_MODULE_GLOBALS(mbstring)
40 
41 static bool prop_lookup(unsigned long code, unsigned long n)
42 {
43 	long l = _ucprop_offsets[n];
44 	long r = _ucprop_offsets[n + 1] - 1;
45 	while (l <= r) {
46 		/*
47 		 * Determine a "mid" point and adjust to make sure the mid point is at
48 		 * the beginning of a range pair.
49 		 */
50 		long m = (l + r) >> 1;
51 		m -= (m & 1);
52 		if (code > _ucprop_ranges[m + 1])
53 			l = m + 2;
54 		else if (code < _ucprop_ranges[m])
55 			r = m - 2;
56 		else
57 			return true;
58 	}
59 	return false;
60 
61 }
62 
php_unicode_is_prop1(unsigned long code,int prop)63 MBSTRING_API bool php_unicode_is_prop1(unsigned long code, int prop)
64 {
65 	return prop_lookup(code, prop);
66 }
67 
php_unicode_is_prop(unsigned long code,...)68 MBSTRING_API bool php_unicode_is_prop(unsigned long code, ...)
69 {
70 	bool result = false;
71 	va_list va;
72 	va_start(va, code);
73 
74 	while (1) {
75 		int prop = va_arg(va, int);
76 		if (prop < 0) {
77 			break;
78 		}
79 
80 		if (prop_lookup(code, prop)) {
81 			result = true;
82 			break;
83 		}
84 	}
85 
86 	va_end(va);
87 	return result;
88 }
89 
/*
 * Integer mixing function used by mph_lookup().
 *
 * `d` is a seed that is XORed into the key `x`; the result then has its upper
 * half folded into the lower half and is multiplied by a fixed odd constant.
 * All arithmetic is unsigned and therefore wraps.
 */
static inline unsigned mph_hash(unsigned d, unsigned x) {
	unsigned mixed = d ^ x;

	mixed ^= mixed >> 16;
	return mixed * 0x45d9f3b;
}
95 
/* Sentinel returned by mph_lookup() when the key has no entry in the table. */
#define CODE_NOT_FOUND ((unsigned) -1)

/*
 * Two-level hash lookup of `code`.
 *
 * g_table / g_table_size: first-level table, indexed by mph_hash(0, code).
 *   A value <= 0 is the negated slot number to use directly; a positive
 *   value is the seed for a second mph_hash() round that selects the slot.
 * table / table_size: second-level table laid out as (key, value) pairs, so
 *   slot i occupies table[2*i] and table[2*i + 1].
 *
 * Returns the value stored for `code`, or CODE_NOT_FOUND when the selected
 * slot holds a different key.
 */
static inline unsigned mph_lookup(
		unsigned code,
		const short *g_table, unsigned g_table_size,
		const unsigned *table, unsigned table_size)
{
	short g = g_table[mph_hash(0, code) % g_table_size];
	unsigned slot = (g > 0) ? mph_hash(g, code) % table_size : (unsigned) -g;
	const unsigned *entry = &table[2 * slot];

	return (entry[0] == code) ? entry[1] : CODE_NOT_FOUND;
}
117 
/*
 * Look `code` up in the case-mapping table selected by `type`.
 *
 * `type` is pasted into the identifiers _uccase_<type>_g, _uccase_<type>_g_size,
 * _uccase_<type>_table and _uccase_<type>_table_size; this file uses it with
 * upper, lower, title and fold. The expansion is a plain mph_lookup() call, so
 * it evaluates to the stored mapping or to CODE_NOT_FOUND.
 */
#define CASE_LOOKUP(code, type) \
	mph_lookup(code, _uccase_##type##_g, _uccase_##type##_g_size, \
			_uccase_##type##_table, _uccase_##type##_table_size)
121 
php_unicode_toupper_raw(unsigned code,enum mbfl_no_encoding enc)122 static unsigned php_unicode_toupper_raw(unsigned code, enum mbfl_no_encoding enc)
123 {
124 	/* After the ASCII characters, the first codepoint with an uppercase version
125 	 * is 0xB5 (MICRO SIGN) */
126 	if (code < 0xB5) {
127 		/* Fast path for ASCII */
128 		if (code >= 0x61 && code <= 0x7A) {
129 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x69)) {
130 				return 0x130;
131 			}
132 			return code - 0x20;
133 		}
134 		return code;
135 	} else {
136 		unsigned new_code = CASE_LOOKUP(code, upper);
137 		if (new_code != CODE_NOT_FOUND) {
138 			return new_code;
139 		}
140 		return code;
141 	}
142 }
143 
php_unicode_tolower_raw(unsigned code,enum mbfl_no_encoding enc)144 static unsigned php_unicode_tolower_raw(unsigned code, enum mbfl_no_encoding enc)
145 {
146 	/* After the ASCII characters, the first codepoint with a lowercase version
147 	 * is 0xC0 (LATIN CAPITAL LETTER A WITH GRAVE) */
148 	if (code < 0xC0) {
149 		/* Fast path for ASCII */
150 		if (code >= 0x41 && code <= 0x5A) {
151 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x0049L)) {
152 				return 0x0131L;
153 			}
154 			return code + 0x20;
155 		}
156 		return code;
157 	} else {
158 		unsigned new_code = CASE_LOOKUP(code, lower);
159 		if (new_code != CODE_NOT_FOUND) {
160 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
161 				return 0x69;
162 			}
163 			return new_code;
164 		}
165 		return code;
166 	}
167 }
168 
php_unicode_totitle_raw(unsigned code,enum mbfl_no_encoding enc)169 static unsigned php_unicode_totitle_raw(unsigned code, enum mbfl_no_encoding enc)
170 {
171 	unsigned new_code = CASE_LOOKUP(code, title);
172 	if (new_code != CODE_NOT_FOUND) {
173 		return new_code;
174 	}
175 
176 	/* No dedicated title-case variant, use to-upper instead */
177 	return php_unicode_toupper_raw(code, enc);
178 }
179 
php_unicode_tofold_raw(unsigned code,enum mbfl_no_encoding enc)180 unsigned php_unicode_tofold_raw(unsigned code, enum mbfl_no_encoding enc)
181 {
182 	if (code < 0x80) {
183 		/* Fast path for ASCII */
184 		if (code >= 0x41 && code <= 0x5A) {
185 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x49)) {
186 				return 0x131;
187 			}
188 			return code + 0x20;
189 		}
190 		return code;
191 	} else {
192 		unsigned new_code = CASE_LOOKUP(code, fold);
193 		if (new_code != CODE_NOT_FOUND) {
194 			if (UNEXPECTED(enc == mbfl_no_encoding_8859_9 && code == 0x130)) {
195 				return 0x69;
196 			}
197 			return new_code;
198 		}
199 		return code;
200 	}
201 }
202 
php_unicode_tolower_simple(unsigned code,enum mbfl_no_encoding enc)203 static inline unsigned php_unicode_tolower_simple(unsigned code, enum mbfl_no_encoding enc) {
204 	code = php_unicode_tolower_raw(code, enc);
205 	if (UNEXPECTED(code > 0xffffff)) {
206 		return _uccase_extra_table[code & 0xffffff];
207 	}
208 	return code;
209 }
php_unicode_toupper_simple(unsigned code,enum mbfl_no_encoding enc)210 static inline unsigned php_unicode_toupper_simple(unsigned code, enum mbfl_no_encoding enc) {
211 	code = php_unicode_toupper_raw(code, enc);
212 	if (UNEXPECTED(code > 0xffffff)) {
213 		return _uccase_extra_table[code & 0xffffff];
214 	}
215 	return code;
216 }
php_unicode_totitle_simple(unsigned code,enum mbfl_no_encoding enc)217 static inline unsigned php_unicode_totitle_simple(unsigned code, enum mbfl_no_encoding enc) {
218 	code = php_unicode_totitle_raw(code, enc);
219 	if (UNEXPECTED(code > 0xffffff)) {
220 		return _uccase_extra_table[code & 0xffffff];
221 	}
222 	return code;
223 }
php_unicode_tofold_simple(unsigned code,enum mbfl_no_encoding enc)224 static inline unsigned php_unicode_tofold_simple(unsigned code, enum mbfl_no_encoding enc) {
225 	code = php_unicode_tofold_raw(code, enc);
226 	if (UNEXPECTED(code > 0xffffff)) {
227 		return _uccase_extra_table[code & 0xffffff];
228 	}
229 	return code;
230 }
231 
/*
 * Full lower-casing: the result may be several codepoints, each of which is
 * pushed into `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits are the
 * number of codepoints and the low 24 bits index _uccase_extra_table. The
 * sequence starts one element AFTER that index (the element at the index
 * itself is the one the *_simple variant returns).
 */
static inline void php_unicode_tolower_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	const unsigned *seq;
	unsigned count, i;
	unsigned raw = php_unicode_tolower_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[raw & 0xffffff];
	for (i = 1; i <= count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
245 
/*
 * Full upper-casing: the result may be several codepoints, each of which is
 * pushed into `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits are the
 * number of codepoints and the low 24 bits index _uccase_extra_table. The
 * sequence starts one element AFTER that index (the element at the index
 * itself is the one the *_simple variant returns).
 */
static inline void php_unicode_toupper_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	const unsigned *seq;
	unsigned count, i;
	unsigned raw = php_unicode_toupper_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[raw & 0xffffff];
	for (i = 1; i <= count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
259 
/*
 * Full title-casing: the result may be several codepoints, each of which is
 * pushed into `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits are the
 * number of codepoints and the low 24 bits index _uccase_extra_table. The
 * sequence starts one element AFTER that index (the element at the index
 * itself is the one the *_simple variant returns).
 */
static inline void php_unicode_totitle_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	const unsigned *seq;
	unsigned count, i;
	unsigned raw = php_unicode_totitle_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[raw & 0xffffff];
	for (i = 1; i <= count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
273 
/*
 * Full case folding: the result may be several codepoints, each of which is
 * pushed into `next_filter`.
 *
 * A raw mapping above 0xffffff encodes a sequence: the top 8 bits are the
 * number of codepoints and the low 24 bits index _uccase_extra_table. The
 * sequence starts one element AFTER that index (the element at the index
 * itself is the one the *_simple variant returns).
 */
static inline void php_unicode_tofold_full(unsigned code, enum mbfl_no_encoding enc,
	mbfl_convert_filter* next_filter) {
	const unsigned *seq;
	unsigned count, i;
	unsigned raw = php_unicode_tofold_raw(code, enc);

	if (!UNEXPECTED(raw > 0xffffff)) {
		(next_filter->filter_function)(raw, next_filter);
		return;
	}

	count = raw >> 24;
	seq = &_uccase_extra_table[raw & 0xffffff];
	for (i = 1; i <= count; i++) {
		(next_filter->filter_function)(seq[i], next_filter);
	}
}
287 
/* State handed to convert_case_filter() through its void* data argument. */
struct convert_case_data {
	/* Filter that receives the case-converted codepoints. */
	mbfl_convert_filter *next_filter;
	/* Encoding id of the source string; forwarded to the case-mapping
	 * helpers, which special-case ISO-8859-9. */
	enum mbfl_no_encoding no_encoding;
	/* One of the PHP_UNICODE_CASE_* constants; selects the conversion. */
	int case_mode;
	/* Title-casing state: zero means the next character is title-cased,
	 * non-zero means it is lower-cased. Updated by convert_case_filter(). */
	int title_mode;
};
294 
convert_case_filter(int c,void * void_data)295 static int convert_case_filter(int c, void *void_data)
296 {
297 	struct convert_case_data *data = (struct convert_case_data *) void_data;
298 	unsigned code;
299 
300 	/* Handle invalid characters early, as we assign special meaning to
301 	 * codepoints above 0xffffff. */
302 	if (UNEXPECTED((unsigned) c > 0xffffff)) {
303 		(*data->next_filter->filter_function)(c, data->next_filter);
304 		return 0;
305 	}
306 
307 	switch (data->case_mode) {
308 		case PHP_UNICODE_CASE_UPPER_SIMPLE:
309 			code = php_unicode_toupper_simple(c, data->no_encoding);
310 			(data->next_filter->filter_function)(code, data->next_filter);
311 			break;
312 
313 		case PHP_UNICODE_CASE_UPPER:
314 			php_unicode_toupper_full(c, data->no_encoding, data->next_filter);
315 			break;
316 
317 		case PHP_UNICODE_CASE_LOWER_SIMPLE:
318 			code = php_unicode_tolower_simple(c, data->no_encoding);
319 			(data->next_filter->filter_function)(code, data->next_filter);
320 			break;
321 
322 		case PHP_UNICODE_CASE_LOWER:
323 			php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
324 			break;
325 
326 		case PHP_UNICODE_CASE_FOLD:
327 			php_unicode_tofold_full(c, data->no_encoding, data->next_filter);
328 			break;
329 
330 		case PHP_UNICODE_CASE_FOLD_SIMPLE:
331 			code = php_unicode_tofold_simple(c, data->no_encoding);
332 			(data->next_filter->filter_function)(code, data->next_filter);
333 			break;
334 
335 		case PHP_UNICODE_CASE_TITLE_SIMPLE:
336 		case PHP_UNICODE_CASE_TITLE:
337 		{
338 			if (data->title_mode) {
339 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
340 					code = php_unicode_tolower_simple(c, data->no_encoding);
341 					(data->next_filter->filter_function)(code, data->next_filter);
342 				} else {
343 					php_unicode_tolower_full(c, data->no_encoding, data->next_filter);
344 				}
345 			} else {
346 				if (data->case_mode == PHP_UNICODE_CASE_TITLE_SIMPLE) {
347 					code = php_unicode_totitle_simple(c, data->no_encoding);
348 					(data->next_filter->filter_function)(code, data->next_filter);
349 				} else {
350 					php_unicode_totitle_full(c, data->no_encoding, data->next_filter);
351 				}
352 			}
353 			if (!php_unicode_is_case_ignorable(c)) {
354 				data->title_mode = php_unicode_is_cased(c);
355 			}
356 			break;
357 		}
358 		EMPTY_SWITCH_DEFAULT_CASE()
359 	}
360 
361 	return 0;
362 }
363 
php_unicode_convert_case(int case_mode,const char * srcstr,size_t srclen,size_t * ret_len,const mbfl_encoding * src_encoding,int illegal_mode,int illegal_substchar)364 MBSTRING_API char *php_unicode_convert_case(
365 		int case_mode, const char *srcstr, size_t srclen, size_t *ret_len,
366 		const mbfl_encoding *src_encoding, int illegal_mode, int illegal_substchar)
367 {
368 	struct convert_case_data data;
369 	mbfl_convert_filter *from_wchar, *to_wchar;
370 	mbfl_string result;
371 
372 	mbfl_memory_device device;
373 	mbfl_memory_device_init(&device, srclen + 1, 0);
374 
375 	/* encoding -> wchar filter */
376 	to_wchar = mbfl_convert_filter_new(src_encoding,
377 			&mbfl_encoding_wchar, convert_case_filter, NULL, &data);
378 	if (to_wchar == NULL) {
379 		mbfl_memory_device_clear(&device);
380 		return NULL;
381 	}
382 
383 	/* wchar -> encoding filter */
384 	from_wchar = mbfl_convert_filter_new(
385 			&mbfl_encoding_wchar, src_encoding,
386 			mbfl_memory_device_output, NULL, &device);
387 	if (from_wchar == NULL) {
388 		mbfl_convert_filter_delete(to_wchar);
389 		mbfl_memory_device_clear(&device);
390 		return NULL;
391 	}
392 
393 	to_wchar->illegal_mode = illegal_mode;
394 	to_wchar->illegal_substchar = illegal_substchar;
395 	from_wchar->illegal_mode = illegal_mode;
396 	from_wchar->illegal_substchar = illegal_substchar;
397 
398 	data.next_filter = from_wchar;
399 	data.no_encoding = src_encoding->no_encoding;
400 	data.case_mode = case_mode;
401 	data.title_mode = 0;
402 
403 	{
404 		/* feed data */
405 		const unsigned char *p = (const unsigned char *) srcstr;
406 		size_t n = srclen;
407 		while (n > 0) {
408 			if ((*to_wchar->filter_function)(*p++, to_wchar) < 0) {
409 				break;
410 			}
411 			n--;
412 		}
413 	}
414 
415 	mbfl_convert_filter_flush(to_wchar);
416 	mbfl_convert_filter_flush(from_wchar);
417 	mbfl_memory_device_result(&device, &result);
418 	mbfl_convert_filter_delete(to_wchar);
419 	mbfl_convert_filter_delete(from_wchar);
420 
421 	*ret_len = result.len;
422 	return (char *) result.val;
423 }
424