xref: /php-src/ext/intl/collator/collator_sort.c (revision 5c5727a5)
1 /*
2    +----------------------------------------------------------------------+
3    | This source file is subject to version 3.01 of the PHP license,      |
4    | that is bundled with this package in the file LICENSE, and is        |
5    | available through the world-wide-web at the following url:           |
6    | https://www.php.net/license/3_01.txt                                 |
7    | If you did not receive a copy of the PHP license and are unable to   |
8    | obtain it through the world-wide-web, please send a note to          |
9    | license@php.net so we can mail you a copy immediately.               |
10    +----------------------------------------------------------------------+
11    | Authors: Vadim Savchuk <vsavchuk@productengine.com>                  |
12    |          Dmitry Lakhtyuk <dlakhtyuk@productengine.com>               |
13    +----------------------------------------------------------------------+
14  */
15 
16 #ifdef HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "php_intl.h"
21 #include "collator.h"
22 #include "collator_class.h"
23 #include "collator_sort.h"
24 #include "collator_convert.h"
25 #include "intl_convert.h"
26 
27 #if !defined(HAVE_PTRDIFF_T) && !defined(_PTRDIFF_T_DEFINED)
28 typedef zend_long ptrdiff_t;
29 #endif
30 
31 /**
32  * Declare 'index' which will point to sort key in sort key
33  * buffer.
34  */
35 typedef struct _collator_sort_key_index {
36 	char* key;       /* pointer to sort key */
37 	zval* zstr;     /* pointer to original string(hash-item) */
38 } collator_sort_key_index_t;
39 
40 ZEND_EXTERN_MODULE_GLOBALS( intl )
41 
42 static const size_t DEF_SORT_KEYS_BUF_SIZE = 1048576;
43 static const size_t DEF_SORT_KEYS_BUF_INCREMENT = 1048576;
44 
45 static const size_t DEF_SORT_KEYS_INDX_BUF_SIZE = 1048576;
46 static const size_t DEF_SORT_KEYS_INDX_BUF_INCREMENT = 1048576;
47 
48 static const size_t DEF_UTF16_BUF_SIZE = 1024;
49 
50 /* {{{ collator_regular_compare_function */
collator_regular_compare_function(zval * result,zval * op1,zval * op2)51 static int collator_regular_compare_function(zval *result, zval *op1, zval *op2)
52 {
53 	int rc = SUCCESS;
54 	zval str1, str2;
55 	zval num1, num2;
56 	zval norm1, norm2;
57 	zval *num1_p = NULL, *num2_p = NULL;
58 	zval *norm1_p = NULL, *norm2_p = NULL;
59 	zval *str1_p, *str2_p;
60 
61 	ZVAL_NULL(&str1);
62 	str1_p  = collator_convert_object_to_string( op1, &str1 );
63 	ZVAL_NULL(&str2);
64 	str2_p  = collator_convert_object_to_string( op2, &str2 );
65 
66 	/* If both args are strings AND either of args is not numeric string
67 	 * then use ICU-compare. Otherwise PHP-compare. */
68 	if( Z_TYPE_P(str1_p) == IS_STRING && Z_TYPE_P(str2_p) == IS_STRING &&
69 		( str1_p == ( num1_p = collator_convert_string_to_number_if_possible( str1_p, &num1 ) ) ||
70 		  str2_p == ( num2_p = collator_convert_string_to_number_if_possible( str2_p, &num2 ) ) ) )
71 	{
72 		/* Compare the strings using ICU. */
73 		ZEND_ASSERT(INTL_G(current_collator) != NULL);
74 		ZVAL_LONG(result, ucol_strcoll(
75 					INTL_G(current_collator),
76 					INTL_ZSTR_VAL(Z_STR_P(str1_p)), INTL_ZSTR_LEN(Z_STR_P(str1_p)),
77 					INTL_ZSTR_VAL(Z_STR_P(str2_p)), INTL_ZSTR_LEN(Z_STR_P(str2_p)) ));
78 	}
79 	else
80 	{
81 		/* num1 is set if str1 and str2 are strings. */
82 		if( num1_p )
83 		{
84 			if( num1_p == str1_p )
85 			{
86 				/* str1 is string but not numeric string
87 				 * just convert it to utf8.
88 				 */
89 				norm1_p = collator_convert_zstr_utf16_to_utf8( str1_p, &norm1 );
90 
91 				/* num2 is not set but str2 is string => do normalization. */
92 				norm2_p = collator_normalize_sort_argument( str2_p, &norm2 );
93 			}
94 			else
95 			{
96 				/* str1 is numeric strings => passthru to PHP-compare. */
97 				Z_TRY_ADDREF_P(num1_p);
98 				norm1_p = num1_p;
99 
100 				/* str2 is numeric strings => passthru to PHP-compare. */
101 				Z_TRY_ADDREF_P(num2_p);
102 				norm2_p = num2_p;
103 			}
104 		}
105 		else
106 		{
107 			/* num1 is not set if str1 or str2 is not a string => do normalization. */
108 			norm1_p = collator_normalize_sort_argument( str1_p, &norm1 );
109 
110 			/* if num1 is not set then num2 is not set as well => do normalization. */
111 			norm2_p = collator_normalize_sort_argument( str2_p, &norm2 );
112 		}
113 
114 		rc = compare_function( result, norm1_p, norm2_p );
115 
116 		zval_ptr_dtor( norm1_p );
117 		zval_ptr_dtor( norm2_p );
118 	}
119 
120 	if( num1_p )
121 		zval_ptr_dtor( num1_p );
122 
123 	if( num2_p )
124 		zval_ptr_dtor( num2_p );
125 
126 	zval_ptr_dtor( str1_p );
127 	zval_ptr_dtor( str2_p );
128 
129 	return rc;
130 }
131 /* }}} */
132 
133 /* {{{ collator_numeric_compare_function
134  * Convert input args to double and compare it.
135  */
collator_numeric_compare_function(zval * result,zval * op1,zval * op2)136 static int collator_numeric_compare_function(zval *result, zval *op1, zval *op2)
137 {
138 	zval num1, num2;
139 	zval *num1_p = NULL;
140 	zval *num2_p = NULL;
141 
142 	if( Z_TYPE_P(op1) == IS_STRING )
143 	{
144 		num1_p = collator_convert_string_to_double( op1, &num1 );
145 		op1 = num1_p;
146 	}
147 
148 	if( Z_TYPE_P(op2) == IS_STRING )
149 	{
150 		num2_p = collator_convert_string_to_double( op2, &num2 );
151 		op2 = num2_p;
152 	}
153 
154 	ZVAL_LONG(result, numeric_compare_function(op1, op2));
155 
156 	if( num1_p )
157 		zval_ptr_dtor( num1_p );
158 	if( num2_p )
159 		zval_ptr_dtor( num2_p );
160 
161 	return SUCCESS;
162 }
163 /* }}} */
164 
165 /* {{{ collator_icu_compare_function
166  * Direct use of ucol_strcoll.
167 */
collator_icu_compare_function(zval * result,zval * op1,zval * op2)168 static int collator_icu_compare_function(zval *result, zval *op1, zval *op2)
169 {
170 	int rc = SUCCESS;
171 	zend_string *str1 = collator_zval_to_string(op1);
172 	zend_string *str2 = collator_zval_to_string(op2);
173 
174 	/* Compare the strings using ICU. */
175 	ZEND_ASSERT(INTL_G(current_collator) != NULL);
176 	ZVAL_LONG(result, ucol_strcoll(
177 				INTL_G(current_collator),
178 				INTL_ZSTR_VAL(str1), INTL_ZSTR_LEN(str1),
179 				INTL_ZSTR_VAL(str2), INTL_ZSTR_LEN(str2) ));
180 
181 	zend_string_release(str1);
182 	zend_string_release(str2);
183 
184 	return rc;
185 }
186 /* }}} */
187 
188 /* {{{ collator_compare_func
189  * Taken from PHP7 source (array_data_compare).
190  */
collator_compare_func(Bucket * f,Bucket * s)191 static int collator_compare_func(Bucket *f, Bucket *s)
192 {
193 	zval result;
194 	zval *first = &f->val;
195 	zval *second = &s->val;
196 
197 	if( INTL_G(compare_func)( &result, first, second) == FAILURE )
198 		return 0;
199 
200 	if( Z_TYPE(result) == IS_DOUBLE )
201 	{
202 		if( Z_DVAL(result) < 0 )
203 			return -1;
204 		else if( Z_DVAL(result) > 0 )
205 			return 1;
206 		else
207 			return 0;
208 	}
209 
210 	convert_to_long(&result);
211 
212 	if( Z_LVAL(result) < 0 )
213 		return -1;
214 	else if( Z_LVAL(result) > 0 )
215 		return 1;
216 
217 	return 0;
218 }
219 /* }}} */
220 
221 /* {{{ Compare sort keys */
collator_cmp_sort_keys(const void * p1,const void * p2)222 static int collator_cmp_sort_keys( const void *p1, const void *p2 )
223 {
224 	char* key1 = ((collator_sort_key_index_t*)p1)->key;
225 	char* key2 = ((collator_sort_key_index_t*)p2)->key;
226 
227 	return strcmp( key1, key2 );
228 }
229 /* }}} */
230 
231 /* {{{ Choose compare function according to sort flags. */
collator_get_compare_function(const zend_long sort_flags)232 static collator_compare_func_t collator_get_compare_function( const zend_long sort_flags )
233 {
234 	collator_compare_func_t func;
235 
236 	switch( sort_flags )
237 	{
238 		case COLLATOR_SORT_NUMERIC:
239 			func = collator_numeric_compare_function;
240 			break;
241 
242 		case COLLATOR_SORT_STRING:
243 			func = collator_icu_compare_function;
244 			break;
245 
246 		case COLLATOR_SORT_REGULAR:
247 		default:
248 			func = collator_regular_compare_function;
249 			break;
250 	}
251 
252 	return func;
253 }
254 /* }}} */
255 
256 /* {{{ Common code shared by collator_sort() and collator_asort() API functions. */
collator_sort_internal(int renumber,INTERNAL_FUNCTION_PARAMETERS)257 static void collator_sort_internal( int renumber, INTERNAL_FUNCTION_PARAMETERS )
258 {
259 	UCollator*     saved_collator;
260 	zval*          array            = NULL;
261 	HashTable*     hash             = NULL;
262 	zend_long           sort_flags       = COLLATOR_SORT_REGULAR;
263 
264 	COLLATOR_METHOD_INIT_VARS
265 
266 	/* Parse parameters. */
267 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "Oa/|l",
268 		&object, Collator_ce_ptr, &array, &sort_flags ) == FAILURE )
269 	{
270 		RETURN_THROWS();
271 	}
272 
273 	/* Fetch the object. */
274 	COLLATOR_METHOD_FETCH_OBJECT;
275 
276 	if (!co->ucoll) {
277 		zend_throw_error(NULL, "Object not initialized");
278 		RETURN_THROWS();
279 	}
280 
281 	/* Set 'compare function' according to sort flags. */
282 	INTL_G(compare_func) = collator_get_compare_function( sort_flags );
283 
284 	hash = Z_ARRVAL_P( array );
285 
286 	/* Convert strings in the specified array from UTF-8 to UTF-16. */
287 	collator_convert_hash_from_utf8_to_utf16( hash, COLLATOR_ERROR_CODE_P( co ) );
288 	COLLATOR_CHECK_STATUS( co, "Error converting hash from UTF-8 to UTF-16" );
289 
290 	/* Save specified collator in the request-global (?) variable. */
291 	saved_collator = INTL_G( current_collator );
292 	INTL_G( current_collator ) = co->ucoll;
293 
294 	/* Sort specified array. */
295 	zend_hash_sort(hash, collator_compare_func, renumber);
296 
297 	/* Restore saved collator. */
298 	INTL_G( current_collator ) = saved_collator;
299 
300 	/* Convert strings in the specified array back to UTF-8. */
301 	collator_convert_hash_from_utf16_to_utf8( hash, COLLATOR_ERROR_CODE_P( co ) );
302 	COLLATOR_CHECK_STATUS( co, "Error converting hash from UTF-16 to UTF-8" );
303 
304 	RETURN_TRUE;
305 }
306 /* }}} */
307 
308 /* {{{ Sort array using specified collator. */
PHP_FUNCTION(collator_sort)309 PHP_FUNCTION( collator_sort )
310 {
311 	collator_sort_internal( true, INTERNAL_FUNCTION_PARAM_PASSTHRU );
312 }
313 /* }}} */
314 
collator_sortkey_swap(collator_sort_key_index_t * p,collator_sort_key_index_t * q)315 static void collator_sortkey_swap(collator_sort_key_index_t *p, collator_sort_key_index_t *q) /* {{{ */
316 {
317 	collator_sort_key_index_t t;
318 	t = *p;
319 	*p = *q;
320 	*q = t;
321 }
322 /* }}} */
323 
324 /* {{{ Equivalent to standard PHP sort using Collator.
325  * Uses ICU ucol_getSortKey for performance.
326  */
PHP_FUNCTION(collator_sort_with_sort_keys)327 PHP_FUNCTION( collator_sort_with_sort_keys )
328 {
329 	zval*       array                = NULL;
330 	zval        garbage;
331 	HashTable*  hash                 = NULL;
332 	zval*       hashData             = NULL;                     /* currently processed item of input hash */
333 
334 	char*       sortKeyBuf           = NULL;                     /* buffer to store sort keys */
335 	uint32_t    sortKeyBufSize       = DEF_SORT_KEYS_BUF_SIZE;   /* buffer size */
336 	ptrdiff_t   sortKeyBufOffset     = 0;                        /* pos in buffer to store sort key */
337 	uint32_t    sortKeyLen           = 0;                        /* the length of currently processing key */
338 	uint32_t    bufLeft              = 0;
339 	uint32_t    bufIncrement         = 0;
340 
341 	collator_sort_key_index_t* sortKeyIndxBuf = NULL;            /* buffer to store 'indexes' which will be passed to 'qsort' */
342 	uint32_t    sortKeyIndxBufSize   = DEF_SORT_KEYS_INDX_BUF_SIZE;
343 	uint32_t    sortKeyIndxSize      = sizeof( collator_sort_key_index_t );
344 
345 	uint32_t    sortKeyCount         = 0;
346 	uint32_t    j                    = 0;
347 
348 	UChar*      utf16_buf            = NULL;                     /* tmp buffer to hold current processing string in utf-16 */
349 	int         utf16_buf_size       = DEF_UTF16_BUF_SIZE;       /* the length of utf16_buf */
350 	int         utf16_len            = 0;                        /* length of converted string */
351 
352 	COLLATOR_METHOD_INIT_VARS
353 
354 	/* Parse parameters. */
355 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "Oa",
356 		&object, Collator_ce_ptr, &array ) == FAILURE )
357 	{
358 		RETURN_THROWS();
359 	}
360 
361 	/* Fetch the object. */
362 	COLLATOR_METHOD_FETCH_OBJECT;
363 
364 	if (!co || !co->ucoll) {
365 		intl_error_set_code( NULL, COLLATOR_ERROR_CODE( co ) );
366 		intl_errors_set_custom_msg( COLLATOR_ERROR_P( co ),
367 			"Object not initialized", 0 );
368 		zend_throw_error(NULL, "Object not initialized");
369 
370 		RETURN_THROWS();
371 	}
372 
373 	/*
374 	 * Sort specified array.
375 	 */
376 	hash = Z_ARRVAL_P( array );
377 
378 	if( !hash || zend_hash_num_elements( hash ) == 0 )
379 		RETURN_TRUE;
380 
381 	/* Create bufers */
382 	sortKeyBuf     = ecalloc( sortKeyBufSize,     sizeof( char    ) );
383 	sortKeyIndxBuf = ecalloc( sortKeyIndxBufSize, sizeof( uint8_t ) );
384 	utf16_buf      = eumalloc( utf16_buf_size );
385 
386 	/* Iterate through input hash and create a sort key for each value. */
387 	ZEND_HASH_FOREACH_VAL(hash, hashData) {
388 		/* Convert current hash item from UTF-8 to UTF-16LE and save the result to utf16_buf. */
389 
390 		utf16_len = utf16_buf_size;
391 
392 		/* Process string values only. */
393 		if( Z_TYPE_P( hashData ) == IS_STRING )
394 		{
395 			intl_convert_utf8_to_utf16( &utf16_buf, &utf16_len, Z_STRVAL_P( hashData ), Z_STRLEN_P( hashData ), COLLATOR_ERROR_CODE_P( co ) );
396 
397 			if( U_FAILURE( COLLATOR_ERROR_CODE( co ) ) )
398 			{
399 				intl_error_set_code( NULL, COLLATOR_ERROR_CODE( co ) );
400 				intl_errors_set_custom_msg( COLLATOR_ERROR_P( co ), "Sort with sort keys failed", 0 );
401 
402 				if( utf16_buf )
403 					efree( utf16_buf );
404 
405 				efree( sortKeyIndxBuf );
406 				efree( sortKeyBuf );
407 
408 				RETURN_FALSE;
409 			}
410 		}
411 		else
412 		{
413 			/* Set empty string */
414 			utf16_len = 0;
415 			utf16_buf[utf16_len] = 0;
416 		}
417 
418 		if( (utf16_len + 1) > utf16_buf_size )
419 			utf16_buf_size = utf16_len + 1;
420 
421 		/* Get sort key, reallocating the buffer if needed. */
422 		bufLeft = sortKeyBufSize - sortKeyBufOffset;
423 
424 		sortKeyLen = ucol_getSortKey( co->ucoll,
425 									  utf16_buf,
426 									  utf16_len,
427 									  (uint8_t*)sortKeyBuf + sortKeyBufOffset,
428 									  bufLeft );
429 
430 		/* check for sortKeyBuf overflow, increasing its size of the buffer if needed */
431 		if( sortKeyLen > bufLeft )
432 		{
433 			bufIncrement = ( sortKeyLen > DEF_SORT_KEYS_BUF_INCREMENT ) ? sortKeyLen : DEF_SORT_KEYS_BUF_INCREMENT;
434 
435 			sortKeyBufSize += bufIncrement;
436 			bufLeft += bufIncrement;
437 
438 			sortKeyBuf = erealloc( sortKeyBuf, sortKeyBufSize );
439 
440 			sortKeyLen = ucol_getSortKey( co->ucoll, utf16_buf, utf16_len, (uint8_t*)sortKeyBuf + sortKeyBufOffset, bufLeft );
441 		}
442 
443 		/*  check sortKeyIndxBuf overflow, increasing its size of the buffer if needed */
444 		if( ( sortKeyCount + 1 ) * sortKeyIndxSize > sortKeyIndxBufSize )
445 		{
446 			bufIncrement = ( sortKeyIndxSize > DEF_SORT_KEYS_INDX_BUF_INCREMENT ) ? sortKeyIndxSize : DEF_SORT_KEYS_INDX_BUF_INCREMENT;
447 
448 			sortKeyIndxBufSize += bufIncrement;
449 
450 			sortKeyIndxBuf = erealloc( sortKeyIndxBuf, sortKeyIndxBufSize );
451 		}
452 
453 		sortKeyIndxBuf[sortKeyCount].key = (char*)sortKeyBufOffset;    /* remember just offset, cause address */
454 		                                                               /* of 'sortKeyBuf' may be changed due to realloc. */
455 		sortKeyIndxBuf[sortKeyCount].zstr = hashData;
456 
457 		sortKeyBufOffset += sortKeyLen;
458 		++sortKeyCount;
459 
460 	} ZEND_HASH_FOREACH_END();
461 
462 	/* update ptrs to point to valid keys. */
463 	for( j = 0; j < sortKeyCount; j++ )
464 		sortKeyIndxBuf[j].key = sortKeyBuf + (ptrdiff_t)sortKeyIndxBuf[j].key;
465 
466 	/* sort it */
467 	zend_sort( sortKeyIndxBuf, sortKeyCount,
468 			sortKeyIndxSize, collator_cmp_sort_keys, (swap_func_t)collator_sortkey_swap);
469 
470 	ZVAL_COPY_VALUE(&garbage, array);
471 	/* for resulting hash we'll assign new hash keys rather then reordering */
472 	array_init(array);
473 
474 	for( j = 0; j < sortKeyCount; j++ )
475 	{
476 		Z_TRY_ADDREF_P( sortKeyIndxBuf[j].zstr );
477 		zend_hash_next_index_insert( Z_ARRVAL_P(array), sortKeyIndxBuf[j].zstr);
478 	}
479 
480 	if( utf16_buf )
481 		efree( utf16_buf );
482 
483 	zval_ptr_dtor(&garbage);
484 	efree( sortKeyIndxBuf );
485 	efree( sortKeyBuf );
486 
487 	RETURN_TRUE;
488 }
489 /* }}} */
490 
491 /* {{{ Sort array using specified collator, maintaining index association. */
PHP_FUNCTION(collator_asort)492 PHP_FUNCTION( collator_asort )
493 {
494 	collator_sort_internal( false, INTERNAL_FUNCTION_PARAM_PASSTHRU );
495 }
496 /* }}} */
497 
498 /* {{{ Get a sort key for a string from a Collator. */
PHP_FUNCTION(collator_get_sort_key)499 PHP_FUNCTION( collator_get_sort_key )
500 {
501 	char*            str      = NULL;
502 	size_t           str_len  = 0;
503 	UChar*           ustr     = NULL;
504 	int32_t          ustr_len = 0;
505 	int              key_len = 0;
506 	zend_string*     key_str;
507 
508 	COLLATOR_METHOD_INIT_VARS
509 
510 	/* Parse parameters. */
511 	if( zend_parse_method_parameters( ZEND_NUM_ARGS(), getThis(), "Os",
512 		&object, Collator_ce_ptr, &str, &str_len ) == FAILURE )
513 	{
514 		RETURN_THROWS();
515 	}
516 
517 	/* Fetch the object. */
518 	COLLATOR_METHOD_FETCH_OBJECT;
519 
520 	if (!co || !co->ucoll) {
521 		intl_error_set_code( NULL, COLLATOR_ERROR_CODE( co ) );
522 		intl_errors_set_custom_msg( COLLATOR_ERROR_P( co ),
523 			"Object not initialized", 0 );
524 		zend_throw_error(NULL, "Object not initialized");
525 
526 		RETURN_THROWS();
527 	}
528 
529 	/*
530 	 * Compare given strings (converting them to UTF-16 first).
531 	 */
532 
533 	/* First convert the strings to UTF-16. */
534 	intl_convert_utf8_to_utf16(
535 		&ustr, &ustr_len, str, str_len, COLLATOR_ERROR_CODE_P( co ) );
536 	if( U_FAILURE( COLLATOR_ERROR_CODE( co ) ) )
537 	{
538 		/* Set global error code. */
539 		intl_error_set_code( NULL, COLLATOR_ERROR_CODE( co ) );
540 
541 		/* Set error messages. */
542 		intl_errors_set_custom_msg( COLLATOR_ERROR_P( co ),
543 			"Error converting first argument to UTF-16", 0 );
544 		efree( ustr );
545 		RETURN_FALSE;
546 	}
547 
548 	/* ucol_getSortKey is exception in that the key length includes the
549 	 * NUL terminator*/
550 	key_len = ucol_getSortKey(co->ucoll, ustr, ustr_len, NULL, 0);
551 	if(!key_len) {
552 		efree( ustr );
553 		RETURN_FALSE;
554 	}
555 	key_str = zend_string_alloc(key_len, 0);
556 	key_len = ucol_getSortKey(co->ucoll, ustr, ustr_len, (uint8_t*)ZSTR_VAL(key_str), key_len);
557 	efree( ustr );
558 	if(!key_len) {
559 		RETURN_FALSE;
560 	}
561 	ZSTR_LEN(key_str) = key_len - 1;
562 	RETVAL_NEW_STR(key_str);
563 }
564 /* }}} */
565