/* +----------------------------------------------------------------------+ | This source file is subject to version 3.01 of the PHP license, | | that is bundled with this package in the file LICENSE, and is | | available through the world-wide-web at the following url: | | https://www.php.net/license/3_01.txt | | If you did not receive a copy of the PHP license and are unable to | | obtain it through the world-wide-web, please send a note to | | license@php.net so we can mail you a copy immediately. | +----------------------------------------------------------------------+ | Authors: Stanislav Malyshev | +----------------------------------------------------------------------+ */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include "../intl_cppshims.h" #include #include #include #include #include #include #include #include #include #include "../intl_convertcpp.h" #include "../common/common_date.h" extern "C" { #include "php_intl.h" #include "msgformat_class.h" #include "msgformat_helpers.h" #include "intl_convert.h" #define USE_TIMEZONE_POINTER #include "../timezone/timezone_class.h" } U_NAMESPACE_BEGIN /** * ICU declares MessageFormatAdapter as a friend class of MessageFormat, * to use as a backdoor for accessing private MessageFormat members. * We use it for the same purpose here. Prefix the methods with php to * avoid clashes with any definitions in ICU. */ class MessageFormatAdapter { public: static const Formattable::Type* phpGetArgTypeList(const MessageFormat& m, int32_t& count); static const MessagePattern phpGetMessagePattern(MessageFormat* m); }; const Formattable::Type* MessageFormatAdapter::phpGetArgTypeList(const MessageFormat& m, int32_t& count) { return m.getArgTypeList(count); } const MessagePattern MessageFormatAdapter::phpGetMessagePattern(MessageFormat* m) { return m->msgPattern; } U_NAMESPACE_END using icu::Formattable; using icu::Format; using icu::DateFormat; using icu::MessageFormat; using icu::MessagePattern; using icu::MessageFormatAdapter; using icu::FieldPosition; U_CFUNC int32_t umsg_format_arg_count(UMessageFormat *fmt) { int32_t fmt_count = 0; MessageFormatAdapter::phpGetArgTypeList(*(const MessageFormat*)fmt, fmt_count); return fmt_count; } static void arg_types_dtor(zval *el) { efree(Z_PTR_P(el)); } static HashTable *umsg_get_numeric_types(MessageFormatter_object *mfo, intl_error& err) { HashTable *ret; int32_t parts_count; if (U_FAILURE(err.code)) { return NULL; } if (mfo->mf_data.arg_types) { /* already cached */ return mfo->mf_data.arg_types; } const Formattable::Type *types = MessageFormatAdapter::phpGetArgTypeList( *(MessageFormat*)mfo->mf_data.umsgf, parts_count); /* Hash table will store Formattable::Type objects directly, * so no need for destructor */ ALLOC_HASHTABLE(ret); zend_hash_init(ret, parts_count, NULL, arg_types_dtor, 0); for (int i = 0; i < parts_count; i++) { const Formattable::Type t = types[i]; zend_hash_index_update_mem(ret, (zend_ulong)i, (void*)&t, sizeof(t)); } if (U_FAILURE(err.code)) { zend_hash_destroy(ret); efree(ret); return NULL; } mfo->mf_data.arg_types = ret; return ret; } static HashTable *umsg_parse_format(MessageFormatter_object *mfo, const MessagePattern& mp, intl_error& err) { HashTable *ret; int32_t parts_count; if (U_FAILURE(err.code)) { return NULL; } if (!((MessageFormat *)mfo->mf_data.umsgf)->usesNamedArguments()) { return umsg_get_numeric_types(mfo, err); } if (mfo->mf_data.arg_types) { /* already cached */ return mfo->mf_data.arg_types; } /* Hash table will store Formattable::Type objects directly, * so no need for destructor */ ALLOC_HASHTABLE(ret); zend_hash_init(ret, 32, NULL, arg_types_dtor, 0); parts_count = mp.countParts(); // See MessageFormat::cacheExplicitFormats() /* * Looking through the pattern, go to each arg_start part type. * The arg-typeof that tells us the argument type (simple, complicated) * then the next part is either the arg_name or arg number * and then if it's simple after that there could be a part-type=arg-type * while substring will tell us number, spellout, etc. * If the next thing isn't an arg-type then assume string. */ /* The last two "parts" can at most be ARG_LIMIT and MSG_LIMIT * which we need not examine. */ for (int32_t i = 0; i < parts_count - 2 && U_SUCCESS(err.code); i++) { MessagePattern::Part p = mp.getPart(i); if (p.getType() != UMSGPAT_PART_TYPE_ARG_START) { continue; } MessagePattern::Part name_part = mp.getPart(++i); /* Getting name, advancing i */ Formattable::Type type, *storedType; if (name_part.getType() == UMSGPAT_PART_TYPE_ARG_NAME) { UnicodeString argName = mp.getSubstring(name_part); if ((storedType = (Formattable::Type*)zend_hash_str_find_ptr(ret, (char*)argName.getBuffer(), argName.length() * sizeof(UChar))) == NULL) { /* not found already; create new entry in HT */ Formattable::Type bogusType = Formattable::kObject; storedType = (Formattable::Type*)zend_hash_str_update_mem(ret, (char*)argName.getBuffer(), argName.length() * sizeof(UChar), (void*)&bogusType, sizeof(bogusType)); } } else if (name_part.getType() == UMSGPAT_PART_TYPE_ARG_NUMBER) { int32_t argNumber = name_part.getValue(); if (argNumber < 0) { intl_errors_set(&err, U_INVALID_FORMAT_ERROR, "Found part with negative number", 0); continue; } if ((storedType = (Formattable::Type*)zend_hash_index_find_ptr(ret, (zend_ulong)argNumber)) == NULL) { /* not found already; create new entry in HT */ Formattable::Type bogusType = Formattable::kObject; storedType = (Formattable::Type*)zend_hash_index_update_mem(ret, (zend_ulong)argNumber, (void*)&bogusType, sizeof(bogusType)); } } else { intl_errors_set(&err, U_INVALID_FORMAT_ERROR, "Invalid part type encountered", 0); continue; } UMessagePatternArgType argType = p.getArgType(); /* No type specified, treat it as a string */ if (argType == UMSGPAT_ARG_TYPE_NONE) { type = Formattable::kString; } else { /* Some type was specified, might be simple or complicated */ if (argType == UMSGPAT_ARG_TYPE_SIMPLE) { /* For a SIMPLE arg, after the name part, there should be * an ARG_TYPE part whose string value tells us what to do */ MessagePattern::Part type_part = mp.getPart(++i); /* Getting type, advancing i */ if (type_part.getType() == UMSGPAT_PART_TYPE_ARG_TYPE) { UnicodeString typeString = mp.getSubstring(type_part); /* This is all based on the rules in the docs for MessageFormat * @see http://icu-project.org/apiref/icu4c/classMessageFormat.html */ #define ASCII_LITERAL(s) UNICODE_STRING(s, sizeof(s)-1) if (typeString == ASCII_LITERAL("number")) { MessagePattern::Part style_part = mp.getPart(i + 1); /* Not advancing i */ if (style_part.getType() == UMSGPAT_PART_TYPE_ARG_STYLE) { UnicodeString styleString = mp.getSubstring(style_part); if (styleString == ASCII_LITERAL("integer")) { type = Formattable::kInt64; } else if (styleString == ASCII_LITERAL("currency")) { type = Formattable::kDouble; } else if (styleString == ASCII_LITERAL("percent")) { type = Formattable::kDouble; } else { /* some style invalid/unknown to us */ type = Formattable::kDouble; } } else { // if missing style, part, make it a double type = Formattable::kDouble; } } else if ((typeString == ASCII_LITERAL("date")) || (typeString == ASCII_LITERAL("time"))) { type = Formattable::kDate; } else if ((typeString == ASCII_LITERAL("spellout")) || (typeString == ASCII_LITERAL("ordinal")) || (typeString == ASCII_LITERAL("duration"))) { type = Formattable::kDouble; } #undef ASCII_LITERAL } else { /* If there's no UMSGPAT_PART_TYPE_ARG_TYPE right after a * UMSGPAT_ARG_TYPE_SIMPLE argument, then the pattern * is broken. */ intl_errors_set(&err, U_PARSE_ERROR, "Expected UMSGPAT_PART_TYPE_ARG_TYPE part following " "UMSGPAT_ARG_TYPE_SIMPLE part", 0); continue; } } else if (argType == UMSGPAT_ARG_TYPE_PLURAL) { type = Formattable::kDouble; } else if (argType == UMSGPAT_ARG_TYPE_CHOICE) { type = Formattable::kDouble; } else if (argType == UMSGPAT_ARG_TYPE_SELECT) { type = Formattable::kString; } else if (argType == UMSGPAT_ARG_TYPE_SELECTORDINAL) { type = Formattable::kDouble; } else { type = Formattable::kString; } } /* was type specified? */ /* We found a different type for the same arg! */ if (*storedType != Formattable::kObject && *storedType != type) { intl_errors_set(&err, U_ARGUMENT_TYPE_MISMATCH, "Inconsistent types declared for an argument", 0); continue; } *storedType = type; } /* visiting each part */ if (U_FAILURE(err.code)) { zend_hash_destroy(ret); efree(ret); return NULL; } mfo->mf_data.arg_types = ret; return ret; } static HashTable *umsg_get_types(MessageFormatter_object *mfo, intl_error& err) { MessageFormat *mf = (MessageFormat *)mfo->mf_data.umsgf; const MessagePattern mp = MessageFormatAdapter::phpGetMessagePattern(mf); return umsg_parse_format(mfo, mp, err); } static void umsg_set_timezone(MessageFormatter_object *mfo, intl_error& err) { MessageFormat *mf = (MessageFormat *)mfo->mf_data.umsgf; TimeZone *used_tz = NULL; const Format **formats; int32_t count; /* Unfortanely, this cannot change the time zone for arguments that * appear inside complex formats because ::getFormats() returns NULL * for all uncached formats, which is the case for complex formats * unless they were set via one of the ::setFormat() methods */ if (mfo->mf_data.tz_set) { return; /* already done */ } #if U_ICU_VERSION_MAJOR_NUM < 65 /* There is a bug in ICU < 64.1 (ICU-12584) which prevents MessageFormatter::getFormats() to handle more than 10 formats correctly. The enumerator could be used to walk through the present formatters using getFormat(), which however seems to provide just a readonly access. This workaround prevents crash when there are > 10 formats but doesn't set any error. As a result, only DateFormatters with > 10 subformats are affected. */ icu::StringEnumeration* fnames = mf->getFormatNames(err.code); if (!fnames || U_FAILURE(err.code)) { return; } count = fnames->count(err.code); delete fnames; if (count > 10) { return; } #endif formats = mf->getFormats(count); if (UNEXPECTED(formats == NULL)) { intl_errors_set(&err, U_MEMORY_ALLOCATION_ERROR, "Out of memory retrieving subformats", 0); } for (int i = 0; U_SUCCESS(err.code) && i < count; i++) { DateFormat* df = dynamic_cast( const_cast(formats[i])); if (df == NULL) { continue; } if (used_tz == NULL) { zval nullzv; ZVAL_NULL(&nullzv); used_tz = timezone_process_timezone_argument(&nullzv, &err, "msgfmt_format"); if (used_tz == NULL) { continue; } } df->adoptTimeZone(used_tz->clone()); } if (U_SUCCESS(err.code)) { mfo->mf_data.tz_set = 1; } if (used_tz) { delete used_tz; } } U_CFUNC void umsg_format_helper(MessageFormatter_object *mfo, HashTable *args, UChar **formatted, int32_t *formatted_len) { int arg_count = zend_hash_num_elements(args); std::vector fargs; std::vector farg_names; MessageFormat *mf = (MessageFormat *)mfo->mf_data.umsgf; HashTable *types; intl_error& err = INTL_DATA_ERROR(mfo); if (U_FAILURE(err.code)) { return; } types = umsg_get_types(mfo, err); umsg_set_timezone(mfo, err); fargs.resize(arg_count); farg_names.resize(arg_count); int argNum = 0; zval *elem; // Key related variables zend_string *str_index; zend_ulong num_index; ZEND_HASH_FOREACH_KEY_VAL(args, num_index, str_index, elem) { ZVAL_DEREF(elem); Formattable& formattable = fargs[argNum]; UnicodeString& key = farg_names[argNum]; Formattable::Type argType = Formattable::kObject, //unknown *storedArgType = NULL; if (!U_SUCCESS(err.code)) { break; } /* Process key and retrieve type */ if (str_index == NULL) { /* includes case where index < 0 because it's exposed as unsigned */ if (UNEXPECTED(num_index > (zend_ulong)INT32_MAX)) { intl_errors_set(&err, U_ILLEGAL_ARGUMENT_ERROR, "Found negative or too large array key", 0); continue; } UChar temp[16]; int32_t len = u_sprintf(temp, "%u", (uint32_t)num_index); key.append(temp, len); storedArgType = (Formattable::Type*)zend_hash_index_find_ptr(types, (zend_ulong)num_index); } else { //string; assumed to be in UTF-8 intl_stringFromChar(key, ZSTR_VAL(str_index), ZSTR_LEN(str_index), &err.code); if (U_FAILURE(err.code)) { char *message; spprintf(&message, 0, "Invalid UTF-8 data in argument key: '%s'", ZSTR_VAL(str_index)); intl_errors_set(&err, err.code, message, 1); efree(message); continue; } storedArgType = (Formattable::Type*)zend_hash_str_find_ptr(types, (char*)key.getBuffer(), key.length() * sizeof(UChar)); } if (storedArgType != NULL) { argType = *storedArgType; } /* Convert zval to formattable according to message format type * or (as a fallback) the zval type */ if (argType != Formattable::kObject) { switch (argType) { case Formattable::kString: { zend_string *str, *tmp_str; string_arg: /* This implicitly converts objects * Note that our vectors will leak if object conversion fails * and PHP ends up with a fatal error and calls longjmp * as a result of that. */ str = zval_get_tmp_string(elem, &tmp_str); UnicodeString *text = new UnicodeString(); intl_stringFromChar(*text, ZSTR_VAL(str), ZSTR_LEN(str), &err.code); if (U_FAILURE(err.code)) { char *message; spprintf(&message, 0, "Invalid UTF-8 data in string argument: " "'%s'", ZSTR_VAL(str)); intl_errors_set(&err, err.code, message, 1); efree(message); delete text; continue; } formattable.adoptString(text); zend_tmp_string_release(tmp_str); break; } case Formattable::kDouble: { double d = zval_get_double(elem); formattable.setDouble(d); break; } case Formattable::kLong: { int32_t tInt32 = 0; if (Z_TYPE_P(elem) == IS_DOUBLE) { if (UNEXPECTED(Z_DVAL_P(elem) > (double)INT32_MAX || Z_DVAL_P(elem) < (double)INT32_MIN)) { intl_errors_set(&err, U_ILLEGAL_ARGUMENT_ERROR, "Found PHP float with absolute value too large for " "32 bit integer argument", 0); } else { tInt32 = (int32_t)Z_DVAL_P(elem); } } else if (Z_TYPE_P(elem) == IS_LONG) { if (UNEXPECTED(Z_LVAL_P(elem) > INT32_MAX || Z_LVAL_P(elem) < INT32_MIN)) { intl_errors_set(&err, U_ILLEGAL_ARGUMENT_ERROR, "Found PHP integer with absolute value too large " "for 32 bit integer argument", 0); } else { tInt32 = (int32_t)Z_LVAL_P(elem); } } else { tInt32 = (int32_t)zval_get_long(elem); } formattable.setLong(tInt32); break; } case Formattable::kInt64: { int64_t tInt64 = 0; if (Z_TYPE_P(elem) == IS_DOUBLE) { if (UNEXPECTED(Z_DVAL_P(elem) > (double)U_INT64_MAX || Z_DVAL_P(elem) < (double)U_INT64_MIN)) { intl_errors_set(&err, U_ILLEGAL_ARGUMENT_ERROR, "Found PHP float with absolute value too large for " "64 bit integer argument", 0); } else { tInt64 = (int64_t)Z_DVAL_P(elem); } } else if (Z_TYPE_P(elem) == IS_LONG) { /* assume long is not wider than 64 bits */ tInt64 = (int64_t)Z_LVAL_P(elem); } else { tInt64 = (int64_t)zval_get_long(elem); } formattable.setInt64(tInt64); break; } case Formattable::kDate: { double dd = intl_zval_to_millis(elem, &err, "msgfmt_format"); if (U_FAILURE(err.code)) { char *message; zend_string *u8key; UErrorCode status = UErrorCode(); u8key = intl_charFromString(key, &status); if (u8key) { spprintf(&message, 0, "The argument for key '%s' " "cannot be used as a date or time", ZSTR_VAL(u8key)); intl_errors_set(&err, err.code, message, 1); zend_string_release_ex(u8key, 0); efree(message); } continue; } formattable.setDate(dd); break; } default: intl_errors_set(&err, U_ILLEGAL_ARGUMENT_ERROR, "Found unsupported argument type", 0); break; } } else { /* We couldn't find any information about the argument in the pattern, this * means it's an extra argument. So convert it to a number if it's a number or * bool or null and to a string if it's anything else except arrays . */ switch (Z_TYPE_P(elem)) { case IS_DOUBLE: formattable.setDouble(Z_DVAL_P(elem)); break; case IS_LONG: formattable.setInt64((int64_t)Z_LVAL_P(elem)); break; case IS_NULL: case IS_FALSE: formattable.setInt64((int64_t)0); break; case IS_TRUE: formattable.setInt64((int64_t)1); break; case IS_STRING: case IS_OBJECT: goto string_arg; default: { char *message; zend_string *u8key; UErrorCode status = UErrorCode(); u8key = intl_charFromString(key, &status); if (u8key) { spprintf(&message, 0, "No strategy to convert the " "value given for the argument with key '%s' " "is available", ZSTR_VAL(u8key)); intl_errors_set(&err, U_ILLEGAL_ARGUMENT_ERROR, message, 1); zend_string_release_ex(u8key, 0); efree(message); } } } } argNum++; } ZEND_HASH_FOREACH_END(); // visiting each argument if (U_FAILURE(err.code)) { return; } UnicodeString resultStr; FieldPosition fieldPosition(0); /* format the message */ mf->format(farg_names.empty() ? NULL : &farg_names[0], fargs.empty() ? NULL : &fargs[0], arg_count, resultStr, err.code); if (U_FAILURE(err.code)) { intl_errors_set(&err, err.code, "Call to ICU MessageFormat::format() has failed", 0); return; } *formatted_len = resultStr.length(); *formatted = eumalloc(*formatted_len+1); resultStr.extract(*formatted, *formatted_len+1, err.code); if (U_FAILURE(err.code)) { intl_errors_set(&err, err.code, "Error copying format() result", 0); return; } } #define cleanup_zvals() for(int j=i;j>=0;j--) { zval_ptr_dtor((*args)+i); } U_CFUNC void umsg_parse_helper(UMessageFormat *fmt, int *count, zval **args, UChar *source, int32_t source_len, UErrorCode *status) { UnicodeString srcString(source, source_len); Formattable *fargs = ((const MessageFormat*)fmt)->parse(srcString, *count, *status); if(U_FAILURE(*status)) { return; } *args = (zval *)safe_emalloc(*count, sizeof(zval), 0); // assign formattables to varargs for(int32_t i = 0; i < *count; i++) { int64_t aInt64; double aDate; UnicodeString temp; zend_string *u8str; switch(fargs[i].getType()) { case Formattable::kDate: aDate = ((double)fargs[i].getDate())/U_MILLIS_PER_SECOND; ZVAL_DOUBLE(&(*args)[i], aDate); break; case Formattable::kDouble: ZVAL_DOUBLE(&(*args)[i], (double)fargs[i].getDouble()); break; case Formattable::kLong: ZVAL_LONG(&(*args)[i], fargs[i].getLong()); break; case Formattable::kInt64: aInt64 = fargs[i].getInt64(); if(aInt64 > ZEND_LONG_MAX || aInt64 < -ZEND_LONG_MAX) { ZVAL_DOUBLE(&(*args)[i], (double)aInt64); } else { ZVAL_LONG(&(*args)[i], (zend_long)aInt64); } break; case Formattable::kString: fargs[i].getString(temp); u8str = intl_convert_utf16_to_utf8(temp.getBuffer(), temp.length(), status); if(!u8str) { cleanup_zvals(); return; } ZVAL_NEW_STR(&(*args)[i], u8str); break; case Formattable::kObject: case Formattable::kArray: *status = U_ILLEGAL_ARGUMENT_ERROR; cleanup_zvals(); break; } } delete[] fargs; }