> 16) . pack('n', $char & 0xFFFF); else $char = pack('N', $char); } $from[$char] = $codepoint; $to[$codepoint] = $char; } } }

/* Format a string for failure messages: the raw text in double quotes (only
 * when mb_check_encoding accepts it as ASCII), followed by its hex dump in
 * parentheses. */
function dbgPrint($str) {
    $result = '';
    if (mb_check_encoding($str, 'ASCII'))
        $result .= '"' . $str . '" ';
    return $result . "(" . bin2hex($str) . ")";
}

/* Abort the script unless mb_check_encoding accepts $goodString as $encoding. */
function identifyValidString($goodString, $encoding) {
    $result = mb_check_encoding($goodString, $encoding);
    if (!$result)
        die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
}

/* Abort the script if mb_check_encoding accepts $badString as $encoding. */
function identifyInvalidString($badString, $encoding) {
    $result = mb_check_encoding($badString, $encoding);
    if ($result)
        die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
}

/* Convert $fromString from $fromEncoding to $toEncoding and abort the script
 * unless the output is byte-for-byte identical to $toString. */
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $result = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
    if ($result !== $toString)
        die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) .
            "\nExpected $toEncoding: " . dbgPrint($toString) .
            "\nActually got: " . dbgPrint($result));
}

/* Like testConversion, but additionally abort if the 'illegal_chars' counter
 * reported by mb_get_info changed during the conversion. */
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    // Snapshot the counter first so we can compare after the conversion
    $illegalChars = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (mb_get_info('illegal_chars') !== $illegalChars)
        die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) .
            " when converting to $toEncoding");
}

/* Check a valid conversion from $fromEncoding to $toEncoding and, unless
 * $bothWays is false, the reverse conversion as well. */
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if ($bothWays)
        testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}

/* Check that converting invalid input yields $toString AND that the
 * 'illegal_chars' counter reported by mb_get_info grew; abort otherwise. */
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    $illegalChars = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (mb_get_info('illegal_chars') <= $illegalChars)
        die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) .
            " when converting to $toEncoding");
}

/* Full check for a valid string: mb_check_encoding must accept it, and it
 * must convert as expected (in both directions unless $bothWays is false). */
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    identifyValidString($fromString, $fromEncoding);
    convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}

/* Full check for an invalid string: mb_check_encoding must reject it, and
 * converting it must produce $toString while counting an illegal character. */
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    identifyInvalidString($fromString, $fromEncoding);
    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}

// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
//
// `$charMap` maps each valid character in `$fromEncoding` to the same character
// in `$toEncoding`. The characters are shuffled and tested in random groups of
// up to 5-10, so every character is tested exactly once.
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
    $goodChars = array_keys($charMap);
    shuffle($goodChars);
    while (!empty($goodChars)) {
        // The last group may be shorter than the random length
        $length = min(rand(5,10), count($goodChars));
        $fromString = $toString = '';
        while ($length--) {
            $goodChar = array_pop($goodChars);
            $fromString .= $goodChar;
            $toString .= $charMap[$goodChar];
        }

        testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
    }
}

/* For every key of $badChars, build "<bad sequence><one valid character>" and
 * check that it is rejected by mb_check_encoding and converts to
 * "$replacement<converted valid character>".
 *
 * Valid characters are drawn from a shuffled copy of $charMap's keys, which is
 * refilled and reshuffled whenever it runs out. */
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $badChars = array_keys($badChars);
    $goodChars = array();
    while (!empty($badChars)) {
        if (empty($goodChars)) {
            $goodChars = array_keys($charMap);
            shuffle($goodChars);
        }
        $goodChar = array_pop($goodChars);
        $fromString = array_pop($badChars) . $goodChar;
        $toString = $replacement . $charMap[$goodChar];

        testInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
    }
}

/* Same as testAllInvalidChars, except that only the conversion is checked
 * (convertInvalidString); mb_check_encoding is not consulted. */
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $badChars = array_keys($badChars);
    $goodChars = array();
    while (!empty($badChars)) {
        if (empty($goodChars)) {
            $goodChars = array_keys($charMap);
            shuffle($goodChars);
        }
        $goodChar = array_pop($goodChars);
        $fromString = array_pop($badChars) . $goodChar;
        $toString = $replacement . $charMap[$goodChar];

        convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
    }
}

/* Check that every key of $truncated, taken on its own as the whole input, is
 * rejected by mb_check_encoding and converts to exactly $replacement. */
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
    $truncatedChars = array_keys($truncated);
    foreach ($truncatedChars as $truncatedChar) {
        // PHP stores decimal-looking string keys (e.g. the single byte "5") as
        // integers; cast back so the mbstring functions always get a string
        testInvalidString((string)$truncatedChar, $replacement, $fromEncoding, $toEncoding);
    }
}

// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// `$valid` has the valid byte sequences as its keys. On return, the keys of
// `$invalid` are byte sequences which are neither a valid character nor the
// start of one, and the keys of `$truncated` are incomplete sequences.
//
// `$startBytes` maps from first-byte values to the corresponding character length
// (For encodings where the first byte can tell you the length of a multi-byte
// character)
// Note that `$startBytes` can be partial!
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
    $invalid = array();
    $truncated = array();
    $prefixes = array(); /* All sequences which are not (but can start) a valid character */

    foreach ($valid as $char => $unicode) {
        // Keys like "5" come back from PHP arrays as integers; treat them as bytes
        $char = (string)$char;
        $charLen = strlen($char);
        for ($len = 1; $len < $charLen; $len++)
            $prefixes[substr($char, 0, $len)] = true;
    }

    // Extend `$prefix` by every possible byte; recurse only into sequences which
    // are known to start some valid character
    $varLength = function($prefix) use($valid, $prefixes, &$invalid, &$truncated, &$varLength) {
        for ($byte = 0; $byte < 256; $byte++) {
            $str = $prefix . chr($byte);
            if (!isset($valid[$str])) {
                if (isset($prefixes[$str])) {
                    $truncated[$str] = true;
                    $varLength($str);
                } else {
                    $invalid[$str] = true;
                }
            }
        }
    };

    // Enumerate every sequence which starts with `$prefix` and has `$remaining`
    // more bytes; every shorter sequence on the way is recorded as truncated
    $fixedLength = function($prefix, $remaining) use($valid, &$invalid, &$truncated, &$fixedLength) {
        if ($remaining <= 0) {
            // `<=` rather than `==`: a declared length below 1 in `$startBytes`
            // would otherwise make `$remaining` negative and recurse forever
            if (!isset($valid[$prefix]))
                $invalid[$prefix] = true;
        } else if ($remaining == 1) {
            // Same result as recursing one more level, without 256 extra calls
            $truncated[$prefix] = true;
            for ($i = 0; $i < 256; $i++) {
                $str = $prefix . chr($i);
                if (!isset($valid[$str]))
                    $invalid[$str] = true;
            }
        } else {
            $truncated[$prefix] = true;
            for ($i = 0; $i < 256; $i++)
                $fixedLength($prefix . chr($i), $remaining - 1);
        }
    };

    for ($byte = 0; $byte < 256; $byte++) {
        if (isset($startBytes[$byte])) {
            // Length is dictated by the first byte
            $fixedLength(chr($byte), $startBytes[$byte] - 1);
        } else {
            // Length unknown; fall back on the prefixes of known valid characters
            $str = chr($byte);
            if (!isset($valid[$str])) {
                if (isset($prefixes[$str])) {
                    $truncated[$str] = true;
                    $varLength($str);
                } else {
                    $invalid[$str] = true;
                }
            }
        }
    }
}

/* Run the whole battery of tests for `$encoding`, driven by the conversion
 * table at `$path`, in both directions against UTF-16BE.
 *
 * `readConversionTable` is defined earlier in this file; it is expected to fill
 * `$toUnicode` ($encoding bytes => UTF-16BE bytes) and `$fromUnicode` (the
 * reverse mapping).
 *
 * `$replacement` is what an invalid UTF-16BE sequence should become in
 * `$encoding`; `$startBytes` is passed through to findInvalidChars. */
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
    srand(1000); // Make results consistent
    mb_substitute_character(0x25); // '%'
    readConversionTable($path, $toUnicode, $fromUnicode);

    findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
    testAllValidChars($toUnicode, $encoding, 'UTF-16BE');
    // "\x00%" is the substitute character '%' encoded as UTF-16BE
    testAllInvalidChars($invalid, $toUnicode, $encoding, 'UTF-16BE', "\x00%");
    testTruncatedChars($truncated, $encoding, 'UTF-16BE', "\x00%");
    echo "Tested $encoding -> UTF-16BE\n";

    // Every first byte is declared to start a 2-byte unit; truncated sequences
    // are collected in `$unused` but not tested in this direction
    findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
    convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', $encoding, $replacement);
    echo "Tested UTF-16BE -> $encoding\n";
}
?>