--TEST-- Exhaustive test of verification and conversion of GB18030-2022 text --EXTENSIONS-- mbstring --SKIPIF-- --FILE-- "\xFE\x10", "\xA6\xDA" => "\xFE\x12", "\xA6\xDB" => "\xFE\x11", "\xA6\xDC" => "\xFE\x13", "\xA6\xDD" => "\xFE\x14", "\xA6\xDE" => "\xFE\x15", "\xA6\xDF" => "\xFE\x16", "\xA6\xEC" => "\xFE\x17", "\xA6\xED" => "\xFE\x18", "\xA6\xF3" => "\xFE\x19", "\xA8\xBC" => "\x1E\x3F", "\xA8\xBF" => "\x01\xF9", "\xA9\x89" => "\x30\x3E", "\xA9\x8A" => "\x2F\xF0", "\xA9\x8B" => "\x2F\xF1", "\xA9\x8C" => "\x2F\xF2", "\xA9\x8D" => "\x2F\xF3", "\xA9\x8E" => "\x2F\xF4", "\xA9\x8F" => "\x2F\xF5", "\xA9\x90" => "\x2F\xF6", "\xA9\x91" => "\x2F\xF7", "\xA9\x92" => "\x2F\xF8", "\xA9\x93" => "\x2F\xF9", "\xA9\x94" => "\x2F\xFA", "\xA9\x95" => "\x2F\xFB", "\xFE\x50" => "\x2E\x81", "\xFE\x51" => "\xE8\x16", "\xFE\x52" => "\xE8\x17", "\xFE\x53" => "\xE8\x18", "\xFE\x54" => "\x2E\x84", "\xFE\x55" => "\x34\x73", "\xFE\x56" => "\x34\x47", "\xFE\x57" => "\x2E\x88", "\xFE\x58" => "\x2E\x8B", "\xFE\x59" => "\x9F\xB4", "\xFE\x5A" => "\x35\x9E", "\xFE\x5B" => "\x36\x1A", "\xFE\x5C" => "\x36\x0E", "\xFE\x5D" => "\x2E\x8C", "\xFE\x5E" => "\x2E\x97", "\xFE\x5F" => "\x39\x6E", "\xFE\x60" => "\x39\x18", "\xFE\x61" => "\x9F\xB5", "\xFE\x62" => "\x39\xCF", "\xFE\x63" => "\x39\xDF", "\xFE\x64" => "\x3A\x73", "\xFE\x65" => "\x39\xD0", "\xFE\x66" => "\x9F\xB6", "\xFE\x67" => "\x9F\xB7", "\xFE\x68" => "\x3B\x4E", "\xFE\x69" => "\x3C\x6E", "\xFE\x6A" => "\x3C\xE0", "\xFE\x6B" => "\x2E\xA7", "\xFE\x6C" => "\xE8\x31", "\xFE\x6D" => "\x9F\xB8", "\xFE\x6E" => "\x2E\xAA", "\xFE\x6F" => "\x40\x56", "\xFE\x76" => "\xE8\x3B", "\xFE\x7E" => "\x9F\xB9", "\xFE\x90" => "\x9F\xBA", "\xFE\x91" => "\xE8\x55", "\xFE\xA0" => "\x9F\xBB"]; testAllValidChars($updatedMappings, 'GB18030-2022', 'UTF-16BE', false); testAllValidChars(array_flip($updatedMappings), 'UTF-16BE', 'GB18030-2022', false); $sampleSMP = [ "\x00\x10\x03\x08" => "\xDE\x30\xE6\x36", "\x00\x10\x14\xEB" => "\xDE\x34\xB8\x35", "\x00\x10\x29\x76" => 
"\xDE\x38\xCE\x34", "\x00\x10\x40\x6E" => "\xDF\x33\xA4\x34", "\x00\x10\x78\x7B" => "\xE0\x34\xD5\x33", "\x00\x01\x25\x2A" => "\x90\x37\xC6\x34", "\x00\x01\x5B\xA4" => "\x91\x38\xCF\x30", "\x00\x01\x6D\x81" => "\x92\x32\xA0\x33", "\x00\x01\x7F\xB2" => "\x92\x35\xF8\x30", "\x00\x01\x89\x9B" => "\x92\x37\xF9\x37", "\x00\x01\x9E\x77" => "\x93\x32\x99\x37", "\x00\x02\x08\x9A" => "\x95\x33\xE0\x38", "\x00\x02\x1B\x00" => "\x95\x37\xBF\x38", "\x00\x02\x31\xBE" => "\x96\x32\x90\x30", "\x00\x02\x64\xD4" => "\x97\x32\xBF\x38", "\x00\x02\xA9\xA0" => "\x98\x36\xBD\x30", "\x00\x02\xBA\x38" => "\x98\x39\xEB\x38", "\x00\x03\x1C\x13" => "\x9A\x39\xDC\x39", "\x00\x03\x20\x6D" => "\x9B\x30\xCE\x33", "\x00\x03\x22\xA9" => "\x9B\x31\x89\x35", "\x00\x03\x39\xB3" => "\x9B\x35\xDF\x33", "\x00\x03\xA7\xF2" => "\x9D\x38\x93\x36", "\x00\x03\xDF\xFB" => "\x9E\x39\xC4\x31", "\x00\x04\x01\x69" => "\x9F\x36\xA9\x39", "\x00\x04\x23\x79" => "\xA0\x33\x9F\x39", "\x00\x04\x26\x52" => "\xA0\x33\xE8\x38", "\x00\x04\x38\xDB" => "\xA0\x37\xCB\x33", "\x00\x04\x46\x84" => "\xA1\x30\xAF\x30", "\x00\x04\x6C\x7C" => "\xA1\x38\x8B\x30", "\x00\x04\x78\x41" => "\xA2\x30\xBC\x33", "\x00\x04\x97\x32" => "\xA2\x36\xE0\x34", "\x00\x04\x9E\xCC" => "\xA2\x38\xA7\x30", "\x00\x04\xC5\xDB" => "\xA3\x36\x9E\x39", "\x00\x04\xF4\xE2" => "\xA4\x35\xE4\x38", "\x00\x05\x3B\xA6" => "\xA6\x30\x96\x34", "\x00\x05\x76\x53" => "\xA7\x32\x8C\x35", "\x00\x05\xEA\x9F" => "\xA9\x35\xDB\x37", "\x00\x06\x12\x29" => "\xAA\x33\xDF\x39", "\x00\x06\x1B\x9E" => "\xAA\x35\xD6\x30", "\x00\x06\x3B\x26" => "\xAB\x32\x8B\x32", "\x00\x06\x4C\xA8" => "\xAB\x35\xD1\x34", "\x00\x06\x63\x3E" => "\xAC\x30\x9D\x36", "\x00\x06\xB3\xA1" => "\xAD\x36\xC7\x35", "\x00\x07\x0A\x31" => "\xAF\x34\x93\x35", "\x00\x07\x22\xA7" => "\xAF\x39\x8F\x37", "\x00\x07\x79\xA3" => "\xB1\x36\xE4\x35", "\x00\x07\x88\xFA" => "\xB1\x39\xF3\x32", "\x00\x07\xCE\xCA" => "\xB3\x34\x8C\x34", "\x00\x07\xF8\xD2" => "\xB4\x32\xD0\x34", "\x00\x08\x20\xF6" => "\xB5\x30\xE4\x30", 
"\x00\x08\xAD\x05" => "\xB7\x39\x9F\x35", "\x00\x08\xEA\x7E" => "\xB9\x31\xDD\x32", "\x00\x08\xF0\xB8" => "\xB9\x32\xFE\x36", "\x00\x09\x14\x07" => "\xBA\x30\x96\x35", "\x00\x09\x41\xDD" => "\xBA\x39\xBD\x39", "\x00\x09\x42\xEF" => "\xBA\x39\xD9\x33", "\x00\x07\x22\xA7" => "\xAF\x39\x8F\x37", "\x00\x07\x79\xA3" => "\xB1\x36\xE4\x35", "\x00\x07\x88\xFA" => "\xB1\x39\xF3\x32", "\x00\x07\xCE\xCA" => "\xB3\x34\x8C\x34", "\x00\x07\xF8\xD2" => "\xB4\x32\xD0\x34", "\x00\x08\x20\xF6" => "\xB5\x30\xE4\x30", "\x00\x08\xAD\x05" => "\xB7\x39\x9F\x35", "\x00\x08\xEA\x7E" => "\xB9\x31\xDD\x32", "\x00\x08\xF0\xB8" => "\xB9\x32\xFE\x36", "\x00\x09\x14\x07" => "\xBA\x30\x96\x35", "\x00\x09\x41\xDD" => "\xBA\x39\xBD\x39", "\x00\x09\x42\xEF" => "\xBA\x39\xD9\x33", "\x00\x09\xBA\x2B" => "\xBD\x33\xF5\x37", "\x00\x0A\x26\x00" => "\xBF\x35\xEA\x32", "\x00\x0A\x36\xE9" => "\xBF\x39\xA3\x31", "\x00\x0A\x7A\x20" => "\xC1\x32\xF5\x38", "\x00\x0A\x9C\x93" => "\xC1\x39\xF5\x37", "\x00\x0A\xC0\xD7" => "\xC2\x37\xA6\x31", "\x00\x0A\xD8\x77" => "\xC3\x32\x8C\x39", "\x00\x0B\x1A\x9B" => "\xC4\x35\xC4\x31", "\x00\x0B\x4F\x27" => "\xC5\x36\x9B\x33", "\x00\x0B\x72\x6D" => "\xC6\x33\xB0\x33", "\x00\x0B\xEE\x23" => "\xC8\x38\xC1\x33", "\x00\x0B\xF0\xDF" => "\xC8\x39\x89\x33", "\x00\x0C\x0B\xE1" => "\xC9\x34\xC6\x37", "\x00\x0C\x4C\x98" => "\xCA\x37\xD9\x34", "\x00\x0C\x5F\x41" => "\xCB\x31\xBF\x31", "\x00\x0C\x63\xE4" => "\xCB\x32\xB7\x38", "\x00\x0C\x70\x0A" => "\xCB\x34\xF2\x38", "\x00\x0C\xAD\x6A" => "\xCC\x37\xB0\x30", "\x00\x0C\xCC\x03" => "\xCD\x33\xCB\x33", "\x00\x0C\xD5\x4C" => "\xCD\x35\xBD\x30", "\x00\x0C\xE6\x70" => "\xCD\x38\xF9\x38", "\x00\x0D\x1B\x6A" => "\xCE\x39\xDC\x30", "\x00\x0D\x55\xEE" => "\xD0\x31\xCE\x30", "\x00\x0D\xBB\xB1" => "\xD2\x32\xA5\x31", "\x00\x0D\xC0\x4F" => "\xD2\x33\x9D\x33", "\x00\x0D\xFA\x84" => "\xD3\x35\x87\x34", "\x00\x0E\x16\x71" => "\xD4\x30\xDC\x33", "\x00\x0E\x1E\x03" => "\xD4\x32\xA2\x31", "\x00\x0E\x20\xE8" => "\xD4\x32\xEC\x32", "\x00\x0E\x39\x6A" => 
"\xD4\x37\xE9\x36",
  "\x00\x0E\x6A\x95" => "\xD5\x37\xE8\x33",
  "\x00\x0E\x7E\xCD" => "\xD6\x31\xF5\x39",
  "\x00\x0E\x80\x69" => "\xD6\x32\xA1\x31",
  "\x00\x0E\x9A\x7F" => "\xD6\x37\xC6\x39",
  "\x00\x0E\xEE\x12" => "\xD8\x34\xC4\x34",
  "\x00\x0E\xFC\xA1" => "\xD8\x37\xBF\x31",
  "\x00\x0F\x29\xB0" => "\xD9\x36\xD2\x36",
  "\x00\x0F\x2A\x12" => "\xD9\x36\xDC\x34",
  "\x00\x0F\x6C\x8C" => "\xDB\x30\x9E\x32",
  "\x00\x0F\xAF\x04" => "\xDC\x33\xDD\x38",
  "\x00\x0F\xBE\x65" => "\xDC\x36\xED\x35",
  "\x00\x0F\xE5\x88" => "\xDD\x34\xE7\x34",
  "\x00\x0F\xE7\xB1" => "\xDD\x35\xA0\x37",
  "\x00\x0F\xF4\x27" => "\xDD\x37\xE3\x37"];

testAllValidChars($sampleSMP, 'UTF-32BE', 'GB18030-2022', false);

/**
 * Read a two-column hexadecimal mapping table into a pair of lookup arrays.
 *
 * Lines whose first character is '#' are skipped. Every other line is scanned
 * for two hexadecimal numbers; lines that do not yield both are ignored.
 * The first number is packed as a big-endian 16-bit value (or 32-bit when
 * $utf32 is true). The second number is turned into the shortest big-endian
 * byte string that holds it (1 to 4 bytes).
 *
 * @param string $path  Path of the mapping table file.
 * @param array  $from  Output: byte string built from column 2 => packed column 1.
 * @param array  $to    Output: packed column 1 => byte string built from column 2.
 * @param bool   $utf32 Pack column 1 with 'N' (32-bit) instead of 'n' (16-bit).
 *
 * @throws RuntimeException If the file cannot be opened for reading.
 */
function readGB18030_2022_ConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = [];
    $to = [];

    // The table is only read, so open it read-only; 'r+' would needlessly
    // require write permission on the data file.
    $fp = fopen($path, 'r');
    if ($fp === false) {
        throw new RuntimeException("Unable to open conversion table: $path");
    }

    try {
        // Compare against false explicitly so that a falsy line such as "0"
        // does not end the loop early.
        while (($line = fgets($fp, 256)) !== false) {
            if ($line[0] === '#')
                continue;

            if (sscanf($line, "%x\t%x", $codepoint, $char) != 2)
                continue;

            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);

            if ($char == PHP_INT_MAX) {
                // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
                // (which can't be represented in a PHP integer)
                // Rebuild the byte string from the hex digits following the first tab.
                $char = "";
                $length = strlen($line);
                for ($i = strpos($line, "\t") + 1; $i < $length; $i += 2) {
                    $substr = substr($line, $i, 2);
                    if (!ctype_xdigit($substr))
                        break;
                    $char .= chr(hexdec($substr));
                }
            } else if ($char <= 0xFF) {
                $char = chr($char); // hex codes must not have leading zero bytes
            } else if ($char <= 0xFFFF) {
                $char = pack('n', $char);
            } else if ($char <= 0xFFFFFF) {
                $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
            } else {
                $char = pack('N', $char);
            }

            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        }
    } finally {
        // Release the file handle even if parsing throws.
        fclose($fp);
    }
}

readGB18030_2022_ConversionTable(__DIR__ .
'/data/GB18030-2022MappingTableBMP.txt', $toUnicode, $fromUnicode);

// 4-byte codes are exercised separately further down, so they are filtered
// out of the invalid/truncated sets produced here.
findInvalidChars($toUnicode, $invalid, $truncated);

/**
 * Array-filter predicate: true when $gb cannot be (the start of) a 4-byte code.
 *
 * That is the case when the first byte lies outside both 0x81-0x84 and
 * 0x90-0xE3, or when a second byte is present and lies outside 0x30-0x39.
 */
function notFourByteCode($gb) {
    $lead = ord($gb);
    $leadFits = ($lead >= 0x81 && $lead <= 0x84) || ($lead >= 0x90 && $lead <= 0xE3);
    if (!$leadFits) {
        return true;
    }

    if (strlen($gb) > 1) {
        $second = ord($gb[1]);
        return $second < 0x30 || $second > 0x39;
    }

    return false;
}

$invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);

testAllValidChars($toUnicode, 'GB18030-2022', 'UTF-16BE', false);
testAllInvalidChars($invalid, $toUnicode, 'GB18030-2022', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'GB18030-2022', 'UTF-16BE', "\x00%");

echo "Tested GB18030-2022 (BMP) -> UTF-16BE\n";

// Helpers for 4-byte codes: conversion between the four byte values and a
// linear index (bytes 1 and 3 count from 0x81, bytes 2 and 4 from 0x30).
// NOTE(review): neither helper is called in the visible part of this test.

/**
 * Linear index of a 4-byte code given its byte values, lead byte first.
 */
function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
    // Horner form of: lead * 12600 + second * 1260 + third * 10 + last
    $index = $byte4 - 0x81;
    $index = $index * 10 + ($byte3 - 0x30);
    $index = $index * 126 + ($byte2 - 0x81);
    $index = $index * 10 + ($byte1 - 0x30);
    return $index;
}

/**
 * Inverse of fourByteCodeIndex(): build the 4-byte string for a linear index.
 */
function fourByteCodeFromIndex($index) {
    $perLead = 10 * 126 * 10;
    $perSecond = 10 * 126;

    $lead = intdiv($index, $perLead);
    $rest = $index - $lead * $perLead;

    $second = intdiv($rest, $perSecond);
    $rest -= $second * $perSecond;

    $third = intdiv($rest, 10);
    $last = $rest - $third * 10;

    $bytes = [$lead + 0x81, $second + 0x30, $third + 0x81, $last + 0x30];
    return implode('', array_map('chr', $bytes));
}

// Invalid 4-byte codes in range for BMP
testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");

// Valid 4-byte codes for other Unicode planes
testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030-2022", "UTF-32BE");
testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030-2022", "UTF-32BE");

// Invalid 4-byte codes for other Unicode planes
testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030-2022", "UTF-32BE");

echo "Tested GB18030-2022 (SMP) <-> UTF-32BE\n";

testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030-2022', false);
echo "Tested UTF-16BE -> GB18030-2022 (BMP)\n";

convertInvalidString("\xAA\xB8\x2D\x38\x00\x00\x00#", "%#", "UTF-32BE", "GB18030-2022");

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030-2022", "UTF-8");
convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030-2022", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Tested GB18030-2022 (BMP) -> UTF-16BE
Tested GB18030-2022 (SMP) <-> UTF-32BE
Tested UTF-16BE -> GB18030-2022 (BMP)
Done!