--TEST--
Exhaustive test of verification and conversion of GB18030 text
--EXTENSIONS--
mbstring
--SKIPIF--
--FILE--
0x84) && (ord($gb) < 0x90 || ord($gb) > 0xE3)) || (strlen($gb) > 1 && (ord($gb[1]) < 0x30 || ord($gb[1]) > 0x39)); }

// NOTE(review): everything before this point in the FILE section (the PHP open
// tag, the setup that defines $toUnicode, $fromUnicode, $invalid, $truncated and
// $gb18030_BMP_Mappings, and the head of notFourByteCode) is not visible in this
// view. The fragment above is reproduced verbatim -- restore it from the
// original file rather than from this copy.

// Keep only the keys for which notFourByteCode() returns true.
$invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);

testAllValidChars($toUnicode, 'GB18030', 'UTF-16BE', false);
testAllInvalidChars($invalid, $toUnicode, 'GB18030', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'GB18030', 'UTF-16BE', "\x00%");
echo "Tested GB18030 (1 and 2 byte characters) -> UTF-16BE\n";

// For every BMP range in $gb18030_BMP_Mappings, check one randomly chosen
// 4-byte code. The two helpers below convert between a 4-byte code and its
// position in the linear ordering of such codes.

/**
 * Linear index of a 4-byte code, given its bytes from leading ($byte4) to
 * trailing ($byte1).
 *
 * The bytes are treated as digits of a mixed-radix number: the leading and
 * third bytes count up from 0x81 (the third has 126 possible values), the
 * second and trailing bytes count up from 0x30 (10 possible values each).
 * The code 0x81 0x30 0x81 0x30 therefore has index 0.
 */
function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
    // Horner evaluation: fold in one digit at a time, most significant first.
    $index = $byte4 - 0x81;
    $index = $index * 10 + ($byte3 - 0x30);
    $index = $index * 126 + ($byte2 - 0x81);
    $index = $index * 10 + ($byte1 - 0x30);

    return $index;
}

/**
 * Inverse of fourByteCodeIndex(): turn a linear index back into the 4-byte
 * string (leading byte first).
 */
function fourByteCodeFromIndex($index) {
    $perSecondByte = 10 * 126;           // indices covered by one second-byte value
    $perLeadByte = 10 * $perSecondByte;  // indices covered by one leading-byte value

    $lead = intdiv($index, $perLeadByte);
    $rest = $index % $perLeadByte;

    $second = intdiv($rest, $perSecondByte);
    $rest %= $perSecondByte;

    $third = intdiv($rest, 10);
    $last = $rest % 10;

    return chr(0x81 + $lead) . chr(0x30 + $second) . chr(0x81 + $third) . chr(0x30 + $last);
}

// Each mapping row holds the four bytes of the first code in a range, the
// Unicode codepoint that code maps to, and the number of codes in the range.
foreach ($gb18030_BMP_Mappings as [$byte4, $byte3, $byte2, $byte1, $unicode, $n]) {
    $offset = rand(0, $n - 1);
    $rangeStart = fourByteCodeIndex($byte4, $byte3, $byte2, $byte1);
    testValidString(
        fourByteCodeFromIndex($rangeStart + $offset),
        pack('n', $unicode + $offset),
        'GB18030',
        'UTF-16BE'
    );
}

// 4-byte codes in the BMP range which must be rejected
testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");

// 4-byte codes for the other Unicode planes: first (U+10000) and last (U+10FFFF)
testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030", "UTF-32BE");
testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030", "UTF-32BE");

// 4-byte codes for the other Unicode planes which must be rejected
testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030", "UTF-32BE");

echo "Tested GB18030 4-byte characters <-> UTF-16BE\n";

testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030', false);
echo "Tested UTF-16BE -> GB18030 (1 and 2 byte characters)\n";

// Regression test for a fuzzer finding: the invalid codepoint used to be
// converted to \x00 instead of the substitution character.
convertInvalidString("\xAA\xB8\x2D\x38\x00\x00\x00#", "%#", "UTF-32BE", "GB18030");

// Exercise the "long" form of the illegal-character marker
mb_substitute_character("long");
convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030", "UTF-8");
convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Tested GB18030 (1 and 2 byte characters) -> UTF-16BE
Tested GB18030 4-byte characters <-> UTF-16BE
Tested UTF-16BE -> GB18030 (1 and 2 byte characters)
Done!