--TEST--
Exhaustive test of verification and conversion of GB18030-2022 text
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
if (PHP_INT_SIZE == 4 && !extension_loaded("ctype")) die("skip needs ctype extension on 32-bit");
?>
--FILE--
<?php
include('encoding_tests.inc');
srand(2323); // Make results consistent
mb_substitute_character(0x25); // '%'

// Hand-picked GB18030-2022 byte sequences (keys) and the UTF-16BE code units
// they are expected to convert to (values). Checked in both directions below.
$updatedMappings = [
    "\xA6\xD9" => "\xFE\x10",
    "\xA6\xDA" => "\xFE\x12",
    "\xA6\xDB" => "\xFE\x11",
    "\xA6\xDC" => "\xFE\x13",
    "\xA6\xDD" => "\xFE\x14",
    "\xA6\xDE" => "\xFE\x15",
    "\xA6\xDF" => "\xFE\x16",
    "\xA6\xEC" => "\xFE\x17",
    "\xA6\xED" => "\xFE\x18",
    "\xA6\xF3" => "\xFE\x19",

    "\xA8\xBC" => "\x1E\x3F",
    "\xA8\xBF" => "\x01\xF9",
    "\xA9\x89" => "\x30\x3E",
    "\xA9\x8A" => "\x2F\xF0",
    "\xA9\x8B" => "\x2F\xF1",
    "\xA9\x8C" => "\x2F\xF2",
    "\xA9\x8D" => "\x2F\xF3",
    "\xA9\x8E" => "\x2F\xF4",
    "\xA9\x8F" => "\x2F\xF5",
    "\xA9\x90" => "\x2F\xF6",
    "\xA9\x91" => "\x2F\xF7",
    "\xA9\x92" => "\x2F\xF8",
    "\xA9\x93" => "\x2F\xF9",
    "\xA9\x94" => "\x2F\xFA",
    "\xA9\x95" => "\x2F\xFB",

    "\xFE\x50" => "\x2E\x81",
    "\xFE\x51" => "\xE8\x16",
    "\xFE\x52" => "\xE8\x17",
    "\xFE\x53" => "\xE8\x18",
    "\xFE\x54" => "\x2E\x84",
    "\xFE\x55" => "\x34\x73",
    "\xFE\x56" => "\x34\x47",
    "\xFE\x57" => "\x2E\x88",
    "\xFE\x58" => "\x2E\x8B",
    "\xFE\x59" => "\x9F\xB4",
    "\xFE\x5A" => "\x35\x9E",
    "\xFE\x5B" => "\x36\x1A",
    "\xFE\x5C" => "\x36\x0E",
    "\xFE\x5D" => "\x2E\x8C",
    "\xFE\x5E" => "\x2E\x97",
    "\xFE\x5F" => "\x39\x6E",

    "\xFE\x60" => "\x39\x18",
    "\xFE\x61" => "\x9F\xB5",
    "\xFE\x62" => "\x39\xCF",
    "\xFE\x63" => "\x39\xDF",
    "\xFE\x64" => "\x3A\x73",
    "\xFE\x65" => "\x39\xD0",
    "\xFE\x66" => "\x9F\xB6",
    "\xFE\x67" => "\x9F\xB7",
    "\xFE\x68" => "\x3B\x4E",
    "\xFE\x69" => "\x3C\x6E",
    "\xFE\x6A" => "\x3C\xE0",
    "\xFE\x6B" => "\x2E\xA7",
    "\xFE\x6C" => "\xE8\x31",
    "\xFE\x6D" => "\x9F\xB8",
    "\xFE\x6E" => "\x2E\xAA",
    "\xFE\x6F" => "\x40\x56",

    "\xFE\x76" => "\xE8\x3B",
    "\xFE\x7E" => "\x9F\xB9",
    "\xFE\x90" => "\x9F\xBA",
    "\xFE\x91" => "\xE8\x55",
    "\xFE\xA0" => "\x9F\xBB",
];
testAllValidChars($updatedMappings, 'GB18030-2022', 'UTF-16BE', false);
testAllValidChars(array_flip($updatedMappings), 'UTF-16BE', 'GB18030-2022', false);

// Sample of codepoints outside the BMP: UTF-32BE bytes (keys) and the 4-byte
// GB18030-2022 sequences they are expected to convert to (values).
$sampleSMP = [
    "\x00\x10\x03\x08" => "\xDE\x30\xE6\x36",
    "\x00\x10\x14\xEB" => "\xDE\x34\xB8\x35",
    "\x00\x10\x29\x76" => "\xDE\x38\xCE\x34",
    "\x00\x10\x40\x6E" => "\xDF\x33\xA4\x34",
    "\x00\x10\x78\x7B" => "\xE0\x34\xD5\x33",
    "\x00\x01\x25\x2A" => "\x90\x37\xC6\x34",
    "\x00\x01\x5B\xA4" => "\x91\x38\xCF\x30",
    "\x00\x01\x6D\x81" => "\x92\x32\xA0\x33",
    "\x00\x01\x7F\xB2" => "\x92\x35\xF8\x30",
    "\x00\x01\x89\x9B" => "\x92\x37\xF9\x37",
    "\x00\x01\x9E\x77" => "\x93\x32\x99\x37",
    "\x00\x02\x08\x9A" => "\x95\x33\xE0\x38",
    "\x00\x02\x1B\x00" => "\x95\x37\xBF\x38",
    "\x00\x02\x31\xBE" => "\x96\x32\x90\x30",
    "\x00\x02\x64\xD4" => "\x97\x32\xBF\x38",
    "\x00\x02\xA9\xA0" => "\x98\x36\xBD\x30",
    "\x00\x02\xBA\x38" => "\x98\x39\xEB\x38",
    "\x00\x03\x1C\x13" => "\x9A\x39\xDC\x39",
    "\x00\x03\x20\x6D" => "\x9B\x30\xCE\x33",
    "\x00\x03\x22\xA9" => "\x9B\x31\x89\x35",
    "\x00\x03\x39\xB3" => "\x9B\x35\xDF\x33",
    "\x00\x03\xA7\xF2" => "\x9D\x38\x93\x36",
    "\x00\x03\xDF\xFB" => "\x9E\x39\xC4\x31",
    "\x00\x04\x01\x69" => "\x9F\x36\xA9\x39",
    "\x00\x04\x23\x79" => "\xA0\x33\x9F\x39",
    "\x00\x04\x26\x52" => "\xA0\x33\xE8\x38",
    "\x00\x04\x38\xDB" => "\xA0\x37\xCB\x33",
    "\x00\x04\x46\x84" => "\xA1\x30\xAF\x30",
    "\x00\x04\x6C\x7C" => "\xA1\x38\x8B\x30",
    "\x00\x04\x78\x41" => "\xA2\x30\xBC\x33",
    "\x00\x04\x97\x32" => "\xA2\x36\xE0\x34",
    "\x00\x04\x9E\xCC" => "\xA2\x38\xA7\x30",
    "\x00\x04\xC5\xDB" => "\xA3\x36\x9E\x39",
    "\x00\x04\xF4\xE2" => "\xA4\x35\xE4\x38",
    "\x00\x05\x3B\xA6" => "\xA6\x30\x96\x34",
    "\x00\x05\x76\x53" => "\xA7\x32\x8C\x35",
    "\x00\x05\xEA\x9F" => "\xA9\x35\xDB\x37",
    "\x00\x06\x12\x29" => "\xAA\x33\xDF\x39",
    "\x00\x06\x1B\x9E" => "\xAA\x35\xD6\x30",
    "\x00\x06\x3B\x26" => "\xAB\x32\x8B\x32",
    "\x00\x06\x4C\xA8" => "\xAB\x35\xD1\x34",
    "\x00\x06\x63\x3E" => "\xAC\x30\x9D\x36",
    "\x00\x06\xB3\xA1" => "\xAD\x36\xC7\x35",
    "\x00\x07\x0A\x31" => "\xAF\x34\x93\x35",
    "\x00\x07\x22\xA7" => "\xAF\x39\x8F\x37",
    "\x00\x07\x79\xA3" => "\xB1\x36\xE4\x35",
    "\x00\x07\x88\xFA" => "\xB1\x39\xF3\x32",
    "\x00\x07\xCE\xCA" => "\xB3\x34\x8C\x34",
    "\x00\x07\xF8\xD2" => "\xB4\x32\xD0\x34",
    "\x00\x08\x20\xF6" => "\xB5\x30\xE4\x30",
    "\x00\x08\xAD\x05" => "\xB7\x39\x9F\x35",
    "\x00\x08\xEA\x7E" => "\xB9\x31\xDD\x32",
    "\x00\x08\xF0\xB8" => "\xB9\x32\xFE\x36",
    "\x00\x09\x14\x07" => "\xBA\x30\x96\x35",
    "\x00\x09\x41\xDD" => "\xBA\x39\xBD\x39",
    "\x00\x09\x42\xEF" => "\xBA\x39\xD9\x33",
    "\x00\x09\xBA\x2B" => "\xBD\x33\xF5\x37",
    "\x00\x0A\x26\x00" => "\xBF\x35\xEA\x32",
    "\x00\x0A\x36\xE9" => "\xBF\x39\xA3\x31",
    "\x00\x0A\x7A\x20" => "\xC1\x32\xF5\x38",
    "\x00\x0A\x9C\x93" => "\xC1\x39\xF5\x37",
    "\x00\x0A\xC0\xD7" => "\xC2\x37\xA6\x31",
    "\x00\x0A\xD8\x77" => "\xC3\x32\x8C\x39",
    "\x00\x0B\x1A\x9B" => "\xC4\x35\xC4\x31",
    "\x00\x0B\x4F\x27" => "\xC5\x36\x9B\x33",
    "\x00\x0B\x72\x6D" => "\xC6\x33\xB0\x33",
    "\x00\x0B\xEE\x23" => "\xC8\x38\xC1\x33",
    "\x00\x0B\xF0\xDF" => "\xC8\x39\x89\x33",
    "\x00\x0C\x0B\xE1" => "\xC9\x34\xC6\x37",
    "\x00\x0C\x4C\x98" => "\xCA\x37\xD9\x34",
    "\x00\x0C\x5F\x41" => "\xCB\x31\xBF\x31",
    "\x00\x0C\x63\xE4" => "\xCB\x32\xB7\x38",
    "\x00\x0C\x70\x0A" => "\xCB\x34\xF2\x38",
    "\x00\x0C\xAD\x6A" => "\xCC\x37\xB0\x30",
    "\x00\x0C\xCC\x03" => "\xCD\x33\xCB\x33",
    "\x00\x0C\xD5\x4C" => "\xCD\x35\xBD\x30",
    "\x00\x0C\xE6\x70" => "\xCD\x38\xF9\x38",
    "\x00\x0D\x1B\x6A" => "\xCE\x39\xDC\x30",
    "\x00\x0D\x55\xEE" => "\xD0\x31\xCE\x30",
    "\x00\x0D\xBB\xB1" => "\xD2\x32\xA5\x31",
    "\x00\x0D\xC0\x4F" => "\xD2\x33\x9D\x33",
    "\x00\x0D\xFA\x84" => "\xD3\x35\x87\x34",
    "\x00\x0E\x16\x71" => "\xD4\x30\xDC\x33",
    "\x00\x0E\x1E\x03" => "\xD4\x32\xA2\x31",
    "\x00\x0E\x20\xE8" => "\xD4\x32\xEC\x32",
    "\x00\x0E\x39\x6A" => "\xD4\x37\xE9\x36",
    "\x00\x0E\x6A\x95" => "\xD5\x37\xE8\x33",
    "\x00\x0E\x7E\xCD" => "\xD6\x31\xF5\x39",
    "\x00\x0E\x80\x69" => "\xD6\x32\xA1\x31",
    "\x00\x0E\x9A\x7F" => "\xD6\x37\xC6\x39",
    "\x00\x0E\xEE\x12" => "\xD8\x34\xC4\x34",
    "\x00\x0E\xFC\xA1" => "\xD8\x37\xBF\x31",
    "\x00\x0F\x29\xB0" => "\xD9\x36\xD2\x36",
    "\x00\x0F\x2A\x12" => "\xD9\x36\xDC\x34",
    "\x00\x0F\x6C\x8C" => "\xDB\x30\x9E\x32",
    "\x00\x0F\xAF\x04" => "\xDC\x33\xDD\x38",
    "\x00\x0F\xBE\x65" => "\xDC\x36\xED\x35",
    "\x00\x0F\xE5\x88" => "\xDD\x34\xE7\x34",
    "\x00\x0F\xE7\xB1" => "\xDD\x35\xA0\x37",
    "\x00\x0F\xF4\x27" => "\xDD\x37\xE3\x37",
];
testAllValidChars($sampleSMP, 'UTF-32BE', 'GB18030-2022', false);

/**
 * Read a tab-separated "codepoint<TAB>encoded bytes" hex mapping table.
 *
 * Lines starting with '#' and lines that do not contain two hex fields are
 * skipped. Both output arrays are reset before reading.
 *
 * @param string $path  Path of the mapping table file
 * @param array  $from  Output: encoded byte string => packed Unicode codepoint
 * @param array  $to    Output: packed Unicode codepoint => encoded byte string
 * @param bool   $utf32 Pack codepoints as 4 bytes (pack 'N') instead of 2 (pack 'n')
 */
function readGB18030_2022_ConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = [];
    $to = [];

    // The table is only read, so do not ask for write access
    $fp = fopen($path, 'r');
    if ($fp === false)
        die("Could not open conversion table: $path\n");

    // Compare against false explicitly so a falsy line like "0" does not end the loop
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] == '#')
            continue;
        if (sscanf($line, "%x\t%x", $codepoint, $char) == 2) {
            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
            if ($char == PHP_INT_MAX) {
                // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
                // (which can't be represented in a PHP integer)
                $char = "";
                // Re-parse the second column by hand, two hex digits (one byte) at a time
                for ($i = strpos($line, "\t") + 1; $i < strlen($line); $i += 2) {
                    $substr = substr($line, $i, 2);
                    if (ctype_xdigit($substr))
                        $char .= chr(hexdec($substr));
                    else
                        break;
                }
            } else {
                if ($char <= 0xFF)
                    $char = chr($char); // hex codes must not have leading zero bytes
                else if ($char <= 0xFFFF)
                    $char = pack('n', $char);
                else if ($char <= 0xFFFFFF)
                    $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
                else
                    $char = pack('N', $char);
            }
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        }
    }

    fclose($fp);
}

readGB18030_2022_ConversionTable(__DIR__ . '/data/GB18030-2022MappingTableBMP.txt', $toUnicode, $fromUnicode);

// We will test 4-byte codes separately
findInvalidChars($toUnicode, $invalid, $truncated);

/**
 * Tell whether a byte string can NOT be (the start of) a 4-byte code.
 *
 * Returns true when the first byte is outside both 0x81-0x84 and 0x90-0xE3,
 * or when a second byte is present and is outside 0x30-0x39.
 */
function notFourByteCode($gb) {
    return ((ord($gb) < 0x81 || ord($gb) > 0x84) && (ord($gb) < 0x90 || ord($gb) > 0xE3)) ||
        (strlen($gb) > 1 && (ord($gb[1]) < 0x30 || ord($gb[1]) > 0x39));
}

$invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);

testAllValidChars($toUnicode, 'GB18030-2022', 'UTF-16BE', false);
testAllInvalidChars($invalid, $toUnicode, 'GB18030-2022', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'GB18030-2022', 'UTF-16BE', "\x00%");

echo "Tested GB18030-2022 (BMP) -> UTF-16BE\n";

// Helpers for converting between a 4-byte code and its linear index
// (not called anywhere else in this file)

/**
 * Compute the linear index of a 4-byte code from its byte values.
 *
 * $byte4 and $byte2 are offset from 0x81 (126 values each); $byte3 and $byte1
 * are offset from 0x30 (10 values each). $byte4 is the most significant.
 */
function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
    return (($byte4 - 0x81) * 10 * 126 * 10) + (($byte3 - 0x30) * 10 * 126) + (($byte2 - 0x81) * 10) + ($byte1 - 0x30);
}

/**
 * Inverse of fourByteCodeIndex(): build the 4-byte string for a linear index.
 */
function fourByteCodeFromIndex($index) {
    $quotient = intdiv($index, 10 * 126 * 10);
    $byte4 = $quotient + 0x81;
    $index -= ($quotient * 10 * 126 * 10);
    $quotient = intdiv($index, 10 * 126);
    $byte3 = $quotient + 0x30;
    $index -= ($quotient * 10 * 126);
    $quotient = intdiv($index, 10);
    $byte2 = $quotient + 0x81;
    $byte1 = $index - ($quotient * 10) + 0x30;
    return chr($byte4) . chr($byte3) . chr($byte2) . chr($byte1);
}

// Invalid 4-byte codes in range for BMP
testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");

// Valid 4-byte codes for other Unicode planes
testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030-2022", "UTF-32BE");
testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030-2022", "UTF-32BE");

// Invalid 4-byte codes for other Unicode planes
testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030-2022", "UTF-32BE");
testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030-2022", "UTF-32BE");

testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030-2022", "UTF-32BE");

echo "Tested GB18030-2022 (SMP) <-> UTF-32BE\n";

testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030-2022', false);
echo "Tested UTF-16BE -> GB18030-2022 (BMP)\n";

convertInvalidString("\xAA\xB8\x2D\x38\x00\x00\x00#", "%#", "UTF-32BE", "GB18030-2022");

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030-2022", "UTF-8");
convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030-2022", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Tested GB18030-2022 (BMP) -> UTF-16BE
Tested GB18030-2022 (SMP) <-> UTF-32BE
Tested UTF-16BE -> GB18030-2022 (BMP)
Done!