--TEST--
Exhaustive test of verification and conversion of GB18030 text
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
include('encoding_tests.inc');
srand(1111); // Fixed seed, so the randomly sampled 4-byte codes below are the same on every run
mb_substitute_character(0x25); // '%'

readConversionTable(__DIR__ . '/data/GB18030-2byte.txt', $toUnicode, $fromUnicode);

/* Every BMP codepoint which has no 2-byte GB18030 representation is encoded as a 4-byte
 * code whose bytes fall in these ranges:
 *
 * - 1st byte: 0x81-0x84
 * - 2nd byte: 0x30-0x39
 * - 3rd byte: 0x81-0xFE
 * - 4th byte: 0x30-0x39
 *
 * The 4-byte codes begin at 0x81308130 and increase one at a time; the Unicode codepoints
 * which need a 4-byte code are assigned to them in ascending order.
 *
 * Each row below is one run of consecutive codes:
 * [1st byte, 2nd byte, 3rd byte, 4th byte of the first GB18030 code in the run,
 *  Unicode codepoint which that first code converts to,
 *  number of consecutive codepoints (and consecutive GB18030 codes) in the run] */
$gb18030_BMP_Mappings = [
  [0x81, 0x30, 0x81, 0x30, 0x80, 36],
  [0x81, 0x30, 0x84, 0x36, 0xa5, 2],
  [0x81, 0x30, 0x84, 0x38, 0xa9, 7],
  [0x81, 0x30, 0x85, 0x35, 0xb2, 5],
  [0x81, 0x30, 0x86, 0x30, 0xb8, 31],
  [0x81, 0x30, 0x89, 0x31, 0xd8, 8],
  [0x81, 0x30, 0x89, 0x39, 0xe2, 6],
  [0x81, 0x30, 0x8a, 0x35, 0xeb, 1],
  [0x81, 0x30, 0x8a, 0x36, 0xee, 4],
  [0x81, 0x30, 0x8b, 0x30, 0xf4, 3],
  [0x81, 0x30, 0x8b, 0x33, 0xf8, 1],
  [0x81, 0x30, 0x8b, 0x34, 0xfb, 1],
  [0x81, 0x30, 0x8b, 0x35, 0xfd, 4],
  [0x81, 0x30, 0x8b, 0x39, 0x102, 17],
  [0x81, 0x30, 0x8d, 0x36, 0x114, 7],
  [0x81, 0x30, 0x8e, 0x33, 0x11c, 15],
  [0x81, 0x30, 0x8f, 0x38, 0x12c, 24],
  [0x81, 0x30, 0x92, 0x32, 0x145, 3],
  [0x81, 0x30, 0x92, 0x35, 0x149, 4],
  [0x81, 0x30, 0x92, 0x39, 0x14e, 29],
  [0x81, 0x30, 0x95, 0x38, 0x16c, 98],
  [0x81, 0x30, 0x9f, 0x36, 0x1cf, 1],
  [0x81, 0x30, 0x9f, 0x37, 0x1d1, 1],
  [0x81, 0x30, 0x9f, 0x38, 0x1d3, 1],
  [0x81, 0x30, 0x9f, 0x39, 0x1d5, 1],
  [0x81, 0x30, 0xa0, 0x30, 0x1d7, 1],
  [0x81, 0x30, 0xa0, 0x31, 0x1d9, 1],
  [0x81, 0x30, 0xa0, 0x32, 0x1db, 1],
  [0x81, 0x30, 0xa0, 0x33, 0x1dd, 28],
  [0x81, 0x30, 0xa3, 0x31, 0x1fa, 87],
  [0x81, 0x30, 0xab, 0x38, 0x252, 15],
  [0x81, 0x30, 0xad, 0x33, 0x262, 101],
  [0x81, 0x30, 0xb7, 0x34, 0x2c8, 1],
  [0x81, 0x30, 0xb7, 0x35, 0x2cc, 13],
  [0x81, 0x30, 0xb8, 0x38, 0x2da, 183],
  [0x81, 0x30, 0xcb, 0x31, 0x3a2, 1],
  [0x81, 0x30, 0xcb, 0x32, 0x3aa, 7],
  [0x81, 0x30, 0xcb, 0x39, 0x3c2, 1],
  [0x81, 0x30, 0xcc, 0x30, 0x3ca, 55],
  [0x81, 0x30, 0xd1, 0x35, 0x402, 14],
  [0x81, 0x30, 0xd2, 0x39, 0x450, 1],
  [0x81, 0x30, 0xd3, 0x30, 0x452, 7102],
  [0x81, 0x36, 0xa5, 0x32, 0x2011, 2],
  [0x81, 0x36, 0xa5, 0x34, 0x2017, 1],
  [0x81, 0x36, 0xa5, 0x35, 0x201a, 2],
  [0x81, 0x36, 0xa5, 0x37, 0x201e, 7],
  [0x81, 0x36, 0xa6, 0x34, 0x2027, 9],
  [0x81, 0x36, 0xa7, 0x33, 0x2031, 1],
  [0x81, 0x36, 0xa7, 0x34, 0x2034, 1],
  [0x81, 0x36, 0xa7, 0x35, 0x2036, 5],
  [0x81, 0x36, 0xa8, 0x30, 0x203c, 112],
  [0x81, 0x36, 0xb3, 0x32, 0x20ad, 86],
  [0x81, 0x36, 0xbb, 0x38, 0x2104, 1],
  [0x81, 0x36, 0xbb, 0x39, 0x2106, 3],
  [0x81, 0x36, 0xbc, 0x32, 0x210a, 12],
  [0x81, 0x36, 0xbd, 0x34, 0x2117, 10],
  [0x81, 0x36, 0xbe, 0x34, 0x2122, 62],
  [0x81, 0x36, 0xc4, 0x36, 0x216c, 4],
  [0x81, 0x36, 0xc5, 0x30, 0x217a, 22],
  [0x81, 0x36, 0xc7, 0x32, 0x2194, 2],
  [0x81, 0x36, 0xc7, 0x34, 0x219a, 110],
  [0x81, 0x36, 0xd2, 0x34, 0x2209, 6],
  [0x81, 0x36, 0xd3, 0x30, 0x2210, 1],
  [0x81, 0x36, 0xd3, 0x31, 0x2212, 3],
  [0x81, 0x36, 0xd3, 0x34, 0x2216, 4],
  [0x81, 0x36, 0xd3, 0x38, 0x221b, 2],
  [0x81, 0x36, 0xd4, 0x30, 0x2221, 2],
  [0x81, 0x36, 0xd4, 0x32, 0x2224, 1],
  [0x81, 0x36, 0xd4, 0x33, 0x2226, 1],
  [0x81, 0x36, 0xd4, 0x34, 0x222c, 2],
  [0x81, 0x36, 0xd4, 0x36, 0x222f, 5],
  [0x81, 0x36, 0xd5, 0x31, 0x2238, 5],
  [0x81, 0x36, 0xd5, 0x36, 0x223e, 10],
  [0x81, 0x36, 0xd6, 0x36, 0x2249, 3],
  [0x81, 0x36, 0xd6, 0x39, 0x224d, 5],
  [0x81, 0x36, 0xd7, 0x34, 0x2253, 13],
  [0x81, 0x36, 0xd8, 0x37, 0x2262, 2],
  [0x81, 0x36, 0xd8, 0x39, 0x2268, 6],
  [0x81, 0x36, 0xd9, 0x35, 0x2270, 37],
  [0x81, 0x36, 0xdd, 0x32, 0x2296, 3],
  [0x81, 0x36, 0xdd, 0x35, 0x229a, 11],
  [0x81, 0x36, 0xde, 0x36, 0x22a6, 25],
  [0x81, 0x36, 0xe1, 0x31, 0x22c0, 82],
  [0x81, 0x36, 0xe9, 0x33, 0x2313, 333],
  [0x81, 0x37, 0x8c, 0x36, 0x246a, 10],
  [0x81, 0x37, 0x8d, 0x36, 0x249c, 100],
  [0x81, 0x37, 0x97, 0x36, 0x254c, 4],
  [0x81, 0x37, 0x98, 0x30, 0x2574, 13],
  [0x81, 0x37, 0x99, 0x33, 0x2590, 3],
  [0x81, 0x37, 0x99, 0x36, 0x2596, 10],
  [0x81, 0x37, 0x9a, 0x36, 0x25a2, 16],
  [0x81, 0x37, 0x9c, 0x32, 0x25b4, 8],
  [0x81, 0x37, 0x9d, 0x30, 0x25be, 8],
  [0x81, 0x37, 0x9d, 0x38, 0x25c8, 3],
  [0x81, 0x37, 0x9e, 0x31, 0x25cc, 2],
  [0x81, 0x37, 0x9e, 0x33, 0x25d0, 18],
  [0x81, 0x37, 0xa0, 0x31, 0x25e6, 31],
  [0x81, 0x37, 0xa3, 0x32, 0x2607, 2],
  [0x81, 0x37, 0xa3, 0x34, 0x260a, 54],
  [0x81, 0x37, 0xa8, 0x38, 0x2641, 1],
  [0x81, 0x37, 0xa8, 0x39, 0x2643, 2110],
  [0x81, 0x38, 0xfd, 0x39, 0x2e82, 2],
  [0x81, 0x38, 0xfe, 0x31, 0x2e85, 3],
  [0x81, 0x38, 0xfe, 0x34, 0x2e89, 2],
  [0x81, 0x38, 0xfe, 0x36, 0x2e8d, 10],
  [0x81, 0x39, 0x81, 0x36, 0x2e98, 15],
  [0x81, 0x39, 0x83, 0x31, 0x2ea8, 2],
  [0x81, 0x39, 0x83, 0x33, 0x2eab, 3],
  [0x81, 0x39, 0x83, 0x36, 0x2eaf, 4],
  [0x81, 0x39, 0x84, 0x30, 0x2eb4, 2],
  [0x81, 0x39, 0x84, 0x32, 0x2eb8, 3],
  [0x81, 0x39, 0x84, 0x35, 0x2ebc, 14],
  [0x81, 0x39, 0x85, 0x39, 0x2ecb, 293],
  [0x81, 0x39, 0xa3, 0x32, 0x2ffc, 4],
  [0x81, 0x39, 0xa3, 0x36, 0x3004, 1],
  [0x81, 0x39, 0xa3, 0x37, 0x3018, 5],
  [0x81, 0x39, 0xa4, 0x32, 0x301f, 2],
  [0x81, 0x39, 0xa4, 0x34, 0x302a, 20],
  [0x81, 0x39, 0xa6, 0x34, 0x303f, 2],
  [0x81, 0x39, 0xa6, 0x36, 0x3094, 7],
  [0x81, 0x39, 0xa7, 0x33, 0x309f, 2],
  [0x81, 0x39, 0xa7, 0x35, 0x30f7, 5],
  [0x81, 0x39, 0xa8, 0x30, 0x30ff, 6],
  [0x81, 0x39, 0xa8, 0x36, 0x312a, 246],
  [0x81, 0x39, 0xc1, 0x32, 0x322a, 7],
  [0x81, 0x39, 0xc1, 0x39, 0x3232, 113],
  [0x81, 0x39, 0xcd, 0x32, 0x32a4, 234],
  [0x81, 0x39, 0xe4, 0x36, 0x3390, 12],
  [0x81, 0x39, 0xe5, 0x38, 0x339f, 2],
  [0x81, 0x39, 0xe6, 0x30, 0x33a2, 34],
  [0x81, 0x39, 0xe9, 0x34, 0x33c5, 9],
  [0x81, 0x39, 0xea, 0x33, 0x33cf, 2],
  [0x81, 0x39, 0xea, 0x35, 0x33d3, 2],
  [0x81, 0x39, 0xea, 0x37, 0x33d6, 113],
  [0x81, 0x39, 0xf6, 0x30, 0x3448, 43],
  [0x81, 0x39, 0xfa, 0x33, 0x3474, 298],
  [0x82, 0x30, 0x9a, 0x31, 0x359f, 111],
  [0x82, 0x30, 0xa5, 0x32, 0x360f, 11],
  [0x82, 0x30, 0xa6, 0x33, 0x361b, 765],
  [0x82, 0x30, 0xf2, 0x38, 0x3919, 85],
  [0x82, 0x30, 0xfb, 0x33, 0x396f, 96],
  [0x82, 0x31, 0x86, 0x39, 0x39d1, 14],
  [0x82, 0x31, 0x88, 0x33, 0x39e0, 147],
  [0x82, 0x31, 0x97, 0x30, 0x3a74, 218],
  [0x82, 0x31, 0xac, 0x38, 0x3b4f, 287],
  [0x82, 0x31, 0xc9, 0x35, 0x3c6f, 113],
  [0x82, 0x31, 0xd4, 0x38, 0x3ce1, 885],
  [0x82, 0x32, 0xaf, 0x33, 0x4057, 264],
  [0x82, 0x32, 0xc9, 0x37, 0x4160, 471],
  [0x82, 0x32, 0xf8, 0x38, 0x4338, 116],
  [0x82, 0x33, 0x86, 0x34, 0x43ad, 4],
  [0x82, 0x33, 0x86, 0x38, 0x43b2, 43],
  [0x82, 0x33, 0x8b, 0x31, 0x43de, 248],
  [0x82, 0x33, 0xa3, 0x39, 0x44d7, 373],
  [0x82, 0x33, 0xc9, 0x32, 0x464d, 20],
  [0x82, 0x33, 0xcb, 0x32, 0x4662, 193],
  [0x82, 0x33, 0xde, 0x35, 0x4724, 5],
  [0x82, 0x33, 0xdf, 0x30, 0x472a, 82],
  [0x82, 0x33, 0xe7, 0x32, 0x477d, 16],
  [0x82, 0x33, 0xe8, 0x38, 0x478e, 441],
  [0x82, 0x34, 0x96, 0x39, 0x4948, 50],
  [0x82, 0x34, 0x9b, 0x39, 0x497b, 2],
  [0x82, 0x34, 0x9c, 0x31, 0x497e, 4],
  [0x82, 0x34, 0x9c, 0x35, 0x4984, 1],
  [0x82, 0x34, 0x9c, 0x36, 0x4987, 20],
  [0x82, 0x34, 0x9e, 0x36, 0x499c, 3],
  [0x82, 0x34, 0x9e, 0x39, 0x49a0, 22],
  [0x82, 0x34, 0xa1, 0x31, 0x49b8, 703],
  [0x82, 0x34, 0xe7, 0x34, 0x4c78, 39],
  [0x82, 0x34, 0xeb, 0x33, 0x4ca4, 111],
  [0x82, 0x34, 0xf6, 0x34, 0x4d1a, 148],
  [0x82, 0x35, 0x87, 0x32, 0x4daf, 81],
  [0x82, 0x35, 0x8f, 0x33, 0x9fa6, 14426],
  [0x83, 0x36, 0xc7, 0x39, 0xe76c, 1],
  [0x83, 0x36, 0xc8, 0x30, 0xe7c8, 1],
  [0x83, 0x36, 0xc8, 0x31, 0xe7e7, 13],
  [0x83, 0x36, 0xc9, 0x34, 0xe815, 1],
  [0x83, 0x36, 0xc9, 0x35, 0xe819, 5],
  [0x83, 0x36, 0xca, 0x30, 0xe81f, 7],
  [0x83, 0x36, 0xca, 0x37, 0xe827, 4],
  [0x83, 0x36, 0xcb, 0x31, 0xe82d, 4],
  [0x83, 0x36, 0xcb, 0x35, 0xe833, 8],
  [0x83, 0x36, 0xcc, 0x33, 0xe83c, 7],
  [0x83, 0x36, 0xcd, 0x30, 0xe844, 16],
  [0x83, 0x36, 0xce, 0x36, 0xe856, 14],
  [0x83, 0x36, 0xd0, 0x30, 0xe865, 4295],
  [0x84, 0x30, 0x85, 0x35, 0xf92d, 76],
  [0x84, 0x30, 0x8d, 0x31, 0xf97a, 27],
  [0x84, 0x30, 0x8f, 0x38, 0xf996, 81],
  [0x84, 0x30, 0x97, 0x39, 0xf9e8, 9],
  [0x84, 0x30, 0x98, 0x38, 0xf9f2, 26],
  [0x84, 0x30, 0x9b, 0x34, 0xfa10, 1],
  [0x84, 0x30, 0x9b, 0x35, 0xfa12, 1],
  [0x84, 0x30, 0x9b, 0x36, 0xfa15, 3],
  [0x84, 0x30, 0x9b, 0x39, 0xfa19, 6],
  [0x84, 0x30, 0x9c, 0x35, 0xfa22, 1],
  [0x84, 0x30, 0x9c, 0x36, 0xfa25, 2],
  [0x84, 0x30, 0x9c, 0x38, 0xfa2a, 1030],
  [0x84, 0x31, 0x85, 0x38, 0xfe32, 1],
  [0x84, 0x31, 0x85, 0x39, 0xfe45, 4],
  [0x84, 0x31, 0x86, 0x33, 0xfe53, 1],
  [0x84, 0x31, 0x86, 0x34, 0xfe58, 1],
  [0x84, 0x31, 0x86, 0x35, 0xfe67, 1],
  [0x84, 0x31, 0x86, 0x36, 0xfe6c, 149],
  [0x84, 0x31, 0x95, 0x35, 0xff5f, 129],
  [0x84, 0x31, 0xa2, 0x34, 0xffe6, 26],
];

// 4-byte codes get their own tests further down, so they are removed from the
// generated invalid/truncated lists here
findInvalidChars($toUnicode, $invalid, $truncated);

/* Returns true if $gb can NOT be (the beginning of) a 4-byte GB18030 code: either
 * its first byte is outside both 4-byte lead ranges (0x81-0x84 and 0x90-0xE3), or it
 * has a second byte which is not in 0x30-0x39. Used as an array_filter callback on keys. */
function notFourByteCode($gb) {
  $lead = ord($gb);
  $leadForBmp = $lead >= 0x81 && $lead <= 0x84;
  $leadForOtherPlanes = $lead >= 0x90 && $lead <= 0xE3;
  if (!$leadForBmp && !$leadForOtherPlanes)
    return true;

  // The lead byte fits a 4-byte code; a lone byte cannot be ruled out any further
  if (strlen($gb) <= 1)
    return false;

  $second = ord($gb[1]);
  return $second < 0x30 || $second > 0x39;
}

$invalid = array_filter($invalid, 'notFourByteCode', ARRAY_FILTER_USE_KEY);
$truncated = array_filter($truncated, 'notFourByteCode', ARRAY_FILTER_USE_KEY);

testAllValidChars($toUnicode, 'GB18030', 'UTF-16BE', false);
testAllInvalidChars($invalid, $toUnicode, 'GB18030', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'GB18030', 'UTF-16BE', "\x00%");

echo "Tested GB18030 (1 and 2 byte characters) -> UTF-16BE\n";

// For each run of 4-byte codes which map into the BMP, test one randomly chosen code

/* Converts the bytes of a 4-byte code to its position in the 4-byte sequence, counting
 * from 0 at 0x81308130. $byte4 is the FIRST byte of the code and $byte1 the LAST.
 * The digits are mixed-radix: 10 values (0x30-0x39), 126 values (0x81-0xFE), 10 values. */
function fourByteCodeIndex($byte4, $byte3, $byte2, $byte1) {
  $index = $byte4 - 0x81;
  $index = ($index * 10) + ($byte3 - 0x30);
  $index = ($index * 126) + ($byte2 - 0x81);
  return ($index * 10) + ($byte1 - 0x30);
}

/* Inverse of fourByteCodeIndex: converts a position in the 4-byte sequence back to
 * the 4-byte string, peeling digits off from the last byte to the first. */
function fourByteCodeFromIndex($index) {
  $byte1 = 0x30 + ($index % 10);
  $index = intdiv($index, 10);
  $byte2 = 0x81 + ($index % 126);
  $index = intdiv($index, 126);
  $byte3 = 0x30 + ($index % 10);
  $byte4 = 0x81 + intdiv($index, 10);
  return chr($byte4) . chr($byte3) . chr($byte2) . chr($byte1);
}

foreach ($gb18030_BMP_Mappings as [$lead, $second, $third, $last, $firstCodepoint, $runLength]) {
  // Exactly one rand() draw per run, so the seeded sequence picks the same samples every time
  $offset = rand(0, $runLength - 1);
  $firstIndex = fourByteCodeIndex($lead, $second, $third, $last);
  $sample = fourByteCodeFromIndex($firstIndex + $offset);
  testValidString($sample, pack('n', $firstCodepoint + $offset), 'GB18030', 'UTF-16BE');
}

// Invalid 4-byte codes in range for BMP
testInvalidString("\x81\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x31\xA4\x40", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x31\xA5\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x84\x32\x81\x30", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\x85\x31\x81\x30", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");

// Valid 4-byte codes for other Unicode planes (first and last codepoint outside the BMP)
testValidString("\x90\x30\x81\x30", "\x00\x01\x00\x00", "GB18030", "UTF-32BE");
testValidString("\xE3\x32\x9A\x35", "\x00\x10\xFF\xFF", "GB18030", "UTF-32BE");

// Invalid 4-byte codes for other Unicode planes
testInvalidString("\x90\x30\x81\xFF", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\xE3\x32\x9A\x36", "\x00\x00\x00%", "GB18030", "UTF-32BE");
testInvalidString("\xE4\x30\x81\x35", "\x00\x00\x00%\x00\x00\x00%", "GB18030", "UTF-32BE");

testInvalidString("\x90\x30\x80\x30", "\x00\x00\x00%\x00\x00\x00\x30", "GB18030", "UTF-32BE");

echo "Tested GB18030 4-byte characters <-> UTF-16BE\n";

testAllValidChars($fromUnicode, 'UTF-16BE', 'GB18030', false);
echo "Tested UTF-16BE -> GB18030 (1 and 2 byte characters)\n";

// Regression test
// Found by a fuzzer; the invalid codepoint used to be converted to \x00
convertInvalidString("\xAA\xB8\x2D\x38\x00\x00\x00#", "%#", "UTF-32BE", "GB18030");

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x81\x30\x81\xFF", "%", "GB18030", "UTF-8");
convertInvalidString("\xE3\x32\x9A\x36", "%", "GB18030", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Tested GB18030 (1 and 2 byte characters) -> UTF-16BE
Tested GB18030 4-byte characters <-> UTF-16BE
Tested UTF-16BE -> GB18030 (1 and 2 byte characters)
Done!