--TEST--
Exhaustive test of CP50220, CP50221, and CP50222 encodings
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/**
 * Convert a two-byte Shift-JIS code to the two-byte code used in this test
 * after an `ESC $ B` escape sequence.
 *
 * @param int $bytes Shift-JIS code; first byte in bits 8-15, second byte in bits 0-7
 * @return int Converted code; both bytes are offset from 0x21
 */
function shiftJISDecode($bytes) {
  /* Convert CP932's default Shift-JIS representation to kuten code
   *
   * Shift-JIS is fun! The first byte only represents the top 7 bits of
   * the ku number, because 94 first bytes were not available. There are
   * two different ranges of 94 which the second byte can fall in, and
   * we get the low bit of the ku number by seeing which one it is. */
  $first = ($bytes >> 8) & 0xFF;
  $second = $bytes & 0xFF;
  $hi_bits = $first - (($first > 0x9F) ? 0xE0 - 31 : 0x81);
  if ($second > 0x9E) {
    $kuten = ((($hi_bits << 1) + 0x22) << 8) + ($second - 0x9F + 0x21);
  } else if ($second > 0x7F) {
    $kuten = ((($hi_bits << 1) + 0x21) << 8) + ($second - 0x80 + 63 + 0x21);
  } else {
    $kuten = ((($hi_bits << 1) + 0x21) << 8) + ($second - 0x40 + 0x21);
  }
  return $kuten;
}

/* Read in table of all characters in CP932 charset */
$cp932Chars = []; /* CP932 -> UTF-16BE */
$nonInvertible = []; /* Codes whose Unicode codepoint was already claimed by an earlier table entry */
$fromUnicode = []; /* Unicode codepoint -> first Shift-JIS code seen for it */
/* The data tables are only read, so open them read-only; 'r+' would fail
 * needlessly on a read-only source tree */
$fp = fopen(__DIR__ . '/data/CP932.txt', 'r');
if ($fp === false)
  die("Could not open data/CP932.txt");
/* Compare against false explicitly so that a falsy line such as "0" does
 * not end the loop early */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) === 2) {
    if ($bytes < 256)
      continue;


    if (isset($fromUnicode[$codepoint])) {
      $nonInvertible[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
    } else {
      $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('n', $codepoint);
      $fromUnicode[$codepoint] = $bytes;
    }
  }
}
fclose($fp);

/* Aside from the characters in that table, we also support a 'user' area,
 * which maps to Unicode 'private' codepoints 0xE000-E757 */
$codepoint = 0xE000;
for ($i = 0xF0; $i <= 0xF9; $i++) {
  for ($j = 0x40; $j <= 0xFC; $j++) {
    if ($j === 0x7F)
      continue;
    $cp932Chars[pack('n', shiftJISDecode(($i << 8) + $j))] = pack('n', $codepoint);
    $codepoint++;
  }
}

/* Read in table of all characters in JISX-0201 charset */
$jisx0201Chars = []; /* JISX0201 -> UTF-16BE */
$fp = fopen(__DIR__ . '/data/JISX0201.txt', 'r');
if ($fp === false)
  die("Could not open data/JISX0201.txt");
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) === 2)
    $jisx0201Chars[chr($byte)] = pack('n', $codepoint);
}
fclose($fp);

/* Read in table of all characters in JISX-0212 charset */
$jisx0212Chars = []; /* JISX0212 -> UTF-16BE */
$fp = fopen(__DIR__ . '/data/JISX0212.txt', 'r');
if ($fp === false)
  die("Could not open data/JISX0212.txt");
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) === 2) {
    $jisx0212Chars[pack('n', $bytes)] = pack('n', $codepoint);
  }
}
fclose($fp);

/* Our conversions between CP5022x (when CP932 charset is selected) and Unicode
 * differ in a number of places from the table provided by the Unicode Consortium */
$cp932Chars["\x21\x41"] = "\x30\x1C"; /* WAVE DASH instead of FULLWIDTH TILDE */
$cp932Chars["\x21\x42"] = "\x20\x16"; /* DOUBLE VERTICAL LINE instead of PARALLEL TO */
$cp932Chars["\x21\x5D"] = "\x22\x12"; /* MINUS SIGN instead of FULLWIDTH HYPHEN-MINUS */
$cp932Chars["\x21\x71"] = "\x00\xA2"; /* CENT SIGN instead of FULLWIDTH CENT SIGN */
$cp932Chars["\x21\x72"] = "\x00\xA3"; /* POUND SIGN instead of FULLWIDTH POUND SIGN */
$cp932Chars["\x22\x4C"] = "\x00\xAC"; /* NOT SIGN instead of FULLWIDTH NOT SIGN */

/**
 * Check that $from is identified as valid in $encoding and converts to the
 * UTF-16BE string $to; optionally also check the reverse conversion.
 *
 * For the reverse direction, $from is first normalised: redundant leading
 * switches to ASCII are dropped and the trailing switches back to ASCII are
 * appended.
 *
 * @param string $from     Byte string in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
 * @param bool   $bothWays Also test UTF-16BE -> $encoding
 */
function testValid($from, $to, $encoding, $bothWays = true) {
  identifyValidString($from, $encoding);
  convertValidString($from, $to, $encoding, 'UTF-16BE', false);

  if ($bothWays) {
    /* An 0xF at the beginning is redundant; it switches to ASCII mode, but
     * ASCII mode is default */
    if ($from[0] === "\x0F")
      $from = substr($from, 1);
    /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
    if (substr($from, 0, 3) === "\x1B(B")
      $from = substr($from, 3);
    /* If the string switches to a different charset, it should switch back to
     * ASCII at the end */
    if (str_contains($from, "\x1B\$B") || str_contains($from, "\x1B(J") || str_contains($from, "\x1B(I"))
      $from .= "\x1B(B";
    if ($encoding === 'CP50222' && $from[0] === "\x0E")
      $from .= "\x0F";

    convertValidString($to, $from, 'UTF-16BE', $encoding, false);
  }
}

/**
 * Check that $from is rejected as invalid in $encoding and that converting it
 * to UTF-16BE yields $to (which includes the substitute character).
 *
 * @param string $from     Byte string in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding 'CP50220', 'CP50221' or 'CP50222'
 */
function testInvalid($from, $to, $encoding) {
  testInvalidString($from, $to, $encoding, 'UTF-16BE');
}

for ($i = 0; $i < 0x80; $i++) {
  if ($i === 0xE || $i === 0xF || $i === 0x1B)
    continue;
  testValid(chr($i),            "\x00" . chr($i), 'CP50220');
  testValid(chr($i),            "\x00" . chr($i), 'CP50221');
  testValid(chr($i),            "\x00" . chr($i), 'CP50222');
  testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'CP50220');
  testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'CP50221');
  testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'CP50222');
  testValid("\x0F" . chr($i),   "\x00" . chr($i), 'CP50222', false); /* 0xF is 'Shift In' (SI) code */
}

for ($i = 0x80; $i < 256; $i++) {
  if ($i >= 0xA1 && $i <= 0xDF) // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
    continue;
  testInvalid(chr($i),            "\x00%", 'CP50220');
  testInvalid(chr($i),            "\x00%", 'CP50221');
  testInvalid(chr($i),            "\x00%", 'CP50222');
  testInvalid("\x1B(B" . chr($i), "\x00%", 'CP50220');
  testInvalid("\x1B(B" . chr($i), "\x00%", 'CP50221');
  testInvalid("\x1B(B" . chr($i), "\x00%", 'CP50222');
  testInvalid("\x0F" . chr($i),   "\x00%", 'CP50220');
  testInvalid("\x0F" . chr($i),   "\x00%", 'CP50221');
  testInvalid("\x0F" . chr($i),   "\x00%", 'CP50222');
}

// Switch back to ASCII after a multibyte character
convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50221', false);
convertValidString("\x30\x00\x00a\x00b\x00c", "\x1B\$B\x21\x21\x1B(Babc", 'UTF-16BE', 'CP50222', false);

echo "ASCII support OK\n";

/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana */
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
  if (ord($jisx0201) >= 128) { /* Kana */
    $kana = chr(ord($jisx0201) - 128);
    testValid("\x1B(I" . $kana, $utf16BE, 'CP50221');
    testValid("\x1B(J\x0E" . $kana, $utf16BE, 'CP50222', false); /* 0xE is 'Shift Out' (SO) code */
    testValid("\x0E" . $kana, $utf16BE, 'CP50222', false);
    testValid($jisx0201, $utf16BE, 'CP50220', false);
    testValid($jisx0201, $utf16BE, 'CP50221', false);
    testValid($jisx0201, $utf16BE, 'CP50222', false);
    convertValidString($utf16BE, "\x0E" . chr(ord($jisx0201) - 0x80) . "\x0F", 'UTF-16BE', 'CP50222', false);
  } else { /* Latin */
    testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50220', $utf16BE > "\x00\x80");
    testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50221', $utf16BE > "\x00\x80");
    testValid("\x1B(J" . $jisx0201, $utf16BE, 'CP50222', $utf16BE > "\x00\x80");
  }
}

for ($i = 0x80; $i < 256; $i++) {
  if ($i >= 0xA1 && $i <= 0xDF)
    continue;
  testInvalid("\x1B(I" . chr($i), "\x00%", 'CP50220');
  testInvalid("\x1B(I" . chr($i), "\x00%", 'CP50221');
  testInvalid("\x1B(I" . chr($i), "\x00%", 'CP50222');
  testInvalid("\x1B(J" . chr($i), "\x00%", 'CP50220');
  testInvalid("\x1B(J" . chr($i), "\x00%", 'CP50221');
  testInvalid("\x1B(J" . chr($i), "\x00%", 'CP50222');
}

/* Go from JIS X 0201 to ASCII or JIS X 0208 */
convertValidString("\xFF\x61\x00A", "\x0E\x21\x0FA", 'UTF-16BE', 'CP50222', false);
convertValidString("\xFF\x61\x22\x25", "\x0E\x21\x0F\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', 'CP50222', false);
convertValidString("\xFF\x61\x20\x3E", "\x0E\x21\x0F\x1B(J\x7E\x1B(B", 'UTF-16BE', 'CP50222');

echo "JIS X 0201 support OK\n";

/* All valid CP932 characters */
foreach ($cp932Chars as $cp932 => $utf16BE) {
  testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50220');
  testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50221');
  testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50222');
}
/* These codes share their Unicode codepoint with an earlier table entry, so
 * only the CP5022x -> Unicode direction is tested */
foreach ($nonInvertible as $cp932 => $utf16BE) {
  testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50220', false);
  testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50221', false);
  testValid("\x1B\$B" . $cp932, $utf16BE, 'CP50222', false);
}

/* There are some conversions we support from Unicode -> CP5022x, but not in the opposite direction */
foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
  convertValidString("\x22\x25", "\x1B\$B\x21\x42\x1B(B", 'UTF-16BE', $encoding, false);
  convertValidString("\xFF\x0D", "\x1B\$B\x21\x5D\x1B(B", 'UTF-16BE', $encoding, false);
  convertValidString("\xFF\xE0", "\x1B\$B\x21\x71\x1B(B", 'UTF-16BE', $encoding, false);
  convertValidString("\xFF\xE1", "\x1B\$B\x21\x72\x1B(B", 'UTF-16BE', $encoding, false);
  convertValidString("\xFF\xE2", "\x1B\$B\x22\x4C\x1B(B", 'UTF-16BE', $encoding, false);
}

/* All invalid 2-byte CP932 characters */
for ($i = 0x21; $i <= 0x97; $i++) {
  for ($j = 0; $j < 256; $j++) {
    $testString = chr($i) . chr($j);
    if (!isset($cp932Chars[$testString]) && !isset($nonInvertible[$testString])) {
      testInvalid("\x1B\$B" . $testString, "\x00%", 'CP50220');
      testInvalid("\x1B\$B" . $testString, "\x00%", 'CP50221');
      testInvalid("\x1B\$B" . $testString, "\x00%", 'CP50222');
    }
  }
}

/* Try truncated 2-byte characters */
for ($i = 0x21; $i <= 0x97; $i++) {
  testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50220');
  testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50221');
  testInvalid("\x1B\$B" . chr($i), "\x00%", 'CP50222');
}

/* Test alternative escape sequence to select CP932 */
testValid("\x1B\$(B\x21\x21", "\x30\x00", 'CP50220', false);

echo "CP932 support OK\n";

/* All valid JIS X 0212 characters (CP5022x -> Unicode direction only) */
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
  testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50220', false);
  testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50221', false);
  testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'CP50222', false);
}

/* All invalid 2-byte JIS X 0212 characters */
for ($i = 0x21; $i <= 0x97; $i++) {
  for ($j = 0; $j < 256; $j++) {
    $testString = chr($i) . chr($j);
    if (!isset($jisx0212Chars[$testString])) {
      testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50220');
      testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50221');
      testInvalid("\x1B\$(D" . $testString, "\x00%", 'CP50222');
    }
  }
}

/* Truncated 2-byte JIS X 0212 characters */
for ($i = 0x21; $i <= 0x97; $i++) {
  testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50220');
  testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50221');
  testInvalid("\x1B\$(D" . chr($i), "\x00%", 'CP50222');
}

echo "JIS X 0212 support OK\n";

/* Unicode codepoint for halfwidth katakana -> kuten code for ordinary katakana */
$fullwidthKatakana = [
  0xFF61 => 0x2123, /* Ideographic full stop */
  0xFF62 => 0x2156, /* Left corner bracket */
  0xFF63 => 0x2157, /* Right corner bracket */
  0xFF64 => 0x2122, /* Ideographic comma */
  0xFF65 => 0x2126, /* Katakana middle dot */
  0xFF66 => 0x2572, /* Wo */
  0xFF67 => 0x2521, /* Small A */
  0xFF68 => 0x2523, /* Small I */
  0xFF69 => 0x2525, /* Small U */
  0xFF6A => 0x2527, /* Small E */
  0xFF6B => 0x2529, /* Small O */
  0xFF6C => 0x2563, /* Small Ya */
  0xFF6D => 0x2565, /* Small Yu */
  0xFF6E => 0x2567, /* Small Yo */
  0xFF6F => 0x2543, /* Small Tsu */
  0xFF70 => 0x213C, /* Prolonged Sound Marker */
  0xFF71 => 0x2522, /* A */
  0xFF72 => 0x2524, /* I */
  0xFF73 => 0x2526, /* U */
  0xFF74 => 0x2528, /* E */
  0xFF75 => 0x252A, /* O */
  0xFF76 => 0x252B, /* Ka */
  0xFF77 => 0x252D, /* Ki */
  0xFF78 => 0x252F, /* Ku */
  0xFF79 => 0x2531, /* Ke */
  0xFF7A => 0x2533, /* Ko */
  0xFF7B => 0x2535, /* Sa */
  0xFF7C => 0x2537, /* Shi */
  0xFF7D => 0x2539, /* Su */
  0xFF7E => 0x253B, /* Se */
  0xFF7F => 0x253D, /* So */
  0xFF80 => 0x253F, /* Ta */
  0xFF81 => 0x2541, /* Chi */
  0xFF82 => 0x2544, /* Tsu */
  0xFF83 => 0x2546, /* Te */
  0xFF84 => 0x2548, /* To */
  0xFF85 => 0x254A, /* Na */
  0xFF86 => 0x254B, /* Ni */
  0xFF87 => 0x254C, /* Nu */
  0xFF88 => 0x254D, /* Ne */
  0xFF89 => 0x254E, /* No */
  0xFF8A => 0x254F, /* Ha */
  0xFF8B => 0x2552, /* Hi */
  0xFF8C => 0x2555, /* Fu */
  0xFF8D => 0x2558, /* He */
  0xFF8E => 0x255B, /* Ho */
  0xFF8F => 0x255E, /* Ma */
  0xFF90 => 0x255F, /* Mi */
  0xFF91 => 0x2560, /* Mu */
  0xFF92 => 0x2561, /* Me */
  0xFF93 => 0x2562, /* Mo */
  0xFF94 => 0x2564, /* Ya */
  0xFF95 => 0x2566, /* Yu */
  0xFF96 => 0x2568, /* Yo */
  0xFF97 => 0x2569, /* Ra */
  0xFF98 => 0x256A, /* Ri */
  0xFF99 => 0x256B, /* Ru */
  0xFF9A => 0x256C, /* Re */
  0xFF9B => 0x256D, /* Ro */
  0xFF9C => 0x256F, /* Wa */
  0xFF9D => 0x2573, /* N */
  0xFF9E => 0x212B, /* Voice Mark */
  0xFF9F => 0x212C, /* Semi-voice Mark */
];
foreach ($fullwidthKatakana as $cp => $kuten) {
  convertValidString(pack('n', $cp), "\x1B\$B" . pack('n', $kuten) . "\x1B(B", 'UTF-16BE', 'CP50220', false);
}

echo "Folding of fullwidth katakana for CP50220 OK\n";

testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50220');
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50221');
testInvalidString("\xD8\x00", '%', 'UTF-16BE', 'CP50222');

echo "Invalid Unicode is flagged when converting to CP5022x\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "%", "CP50220", "UTF-8");
convertInvalidString("\x80", "%", "CP50221", "UTF-8");
convertInvalidString("\x80", "%", "CP50222", "UTF-8");
convertInvalidString("\x1B\$B1", "%", "CP50220", "UTF-8");
convertInvalidString("\x1B\$B1", "%", "CP50221", "UTF-8");
convertInvalidString("\x1B\$B1", "%", "CP50222", "UTF-8");

echo "Long error markers OK\n";

foreach (['CP50220', 'CP50221', 'CP50222'] as $encoding) {
  testInvalidString("\x1B", "%", $encoding, "UTF-8");
  testInvalidString("\x1BX", "%X", $encoding, "UTF-8");
  testInvalidString("\x1B(", "%", $encoding, "UTF-8");
  testInvalidString("\x1B(X", "%(X", $encoding, "UTF-8");
  testInvalidString("\x1B\$", "%", $encoding, "UTF-8");
  testInvalidString("\x1B\$(", "%", $encoding, "UTF-8");
  testInvalidString("\x1B\$X", "%\$X", $encoding, "UTF-8");
  testInvalidString("\x1B\$(X", "%\$(X", $encoding, "UTF-8");
}

echo "Invalid escape sequences OK\n";

// Regression tests
if (mb_convert_encoding("\x1BC\xF5", 'UTF-16BE', 'CP50221') !== "\x00%\x00C\x00%")
  die("Bad");

// Previously, the CP50220 implementation would eat trailing null bytes
$converted = mb_convert_encoding("ab\x00", 'UTF-16BE', 'CP50220');
if ($converted !== "\x00a\x00b\x00\x00")
  die("Bad handling of trailing null byte (got " . bin2hex($converted) . ")");

// Previously, the CP50220 implementation would reorder error markers with
// subsequent characters
mb_substitute_character(0x3F);
$converted = mb_convert_encoding("\xff\xff\x00&", 'CP50220', 'UTF-16BE');
if ($converted !== '?&')
  die("Bad handling of erroneous codepoint followed by good one (got " . bin2hex($converted) . ")");

// In CP50220, two codepoints can be collapsed into a single kuten code in some cases
// This should work even on a boundary between separately processed buffers
$shouldCollapse = "\xFF\x76\xFF\x9E";
$expected = "\x1B\$B%,\x1B(B";
for ($i = 0; $i < 256; $i++) {
  convertValidString(str_repeat("\x00a", $i) . $shouldCollapse, str_repeat('a', $i) . $expected, 'UTF-16BE', 'CP50220', false);
}

?>
--EXPECT--
ASCII support OK
JIS X 0201 support OK
CP932 support OK
JIS X 0212 support OK
Folding of fullwidth katakana for CP50220 OK
Invalid Unicode is flagged when converting to CP5022x
Long error markers OK
Invalid escape sequences OK