--TEST--
Exhaustive test of CP932 encoding verification and conversion (including 'SJIS-win' variant)
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(4321); /* Fixed seed so that every run produces the same results */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Load the table listing every character defined in CP932 */
readConversionTable(__DIR__ . '/data/CP932.txt', $validChars, $fromUnicode);

/* Besides the characters from that table, a 'user' area spanning
 * 0xF040-0xF9FC is supported; it maps onto the Unicode 'private'
 * codepoints 0xE000-0xE757. Second byte 0x7F is skipped. */
$privateCodepoint = 0xE000;
foreach (range(0xF0, 0xF9) as $leadByte) {
  foreach (range(0x40, 0xFC) as $trailByte) {
    if ($trailByte !== 0x7F) {
      $unicode = pack('n', $privateCodepoint++);
      $sjis = chr($leadByte) . chr($trailByte);
      $validChars[$sjis] = $unicode;
      $fromUnicode[$unicode] = $sjis;
    }
  }
}

/* Additional Unicode -> CP932 mappings which are accepted on top of the
 * ones from the conversion table (keys are UTF-16BE, values are CP932) */
$extraMappings = [
  /* U+00A2 is CENT SIGN; convert to FULLWIDTH CENT SIGN */
  "\x00\xA2" => "\x81\x91",
  /* U+00A3 is POUND SIGN; convert to FULLWIDTH POUND SIGN */
  "\x00\xA3" => "\x81\x92",
  /* U+00A5 is YEN SIGN; convert to 0x5C, which has conflicting uses
   * (either as backslash or as Yen sign) */
  "\x00\xA5" => "\x5C",

  /* The JIS X 0208 FULLWIDTH TILDE is mapped to U+FF5E (FULLWIDTH TILDE),
   * but when converting Unicode to CP932, U+301C (WAVE DASH) is accepted too */
  "\x30\x1C" => "\x81\x60",
  /* The JIS X 0208 MINUS SIGN is mapped to U+FF0D (FULLWIDTH HYPHEN-MINUS),
   * but when converting Unicode to CP932, U+2212 (MINUS SIGN) is accepted too */
  "\x22\x12" => "\x81\x7C",
  /* The JIS X 0208 PARALLEL TO symbol is mapped to U+2225 (PARALLEL TO),
   * but when converting Unicode to CP932, U+2016 (DOUBLE VERTICAL LINE)
   * is accepted too */
  "\x20\x16" => "\x81\x61",
  /* The JIS X 0208 NOT SIGN is mapped to U+FFE2 (FULLWIDTH NOT SIGN),
   * but when converting Unicode to CP932, U+00AC (NOT SIGN) is accepted too */
  "\x00\xAC" => "\x81\xCA",

  /* U+00AF is MACRON; convert to FULLWIDTH MACRON */
  "\x00\xAF" => "\x81\x50",

  /* U+203E is OVERLINE; convert to 0x7E, which has conflicting uses
   * (either as tilde or as overline) */
  "\x20\x3E" => "\x7E",
];
foreach ($extraMappings as $unicode => $sjis) {
  $fromUnicode[$unicode] = $sjis;
}

/* Last argument: lead bytes 0x81-0x9F and 0xE0-0xFC, each mapped to 2
 * (presumably the expected sequence length; see encoding_tests.inc) */
$cp932LeadBytes = array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2);
findInvalidChars($validChars, $invalidChars, $truncated, $cp932LeadBytes);

findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));

/* 396 Unicode codepoints are non-invertible in CP932, meaning that more
 * than one CP932 byte sequence maps to the same codepoint. Some of these
 * are 3-way pile-ups. I wonder what the fine folks at MS were thinking
 * when they designed this text encoding. */

/* Everything in 0xED00-0xEEFF belongs to this unfortunate category.
 * (Other sequences in 0xFA00-0xFBFF map to the same codepoints, and when
 * converting from Unicode back to CP932, the F's are favored over the E's.)
 * Such entries are moved out of $validChars so they can be tested separately. */
$nonInvertible = [];
foreach (range(0xED00, 0xEEFF) as $code) {
  $sjis = pack('n', $code);
  if (!isset($validChars[$sjis]))
    continue;
  $nonInvertible[$sjis] = $validChars[$sjis];
  unset($fromUnicode[$validChars[$sjis]], $validChars[$sjis]);
}

/* 23 more collisions exist between 2-byte sequences which variously
 * start with 0x81, 0x87, or 0xFA.
 * We _love_ 0x81 and use it when possible. 0x87 is a second favorite.
 * These are likewise moved aside to be tested separately. */
$otherCollisions = array_merge(
  range(0xFA4A, 0xFA53),
  [0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xFA54, 0xFA58, 0xFA59, 0xFA5A, 0xFA5B]
);
foreach ($otherCollisions as $code) {
  $sjis = pack('n', $code);
  $nonInvertible[$sjis] = $validChars[$sjis];
  unset($fromUnicode[$validChars[$sjis]], $validChars[$sjis]);
}

testAllValidChars($validChars, 'CP932', 'UTF-16BE');
foreach ($nonInvertible as $sjisBytes => $utf16) {
  testValidString($sjisBytes, $utf16, 'CP932', 'UTF-16BE', false);
}
echo "CP932 verification and conversion works on all valid characters\n";

testAllInvalidChars($invalidChars, $validChars, 'CP932', 'UTF-16BE', "\x00%");
echo "CP932 verification and conversion works on all invalid characters\n";

convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP932', '%');
echo "Unicode -> CP932 conversion works on all invalid codepoints\n";

/* Next comes the 'SJIS-win' variant of CP932, which is really CP932 but
 * with two different mappings.
 * Rather than mapping U+00A5 and U+203E to the single bytes 0x5C and 0x7E
 * (which have conflicting uses), 'SJIS-win' maps them to appropriate
 * JIS X 0208 characters. */

/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
$fromUnicode["\x00\xA5"] = "\x81\x8F";
/* U+203E is OVERLINE; convert to JIS X 0208 FULLWIDTH MACRON */
$fromUnicode["\x20\x3E"] = "\x81\x50";

testAllValidChars($validChars, 'SJIS-win', 'UTF-16BE');
foreach ($nonInvertible as $sjisBytes => $utf16) {
  testValidString($sjisBytes, $utf16, 'SJIS-win', 'UTF-16BE', false);
}
echo "SJIS-win verification and conversion works on all valid characters\n";

testAllInvalidChars($invalidChars, $validChars, 'SJIS-win', 'UTF-16BE', "\x00%");
echo "SJIS-win verification and conversion works on all invalid characters\n";

convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'SJIS-win', '%');
echo "Unicode -> SJIS-win conversion works on all invalid codepoints\n";

// Test "long" illegal character markers: each invalid input below is
// expected to convert to "%" for both encoding names
mb_substitute_character("long");
foreach (["CP932", "SJIS-win"] as $encoding) {
  foreach (["\x80", "\xEA", "\x81\x20", "\xEA\xA9"] as $invalidInput) {
    convertInvalidString($invalidInput, "%", $encoding, "UTF-8");
  }
}

echo "Done!\n";
?>
--EXPECT--
CP932 verification and conversion works on all valid characters
CP932 verification and conversion works on all invalid characters
Unicode -> CP932 conversion works on all invalid codepoints
SJIS-win verification and conversion works on all valid characters
SJIS-win verification and conversion works on all invalid characters
Unicode -> SJIS-win conversion works on all invalid codepoints
Done!