--TEST--
Test of ASCII and JIS X 0201/0208/0212 support in ISO-2022-JP and JIS7/8 encodings
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in table of all characters in JISX-0212 charset */
readConversionTable(__DIR__ . '/data/JISX0212.txt', $jisx0212Chars, $unused);

/* Read in table of all characters in JISX-0208 charset */
$jisx0208Chars = []; /* JISX0208 -> UTF-16BE */
/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * require write permission on the test data */
$fp = fopen(__DIR__ . '/data/JISX0208.txt', 'r');
if ($fp === false) {
    die("Could not open JISX0208.txt conversion table");
}
while (($line = fgets($fp, 256)) !== false) {
    /* Lines starting with '#' are comments */
    if ($line[0] === '#') {
        continue;
    }

    /* Each data line holds three hex columns; only the 2nd (JIS X 0208 code)
     * and 3rd (Unicode codepoint) are used here */
    if (sscanf($line, "0x%x\t0x%x\t0x%x", $shiftJIS, $jis0208Code, $unicodeCP) === 3) {
        $jisx0208Chars[pack('n', $jis0208Code)] = pack('n', $unicodeCP);
    }
}
fclose($fp);

/* Read in table of all characters in JISX-0201 charset */
readConversionTable(__DIR__ . '/data/JISX0201.txt', $jisx0201Chars, $unused);

/* The JIS X 0208 character set does not have a single, straightforward
 * mapping to the Unicode character set.
 * mbstring converts one character differently from the mappings in
 * data/JISX0208.txt, which comes from the Unicode Consortium */

/* 0x2140 is a backslash; this can be mapped to 0x005C for an ordinary
 * backslash, or 0xFF3C for a _fullwidth_ one */
$jisx0208Chars["\x21\x40"] = "\xFF\x3C";

/**
 * Check that $from is identified as valid $encoding and converts to the
 * UTF-16BE string $to; optionally also check the reverse conversion.
 *
 * Relies on identifyValidString() and convertValidString(), which are not
 * defined in this file (presumably provided by encoding_tests.inc).
 *
 * @param string $from     Byte string in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding Name of the encoding under test
 * @param bool   $bothWays When true, also convert $to back and compare with
 *                         $from (minus any redundant leading mode switch)
 */
function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-16BE', false);

    if ($bothWays) {
        /* An 0xF at the beginning of a JIS7 string is redundant; it switches
         * to ASCII mode, but ASCII mode is default */
        if ($from[0] === "\x0F") {
            $from = substr($from, 1);
        }
        /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
        if (substr($from, 0, 3) === "\x1B(B") {
            $from = substr($from, 3);
        }

        convertValidString($to, $from, 'UTF-16BE', $encoding, false);
    }
}

/**
 * Check handling of the invalid $encoding string $from, with $to as the
 * expected UTF-16BE output; delegates to testInvalidString(), which is not
 * defined in this file (presumably provided by encoding_tests.inc).
 *
 * @param string $from     Invalid byte string in $encoding
 * @param string $to       Expected UTF-16BE result
 * @param string $encoding Name of the encoding under test
 */
function testInvalid($from, $to, $encoding) {
    testInvalidString($from, $to, $encoding, 'UTF-16BE');
}

for ($i = 0; $i < 0x80; $i++) {
    /* Shift Out, Shift In and ESC are control codes, not characters */
    if ($i === 0xE || $i === 0xF || $i === 0x1B) {
        continue;
    }
    testValid(chr($i), "\x00" . chr($i), 'JIS');
    convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */
    testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
    testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
    testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
}

for ($i = 0x80; $i < 256; $i++) {
    if ($i >= 0xA1 && $i <= 0xDF) { // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
        continue;
    }
    testInvalid(chr($i), "\x00%", 'JIS');
    testInvalid("\x0F" . chr($i), "\x00%", 'JIS');
    testInvalid("\x1B(B" . chr($i), "\x00%", 'JIS');
    testInvalid(chr($i), "\x00%", 'ISO-2022-JP');
    testInvalid("\x1B(B" . chr($i), "\x00%", 'ISO-2022-JP');
}

echo "ASCII support OK\n";

/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana; JIS7 encodes those
 * with a 0 in the high bit and treats them as a separate charset
 * (We don't test ISO-2022-JP here, as it does not support the JIS X 0201 charset) */
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
    if (ord($jisx0201) >= 128) {
        $kana = chr(ord($jisx0201) - 128);
        testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false);
        testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */
        testValid($jisx0201, $utf16BE, 'JIS', false);
    } else {
        /* Only test the reverse direction for codepoints above U+0080 */
        testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80");
    }
}

for ($i = 0x80; $i < 256; $i++) {
    if ($i >= 0xA1 && $i <= 0xDF) {
        continue;
    }
    testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS');
    testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS');
}

echo "JIS X 0201 support OK\n";

/* All valid JISX0208 characters */
foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
    testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS');
    testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP');
}

/* All invalid 2-byte JISX0208 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
    for ($j = 0; $j < 256; $j++) {
        $testString = chr($i) . chr($j);
        if (!isset($jisx0208Chars[$testString])) {
            testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS');
            testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP');
        }
    }
}

/* Try truncated JISX0208 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
    testInvalid("\x1B\$B" . chr($i), "\x00%", 'JIS');
    testInvalid("\x1B\$B" . chr($i), "\x00%", 'ISO-2022-JP');
}

/* Switch from Kanji to ASCII */
testValidString("\x30\x00\x00A", "\x1B\$B\x21\x21\x1B(BA", "UTF-16BE", "JIS", false);
testValidString("\x30\x00\x00A", "\x1B\$B\x21\x21\x1B(BA", "UTF-16BE", "ISO-2022-JP", false);

echo "JIS X 0208 support OK\n";

/* JIS7 supports escape to switch to JIS X 0212 charset, but ISO-2022-JP does not */

/* All valid JISX0212 characters */
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
    testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false);
}

/* All invalid 2-byte JISX0212 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
    for ($j = 0; $j < 256; $j++) {
        $testString = chr($i) . chr($j);
        if (!isset($jisx0212Chars[$testString])) {
            testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS');
        }
    }
}

/* Try truncated JISX0212 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
    testInvalid("\x1B\$(D" . chr($i) . "\x1B(B", "\x00%\x00%", 'JIS');
}

testValidString("\x00\xA1", "\x1B\$(D\x22\x42\x1B(B", "UTF-16BE", "JIS", false);
// Check that ISO-2022-JP treats JISX 0212 chars as error
convertInvalidString("\x00\xA1", "%", "UTF-16BE", "ISO-2022-JP", false);

echo "JIS X 0212 support OK\n";

/* All possible escape sequences */
$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true];
for ($i = 0; $i <= 0xFF; $i++) {
    for ($j = 0; $j <= 0xFF; $j++) {
        $escapeSequence = "\x1B" . chr($i) . chr($j);
        /* ESC $ ( is the prefix of the 4-byte escapes, which are covered by the next loop */
        if ($escapeSequence === "\x1B\$(") {
            continue;
        }
        if (isset($validJisEscapes[$escapeSequence])) {
            testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
        } else {
            identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
        }
        if (isset($validIso2022jpEscapes[$escapeSequence])) {
            testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
        } else {
            identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
        }
    }
}
for ($i = 0; $i <= 0xFF; $i++) {
    $escapeSequence = "\x1B\$(" . chr($i);
    if (isset($validJisEscapes[$escapeSequence])) {
        testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
    } else {
        identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
    }
    if (isset($validIso2022jpEscapes[$escapeSequence])) {
        testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
    } else {
        identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
    }
}
/* Also try a bare ESC */
identifyInvalidString("\x1B", 'JIS');
identifyInvalidString("\x1B", 'ISO-2022-JP');

convertInvalidString("\x1B$", "%", "JIS", "UTF-8");
convertInvalidString("\x1B$", "%", "ISO-2022-JP", "UTF-8");
convertInvalidString("\x1B(", "%", "JIS", "UTF-8");
convertInvalidString("\x1B(", "%", "ISO-2022-JP", "UTF-8");
convertInvalidString("\x1B,", "%,", "JIS", "UTF-8");
convertInvalidString("\x1B,", "%,", "ISO-2022-JP", "UTF-8");

echo "All escape sequences work as expected\n";

foreach (['JIS', 'ISO-2022-JP'] as $encoding) {
    testValidString("\x22\x25", "\x1B\$B!B\x1B(B", 'UTF-16BE', $encoding, false);
    testValidString("\xFF\x0D", "\x1B\$B!]\x1B(B", 'UTF-16BE', $encoding, false);
    testValidString("\xFF\xE0", "\x1B\$B!q\x1B(B", 'UTF-16BE', $encoding, false);
    testValidString("\xFF\xE1", "\x1B\$B!r\x1B(B", 'UTF-16BE', $encoding, false);
    testValidString("\xFF\xE2", "\x1B\$B\"L\x1B(B", 'UTF-16BE', $encoding, false);

    testValidString("\x00\xA5", "\x1B(J\x5C\x1B(B", 'UTF-16BE', $encoding, false);
}
testValidString("\x20\x3E", "\x1B\$B!1\x1B(B", 'UTF-16BE', 'ISO-2022-JP', false);

echo "Other mappings from Unicode -> ISO-2022-JP are OK\n";

// Single bytes in the 0xA1-0xDF range (converted as JIS X 0201 kana, see above)
// can be used to encode kana in JIS8; spot-check a few of them
$grInvoked = [
    "\xA3" => "\x1B(I\x23\x1B(B",
    "\xB1" => "\x1B(I\x31\x1B(B",
    "\xC2" => "\x1B(I\x42\x1B(B",
    "\xDF" => "\x1B(I\x5F\x1B(B"
];
foreach ($grInvoked as $gr => $jisx) {
    // JISX 0201 is used as the canonical form for outputting kana
    testValidString($gr, $jisx, 'JIS', 'JIS', false);
    if (mb_convert_encoding($gr, 'UTF-16BE', 'JIS') !== mb_convert_encoding($jisx, 'UTF-16BE', 'JIS')) {
        die("Equivalent GR byte and JISX 0201 sequence do not decode to the same codepoint");
    }
}

echo "GR-invoked kana support OK\n";

// Check handling of BOM
convertInvalidString("\xFF\xFE", "%", "UTF-16BE", "JIS", false);
convertInvalidString("\xFF\xFE", "%", "UTF-16BE", "ISO-2022-JP", false);

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "%", "JIS", "UTF-8");
convertInvalidString("\xE0", "%", "ISO-2022-JP", "UTF-8");
convertInvalidString("\x1B\$(X", "%\$(X", "JIS", "UTF-8"); // Invalid escape
convertInvalidString("\x1B\$(X", "%\$(X", "ISO-2022-JP", "UTF-8"); // Invalid escape
convertInvalidString("\x1B\$B!", "%", "JIS", "UTF-8"); // Truncated character
convertInvalidString("\x1B\$B!", "%", "ISO-2022-JP", "UTF-8"); // Truncated character

echo "Done!\n";
?>
--EXPECT--
ASCII support OK
JIS X 0201 support OK
JIS X 0208 support OK
JIS X 0212 support OK
All escape sequences work as expected
Other mappings from Unicode -> ISO-2022-JP are OK
GR-invoked kana support OK
Done!