--TEST--
Exhaustive test of ISO-2022-JP-MS text encoding
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
/* The helpers identifyValidString(), convertValidString(), testValidString(),
 * testInvalidString(), convertInvalidString() and findInvalidChars() are not
 * defined in this file; presumably they come from encoding_tests.inc.
 *
 * NOTE: strict_types is intentionally not enabled. PHP turns numeric-string
 * array keys (e.g. chr(0x30)) into integers, and those keys are later passed
 * to string functions such as ord() and substr(). */
srand(444); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/**
 * Convert CP932's default Shift-JIS representation to kuten code.
 *
 * @param int $bytes Two-byte Shift-JIS value; lead byte in bits 8-15, trail
 *                   byte in bits 0-7
 * @return int Two-byte value whose high and low bytes are the row and cell,
 *             each offset by 0x21
 */
function shiftJISDecode($bytes) {
    $first = ($bytes >> 8) & 0xFF;
    $second = $bytes & 0xFF;
    /* Lead bytes above 0x9F are rebased so that 0xE0 continues from index 31;
     * the remaining lead bytes are rebased from 0x81 */
    $hi_bits = $first - (($first > 0x9F) ? 0xE0 - 31 : 0x81);
    if ($second > 0x9E) {
        /* Trail bytes above 0x9E select the second row of the pair */
        $kuten = ((($hi_bits << 1) + 0x22) << 8) + ($second - 0x9F + 0x21);
    } elseif ($second > 0x7F) {
        /* 0x7F is never a trail byte, so cells after it are counted from 0x80 */
        $kuten = ((($hi_bits << 1) + 0x21) << 8) + ($second - 0x80 + 63 + 0x21);
    } else {
        $kuten = ((($hi_bits << 1) + 0x21) << 8) + ($second - 0x40 + 0x21);
    }
    return $kuten;
}

/* Read in the table of all characters in CP932 */
$cp932Chars = []; /* CP932 string -> UTF-32BE string */
/* The table is only read, so open it read-only; 'r+' would needlessly require
 * write permission on the fixture */
$fp = fopen(realpath(__DIR__ . '/data/CP932.txt'), 'r');
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] === '#') {
        continue;
    }

    if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) === 2) {
        /* Single-byte entries are not part of the 2-byte table */
        if ($bytes < 256) {
            continue;
        }
        /* ISO-2022-JP-MS only uses the first two ranges of MS vendor extensions */
        if ($bytes >= 0xFA00) {
            continue;
        }
        $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
    }
}
fclose($fp);

/* Windows-932 has many cases where two different kuten codes map to the same
 * Unicode codepoints */
$nonInvertible = [];
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xEEF9] as $i) {
    $bytes = pack('n', shiftJISDecode($i));
    $nonInvertible[$bytes] = $cp932Chars[$bytes];
}

/* Add User Defined codes (which use ESC $ ( ? escape sequence)) */
$udcChars = [];
for ($cp = 0xE000; $cp < (0xE000 + (20 * 94)); $cp++) {
    /* 20 rows of 94 cells each; row and cell are both offset by 0x21 */
    $i = $cp - 0xE000;
    $bytes = (((int)($i / 94) + 0x21) << 8) + (($i % 94) + 0x21);
    $udcChars[pack('n', $bytes)] = pack('N', $cp);
}

/* Read in table of all characters in JISX-0201 charset */
$jisx0201Chars = []; /* JISX0201 -> UTF-32BE */
$fp = fopen(realpath(__DIR__ . '/data/JISX0201.txt'), 'r');
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] === '#') {
        continue;
    }

    if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) === 2) {
        $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
    }
}
fclose($fp);

/**
 * Check one valid string, optionally in both conversion directions.
 *
 * Passes $from to identifyValidString() and converts it to UTF-32BE with
 * convertValidString(). When $bothWays is true, $to is also converted back
 * and compared against a normalised form of $from.
 *
 * @param string $from     Bytes in $encoding
 * @param string $to       The same text as UTF-32BE
 * @param string $encoding Name of the encoding under test
 * @param bool   $bothWays Whether to check UTF-32BE -> $encoding as well
 */
function testValid($from, $to, $encoding, $bothWays = true) {
    identifyValidString($from, $encoding);
    convertValidString($from, $to, $encoding, 'UTF-32BE', false);

    if (!$bothWays) {
        return;
    }

    /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
    if (substr($from, 0, 3) === "\x1B(B") {
        $from = substr($from, 3);
    }

    /* If the string switches to a different charset, it should switch back to
     * ASCII at the end */
    $charsetSwitches = ["\x1B\$B", "\x1B(I", "\x1B\$@", "\x1B\$(B", "\x1B\$(@", "\x1B\$(?"];
    foreach ($charsetSwitches as $escape) {
        if (strpos($from, $escape) !== false) {
            $from .= "\x1B(B";
            break; /* Append the closing escape once only */
        }
    }

    convertValidString($to, $from, 'UTF-32BE', $encoding, false);
}

/**
 * Check one invalid string via testInvalidString(), with UTF-32BE as the
 * target encoding.
 *
 * @param string $from     Invalid bytes in $encoding
 * @param string $to       Expected UTF-32BE output
 * @param string $encoding Name of the encoding under test
 */
function testInvalid($from, $to, $encoding) {
    testInvalidString($from, $to, $encoding, 'UTF-32BE');
}

/* Every 7-bit byte except ESC is valid on its own, after ESC ( B, and after
 * ESC ( J */
for ($i = 0; $i < 0x80; $i++) {
    if ($i === 0x1B) {
        continue;
    }
    testValid(chr($i), "\x00\x00\x00" . chr($i), 'ISO-2022-JP-MS');
    testValid("\x1B(B" . chr($i), "\x00\x00\x00" . chr($i), 'ISO-2022-JP-MS', false);
    testValid("\x1B(J" . chr($i), "\x00\x00\x00" . chr($i), 'ISO-2022-JP-MS', false);
}

for ($i = 0x80; $i < 256; $i++) {
    if ($i >= 0xA1 && $i <= 0xDF) { // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
        continue;
    }
    testInvalid(chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
    testInvalid("\x1B(B" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
    testInvalid("\x1B(J" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
}

echo "ASCII support OK\n";

/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana */
foreach ($jisx0201Chars as $jisx0201 => $utf32BE) {
    if (ord($jisx0201) >= 128) {
        /* Kana are tested both as a 7-bit byte after ESC ( I and as the raw
         * 8-bit byte */
        $kana = chr(ord($jisx0201) - 128);
        testValid("\x1B(I" . $kana, $utf32BE, 'ISO-2022-JP-MS', false);
        testValid($jisx0201, $utf32BE, 'ISO-2022-JP-MS', false);
    }
}

for ($i = 0x80; $i < 256; $i++) {
    if ($i >= 0xA1 && $i <= 0xDF) {
        continue;
    }
    testInvalid("\x1B(I" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
    testInvalid("\x1B(J" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-MS');
}

echo "JIS X 0201 support OK\n";

/**
 * Test every character of a table in randomly sized groups behind a prefix.
 *
 * The keys of $validChars are shuffled, then consumed in groups of 5 to 10
 * (fewer for the last group). Each group is concatenated and passed to
 * testValid() together with the concatenation of the matching values.
 *
 * @param array  $validChars Encoded bytes => UTF-32BE bytes
 * @param string $prefix     Escape sequence prepended to every group
 * @param bool   $bothWays   Forwarded to testValid()
 */
function testAllValidCharsWithPrefix($validChars, $prefix, $bothWays) {
    $good = array_keys($validChars);
    shuffle($good);
    while (!empty($good)) {
        $length = min(rand(5,10), count($good));
        $from = $to = '';
        while ($length--) {
            $goodChar = array_pop($good);
            $from .= $goodChar;
            $to .= $validChars[$goodChar];
        }
        testValid($prefix . $from, $to, 'ISO-2022-JP-MS', $bothWays);
    }
}

$validChars = $cp932Chars;
/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
for ($i = 0; $i <= 0x7F; $i++) {
    $validChars[chr($i)] = chr($i);
}
for ($i = 0xA1; $i <= 0xDF; $i++) {
    $validChars[chr($i)] = $jisx0201Chars[chr($i)];
}
$lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);

/* Codes which share a codepoint with another code cannot be round-tripped, so
 * they are taken out of the main table and tested in one direction only */
foreach ($nonInvertible as $bytes => $char) {
    unset($cp932Chars[$bytes]);
}

testAllValidCharsWithPrefix($cp932Chars, "\x1B\$B", true);
testAllValidCharsWithPrefix($nonInvertible, "\x1B\$B", false);

foreach (array_keys($invalidChars) as $invalid) {
    /* substr() also accepts integer keys; PHP turns numeric-string array keys
     * into integers */
    $firstByte = ord(substr($invalid, 0, 1));
    if (($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0) {
        /* The first byte of this 2-byte character will be rejected and result in % being sent
         * to the output. Then the second byte will do something else. It is easier to write the
         * test if we only check with the 1st byte. */
        testInvalidString("\x1B\$B" . $invalid[0], "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
    } else {
        testInvalidString("\x1B\$B" . $invalid, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
    }
}
foreach (array_keys($truncatedChars) as $truncated) {
    testInvalidString("\x1B\$B" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "JIS X 0208 (with MS extensions) support OK\n";

$validChars = $udcChars;
for ($i = 0; $i <= 0x7F; $i++) {
    $validChars[chr($i)] = chr($i);
}
for ($i = 0xA1; $i <= 0xDF; $i++) {
    $validChars[chr($i)] = $jisx0201Chars[chr($i)];
}
findInvalidChars($validChars, $invalidChars, $truncatedChars, array_fill_keys(range(0x21, 0x7F), 2));

testAllValidCharsWithPrefix($udcChars, "\x1B\$(?", true);

foreach (array_keys($invalidChars) as $invalid) {
    /* substr() also accepts integer keys; PHP turns numeric-string array keys
     * into integers */
    $firstByte = ord(substr($invalid, 0, 1));
    if (($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0) {
        /* As in the JIS X 0208 section: only the rejected first byte is checked */
        testInvalidString("\x1B\$(?" . $invalid[0], "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
    } else {
        testInvalidString("\x1B\$(?" . $invalid, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
    }
}
foreach (array_keys($truncatedChars) as $truncated) {
    testInvalidString("\x1B\$(?" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-MS', 'UTF-32BE');
}

echo "UDC support OK\n";

testValidString("\x00\xA5", "\x1B\$B!o\x1B(B", "UTF-16BE", "ISO-2022-JP-MS", false);
testValidString("\x20\x3E", "\x1B\$B!1\x1B(B", "UTF-16BE", "ISO-2022-JP-MS", false);
testValidString("\xFF\x5E", "\x1B\$B!A\x1B(B", "UTF-16BE", "ISO-2022-JP-MS", false);

echo "Other mappings from Unicode -> ISO-2022-JP-MS OK\n";

// Alternative escape sequences for 2-byte characters
testValidString("\x1B\$(B\x21\x21", "\x30\x00", "ISO-2022-JP-MS", "UTF-16BE", false);
testValidString("\x1B\$(@\x21\x21", "\x30\x00", "ISO-2022-JP-MS", "UTF-16BE", false);

// Switching between different character types
testValidString("\x00a\x00b\x00c\xFF\x61\x00a\x00b\x00c", "abc\x1B(I\x21\x1B(Babc", "UTF-16BE", "ISO-2022-JP-MS", false);

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "%", "ISO-2022-JP-MS", "UTF-8");
// Invalid escapes:
convertInvalidString("\x1B", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B.", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$.", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B(", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B(.", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$(X", "%", "ISO-2022-JP-MS", "UTF-8");
convertInvalidString("\x1B\$B\x9F", "%", "ISO-2022-JP-MS", "UTF-8"); // 0x9F does not start any 2-byte character

echo "Done!\n";
?>
--EXPECT--
ASCII support OK
JIS X 0201 support OK
JIS X 0208 (with MS extensions) support OK
UDC support OK
Other mappings from Unicode -> ISO-2022-JP-MS OK
Done!