--TEST--
Exhaustive test of ISO-2022-JP-2004 encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(111); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in table of all characters in JISX-0208 charset.
 * The table is only read, so open it read-only ('r'); this also works when
 * the test data lives on a read-only filesystem. */
$jisx0208Chars = []; /* JISX0208 -> UTF-16BE */
$fp = fopen(__DIR__ . '/data/JISX0208.txt', 'r');
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;

  /* $shiftJIS is parsed only because it is the first column; it is not used */
  if (sscanf($line, "0x%x\t0x%x\t0x%x", $shiftJIS, $jis0208Code, $unicodeCP) === 3) {
    $jisx0208Chars[pack('n', $jis0208Code)] = pack('n', $unicodeCP);
  }
}
fclose($fp);

/* The JIS X 0208 character set does not have a single, straightforward
 * mapping to the Unicode character set
 * mbstring converts one character differently from the mappings in
 * data/JISX0208.txt, which comes from the Unicode Consortium */

/* 0x2140 is a backslash; this can be mapped to 0x005C for an ordinary
 * backslash, or 0xFF3C for a _fullwidth_ one */
$jisx0208Chars["\x21\x40"] = "\xFF\x3C";

/* Single bytes from 0x0-0x20 are allowed (except ESC, which starts an
 * escape sequence) */
for ($i = 0; $i <= 0x20; $i++) {
  if ($i !== 0x1B)
    $jisx0208Chars[chr($i)] = "\x00" . chr($i);
}
/* As is 0x7F */
$jisx0208Chars["\x7F"] = "\x00\x7F";

/* Now read table of JISX-0213:2004 plane 1 and JISX-0213:2000 plane 2 chars */
$jisx0213_2004_1Chars = [];
$jisx0213_2000_2Chars = [];
$fp = fopen(__DIR__ . '/data/ISO-2022-JP-2004-JISX0213.txt', 'r');
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;

  /* Reset on every line; the second codepoint is optional in the data file */
  $cp2 = null;
  if (sscanf($line, "%d-%x\tU+%x+%x", $type, $bytes, $cp1, $cp2) >= 3) {
    if ($cp1 <= 0xFFFF)
      $unicode = pack('n', $cp1);
    else
      /* Codepoints above U+FFFF are converted from UTF-32BE to UTF-16BE */
      $unicode = mb_convert_encoding(pack('N', $cp1), 'UTF-16BE', 'UTF-32BE');
    if ($cp2)
      $unicode .= pack('n', $cp2);

    /* Type 3 entries go to the plane 1 table, type 4 entries to plane 2;
     * any other type is ignored */
    if ($type === 3)
      $jisx0213_2004_1Chars[pack('n', $bytes)] = $unicode;
    else if ($type === 4)
      $jisx0213_2000_2Chars[pack('n', $bytes)] = $unicode;
  }
}
fclose($fp);

/* JISX 0213 plane 1 0x2131 is an overline; Unicode has a halfwidth overline
 * at 0x203E and a fullwidth overline at 0xFFE3
 * We'll use the fullwidth version when converting JISX 0213 to Unicode */
$jisx0213_2004_1Chars["\x21\x31"] = "\xFF\xE3";
/* Same deal with the Yen sign; use the fullwidth one */
$jisx0213_2004_1Chars["\x21\x6F"] = "\xFF\xE5";

/* Since JISX 0213 is an extension of JISX 0208, allow the same single-byte chars */
for ($i = 0; $i <= 0x20; $i++) {
  if ($i !== 0x1B)
    $jisx0213_2004_1Chars[chr($i)] = "\x00" . chr($i);
}
$jisx0213_2004_1Chars["\x7F"] = "\x00\x7F";

for ($i = 0; $i <= 0x20; $i++) {
  if ($i !== 0x1B)
    $jisx0213_2000_2Chars[chr($i)] = "\x00" . chr($i);
}
$jisx0213_2000_2Chars["\x7F"] = "\x00\x7F";

/* Check that $from is identified as valid ISO-2022-JP-2004 and that it
 * converts to the UTF-16BE string $to. When $bothWays is true, also check
 * the reverse conversion (UTF-16BE -> ISO-2022-JP-2004), after normalizing
 * $from to the form the converter is expected to emit.
 * The identify/convert helpers are expected to come from encoding_tests.inc. */
function testValid($from, $to, $bothWays = true) {
  identifyValidString($from, 'ISO-2022-JP-2004');
  convertValidString($from, $to, 'ISO-2022-JP-2004', 'UTF-16BE', false);

  if ($bothWays) {
    /* Try going in the opposite direction too
     * ESC ( B at the beginning of ISO-2022-JP-2004 string is redundant,
     * since ASCII mode is the default */
    if (substr($from, 0, 3) === "\x1B(B")
      $from = substr($from, 3, strlen($from) - 3);
    /* If the ISO-2022-JP-2004 string switches to a different charset, it
     * should switch back to ASCII at the end */
    if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B\$(Q") !== false || strpos($from, "\x1B\$(P") !== false)
      $from .= "\x1B(B";

    convertValidString($to, $from, 'UTF-16BE', 'ISO-2022-JP-2004', false);
  }
}

/* Check that the ISO-2022-JP-2004 string $from is treated as invalid, with
 * $to as the expected UTF-16BE conversion result */
function testInvalid($from, $to) {
  testInvalidString($from, $to, 'ISO-2022-JP-2004', 'UTF-16BE');
}

/* Try all ASCII characters */
for ($i = 0; $i <= 0x7F; $i++) {
  if ($i === 0x1B)
    continue;
  testValid(chr($i), "\x00" . chr($i));
}

/* Try all ASCII characters, with explicit ASCII escape */
for ($i = 0; $i <= 0x7F; $i++) {
  if ($i === 0x1B)
    continue;
  testValid("\x1B(B" . chr($i), "\x00" . chr($i));
}

echo "Encoding verification and conversion works for all ASCII characters\n";

for ($i = 0x80; $i <= 0x9F; $i++) {
  convertInvalidString("\x00" . chr($i), "%", 'UTF-16BE', 'ISO-2022-JP-2004');
}

echo "Codepoints from U+0080-009F are rejected\n";

/* Try a bare ESC */
identifyInvalidString("\x1B", 'ISO-2022-JP-2004');

/* Try all non-ASCII, non-ESC single bytes */
for ($i = 0x80; $i <= 0xFF; $i++) {
  testInvalid(chr($i), "\x00%");
}

echo "Encoding verification and conversion rejects all invalid single bytes\n";

/* All valid JISX0208 characters */
foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
  /* Since JIS X 0213 charset is a superset of JIS X 0208, we don't bother
   * using JIS X 0208 when converting Unicode to ISO-2022-JP-2004
   * Therefore, don't test conversion in both directions here */
  testValid("\x1B\$B" . $jisx0208, $utf16BE, false);
}

/* All invalid 1-byte JISX0208 characters */
for ($i = 0; $i < 256; $i++) {
  if ($i === 0x1B)
    continue;
  /* 0x21-0x7E are first bytes of 2-byte characters; covered by the next loop */
  if ($i >= 0x21 && $i <= 0x7E)
    continue;
  $testString = chr($i);
  if (!isset($jisx0208Chars[$testString])) {
    testInvalid("\x1B\$B" . $testString, "\x00%");
  }
}

/* All invalid 2-byte JISX0208 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
  for ($j = 0; $j < 256; $j++) {
    $testString = chr($i) . chr($j);
    if (!isset($jisx0208Chars[$testString])) {
      testInvalid("\x1B\$B" . $testString, "\x00%");
    }
  }
}

echo "Encoding verification and conversion work on JISX-0208 characters\n";

/* All JISX0213 plane 1 characters */
foreach ($jisx0213_2004_1Chars as $jisx0213_2004 => $utf16BE) {
  /* For single bytes, don't try conversion in both directions */
  testValid("\x1B$(Q" . $jisx0213_2004, $utf16BE, $utf16BE > "\x01\x00");
}

/* All invalid 2-byte JISX0213 plane 1 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
  for ($j = 0; $j < 256; $j++) {
    $testString = chr($i) . chr($j);
    if (!isset($jisx0213_2004_1Chars[$testString])) {
      testInvalid("\x1B$(Q" . $testString, "\x00%");
    }
  }
}

echo "Encoding verification and conversion work on JISX-0213:2004 plane 1 characters\n";

/* All JISX0213 plane 2 characters */
foreach ($jisx0213_2000_2Chars as $jisx0213_2000 => $utf16BE) {
  /* For single bytes, don't try conversion in both directions */
  testValid("\x1B$(P" . $jisx0213_2000, $utf16BE, $utf16BE > "\x01\x00");
}

/* All invalid 2-byte JISX0213 plane 2 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
  for ($j = 0; $j < 256; $j++) {
    $testString = chr($i) . chr($j);
    if (!isset($jisx0213_2000_2Chars[$testString])) {
      testInvalid("\x1B$(P" . $testString, "\x00%");
    }
  }
}

echo "Encoding verification and conversion work on JISX-0213:2000 plane 2 characters\n";

/* All possible escape sequences */
$validEscapes = ["\x1B\$B" => true, "\x1B(B" => true, "\x1B$(Q" => true, "\x1B$(P" => true];
for ($i = 0; $i <= 0xFF; $i++) {
  for ($j = 0; $j <= 0xFF; $j++) {
    $escapeSequence = "\x1B" . chr($i) . chr($j);
    if (isset($validEscapes[$escapeSequence])) {
      testValid($escapeSequence, "", false);
    } else {
      identifyInvalidString($escapeSequence, 'ISO-2022-JP-2004');
    }
  }
}

echo "All escape sequences work as expected\n";

identifyInvalidString("\x1B$", 'ISO-2022-JP-2004');
identifyInvalidString("\x1B(", 'ISO-2022-JP-2004');
identifyInvalidString("\x1B$(", 'ISO-2022-JP-2004');

echo "All incomplete escape sequences are rejected\n";

/* Try all combinations of 2 different charsets in the same string
 * NOTE(review): indexes 0-1000 are assumed to fall within the entries loaded
 * from the data files (the single-byte entries were appended afterwards), so
 * each pick should be a 2-byte character; the truncation tests further down
 * rely on that */
$ascii = "\x1B(Ba";
$jisx0208 = "\x1B\$B" . array_keys($jisx0208Chars)[rand(0,1000)];
$jisx0213_1 = "\x1B$(Q" . array_keys($jisx0213_2004_1Chars)[rand(0,1000)];
$jisx0213_2 = "\x1B$(P" . array_keys($jisx0213_2000_2Chars)[rand(0,1000)];
$differentCharsets = [$ascii, $jisx0208, $jisx0213_1, $jisx0213_2];
foreach ($differentCharsets as $a) {
  foreach ($differentCharsets as $b) {
    identifyValidString($a . $b, 'ISO-2022-JP-2004');
  }
}

/* Try redundant escape sequences (switching mode but not including any
 * characters in the new mode) */
$ascii_Esc = "\x1B(B";
$jisx0208_Esc = "\x1B\$B";
$jisx0213_1_Esc = "\x1B$(Q";
$jisx0213_2_Esc = "\x1B$(P";
$differentCharsets = [$ascii_Esc, $jisx0208_Esc, $jisx0213_1_Esc, $jisx0213_2_Esc];
foreach ($differentCharsets as $a) {
  foreach ($differentCharsets as $b) {
    testValid($a . $b, "", false);
  }
}

echo "Combining multiple charsets in the same string works as expected\n";

/* Try ending in the middle of a JISX0208 character */
testInvalid(substr($jisx0208, 0, strlen($jisx0208) - 1), "\x00%");

/* Try ending in the middle of a JISX0213 plane 1 character */
testInvalid(substr($jisx0213_1, 0, strlen($jisx0213_1) - 1), "\x00%");

/* Try ending in the middle of a JISX0213 plane 2 character */
testInvalid(substr($jisx0213_2, 0, strlen($jisx0213_2) - 1), "\x00%");

echo "Strings with truncated multi-byte characters are rejected\n";

/* We have tried converting all kinds of strings with single characters;
 * now try some random examples of strings with multiple characters */
$jisx0208 = array_keys($jisx0208Chars);
shuffle($jisx0208);
$jisx0213_1 = array_keys($jisx0213_2004_1Chars);
shuffle($jisx0213_1);
$jisx0213_2 = array_keys($jisx0213_2000_2Chars);
shuffle($jisx0213_2);

for ($i = 0; $i < 100; $i++) {
  $size = rand(5,20);
  $testString = '';
  $convertsTo = '';

  /* Build a string from a random combination of characters in the supported
   * character sets */
  while ($size--) {
    /* Values 3 and 4 both fall through to the final (plane 2) branch */
    $type = rand(0,4);
    $chars = rand(0,10);
    if ($type === 0) { /* ASCII */
      $testString .= "\x1B(B";
      while ($chars--) {
        $ascii = chr(rand(0x20, 0x7E));
        $testString .= $ascii;
        $convertsTo .= "\x00" . $ascii;
      }
    } else if ($type === 1) { /* JIS X 0208 */
      $testString .= "\x1B\$B";
      while ($chars--) {
        $jis = array_pop($jisx0208);
        $testString .= $jis;
        $convertsTo .= $jisx0208Chars[$jis];
      }
    } else if ($type === 2) { /* JIS X 0213:2004 plane 1 */
      $testString .= "\x1B$(Q";
      while ($chars--) {
        $jis = array_pop($jisx0213_1);
        $testString .= $jis;
        $convertsTo .= $jisx0213_2004_1Chars[$jis];
      }
    } else { /* JIS X 0213:2000 plane 2 */
      $testString .= "\x1B$(P";
      /* Stop early if the shuffled pool of plane 2 characters runs out */
      while ($chars-- && !empty($jisx0213_2)) {
        $jis = array_pop($jisx0213_2);
        $testString .= $jis;
        $convertsTo .= $jisx0213_2000_2Chars[$jis];
      }
    }
  }

  testValid($testString, $convertsTo, false);
}

// Regression test: Test handling of 0x80-0x9F; these have a special meaning in EUC-JP-2004,
// but not in ISO-2022-JP-2004
for ($i = 0x80; $i <= 0x9F; $i++)
  convertInvalidString(chr($i), "%", "ISO-2022-JP-2004", "UTF-8");

// Regression test: Codepoint which has a special representation in EUC-JP-2004
convertInvalidString("\xFF\x95", "%", "UTF-16BE", "ISO-2022-JP-2004");

// Regression test: Old implementation did not switch properly between JIS X 0213 plane 1
// and plane 2
// So try a character which is in plane 1 followed by one in plane 2
testValidString("\x30\x00\x4E\x02", "\x1B\$(Q\x21\x21\x1B\$(P\x21\x22\x1B(B", "UTF-16BE", "ISO-2022-JP-2004");
// Try plane 2 followed by plane 1
testValidString("\x4E\x02\x30\x00", "\x1B\$(P\x21\x22\x1B\$(Q\x21\x21\x1B(B", "UTF-16BE", "ISO-2022-JP-2004");

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "%", "ISO-2022-JP-2004", "UTF-8");
convertInvalidString("\x1B\$(X", "%", "ISO-2022-JP-2004", "UTF-8"); // Invalid escape
convertInvalidString("\x1B\$B!", "%", "ISO-2022-JP-2004", "UTF-8"); // Truncated character

// Test sequences of 2 Unicode codepoints which convert to a single character in ISO-2022-JP-2004
testValidString("\x02\x54\x03\x00", "\x1B\$(Q+H\x1B(B", "UTF-16BE", "ISO-2022-JP-2004");
// Including the case where such a codepoint is followed by one which it can't combine with
testValidString("\x02\x54\x00A", "\x1B\$(Q+8\x1B(BA", "UTF-16BE", "ISO-2022-JP-2004");

echo "All done!\n";

?>
--EXPECT--
Encoding verification and conversion works for all ASCII characters
Codepoints from U+0080-009F are rejected
Encoding verification and conversion rejects all invalid single bytes
Encoding verification and conversion work on JISX-0208 characters
Encoding verification and conversion work on JISX-0213:2004 plane 1 characters
Encoding verification and conversion work on JISX-0213:2000 plane 2 characters
All escape sequences work as expected
All incomplete escape sequences are rejected
Combining multiple charsets in the same string works as expected
Strings with truncated multi-byte characters are rejected
All done!