--TEST--
Test of ASCII and KS X 1001-1992 support in ISO-2022-KR encoding
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

// $ksxChars is iterated below as (KS X 1001 bytes => UTF-16BE bytes);
// the third output argument is not used by this test
readConversionTable(__DIR__ . '/data/KSX1001.txt', $ksxChars, $unused);

/**
 * Check that $from is identified as valid ISO-2022-KR and converts to the
 * UTF-16BE string $to.
 *
 * When $bothWays is true, the reverse conversion (UTF-16BE -> ISO-2022-KR) is
 * checked as well. For that check $from is first normalized: a redundant
 * leading 0x0F is dropped, a trailing 0x0F is appended if the string contains
 * 0x0E but does not end in 0x0F, and the "ESC $ ) C" sequence is prepended to
 * any non-empty string which does not already contain it.
 *
 * @param string $from     ISO-2022-KR encoded input
 * @param string $to       Expected UTF-16BE result
 * @param bool   $bothWays Whether to also test converting $to back
 */
function testValid($from, $to, $bothWays = true) {
    identifyValidString($from, 'ISO-2022-KR');
    convertValidString($from, $to, 'ISO-2022-KR', 'UTF-16BE', false);

    if (!$bothWays) {
        return;
    }

    /* 0xF at the beginning of an ISO-2022 string is redundant; it switches
     * to ASCII mode, but ASCII mode is default */
    if (strlen($from) > 0 && $from[0] === "\x0F") {
        $from = substr($from, 1);
    }
    /* If the string switches to a different charset, it should switch back to
     * ASCII at the end */
    if (strpos($from, "\x0E") !== false && $from[-1] !== "\x0F") {
        $from .= "\x0F";
    }
    /* Any non-empty string is expected to carry the ESC $ ) C sequence */
    if (strpos($from, "\x1B\$)C") === false && $from !== '') {
        $from = "\x1B\$)C" . $from;
    }

    convertValidString($to, $from, 'UTF-16BE', 'ISO-2022-KR', false);
}

/**
 * Run testInvalidString() for an ISO-2022-KR input, with UTF-16BE as the
 * target encoding.
 *
 * @param string $from ISO-2022-KR input which is expected to be invalid
 * @param string $to   Expected UTF-16BE output
 */
function testInvalid($from, $to) {
    testInvalidString($from, $to, 'ISO-2022-KR', 'UTF-16BE');
}

testValid("", "");
echo "Empty string OK\n";

/* 7-bit bytes, except the three bytes which have special meaning in this
 * encoding (0xE, 0xF and ESC), map straight to the same codepoint */
for ($i = 0; $i < 0x80; $i++) {
    if ($i === 0xE || $i === 0xF || $i === 0x1B) {
        continue;
    }
    testValid(chr($i), "\x00" . chr($i));
    testValid("\x0F" . chr($i), "\x00" . chr($i)); /* 0xF is 'Shift In' code */
}

/* Bytes with the high bit set are rejected */
for ($i = 0x80; $i < 256; $i++) {
    testInvalid(chr($i), "\x00%");
    testInvalid("\x0F" . chr($i), "\x00%");
}

echo "ASCII support OK\n";

foreach ($ksxChars as $ksx => $utf16BE) {
    testValid("\x0E" . $ksx, $utf16BE, false);
    testValid("\x1B$)C\x0E" . $ksx, $utf16BE, false);
    testValid("\x1B$)C\x0E" . $ksx . "\x0F", $utf16BE);
}

findInvalidChars($ksxChars, $invalidKsx, $truncatedKsx);

foreach (array_keys($invalidKsx) as $badChar) {
    /* Skip sequences whose first byte is 0xE, 0xF or ESC */
    if ($badChar[0] === "\x0E" || $badChar[0] === "\x0F" || $badChar[0] === "\x1B") {
        continue;
    }
    testInvalid("\x1B$)C\x0E" . $badChar, "\x00%");
}

foreach (array_keys($truncatedKsx) as $badChar) {
    testInvalid("\x1B$)C\x0E" . $badChar, "\x00%");
}

echo "KS X 1001 support OK\n";

/* After a valid ESC sequence, we are still in ASCII mode; 'Shift Out' is needed to start KS X 1001 */
testValid("\x1B$)Cabc", "\x00a\x00b\x00c", false);

/* Test invalid and truncated ESC sequences */
testInvalid("\x1B", "\x00%");
testInvalid("\x1B$", "\x00%");
testInvalid("\x1B$)", "\x00%");

/* At each position of the ESC sequence, every byte other than the expected
 * one must be rejected */
for ($i = 0; $i < 256; $i++) {
    $byte = chr($i);
    if ($byte !== '$') {
        testInvalid("\x1B" . $byte, "\x00%");
    }
    if ($byte !== ')') {
        testInvalid("\x1B$" . $byte, "\x00%");
    }
    if ($byte !== 'C') {
        testInvalid("\x1B$)" . $byte, "\x00%");
    }
}

/* We can switch back and forth between ASCII and KS X 1001 */
testValid("\x0E\x0E\x0F\x0E\x0Fabc", "\x00a\x00b\x00c", false);

echo "Escapes behave as expected\n";

// Test switching between KS X 1001 and ASCII when converting Unicode -> ISO-2022-KR
convertValidString("\x76\x20\x00a\x00b", "\x1B$)C\x0E\x74\x30\x0Fab", "UTF-16BE", "ISO-2022-KR", false);

// Regression test: Our conversion table for KS X 1001 only goes up to 0x7D7E, but
// we previously accepted and tried to convert two-byte sequences starting with
// 0x7E, resulting in a failed assertion
convertInvalidString("\x0E~/", "%", "ISO-2022-KR", "UTF-8");

// Regression test: The old implementation would wrongly convert some codepoints
// which are not in KS X 1001 at all to 'random' characters in KS X 1001
convertInvalidString("\xFF\x86", "\x1B\$)C%", "UTF-16BE", "ISO-2022-KR");

// Regression test: The old implementation would sometimes emit an extra 0x0F ('shift in')
// character at the end of a string, although the string was already ending in ASCII mode
convertValidString("\x68\x46\x00a", "\x1B\$)C\x0E\x68\x46\x0Fa", "UTF-16BE", "ISO-2022-KR", false);

// Regression test: Don't shift from KS X 1001 to ASCII mode on invalid escape sequence
convertInvalidString("\x0E\x1BX\x74\x30", "\x00%\x76\x20", "ISO-2022-KR", "UTF-16BE", false);

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x1B", "%", "ISO-2022-KR", "UTF-8");
convertInvalidString("\x1B$", "%", "ISO-2022-KR", "UTF-8");
convertInvalidString("\x1B$)", "%", "ISO-2022-KR", "UTF-8");
convertInvalidString("\x1B$)C\x0E\x7C\x84", "%", "ISO-2022-KR", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Empty string OK
ASCII support OK
KS X 1001 support OK
Escapes behave as expected
Done!