--TEST--
Exhaustive test of ISO-2022-JP-KDDI text encoding
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(390); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Convert CP932's default Shift-JIS representation to kuten code
 *
 * $bytes is the 2-byte Shift-JIS value as an integer (first byte in bits 8-15,
 * second byte in bits 0-7). The result is likewise a 16-bit integer; callers
 * below pack it with pack('n', ...) and read the ku from its high byte. */
function shiftJISDecode($bytes) {
  $first = ($bytes >> 8) & 0xFF;
  $second = $bytes & 0xFF;
  /* First bytes above 0x9F are rebased from 0xE0 (offset by 31), all others from 0x81 */
  $hi_bits = $first - (($first > 0x9F) ? 0xE0 - 31 : 0x81);
  if ($second > 0x9E) {
    /* Second bytes above 0x9E select the next (+0x22 rather than +0x21) high byte */
    $kuten = ((($hi_bits << 1) + 0x22) << 8) + ($second - 0x9F + 0x21);
  } else if ($second > 0x7F) {
    /* Second bytes 0x80-0x9E continue 63 positions after the 0x40-based range */
    $kuten = ((($hi_bits << 1) + 0x21) << 8) + ($second - 0x80 + 63 + 0x21);
  } else {
    $kuten = ((($hi_bits << 1) + 0x21) << 8) + ($second - 0x40 + 0x21);
  }
  return $kuten;
}

/* Read in the table of all characters in CP932 */
$cp932Chars = array(); /* CP932 string -> UTF-32BE string */
/* The data files are only ever read, so open them read-only; 'r+' would
 * needlessly require write permission on the test fixtures */
$fp = fopen(realpath(__DIR__ . '/data/CP932.txt'), 'r');
/* Compare against false explicitly so that only EOF/error ends the loop */
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $bytes, $codepoint) === 2) {
    /* Single-byte entries are not part of the 2-byte table */
    if ($bytes < 256)
      continue;
    /* For ISO-2022-JP-KDDI, we only accept the first range of Microsoft
     * vendor extensions, in ku 13 */
    if ($bytes > 0xEAA4)
      continue;
    $cp932Chars[pack('n', shiftJISDecode($bytes))] = pack('N', $codepoint);
  }
}
fclose($fp);

/* Add KDDI-specific emoji to the CP932 characters
 * They are mapped in 22 ku (or 'rows') above the places where they are mapped
 * in the Shift-JIS representation of KDDI emoji */
$fp = fopen(realpath(__DIR__ . '/data/EmojiSources.txt'), 'r');
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;
  $fields = explode(';', rtrim($line));
  if (count($fields) >= 4) {
    /* The first field holds either two space-separated hex codepoints or a single one */
    if (sscanf($fields[0], "%x %x", $cp1, $cp2) === 2)
      $utf32 = pack('N', $cp1) . pack('N', $cp2);
    else
      $utf32 = pack('N', hexdec($fields[0]));

    /* The third field is the Shift-JIS code used for this emoji; skip entries without one */
    if ($fields[2]) {
      $kuten = shiftJISDecode(hexdec($fields[2]));
      $ku = $kuten >> 8;
      if ($ku >= 106 && $ku <= 112)
        $cp932Chars[pack('n', $kuten - (22 * 0x100))] = $utf32;
    }
  }
}
fclose($fp);

/* Duplicate mappings for the same characters in CP932 */
$nonInvertible = array();
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C] as $i) {
  $bytes = pack('n', shiftJISDecode($i));
  $nonInvertible[$bytes] = $cp932Chars[$bytes];
}

/* Read in table of all characters in JISX-0201 charset */
$jisx0201Chars = array(); /* JISX0201 -> UTF-32BE */
$fp = fopen(realpath(__DIR__ . '/data/JISX0201.txt'), 'r');
while (($line = fgets($fp, 256)) !== false) {
  if ($line[0] === '#')
    continue;

  if (sscanf($line, "0x%x\t0x%x", $byte, $codepoint) === 2)
    $jisx0201Chars[chr($byte)] = pack('N', $codepoint);
}
fclose($fp);

/* Check a valid string in $encoding against its UTF-32BE equivalent
 *
 * $from is passed to identifyValidString() and converted to UTF-32BE, with $to
 * as the expected result. When $bothWays is true, $to is also converted back
 * from UTF-32BE to $encoding and compared against a normalized form of $from. */
function testValid($from, $to, $encoding, $bothWays = true) {
  identifyValidString($from, $encoding);
  convertValidString($from, $to, $encoding, 'UTF-32BE', false);

  if ($bothWays) {
    /* ESC ( B at the beginning is redundant, since ASCII mode is the default */
    if (substr($from, 0, 3) === "\x1B(B")
      $from = substr($from, 3);
    /* If the string switches to a different charset, it should switch back to
     * ASCII at the end */
    if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(I") !== false || strpos($from, "\x1B\$@") !== false || strpos($from, "\x1B\$(B") !== false)
      $from .= "\x1B(B";

    convertValidString($to, $from, 'UTF-32BE', $encoding, false);
  }
}

/* Check an invalid string in $encoding; $to is the expected UTF-32BE output
 * passed through to testInvalidString() */
function testInvalid($from, $to, $encoding) {
  testInvalidString($from, $to, $encoding, 'UTF-32BE');
}

/* Every 7-bit byte except ESC, bare and after the ESC ( B and ESC ( J escapes */
for ($i = 0; $i < 0x80; $i++) {
  if ($i === 0x1B)
    continue;
  testValid(chr($i), "\x00\x00\x00" . chr($i), 'ISO-2022-JP-KDDI');
  testValid("\x1B(B" . chr($i), "\x00\x00\x00" . chr($i), 'ISO-2022-JP-KDDI', false);
  testValid("\x1B(J" . chr($i), "\x00\x00\x00" . chr($i), 'ISO-2022-JP-KDDI', false);
}

for ($i = 0x80; $i < 256; $i++) {
  if ($i >= 0xA1 && $i <= 0xDF) // We convert single bytes from 0xA1-0xDF as JIS X 0201 kana
    continue;
  testInvalid(chr($i), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
  testInvalid("\x1B(B" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
  testInvalid("\x1B(J" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
}

echo "ASCII support OK\n";

/* All valid JIS X 0201 characters
 * Those with a 1 in the high bit are JIS X 0201 kana */
foreach ($jisx0201Chars as $jisx0201 => $utf32BE) {
  if (ord($jisx0201) >= 128) {
    /* After ESC ( I the kana is sent with the high bit cleared */
    $kana = chr(ord($jisx0201) - 128);
    testValid("\x1B(I" . $kana, $utf32BE, 'ISO-2022-JP-KDDI', false);
    testValid($jisx0201, $utf32BE, 'ISO-2022-JP-KDDI', false);
  }
}

for ($i = 0x80; $i < 256; $i++) {
  if ($i >= 0xA1 && $i <= 0xDF)
    continue;
  testInvalid("\x1B(I" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
  testInvalid("\x1B(J" . chr($i), "\x00\x00\x00%", 'ISO-2022-JP-KDDI');
}

echo "JIS X 0201 support OK\n";

$validChars = $cp932Chars;
/* We allow ASCII/JIS X 0201 characters to appear even in JIS X 0208 mode */
for ($i = 0; $i <= 0x7F; $i++)
  $validChars[chr($i)] = chr($i);
for ($i = 0xA1; $i <= 0xDF; $i++)
  $validChars[chr($i)] = $jisx0201Chars[chr($i)];
/* First bytes 0x81-0x9F and 0xE0-0xFC are given a length of 2 */
$lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
findInvalidChars($validChars, $invalidChars, $truncatedChars, $lenTable);

/* The duplicate mappings cannot be tested both ways; they get their own pass below */
foreach ($nonInvertible as $bytes => $char)
  unset($cp932Chars[$bytes]);

/* Test the remaining characters both ways, in random runs of up to 5-10 characters */
$good = array_keys($cp932Chars);
shuffle($good);
while (!empty($good)) {
  $length = min(rand(5,10), count($good));
  $from = $to = '';
  while ($length--) {
    $goodChar = array_pop($good);
    $from .= $goodChar;
    $to .= $cp932Chars[$goodChar];
  }
  testValid("\x1B\$B" . $from, $to, 'ISO-2022-JP-KDDI');
}

/* Same again for the duplicate mappings, but only in the ISO-2022-JP-KDDI -> UTF-32BE direction */
$good = array_keys($nonInvertible);
shuffle($good);
while (!empty($good)) {
  $length = min(rand(5,10), count($good));
  $from = $to = '';
  while ($length--) {
    $goodChar = array_pop($good);
    $from .= $goodChar;
    $to .= $nonInvertible[$goodChar];
  }
  testValid("\x1B\$B" . $from, $to, 'ISO-2022-JP-KDDI', false);
}

foreach (array_keys($invalidChars) as $invalid) {
  $firstByte = ord($invalid[0]);
  if (($firstByte > 0x80 && $firstByte < 0xA0) || $firstByte >= 0xE0) {
    /* The first byte of this 2-byte character will be rejected and result in % being sent
     * to the output. Then the second byte will do something else. It is easier to write the
     * test if we only check with the 1st byte. */
    testInvalidString("\x1B\$B" . $invalid[0], "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
  } else {
    testInvalidString("\x1B\$B" . $invalid, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');
  }
}

foreach (array_keys($truncatedChars) as $truncated)
  testInvalidString("\x1B\$B" . $truncated, "\x00\x00\x00%", 'ISO-2022-JP-KDDI', 'UTF-32BE');

echo "JIS X 0208 (with MS extensions) and KDDI emoji support OK\n";

testValidString("\x00\xA5", "\x1B\$B!o\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);
testValidString("\x20\x3E", "\x1B\$B!1\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);
testValidString("\xFF\x5E", "\x1B\$B!A\x1B(B", "UTF-16BE", "ISO-2022-JP-KDDI", false);

echo "Other mappings from Unicode -> ISO-2022-JP-KDDI OK\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\xE0", "%", "ISO-2022-JP-KDDI", "UTF-8");
// Invalid escapes:
convertInvalidString("\x1B", "%", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B.", "%", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$", "%", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$.", "%", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$(X", "%", "ISO-2022-JP-KDDI", "UTF-8");
convertInvalidString("\x1B\$B\x9F", "%", "ISO-2022-JP-KDDI", "UTF-8"); // 0x9F does not start any 2-byte character
convertInvalidString("\xE0\x00", "U+E000", "UTF-16BE", "ISO-2022-JP-KDDI");

echo "Done!\n";
?>
--EXPECT--
ASCII support OK
JIS X 0201 support OK
JIS X 0208 (with MS extensions) and KDDI emoji support OK
Other mappings from Unicode -> ISO-2022-JP-KDDI OK
Done!