--TEST--
Exhaustive test of Shift-JIS DoCoMo, KDDI, SoftBank encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(818); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in Windows-932
 * (The SJIS-Mobile encodings all use MS extensions) */
readConversionTable(__DIR__ . '/data/CP932.txt', $sjisChars, $fromUnicode, true);

/* U+301C (WAVE DASH) converts to SJIS 0x8160 (WAVE DASH) */
$fromUnicode["\x00\x00\x30\x1C"] = "\x81\x60";
/* U+2212 (MINUS SIGN) converts to SJIS 0x817C (FULLWIDTH HYPHEN-MINUS) */
$fromUnicode["\x00\x00\x22\x12"] = "\x81\x7C";
/* U+203E (OVERLINE) converts to SJIS 0x8150 (FULLWIDTH MACRON) */
$fromUnicode["\x00\x00\x20\x3E"] = "\x81\x50";
/* U+2016 (DOUBLE VERTICAL LINE) converts to SJIS 0x8161 (PARALLEL TO) */
$fromUnicode["\x00\x00\x20\x16"] = "\x81\x61";
/* U+00AF (MACRON) converts to SJIS 0x8150 (FULLWIDTH MACRON) */
$fromUnicode["\x00\x00\x00\xAF"] = "\x81\x50";
/* U+00AC (NOT SIGN) converts to SJIS 0x81CA (FULLWIDTH NOT SIGN) */
$fromUnicode["\x00\x00\x00\xAC"] = "\x81\xCA";
/* U+00A5 (YEN SIGN) converts to SJIS 0x818F (FULLWIDTH YEN SIGN) */
$fromUnicode["\x00\x00\x00\xA5"] = "\x81\x8F";
/* U+00A3 (POUND SIGN) converts to SJIS 0x8192 (FULLWIDTH POUND SIGN) */
$fromUnicode["\x00\x00\x00\xA3"] = "\x81\x92";
/* U+00A2 (CENT SIGN) converts to SJIS 0x8191 (FULLWIDTH CENT SIGN) */
$fromUnicode["\x00\x00\x00\xA2"] = "\x81\x91";

/* Aside from the characters in that table, we also support a 'user' area
 * from 0xF040-0xF9FC, which map to Unicode 'private' codepoints 0xE000-E757 */
$codepoint = 0xE000;
for ($i = 0xF0; $i <= 0xF9; $i++) {
    for ($j = 0x40; $j <= 0xFC; $j++) {
        /* 0x7F is skipped as a trail byte and consumes no codepoint */
        if ($j === 0x7F) {
            continue;
        }
        $utf32 = pack('N', $codepoint);
        $cp932 = chr($i) . chr($j);
        $sjisChars[$cp932] = $utf32;
        $fromUnicode[$utf32] = $cp932;
        $codepoint++;
    }
}

/* Every BMP codepoint with no entry in $fromUnicode is recorded here, keyed
 * by its UTF-32BE bytes */
$invalidCodepoints = [];
for ($i = 0; $i <= 0xFFFF; $i++) {
    $cp = pack('N', $i);
    if (!isset($fromUnicode[$cp])) {
        $invalidCodepoints[$cp] = true;
    }
}

/* Windows-932 has many cases where two different kuten codes map to the same
 * Unicode codepoints
 *
 * Everything from 0xED00-0xEEFF falls in this unfortunate category
 * (Other sequences in 0xFA00-0xFC4B map to the same codepoints.)
 * Our implementation of CP932 prefers the F's, but for SJIS-Mobile,
 * we prefer the E's */
$nonInvertible = [];
for ($i = 0xFA00; $i <= 0xFC4B; $i++) {
    $bytes = pack('n', $i);
    if (isset($sjisChars[$bytes])) {
        $nonInvertible[$bytes] = $sjisChars[$bytes];
        unset($fromUnicode[$sjisChars[$bytes]]);
    }
}

/* Other "collisions" */
foreach ([0x8790, 0x8791, 0x8792, 0x8795, 0x8796, 0x8797, 0x879A, 0x879B, 0x879C, 0xEEF9] as $i) {
    $bytes = pack('n', $i);
    $nonInvertible[$bytes] = $sjisChars[$bytes];
    unset($fromUnicode[$sjisChars[$bytes]]);
}

$nonInvertibleSoftbank = $nonInvertible;
$nonInvertibleDocomo = $nonInvertible;

/* Now read table of vendor-specific emoji encodings */
$docomo = $sjisChars;
$kddi = $sjisChars;
$softbank = $sjisChars;
$sbEmoji = [];
/* The table is only ever read, so open it read-only; 'r+' would needlessly
 * require write permission on the test data */
$emojiSourcesPath = __DIR__ . '/data/EmojiSources.txt';
$fp = fopen($emojiSourcesPath, 'r');
if ($fp === false) {
    die("Could not open $emojiSourcesPath\n");
}
/* Compare against false explicitly, so that only EOF or a read error ends
 * the loop */
while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] === '#') {
        continue;
    }
    /* Fields used: Unicode codepoint(s); DoCoMo; KDDI; SoftBank (all in hex) */
    $fields = explode(';', rtrim($line));
    if (count($fields) >= 4) {
        if (sscanf($fields[0], "%x %x", $cp1, $cp2) === 2) {
            /* Emoji which is represented by a sequence of two codepoints */
            $utf32 = pack('N', $cp1) . pack('N', $cp2);
        } else {
            $utf32 = pack('N', hexdec($fields[0]));
            unset($invalidCodepoints[$utf32]);
        }

        if ($fields[1]) {
            $docomo[pack('n', hexdec($fields[1]))] = $utf32;
        }
        if ($fields[2]) {
            $kddi[pack('n', hexdec($fields[2]))] = $utf32;
        }
        if ($fields[3]) {
            $bytes = pack('n', hexdec($fields[3]));
            $sbEmoji[$bytes] = $utf32;
            unset($nonInvertibleSoftbank[$bytes]);
        }
    }
}
fclose($fp);

/* Other, vendor-specific emoji which do not appear in EmojiSources.txt
 * Most of these don't exist in Unicode and have been mapped to 'private
 * area' codepoints */
$docomo["\xF9\x4A"] = "\x00\x0F\xEE\x16"; // PIAS PI
$docomo["\xF9\x4B"] = "\x00\x0F\xEE\x17"; // PIAS A
$docomo["\xF9\x4C"] = "\x00\x0F\xEE\x18"; // INVERSE TICKET
$docomo["\xF9\x4D"] = "\x00\x0F\xEE\x19"; // KATAKANA ABBREVIATION FOR TICKET ("chi ke")
$docomo["\xF9\x4E"] = "\x00\x0F\xEE\x1A"; // RESERVE BY PHONE
$docomo["\xF9\x4F"] = "\x00\x0F\xEE\x1B"; // P CODE
$docomo["\xF9\x53"] = "\x00\x0F\xEE\x1C"; // MOVIES 2
$docomo["\xF9\x54"] = "\x00\x0F\xEE\x1D"; // PIAS PI INVERSE
$docomo["\xF9\x58"] = "\x00\x0F\xEE\x1E"; // PIAS PI CIRCLE
$docomo["\xF9\x59"] = "\x00\x0F\xEE\x1F"; // PIAS PI SQUARE
$docomo["\xF9\x5A"] = "\x00\x0F\xEE\x20"; // CHECK
$docomo["\xF9\x5F"] = "\x00\x0F\xEE\x21"; // F
$docomo["\xF9\x60"] = "\x00\x0F\xEE\x22"; // D
$docomo["\xF9\x61"] = "\x00\x0F\xEE\x23"; // S
$docomo["\xF9\x62"] = "\x00\x0F\xEE\x24"; // C
$docomo["\xF9\x63"] = "\x00\x0F\xEE\x25"; // R
$docomo["\xF9\x64"] = "\x00\x00\x25\xEA"; // SQUARE WITH LOWER RIGHT DIAGONAL HALF BLACK
$nonInvertibleDocomo["\xF9\x64"] = "\x00\x00\x25\xEA";
$docomo["\xF9\x65"] = "\x00\x00\x25\xA0"; // BLACK SQUARE
$nonInvertibleDocomo["\xF9\x65"] = "\x00\x00\x25\xA0";
$docomo["\xF9\x66"] = "\x00\x00\x25\xBF"; // DOWNWARD TRIANGLE
$nonInvertibleDocomo["\xF9\x66"] = "\x00\x00\x25\xBF";
/* TODO: test that FEE28 converts to F966, for backwards compatibility */
$docomo["\xF9\x67"] = "\x00\x0F\xEE\x29"; // QUADRUPLE DAGGER
$docomo["\xF9\x68"] = "\x00\x0F\xEE\x2A"; // TRIPLE DAGGER
$docomo["\xF9\x69"] = "\x00\x0F\xEE\x2B"; // DOUBLE DAGGER
$docomo["\xF9\x6A"] = "\x00\x00\x20\x20"; // DAGGER
$nonInvertibleDocomo["\xF9\x6A"] = "\x00\x00\x20\x20";
/* TODO: test that FEE2C converts to F96A, for backwards compatibility */
$docomo["\xF9\x6B"] = "\x00\x0F\xEE\x2D"; // I (meaning "inexpensive")
$docomo["\xF9\x6C"] = "\x00\x0F\xEE\x2E"; // M (meaning "moderate")
$docomo["\xF9\x6D"] = "\x00\x0F\xEE\x2F"; // E (meaning "expensive")
$docomo["\xF9\x6E"] = "\x00\x0F\xEE\x30"; // VE (meaning "very expensive")
$docomo["\xF9\x6F"] = "\x00\x0F\xEE\x31"; // SPHERE
$docomo["\xF9\x70"] = "\x00\x0F\xEE\x32"; // CREDIT CARDS NOT ACCEPTED
$docomo["\xF9\x71"] = "\x00\x0F\xEE\x33"; // CHECKBOX
$docomo["\xF9\x75"] = "\x00\x0F\xEE\x10"; // I-MODE
$docomo["\xF9\x76"] = "\x00\x0F\xEE\x11"; // I-MODE WITH FRAME
$docomo["\xF9\x78"] = "\x00\x0F\xEE\x12"; // PROVIDED BY DOCOMO
$docomo["\xF9\x79"] = "\x00\x0F\xEE\x13"; // DOCOMO POINT
$docomo["\xF9\x84"] = "\x00\x00\x27\xBF"; // FREE DIAL; mapped to DOUBLE CURLY LOOP
unset($invalidCodepoints["\x00\x00\x27\xBF"]);
$docomo["\xF9\x86"] = "\x00\x0F\xE8\x2D"; // MOBILE Q
$docomo["\xF9\xB1"] = "\x00\x0F\xEE\x14"; // I-APPLI
$docomo["\xF9\xB2"] = "\x00\x0F\xEE\x15"; // I-APPLI WITH BORDER

$kddi["\xF7\x94"] = "\x00\x0F\xEE\x40"; // EZ WEB
$kddi["\xF7\xCF"] = "\x00\x0F\xEE\x41"; // EZ PLUS
$kddi["\xF3\x70"] = "\x00\x0F\xEE\x42"; // EZ NAVIGATION
$kddi["\xF4\x78"] = "\x00\x0F\xEE\x43"; // EZ MOVIE
$kddi["\xF4\x86"] = "\x00\x0F\xEE\x44"; // CMAIL
$kddi["\xF4\x8E"] = "\x00\x0F\xEE\x45"; // JAVA (TM)
$kddi["\xF4\x8F"] = "\x00\x0F\xEE\x46"; // BREW
$kddi["\xF4\x90"] = "\x00\x0F\xEE\x47"; // EZ RING MUSIC
$kddi["\xF4\x91"] = "\x00\x0F\xEE\x48"; // EZ NAVI
$kddi["\xF4\x92"] = "\x00\x0F\xEE\x49"; // WIN
$kddi["\xF4\x93"] = "\x00\x0F\xEE\x4A"; // PREMIUM SIGN
$kddi["\xF7\x48"] = "\x00\x0F\xE8\x2D"; // MOBILE Q
$kddi["\xF7\xA3"] = "\x00\x0F\xE8\x3C"; // PDC ("personal digital cellular")
$kddi["\xF7\xD2"] = "\x00\x0F\xEB\x89"; // OPENWAVE

$sbEmoji["\xF7\xB1"] = "\x00\x00\x27\xBF"; // FREE DIAL; mapped to DOUBLE CURLY LOOP
$sbEmoji["\xF7\xF4"] = "\x00\x0F\xEE\x77"; // J-PHONE SHOP
$sbEmoji["\xF7\xF5"] = "\x00\x0F\xEE\x78"; // SKY WEB
$sbEmoji["\xF7\xF6"] = "\x00\x0F\xEE\x79"; // SKY WALKER
$sbEmoji["\xF7\xF7"] = "\x00\x0F\xEE\x7A"; // SKY MELODY
$sbEmoji["\xF7\xF8"] = "\x00\x0F\xEE\x7B"; // J-PHONE 1
$sbEmoji["\xF7\xF9"] = "\x00\x0F\xEE\x7C"; // J-PHONE 2
$sbEmoji["\xF7\xFA"] = "\x00\x0F\xEE\x7D"; // J-PHONE 3

/* SoftBank-specific 'JSky1', 'JSky2', 'VODAFONE1', 'VODAFONE2', etc. emoji,
 * which are not supported by Unicode */
for ($i = 0xFBD8; $i <= 0xFBDE; $i++) {
    $bytes = pack('n', $i);
    $sbEmoji[$bytes] = pack('N', 0xFEE70 + $i - 0xFBD8);
    unset($nonInvertibleSoftbank[$bytes]);
}
/* SoftBank-specific emoji for Shibuya department store */
$sbEmoji["\xFB\xAA"] = "\x00\x0F\xE4\xC5";
unset($nonInvertibleSoftbank["\xFB\xAA"]);

/* Overlay the SoftBank emoji on the base table. array_replace() is used
 * rather than array_merge() because PHP casts keys like "0".."9" to integers
 * and array_merge() renumbers integer keys; array_replace() keeps every key
 * exactly as it is. (Presumably the base table contains such single-digit
 * keys; verify against readConversionTable.) */
$softbank = array_replace($softbank, $sbEmoji);

/* For Softbank, we support an alternative representation for emoji which
 * uses sequences starting with ESC. Apparently this was used in older
 * versions of Softbank's phones.
 * ESC '$' could be followed by 6 different ASCII characters, each of which
 * represented a different ku code */
$escCodeToKu = ['G' => 0x91, 'E' => 0x8D, 'F' => 0x8E, 'O' => 0x92, 'P' => 0x95, 'Q' => 0x96];
$escCodeMaxTen = ['G' => 0x7A, 'E' => 0x7A, 'F' => 0x7A, 'O' => 0x6D, 'P' => 0x6C, 'Q' => 0x5E];

/**
 * Build the two-byte Shift-JIS sequence for a ku/ten pair.
 *
 * Both arguments are expected with a bias of 0x21, which is removed first.
 * The upper bits of the ku select the lead byte (counting up from 0x81, or
 * from 0xE0 once 31 lead bytes have been used); the lowest bit of the ku
 * selects which half of the trail byte range the ten falls in.
 *
 * @param int $ku  Row number, biased by 0x21
 * @param int $ten Cell number, biased by 0x21
 * @return string Two-byte sequence
 */
function shiftJISEncode(int $ku, int $ten): string {
    $ku -= 0x21;
    $ten -= 0x21;
    $hiBits = $ku >> 1;
    $loBit = $ku % 2;

    if ($hiBits < 31) {
        $sjis = chr($hiBits + 0x81);
    } else {
        $sjis = chr($hiBits - 31 + 0xE0);
    }

    if ($loBit !== 0) {
        return $sjis . chr($ten + 0x9F);
    }
    /* Trail bytes for even rows run from 0x40 up, skipping over 0x7F */
    if ($ten < 63) {
        return $sjis . chr($ten + 0x40);
    }
    return $sjis . chr($ten - 63 + 0x80);
}

/* Every SoftBank emoji which can be reached via an ESC sequence gets a second,
 * non-invertible representation: ESC '$' <code char> <ten> */
foreach ($escCodeToKu as $char => $ku) {
    for ($ten = 0x21; $ten <= $escCodeMaxTen[$char]; $ten++) {
        $sjis = shiftJISEncode($ku, $ten);
        if (isset($sbEmoji[$sjis])) {
            $bytes = "\x1B\$" . $char . chr($ten);
            $unicode = $softbank[$sjis];
            $nonInvertibleSoftbank[$bytes] = $softbank[$bytes] = $unicode;
        }
    }
}

/* A bare ESC is not valid for Softbank, since it is used for escape sequences
 * which represent emoji */
unset($softbank["\x1B"]);

/**
 * Run the full battery of checks against one SJIS-Mobile variant.
 *
 * Relies on the globals $fromUnicode, $invalidCodepoints and $escCodeToKu
 * built by the surrounding script, and on helper functions which are not
 * defined in this file (presumably they come from encoding_tests.inc).
 *
 * @param array  $validChars    Map of byte sequence in the variant => UTF-32BE string
 * @param array  $nonInvertible Same shape; these entries are removed from
 *                              $validChars and passed to testAllValidChars()
 *                              separately with an extra `false` argument
 *                              (presumably "do not test the reverse direction")
 * @param string $encoding      Name of the encoding under test
 */
function testSJISVariant($validChars, $nonInvertible, $encoding) {
    global $fromUnicode, $invalidCodepoints, $escCodeToKu;

    /* Lead bytes 0x81-0x9F and 0xE0-0xFC start a 2-byte character */
    $lenTable = array_fill_keys(range(0xE0, 0xFC), 2) + array_fill_keys(range(0x81, 0x9F), 2);
    findInvalidChars($validChars, $invalidChars, $truncated, $lenTable);

    foreach ($escCodeToKu as $char => $unused) {
        unset($invalidChars["\x1B\$" . $char . "\x0F"]);
        unset($truncated["\x1B\$" . $char]);
    }

    $escapes = [];
    foreach ($nonInvertible as $bytes => $unicode) {
        unset($validChars[$bytes]);
        if (substr($bytes, 0, 1) === "\x1B") {
            $escapes[] = $bytes;
        }
    }
    /* 0xF is used to terminate a run of emoji encoded using ESC sequence
     * We couldn't do this earlier or `findInvalidChars` wouldn't have worked
     * as desired */
    foreach ($escapes as $bytes) {
        $nonInvertible[$bytes . "\x0F"] = $nonInvertible[$bytes];
        unset($nonInvertible[$bytes]);
    }

    testAllValidChars($validChars, $encoding, 'UTF-32BE');
    testAllValidChars($nonInvertible, $encoding, 'UTF-32BE', false);
    echo "$encoding verification and conversion works on all valid characters\n";

    testAllInvalidChars($invalidChars, $validChars, $encoding, 'UTF-32BE', "\x00\x00\x00%");
    testTruncatedChars($truncated, $encoding, 'UTF-32BE', "\x00\x00\x00%");
    echo "$encoding verification and conversion works on all invalid characters\n";

    convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-32BE', $encoding, '%');
    echo "Unicode -> $encoding conversion works on all invalid codepoints\n";

    // Test "long" illegal character markers
    mb_substitute_character("long");
    convertInvalidString("\x80", "%", $encoding, "UTF-8");
    convertInvalidString("\x81\x20", "%", $encoding, "UTF-8");
    convertInvalidString("\xEA\xA9", "%", $encoding, "UTF-8");
    mb_substitute_character(0x25); // '%'

    // Test Regional Indicator codepoint at end of string
    // The mobile SJIS variants all have special characters to represent certain national
    // flags, but in Unicode these are represented by a sequence of _two_ codepoints
    // So if only one of those two codepoints appears at the end of a string, it can't
    // be converted to SJIS and should be treated as an error
    convertInvalidString("\x00\x01\xF1\xE9", "%", "UTF-32BE", $encoding); // Regional Indicator D (U+1F1E9)

    // Test Regional Indicator codepoint followed by some other codepoint
    convertInvalidString("\x00\x01\xF1\xE9\x00\x00\x00A", "%A", "UTF-32BE", $encoding);
}

testSJISVariant($docomo, $nonInvertibleDocomo, 'SJIS-Mobile#DOCOMO');
testSJISVariant($kddi, $nonInvertible, 'SJIS-Mobile#KDDI');
testSJISVariant($softbank, $nonInvertibleSoftbank, 'SJIS-Mobile#SOFTBANK');

// Special Softbank escape sequences can appear at end of string
convertValidString("\x1B\$O", "", "SJIS-Mobile#SOFTBANK", "UTF-8", false);
convertValidString("\x1B\$P", "", "SJIS-Mobile#SOFTBANK", "UTF-8", false);
convertValidString("\x1B\$Q", "", "SJIS-Mobile#SOFTBANK", "UTF-8", false);
// Try invalid escape sequence
convertInvalidString("\x1B\$X", "%", "SJIS-Mobile#SOFTBANK", "UTF-8", false);
// Try truncated escape sequence
convertInvalidString("\x1B\$", "%", "SJIS-Mobile#SOFTBANK", "UTF-8", false);

// Regression test for problem with not allocating enough space in output buffer
// This occurred when the input string was shorter than the output
convertValidString("\xA9\xA9\xA9\xA9", "\xF9\xD6\xF9\xD6\xF9\xD6\xF9\xD6", '8bit', 'SJIS-Mobile#DOCOMO');
convertValidString("\xA9\xA9\xA9\xA9", "\xF7\x74\xF7\x74\xF7\x74\xF7\x74", '8bit', 'SJIS-Mobile#KDDI');
convertValidString("\xA9\xA9\xA9\xA9", "\xF7\xEE\xF7\xEE\xF7\xEE\xF7\xEE", '8bit', 'SJIS-Mobile#SOFTBANK');

// Regression test: Old implementation used to drop digits (0-9) and hash (#) if
// they appeared at end of input string
for ($i = ord('0'); $i <= ord('9'); $i++) {
    convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#DOCOMO');
    convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#KDDI');
    convertValidString("abc" . chr($i), "abc" . chr($i), 'UTF-8', 'SJIS-Mobile#SOFTBANK');
}

// Regression test: Originally, new implementation also did not handle 0-9 and hash
// followed by U+20E3 (keycap modifier) correctly if the 0-9 or hash occurred at
// the very end of one buffer of wchars, and the keycap modifier was at the
// beginning of the following buffer of wchars
for ($i = 0; $i <= 256; $i++) {
    convertValidString(str_repeat("\x00a", $i) . "\x00\x30\x20\xE3", str_repeat('a', $i) . "\xF9\x90", 'UTF-16BE', 'SJIS-Mobile#DOCOMO');
    convertValidString(str_repeat("\x00a", $i) . "\x00\x30\x20\xE3", str_repeat('a', $i) . "\xF7\xC9", 'UTF-16BE', 'SJIS-Mobile#KDDI');
    convertValidString(str_repeat("\x00a", $i) . "\x00\x30\x20\xE3", str_repeat('a', $i) . "\xF7\xC5", 'UTF-16BE', 'SJIS-Mobile#SOFTBANK');
}

// Regression test for 0-9 appearing at end of one buffer and U+20E3 (keycap
// modifier) NOT appearing at the beginning of the next
for ($i = 0; $i <= 256; $i++) {
    convertValidString(str_repeat("\x000", $i), str_repeat('0', $i), 'UTF-16BE', 'SJIS-Mobile#DOCOMO');
    convertValidString(str_repeat("\x000", $i), str_repeat('0', $i), 'UTF-16BE', 'SJIS-Mobile#KDDI');
    convertValidString(str_repeat("\x000", $i), str_repeat('0', $i), 'UTF-16BE', 'SJIS-Mobile#SOFTBANK');
}

// Regression test for not making enough space in output buffer when 0-9 appeared
// at the end of one buffer and was re-processed together with the next
// This crazy-looking string was found by a fuzzer
// The result is deliberately discarded; the conversion only has to complete
$str = "\x04\xff\x930\x00\xffUTF7~'F\x00A\x00\xffA\x0018030@\x00[\x1b\$EEEEE\x5C\x80(8~\x00F\x00zgb-18030$\x008~\x00F\x00z-gb-18EUC_JP-2004\x00z-g0\x0018030\x00b-18030$\x008~\x00F\x00z-gb-18EUC_JP-2004\x00z-g0\x0018030\x00";
mb_convert_encoding($str, 'SJIS-Mobile#SOFTBANK', 'SJIS-Mobile#SOFTBANK');

?>
--EXPECT--
SJIS-Mobile#DOCOMO verification and conversion works on all valid characters
SJIS-Mobile#DOCOMO verification and conversion works on all invalid characters
Unicode -> SJIS-Mobile#DOCOMO conversion works on all invalid codepoints
SJIS-Mobile#KDDI verification and conversion works on all valid characters
SJIS-Mobile#KDDI verification and conversion works on all invalid characters
Unicode -> SJIS-Mobile#KDDI conversion works on all invalid codepoints
SJIS-Mobile#SOFTBANK verification and conversion works on all valid characters
SJIS-Mobile#SOFTBANK verification and conversion works on all invalid characters
Unicode -> SJIS-Mobile#SOFTBANK conversion works on all invalid codepoints