--TEST--
Exhaustive test of Shift-JIS encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) {
  die("skip slow test");
}
?>
--FILE--
<?php
srand(999); /* Fixed seed, so that every run produces the same results */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Load the Shift-JIS conversion table; it fills in both lookup arrays:
 * $validChars  (SJIS bytes    => UTF-16BE bytes)
 * $fromUnicode (UTF-16BE bytes => SJIS bytes) */
readConversionTable(__DIR__ . '/data/SHIFTJIS.txt', $validChars, $fromUnicode);

/* Bytes 0x00-0x1F map to the Unicode codepoints with the same values */
foreach (range(0, 0x1F) as $code) {
  $byte = chr($code);
  $validChars[$byte] = "\x00" . $byte;
  $fromUnicode["\x00" . $byte] = $byte;
}

/* Adjustments to the SJIS -> Unicode direction, applied in this order */
$sjisOverrides = [
  /* The JISC standards define SJIS 0x5C as a Yen sign and 0x7E as an
   * overline. Legacy software (notably Microsoft products) instead treats
   * both bytes exactly like their ASCII counterparts.
   *
   * mbstring followed the Microsoft-compatible interpretation before
   * PHP 8.1; PHP 8.1.0 switched to the JISC interpretation. After discussion
   * with concerned Japanese developers, the historical behavior turned out
   * to be more useful for most applications which process SJIS text, so
   * that is what this test expects: 0x5C <-> U+005C and 0x7E <-> U+007E. */
  "\x5C" => "\x00\x5C",
  "\x7E" => "\x00\x7E",
  /* DEL character */
  "\x7F" => "\x00\x7F",
  /* SJIS 0x815F is FULLWIDTH REVERSE SOLIDUS, not (halfwidth) backslash */
  "\x81\x5F" => "\xFF\x3C",
];
foreach ($sjisOverrides as $sjis => $utf16) {
  $validChars[$sjis] = $utf16;
}

/* Adjustments to the Unicode -> SJIS direction, applied in this order */
$unicodeOverrides = [
  /* Counterparts of the 0x5C / 0x7E entries above */
  "\x00\x5C" => "\x5C",
  "\x00\x7E" => "\x7E",
  /* With 0x7E taken as U+007E, U+203E (OVERLINE) cannot sensibly become
   * 0x7E; it goes to JIS X 0208 FULLWIDTH MACRON instead */
  "\x20\x3E" => "\x81\x50",
  /* U+00AF (MACRON) goes to FULLWIDTH MACRON as well */
  "\x00\xAF" => "\x81\x50",
  /* With 0x5C taken as U+005C, U+00A5 (YEN SIGN) cannot sensibly become
   * 0x5C; it goes to JIS X 0208 FULLWIDTH YEN SIGN instead */
  "\x00\xA5" => "\x81\x8F",
  /* DEL character */
  "\x00\x7F" => "\x7F",
  /* FULLWIDTH REVERSE SOLIDUS */
  "\xFF\x3C" => "\x81\x5F",
  /* Unicode has both halfwidth and fullwidth NOT SIGN; both of them are
   * converted to JIS X 0208 NOT SIGN */
  "\xFF\xE2" => "\x81\xCA",
  /* Same treatment for fullwidth and halfwidth POUND SIGN */
  "\xFF\xE1" => "\x81\x92",
  /* Same treatment for fullwidth and halfwidth CENT SIGN */
  "\xFF\xE0" => "\x81\x91",
  /* Unicode FULLWIDTH TILDE -> JIS X 0208 WAVE DASH */
  "\xFF\x5E" => "\x81\x60",
  /* Unicode FULLWIDTH HYPHEN-MINUS -> JIS X 0208 MINUS SIGN */
  "\xFF\x0D" => "\x81\x7C",
  /* Unicode PARALLEL TO -> JIS X 0208 DOUBLE VERTICAL LINE */
  "\x22\x25" => "\x81\x61",
];
foreach ($unicodeOverrides as $utf16 => $sjis) {
  $fromUnicode[$utf16] = $sjis;
}

/* SJIS -> UTF-16BE: every valid character */
testAllValidChars($validChars, 'Shift-JIS', 'UTF-16BE');
echo "SJIS verification and conversion works on all valid characters\n";

/* SJIS -> UTF-16BE: invalid and truncated sequences.
 * The map passed to findInvalidChars() has first bytes 0x81-0x9F and
 * 0xE0-0xEF as keys, each with value 2 (presumably the length in bytes of
 * a character starting with that byte; confirm in encoding_tests.inc) */
$sjisStartBytes = array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xEF), 2);
findInvalidChars($validChars, $invalidChars, $truncated, $sjisStartBytes);
testAllInvalidChars($invalidChars, $validChars, 'Shift-JIS', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'Shift-JIS', 'UTF-16BE', "\x00%");
echo "SJIS verification and conversion works on all invalid characters\n";

/* UTF-16BE -> SJIS: every valid character */
testAllValidChars($fromUnicode, 'UTF-16BE', 'Shift-JIS', false);
echo "Unicode -> SJIS conversion works on all valid characters\n";

/* UTF-16BE -> SJIS: invalid sequences; here every possible first byte
 * (0x00-0xFF) is given the value 2 */
$utf16StartBytes = array_fill_keys(range(0, 0xFF), 2);
findInvalidChars($fromUnicode, $invalidChars, $unused, $utf16StartBytes);
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'Shift-JIS', '%');
echo "Unicode -> SJIS conversion works on all invalid characters\n";

testValidString("\xFF\x5E", "\x81\x60", 'UTF-16BE', 'SJIS', false);
echo "Other mappings from Unicode -> SJIS are OK\n";

// With "long" illegal character markers selected, each of these invalid
// SJIS inputs is still expected to come out as "%"
mb_substitute_character("long");
foreach (["\x80", "\x81\x20", "\xEA\xA9"] as $badInput) {
  convertInvalidString($badInput, "%", "Shift-JIS", "UTF-8");
}

echo "Done!\n";
?>
--EXPECT--
SJIS verification and conversion works on all valid characters
SJIS verification and conversion works on all invalid characters
Unicode -> SJIS conversion works on all valid characters
Unicode -> SJIS conversion works on all invalid characters
Other mappings from Unicode -> SJIS are OK
Done!