--TEST--
Exhaustive test of SJIS-2004 encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(101); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in SJIS-2004 */
$validChars = array(); /* SJIS-2004 string -> UTF-32BE string */
$fromUnicode = array(); /* UTF-16BE -> SJIS-2004 */

/* The table is only ever read, so open it read-only; asking for write access
 * would make the test fail needlessly on a read-only source tree */
$fp = fopen(__DIR__ . '/data/SJIS-2004.txt', 'r');
if ($fp === false)
  die("Could not open data/SJIS-2004.txt\n");

while (($line = fgets($fp, 256)) !== false) {
  /* Lines starting with '#' are comments in the table file */
  if ($line[0] === '#')
    continue;

  /* Reset before each sscanf() so a second codepoint parsed from an earlier
   * line cannot leak into a line which only has one */
  $codepoint2 = null;
  if (sscanf($line, "0x%x\tU+%x+%x", $bytes, $codepoint1, $codepoint2) >= 2) {
    /* Values below 256 are single-byte characters; anything else is packed
     * as a big-endian 16-bit value, i.e. two bytes */
    $sjis = ($bytes < 256) ? chr($bytes) : pack('n', $bytes);
    if ($codepoint2) {
      /* One SJIS-2004 character which maps to a pair of Unicode codepoints.
       * These are only added to $validChars, not to $fromUnicode */
      $validChars[$sjis] = pack('NN', $codepoint1, $codepoint2);
    } else {
      /* Two input byte sequences can translate to either a 'halfwidth' or a
       * 'fullwidth' version of a character; our implementation of SJIS-2004
       * translates them to the fullwidth versions */
      if (preg_match('/Fullwidth: U\+([0-9A-F]+)/', $line, $match))
        $codepoint1 = hexdec($match[1]);
      $validChars[$sjis] = pack('N', $codepoint1);
      /* $fromUnicode is keyed by a single UTF-16BE code unit, so only
       * codepoints which fit in 16 bits are recorded there */
      if ($codepoint1 <= 0xFFFF)
        $fromUnicode[pack('n', $codepoint1)] = $sjis;
    }
  }
}
fclose($fp);

/* Force U+007E and U+005C to map to the identical single bytes, overriding
 * whatever the table put there (presumably the table treats bytes 0x7E and
 * 0x5C as non-ASCII characters -- confirm against data/SJIS-2004.txt) */
$fromUnicode["\x00\x7E"] = "\x7E";
$fromUnicode["\x00\x5C"] = "\x5C";

/* The helper functions used below come from encoding_tests.inc */
testAllValidChars($validChars, 'SJIS-2004', 'UTF-32BE');
echo "SJIS-2004 verification and conversion works for all valid characters\n";

/* The last argument lists lead bytes 0x81-0x9F and 0xE0-0xFC, each mapped to 2
 * (presumably the length of a character starting with that byte -- see
 * findInvalidChars() in encoding_tests.inc) */
findInvalidChars($validChars, $invalidChars, $truncated,
  array_fill_keys(range(0x81, 0x9F), 2) + array_fill_keys(range(0xE0, 0xFC), 2));
testAllInvalidChars($invalidChars, $validChars, 'SJIS-2004', 'UTF-32BE', "\x00\x00\x00%");
testTruncatedChars($truncated, 'SJIS-2004', 'UTF-32BE', "\x00\x00\x00%");
echo "SJIS-2004 verification and conversion rejects all invalid characters\n";

testAllValidChars($fromUnicode, 'UTF-16BE', 'SJIS-2004', false);
echo "Unicode -> SJIS-2004 conversion works on all valid characters\n";

findInvalidChars($fromUnicode, $invalidChars, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-16BE', 'SJIS-2004', '%');
echo "Unicode -> SJIS-2004 conversion works on all invalid characters\n";

// Some pairs of Unicode codepoints are represented by a single character in SJIS-2004
// Test the case where the first codepoint looks like it might be one of these pairs...
// but the second one doesn't match
convertValidString("\x30\x4B\x00A", "\x82\xA9A", 'UTF-16BE', 'SJIS-2004', false);

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "%", "SJIS-2004", "UTF-8");
convertInvalidString("\x81\x20", "%", "SJIS-2004", "UTF-8");
convertInvalidString("\xFC\xF5", "%", "SJIS-2004", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
SJIS-2004 verification and conversion works for all valid characters
SJIS-2004 verification and conversion rejects all invalid characters
Unicode -> SJIS-2004 conversion works on all valid characters
Unicode -> SJIS-2004 conversion works on all invalid characters
Done!