--TEST--
Exhaustive test of CP51932 encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) die("skip slow test");
?>
--FILE--
<?php
srand(2020); /* Make results consistent */
include('encoding_tests.inc');
mb_substitute_character(0x25); // '%'

/* Read in the table of all characters in CP51932 */
$validChars = [];  /* CP51932 string -> UTF-16BE string */
$fromUnicode = []; /* UTF-16BE string -> CP51932 string */

/* The table is only ever read here, so open it read-only; read/write mode
 * would needlessly require write permission on the data file */
$fp = fopen(realpath(__DIR__ . '/data/CP51932.txt'), 'r');
if ($fp === false) {
    die("Could not open data/CP51932.txt\n");
}

/* Compare against false explicitly so that a falsy line (such as a final,
 * unterminated "0") cannot end the loop early */
while (($line = fgets($fp, 256)) !== false) {
    /* Lines starting with '#' are comments in the mapping table */
    if ($line[0] === '#') {
        continue;
    }

    /* Reset on every iteration; single-byte entries only match two fields,
     * so $byte2 must not keep a value from a previous line */
    $byte2 = null;
    if (sscanf($line, '<U%x> \x%x\x%x', $codepoint, $byte1, $byte2) >= 2) {
        /* The table we are using tries to map as many Unicode codepoints into
         * CP51932 as possible, including by mapping latin characters with accents
         * to the equivalent without accents; but since CP51932 is based on the
         * CP932 character set, we don't need to handle codepoints which are not
         * mapped from any character in CP932 */
        if (($codepoint >= 0xC0 && $codepoint <= 0xD6) ||
            ($codepoint >= 0xD8 && $codepoint <= 0xF6) ||
            ($codepoint >= 0xF8 && $codepoint <= 0xFF)) {
            continue;
        }
        $cp51932 = ($byte2 ? (chr($byte1) . chr($byte2)) : chr($byte1));
        $utf16 = pack('n', $codepoint);
        $validChars[$cp51932] = $utf16;
        $fromUnicode[$utf16] = $cp51932;
    }
}
fclose($fp);

/* We map the JIS X 0208 FULLWIDTH TILDE to U+FF5E (FULLWIDTH TILDE),
 * but when converting Unicode to CP51932, we also accept U+301C (WAVE DASH) */
$fromUnicode["\x30\x1C"] = "\xA1\xC1";
/* We map the JIS X 0208 MINUS SIGN to U+FF0D (FULLWIDTH HYPHEN-MINUS),
 * but when converting Unicode to CP51932, we also accept U+2212 (MINUS SIGN) */
$fromUnicode["\x22\x12"] = "\xA1\xDD";
/* We map the JIS X 0208 PARALLEL TO symbol to U+2225 (PARALLEL TO),
 * but when converting Unicode to CP51932, we also accept U+2016
 * (DOUBLE VERTICAL LINE) */
$fromUnicode["\x20\x16"] = "\xA1\xC2";

/* There are a number of duplicate, irreversible mappings in the CP51932 table.
 * In most cases, the one which we primarily use appears last in the table,
 * but in some cases, it is first and will be overwritten in the above loop.
 *
 * Interestingly, the "collisions" happen in both directions! Part of this is
 * because the table we are using attempts to map as many Unicode codepoints
 * as possible to CP932 characters */
$fromUnicode["\x22\x20"] = "\xA2\xDC";
$fromUnicode["\x22\x29"] = "\xA2\xC1";
$fromUnicode["\x22\x2B"] = "\xA2\xE9";
$fromUnicode["\x22\x35"] = "\xA2\xE8";
$fromUnicode["\x22\x1A"] = "\xA2\xE5";
$fromUnicode["\x22\x2A"] = "\xA2\xC0";
$fromUnicode["\x22\x61"] = "\xA2\xE1";
$fromUnicode["\x22\xA5"] = "\xA2\xDD";
$fromUnicode["\x22\x52"] = "\xA2\xE2";
$fromUnicode["\xFF\xE2"] = "\xA2\xCC";
unset($fromUnicode["\x00\xA1"]); // Don't map upside-down ! to ordinary !
unset($fromUnicode["\x00\xA6"]); // Don't map broken bar to ordinary pipe character
unset($fromUnicode["\x00\xA9"]); // Don't map © to c
unset($fromUnicode["\x00\xAA"]); // Don't map feminine ordinal indicator
unset($fromUnicode["\x00\xAB"]); // Don't map left double angled quote mark to "much less than"
unset($fromUnicode["\x00\xAD"]); // Don't map soft hyphen to ordinary hyphen
unset($fromUnicode["\x00\xAE"]); // Don't map ® to R
/* Drops the table's entry for U+00AF; an explicit U+00AF mapping is assigned
 * again further down, after the ASCII range is filled in */
unset($fromUnicode["\x00\xAF"]); // Don't map Unicode halfwidth macron to CP932 fullwidth macron
unset($fromUnicode["\x00\xB2"]); // Don't map ² to ordinary 2
unset($fromUnicode["\x00\xB3"]); // Don't map ³ to ordinary 3
unset($fromUnicode["\x00\xB5"]); // Don't map micro sign to Greek mu
unset($fromUnicode["\x00\xB7"]); // Don't map middle dot to katakana middle dot
unset($fromUnicode["\x00\xB8"]); // Don't map cedilla to fullwidth comma
unset($fromUnicode["\x00\xB9"]); // Don't map ¹ to ordinary 1
unset($fromUnicode["\x00\xBA"]); // Don't map "masculine ordinal indicator"
unset($fromUnicode["\x00\xBB"]); // Don't map right double angled quote mark to "much greater than"
unset($fromUnicode["\x30\x94"]); // Don't map hiragana vu to katakana vu

/* Bytes 0x00-0x7F each convert to the UTF-16BE code unit of the same value */
for ($i = 0; $i <= 0x7F; $i++) {
    $validChars[chr($i)] = "\x00" . chr($i);
}

/* U+00A5 is YEN SIGN; convert to FULLWIDTH YEN SIGN */
$fromUnicode["\x00\xA5"] = "\xA1\xEF";
/* U+203E is OVERLINE; convert to FULLWIDTH MACRON */
$fromUnicode["\x20\x3E"] = "\xA1\xB1";
/* U+00AF is MACRON; convert to FULLWIDTH MACRON */
$fromUnicode["\x00\xAF"] = "\xA1\xB1";

testAllValidChars($validChars, 'CP51932', 'UTF-16BE', false);
testAllValidChars($fromUnicode, 'UTF-16BE', 'CP51932', false);
echo "CP51932 verification and conversion works on all valid characters\n";

findInvalidChars($validChars, $invalidChars, $truncated, array_fill_keys(range(0xA9, 0xAF), 2) + array_fill_keys(range(0xF5, 0xF8), 2) + [0xFD => 2, 0xFE => 2]);

testAllInvalidChars($invalidChars, $validChars, 'CP51932', 'UTF-16BE', "\x00%");
testTruncatedChars($truncated, 'CP51932', 'UTF-16BE', "\x00%");
echo "CP51932 verification and conversion works on all invalid characters\n";

findInvalidChars($fromUnicode, $invalidCodepoints, $unused, array_fill_keys(range(0, 0xFF), 2));
convertAllInvalidChars($invalidCodepoints, $fromUnicode, 'UTF-16BE', 'CP51932', '%');
echo "Unicode -> CP51932 conversion works on all invalid codepoints\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "%", "CP51932", "UTF-8");
convertInvalidString("\xFE\xFF", "%", "CP51932", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
CP51932 verification and conversion works on all valid characters
CP51932 verification and conversion works on all invalid characters
Unicode -> CP51932 conversion works on all invalid codepoints
Done!