--TEST--
Exhaustive test of EUC-JP encoding verification and conversion
--EXTENSIONS--
mbstring
--SKIPIF--
<?php
if (getenv("SKIP_SLOW_TESTS")) {
    die("skip slow test");
}
?>
--FILE--
<?php
srand(555); /* Fixed seed keeps results consistent between runs */
include 'encoding_tests.inc';
mb_substitute_character(0x25); // '%'

/* Load the table of every character in EUC-JP */
readConversionTable(__DIR__ . '/data/EUC-JP.txt', $validChars, $fromUnicode, true);

/* JIS X 0208 has no single, straightforward mapping to Unicode, so the
 * table loaded above is adjusted by hand below */

/* Kuten code 0x2140 (EUC-JP 0xA1C0) is a backslash. It could map to U+005C
 * (ordinary backslash) or U+FF3C (_fullwidth_ backslash); the fullwidth
 * form is the one expected here */
$validChars["\xA1\xC0"] = "\x00\x00\xFF\x3C";

/* Extra or overriding Unicode (UTF-32BE) => EUC-JP mappings.
 * They are applied in the order listed. */
$unicodeOverrides = [
    /* Reverse direction of the fullwidth backslash mapping above */
    "\x00\x00\xFF\x3C" => "\xA1\xC0",
    /* Unicode has both halfwidth and fullwidth NOT SIGN; both of them go
     * to JIS X 0208 NOT SIGN */
    "\x00\x00\xFF\xE2" => "\xA2\xCC",
    /* Same for fullwidth and halfwidth POUND SIGN */
    "\x00\x00\xFF\xE1" => "\xA1\xF2",
    /* Same for fullwidth and halfwidth CENT SIGN */
    "\x00\x00\xFF\xE0" => "\xA1\xF1",
    /* Unicode FULLWIDTH TILDE => JIS X 0208 WAVE DASH */
    "\x00\x00\xFF\x5E" => "\xA1\xC1",
    /* Unicode FULLWIDTH HYPHEN-MINUS => JIS X 0208 MINUS SIGN */
    "\x00\x00\xFF\x0D" => "\xA1\xDD",
    /* Unicode PARALLEL TO => JIS X 0208 DOUBLE VERTICAL LINE */
    "\x00\x00\x22\x25" => "\xA1\xC2",
    /* U+007E (tilde) has two possible representations in EUC-JP; when
     * converting from Unicode, the simpler single-byte one is used */
    "\x00\x00\x00\x7E" => "\x7E",
    /* Same for U+005C */
    "\x00\x00\x00\x5C" => "\x5C",
    /* U+203E is OVERLINE; it goes to FULLWIDTH MACRON */
    "\x00\x00\x20\x3E" => "\xA1\xB1",
];
foreach ($unicodeOverrides as $utf32 => $eucjp) {
    $fromUnicode[$utf32] = $eucjp;
}

/* Table keyed by byte value: 0xA1-0xFE => 2, 0x8E => 2, 0x8F => 3
 * NOTE(review): presumably lead byte => total byte length of the sequence;
 * confirm against findInvalidChars() in encoding_tests.inc */
$leadBytes = array_fill_keys(range(0xA1, 0xFE), 2) + [0x8E => 2, 0x8F => 3];
findInvalidChars($validChars, $invalidChars, $truncated, $leadBytes);

/* In the JIS X 0212 character set, kuten code 0x2237 (EUC-JP 0x8FA2B7) is
 * an ordinary tilde. ASCII 0x7E represents the same character, so this
 * mapping cannot round-trip; it is removed here and checked on its own
 * further down */
unset($validChars["\x8F\xA2\xB7"]);

/* '%' encoded as UTF-32BE, the marker expected wherever input is invalid */
$percentUtf32 = "\x00\x00\x00%";

testAllValidChars($validChars, 'EUC-JP', 'UTF-32BE');
echo "Encoding verification and conversion work for all valid characters\n";

testAllInvalidChars($invalidChars, $validChars, 'EUC-JP', 'UTF-32BE', $percentUtf32);
testTruncatedChars($truncated, 'EUC-JP', 'UTF-32BE', $percentUtf32);
echo "Encoding verification and conversion work for all invalid characters\n";

testValidString("\x8F\xA2\xB7", "\x00\x00\x00~", 'EUC-JP', 'UTF-32BE', false);
echo "Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly\n";

testAllValidChars($fromUnicode, 'UTF-32BE', 'EUC-JP', false);
echo "Unicode -> EUC-JP conversion works on all valid characters\n";

/* Every codepoint in 0x0000-0xFFFF that has no entry in $fromUnicode is
 * treated as unconvertible */
$invalidChars = [];
foreach (range(0, 0xFFFF) as $codepoint) {
    $utf32 = pack('N', $codepoint);
    if (isset($fromUnicode[$utf32])) {
        continue;
    }
    $invalidChars[$utf32] = true;
}
convertAllInvalidChars($invalidChars, $fromUnicode, 'UTF-32BE', 'EUC-JP', '%');
echo "Unicode -> EUC-JP conversion works on all invalid characters\n";

// Test "long" illegal character markers
mb_substitute_character("long");
convertInvalidString("\x80", "%", "EUC-JP", "UTF-8");
convertInvalidString("\xFE\xFF", "%", "EUC-JP", "UTF-8");

echo "Done!\n";
?>
--EXPECT--
Encoding verification and conversion work for all valid characters
Encoding verification and conversion work for all invalid characters
Irreversible mapping of 0x8FA2B7 follows JIS X 0212 correctly
Unicode -> EUC-JP conversion works on all valid characters
Unicode -> EUC-JP conversion works on all invalid characters
Done!