<?php

// Common code for tests which focus on conversion and verification of text
// in some specific encoding
//
// NOTE: strict_types is intentionally not declared in this file. Byte strings
// are used as array keys, and PHP turns numeric-looking keys (e.g. "1") into
// integers; the helpers below rely on those being coerced back to strings.

// Read a file with one character and its equivalent Unicode codepoint on each
// line, delimited by tabs
//
// Each data line has the form "0x<char code>\t0x<codepoint>"; lines starting
// with '#' and lines which do not match that form are skipped.
//
// $path  - path of the conversion table file
// $from  - (out) map from the character's byte string in the tested encoding
//          to its big-endian Unicode byte string
// $to    - (out) the inverse map
// $utf32 - if true, codepoints are packed as 4 bytes (pack 'N'), otherwise
//          as 2 bytes (pack 'n')
function readConversionTable($path, &$from, &$to, $utf32 = false) {
  $from = array();
  $to   = array();

  // The table is only read, so open it read-only ('r+' fails on read-only files)
  $fp = fopen($path, 'r');
  if ($fp === false)
    die("Could not open conversion table: $path");

  // Compare against false so that a falsy line such as "0" does not end the loop
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] == '#')
      continue;
    if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) {
      // Skip codepoints that do not have a mapping (e.g. in BIG5.txt)
      if ($codepoint === 0xFFFD) {
        continue;
      }
      $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
      if ($char == PHP_INT_MAX) {
        // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
        // (which can't be represented in a PHP integer)
        // Rebuild the byte string from the hex digit pairs following the "0x"
        $char = "";
        for ($i = 2; $i < strlen($line); $i += 2) {
          $substr = substr($line, $i, 2);
          if (ctype_xdigit($substr))
            $char .= chr(hexdec($substr));
          else
            break;
        }
      } else {
        if ($char <= 0xFF)
          $char = chr($char); // hex codes must not have leading zero bytes
        else if ($char <= 0xFFFF)
          $char = pack('n', $char);
        else if ($char <= 0xFFFFFF)
          $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
        else
          $char = pack('N', $char);
      }
      $from[$char] = $codepoint;
      $to[$codepoint] = $char;
    }
  }

  fclose($fp);
}

// Format a string for failure messages: the string itself in double quotes
// (only if it passes mb_check_encoding as ASCII), followed by its bytes in
// hex, in parentheses
function dbgPrint($str) {
  $result = '';
  if (mb_check_encoding($str, 'ASCII'))
    $result .= '"' . $str . '" ';
  return $result . "(" . bin2hex($str) . ")";
}

// Abort the test unless mb_check_encoding accepts $goodString as $encoding
function identifyValidString($goodString, $encoding) {
  $result = mb_check_encoding($goodString, $encoding);
  if (!$result)
    die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
}

// Abort the test if mb_check_encoding accepts $badString as $encoding
function identifyInvalidString($badString, $encoding) {
  $result = mb_check_encoding($badString, $encoding);
  if ($result)
    die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
}

// Abort the test unless converting $fromString from $fromEncoding to
// $toEncoding yields exactly $toString
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
  $result = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
  if ($result !== $toString)
    die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($result));
}

// Like testConversion, but additionally abort if the 'illegal_chars' value
// reported by mb_get_info changed during the conversion
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
  $illegalChars = mb_get_info('illegal_chars');
  testConversion($fromString, $toString, $fromEncoding, $toEncoding);
  if (mb_get_info('illegal_chars') !== $illegalChars)
    die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}

// Check a valid conversion from $fromEncoding to $toEncoding and, unless
// $bothWays is false, the reverse conversion as well
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
  testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
  if ($bothWays)
    testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}

// Check that converting an invalid string yields $toString and that the
// 'illegal_chars' value reported by mb_get_info increased
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
  $illegalChars = mb_get_info('illegal_chars');
  testConversion($fromString, $toString, $fromEncoding, $toEncoding);
  if (mb_get_info('illegal_chars') <= $illegalChars)
    die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}

// Check both validation (mb_check_encoding) and conversion of a valid string
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
  identifyValidString($fromString, $fromEncoding);
  convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}

// Check both validation (mb_check_encoding) and conversion of an invalid string
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
  identifyInvalidString($fromString, $fromEncoding);
  convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}

// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
//
// Shuffles the keys of $charMap and tests them in runs of 5-10 characters
// (fewer for the last run), expecting the concatenated mapped values
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
  $goodChars = array_keys($charMap);
  shuffle($goodChars);
  while (!empty($goodChars)) {
    $length = min(rand(5,10), count($goodChars));
    $fromString = $toString = '';
    while ($length--) {
      $goodChar = array_pop($goodChars);
      $fromString .= $goodChar;
      $toString   .= $charMap[$goodChar];
    }

    testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
  }
}

// For each key of $badChars, test the string made of that bad sequence
// followed by one valid character from $charMap; the expected output is
// $replacement followed by the valid character's mapping. Both validation
// and conversion are checked.
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
  $badChars = array_keys($badChars);
  $goodChars = array();
  while (!empty($badChars)) {
    // Refill the (shuffled) pool of valid characters whenever it runs out
    if (empty($goodChars)) {
      $goodChars = array_keys($charMap);
      shuffle($goodChars);
    }
    $goodChar   = array_pop($goodChars);
    $fromString = array_pop($badChars) . $goodChar;
    $toString   = $replacement . $charMap[$goodChar];

    testInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
  }
}

// Same as testAllInvalidChars, but only the conversion is checked;
// mb_check_encoding is not called on the input
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
  $badChars = array_keys($badChars);
  $goodChars = array();
  while (!empty($badChars)) {
    // Refill the (shuffled) pool of valid characters whenever it runs out
    if (empty($goodChars)) {
      $goodChars = array_keys($charMap);
      shuffle($goodChars);
    }
    $goodChar   = array_pop($goodChars);
    $fromString = array_pop($badChars) . $goodChar;
    $toString   = $replacement . $charMap[$goodChar];

    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
  }
}

// Test each key of $truncated on its own as an invalid string which is
// expected to convert to just $replacement
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
  $truncatedChars = array_keys($truncated);
  foreach ($truncatedChars as $truncatedChar) {
    testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
  }
}

// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// `$startBytes` maps from first-byte values to the corresponding character length
// (For encodings where the first byte can tell you the length of a multi-byte
// character)
// Note that `$startBytes` can be partial!
//
// $valid     - map whose keys are all the valid characters (byte strings)
// $invalid   - (out) set (byte string => true) of sequences which are neither
//              a valid character nor a prefix of one
// $truncated - (out) set (byte string => true) of sequences which are an
//              incomplete character
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
  $invalid = array();
  $truncated = array();
  $prefixes = array(); /* All sequences which are not (but can start) a valid character */

  foreach ($valid as $char => $unicode) {
    for ($len = 1; $len < strlen($char); $len++)
      $prefixes[substr($char, 0, $len)] = true;
  }

  // Used when the first byte does not tell us the length: extend $prefix by
  // every possible byte, recursing for as long as the result is itself a
  // proper prefix of some valid character
  $varLength = function($prefix) use($valid, $prefixes, &$invalid, &$truncated, &$varLength) {
    for ($byte = 0; $byte < 256; $byte++) {
      $str = $prefix . chr($byte);
      if (!isset($valid[$str])) {
        if (isset($prefixes[$str])) {
          $truncated[$str] = true;
          $varLength($str);
        } else {
          $invalid[$str] = true;
        }
      }
    }
  };

  // Used when the first byte fixes the length: $remaining is the number of
  // bytes still missing from $prefix. Every shorter sequence is recorded as
  // truncated, every full-length sequence not in $valid as invalid
  $fixedLength = function($prefix, $remaining) use($valid, $prefixes, &$invalid, &$truncated, &$fixedLength) {
    if ($remaining == 0) {
      if (!isset($valid[$prefix]))
        $invalid[$prefix] = true;
    } else if ($remaining == 1) {
      $truncated[$prefix] = true;
      for ($i = 0; $i < 256; $i++) {
        $str = $prefix . chr($i);
        if (!isset($valid[$str]))
          $invalid[$str] = true;
      }
    } else {
      $truncated[$prefix] = true;
      for ($i = 0; $i < 256; $i++)
        $fixedLength($prefix . chr($i), $remaining - 1);
    }
  };

  for ($byte = 0; $byte < 256; $byte++) {
    if (isset($startBytes[$byte])) {
      $fixedLength(chr($byte), $startBytes[$byte] - 1);
    } else {
      $str = chr($byte);
      if (!isset($valid[$str])) {
        if (isset($prefixes[$str])) {
          $truncated[$str] = true;
          $varLength($str);
        } else {
          $invalid[$str] = true;
        }
      }
    }
  }
}

// Run the full set of checks for $encoding against the conversion table at
// $path, converting to and from UTF-16BE
//
// $replacement - expected output (in $encoding) for an invalid UTF-16BE input
// $startBytes  - passed through to findInvalidChars for the $encoding side
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
  srand(1000); // Make results consistent
  mb_substitute_character(0x25); // '%'
  readConversionTable($path, $toUnicode, $fromUnicode);

  findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
  testAllValidChars($toUnicode, $encoding, 'UTF-16BE');
  testAllInvalidChars($invalid, $toUnicode, $encoding, 'UTF-16BE', "\x00%");
  testTruncatedChars($truncated, $encoding, 'UTF-16BE', "\x00%");
  echo "Tested $encoding -> UTF-16BE\n";

  // Every first byte is declared to start a 2-byte unit, so this collects all
  // 2-byte sequences which are not a key of $fromUnicode
  findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
  convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', $encoding, $replacement);
  echo "Tested UTF-16BE -> $encoding\n";
}