<?php

// Common code for tests which focus on conversion and verification of text
// in some specific encoding
//
// NOTE: this file intentionally does not enable strict types. Character maps
// are keyed by raw byte strings, and PHP turns numeric-string keys (such as the
// single byte "0") into integers; those keys are later passed to string
// functions and must be coerced back silently.

// Test failed limit. If you want to execute all tests, set to -1.
$testFailedLimit = 1000;
// Test failed counter
$testFailedCounter = 0;

// Count one failed test. Once the number of failures reaches $testFailedLimit,
// print a message and stop the script. When $testFailedLimit is -1 nothing is
// counted and testing never stops.
function testFailedIncrement() {
  global $testFailedCounter, $testFailedLimit;

  // $testFailedLimit is -1, no limit.
  if ($testFailedLimit === -1)
    return;

  $testFailedCounter++;
  if ($testFailedCounter < $testFailedLimit)
    return;

  die("=== Failed test " . $testFailedLimit . " times exceeded, stop testing ===");
}

// Read a file with one character and its equivalent Unicode codepoint on each
// line, delimited by tabs
//
// Each usable line has the form `0xHHHH<TAB>0xHHHH`. Lines starting with '#',
// lines which do not match that form, and lines whose codepoint is 0xFFFD are
// skipped.
//
// On return, `$from` maps each character (as a byte string, without leading
// zero bytes) to its codepoint packed big-endian as 16 bits, or as 32 bits if
// `$utf32` is true. `$to` is the reverse mapping.
//
// Throws RuntimeException if the file cannot be opened.
function readConversionTable($path, &$from, &$to, $utf32 = false) {
  $from = array();
  $to   = array();

  // The table is only ever read, so do not demand write access to it
  $fp = fopen($path, 'r');
  if ($fp === false)
    throw new RuntimeException("Could not open conversion table: $path");

  // Compare against false explicitly; a line must not end the loop just
  // because its content happens to be falsy
  while (($line = fgets($fp, 256)) !== false) {
    if ($line[0] === '#')
      continue;
    if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) !== 2)
      continue;

    // Skip codepoints that do not have a mapping (e.g. in BIG5.txt)
    if ($codepoint === 0xFFFD)
      continue;

    $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);

    if ($char == PHP_INT_MAX) {
      // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
      // (which can't be represented in a PHP integer)
      // Rebuild the bytes straight from the hex digits following the "0x"
      $char    = "";
      $lineLen = strlen($line);
      for ($i = 2; $i < $lineLen; $i += 2) {
        $substr = substr($line, $i, 2);
        if (!ctype_xdigit($substr))
          break;
        $char .= chr(hexdec($substr));
      }
    } else if ($char <= 0xFF) {
      $char = chr($char); // hex codes must not have leading zero bytes
    } else if ($char <= 0xFFFF) {
      $char = pack('n', $char);
    } else if ($char <= 0xFFFFFF) {
      $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
    } else {
      $char = pack('N', $char);
    }

    $from[$char] = $codepoint;
    $to[$codepoint] = $char;
  }

  fclose($fp);
}

// Format a byte string for a failure message: its hex dump in parentheses,
// preceded by the string itself in double quotes if it checks out as ASCII
function dbgPrint($str) {
  $result = '';
  if (mb_check_encoding($str, 'ASCII'))
    $result .= '"' . $str . '" ';
  return $result . "(" . bin2hex($str) . ")";
}

// Report a failure if mb_check_encoding rejects a string which is known to be
// valid in `$encoding`
function identifyValidString($goodString, $encoding) {
  $result = mb_check_encoding($goodString, $encoding);
  if (!$result) {
    echo "mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString) . PHP_EOL;
    testFailedIncrement();
  }
}

// Report a failure if mb_check_encoding accepts a string which is known to be
// invalid in `$encoding`
function identifyInvalidString($badString, $encoding) {
  $result = mb_check_encoding($badString, $encoding);
  if ($result) {
    echo "mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString) . PHP_EOL;
    // Count this like every other kind of failure in this file
    testFailedIncrement();
  }
}

// Convert `$fromString` from `$fromEncoding` to `$toEncoding` and report a
// failure unless the result is exactly `$toString`
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
  $result = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
  if ($result !== $toString) {
    echo "mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($result) . PHP_EOL;
    testFailedIncrement();
  }
}

// Like testConversion, but additionally report a failure if the conversion
// changed the `illegal_chars` counter reported by mb_get_info
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
  $illegalChars = mb_get_info('illegal_chars');
  testConversion($fromString, $toString, $fromEncoding, $toEncoding);
  if (mb_get_info('illegal_chars') !== $illegalChars) {
    echo "mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding" . PHP_EOL;
    testFailedIncrement();
  }
}

// Check conversion of a valid string, and (unless `$bothWays` is false) the
// reverse conversion as well
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
  testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
  if ($bothWays)
    testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}

// Like testConversion, but additionally report a failure if the conversion did
// NOT increase the `illegal_chars` counter reported by mb_get_info
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
  $illegalChars = mb_get_info('illegal_chars');
  testConversion($fromString, $toString, $fromEncoding, $toEncoding);
  if (mb_get_info('illegal_chars') <= $illegalChars) {
    echo "mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding" . PHP_EOL;
    testFailedIncrement();
  }
}

// Run both the validity check and the conversion check(s) on a valid string
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
  identifyValidString($fromString, $fromEncoding);
  convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}

// Run both the validity check and the conversion check on an invalid string
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
  identifyInvalidString($fromString, $fromEncoding);
  convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}

// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
//
// `$charMap` maps each valid character in `$fromEncoding` to its equivalent in
// `$toEncoding`. The characters are shuffled, tested once as a single long
// string, and then again in random-length chunks of up to 10 characters; for
// each chunk, mb_strlen must also agree between the two encodings.
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
  $goodChars = array_keys($charMap);
  shuffle($goodChars);

  // Try a long string
  $fromString = $toString = '';
  foreach ($goodChars as $goodChar) {
    $fromString .= $goodChar;
    $toString .= $charMap[$goodChar];
  }
  testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);

  // Try various shorter ones
  while (!empty($goodChars)) {
    $length = min(rand(5,10), count($goodChars));
    $fromString = $toString = '';
    while ($length--) {
      $goodChar = array_pop($goodChars);
      $fromString .= $goodChar;
      $toString .= $charMap[$goodChar];
    }

    testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);

    $strLen = mb_strlen($fromString, $fromEncoding);
    if ($strLen !== mb_strlen($toString, $toEncoding)) {
      echo "Length of $fromEncoding string '" . bin2hex($fromString) . "' was different than expected; mb_strlen returned $strLen" . PHP_EOL;
      testFailedIncrement();
    }
  }
}

// For each key of `$badChars`, build a string consisting of that invalid
// sequence followed by one valid character from `$charMap`, and test that it
// is both identified as invalid and converted to `$replacement` followed by
// the valid character's equivalent. The valid characters are used in shuffled
// order and reshuffled whenever they run out.
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
  $badChars = array_keys($badChars);
  $goodChars = array();
  while (!empty($badChars)) {
    if (empty($goodChars)) {
      $goodChars = array_keys($charMap);
      shuffle($goodChars);
    }
    $goodChar = array_pop($goodChars);
    $fromString = array_pop($badChars) . $goodChar;
    $toString = $replacement . $charMap[$goodChar];

    testInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
  }
}

// Same as testAllInvalidChars, but only checks the conversion; it does not
// check that mb_check_encoding rejects the input
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
  $badChars = array_keys($badChars);
  $goodChars = array();
  while (!empty($badChars)) {
    if (empty($goodChars)) {
      $goodChars = array_keys($charMap);
      shuffle($goodChars);
    }
    $goodChar = array_pop($goodChars);
    $fromString = array_pop($badChars) . $goodChar;
    $toString = $replacement . $charMap[$goodChar];

    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
  }
}

// Test that every key of `$truncated`, taken as a complete input string, is
// identified as invalid and converts to just `$replacement`
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
  $truncatedChars = array_keys($truncated);
  foreach ($truncatedChars as $truncatedChar) {
    testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
  }
}

// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// `$startBytes` maps from first-byte values to the corresponding character length
// (For encodings where the first byte can tell you the length of a multi-byte
// character)
// Note that `$startBytes` can be partial!
//
// On return, the keys of `$invalid` are byte sequences which are not valid
// characters, and the keys of `$truncated` are byte sequences which are
// incomplete: either a proper prefix of some valid character, or shorter than
// the length which `$startBytes` gives for their first byte.
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
  $invalid = array();
  $truncated = array();
  $prefixes = array(); /* All sequences which are not (but can start) a valid character */

  foreach ($valid as $char => $unicode) {
    for ($len = 1; $len < strlen($char); $len++)
      $prefixes[substr($char, 0, $len)] = true;
  }

  // Extend `$prefix` by every possible byte. Anything which is a known prefix
  // of a valid character is recorded as truncated and explored further;
  // anything else which is not valid is recorded as invalid.
  $varLength = function($prefix) use($valid, $prefixes, &$invalid, &$truncated, &$varLength) {
    for ($byte = 0; $byte < 256; $byte++) {
      $str = $prefix . chr($byte);
      if (!isset($valid[$str])) {
        if (isset($prefixes[$str])) {
          $truncated[$str] = true;
          $varLength($str);
        } else {
          $invalid[$str] = true;
        }
      }
    }
  };

  // Used when the first byte fixes the character length: `$remaining` is the
  // number of bytes still missing from `$prefix`. Every incomplete prefix is
  // recorded as truncated, and every full-length sequence which is not valid
  // is recorded as invalid.
  $fixedLength = function($prefix, $remaining) use($valid, $prefixes, &$invalid, &$truncated, &$fixedLength) {
    if ($remaining == 0) {
      if (!isset($valid[$prefix]))
        $invalid[$prefix] = true;
    } else if ($remaining == 1) {
      $truncated[$prefix] = true;
      for ($i = 0; $i < 256; $i++) {
        $str = $prefix . chr($i);
        if (!isset($valid[$str]))
          $invalid[$str] = true;
      }
    } else {
      $truncated[$prefix] = true;
      for ($i = 0; $i < 256; $i++)
        $fixedLength($prefix . chr($i), $remaining - 1);
    }
  };

  for ($byte = 0; $byte < 256; $byte++) {
    if (isset($startBytes[$byte])) {
      $fixedLength(chr($byte), $startBytes[$byte] - 1);
    } else {
      $str = chr($byte);
      if (!isset($valid[$str])) {
        if (isset($prefixes[$str])) {
          $truncated[$str] = true;
          $varLength($str);
        } else {
          $invalid[$str] = true;
        }
      }
    }
  }
}

// Test conversion between `$encoding` and UTF-16BE in both directions, driven
// by the conversion table at `$path`
//
// `$replacement` is what an invalid UTF-16BE sequence is expected to become
// when converted to `$encoding`. `$startBytes` is passed on to
// findInvalidChars for the `$encoding` side.
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
  srand(1000); // Make results consistent
  mb_substitute_character(0x25); // '%'
  readConversionTable($path, $toUnicode, $fromUnicode);

  findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
  testAllValidChars($toUnicode, $encoding, 'UTF-16BE');
  testAllInvalidChars($invalid, $toUnicode, $encoding, 'UTF-16BE', "\x00%");
  testTruncatedChars($truncated, $encoding, 'UTF-16BE', "\x00%");
  echo "Tested $encoding -> UTF-16BE\n";

  // Every UTF-16BE code unit here is treated as exactly 2 bytes long
  findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
  convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', $encoding, $replacement);
  echo "Tested UTF-16BE -> $encoding\n";
}