<?php

// Common code for tests which focus on conversion and verification of text
// in some specific encoding
//
// NOTE: do not add declare(strict_types=1) to this file. PHP stores array keys
// such as "1" as integers, and the helpers below pass such keys straight to
// string functions (strlen, substr, mb_*), relying on implicit conversion.

// Read a file with one character and its equivalent Unicode codepoint on each
// line, delimited by tabs
//
// $path  - path of the conversion table
// $from  - (out) map: character in the tested encoding => codepoint, packed as
//          big-endian 16-bit (default) or 32-bit (when $utf32 is true)
// $to    - (out) the reverse map
// $utf32 - pack codepoints with 'N' (32-bit) instead of 'n' (16-bit)
//
// Lines starting with '#' and lines which do not match "0x<hex>\t0x<hex>" are
// ignored. Dies if the table cannot be opened.
function readConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = array();
    $to = array();

    // The table is only read, so do not ask for write access
    $fp = fopen($path, 'r');
    if ($fp === false) {
        die("Could not open conversion table: $path");
    }

    // Compare against false explicitly; a final line of "0" is not end-of-file
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] == '#') {
            continue;
        }
        if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) != 2) {
            continue;
        }
        // Skip codepoints that do not have a mapping (e.g. in BIG5.txt)
        if ($codepoint === 0xFFFD) {
            continue;
        }

        $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);

        if ($char == PHP_INT_MAX) {
            // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
            // (which can't be represented in a PHP integer)
            // Decode the hex digits by hand, two at a time, starting after the "0x"
            $char = "";
            $lineLength = strlen($line);
            for ($i = 2; $i < $lineLength; $i += 2) {
                $substr = substr($line, $i, 2);
                if (!ctype_xdigit($substr)) {
                    break;
                }
                $char .= chr(hexdec($substr));
            }
        } else if ($char <= 0xFF) {
            $char = chr($char); // hex codes must not have leading zero bytes
        } else if ($char <= 0xFFFF) {
            $char = pack('n', $char);
        } else if ($char <= 0xFFFFFF) {
            $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
        } else {
            $char = pack('N', $char);
        }

        $from[$char] = $codepoint;
        $to[$codepoint] = $char;
    }

    fclose($fp);
}

// Format a string for a failure message: the hex dump in parentheses,
// preceded by the quoted string itself when it is valid ASCII
function dbgPrint($str) {
    $result = '';
    if (mb_check_encoding($str, 'ASCII')) {
        $result .= '"' . $str . '" ';
    }
    return $result . "(" . bin2hex($str) . ")";
}

// Die unless mb_check_encoding accepts $goodString as valid $encoding
function identifyValidString($goodString, $encoding) {
    $result = mb_check_encoding($goodString, $encoding);
    if (!$result) {
        die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
    }
}

// Die unless mb_check_encoding rejects $badString as invalid $encoding
function identifyInvalidString($badString, $encoding) {
    $result = mb_check_encoding($badString, $encoding);
    if ($result) {
        die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
    }
}

// Die unless converting $fromString from $fromEncoding to $toEncoding yields
// exactly $toString
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $result = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
    if ($result !== $toString) {
        die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($result));
    }
}

// Like testConversion, and additionally die if the conversion changed
// mbstring's 'illegal_chars' counter
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $illegalChars = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (mb_get_info('illegal_chars') !== $illegalChars) {
        die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
    }
}

// Check a valid conversion, and (unless $bothWays is false) also the reverse
// conversion from $toString back to $fromString
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if ($bothWays) {
        testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
    }
}

// Like testConversion, and additionally die if the conversion did NOT
// increase mbstring's 'illegal_chars' counter
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    $illegalChars = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (mb_get_info('illegal_chars') <= $illegalChars) {
        die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
    }
}

// Check that a string is both identified as valid and converted correctly
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    identifyValidString($fromString, $fromEncoding);
    convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}

// Check that a string is both identified as invalid and converted to the
// expected output (with 'illegal_chars' being incremented)
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    identifyInvalidString($fromString, $fromEncoding);
    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}

// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
//
// $charMap maps each valid character in $fromEncoding to its $toEncoding form
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
    $goodChars = array_keys($charMap);
    shuffle($goodChars);

    // Try a long string
    $fromString = $toString = '';
    foreach ($goodChars as $goodChar) {
        $fromString .= $goodChar;
        $toString .= $charMap[$goodChar];
    }
    testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);

    // Try various shorter ones
    while (!empty($goodChars)) {
        $length = min(rand(5,10), count($goodChars));
        $fromString = $toString = '';
        while ($length--) {
            $goodChar = array_pop($goodChars);
            $fromString .= $goodChar;
            $toString .= $charMap[$goodChar];
        }

        testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
    }
}

// For each key of $badChars, test it followed by one valid character; the
// expected output is $replacement followed by the converted valid character.
// Valid characters are drawn from a shuffled list of $charMap keys, which is
// refilled whenever it runs out.
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $badChars = array_keys($badChars);
    $goodChars = array();
    while (!empty($badChars)) {
        if (empty($goodChars)) {
            $goodChars = array_keys($charMap);
            shuffle($goodChars);
        }
        $goodChar = array_pop($goodChars);
        $fromString = array_pop($badChars) . $goodChar;
        $toString = $replacement . $charMap[$goodChar];

        testInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
    }
}

// Same as testAllInvalidChars, but only checks the conversion; it does not
// check that mb_check_encoding rejects the input
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $badChars = array_keys($badChars);
    $goodChars = array();
    while (!empty($badChars)) {
        if (empty($goodChars)) {
            $goodChars = array_keys($charMap);
            shuffle($goodChars);
        }
        $goodChar = array_pop($goodChars);
        $fromString = array_pop($badChars) . $goodChar;
        $toString = $replacement . $charMap[$goodChar];

        convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
    }
}

// Test each key of $truncated on its own; each is expected to be invalid and
// to convert to just $replacement
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
    $truncatedChars = array_keys($truncated);
    foreach ($truncatedChars as $truncatedChar) {
        testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
    }
}

// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// `$valid` has the valid characters as its keys. On return, the keys of
// `$invalid` are byte sequences which are neither valid nor a prefix of a
// longer character, and the keys of `$truncated` are the proper prefixes
// which were visited.
//
// `$startBytes` maps from first-byte values to the corresponding character length
// (For encodings where the first byte can tell you the length of a multi-byte
// character)
// Note that `$startBytes` can be partial!
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
    $invalid = array();
    $truncated = array();
    $prefixes = array(); /* All sequences which are not (but can start) a valid character */

    foreach ($valid as $char => $unicode) {
        for ($len = 1; $len < strlen($char); $len++) {
            $prefixes[substr($char, 0, $len)] = true;
        }
    }

    // Extend $prefix by every possible byte; recurse into known prefixes
    $varLength = function($prefix) use($valid, $prefixes, &$invalid, &$truncated, &$varLength) {
        for ($byte = 0; $byte < 256; $byte++) {
            $str = $prefix . chr($byte);
            if (!isset($valid[$str])) {
                if (isset($prefixes[$str])) {
                    $truncated[$str] = true;
                    $varLength($str);
                } else {
                    $invalid[$str] = true;
                }
            }
        }
    };

    // Enumerate all sequences starting with $prefix which have exactly
    // $remaining more bytes; every shorter sequence counts as truncated
    $fixedLength = function($prefix, $remaining) use($valid, $prefixes, &$invalid, &$truncated, &$fixedLength) {
        if ($remaining == 0) {
            if (!isset($valid[$prefix])) {
                $invalid[$prefix] = true;
            }
        } else if ($remaining == 1) {
            $truncated[$prefix] = true;
            for ($i = 0; $i < 256; $i++) {
                $str = $prefix . chr($i);
                if (!isset($valid[$str])) {
                    $invalid[$str] = true;
                }
            }
        } else {
            $truncated[$prefix] = true;
            for ($i = 0; $i < 256; $i++) {
                $fixedLength($prefix . chr($i), $remaining - 1);
            }
        }
    };

    for ($byte = 0; $byte < 256; $byte++) {
        if (isset($startBytes[$byte])) {
            $fixedLength(chr($byte), $startBytes[$byte] - 1);
        } else {
            $str = chr($byte);
            if (!isset($valid[$str])) {
                if (isset($prefixes[$str])) {
                    $truncated[$str] = true;
                    $varLength($str);
                } else {
                    $invalid[$str] = true;
                }
            }
        }
    }
}

// Test conversion between $encoding and UTF-16BE in both directions, using
// the conversion table at $path
//
// $replacement is the expected output, in $encoding, for an invalid UTF-16BE
// sequence. $startBytes is passed on to findInvalidChars.
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
    srand(1000); // Make results consistent
    mb_substitute_character(0x25); // '%'
    readConversionTable($path, $toUnicode, $fromUnicode);

    findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
    testAllValidChars($toUnicode, $encoding, 'UTF-16BE');
    testAllInvalidChars($invalid, $toUnicode, $encoding, 'UTF-16BE', "\x00%");
    testTruncatedChars($truncated, $encoding, 'UTF-16BE', "\x00%");
    echo "Tested $encoding -> UTF-16BE\n";

    // Every byte value is declared as the start of a 2-byte sequence
    findInvalidChars($fromUnicode, $invalid, $unused, array_fill_keys(range(0,0xFF), 2));
    convertAllInvalidChars($invalid, $fromUnicode, 'UTF-16BE', $encoding, $replacement);
    echo "Tested UTF-16BE -> $encoding\n";
}