1<?php
2
3// Common code for tests which focus on conversion and verification of text
4// in some specific encoding
5
// Read a conversion table: a text file in which each data line holds a
// character code in the tested encoding followed by its Unicode codepoint, both
// written as 0x-prefixed hex numbers and separated by a tab. Lines starting with
// '#' and lines which do not match that format are ignored.
//
// $path  - path of the table file
// $from  - (out) map from encoded character (as a byte string) to Unicode string
// $to    - (out) reverse map, from Unicode string to encoded character
// $utf32 - if true, each Unicode string is the codepoint packed as 4 big-endian
//          bytes; otherwise it is packed as 2 big-endian bytes
function readConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = array();
    $to   = array();

    // The table is only ever read, so do not demand write access to it
    $fp = fopen($path, 'r');
    if ($fp === false)
        die("Could not open conversion table: $path");

    // Compare against false explicitly so that a falsy line cannot end the loop early
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] == '#')
            continue;
        if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) != 2)
            continue;
        // Skip codepoints that do not have a mapping (e.g. in BIG5.txt)
        if ($codepoint === 0xFFFD)
            continue;

        $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
        if ($char == PHP_INT_MAX) {
            // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
            // (which can't be represented in a PHP integer)
            // Re-read the hex digits after the leading "0x" two at a time instead
            $char = "";
            for ($i = 2; $i < strlen($line); $i += 2) {
                $substr = substr($line, $i, 2);
                if (!ctype_xdigit($substr))
                    break;
                $char .= chr(hexdec($substr));
            }
        } else if ($char <= 0xFF) {
            $char = chr($char); // hex codes must not have leading zero bytes
        } else if ($char <= 0xFFFF) {
            $char = pack('n', $char);
        } else if ($char <= 0xFFFFFF) {
            $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
        } else {
            $char = pack('N', $char);
        }

        $from[$char] = $codepoint;
        $to[$codepoint] = $char;
    }

    fclose($fp);
}
48
// Render a string for diagnostic messages: its bytes as hex in parentheses,
// preceded by the quoted string itself when mb_check_encoding accepts it as ASCII
function dbgPrint($str) {
    $quoted = mb_check_encoding($str, 'ASCII') ? '"' . $str . '" ' : '';
    return $quoted . "(" . bin2hex($str) . ")";
}
55
// Require that mb_check_encoding accepts $goodString as $encoding; otherwise
// abort the script with a message showing the offending string
function identifyValidString($goodString, $encoding) {
    if (mb_check_encoding($goodString, $encoding))
        return;
    die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
}
61
// Require that mb_check_encoding rejects $badString as $encoding; otherwise
// abort the script with a message showing the offending string
function identifyInvalidString($badString, $encoding) {
    if (!mb_check_encoding($badString, $encoding))
        return;
    die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
}
67
// Convert $fromString from $fromEncoding to $toEncoding and require that the
// result is identical to $toString; on a mismatch, abort the script with a
// message showing the input, the expected output and the actual output
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $actual = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
    if ($actual === $toString)
        return;
    die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($actual));
}
73
// Run testConversion on a string which is expected to be valid, and also require
// that the 'illegal_chars' value reported by mb_get_info is unchanged afterwards
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $before = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $after = mb_get_info('illegal_chars');
    if ($after === $before)
        return;
    die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
80
// Check the conversion of a valid string from $fromEncoding to $toEncoding and,
// unless $bothWays is false, the reverse conversion as well
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (!$bothWays)
        return;
    // Reverse direction: the expected output becomes the input
    testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}
86
// Run testConversion on a string which is expected to be invalid, and also
// require that the 'illegal_chars' value reported by mb_get_info has grown
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    $before = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (mb_get_info('illegal_chars') > $before)
        return;
    die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
93
// Full check of a valid string: it must pass mb_check_encoding for
// $fromEncoding, and then convert to $toString (and back again, unless
// $bothWays is false)
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    // Validation first, so a bad fixture is reported as such rather than as a conversion failure
    identifyValidString($fromString, $fromEncoding);

    convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}
98
// Full check of an invalid string: it must be rejected by mb_check_encoding for
// $fromEncoding, and then convert to $toString while being counted as illegal
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    // Validation first, then the one-way conversion check
    identifyInvalidString($fromString, $fromEncoding);

    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}
103
// Test every character in $charMap (encoded character => expected conversion),
// first all together in one long string and then in short runs of up to 10
// characters, always in shuffled order
//
// Only usable for encodings where valid characters can be concatenated together
// in any way, without any escape sequences
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
    $remaining = array_keys($charMap);
    shuffle($remaining);

    // The long string: every character once, in shuffled order
    $source = $expected = '';
    foreach ($remaining as $ch) {
        $source   .= $ch;
        $expected .= $charMap[$ch];
    }
    testValidString($source, $expected, $fromEncoding, $toEncoding, $bothWays);

    // The short strings: keep cutting 5-10 characters off the end of the list
    // (or whatever is left), taking them last-first
    while (count($remaining) > 0) {
        $take  = min(rand(5,10), count($remaining));
        $batch = array_reverse(array_splice($remaining, -$take));

        $source = $expected = '';
        foreach ($batch as $ch) {
            $source   .= $ch;
            $expected .= $charMap[$ch];
        }

        testValidString($source, $expected, $fromEncoding, $toEncoding, $bothWays);
    }
}
129
// For each key of $badChars (taken last-first), build a string of that bad
// sequence followed by one valid character from $charMap, and run the full
// invalid-string check on it. The expected output is $replacement followed by
// the valid character's conversion. Valid characters are drawn from a shuffled
// copy of the $charMap keys, which is refilled whenever it runs out.
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $goodPool = array();
    foreach (array_reverse(array_keys($badChars)) as $bad) {
        if (count($goodPool) == 0) {
            $goodPool = array_keys($charMap);
            shuffle($goodPool);
        }
        $good = array_pop($goodPool);

        testInvalidString($bad . $good, $replacement . $charMap[$good], $fromEncoding, $toEncoding);
    }
}
145
// For each key of $badChars (taken last-first), build a string of that bad
// sequence followed by one valid character from $charMap, and check its
// conversion only (mb_check_encoding is not consulted). The expected output is
// $replacement followed by the valid character's conversion. Valid characters
// are drawn from a shuffled copy of the $charMap keys, which is refilled
// whenever it runs out.
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $goodPool = array();
    foreach (array_reverse(array_keys($badChars)) as $bad) {
        if (count($goodPool) == 0) {
            $goodPool = array_keys($charMap);
            shuffle($goodPool);
        }
        $good = array_pop($goodPool);

        convertInvalidString($bad . $good, $replacement . $charMap[$good], $fromEncoding, $toEncoding);
    }
}
161
// Run the full invalid-string check on every key of $truncated, each on its
// own, expecting the whole input to convert to just $replacement
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
    $sequences = array_keys($truncated);
    for ($i = 0, $total = count($sequences); $i < $total; $i++)
        testInvalidString($sequences[$i], $replacement, $fromEncoding, $toEncoding);
}
168
// For variable-width encodings, where `$valid` is an exhaustive list of all
// valid characters of any width (as array keys)
//
// Fills `$invalid` and `$truncated` with byte sequences as keys (value `true`):
//  - `$truncated`: sequences which are too short to be a character but could
//    still be continued into one
//  - `$invalid`: sequences which are not valid characters and are not recorded
//    as truncated
//
// `$startBytes` maps first-byte values to the length of the characters which
// start with that byte (for encodings where the first byte tells you the length
// of a multi-byte character). For those first bytes, every shorter sequence is
// recorded as truncated and every full-length sequence missing from `$valid` as
// invalid. `$startBytes` can be partial! For any first byte it does not list,
// the lengths are worked out from the prefixes of the characters in `$valid`.
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
    $invalid   = array();
    $truncated = array();

    /* Every proper prefix of a valid character */
    $prefixes = array();
    foreach ($valid as $char => $unicode) {
        $charLen = strlen($char);
        for ($len = 1; $len < $charLen; $len++)
            $prefixes[substr($char, 0, $len)] = true;
    }

    /* Unknown length: classify $str, and if it is a prefix, try every possible next byte */
    $explore = function($str) use($valid, $prefixes, &$invalid, &$truncated, &$explore) {
        if (isset($valid[$str]))
            return;
        if (!isset($prefixes[$str])) {
            $invalid[$str] = true;
            return;
        }
        $truncated[$str] = true;
        for ($next = 0; $next < 256; $next++)
            $explore($str . chr($next));
    };

    /* Known length: $remaining more bytes are needed to complete $prefix */
    $enumerate = function($prefix, $remaining) use($valid, &$invalid, &$truncated, &$enumerate) {
        if ($remaining == 0) {
            if (!isset($valid[$prefix]))
                $invalid[$prefix] = true;
            return;
        }
        $truncated[$prefix] = true;
        for ($next = 0; $next < 256; $next++)
            $enumerate($prefix . chr($next), $remaining - 1);
    };

    for ($first = 0; $first < 256; $first++) {
        if (isset($startBytes[$first]))
            $enumerate(chr($first), $startBytes[$first] - 1);
        else
            $explore(chr($first));
    }
}
234
// Test an encoding against the conversion table at $path, in both directions
// between $encoding and UTF-16BE:
//  - $encoding -> UTF-16BE: all valid characters (both ways), all invalid
//    sequences and all truncated sequences
//  - UTF-16BE -> $encoding: all 2-byte sequences which the table has no mapping
//    for, expecting $replacement in the output
// `$startBytes` is passed through to findInvalidChars for the first direction.
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
    srand(1000); // Fixed seed, so the shuffled test strings are the same on every run
    mb_substitute_character(0x25); // '%'
    readConversionTable($path, $encodedToUtf16, $utf16ToEncoded);

    // "\x00%" is '%' as a 2-byte big-endian unit
    findInvalidChars($encodedToUtf16, $badEncoded, $cutShort, $startBytes);
    testAllValidChars($encodedToUtf16, $encoding, 'UTF-16BE');
    testAllInvalidChars($badEncoded, $encodedToUtf16, $encoding, 'UTF-16BE', "\x00%");
    testTruncatedChars($cutShort, $encoding, 'UTF-16BE', "\x00%");
    echo "Tested $encoding -> UTF-16BE\n";

    // Treat every first byte as starting a 2-byte sequence; the truncated list is not used
    $twoBytesEach = array_fill(0, 256, 2);
    findInvalidChars($utf16ToEncoded, $badUtf16, $ignored, $twoBytesEach);
    convertAllInvalidChars($badUtf16, $utf16ToEncoded, 'UTF-16BE', $encoding, $replacement);
    echo "Tested UTF-16BE -> $encoding\n";
}
250?>
251