1<?php
2
3// Common code for tests which focus on conversion and verification of text
4// in some specific encoding
5
// Read a conversion table: a text file with one character (written as a hex
// byte sequence) and its equivalent Unicode codepoint on each line, delimited
// by a tab. Lines starting with '#' and lines which do not match
// `0x<hex>\t0x<hex>` are ignored, as are mappings to U+FFFD.
//
// On return, `$from` maps each character (as a binary string) to its codepoint
// packed as a big-endian 16-bit value (or 32-bit value, if `$utf32` is true),
// and `$to` holds the reverse mapping.
function readConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = array();
    $to   = array();

    // The table is only ever read, so open it read-only; this also works when
    // the file is not writable
    $fp = fopen($path, 'r');
    if ($fp === false)
        die("Could not open conversion table: $path");

    // Compare against false explicitly so that a falsy line (such as a final
    // "0" with no newline) does not end the loop early
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] == '#')
            continue;
        if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) {
            // Skip codepoints that do not have a mapping (e.g. in BIG5.txt)
            if ($codepoint === 0xFFFD) {
                continue;
            }
            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
            if ($char == PHP_INT_MAX) {
                // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
                // (which can't be represented in a PHP integer)
                // Re-parse the hex digits after the leading "0x", one byte at a time
                $char = "";
                $lineLength = strlen($line);
                for ($i = 2; $i < $lineLength; $i += 2) {
                    $substr = substr($line, $i, 2);
                    if (ctype_xdigit($substr))
                        $char .= chr(hexdec($substr));
                    else
                        break;
                }
            } else {
                if ($char <= 0xFF)
                    $char = chr($char); // hex codes must not have leading zero bytes
                else if ($char <= 0xFFFF)
                    $char = pack('n', $char);
                else if ($char <= 0xFFFFFF)
                    $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
                else
                    $char = pack('N', $char);
            }
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        }
    }

    fclose($fp);
}
48
// Format a string for a failure message: its hex dump in parentheses,
// preceded by the string itself in double quotes if it is valid ASCII
function dbgPrint($str) {
    $quoted = mb_check_encoding($str, 'ASCII') ? '"' . $str . '" ' : '';
    return $quoted . "(" . bin2hex($str) . ")";
}
55
// Abort the test run unless mb_check_encoding accepts `$goodString` as valid
// in `$encoding`
function identifyValidString($goodString, $encoding) {
    if (mb_check_encoding($goodString, $encoding))
        return;
    die("mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString));
}
61
// Abort the test run if mb_check_encoding accepts `$badString` as valid in
// `$encoding`
function identifyInvalidString($badString, $encoding) {
    if (!mb_check_encoding($badString, $encoding))
        return;
    die("mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString));
}
67
// Convert `$fromString` from `$fromEncoding` to `$toEncoding` and abort the
// test run unless the output is byte-for-byte identical to `$toString`
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $actual = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
    if ($actual === $toString)
        return;
    die("mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . dbgPrint($actual));
}
73
// Check a conversion which is expected to succeed cleanly: the output must
// equal `$toString`, and the 'illegal_chars' value reported by mb_get_info
// must be the same after the conversion as before it
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $before = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $after = mb_get_info('illegal_chars');
    if ($after === $before)
        return;
    die("mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
80
// Check that `$fromString` converts cleanly to `$toString`; unless `$bothWays`
// is false, also check the reverse conversion
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (!$bothWays)
        return;
    testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}
86
// Check a conversion of invalid input: the output must equal `$toString`, and
// the 'illegal_chars' value reported by mb_get_info must have increased
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    $before = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $after = mb_get_info('illegal_chars');
    if ($after > $before)
        return;
    die("mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding");
}
93
// Full check of a valid string: it must first be identified as valid in
// `$fromEncoding`, and then convert cleanly to `$toString` (and back again,
// unless `$bothWays` is false)
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    // Validation comes first, so a bad fixture is reported as such
    identifyValidString($fromString, $fromEncoding);

    convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}
98
// Full check of an invalid string: it must first be identified as invalid in
// `$fromEncoding`, and then convert to `$toString` while being counted as
// containing illegal characters
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    // Validation comes first, so a bad fixture is reported as such
    identifyInvalidString($fromString, $fromEncoding);

    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}
103
// Test every character in `$charMap` (a map from characters in `$fromEncoding`
// to the same characters in `$toEncoding`), in shuffled order and grouped into
// strings of 5-10 characters each.
//
// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
    $remaining = array_keys($charMap);
    shuffle($remaining);

    while (count($remaining) > 0) {
        $fromString = '';
        $toString   = '';

        // Take the next 5-10 characters, or however many are left
        for ($n = min(rand(5,10), count($remaining)); $n > 0; $n--) {
            $char        = array_pop($remaining);
            $fromString .= $char;
            $toString   .= $charMap[$char];
        }

        testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
    }
}
121
// For each key of `$badChars` (taken from last to first), build a string
// consisting of that invalid sequence followed by a valid character drawn from
// `$charMap`, then check that the string is identified as invalid and that it
// converts to `$replacement` followed by the converted valid character
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $pool = array();
    foreach (array_reverse(array_keys($badChars)) as $badChar) {
        // Refill the pool of valid characters whenever it runs dry
        if (count($pool) === 0) {
            $pool = array_keys($charMap);
            shuffle($pool);
        }
        $goodChar = array_pop($pool);

        testInvalidString($badChar . $goodChar, $replacement . $charMap[$goodChar], $fromEncoding, $toEncoding);
    }
}
137
// For each key of `$badChars` (taken from last to first), build a string
// consisting of that invalid sequence followed by a valid character drawn from
// `$charMap`, then check that it converts to `$replacement` followed by the
// converted valid character. Unlike testAllInvalidChars, the string is not
// passed through mb_check_encoding first
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $pool = array();
    foreach (array_reverse(array_keys($badChars)) as $badChar) {
        // Refill the pool of valid characters whenever it runs dry
        if (count($pool) === 0) {
            $pool = array_keys($charMap);
            shuffle($pool);
        }
        $goodChar = array_pop($pool);

        convertInvalidString($badChar . $goodChar, $replacement . $charMap[$goodChar], $fromEncoding, $toEncoding);
    }
}
153
// Check that each key of `$truncated`, taken as a complete input string, is
// identified as invalid and converts to just `$replacement`
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
    foreach ($truncated as $partialChar => $unused) {
        testInvalidString($partialChar, $replacement, $fromEncoding, $toEncoding);
    }
}
160
// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// Given `$valid` (whose keys are all the valid characters), fill `$invalid`
// with byte sequences which are not valid characters and cannot begin one, and
// `$truncated` with byte sequences which are not valid characters themselves
// but are the beginning of one. Both are keyed by byte sequence.
//
// `$startBytes` maps from first-byte values to the corresponding character length
// (For encodings where the first byte can tell you the length of a multi-byte
// character)
// Note that `$startBytes` can be partial!
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
    $invalid   = array();
    $truncated = array();

    /* All sequences which are not (but can start) a valid character */
    $prefixes = array();
    foreach (array_keys($valid) as $char) {
        $charLength = strlen($char);
        for ($len = 1; $len < $charLength; $len++)
            $prefixes[substr($char, 0, $len)] = true;
    }

    // First bytes with no declared length: sort a candidate sequence into
    // `$invalid` or `$truncated`, and explore every one-byte extension of a
    // truncated sequence
    $classify = function($candidate) use($valid, $prefixes, &$invalid, &$truncated, &$extend) {
        if (isset($valid[$candidate]))
            return;
        if (!isset($prefixes[$candidate])) {
            $invalid[$candidate] = true;
            return;
        }
        $truncated[$candidate] = true;
        $extend($candidate);
    };
    $extend = function($prefix) use(&$classify) {
        for ($byte = 0; $byte < 256; $byte++)
            $classify($prefix . chr($byte));
    };

    // First bytes with a declared length: every sequence shorter than that
    // length is truncated, and every full-length sequence is either valid or
    // invalid
    $fixedLength = function($prefix, $remaining) use($valid, &$invalid, &$truncated, &$fixedLength) {
        if ($remaining == 0) {
            if (!isset($valid[$prefix]))
                $invalid[$prefix] = true;
            return;
        }
        $truncated[$prefix] = true;
        for ($byte = 0; $byte < 256; $byte++)
            $fixedLength($prefix . chr($byte), $remaining - 1);
    };

    for ($byte = 0; $byte < 256; $byte++) {
        $first = chr($byte);
        if (isset($startBytes[$byte]))
            $fixedLength($first, $startBytes[$byte] - 1);
        else
            $classify($first);
    }
}
226
// Run the whole suite of checks for `$encoding`, driven by the conversion
// table at `$path`:
//
// - `$encoding` -> UTF-16BE: all valid characters (in both directions), all
//   invalid sequences and all truncated sequences
// - UTF-16BE -> `$encoding`: all 2-byte sequences which the table does not map
//
// `$replacement` is the output expected in `$encoding` for an unmappable
// UTF-16BE sequence; `$startBytes` is passed on to findInvalidChars
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
    srand(1000); // Make results consistent
    mb_substitute_character(0x25); // '%'
    readConversionTable($path, $toUnicode, $fromUnicode);

    $utf16            = 'UTF-16BE';
    $utf16Replacement = "\x00%"; // the substitute character '%', as UTF-16BE

    findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
    testAllValidChars($toUnicode, $encoding, $utf16);
    testAllInvalidChars($invalid, $toUnicode, $encoding, $utf16, $utf16Replacement);
    testTruncatedChars($truncated, $encoding, $utf16, $utf16Replacement);
    echo "Tested $encoding -> UTF-16BE\n";

    // In this direction, every first byte starts a 2-byte sequence
    $twoBytesEach = array_fill_keys(range(0,0xFF), 2);
    findInvalidChars($fromUnicode, $invalid, $unused, $twoBytesEach);
    convertAllInvalidChars($invalid, $fromUnicode, $utf16, $encoding, $replacement);
    echo "Tested UTF-16BE -> $encoding\n";
}
242?>
243