1<?php
2
3// Common code for tests which focus on conversion and verification of text
4// in some specific encoding
5
// Number of test failures recorded so far (advanced by testFailedIncrement()).
$testFailedCounter = 0;
// Number of failures at which the test run is aborted.
// Set to -1 to disable the limit and execute all tests.
$testFailedLimit = 1000;
10
// Record one test failure. Once the number of recorded failures reaches
// $testFailedLimit, the whole script is terminated with a message.
// When $testFailedLimit is -1 there is no limit and nothing is counted.
function testFailedIncrement() {
    global $testFailedCounter, $testFailedLimit;

    if ($testFailedLimit !== -1) {
        $testFailedCounter++;
        if ($testFailedCounter >= $testFailedLimit) {
            exit("=== Failed test {$testFailedLimit} times exceeded, stop testing ===");
        }
    }
}
25
// Read a conversion table file. Each data line holds a character code in the
// tested encoding and its equivalent Unicode codepoint, both written as 0x-prefixed
// hex numbers and separated by a tab. Lines starting with '#' are comments.
//
// $path   Path of the table file to read
// $from   (out) Map from the character's bytes to the packed Unicode codepoint
// $to     (out) Map from the packed Unicode codepoint to the character's bytes
// $utf32  If true, codepoints are packed as 4 bytes big-endian; otherwise as
//         2 bytes big-endian
//
// Throws RuntimeException if the table file cannot be opened.
function readConversionTable($path, &$from, &$to, $utf32 = false) {
    $from = array();
    $to   = array();

    // The table is only read, so do not ask for write access to it
    $fp = fopen($path, 'r');
    if ($fp === false)
        throw new RuntimeException("Unable to open conversion table: $path");

    // Compare against false explicitly so that a falsy line such as "0" does not
    // end the loop early
    while (($line = fgets($fp, 256)) !== false) {
        if ($line[0] == '#')
            continue;
        if (sscanf($line, "0x%x\t0x%x", $char, $codepoint) == 2) {
            // Skip codepoints that do not have a mapping (e.g. in BIG5.txt)
            if ($codepoint === 0xFFFD) {
                continue;
            }
            $codepoint = $utf32 ? pack('N', $codepoint) : pack('n', $codepoint);
            if ($char == PHP_INT_MAX) {
                // We may be on a 32-bit machine and testing a text encoding with 4-byte codes
                // (which can't be represented in a PHP integer)
                // Rebuild the bytes directly from the hex digits after the leading "0x"
                $char = "";
                $lineLength = strlen($line);
                for ($i = 2; $i < $lineLength; $i += 2) {
                    $substr = substr($line, $i, 2);
                    if (ctype_xdigit($substr))
                        $char .= chr(hexdec($substr));
                    else
                        break;
                }
            } else {
                // Emit the shortest byte sequence which holds the code
                if ($char <= 0xFF)
                    $char = chr($char); // hex codes must not have leading zero bytes
                else if ($char <= 0xFFFF)
                    $char = pack('n', $char);
                else if ($char <= 0xFFFFFF)
                    $char = chr($char >> 16) . pack('n', $char & 0xFFFF);
                else
                    $char = pack('N', $char);
            }
            $from[$char] = $codepoint;
            $to[$codepoint] = $char;
        }
    }

    fclose($fp);
}
68
// Format a string for diagnostic output: always as a hex dump in parentheses,
// preceded by the quoted string itself when it is valid ASCII.
function dbgPrint($str) {
    $hexDump = "(" . bin2hex($str) . ")";
    if (!mb_check_encoding($str, 'ASCII'))
        return $hexDump;
    return '"' . $str . '" ' . $hexDump;
}
75
// Check that mb_check_encoding accepts $goodString as valid $encoding text;
// print a message and record a failure if it does not.
function identifyValidString($goodString, $encoding) {
    if (mb_check_encoding($goodString, $encoding))
        return;

    echo "mb_check_encoding failed on good $encoding string: " . dbgPrint($goodString) . PHP_EOL;
    testFailedIncrement();
}
83
// Check that mb_check_encoding rejects $badString as invalid $encoding text;
// print a message and record a failure if it is accepted.
function identifyInvalidString($badString, $encoding) {
    $result = mb_check_encoding($badString, $encoding);
    if ($result) {
        echo "mb_check_encoding passed on bad $encoding string: " . dbgPrint($badString) . PHP_EOL;
        // Count this failure like every other failure report in this file, so
        // that it also contributes to the failure limit
        testFailedIncrement();
    }
}
89
// Convert $fromString from $fromEncoding to $toEncoding and compare the result
// byte-for-byte with $toString. On a mismatch, print the input, the expected
// output and the actual output, and record a failure.
function testConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $result = mb_convert_encoding($fromString, $toEncoding, $fromEncoding);
    if ($result !== $toString) {
        // Pad the "Actually got: " line so that its dump lines up with the one on
        // the "Expected <encoding>: " line. The count is clamped at zero because
        // str_repeat() does not accept a negative count, which would otherwise be
        // passed for encoding names shorter than 3 characters.
        $padding = str_repeat(' ', max(0, strlen($toEncoding) - 3));
        echo "mb_convert_encoding not working on $fromEncoding input: " . dbgPrint($fromString) . "\nExpected $toEncoding: " . dbgPrint($toString) . "\nActually got: " . $padding . dbgPrint($result) . PHP_EOL;
        testFailedIncrement();
    }
}
97
// Run testConversion() on a string which is expected to be valid, and also
// verify that the conversion left mbstring's 'illegal_chars' counter unchanged.
function testValidConversion($fromString, $toString, $fromEncoding, $toEncoding) {
    $countBefore = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $countAfter = mb_get_info('illegal_chars');

    if ($countAfter === $countBefore)
        return;

    echo "mb_convert_encoding incremented illegal_chars on valid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding" . PHP_EOL;
    testFailedIncrement();
}
106
// Test conversion of a valid string from $fromEncoding to $toEncoding and,
// unless $bothWays is false, the reverse conversion as well.
function convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true) {
    testValidConversion($fromString, $toString, $fromEncoding, $toEncoding);
    if (!$bothWays)
        return;

    // Same pair of strings, converted in the opposite direction
    testValidConversion($toString, $fromString, $toEncoding, $fromEncoding);
}
112
// Run testConversion() on a string which is expected to be invalid, and also
// verify that the conversion increased mbstring's 'illegal_chars' counter.
function convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding) {
    $countBefore = mb_get_info('illegal_chars');
    testConversion($fromString, $toString, $fromEncoding, $toEncoding);
    $countAfter = mb_get_info('illegal_chars');

    if ($countAfter > $countBefore)
        return;

    echo "mb_convert_encoding did not increment illegal_chars on invalid $fromEncoding string: " . dbgPrint($fromString) . " when converting to $toEncoding" . PHP_EOL;
    testFailedIncrement();
}
121
// Full check of a valid string: it must be identified as valid in
// $fromEncoding, and it must convert to $toString in $toEncoding
// (and back again, when $bothWays is true).
function testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays = true)
{
    // Step 1: identification
    identifyValidString($fromString, $fromEncoding);

    // Step 2: conversion
    convertValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);
}
126
// Full check of an invalid string: it must be identified as invalid in
// $fromEncoding, and converting it to $toEncoding must produce $toString
// while being counted as containing illegal characters.
function testInvalidString($fromString, $toString, $fromEncoding, $toEncoding)
{
    // Step 1: identification
    identifyInvalidString($fromString, $fromEncoding);

    // Step 2: conversion
    convertInvalidString($fromString, $toString, $fromEncoding, $toEncoding);
}
131
// Test every character in $charMap (character in $fromEncoding => equivalent in
// $toEncoding), first all together in one long string and then in short strings.
//
// Only for encodings where valid characters can be concatenated together in any
// way, without any escape sequences
function testAllValidChars($charMap, $fromEncoding, $toEncoding, $bothWays = true) {
    $goodChars = array_keys($charMap);
    shuffle($goodChars);

    // Look up the expected conversion of each character of a list and join them
    $convertAll = function($chars) use ($charMap) {
        $converted = '';
        foreach ($chars as $char)
            $converted .= $charMap[$char];
        return $converted;
    };

    // Try a long string
    testValidString(implode('', $goodChars), $convertAll($goodChars), $fromEncoding, $toEncoding, $bothWays);

    // Try various shorter ones
    while (count($goodChars) > 0) {
        $length = min(rand(5,10), count($goodChars));

        // Take $length characters off the end of the list, last one first
        $chunk      = array_reverse(array_splice($goodChars, -$length));
        $fromString = implode('', $chunk);
        $toString   = $convertAll($chunk);

        testValidString($fromString, $toString, $fromEncoding, $toEncoding, $bothWays);

        // Both strings hold the same characters, so mb_strlen must agree on them
        $strLen = mb_strlen($fromString, $fromEncoding);
        if ($strLen === mb_strlen($toString, $toEncoding))
            continue;

        echo "Length of $fromEncoding string '" . bin2hex($fromString) . "' was different than expected; mb_strlen returned $strLen" . PHP_EOL;
        testFailedIncrement();
    }
}
163
// For each key of $badChars, build a string of that bad character followed by a
// valid character taken from $charMap, and run testInvalidString() on it. The
// expected output is $replacement followed by the valid character's conversion.
function testAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $goodChars = array();

    // Bad characters are processed from the last key to the first
    foreach (array_reverse(array_keys($badChars)) as $badChar) {
        // Start over with a freshly shuffled list once the valid characters run out
        if (count($goodChars) === 0) {
            $goodChars = array_keys($charMap);
            shuffle($goodChars);
        }
        $goodChar = array_pop($goodChars);

        testInvalidString($badChar . $goodChar, $replacement . $charMap[$goodChar], $fromEncoding, $toEncoding);
    }
}
179
// For each key of $badChars, build a string of that bad character followed by a
// valid character taken from $charMap, and run convertInvalidString() on it. The
// expected output is $replacement followed by the valid character's conversion.
function convertAllInvalidChars($badChars, $charMap, $fromEncoding, $toEncoding, $replacement) {
    $goodChars = array();

    // Bad characters are processed from the last key to the first
    foreach (array_reverse(array_keys($badChars)) as $badChar) {
        // Start over with a freshly shuffled list once the valid characters run out
        if (count($goodChars) === 0) {
            $goodChars = array_keys($charMap);
            shuffle($goodChars);
        }
        $goodChar = array_pop($goodChars);

        convertInvalidString($badChar . $goodChar, $replacement . $charMap[$goodChar], $fromEncoding, $toEncoding);
    }
}
195
// Run testInvalidString() on each key of $truncated, on its own; the expected
// conversion result for each of them is just $replacement.
function testTruncatedChars($truncated, $fromEncoding, $toEncoding, $replacement) {
    foreach ($truncated as $truncatedChar => $unused)
        testInvalidString($truncatedChar, $replacement, $fromEncoding, $toEncoding);
}
202
// For variable-width encodings, where we have an exhaustive list of
// all valid characters of any width
//
// $valid       Map whose keys are all the valid characters
// $invalid     (out) Keys are byte sequences which are not valid characters and
//              are not the start of one either
// $truncated   (out) Keys are byte sequences which were treated as the
//              incomplete start of a character
// $startBytes  Maps from first-byte values to the corresponding character length
//              (for encodings where the first byte can tell you the length of a
//              multi-byte character). This map can be partial!
function findInvalidChars($valid, &$invalid, &$truncated, $startBytes = array()) {
    $invalid   = array();
    $truncated = array();

    // Collect every proper prefix of every valid character
    $prefixes = array();
    foreach ($valid as $char => $unicode) {
        $charLength = strlen($char);
        for ($len = 1; $len < $charLength; $len++)
            $prefixes[substr($char, 0, $len)] = true;
    }

    // Used when the length is not known from the first byte. A sequence which is
    // not valid is either the start of a valid character (recorded as truncated,
    // and then extended by each possible byte in turn) or else invalid.
    $classify = function($str) use($valid, $prefixes, &$invalid, &$truncated, &$classify) {
        if (isset($valid[$str]))
            return;
        if (!isset($prefixes[$str])) {
            $invalid[$str] = true;
            return;
        }
        $truncated[$str] = true;
        for ($byte = 0; $byte < 256; $byte++)
            $classify($str . chr($byte));
    };

    // Used when the first byte tells us the character length. Everything shorter
    // than the full length is recorded as truncated; full-length sequences are
    // invalid unless they appear in $valid.
    $fixedLength = function($prefix, $remaining) use($valid, &$invalid, &$truncated, &$fixedLength) {
        if ($remaining == 0) {
            if (!isset($valid[$prefix]))
                $invalid[$prefix] = true;
            return;
        }
        $truncated[$prefix] = true;
        for ($byte = 0; $byte < 256; $byte++)
            $fixedLength($prefix . chr($byte), $remaining - 1);
    };

    for ($byte = 0; $byte < 256; $byte++) {
        $firstByte = chr($byte);
        if (isset($startBytes[$byte]))
            $fixedLength($firstByte, $startBytes[$byte] - 1);
        else
            $classify($firstByte);
    }
}
268
// Test an encoding against a conversion table which maps its characters to
// 2-byte big-endian Unicode values (as read by readConversionTable()).
//
// $path         Path of the conversion table file
// $encoding     Name of the encoding under test
// $replacement  Expected output, in $encoding, for an invalid UTF-16BE input
// $startBytes   Passed to findInvalidChars() for the tested encoding
function testEncodingFromUTF16ConversionTable($path, $encoding, $replacement = '%', $startBytes = array()) {
    srand(1000); // Make results consistent
    mb_substitute_character(0x25); // '%'

    $unicode            = 'UTF-16BE';
    $unicodeReplacement = "\x00%";

    readConversionTable($path, $toUnicode, $fromUnicode);

    // Tested encoding => UTF-16BE
    findInvalidChars($toUnicode, $invalid, $truncated, $startBytes);
    testAllValidChars($toUnicode, $encoding, $unicode);
    testAllInvalidChars($invalid, $toUnicode, $encoding, $unicode, $unicodeReplacement);
    testTruncatedChars($truncated, $encoding, $unicode, $unicodeReplacement);
    echo "Tested $encoding -> UTF-16BE\n";

    // UTF-16BE => tested encoding; every first byte is declared to start a
    // 2-byte sequence
    $twoBytesEach = array_fill_keys(range(0,0xFF), 2);
    findInvalidChars($fromUnicode, $invalidUnicode, $unused, $twoBytesEach);
    convertAllInvalidChars($invalidUnicode, $fromUnicode, $unicode, $encoding, $replacement);
    echo "Tested UTF-16BE -> $encoding\n";
}
284?>
285