1--TEST--
2Confirm error handling for UTF-8 complies with WHATWG spec
3--EXTENSIONS--
4mbstring
5--FILE--
6<?php
7/* The WHATWG specifies not just how web browsers should handle _valid_
8 * UTF-8 text, but how they should handle _invalid_ UTF-8 text (such
9 * as how many error markers each invalid byte sequence should decode
10 * to).
11 * That specification is followed by the JavaScript Encoding API.
12 *
13 * The API documentation for mb_convert_encoding does not specify how
14 * many error markers we will emit for each possible invalid byte
15 * sequence, so we might as well comply with the WHATWG specification.
16 *
17 * Thanks to Martin Auswöger for pointing this out... and another big
18 * thanks for providing test cases!
19 *
20 * Ref: https://encoding.spec.whatwg.org/#utf-8-decoder
21 */
// Render every decoding error as '%' (0x25) so that the number of error
// markers produced for each invalid sequence is visible in the output.
mb_substitute_character(0x25);

// Each entry pairs raw input bytes with the bytes expected back after a
// UTF-8 -> UTF-8 pass through mb_convert_encoding.
$expectations = [
  // Single bytes which cannot begin a sequence
  ["\x80", "%"],
  ["\xFF", "%"],

  // 2-byte sequences: second byte just below, at both ends of, and just
  // above the continuation-byte range
  ["\xC2\x7F", "%\x7F"],
  ["\xC2\x80", "\xC2\x80"],
  ["\xDF\xBF", "\xDF\xBF"],
  ["\xDF\xC0", "%%"],

  // 3-byte sequences: same boundaries, applied to the final byte
  ["\xE0\xA0\x7F", "%\x7F"],
  ["\xE0\xA0\x80", "\xE0\xA0\x80"],
  ["\xEF\xBF\xBF", "\xEF\xBF\xBF"],
  ["\xEF\xBF\xC0", "%%"],

  // 4-byte sequences: same boundaries, applied to the final byte
  ["\xF0\x90\x80\x7F", "%\x7F"],
  ["\xF0\x90\x80\x80", "\xF0\x90\x80\x80"],
  ["\xF4\x8F\xBF\xBF", "\xF4\x8F\xBF\xBF"],
  ["\xF4\x8F\xBF\xC0", "%%"],

  // 0xFA, 0xFB and 0xFD followed by continuation bytes: one error marker
  // is expected for every single input byte
  ["\xFA\x80\x80\x80\x80", "%%%%%"],
  ["\xFB\xBF\xBF\xBF\xBF", "%%%%%"],
  ["\xFD\x80\x80\x80\x80\x80", "%%%%%%"],
  ["\xFD\xBF\xBF\xBF\xBF\xBF", "%%%%%%"],
];

foreach ($expectations as [$input, $expected]) {
  $actual = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
  if ($actual === $expected) {
    continue;
  }

  // Abort on the first mismatch; hex-encode all three strings so that the
  // invalid bytes are readable in the failure output.
  $message = "Expected UTF-8 string " . bin2hex($input)
    . " to convert to UTF-8 string " . bin2hex($expected)
    . "; got " . bin2hex($actual);
  die($message);
}

echo "All done!\n";
53
54?>
55--EXPECT--
56All done!
57