xref: /php-src/ext/mbstring/ucgendat/uctest.php (revision 02294f0c)
1#!/usr/bin/env php
2<?php error_reporting(E_ALL);
3
// The Unicode data files are expected to sit in the same directory as this script.
$dir = __DIR__;
$unicodeDataFile = $dir . '/UnicodeData.txt';
$caseFoldingFile = $dir . '/CaseFolding.txt';
$specialCasingFile = $dir . '/SpecialCasing.txt';

// Stop at the first missing input file; nothing is tested in that case.
$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile];
foreach ($files as $file) {
    if (file_exists($file)) {
        continue;
    }

    echo "File $file does not exist.\n";
    return;
}

// Run each check against the full contents of its data file.
$unicodeData = file_get_contents($unicodeDataFile);
testUnicodeData($unicodeData);

$caseFolding = file_get_contents($caseFoldingFile);
testCaseFolding($caseFolding);

$specialCasing = file_get_contents($specialCasingFile);
testSpecialCasing($specialCasing);
20
/**
 * Lazily parses a semicolon-separated data file.
 *
 * Everything from the first '#' on a line is discarded as a comment, lines
 * that are empty after trimming are skipped, and each remaining line is
 * yielded as an array of whitespace-trimmed fields.
 *
 * @param string $input Full contents of the data file.
 * @return Generator Yields one array of string fields per data line.
 */
function parseDataFile(string $input) {
    foreach (explode("\n", $input) as $rawLine) {
        // strstr(..., true) gives the part before the first '#', or false
        // when the line carries no comment at all.
        $content = strstr($rawLine, '#', true);
        if ($content === false) {
            $content = $rawLine;
        }

        $content = trim($content);
        if ($content !== '') {
            yield array_map('trim', explode(';', $content));
        }
    }
}
39
/**
 * Converts a whitespace-separated list of hexadecimal code points into integers.
 *
 * Tokens are separated by any run of whitespace, and empty tokens are dropped,
 * so repeated, leading or trailing spaces (or an empty string) no longer
 * produce spurious U+0000 entries.
 *
 * @param string $strCodes Hex code points, e.g. "0069 0307".
 * @return int[] The code points in input order; empty when no token is present.
 */
function parseCodes(string $strCodes) : array {
    $tokens = preg_split('/\s+/', $strCodes, -1, PREG_SPLIT_NO_EMPTY);
    if ($tokens === false) {
        // preg_split() only returns false on a PCRE failure.
        return [];
    }

    $codes = [];
    foreach ($tokens as $token) {
        $codes[] = intval($token, 16);
    }
    return $codes;
}
47
/**
 * Converts a single code point with mb_convert_case() and reports a mismatch.
 *
 * Prints one line to standard output when the converted string differs from
 * the string formed by $newCodes; prints nothing when they are identical.
 *
 * @param int   $type     Conversion mode passed to mb_convert_case().
 * @param int   $origCode Code point to convert.
 * @param int[] $newCodes Expected resulting code points, in order.
 */
function testCaseMap($type, int $origCode, array $newCodes) {
    $expected = implode('', array_map('mb_chr', $newCodes));
    $actual = mb_convert_case(mb_chr($origCode), $type);

    if ($actual === $expected) {
        return;
    }

    echo "$type: $actual != $expected\n";
}
60
/**
 * Checks a one-to-one case mapping.
 *
 * A $newCode of 0 means "no mapping given": the code point is then expected
 * to convert to itself.
 *
 * @param int $type     Conversion mode passed on to testCaseMap().
 * @param int $origCode Code point to convert.
 * @param int $newCode  Expected code point, or 0 for no mapping.
 */
function testSimpleCaseMap($type, int $origCode, int $newCode) {
    $expected = $newCode !== 0 ? $newCode : $origCode;
    testCaseMap($type, $origCode, [$expected]);
}
68
/**
 * Checks the simple upper- and lower-case mappings listed in UnicodeData.txt.
 *
 * Each record is expected to have 15 fields: field 0 is the code point and
 * fields 12 and 13 are its simple upper- and lower-case mappings (all hex).
 * Mismatches are reported by testCaseMap().
 *
 * @param string $input Full contents of UnicodeData.txt.
 */
function testUnicodeData(string $input) {
    foreach (parseDataFile($input) as $fields) {
        assert(count($fields) == 15);

        $code = intval($fields[0], 16);
        // An empty mapping field becomes 0, which testSimpleCaseMap() treats
        // as "maps to itself".
        $upperCase = intval($fields[12], 16);
        $lowerCase = intval($fields[13], 16);
        testSimpleCaseMap(MB_CASE_UPPER_SIMPLE, $code, $upperCase);
        testSimpleCaseMap(MB_CASE_LOWER_SIMPLE, $code, $lowerCase);

        // Unfortunately MB_CASE_TITLE does not actually return the title case, even when passed
        // only a single character. It does ad-hoc magic based on the character class, so that
        // certain characters, such as roman numerals or circled characters will not be
        // title-cased. The title-case mapping (field 14, falling back to the upper-case
        // mapping) is therefore not checked with MB_CASE_TITLE_SIMPLE here.

        // Also run the full upper-case and case-fold conversions for every
        // code point. Their results are not compared against anything.
        $chr = mb_chr($code);
        mb_strtoupper($chr);
        mb_convert_case($chr, MB_CASE_FOLD);
    }
}
96
/**
 * Checks the case foldings listed in CaseFolding.txt.
 *
 * Each record is expected to have 4 fields (the last one is empty once the
 * trailing comment is stripped): code point, status, mapping.
 * Status 'C' and 'S' records are checked as simple (single code point)
 * foldings, status 'F' records as full foldings; any other status is skipped.
 *
 * @param string $input Full contents of CaseFolding.txt.
 */
function testCaseFolding(string $input) {
    foreach (parseDataFile($input) as $fields) {
        assert(count($fields) == 4);

        $code = intval($fields[0], 16);
        $mapping = $fields[2];

        switch ($fields[1]) {
            case 'C':
            case 'S':
                testSimpleCaseMap(MB_CASE_FOLD_SIMPLE, $code, intval($mapping, 16));
                break;

            case 'F':
                testCaseMap(MB_CASE_FOLD, $code, parseCodes($mapping));
                break;
        }
    }
}
112
/**
 * Checks the unconditional full case mappings listed in SpecialCasing.txt.
 *
 * Each record is expected to have at least 5 fields: code point, lower,
 * title and upper mappings, then a condition list. Records with a non-empty
 * condition are skipped.
 *
 * @param string $input Full contents of SpecialCasing.txt.
 */
function testSpecialCasing(string $input) {
    foreach (parseDataFile($input) as $fields) {
        assert(count($fields) >= 5);

        $code = intval($fields[0], 16);
        [$lower, $title, $upper] = array_map(
            'parseCodes',
            [$fields[1], $fields[2], $fields[3]]
        );

        if ($fields[4]) {
            // We don't support conditional mappings
            continue;
        }

        // Checked in this order: lower, upper, title.
        $expectations = [
            [MB_CASE_LOWER, $lower],
            [MB_CASE_UPPER, $upper],
            [MB_CASE_TITLE, $title],
        ];
        foreach ($expectations as [$mode, $codes]) {
            testCaseMap($mode, $code, $codes);
        }
    }
}
133