xref: /PHP-8.0/ext/mbstring/ucgendat/ucgendat.php (revision 36c79465)
1#!/usr/bin/env php
2<?php error_reporting(E_ALL);
3
4/**
5 * This is based on the ucgendat.c file from the OpenLDAP project, licensed as
6 * follows. This file is not necessary to build PHP. It's only necessary to
7 * rebuild unicode_data.h from Unicode ucd files.
8 *
9 * Example usage:
10 * php ucgendat.php UnicodeData.txt
11 */
12
13/* Copyright 1998-2007 The OpenLDAP Foundation.
14 * All rights reserved.
15 *
16 * Redistribution and use in source and binary forms, with or without
17 * modification, are permitted only as authorized by the OpenLDAP
18 * Public License.
19 *
20 * A copy of this license is available at
21 * <http://www.OpenLDAP.org/license.html>.
22 */
23
24/* Copyright 2001 Computing Research Labs, New Mexico State University
25 *
26 * Permission is hereby granted, free of charge, to any person obtaining a
27 * copy of this software and associated documentation files (the "Software"),
28 * to deal in the Software without restriction, including without limitation
29 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
30 * and/or sell copies of the Software, and to permit persons to whom the
31 * Software is furnished to do so, subject to the following conditions:
32 *
33 * The above copyright notice and this permission notice shall be included in
34 * all copies or substantial portions of the Software.
35 *
36 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
37 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
38 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
39 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
40 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
41 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
42 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
43 */
44
45if ($argc < 2) {
46    echo "Usage: php ucgendata.php ./datadir\n";
47    echo "./datadir must contain:\n";
48    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n";
49    return;
50}
51
52$dir = $argv[1];
53$unicodeDataFile = $dir . '/UnicodeData.txt';
54$caseFoldingFile = $dir . '/CaseFolding.txt';
55$specialCasingFile = $dir . '/SpecialCasing.txt';
56$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
57
58$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile];
59foreach ($files as $file) {
60    if (!file_exists($file)) {
61        echo "File $file does not exist.\n";
62        return;
63    }
64}
65
66$outputFile = __DIR__ . "/../unicode_data.h";
67
68$data = new UnicodeData;
69parseUnicodeData($data, file_get_contents($unicodeDataFile));
70parseCaseFolding($data, file_get_contents($caseFoldingFile));
71parseSpecialCasing($data, file_get_contents($specialCasingFile));
72parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
73file_put_contents($outputFile, generateData($data));
74
75class Range {
76    public $start;
77    public $end;
78
79    public function __construct(int $start, int $end) {
80        $this->start = $start;
81        $this->end = $end;
82    }
83}
84
85class UnicodeData {
86    public $propIndexes;
87    public $numProps;
88    public $propRanges;
89    public $caseMaps;
90    public $extraCaseData;
91
92    public function __construct() {
93        /*
94         * List of properties expected to be found in the Unicode Character Database.
95         */
96        $this->propIndexes = array_flip([
97            "Mn", "Mc", "Me", "Nd", "Nl", "No",
98            "Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
99            "Co", "Cn", "Lu", "Ll", "Lt", "Lm",
100            "Lo", "Pc", "Pd", "Ps", "Pe", "Po",
101            "Sm", "Sc", "Sk", "So", "L", "R",
102            "EN", "ES", "ET", "AN", "CS", "B",
103            "S", "WS", "ON", "Pi", "Pf", "AL",
104            "Cased", "Case_Ignorable"
105        ]);
106        $this->numProps = count($this->propIndexes);
107
108        $this->propRanges = array_fill(0, $this->numProps, []);
109        $this->caseMaps = [
110            'upper' => [],
111            'lower' => [],
112            'title' => [],
113            'fold' => [],
114        ];
115        $this->extraCaseData = [];
116    }
117
118    function propToIndex(string $prop) : int {
119        /* Deal with directionality codes introduced in Unicode 3.0. */
120        if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"])) {
121            /*
122             * Mark all of these as Other Neutral to preserve compatibility with
123             * older versions.
124             */
125            $prop = "ON";
126        }
127
128        if (!isset($this->propIndexes[$prop])) {
129            throw new Exception("Unknown property $prop");
130        }
131
132        return $this->propIndexes[$prop];
133    }
134
135    public function addProp(int $code, string $prop) {
136        $propIdx = self::propToIndex($prop);
137
138        // Check if this extends the last range
139        $ranges = $this->propRanges[$propIdx];
140        if (!empty($ranges)) {
141            $lastRange = $ranges[count($ranges) - 1];
142            if ($code === $lastRange->end + 1) {
143                $lastRange->end++;
144                return;
145            }
146        }
147
148        $this->propRanges[$propIdx][] = new Range($code, $code);
149    }
150
151    public function addPropRange(int $startCode, int $endCode, string $prop) {
152        $propIdx = self::propToIndex($prop);
153        $this->propRanges[$propIdx][] = new Range($startCode, $endCode);
154    }
155
156    public function addCaseMapping(string $case, int $origCode, int $mappedCode) {
157        $this->caseMaps[$case][$origCode] = $mappedCode;
158    }
159
160    public function compactRangeArray(array $ranges) : array {
161        // Sort by start codepoint
162        usort($ranges, function (Range $r1, Range $r2) {
163            return $r1->start <=> $r2->start;
164        });
165
166        $lastRange = new Range(-1, -1);
167        $newRanges = [];
168        foreach ($ranges as $range) {
169            if ($lastRange->end == -1) {
170                $lastRange = $range;
171            } else if ($range->start == $lastRange->end + 1) {
172                $lastRange->end = $range->end;
173            } else if ($range->start > $lastRange->end + 1) {
174                $newRanges[] = $lastRange;
175                $lastRange = $range;
176            } else {
177                throw new Exception(sprintf(
178                    "Overlapping ranges [%x, %x] and [%x, %x]",
179                    $lastRange->start, $lastRange->end,
180                    $range->start, $range->end
181                ));
182            }
183        }
184        if ($lastRange->end != -1) {
185            $newRanges[] = $lastRange;
186        }
187        return $newRanges;
188    }
189
190    public function compactPropRanges() {
191        foreach ($this->propRanges as &$ranges) {
192            $ranges = $this->compactRangeArray($ranges);
193        }
194    }
195}
196
197function parseDataFile(string $input) {
198    $lines = explode("\n", $input);
199    foreach ($lines as $line) {
200        // Strip comments
201        if (false !== $hashPos = strpos($line, '#')) {
202            $line = substr($line, 0, $hashPos);
203        }
204
205        // Skip empty lines
206        $line = trim($line);
207        if ($line === '') {
208            continue;
209        }
210
211        $fields = array_map('trim', explode(';', $line));
212        yield $fields;
213    }
214}
215
216function parseUnicodeData(UnicodeData $data, string $input) : void {
217    $lines = parseDataFile($input);
218    foreach ($lines as $fields) {
219        if (count($fields) != 15) {
220            throw new Exception("Line does not contain 15 fields");
221        }
222
223        $code = intval($fields[0], 16);
224
225        $name = $fields[1];
226        if ($name === '') {
227            throw new Exception("Empty name");
228        }
229
230        if ($name[0] === '<' && $name !== '<control>') {
231            // This is a character range
232            $lines->next();
233            $nextFields = $lines->current();
234            $nextCode = intval($nextFields[0], 16);
235
236            $generalCategory = $fields[2];
237            $data->addPropRange($code, $nextCode, $generalCategory);
238
239            $bidiClass = $fields[4];
240            $data->addPropRange($code, $nextCode, $bidiClass);
241            continue;
242        }
243
244        $generalCategory = $fields[2];
245        $data->addProp($code, $generalCategory);
246
247        $bidiClass = $fields[4];
248        $data->addProp($code, $bidiClass);
249
250        $upperCase = intval($fields[12], 16);
251        $lowerCase = intval($fields[13], 16);
252        $titleCase = intval($fields[14], 16) ?: $upperCase;
253        if ($upperCase) {
254            $data->addCaseMapping('upper', $code, $upperCase);
255        }
256        if ($lowerCase) {
257            $data->addCaseMapping('lower', $code, $lowerCase);
258        }
259        if ($titleCase) {
260            $data->addCaseMapping('title', $code, $titleCase);
261        }
262    }
263}
264
265function parseCodes(string $strCodes) : array {
266    $codes = [];
267    foreach (explode(' ', $strCodes) as $strCode) {
268        $codes[] = intval($strCode, 16);
269    }
270    return $codes;
271}
272
273function parseCaseFolding(UnicodeData $data, string $input) : void {
274    foreach (parseDataFile($input) as $fields) {
275        if (count($fields) != 4) {
276            throw new Exception("Line does not contain 4 fields");
277        }
278
279        $code = intval($fields[0], 16);
280        $status = $fields[1];
281        if ($status == 'T') {
282            // Use language-agnostic case folding
283            continue;
284        }
285
286        if ($status == 'C' || $status == 'S') {
287            $foldCode = intval($fields[2], 16);
288            if (!isset($data->caseMaps['fold'][$code])) {
289                $data->addCaseMapping('fold', $code, $foldCode);
290            } else {
291                // Add simple mapping to full mapping data
292                assert(is_array($data->caseMaps['fold'][$code]));
293                $data->caseMaps['fold'][$code][0] = $foldCode;
294            }
295        } else if ($status == 'F') {
296            $foldCodes = parseCodes($fields[2]);
297            $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code;
298            $data->caseMaps['fold'][$code] = array_merge([$code], $foldCodes);
299        } else {
300            assert(0);
301        }
302    }
303}
304
305function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void {
306    $simpleCaseCode = $data->caseMaps[$type][$code] ?? $code;
307    if (count($caseCodes) == 1) {
308        if ($caseCodes[0] != $simpleCaseCode) {
309            throw new Exception("Simple case code in special casing does not match");
310        }
311
312        // Special case: If a title-case character maps to itself, we may still have to store it,
313        // if there is a non-trivial upper-case mapping for it
314        if ($type == 'title' && $code == $caseCodes[0]
315                && ($data->caseMaps['upper'][$code] ?? $code) != $code) {
316            $data->caseMaps['title'][$code] = $code;
317        }
318        return;
319    }
320
321    if (count($caseCodes) > 3) {
322        throw new Exception("Special case mapping with more than 3 code points");
323    }
324
325    $data->caseMaps[$type][$code] = array_merge([$simpleCaseCode], $caseCodes);
326}
327
328function parseSpecialCasing(UnicodeData $data, string $input) : void {
329    foreach (parseDataFile($input) as $fields) {
330        if (count($fields) != 5 && count($fields) != 6) {
331            throw new Exception("Line does not contain 5 or 6 fields");
332        }
333
334        $code = intval($fields[0], 16);
335        $lower = parseCodes($fields[1]);
336        $title = parseCodes($fields[2]);
337        $upper = parseCodes($fields[3]);
338
339        $cond = $fields[4];
340        if ($cond) {
341            // Only use unconditional mappings
342            continue;
343        }
344
345        addSpecialCasing($data, 'lower', $code, $lower);
346        addSpecialCasing($data, 'upper', $code, $upper);
347
348        // Should happen last
349        addSpecialCasing($data, 'title', $code, $title);
350    }
351}
352
353function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
354    foreach (parseDataFile($input) as $fields) {
355        if (count($fields) != 2) {
356            throw new Exception("Line does not contain 2 fields");
357        }
358
359        $property = $fields[1];
360        if ($property != 'Cased' && $property != 'Case_Ignorable') {
361            continue;
362        }
363
364        $range = explode('..', $fields[0]);
365        if (count($range) == 2) {
366            $data->addPropRange(intval($range[0], 16), intval($range[1], 16), $property);
367        } else if (count($range) == 1) {
368            $data->addProp(intval($range[0], 16), $property);
369        } else {
370            throw new Exception("Invalid range");
371        }
372    }
373}
374
375function formatArray(array $values, int $width, string $format) : string {
376    $result = '';
377    $i = 0;
378    $c = count($values);
379    for ($i = 0; $i < $c; $i++) {
380        if ($i != 0) {
381            $result .= ',';
382        }
383
384        $result .= $i % $width == 0 ? "\n\t" : " ";
385        $result .= sprintf($format, $values[$i]);
386    }
387    return $result;
388}
389
390function formatShortHexArray(array $values, int $width) : string {
391    return formatArray($values, $width, "0x%04x");
392}
393function formatShortDecArray(array $values, int $width) : string {
394    return formatArray($values, $width, "% 5d");
395}
396function formatIntArray(array $values, int $width) : string {
397    return formatArray($values, $width, "0x%08x");
398}
399
400function generatePropData(UnicodeData $data) {
401    $data->compactPropRanges();
402
403    $propOffsets = [];
404    $idx = 0;
405    foreach ($data->propRanges as $ranges) {
406        $num = count($ranges);
407        $propOffsets[] = $num ? $idx : 0xffff;
408        $idx += 2*$num;
409    }
410
411    // Add sentinel for binary search
412    $propOffsets[] = $idx;
413
414    // TODO ucgendat.c pads the prop offsets to the next multiple of 4
415    // for rather debious reasons of alignment. This should probably be
416    // dropped
417    while (count($propOffsets) % 4 != 0) {
418        $propOffsets[] = 0;
419    }
420
421    $totalRanges = $idx;
422
423    $result = "";
424    $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n";
425    $result .= "static const unsigned short  _ucprop_offsets[] = {";
426    $result .= formatShortHexArray($propOffsets, 8);
427    $result .= "\n};\n\n";
428
429    $values = [];
430    foreach ($data->propRanges as $ranges) {
431        foreach ($ranges as $range) {
432            $values[] = $range->start;
433            $values[] = $range->end;
434        }
435    }
436
437    $result .= "static const unsigned int _ucprop_ranges[] = {";
438    $result .= formatIntArray($values, 4);
439    $result .= "\n};\n\n";
440    return $result;
441}
442
443function flatten(array $array) {
444    $result = [];
445    foreach ($array as $arr) {
446        foreach ($arr as $v) {
447            $result[] = $v;
448        }
449    }
450    return $result;
451}
452
453function prepareCaseData(UnicodeData $data) {
454    // Don't store titlecase if it's the same as uppercase
455    foreach ($data->caseMaps['title'] as $code => $titleCode) {
456        if ($titleCode == ($data->caseMaps['upper'][$code] ?? $code)) {
457            unset($data->caseMaps['title'][$code]);
458        }
459    }
460
461    // Store full (multi-char) case mappings in a separate table and only
462    // store an index into it
463    foreach ($data->caseMaps as $type => $caseMap) {
464        foreach ($caseMap as $code => $caseCode) {
465            if (is_array($caseCode)) {
466                // -1 because the first entry is the simple case mapping
467                $len = count($caseCode) - 1;
468                $idx = count($data->extraCaseData);
469                $data->caseMaps[$type][$code] = ($len << 24) | $idx;
470
471                foreach ($caseCode as $c) {
472                    $data->extraCaseData[] = $c;
473                }
474            }
475        }
476    }
477}
478
479function generateCaseMPH(string $name, array $map) {
480    $prefix = "_uccase_" . $name;
481    list($gTable, $table) = generateMPH($map, $fast = false);
482    echo "$name: n=", count($table), ", g=", count($gTable), "\n";
483
484    $result = "";
485    $result .= "static const unsigned {$prefix}_g_size = " . count($gTable) . ";\n";
486    $result .= "static const short {$prefix}_g[] = {";
487    $result .= formatShortDecArray($gTable, 8);
488    $result .= "\n};\n\n";
489    $result .= "static const unsigned {$prefix}_table_size = " . count($table) . ";\n";
490    $result .= "static const unsigned {$prefix}_table[] = {";
491    $result .= formatIntArray(flatten($table), 4);
492    $result .= "\n};\n\n";
493    return $result;
494}
495
496function generateCaseData(UnicodeData $data) {
497    prepareCaseData($data);
498
499    $result = "";
500    $result .= generateCaseMPH('upper', $data->caseMaps['upper']);
501    $result .= generateCaseMPH('lower', $data->caseMaps['lower']);
502    $result .= generateCaseMPH('title', $data->caseMaps['title']);
503    $result .= generateCaseMPH('fold', $data->caseMaps['fold']);
504    $result .= "static const unsigned _uccase_extra_table[] = {";
505    $result .= formatIntArray($data->extraCaseData, 4);
506    $result .= "\n};\n\n";
507    return $result;
508}
509
510function generateData(UnicodeData $data) {
511    $result = <<<'HEADER'
512/* This file was generated from a modified version UCData's ucgendat.
513 *
514 *                     DO NOT EDIT THIS FILE!
515 *
516 * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
517 * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
518 * files from  http://www.unicode.org/Public/ and run this program.
519 *
520 * More information can be found in the UCData package. Unfortunately,
521 * the project's page doesn't seem to be live anymore, so you can use
522 * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
523HEADER;
524    $result .= "\n\n" . generatePropData($data);
525    $result .= generateCaseData($data);
526
527    return $result;
528}
529
530/*
531 * Minimal Perfect Hash Generation
532 *
533 * Based on "Hash, displace, and compress" algorithm due to
534 * Belazzougui, Botelho and Dietzfelbinger.
535 *
536 * Hash function based on https://stackoverflow.com/a/12996028/385378.
537 * MPH implementation based on http://stevehanov.ca/blog/index.php?id=119.
538 */
539
540function hashInt(int $d, int $x) {
541    $x ^= $d;
542    $x = (($x >> 16) ^ $x) * 0x45d9f3b;
543    return $x & 0xffffffff;
544}
545
546function tryGenerateMPH(array $map, int $gSize) {
547    $tableSize = count($map);
548    $table = [];
549    $gTable = array_fill(0, $gSize, 0x7fff);
550    $buckets = [];
551
552    foreach ($map as $k => $v) {
553        $h = hashInt(0, $k) % $gSize;
554        $buckets[$h][] = [$k, $v];
555    }
556
557    // Sort by descending number of collisions
558    usort($buckets, function ($b1, $b2) {
559        return -(count($b1) <=> count($b2));
560    });
561
562    foreach ($buckets as $bucket) {
563        $collisions = count($bucket);
564        if ($collisions <= 1) {
565            continue;
566        }
567
568        // Try values of $d until all elements placed in different slots
569        $d = 1;
570        $i = 0;
571        $used = [];
572        while ($i < $collisions) {
573            if ($d > 0x7fff) {
574                return [];
575            }
576
577            list($k) = $bucket[$i];
578            $slot = hashInt($d, $k) % $tableSize;
579            if (isset($table[$slot]) || isset($used[$slot])) {
580                $d++;
581                $i = 0;
582                $used = [];
583            } else {
584                $i++;
585                $used[$slot] = true;
586            }
587        }
588
589        $g = hashInt(0, $bucket[0][0]) % $gSize;
590        $gTable[$g] = $d;
591        foreach ($bucket as $elem) {
592            $table[hashInt($d, $elem[0]) % $tableSize] = $elem;
593        }
594    }
595
596    $freeSlots = [];
597    for ($i = 0; $i < $tableSize; $i++) {
598        if (!isset($table[$i])) {
599            $freeSlots[] = $i;
600        }
601    }
602
603    // For buckets with only one element, we directly store the index
604    $freeIdx = 0;
605    foreach ($buckets as $bucket) {
606        if (count($bucket) != 1) {
607            continue;
608        }
609
610        $elem = $bucket[0];
611        $slot = $freeSlots[$freeIdx++];
612        $table[$slot] = $elem;
613
614        $g = hashInt(0, $elem[0]) % $gSize;
615        $gTable[$g] = -$slot;
616    }
617
618    ksort($gTable);
619    ksort($table);
620
621    return [$gTable, $table];
622}
623
624function generateMPH(array $map, bool $fast) {
625    if ($fast) {
626        // Check size starting lambda=5.0 in 0.5 increments
627        for ($lambda = 5.0;; $lambda -= 0.5) {
628            $m = (int) (count($map) / $lambda);
629            $tmpMph = tryGenerateMPH($map, $m);
630            if (!empty($tmpMph)) {
631                $mph = $tmpMph;
632                break;
633            }
634        }
635    } else {
636        // Check all sizes starting lambda=7.0
637        $m = (int) (count($map) / 7.0);
638        for (;; $m++) {
639            $tmpMph = tryGenerateMPH($map, $m);
640            if (!empty($tmpMph)) {
641                $mph = $tmpMph;
642                break;
643            }
644        }
645    }
646
647    return $mph;
648}
649