xref: /PHP-7.3/ext/mbstring/ucgendat/ucgendat.php (revision 3362620b)
1<?php error_reporting(E_ALL);
2
3/* This is based on the ucgendat.c file from the OpenLDAP project, licensed
4 * as follows. */
5
6/* Copyright 1998-2007 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available at
14 * <http://www.OpenLDAP.org/license.html>.
15 */
16
17/* Copyright 2001 Computing Research Labs, New Mexico State University
18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a
20 * copy of this software and associated documentation files (the "Software"),
21 * to deal in the Software without restriction, including without limitation
22 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
23 * and/or sell copies of the Software, and to permit persons to whom the
24 * Software is furnished to do so, subject to the following conditions:
25 *
26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software.
28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
36 */
37
38if ($argc < 2) {
39    echo "Usage: php ucgendata.php ./datadir\n";
40    echo "./datadir must contain:\n";
41    echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n";
42    return;
43}
44
45$dir = $argv[1];
46$unicodeDataFile = $dir . '/UnicodeData.txt';
47$caseFoldingFile = $dir . '/CaseFolding.txt';
48$specialCasingFile = $dir . '/SpecialCasing.txt';
49$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt';
50
51$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile];
52foreach ($files as $file) {
53    if (!file_exists($file)) {
54        echo "File $file does not exist.\n";
55        return;
56    }
57}
58
59$outputFile = __DIR__ . "/../unicode_data.h";
60
61$data = new UnicodeData;
62parseUnicodeData($data, file_get_contents($unicodeDataFile));
63parseCaseFolding($data, file_get_contents($caseFoldingFile));
64parseSpecialCasing($data, file_get_contents($specialCasingFile));
65parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile));
66file_put_contents($outputFile, generateData($data));
67
68class Range {
69    public $start;
70    public $end;
71
72    public function __construct(int $start, int $end) {
73        $this->start = $start;
74        $this->end = $end;
75    }
76}
77
78class UnicodeData {
79    public $propIndexes;
80    public $numProps;
81    public $propRanges;
82    public $caseMaps;
83    public $extraCaseData;
84
85    public function __construct() {
86        /*
87         * List of properties expected to be found in the Unicode Character Database.
88         */
89        $this->propIndexes = array_flip([
90            "Mn", "Mc", "Me", "Nd", "Nl", "No",
91            "Zs", "Zl", "Zp", "Cc", "Cf", "Cs",
92            "Co", "Cn", "Lu", "Ll", "Lt", "Lm",
93            "Lo", "Pc", "Pd", "Ps", "Pe", "Po",
94            "Sm", "Sc", "Sk", "So", "L", "R",
95            "EN", "ES", "ET", "AN", "CS", "B",
96            "S", "WS", "ON", "Pi", "Pf", "AL",
97            "Cased", "Case_Ignorable"
98        ]);
99        $this->numProps = count($this->propIndexes);
100
101        $this->propRanges = array_fill(0, $this->numProps, []);
102        $this->caseMaps = [
103            'upper' => [],
104            'lower' => [],
105            'title' => [],
106            'fold' => [],
107        ];
108        $this->extraCaseData = [];
109    }
110
111    function propToIndex(string $prop) : int {
112        /* Deal with directionality codes introduced in Unicode 3.0. */
113        if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"])) {
114            /*
115             * Mark all of these as Other Neutral to preserve compatibility with
116             * older versions.
117             */
118            $prop = "ON";
119        }
120
121        if (!isset($this->propIndexes[$prop])) {
122            throw new Exception("Unknown property $prop");
123        }
124
125        return $this->propIndexes[$prop];
126    }
127
128    public function addProp(int $code, string $prop) {
129        $propIdx = self::propToIndex($prop);
130
131        // Check if this extends the last range
132        $ranges = $this->propRanges[$propIdx];
133        if (!empty($ranges)) {
134            $lastRange = $ranges[count($ranges) - 1];
135            if ($code === $lastRange->end + 1) {
136                $lastRange->end++;
137                return;
138            }
139        }
140
141        $this->propRanges[$propIdx][] = new Range($code, $code);
142    }
143
144    public function addPropRange(int $startCode, int $endCode, string $prop) {
145        $propIdx = self::propToIndex($prop);
146        $this->propRanges[$propIdx][] = new Range($startCode, $endCode);
147    }
148
149    public function addCaseMapping(string $case, int $origCode, int $mappedCode) {
150        $this->caseMaps[$case][$origCode] = $mappedCode;
151    }
152
153    public function compactRangeArray(array $ranges) : array {
154        // Sort by start codepoint
155        usort($ranges, function (Range $r1, Range $r2) {
156            return $r1->start <=> $r2->start;
157        });
158
159        $lastRange = new Range(-1, -1);
160        $newRanges = [];
161        foreach ($ranges as $range) {
162            if ($lastRange->end == -1) {
163                $lastRange = $range;
164            } else if ($range->start == $lastRange->end + 1) {
165                $lastRange->end = $range->end;
166            } else if ($range->start > $lastRange->end + 1) {
167                $newRanges[] = $lastRange;
168                $lastRange = $range;
169            } else {
170                throw new Exception(sprintf(
171                    "Overlapping ranges [%x, %x] and [%x, %x]",
172                    $lastRange->start, $lastRange->end,
173                    $range->start, $range->end
174                ));
175            }
176        }
177        if ($lastRange->end != -1) {
178            $newRanges[] = $lastRange;
179        }
180        return $newRanges;
181    }
182
183    public function compactPropRanges() {
184        foreach ($this->propRanges as &$ranges) {
185            $ranges = $this->compactRangeArray($ranges);
186        }
187    }
188}
189
190function parseDataFile(string $input) {
191    $lines = explode("\n", $input);
192    foreach ($lines as $line) {
193        // Strip comments
194        if (false !== $hashPos = strpos($line, '#')) {
195            $line = substr($line, 0, $hashPos);
196        }
197
198        // Skip empty lines
199        $line = trim($line);
200        if ($line === '') {
201            continue;
202        }
203
204        $fields = array_map('trim', explode(';', $line));
205        yield $fields;
206    }
207}
208
209function parseUnicodeData(UnicodeData $data, string $input) : void {
210    $lines = parseDataFile($input);
211    foreach ($lines as $fields) {
212        if (count($fields) != 15) {
213            throw new Exception("Line does not contain 15 fields");
214        }
215
216        $code = intval($fields[0], 16);
217
218        $name = $fields[1];
219        if ($name === '') {
220            throw new Exception("Empty name");
221        }
222
223        if ($name[0] === '<' && $name !== '<control>') {
224            // This is a character range
225            $lines->next();
226            $nextFields = $lines->current();
227            $nextCode = intval($nextFields[0], 16);
228
229            $generalCategory = $fields[2];
230            $data->addPropRange($code, $nextCode, $generalCategory);
231
232            $bidiClass = $fields[4];
233            $data->addPropRange($code, $nextCode, $bidiClass);
234            continue;
235        }
236
237        $generalCategory = $fields[2];
238        $data->addProp($code, $generalCategory);
239
240        $bidiClass = $fields[4];
241        $data->addProp($code, $bidiClass);
242
243        $upperCase = intval($fields[12], 16);
244        $lowerCase = intval($fields[13], 16);
245        $titleCase = intval($fields[14], 16) ?: $upperCase;
246        if ($upperCase) {
247            $data->addCaseMapping('upper', $code, $upperCase);
248        }
249        if ($lowerCase) {
250            $data->addCaseMapping('lower', $code, $lowerCase);
251        }
252        if ($titleCase) {
253            $data->addCaseMapping('title', $code, $titleCase);
254        }
255    }
256}
257
258function parseCodes(string $strCodes) : array {
259    $codes = [];
260    foreach (explode(' ', $strCodes) as $strCode) {
261        $codes[] = intval($strCode, 16);
262    }
263    return $codes;
264}
265
266function parseCaseFolding(UnicodeData $data, string $input) : void {
267    foreach (parseDataFile($input) as $fields) {
268        if (count($fields) != 4) {
269            throw new Exception("Line does not contain 4 fields");
270        }
271
272        $code = intval($fields[0], 16);
273        $status = $fields[1];
274        if ($status == 'T') {
275            // Use language-agnostic case folding
276            continue;
277        }
278
279        if ($status == 'C' || $status == 'S') {
280            $foldCode = intval($fields[2], 16);
281            if (!isset($data->caseMaps['fold'][$code])) {
282                $data->addCaseMapping('fold', $code, $foldCode);
283            } else {
284                // Add simple mapping to full mapping data
285                assert(is_array($data->caseMaps['fold'][$code]));
286                $data->caseMaps['fold'][$code][0] = $foldCode;
287            }
288        } else if ($status == 'F') {
289            $foldCodes = parseCodes($fields[2]);
290            $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code;
291            $data->caseMaps['fold'][$code] = array_merge([$code], $foldCodes);
292        } else {
293            assert(0);
294        }
295    }
296}
297
298function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void {
299    $simpleCaseCode = $data->caseMaps[$type][$code] ?? $code;
300    if (count($caseCodes) == 1) {
301        if ($caseCodes[0] != $simpleCaseCode) {
302            throw new Exception("Simple case code in special casing does not match");
303        }
304
305        // Special case: If a title-case character maps to itself, we may still have to store it,
306        // if there is a non-trivial upper-case mapping for it
307        if ($type == 'title' && $code == $caseCodes[0]
308                && ($data->caseMaps['upper'][$code] ?? $code) != $code) {
309            $data->caseMaps['title'][$code] = $code;
310        }
311        return;
312    }
313
314    if (count($caseCodes) > 3) {
315        throw new Exception("Special case mapping with more than 3 code points");
316    }
317
318    $data->caseMaps[$type][$code] = array_merge([$simpleCaseCode], $caseCodes);
319}
320
321function parseSpecialCasing(UnicodeData $data, string $input) : void {
322    foreach (parseDataFile($input) as $fields) {
323        if (count($fields) != 5 && count($fields) != 6) {
324            throw new Exception("Line does not contain 5 or 6 fields");
325        }
326
327        $code = intval($fields[0], 16);
328        $lower = parseCodes($fields[1]);
329        $title = parseCodes($fields[2]);
330        $upper = parseCodes($fields[3]);
331
332        $cond = $fields[4];
333        if ($cond) {
334            // Only use unconditional mappings
335            continue;
336        }
337
338        addSpecialCasing($data, 'lower', $code, $lower);
339        addSpecialCasing($data, 'upper', $code, $upper);
340
341        // Should happen last
342        addSpecialCasing($data, 'title', $code, $title);
343    }
344}
345
346function parseDerivedCoreProperties(UnicodeData $data, string $input) : void {
347    foreach (parseDataFile($input) as $fields) {
348        if (count($fields) != 2) {
349            throw new Exception("Line does not contain 2 fields");
350        }
351
352        $property = $fields[1];
353        if ($property != 'Cased' && $property != 'Case_Ignorable') {
354            continue;
355        }
356
357        $range = explode('..', $fields[0]);
358        if (count($range) == 2) {
359            $data->addPropRange(intval($range[0], 16), intval($range[1], 16), $property);
360        } else if (count($range) == 1) {
361            $data->addProp(intval($range[0], 16), $property);
362        } else {
363            throw new Exception("Invalid range");
364        }
365    }
366}
367
368function formatArray(array $values, int $width, string $format) : string {
369    $result = '';
370    $i = 0;
371    $c = count($values);
372    for ($i = 0; $i < $c; $i++) {
373        if ($i != 0) {
374            $result .= ',';
375        }
376
377        $result .= $i % $width == 0 ? "\n\t" : " ";
378        $result .= sprintf($format, $values[$i]);
379    }
380    return $result;
381}
382
383function formatShortHexArray(array $values, int $width) : string {
384    return formatArray($values, $width, "0x%04x");
385}
386function formatShortDecArray(array $values, int $width) : string {
387    return formatArray($values, $width, "% 5d");
388}
389function formatIntArray(array $values, int $width) : string {
390    return formatArray($values, $width, "0x%08x");
391}
392
393function generatePropData(UnicodeData $data) {
394    $data->compactPropRanges();
395
396    $propOffsets = [];
397    $idx = 0;
398    foreach ($data->propRanges as $ranges) {
399        $num = count($ranges);
400        $propOffsets[] = $num ? $idx : 0xffff;
401        $idx += 2*$num;
402    }
403
404    // Add sentinel for binary search
405    $propOffsets[] = $idx;
406
407    // TODO ucgendat.c pads the prop offsets to the next multiple of 4
408    // for rather debious reasons of alignment. This should probably be
409    // dropped
410    while (count($propOffsets) % 4 != 0) {
411        $propOffsets[] = 0;
412    }
413
414    $totalRanges = $idx;
415
416    $result = "";
417    $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n";
418    $result .= "static const unsigned short  _ucprop_offsets[] = {";
419    $result .= formatShortHexArray($propOffsets, 8);
420    $result .= "\n};\n\n";
421
422    $values = [];
423    foreach ($data->propRanges as $ranges) {
424        foreach ($ranges as $range) {
425            $values[] = $range->start;
426            $values[] = $range->end;
427        }
428    }
429
430    $result .= "static const unsigned int _ucprop_ranges[] = {";
431    $result .= formatIntArray($values, 4);
432    $result .= "\n};\n\n";
433    return $result;
434}
435
436function flatten(array $array) {
437    $result = [];
438    foreach ($array as $arr) {
439        foreach ($arr as $v) {
440            $result[] = $v;
441        }
442    }
443    return $result;
444}
445
446function prepareCaseData(UnicodeData $data) {
447    // Don't store titlecase if it's the same as uppercase
448    foreach ($data->caseMaps['title'] as $code => $titleCode) {
449        if ($titleCode == ($data->caseMaps['upper'][$code] ?? $code)) {
450            unset($data->caseMaps['title'][$code]);
451        }
452    }
453
454    // Store full (multi-char) case mappings in a separate table and only
455    // store an index into it
456    foreach ($data->caseMaps as $type => $caseMap) {
457        foreach ($caseMap as $code => $caseCode) {
458            if (is_array($caseCode)) {
459                // -1 because the first entry is the simple case mapping
460                $len = count($caseCode) - 1;
461                $idx = count($data->extraCaseData);
462                $data->caseMaps[$type][$code] = ($len << 24) | $idx;
463
464                foreach ($caseCode as $c) {
465                    $data->extraCaseData[] = $c;
466                }
467            }
468        }
469    }
470}
471
472function generateCaseMPH(string $name, array $map) {
473    $prefix = "_uccase_" . $name;
474    list($gTable, $table) = generateMPH($map, $fast = false);
475    echo "$name: n=", count($table), ", g=", count($gTable), "\n";
476
477    $result = "";
478    $result .= "static const unsigned {$prefix}_g_size = " . count($gTable) . ";\n";
479    $result .= "static const short {$prefix}_g[] = {";
480    $result .= formatShortDecArray($gTable, 8);
481    $result .= "\n};\n\n";
482    $result .= "static const unsigned {$prefix}_table_size = " . count($table) . ";\n";
483    $result .= "static const unsigned {$prefix}_table[] = {";
484    $result .= formatIntArray(flatten($table), 4);
485    $result .= "\n};\n\n";
486    return $result;
487}
488
489function generateCaseData(UnicodeData $data) {
490    prepareCaseData($data);
491
492    $result = "";
493    $result .= generateCaseMPH('upper', $data->caseMaps['upper']);
494    $result .= generateCaseMPH('lower', $data->caseMaps['lower']);
495    $result .= generateCaseMPH('title', $data->caseMaps['title']);
496    $result .= generateCaseMPH('fold', $data->caseMaps['fold']);
497    $result .= "static const unsigned _uccase_extra_table[] = {";
498    $result .= formatIntArray($data->extraCaseData, 4);
499    $result .= "\n};\n\n";
500    return $result;
501}
502
503function generateData(UnicodeData $data) {
504    $result = <<<'HEADER'
505/* This file was generated from a modified version UCData's ucgendat.
506 *
507 *                     DO NOT EDIT THIS FILE!
508 *
509 * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download
510 * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt
511 * files from  http://www.unicode.org/Public/ and run this program.
512 *
513 * More information can be found in the UCData package. Unfortunately,
514 * the project's page doesn't seem to be live anymore, so you can use
515 * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */
516HEADER;
517    $result .= "\n\n" . generatePropData($data);
518    $result .= generateCaseData($data);
519
520    return $result;
521}
522
523/*
524 * Minimal Perfect Hash Generation
525 *
526 * Based on "Hash, displace, and compress" algorithm due to
527 * Belazzougui, Botelho and Dietzfelbinger.
528 *
529 * Hash function based on https://stackoverflow.com/a/12996028/385378.
530 * MPH implementation based on http://stevehanov.ca/blog/index.php?id=119.
531 */
532
533function hashInt(int $d, int $x) {
534    $x ^= $d;
535    $x = (($x >> 16) ^ $x) * 0x45d9f3b;
536    return $x & 0xffffffff;
537}
538
539function tryGenerateMPH(array $map, int $gSize) {
540    $tableSize = count($map);
541    $table = [];
542    $gTable = array_fill(0, $gSize, 0x7fff);
543    $buckets = [];
544
545    foreach ($map as $k => $v) {
546        $h = hashInt(0, $k) % $gSize;
547        $buckets[$h][] = [$k, $v];
548    }
549
550    // Sort by descending number of collisions
551    usort($buckets, function ($b1, $b2) {
552        return -(count($b1) <=> count($b2));
553    });
554
555    foreach ($buckets as $bucket) {
556        $collisions = count($bucket);
557        if ($collisions <= 1) {
558            continue;
559        }
560
561        // Try values of $d until all elements placed in different slots
562        $d = 1;
563        $i = 0;
564        $used = [];
565        while ($i < $collisions) {
566            if ($d > 0x7fff) {
567                return [];
568            }
569
570            list($k) = $bucket[$i];
571            $slot = hashInt($d, $k) % $tableSize;
572            if (isset($table[$slot]) || isset($used[$slot])) {
573                $d++;
574                $i = 0;
575                $used = [];
576            } else {
577                $i++;
578                $used[$slot] = true;
579            }
580        }
581
582        $g = hashInt(0, $bucket[0][0]) % $gSize;
583        $gTable[$g] = $d;
584        foreach ($bucket as $elem) {
585            $table[hashInt($d, $elem[0]) % $tableSize] = $elem;
586        }
587    }
588
589    $freeSlots = [];
590    for ($i = 0; $i < $tableSize; $i++) {
591        if (!isset($table[$i])) {
592            $freeSlots[] = $i;
593        }
594    }
595
596    // For buckets with only one element, we directly store the index
597    $freeIdx = 0;
598    foreach ($buckets as $bucket) {
599        if (count($bucket) != 1) {
600            continue;
601        }
602
603        $elem = $bucket[0];
604        $slot = $freeSlots[$freeIdx++];
605        $table[$slot] = $elem;
606
607        $g = hashInt(0, $elem[0]) % $gSize;
608        $gTable[$g] = -$slot;
609    }
610
611    ksort($gTable);
612    ksort($table);
613
614    return [$gTable, $table];
615}
616
617function generateMPH(array $map, bool $fast) {
618    if ($fast) {
619        // Check size starting lambda=5.0 in 0.5 increments
620        for ($lambda = 5.0;; $lambda -= 0.5) {
621            $m = (int) (count($map) / $lambda);
622            $tmpMph = tryGenerateMPH($map, $m);
623            if (!empty($tmpMph)) {
624                $mph = $tmpMph;
625                break;
626            }
627        }
628    } else {
629        // Check all sizes starting lambda=7.0
630        $m = (int) (count($map) / 7.0);
631        for (;; $m++) {
632            $tmpMph = tryGenerateMPH($map, $m);
633            if (!empty($tmpMph)) {
634                $mph = $tmpMph;
635                break;
636            }
637        }
638    }
639
640    return $mph;
641}
642