#!/usr/bin/env php . */ /* Copyright 2001 Computing Research Labs, New Mexico State University * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ if ($argc < 2) { echo "Usage: php ucgendata.php ./datadir\n"; echo "./datadir must contain:\n"; echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt, DerivedCoreProperties.txt, and EastAsianWidth.txt\n"; return; } $dir = $argv[1]; $unicodeDataFile = $dir . '/UnicodeData.txt'; $caseFoldingFile = $dir . '/CaseFolding.txt'; $specialCasingFile = $dir . '/SpecialCasing.txt'; $derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt'; $eastAsianWidthFile = $dir . '/EastAsianWidth.txt'; $files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile, $eastAsianWidthFile]; foreach ($files as $file) { if (!file_exists($file)) { echo "File $file does not exist.\n"; return; } } $outputFile = __DIR__ . "/../unicode_data.h"; $data = new UnicodeData; parseUnicodeData($data, file_get_contents($unicodeDataFile)); parseCaseFolding($data, file_get_contents($caseFoldingFile)); parseSpecialCasing($data, file_get_contents($specialCasingFile)); parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile)); file_put_contents($outputFile, generateData($data)); $eawFile = __DIR__ . "/../libmbfl/mbfl/eaw_table.h"; $eawData = parseEastAsianWidth(file_get_contents($eastAsianWidthFile)); file_put_contents($eawFile, generateEastAsianWidthData($eawData)); class Range { public $start; public $end; public function __construct(int $start, int $end) { $this->start = $start; $this->end = $end; } } class UnicodeData { public $propIndexes; public $numProps; public $propRanges; public $caseMaps; public $extraCaseData; public function __construct() { /* * List of properties expected to be found in the Unicode Character Database. */ $this->propIndexes = array_flip([ "Mn", "Mc", "Me", "Nd", "Nl", "No", "Zs", "Zl", "Zp", "Cs", "Co", "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Sm", "Sc", "Sk", "So", "L", "R", "EN", "ES", "ET", "AN", "CS", "B", "S", "WS", "ON", "AL", "C", "P", "Cased", "Case_Ignorable" ]); $this->numProps = count($this->propIndexes); $this->propRanges = array_fill(0, $this->numProps, []); $this->caseMaps = [ 'upper' => [], 'lower' => [], 'title' => [], 'fold' => [], ]; $this->extraCaseData = []; } function propToIndex(string $prop) : int { /* Deal with directionality codes introduced in Unicode 3.0. */ if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"])) { /* * Mark all of these as Other Neutral to preserve compatibility with * older versions. */ $prop = "ON"; } /* Merge all punctuation into a single category for efficiency of access. * We're currently not interested in distinguishing different kinds of punctuation. */ if (in_array($prop, ["Pc", "Pd", "Ps", "Pe", "Po", "Pi", "Pf"])) { $prop = "P"; } /* Same for control. */ if (in_array($prop, ["Cc", "Cf"])) { $prop = "C"; } if (!isset($this->propIndexes[$prop])) { throw new Exception("Unknown property $prop"); } return $this->propIndexes[$prop]; } public function addProp(int $code, string $prop) { $propIdx = self::propToIndex($prop); // Check if this extends the last range $ranges = $this->propRanges[$propIdx]; if (!empty($ranges)) { $lastRange = $ranges[count($ranges) - 1]; if ($code === $lastRange->end + 1) { $lastRange->end++; return; } } $this->propRanges[$propIdx][] = new Range($code, $code); } public function addPropRange(int $startCode, int $endCode, string $prop) { $propIdx = self::propToIndex($prop); $this->propRanges[$propIdx][] = new Range($startCode, $endCode); } public function addCaseMapping(string $case, int $origCode, int $mappedCode) { $this->caseMaps[$case][$origCode] = $mappedCode; } public function compactRangeArray(array $ranges) : array { // Sort by start codepoint usort($ranges, function (Range $r1, Range $r2) { return $r1->start <=> $r2->start; }); $lastRange = new Range(-1, -1); $newRanges = []; foreach ($ranges as $range) { if ($lastRange->end == -1) { $lastRange = $range; } else if ($range->start == $lastRange->end + 1) { $lastRange->end = $range->end; } else if ($range->start > $lastRange->end + 1) { $newRanges[] = $lastRange; $lastRange = $range; } else { throw new Exception(sprintf( "Overlapping ranges [%x, %x] and [%x, %x]", $lastRange->start, $lastRange->end, $range->start, $range->end )); } } if ($lastRange->end != -1) { $newRanges[] = $lastRange; } return $newRanges; } public function compactPropRanges() { foreach ($this->propRanges as &$ranges) { $ranges = $this->compactRangeArray($ranges); } } } function parseDataFile(string $input) { $lines = explode("\n", $input); foreach ($lines as $line) { // Strip comments if (false !== $hashPos = strpos($line, '#')) { $line = substr($line, 0, $hashPos); } // Skip empty lines $line = trim($line); if ($line === '') { continue; } $fields = array_map('trim', explode(';', $line)); yield $fields; } } function parseUnicodeData(UnicodeData $data, string $input) : void { $lines = parseDataFile($input); foreach ($lines as $fields) { if (count($fields) != 15) { throw new Exception("Line does not contain 15 fields"); } $code = intval($fields[0], 16); $name = $fields[1]; if ($name === '') { throw new Exception("Empty name"); } if ($name[0] === '<' && $name !== '') { // This is a character range $lines->next(); $nextFields = $lines->current(); $nextCode = intval($nextFields[0], 16); $generalCategory = $fields[2]; $data->addPropRange($code, $nextCode, $generalCategory); $bidiClass = $fields[4]; $data->addPropRange($code, $nextCode, $bidiClass); continue; } $generalCategory = $fields[2]; $data->addProp($code, $generalCategory); $bidiClass = $fields[4]; $data->addProp($code, $bidiClass); $upperCase = intval($fields[12], 16); $lowerCase = intval($fields[13], 16); $titleCase = intval($fields[14], 16) ?: $upperCase; if ($upperCase) { $data->addCaseMapping('upper', $code, $upperCase); } if ($lowerCase) { $data->addCaseMapping('lower', $code, $lowerCase); } if ($titleCase) { $data->addCaseMapping('title', $code, $titleCase); } } } function parseCodes(string $strCodes) : array { $codes = []; foreach (explode(' ', $strCodes) as $strCode) { $codes[] = intval($strCode, 16); } return $codes; } function parseCaseFolding(UnicodeData $data, string $input) : void { foreach (parseDataFile($input) as $fields) { if (count($fields) != 4) { throw new Exception("Line does not contain 4 fields"); } $code = intval($fields[0], 16); $status = $fields[1]; if ($status == 'T') { // Use language-agnostic case folding continue; } if ($status == 'C' || $status == 'S') { $foldCode = intval($fields[2], 16); if (!isset($data->caseMaps['fold'][$code])) { $data->addCaseMapping('fold', $code, $foldCode); } else { // Add simple mapping to full mapping data assert(is_array($data->caseMaps['fold'][$code])); $data->caseMaps['fold'][$code][0] = $foldCode; } } else if ($status == 'F') { $foldCodes = parseCodes($fields[2]); $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code; $data->caseMaps['fold'][$code] = array_merge([$code], $foldCodes); } else { assert(0); } } } function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void { $simpleCaseCode = $data->caseMaps[$type][$code] ?? $code; if (count($caseCodes) == 1) { if ($caseCodes[0] != $simpleCaseCode) { throw new Exception("Simple case code in special casing does not match"); } // Special case: If a title-case character maps to itself, we may still have to store it, // if there is a non-trivial upper-case mapping for it if ($type == 'title' && $code == $caseCodes[0] && ($data->caseMaps['upper'][$code] ?? $code) != $code) { $data->caseMaps['title'][$code] = $code; } return; } if (count($caseCodes) > 3) { throw new Exception("Special case mapping with more than 3 code points"); } $data->caseMaps[$type][$code] = array_merge([$simpleCaseCode], $caseCodes); } function parseSpecialCasing(UnicodeData $data, string $input) : void { foreach (parseDataFile($input) as $fields) { if (count($fields) != 5 && count($fields) != 6) { throw new Exception("Line does not contain 5 or 6 fields"); } $code = intval($fields[0], 16); $lower = parseCodes($fields[1]); $title = parseCodes($fields[2]); $upper = parseCodes($fields[3]); $cond = $fields[4]; if ($cond) { // Only use unconditional mappings continue; } addSpecialCasing($data, 'lower', $code, $lower); addSpecialCasing($data, 'upper', $code, $upper); // Should happen last addSpecialCasing($data, 'title', $code, $title); } } function parseDerivedCoreProperties(UnicodeData $data, string $input) : void { foreach (parseDataFile($input) as $fields) { if (count($fields) != 2) { throw new Exception("Line does not contain 2 fields"); } $property = $fields[1]; if ($property != 'Cased' && $property != 'Case_Ignorable') { continue; } $range = explode('..', $fields[0]); if (count($range) == 2) { $data->addPropRange(intval($range[0], 16), intval($range[1], 16), $property); } else if (count($range) == 1) { $data->addProp(intval($range[0], 16), $property); } else { throw new Exception("Invalid range"); } } } function parseEastAsianWidth(string $input) : array { $wideRanges = []; foreach (parseDataFile($input) as $fields) { if ($fields[1] == 'W' || $fields[1] == 'F') { if ($dotsPos = strpos($fields[0], '..')) { $startCode = intval(substr($fields[0], 0, $dotsPos), 16); $endCode = intval(substr($fields[0], $dotsPos + 2), 16); if (!empty($wideRanges)) { $lastRange = $wideRanges[count($wideRanges) - 1]; if ($startCode == $lastRange->end + 1) { $lastRange->end = $endCode; continue; } } $wideRanges[] = new Range($startCode, $endCode); } else { $code = intval($fields[0], 16); if (!empty($wideRanges)) { $lastRange = $wideRanges[count($wideRanges) - 1]; if ($code == $lastRange->end + 1) { $lastRange->end++; continue; } } $wideRanges[] = new Range($code, $code); } } } return $wideRanges; } function formatArray(array $values, int $width, string $format) : string { $result = ''; $i = 0; $c = count($values); for ($i = 0; $i < $c; $i++) { if ($i != 0) { $result .= ','; } $result .= $i % $width == 0 ? "\n\t" : " "; $result .= sprintf($format, $values[$i]); } return $result; } function formatShortHexArray(array $values, int $width) : string { return formatArray($values, $width, "0x%04x"); } function formatShortDecArray(array $values, int $width) : string { return formatArray($values, $width, "% 5d"); } function formatIntArray(array $values, int $width) : string { return formatArray($values, $width, "0x%08x"); } function generatePropData(UnicodeData $data) { $data->compactPropRanges(); $propOffsets = []; $idx = 0; foreach ($data->propRanges as $ranges) { $num = count($ranges); $propOffsets[] = $idx; $idx += 2*$num; } // Add sentinel for binary search $propOffsets[] = $idx; // TODO ucgendat.c pads the prop offsets to the next multiple of 4 // for rather dubious reasons of alignment. This should probably be // dropped while (count($propOffsets) % 4 != 0) { $propOffsets[] = 0; } $totalRanges = $idx; $result = ""; $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n"; $result .= "static const unsigned short _ucprop_offsets[] = {"; $result .= formatShortHexArray($propOffsets, 8); $result .= "\n};\n\n"; $values = []; foreach ($data->propRanges as $ranges) { foreach ($ranges as $range) { $values[] = $range->start; $values[] = $range->end; } } $result .= "static const unsigned int _ucprop_ranges[] = {"; $result .= formatIntArray($values, 4); $result .= "\n};\n\n"; return $result; } function flatten(array $array) { $result = []; foreach ($array as $arr) { foreach ($arr as $v) { $result[] = $v; } } return $result; } function prepareCaseData(UnicodeData $data) { // Don't store titlecase if it's the same as uppercase foreach ($data->caseMaps['title'] as $code => $titleCode) { if ($titleCode == ($data->caseMaps['upper'][$code] ?? $code)) { unset($data->caseMaps['title'][$code]); } } // Store full (multi-char) case mappings in a separate table and only // store an index into it foreach ($data->caseMaps as $type => $caseMap) { foreach ($caseMap as $code => $caseCode) { if (is_array($caseCode)) { // -1 because the first entry is the simple case mapping $len = count($caseCode) - 1; $idx = count($data->extraCaseData); $data->caseMaps[$type][$code] = ($len << 24) | $idx; foreach ($caseCode as $c) { $data->extraCaseData[] = $c; } } } } } function generateCaseMPH(string $name, array $map) { $prefix = "_uccase_" . $name; list($gTable, $table) = generateMPH($map, $fast = false); echo "$name: n=", count($table), ", g=", count($gTable), "\n"; $result = ""; $result .= "static const unsigned {$prefix}_g_size = " . count($gTable) . ";\n"; $result .= "static const short {$prefix}_g[] = {"; $result .= formatShortDecArray($gTable, 8); $result .= "\n};\n\n"; $result .= "static const unsigned {$prefix}_table_size = " . count($table) . ";\n"; $result .= "static const unsigned {$prefix}_table[] = {"; $result .= formatIntArray(flatten($table), 4); $result .= "\n};\n\n"; return $result; } function generateCaseData(UnicodeData $data) { prepareCaseData($data); $result = ""; $result .= generateCaseMPH('upper', $data->caseMaps['upper']); $result .= generateCaseMPH('lower', $data->caseMaps['lower']); $result .= generateCaseMPH('title', $data->caseMaps['title']); $result .= generateCaseMPH('fold', $data->caseMaps['fold']); $result .= "static const unsigned _uccase_extra_table[] = {"; $result .= formatIntArray($data->extraCaseData, 4); $result .= "\n};\n\n"; return $result; } function generateData(UnicodeData $data) { $result = <<<'HEADER' /* This file was generated from a modified version of UCData's ucgendat. * * DO NOT EDIT THIS FILE! * * Instead, download the appropriate UnicodeData-x.x.x.txt and * CompositionExclusions-x.x.x.txt files from http://www.unicode.org/Public/ * and run ext/mbstring/ucgendat/ucgendat.php. * * More information can be found in the UCData package. Unfortunately, * the project's page doesn't seem to be live anymore, so you can use * OpenLDAP's modified copy (look in libraries/liblunicode/ucdata) */ HEADER; $result .= "\n\n" . generatePropData($data); $result .= generateCaseData($data); return $result; } /* * Minimal Perfect Hash Generation * * Based on "Hash, displace, and compress" algorithm due to * Belazzougui, Botelho and Dietzfelbinger. * * Hash function based on https://stackoverflow.com/a/12996028/385378. * MPH implementation based on http://stevehanov.ca/blog/index.php?id=119. */ function hashInt(int $d, int $x) { $x ^= $d; $x = (($x >> 16) ^ $x) * 0x45d9f3b; return $x & 0xffffffff; } function tryGenerateMPH(array $map, int $gSize) { $tableSize = count($map); $table = []; $gTable = array_fill(0, $gSize, 0x7fff); $buckets = []; foreach ($map as $k => $v) { $h = hashInt(0, $k) % $gSize; $buckets[$h][] = [$k, $v]; } // Sort by descending number of collisions usort($buckets, function ($b1, $b2) { return -(count($b1) <=> count($b2)); }); foreach ($buckets as $bucket) { $collisions = count($bucket); if ($collisions <= 1) { continue; } // Try values of $d until all elements placed in different slots $d = 1; $i = 0; $used = []; while ($i < $collisions) { if ($d > 0x7fff) { return []; } list($k) = $bucket[$i]; $slot = hashInt($d, $k) % $tableSize; if (isset($table[$slot]) || isset($used[$slot])) { $d++; $i = 0; $used = []; } else { $i++; $used[$slot] = true; } } $g = hashInt(0, $bucket[0][0]) % $gSize; $gTable[$g] = $d; foreach ($bucket as $elem) { $table[hashInt($d, $elem[0]) % $tableSize] = $elem; } } $freeSlots = []; for ($i = 0; $i < $tableSize; $i++) { if (!isset($table[$i])) { $freeSlots[] = $i; } } // For buckets with only one element, we directly store the index $freeIdx = 0; foreach ($buckets as $bucket) { if (count($bucket) != 1) { continue; } $elem = $bucket[0]; $slot = $freeSlots[$freeIdx++]; $table[$slot] = $elem; $g = hashInt(0, $elem[0]) % $gSize; $gTable[$g] = -$slot; } ksort($gTable); ksort($table); return [$gTable, $table]; } function generateMPH(array $map, bool $fast) { if ($fast) { // Check size starting lambda=5.0 in 0.5 increments for ($lambda = 5.0;; $lambda -= 0.5) { $m = (int) (count($map) / $lambda); $tmpMph = tryGenerateMPH($map, $m); if (!empty($tmpMph)) { $mph = $tmpMph; break; } } } else { // Check all sizes starting lambda=7.0 $m = (int) (count($map) / 7.0); for (;; $m++) { $tmpMph = tryGenerateMPH($map, $m); if (!empty($tmpMph)) { $mph = $tmpMph; break; } } } return $mph; } function generateEastAsianWidthData(array $wideRanges) { $result = <<<'HEADER' /* This file was generated by ext/mbstring/ucgendat/ucgendat.php. * * DO NOT EDIT THIS FILE! * * East Asian Width table * * Some characters in East Asian languages are intended to be displayed in a space * which is roughly square. (This contrasts with others such as the Latin alphabet, * which are taller than they are wide.) To display these East Asian characters * properly, twice the horizontal space is used. This must be taken into account * when doing things like wrapping text to a specific width. * * Each pair of numbers in the below table is a range of Unicode codepoints * which should be displayed as double-width. */ static const struct { int begin; int end; } mbfl_eaw_table[] = { HEADER; foreach ($wideRanges as $range) { $startCode = dechex($range->start); $endCode = dechex($range->end); $result .= "\t{ 0x{$startCode}, 0x{$endCode} },\n"; } $result .= "};\n"; return $result; }