1#!/usr/bin/env php 2<?php error_reporting(E_ALL); 3 4/** 5 * This is based on the ucgendat.c file from the OpenLDAP project, licensed as 6 * follows. This file is not necessary to build PHP. It's only necessary to 7 * rebuild unicode_data.h from Unicode ucd files. 8 * 9 * Example usage: 10 * php ucgendat.php UnicodeData.txt 11 */ 12 13/* Copyright 1998-2007 The OpenLDAP Foundation. 14 * All rights reserved. 15 * 16 * Redistribution and use in source and binary forms, with or without 17 * modification, are permitted only as authorized by the OpenLDAP 18 * Public License. 19 * 20 * A copy of this license is available at 21 * <http://www.OpenLDAP.org/license.html>. 22 */ 23 24/* Copyright 2001 Computing Research Labs, New Mexico State University 25 * 26 * Permission is hereby granted, free of charge, to any person obtaining a 27 * copy of this software and associated documentation files (the "Software"), 28 * to deal in the Software without restriction, including without limitation 29 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 30 * and/or sell copies of the Software, and to permit persons to whom the 31 * Software is furnished to do so, subject to the following conditions: 32 * 33 * The above copyright notice and this permission notice shall be included in 34 * all copies or substantial portions of the Software. 35 * 36 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 37 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 38 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 39 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 40 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 41 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 42 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 43 */ 44 45if ($argc < 2) { 46 echo "Usage: php ucgendata.php ./datadir\n"; 47 echo "./datadir must contain:\n"; 48 echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n"; 49 return; 50} 51 52$dir = $argv[1]; 53$unicodeDataFile = $dir . '/UnicodeData.txt'; 54$caseFoldingFile = $dir . '/CaseFolding.txt'; 55$specialCasingFile = $dir . '/SpecialCasing.txt'; 56$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt'; 57 58$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile]; 59foreach ($files as $file) { 60 if (!file_exists($file)) { 61 echo "File $file does not exist.\n"; 62 return; 63 } 64} 65 66$outputFile = __DIR__ . "/../unicode_data.h"; 67 68$data = new UnicodeData; 69parseUnicodeData($data, file_get_contents($unicodeDataFile)); 70parseCaseFolding($data, file_get_contents($caseFoldingFile)); 71parseSpecialCasing($data, file_get_contents($specialCasingFile)); 72parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile)); 73file_put_contents($outputFile, generateData($data)); 74 75class Range { 76 public $start; 77 public $end; 78 79 public function __construct(int $start, int $end) { 80 $this->start = $start; 81 $this->end = $end; 82 } 83} 84 85class UnicodeData { 86 public $propIndexes; 87 public $numProps; 88 public $propRanges; 89 public $caseMaps; 90 public $extraCaseData; 91 92 public function __construct() { 93 /* 94 * List of properties expected to be found in the Unicode Character Database. 95 */ 96 $this->propIndexes = array_flip([ 97 "Mn", "Mc", "Me", "Nd", "Nl", "No", 98 "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", 99 "Co", "Cn", "Lu", "Ll", "Lt", "Lm", 100 "Lo", "Pc", "Pd", "Ps", "Pe", "Po", 101 "Sm", "Sc", "Sk", "So", "L", "R", 102 "EN", "ES", "ET", "AN", "CS", "B", 103 "S", "WS", "ON", "Pi", "Pf", "AL", 104 "Cased", "Case_Ignorable" 105 ]); 106 $this->numProps = count($this->propIndexes); 107 108 $this->propRanges = array_fill(0, $this->numProps, []); 109 $this->caseMaps = [ 110 'upper' => [], 111 'lower' => [], 112 'title' => [], 113 'fold' => [], 114 ]; 115 $this->extraCaseData = []; 116 } 117 118 function propToIndex(string $prop) : int { 119 /* Deal with directionality codes introduced in Unicode 3.0. */ 120 if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"])) { 121 /* 122 * Mark all of these as Other Neutral to preserve compatibility with 123 * older versions. 124 */ 125 $prop = "ON"; 126 } 127 128 if (!isset($this->propIndexes[$prop])) { 129 throw new Exception("Unknown property $prop"); 130 } 131 132 return $this->propIndexes[$prop]; 133 } 134 135 public function addProp(int $code, string $prop) { 136 $propIdx = self::propToIndex($prop); 137 138 // Check if this extends the last range 139 $ranges = $this->propRanges[$propIdx]; 140 if (!empty($ranges)) { 141 $lastRange = $ranges[count($ranges) - 1]; 142 if ($code === $lastRange->end + 1) { 143 $lastRange->end++; 144 return; 145 } 146 } 147 148 $this->propRanges[$propIdx][] = new Range($code, $code); 149 } 150 151 public function addPropRange(int $startCode, int $endCode, string $prop) { 152 $propIdx = self::propToIndex($prop); 153 $this->propRanges[$propIdx][] = new Range($startCode, $endCode); 154 } 155 156 public function addCaseMapping(string $case, int $origCode, int $mappedCode) { 157 $this->caseMaps[$case][$origCode] = $mappedCode; 158 } 159 160 public function compactRangeArray(array $ranges) : array { 161 // Sort by start codepoint 162 usort($ranges, function (Range $r1, Range $r2) { 163 return $r1->start <=> $r2->start; 164 }); 165 166 $lastRange = new Range(-1, -1); 167 $newRanges = []; 168 foreach ($ranges as $range) { 169 if ($lastRange->end == -1) { 170 $lastRange = $range; 171 } else if ($range->start == $lastRange->end + 1) { 172 $lastRange->end = $range->end; 173 } else if ($range->start > $lastRange->end + 1) { 174 $newRanges[] = $lastRange; 175 $lastRange = $range; 176 } else { 177 throw new Exception(sprintf( 178 "Overlapping ranges [%x, %x] and [%x, %x]", 179 $lastRange->start, $lastRange->end, 180 $range->start, $range->end 181 )); 182 } 183 } 184 if ($lastRange->end != -1) { 185 $newRanges[] = $lastRange; 186 } 187 return $newRanges; 188 } 189 190 public function compactPropRanges() { 191 foreach ($this->propRanges as &$ranges) { 192 $ranges = $this->compactRangeArray($ranges); 193 } 194 } 195} 196 197function parseDataFile(string $input) { 198 $lines = explode("\n", $input); 199 foreach ($lines as $line) { 200 // Strip comments 201 if (false !== $hashPos = strpos($line, '#')) { 202 $line = substr($line, 0, $hashPos); 203 } 204 205 // Skip empty lines 206 $line = trim($line); 207 if ($line === '') { 208 continue; 209 } 210 211 $fields = array_map('trim', explode(';', $line)); 212 yield $fields; 213 } 214} 215 216function parseUnicodeData(UnicodeData $data, string $input) : void { 217 $lines = parseDataFile($input); 218 foreach ($lines as $fields) { 219 if (count($fields) != 15) { 220 throw new Exception("Line does not contain 15 fields"); 221 } 222 223 $code = intval($fields[0], 16); 224 225 $name = $fields[1]; 226 if ($name === '') { 227 throw new Exception("Empty name"); 228 } 229 230 if ($name[0] === '<' && $name !== '<control>') { 231 // This is a character range 232 $lines->next(); 233 $nextFields = $lines->current(); 234 $nextCode = intval($nextFields[0], 16); 235 236 $generalCategory = $fields[2]; 237 $data->addPropRange($code, $nextCode, $generalCategory); 238 239 $bidiClass = $fields[4]; 240 $data->addPropRange($code, $nextCode, $bidiClass); 241 continue; 242 } 243 244 $generalCategory = $fields[2]; 245 $data->addProp($code, $generalCategory); 246 247 $bidiClass = $fields[4]; 248 $data->addProp($code, $bidiClass); 249 250 $upperCase = intval($fields[12], 16); 251 $lowerCase = intval($fields[13], 16); 252 $titleCase = intval($fields[14], 16) ?: $upperCase; 253 if ($upperCase) { 254 $data->addCaseMapping('upper', $code, $upperCase); 255 } 256 if ($lowerCase) { 257 $data->addCaseMapping('lower', $code, $lowerCase); 258 } 259 if ($titleCase) { 260 $data->addCaseMapping('title', $code, $titleCase); 261 } 262 } 263} 264 265function parseCodes(string $strCodes) : array { 266 $codes = []; 267 foreach (explode(' ', $strCodes) as $strCode) { 268 $codes[] = intval($strCode, 16); 269 } 270 return $codes; 271} 272 273function parseCaseFolding(UnicodeData $data, string $input) : void { 274 foreach (parseDataFile($input) as $fields) { 275 if (count($fields) != 4) { 276 throw new Exception("Line does not contain 4 fields"); 277 } 278 279 $code = intval($fields[0], 16); 280 $status = $fields[1]; 281 if ($status == 'T') { 282 // Use language-agnostic case folding 283 continue; 284 } 285 286 if ($status == 'C' || $status == 'S') { 287 $foldCode = intval($fields[2], 16); 288 if (!isset($data->caseMaps['fold'][$code])) { 289 $data->addCaseMapping('fold', $code, $foldCode); 290 } else { 291 // Add simple mapping to full mapping data 292 assert(is_array($data->caseMaps['fold'][$code])); 293 $data->caseMaps['fold'][$code][0] = $foldCode; 294 } 295 } else if ($status == 'F') { 296 $foldCodes = parseCodes($fields[2]); 297 $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code; 298 $data->caseMaps['fold'][$code] = array_merge([$code], $foldCodes); 299 } else { 300 assert(0); 301 } 302 } 303} 304 305function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void { 306 $simpleCaseCode = $data->caseMaps[$type][$code] ?? $code; 307 if (count($caseCodes) == 1) { 308 if ($caseCodes[0] != $simpleCaseCode) { 309 throw new Exception("Simple case code in special casing does not match"); 310 } 311 312 // Special case: If a title-case character maps to itself, we may still have to store it, 313 // if there is a non-trivial upper-case mapping for it 314 if ($type == 'title' && $code == $caseCodes[0] 315 && ($data->caseMaps['upper'][$code] ?? $code) != $code) { 316 $data->caseMaps['title'][$code] = $code; 317 } 318 return; 319 } 320 321 if (count($caseCodes) > 3) { 322 throw new Exception("Special case mapping with more than 3 code points"); 323 } 324 325 $data->caseMaps[$type][$code] = array_merge([$simpleCaseCode], $caseCodes); 326} 327 328function parseSpecialCasing(UnicodeData $data, string $input) : void { 329 foreach (parseDataFile($input) as $fields) { 330 if (count($fields) != 5 && count($fields) != 6) { 331 throw new Exception("Line does not contain 5 or 6 fields"); 332 } 333 334 $code = intval($fields[0], 16); 335 $lower = parseCodes($fields[1]); 336 $title = parseCodes($fields[2]); 337 $upper = parseCodes($fields[3]); 338 339 $cond = $fields[4]; 340 if ($cond) { 341 // Only use unconditional mappings 342 continue; 343 } 344 345 addSpecialCasing($data, 'lower', $code, $lower); 346 addSpecialCasing($data, 'upper', $code, $upper); 347 348 // Should happen last 349 addSpecialCasing($data, 'title', $code, $title); 350 } 351} 352 353function parseDerivedCoreProperties(UnicodeData $data, string $input) : void { 354 foreach (parseDataFile($input) as $fields) { 355 if (count($fields) != 2) { 356 throw new Exception("Line does not contain 2 fields"); 357 } 358 359 $property = $fields[1]; 360 if ($property != 'Cased' && $property != 'Case_Ignorable') { 361 continue; 362 } 363 364 $range = explode('..', $fields[0]); 365 if (count($range) == 2) { 366 $data->addPropRange(intval($range[0], 16), intval($range[1], 16), $property); 367 } else if (count($range) == 1) { 368 $data->addProp(intval($range[0], 16), $property); 369 } else { 370 throw new Exception("Invalid range"); 371 } 372 } 373} 374 375function formatArray(array $values, int $width, string $format) : string { 376 $result = ''; 377 $i = 0; 378 $c = count($values); 379 for ($i = 0; $i < $c; $i++) { 380 if ($i != 0) { 381 $result .= ','; 382 } 383 384 $result .= $i % $width == 0 ? "\n\t" : " "; 385 $result .= sprintf($format, $values[$i]); 386 } 387 return $result; 388} 389 390function formatShortHexArray(array $values, int $width) : string { 391 return formatArray($values, $width, "0x%04x"); 392} 393function formatShortDecArray(array $values, int $width) : string { 394 return formatArray($values, $width, "% 5d"); 395} 396function formatIntArray(array $values, int $width) : string { 397 return formatArray($values, $width, "0x%08x"); 398} 399 400function generatePropData(UnicodeData $data) { 401 $data->compactPropRanges(); 402 403 $propOffsets = []; 404 $idx = 0; 405 foreach ($data->propRanges as $ranges) { 406 $num = count($ranges); 407 $propOffsets[] = $num ? $idx : 0xffff; 408 $idx += 2*$num; 409 } 410 411 // Add sentinel for binary search 412 $propOffsets[] = $idx; 413 414 // TODO ucgendat.c pads the prop offsets to the next multiple of 4 415 // for rather debious reasons of alignment. This should probably be 416 // dropped 417 while (count($propOffsets) % 4 != 0) { 418 $propOffsets[] = 0; 419 } 420 421 $totalRanges = $idx; 422 423 $result = ""; 424 $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n"; 425 $result .= "static const unsigned short _ucprop_offsets[] = {"; 426 $result .= formatShortHexArray($propOffsets, 8); 427 $result .= "\n};\n\n"; 428 429 $values = []; 430 foreach ($data->propRanges as $ranges) { 431 foreach ($ranges as $range) { 432 $values[] = $range->start; 433 $values[] = $range->end; 434 } 435 } 436 437 $result .= "static const unsigned int _ucprop_ranges[] = {"; 438 $result .= formatIntArray($values, 4); 439 $result .= "\n};\n\n"; 440 return $result; 441} 442 443function flatten(array $array) { 444 $result = []; 445 foreach ($array as $arr) { 446 foreach ($arr as $v) { 447 $result[] = $v; 448 } 449 } 450 return $result; 451} 452 453function prepareCaseData(UnicodeData $data) { 454 // Don't store titlecase if it's the same as uppercase 455 foreach ($data->caseMaps['title'] as $code => $titleCode) { 456 if ($titleCode == ($data->caseMaps['upper'][$code] ?? $code)) { 457 unset($data->caseMaps['title'][$code]); 458 } 459 } 460 461 // Store full (multi-char) case mappings in a separate table and only 462 // store an index into it 463 foreach ($data->caseMaps as $type => $caseMap) { 464 foreach ($caseMap as $code => $caseCode) { 465 if (is_array($caseCode)) { 466 // -1 because the first entry is the simple case mapping 467 $len = count($caseCode) - 1; 468 $idx = count($data->extraCaseData); 469 $data->caseMaps[$type][$code] = ($len << 24) | $idx; 470 471 foreach ($caseCode as $c) { 472 $data->extraCaseData[] = $c; 473 } 474 } 475 } 476 } 477} 478 479function generateCaseMPH(string $name, array $map) { 480 $prefix = "_uccase_" . $name; 481 list($gTable, $table) = generateMPH($map, $fast = false); 482 echo "$name: n=", count($table), ", g=", count($gTable), "\n"; 483 484 $result = ""; 485 $result .= "static const unsigned {$prefix}_g_size = " . count($gTable) . ";\n"; 486 $result .= "static const short {$prefix}_g[] = {"; 487 $result .= formatShortDecArray($gTable, 8); 488 $result .= "\n};\n\n"; 489 $result .= "static const unsigned {$prefix}_table_size = " . count($table) . ";\n"; 490 $result .= "static const unsigned {$prefix}_table[] = {"; 491 $result .= formatIntArray(flatten($table), 4); 492 $result .= "\n};\n\n"; 493 return $result; 494} 495 496function generateCaseData(UnicodeData $data) { 497 prepareCaseData($data); 498 499 $result = ""; 500 $result .= generateCaseMPH('upper', $data->caseMaps['upper']); 501 $result .= generateCaseMPH('lower', $data->caseMaps['lower']); 502 $result .= generateCaseMPH('title', $data->caseMaps['title']); 503 $result .= generateCaseMPH('fold', $data->caseMaps['fold']); 504 $result .= "static const unsigned _uccase_extra_table[] = {"; 505 $result .= formatIntArray($data->extraCaseData, 4); 506 $result .= "\n};\n\n"; 507 return $result; 508} 509 510function generateData(UnicodeData $data) { 511 $result = <<<'HEADER' 512/* This file was generated from a modified version UCData's ucgendat. 513 * 514 * DO NOT EDIT THIS FILE! 515 * 516 * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download 517 * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt 518 * files from http://www.unicode.org/Public/ and run this program. 519 * 520 * More information can be found in the UCData package. Unfortunately, 521 * the project's page doesn't seem to be live anymore, so you can use 522 * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */ 523HEADER; 524 $result .= "\n\n" . generatePropData($data); 525 $result .= generateCaseData($data); 526 527 return $result; 528} 529 530/* 531 * Minimal Perfect Hash Generation 532 * 533 * Based on "Hash, displace, and compress" algorithm due to 534 * Belazzougui, Botelho and Dietzfelbinger. 535 * 536 * Hash function based on https://stackoverflow.com/a/12996028/385378. 537 * MPH implementation based on http://stevehanov.ca/blog/index.php?id=119. 538 */ 539 540function hashInt(int $d, int $x) { 541 $x ^= $d; 542 $x = (($x >> 16) ^ $x) * 0x45d9f3b; 543 return $x & 0xffffffff; 544} 545 546function tryGenerateMPH(array $map, int $gSize) { 547 $tableSize = count($map); 548 $table = []; 549 $gTable = array_fill(0, $gSize, 0x7fff); 550 $buckets = []; 551 552 foreach ($map as $k => $v) { 553 $h = hashInt(0, $k) % $gSize; 554 $buckets[$h][] = [$k, $v]; 555 } 556 557 // Sort by descending number of collisions 558 usort($buckets, function ($b1, $b2) { 559 return -(count($b1) <=> count($b2)); 560 }); 561 562 foreach ($buckets as $bucket) { 563 $collisions = count($bucket); 564 if ($collisions <= 1) { 565 continue; 566 } 567 568 // Try values of $d until all elements placed in different slots 569 $d = 1; 570 $i = 0; 571 $used = []; 572 while ($i < $collisions) { 573 if ($d > 0x7fff) { 574 return []; 575 } 576 577 list($k) = $bucket[$i]; 578 $slot = hashInt($d, $k) % $tableSize; 579 if (isset($table[$slot]) || isset($used[$slot])) { 580 $d++; 581 $i = 0; 582 $used = []; 583 } else { 584 $i++; 585 $used[$slot] = true; 586 } 587 } 588 589 $g = hashInt(0, $bucket[0][0]) % $gSize; 590 $gTable[$g] = $d; 591 foreach ($bucket as $elem) { 592 $table[hashInt($d, $elem[0]) % $tableSize] = $elem; 593 } 594 } 595 596 $freeSlots = []; 597 for ($i = 0; $i < $tableSize; $i++) { 598 if (!isset($table[$i])) { 599 $freeSlots[] = $i; 600 } 601 } 602 603 // For buckets with only one element, we directly store the index 604 $freeIdx = 0; 605 foreach ($buckets as $bucket) { 606 if (count($bucket) != 1) { 607 continue; 608 } 609 610 $elem = $bucket[0]; 611 $slot = $freeSlots[$freeIdx++]; 612 $table[$slot] = $elem; 613 614 $g = hashInt(0, $elem[0]) % $gSize; 615 $gTable[$g] = -$slot; 616 } 617 618 ksort($gTable); 619 ksort($table); 620 621 return [$gTable, $table]; 622} 623 624function generateMPH(array $map, bool $fast) { 625 if ($fast) { 626 // Check size starting lambda=5.0 in 0.5 increments 627 for ($lambda = 5.0;; $lambda -= 0.5) { 628 $m = (int) (count($map) / $lambda); 629 $tmpMph = tryGenerateMPH($map, $m); 630 if (!empty($tmpMph)) { 631 $mph = $tmpMph; 632 break; 633 } 634 } 635 } else { 636 // Check all sizes starting lambda=7.0 637 $m = (int) (count($map) / 7.0); 638 for (;; $m++) { 639 $tmpMph = tryGenerateMPH($map, $m); 640 if (!empty($tmpMph)) { 641 $mph = $tmpMph; 642 break; 643 } 644 } 645 } 646 647 return $mph; 648} 649