1<?php error_reporting(E_ALL); 2 3/* This is based on the ucgendat.c file from the OpenLDAP project, licensed 4 * as follows. */ 5 6/* Copyright 1998-2007 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available at 14 * <http://www.OpenLDAP.org/license.html>. 15 */ 16 17/* Copyright 2001 Computing Research Labs, New Mexico State University 18 * 19 * Permission is hereby granted, free of charge, to any person obtaining a 20 * copy of this software and associated documentation files (the "Software"), 21 * to deal in the Software without restriction, including without limitation 22 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 23 * and/or sell copies of the Software, and to permit persons to whom the 24 * Software is furnished to do so, subject to the following conditions: 25 * 26 * The above copyright notice and this permission notice shall be included in 27 * all copies or substantial portions of the Software. 28 * 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 32 * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY 33 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT 34 * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR 35 * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 36 */ 37 38if ($argc < 2) { 39 echo "Usage: php ucgendata.php ./datadir\n"; 40 echo "./datadir must contain:\n"; 41 echo "UnicodeData.txt, CaseFolding.txt, SpecialCasing.txt and DerivedCoreProperties.txt\n"; 42 return; 43} 44 45$dir = $argv[1]; 46$unicodeDataFile = $dir . '/UnicodeData.txt'; 47$caseFoldingFile = $dir . '/CaseFolding.txt'; 48$specialCasingFile = $dir . '/SpecialCasing.txt'; 49$derivedCorePropertiesFile = $dir . '/DerivedCoreProperties.txt'; 50 51$files = [$unicodeDataFile, $caseFoldingFile, $specialCasingFile, $derivedCorePropertiesFile]; 52foreach ($files as $file) { 53 if (!file_exists($file)) { 54 echo "File $file does not exist.\n"; 55 return; 56 } 57} 58 59$outputFile = __DIR__ . "/../unicode_data.h"; 60 61$data = new UnicodeData; 62parseUnicodeData($data, file_get_contents($unicodeDataFile)); 63parseCaseFolding($data, file_get_contents($caseFoldingFile)); 64parseSpecialCasing($data, file_get_contents($specialCasingFile)); 65parseDerivedCoreProperties($data, file_get_contents($derivedCorePropertiesFile)); 66file_put_contents($outputFile, generateData($data)); 67 68class Range { 69 public $start; 70 public $end; 71 72 public function __construct(int $start, int $end) { 73 $this->start = $start; 74 $this->end = $end; 75 } 76} 77 78class UnicodeData { 79 public $propIndexes; 80 public $numProps; 81 public $propRanges; 82 public $caseMaps; 83 public $extraCaseData; 84 85 public function __construct() { 86 /* 87 * List of properties expected to be found in the Unicode Character Database. 88 */ 89 $this->propIndexes = array_flip([ 90 "Mn", "Mc", "Me", "Nd", "Nl", "No", 91 "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", 92 "Co", "Cn", "Lu", "Ll", "Lt", "Lm", 93 "Lo", "Pc", "Pd", "Ps", "Pe", "Po", 94 "Sm", "Sc", "Sk", "So", "L", "R", 95 "EN", "ES", "ET", "AN", "CS", "B", 96 "S", "WS", "ON", "Pi", "Pf", "AL", 97 "Cased", "Case_Ignorable" 98 ]); 99 $this->numProps = count($this->propIndexes); 100 101 $this->propRanges = array_fill(0, $this->numProps, []); 102 $this->caseMaps = [ 103 'upper' => [], 104 'lower' => [], 105 'title' => [], 106 'fold' => [], 107 ]; 108 $this->extraCaseData = []; 109 } 110 111 function propToIndex(string $prop) : int { 112 /* Deal with directionality codes introduced in Unicode 3.0. */ 113 if (in_array($prop, ["BN", "NSM", "PDF", "LRE", "LRO", "RLE", "RLO", "LRI", "RLI", "FSI", "PDI"])) { 114 /* 115 * Mark all of these as Other Neutral to preserve compatibility with 116 * older versions. 117 */ 118 $prop = "ON"; 119 } 120 121 if (!isset($this->propIndexes[$prop])) { 122 throw new Exception("Unknown property $prop"); 123 } 124 125 return $this->propIndexes[$prop]; 126 } 127 128 public function addProp(int $code, string $prop) { 129 $propIdx = self::propToIndex($prop); 130 131 // Check if this extends the last range 132 $ranges = $this->propRanges[$propIdx]; 133 if (!empty($ranges)) { 134 $lastRange = $ranges[count($ranges) - 1]; 135 if ($code === $lastRange->end + 1) { 136 $lastRange->end++; 137 return; 138 } 139 } 140 141 $this->propRanges[$propIdx][] = new Range($code, $code); 142 } 143 144 public function addPropRange(int $startCode, int $endCode, string $prop) { 145 $propIdx = self::propToIndex($prop); 146 $this->propRanges[$propIdx][] = new Range($startCode, $endCode); 147 } 148 149 public function addCaseMapping(string $case, int $origCode, int $mappedCode) { 150 $this->caseMaps[$case][$origCode] = $mappedCode; 151 } 152 153 public function compactRangeArray(array $ranges) : array { 154 // Sort by start codepoint 155 usort($ranges, function (Range $r1, Range $r2) { 156 return $r1->start <=> $r2->start; 157 }); 158 159 $lastRange = new Range(-1, -1); 160 $newRanges = []; 161 foreach ($ranges as $range) { 162 if ($lastRange->end == -1) { 163 $lastRange = $range; 164 } else if ($range->start == $lastRange->end + 1) { 165 $lastRange->end = $range->end; 166 } else if ($range->start > $lastRange->end + 1) { 167 $newRanges[] = $lastRange; 168 $lastRange = $range; 169 } else { 170 throw new Exception(sprintf( 171 "Overlapping ranges [%x, %x] and [%x, %x]", 172 $lastRange->start, $lastRange->end, 173 $range->start, $range->end 174 )); 175 } 176 } 177 if ($lastRange->end != -1) { 178 $newRanges[] = $lastRange; 179 } 180 return $newRanges; 181 } 182 183 public function compactPropRanges() { 184 foreach ($this->propRanges as &$ranges) { 185 $ranges = $this->compactRangeArray($ranges); 186 } 187 } 188} 189 190function parseDataFile(string $input) { 191 $lines = explode("\n", $input); 192 foreach ($lines as $line) { 193 // Strip comments 194 if (false !== $hashPos = strpos($line, '#')) { 195 $line = substr($line, 0, $hashPos); 196 } 197 198 // Skip empty lines 199 $line = trim($line); 200 if ($line === '') { 201 continue; 202 } 203 204 $fields = array_map('trim', explode(';', $line)); 205 yield $fields; 206 } 207} 208 209function parseUnicodeData(UnicodeData $data, string $input) : void { 210 $lines = parseDataFile($input); 211 foreach ($lines as $fields) { 212 if (count($fields) != 15) { 213 throw new Exception("Line does not contain 15 fields"); 214 } 215 216 $code = intval($fields[0], 16); 217 218 $name = $fields[1]; 219 if ($name === '') { 220 throw new Exception("Empty name"); 221 } 222 223 if ($name[0] === '<' && $name !== '<control>') { 224 // This is a character range 225 $lines->next(); 226 $nextFields = $lines->current(); 227 $nextCode = intval($nextFields[0], 16); 228 229 $generalCategory = $fields[2]; 230 $data->addPropRange($code, $nextCode, $generalCategory); 231 232 $bidiClass = $fields[4]; 233 $data->addPropRange($code, $nextCode, $bidiClass); 234 continue; 235 } 236 237 $generalCategory = $fields[2]; 238 $data->addProp($code, $generalCategory); 239 240 $bidiClass = $fields[4]; 241 $data->addProp($code, $bidiClass); 242 243 $upperCase = intval($fields[12], 16); 244 $lowerCase = intval($fields[13], 16); 245 $titleCase = intval($fields[14], 16) ?: $upperCase; 246 if ($upperCase) { 247 $data->addCaseMapping('upper', $code, $upperCase); 248 } 249 if ($lowerCase) { 250 $data->addCaseMapping('lower', $code, $lowerCase); 251 } 252 if ($titleCase) { 253 $data->addCaseMapping('title', $code, $titleCase); 254 } 255 } 256} 257 258function parseCodes(string $strCodes) : array { 259 $codes = []; 260 foreach (explode(' ', $strCodes) as $strCode) { 261 $codes[] = intval($strCode, 16); 262 } 263 return $codes; 264} 265 266function parseCaseFolding(UnicodeData $data, string $input) : void { 267 foreach (parseDataFile($input) as $fields) { 268 if (count($fields) != 4) { 269 throw new Exception("Line does not contain 4 fields"); 270 } 271 272 $code = intval($fields[0], 16); 273 $status = $fields[1]; 274 if ($status == 'T') { 275 // Use language-agnostic case folding 276 continue; 277 } 278 279 if ($status == 'C' || $status == 'S') { 280 $foldCode = intval($fields[2], 16); 281 if (!isset($data->caseMaps['fold'][$code])) { 282 $data->addCaseMapping('fold', $code, $foldCode); 283 } else { 284 // Add simple mapping to full mapping data 285 assert(is_array($data->caseMaps['fold'][$code])); 286 $data->caseMaps['fold'][$code][0] = $foldCode; 287 } 288 } else if ($status == 'F') { 289 $foldCodes = parseCodes($fields[2]); 290 $existingFoldCode = $data->caseMaps['fold'][$code] ?? $code; 291 $data->caseMaps['fold'][$code] = array_merge([$code], $foldCodes); 292 } else { 293 assert(0); 294 } 295 } 296} 297 298function addSpecialCasing(UnicodeData $data, string $type, int $code, array $caseCodes) : void { 299 $simpleCaseCode = $data->caseMaps[$type][$code] ?? $code; 300 if (count($caseCodes) == 1) { 301 if ($caseCodes[0] != $simpleCaseCode) { 302 throw new Exception("Simple case code in special casing does not match"); 303 } 304 305 // Special case: If a title-case character maps to itself, we may still have to store it, 306 // if there is a non-trivial upper-case mapping for it 307 if ($type == 'title' && $code == $caseCodes[0] 308 && ($data->caseMaps['upper'][$code] ?? $code) != $code) { 309 $data->caseMaps['title'][$code] = $code; 310 } 311 return; 312 } 313 314 if (count($caseCodes) > 3) { 315 throw new Exception("Special case mapping with more than 3 code points"); 316 } 317 318 $data->caseMaps[$type][$code] = array_merge([$simpleCaseCode], $caseCodes); 319} 320 321function parseSpecialCasing(UnicodeData $data, string $input) : void { 322 foreach (parseDataFile($input) as $fields) { 323 if (count($fields) != 5 && count($fields) != 6) { 324 throw new Exception("Line does not contain 5 or 6 fields"); 325 } 326 327 $code = intval($fields[0], 16); 328 $lower = parseCodes($fields[1]); 329 $title = parseCodes($fields[2]); 330 $upper = parseCodes($fields[3]); 331 332 $cond = $fields[4]; 333 if ($cond) { 334 // Only use unconditional mappings 335 continue; 336 } 337 338 addSpecialCasing($data, 'lower', $code, $lower); 339 addSpecialCasing($data, 'upper', $code, $upper); 340 341 // Should happen last 342 addSpecialCasing($data, 'title', $code, $title); 343 } 344} 345 346function parseDerivedCoreProperties(UnicodeData $data, string $input) : void { 347 foreach (parseDataFile($input) as $fields) { 348 if (count($fields) != 2) { 349 throw new Exception("Line does not contain 2 fields"); 350 } 351 352 $property = $fields[1]; 353 if ($property != 'Cased' && $property != 'Case_Ignorable') { 354 continue; 355 } 356 357 $range = explode('..', $fields[0]); 358 if (count($range) == 2) { 359 $data->addPropRange(intval($range[0], 16), intval($range[1], 16), $property); 360 } else if (count($range) == 1) { 361 $data->addProp(intval($range[0], 16), $property); 362 } else { 363 throw new Exception("Invalid range"); 364 } 365 } 366} 367 368function formatArray(array $values, int $width, string $format) : string { 369 $result = ''; 370 $i = 0; 371 $c = count($values); 372 for ($i = 0; $i < $c; $i++) { 373 if ($i != 0) { 374 $result .= ','; 375 } 376 377 $result .= $i % $width == 0 ? "\n\t" : " "; 378 $result .= sprintf($format, $values[$i]); 379 } 380 return $result; 381} 382 383function formatShortHexArray(array $values, int $width) : string { 384 return formatArray($values, $width, "0x%04x"); 385} 386function formatShortDecArray(array $values, int $width) : string { 387 return formatArray($values, $width, "% 5d"); 388} 389function formatIntArray(array $values, int $width) : string { 390 return formatArray($values, $width, "0x%08x"); 391} 392 393function generatePropData(UnicodeData $data) { 394 $data->compactPropRanges(); 395 396 $propOffsets = []; 397 $idx = 0; 398 foreach ($data->propRanges as $ranges) { 399 $num = count($ranges); 400 $propOffsets[] = $num ? $idx : 0xffff; 401 $idx += 2*$num; 402 } 403 404 // Add sentinel for binary search 405 $propOffsets[] = $idx; 406 407 // TODO ucgendat.c pads the prop offsets to the next multiple of 4 408 // for rather debious reasons of alignment. This should probably be 409 // dropped 410 while (count($propOffsets) % 4 != 0) { 411 $propOffsets[] = 0; 412 } 413 414 $totalRanges = $idx; 415 416 $result = ""; 417 $result .= "static const unsigned short _ucprop_size = $data->numProps;\n\n"; 418 $result .= "static const unsigned short _ucprop_offsets[] = {"; 419 $result .= formatShortHexArray($propOffsets, 8); 420 $result .= "\n};\n\n"; 421 422 $values = []; 423 foreach ($data->propRanges as $ranges) { 424 foreach ($ranges as $range) { 425 $values[] = $range->start; 426 $values[] = $range->end; 427 } 428 } 429 430 $result .= "static const unsigned int _ucprop_ranges[] = {"; 431 $result .= formatIntArray($values, 4); 432 $result .= "\n};\n\n"; 433 return $result; 434} 435 436function flatten(array $array) { 437 $result = []; 438 foreach ($array as $arr) { 439 foreach ($arr as $v) { 440 $result[] = $v; 441 } 442 } 443 return $result; 444} 445 446function prepareCaseData(UnicodeData $data) { 447 // Don't store titlecase if it's the same as uppercase 448 foreach ($data->caseMaps['title'] as $code => $titleCode) { 449 if ($titleCode == ($data->caseMaps['upper'][$code] ?? $code)) { 450 unset($data->caseMaps['title'][$code]); 451 } 452 } 453 454 // Store full (multi-char) case mappings in a separate table and only 455 // store an index into it 456 foreach ($data->caseMaps as $type => $caseMap) { 457 foreach ($caseMap as $code => $caseCode) { 458 if (is_array($caseCode)) { 459 // -1 because the first entry is the simple case mapping 460 $len = count($caseCode) - 1; 461 $idx = count($data->extraCaseData); 462 $data->caseMaps[$type][$code] = ($len << 24) | $idx; 463 464 foreach ($caseCode as $c) { 465 $data->extraCaseData[] = $c; 466 } 467 } 468 } 469 } 470} 471 472function generateCaseMPH(string $name, array $map) { 473 $prefix = "_uccase_" . $name; 474 list($gTable, $table) = generateMPH($map, $fast = false); 475 echo "$name: n=", count($table), ", g=", count($gTable), "\n"; 476 477 $result = ""; 478 $result .= "static const unsigned {$prefix}_g_size = " . count($gTable) . ";\n"; 479 $result .= "static const short {$prefix}_g[] = {"; 480 $result .= formatShortDecArray($gTable, 8); 481 $result .= "\n};\n\n"; 482 $result .= "static const unsigned {$prefix}_table_size = " . count($table) . ";\n"; 483 $result .= "static const unsigned {$prefix}_table[] = {"; 484 $result .= formatIntArray(flatten($table), 4); 485 $result .= "\n};\n\n"; 486 return $result; 487} 488 489function generateCaseData(UnicodeData $data) { 490 prepareCaseData($data); 491 492 $result = ""; 493 $result .= generateCaseMPH('upper', $data->caseMaps['upper']); 494 $result .= generateCaseMPH('lower', $data->caseMaps['lower']); 495 $result .= generateCaseMPH('title', $data->caseMaps['title']); 496 $result .= generateCaseMPH('fold', $data->caseMaps['fold']); 497 $result .= "static const unsigned _uccase_extra_table[] = {"; 498 $result .= formatIntArray($data->extraCaseData, 4); 499 $result .= "\n};\n\n"; 500 return $result; 501} 502 503function generateData(UnicodeData $data) { 504 $result = <<<'HEADER' 505/* This file was generated from a modified version UCData's ucgendat. 506 * 507 * DO NOT EDIT THIS FILE! 508 * 509 * Instead, compile ucgendat.c (bundled with PHP in ext/mbstring), download 510 * the appropriate UnicodeData-x.x.x.txt and CompositionExclusions-x.x.x.txt 511 * files from http://www.unicode.org/Public/ and run this program. 512 * 513 * More information can be found in the UCData package. Unfortunately, 514 * the project's page doesn't seem to be live anymore, so you can use 515 * OpenLDAPs modified copy (look in libraries/liblunicode/ucdata) */ 516HEADER; 517 $result .= "\n\n" . generatePropData($data); 518 $result .= generateCaseData($data); 519 520 return $result; 521} 522 523/* 524 * Minimal Perfect Hash Generation 525 * 526 * Based on "Hash, displace, and compress" algorithm due to 527 * Belazzougui, Botelho and Dietzfelbinger. 528 * 529 * Hash function based on https://stackoverflow.com/a/12996028/385378. 530 * MPH implementation based on http://stevehanov.ca/blog/index.php?id=119. 531 */ 532 533function hashInt(int $d, int $x) { 534 $x ^= $d; 535 $x = (($x >> 16) ^ $x) * 0x45d9f3b; 536 return $x & 0xffffffff; 537} 538 539function tryGenerateMPH(array $map, int $gSize) { 540 $tableSize = count($map); 541 $table = []; 542 $gTable = array_fill(0, $gSize, 0x7fff); 543 $buckets = []; 544 545 foreach ($map as $k => $v) { 546 $h = hashInt(0, $k) % $gSize; 547 $buckets[$h][] = [$k, $v]; 548 } 549 550 // Sort by descending number of collisions 551 usort($buckets, function ($b1, $b2) { 552 return -(count($b1) <=> count($b2)); 553 }); 554 555 foreach ($buckets as $bucket) { 556 $collisions = count($bucket); 557 if ($collisions <= 1) { 558 continue; 559 } 560 561 // Try values of $d until all elements placed in different slots 562 $d = 1; 563 $i = 0; 564 $used = []; 565 while ($i < $collisions) { 566 if ($d > 0x7fff) { 567 return []; 568 } 569 570 list($k) = $bucket[$i]; 571 $slot = hashInt($d, $k) % $tableSize; 572 if (isset($table[$slot]) || isset($used[$slot])) { 573 $d++; 574 $i = 0; 575 $used = []; 576 } else { 577 $i++; 578 $used[$slot] = true; 579 } 580 } 581 582 $g = hashInt(0, $bucket[0][0]) % $gSize; 583 $gTable[$g] = $d; 584 foreach ($bucket as $elem) { 585 $table[hashInt($d, $elem[0]) % $tableSize] = $elem; 586 } 587 } 588 589 $freeSlots = []; 590 for ($i = 0; $i < $tableSize; $i++) { 591 if (!isset($table[$i])) { 592 $freeSlots[] = $i; 593 } 594 } 595 596 // For buckets with only one element, we directly store the index 597 $freeIdx = 0; 598 foreach ($buckets as $bucket) { 599 if (count($bucket) != 1) { 600 continue; 601 } 602 603 $elem = $bucket[0]; 604 $slot = $freeSlots[$freeIdx++]; 605 $table[$slot] = $elem; 606 607 $g = hashInt(0, $elem[0]) % $gSize; 608 $gTable[$g] = -$slot; 609 } 610 611 ksort($gTable); 612 ksort($table); 613 614 return [$gTable, $table]; 615} 616 617function generateMPH(array $map, bool $fast) { 618 if ($fast) { 619 // Check size starting lambda=5.0 in 0.5 increments 620 for ($lambda = 5.0;; $lambda -= 0.5) { 621 $m = (int) (count($map) / $lambda); 622 $tmpMph = tryGenerateMPH($map, $m); 623 if (!empty($tmpMph)) { 624 $mph = $tmpMph; 625 break; 626 } 627 } 628 } else { 629 // Check all sizes starting lambda=7.0 630 $m = (int) (count($map) / 7.0); 631 for (;; $m++) { 632 $tmpMph = tryGenerateMPH($map, $m); 633 if (!empty($tmpMph)) { 634 $mph = $tmpMph; 635 break; 636 } 637 } 638 } 639 640 return $mph; 641} 642