<?php
/*
  +----------------------------------------------------------------------+
  | PHP Version 5                                                        |
  +----------------------------------------------------------------------+
  | Copyright (c) 1997-2010 The PHP Group                                |
  +----------------------------------------------------------------------+
  | This source file is subject to version 3.01 of the PHP license,      |
  | that is bundled with this package in the file LICENSE, and is        |
  | available through the world-wide-web at the following url:           |
  | http://www.php.net/license/3_01.txt                                  |
  | If you did not receive a copy of the PHP license and are unable to   |
  | obtain it through the world-wide-web, please send a note to          |
  | license@php.net so we can mail you a copy immediately.               |
  +----------------------------------------------------------------------+
  | Authors: Gustavo Lopes <cataphract@php.net>                          |
  +----------------------------------------------------------------------+
*/

/* This file prints to stdout the contents of ext/standard/html_tables.h */
/* put together with glue; have patience */

/*
 * Layout of this script:
 *  1. a fixed C prologue (license, enum entity_charset, charset_map, typedefs);
 *  2. two-stage "charset byte -> Unicode" tables, one set per single-byte
 *     encoding listed in the first $encodings array;
 *  3. flat "Unicode -> charset byte" tables for the second $encodings array;
 *  4. for each entity list (the `again:` label is re-entered once per list):
 *     multi-stage "code point -> entity" tables followed by a hash table for
 *     "entity -> code point".
 * All input files are opened relative to the current working directory.
 */

/**
 * Reads a whole input file, aborting the generator if it cannot be read.
 *
 * Everything this script echoes becomes part of the generated header, so a
 * missing input must stop the run instead of silently yielding empty tables.
 *
 * @param string $path file name, relative to the current working directory
 * @return string the file contents
 */
function html_table_gen_read_file($path)
{
	$contents = file_get_contents($path);
	if ($contents === false) {
		file_put_contents("php://stderr",
			"html_table_gen: unable to read input file '$path'\n");
		exit(1);
	}
	return $contents;
}

/* C prologue; the single %s below receives the current year */
$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 5                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) 1997-%s The PHP Group                                |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

/* \$Id$ */

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		cs_8859_1 },
	{ "ISO8859-1",		cs_8859_1 },
	{ "ISO-8859-15",	cs_8859_15 },
	{ "ISO8859-15",		cs_8859_15 },
	{ "utf-8",			cs_utf_8 },
	{ "cp1252",			cs_cp1252 },
	{ "Windows-1252",	cs_cp1252 },
	{ "1252",			cs_cp1252 },
	{ "BIG5",			cs_big5 },
	{ "950",			cs_big5 },
	{ "GB2312",			cs_gb2312 },
	{ "936",			cs_gb2312 },
	{ "BIG5-HKSCS",		cs_big5hkscs },
	{ "Shift_JIS",		cs_sjis },
	{ "SJIS",			cs_sjis },
	{ "932",			cs_sjis },
	{ "EUCJP",			cs_eucjp },
	{ "EUC-JP",			cs_eucjp },
	{ "KOI8-R",			cs_koi8r },
	{ "koi8-ru",		cs_koi8r },
	{ "koi8r",			cs_koi8r },
	{ "cp1251",			cs_cp1251 },
	{ "Windows-1251",	cs_cp1251 },
	{ "win-1251",		cs_cp1251 },
	{ "iso8859-5",		cs_8859_5 },
	{ "iso-8859-5",		cs_8859_5 },
	{ "cp866",			cs_cp866 },
	{ "866",			cs_cp866 },
	{ "ibm866",			cs_cp866 },
	{ "MacRoman",		cs_macroman },
	{ NULL }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
	unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
	const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) ((k & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

echo sprintf($t, date("Y"));

/* Single-byte encodings that get "charset byte -> Unicode" tables.
 * "enumid" is the slot used in enc_to_uni_index[]; the values correspond to
 * the positions in enum entity_charset emitted above. */
$encodings = array(
	array(
		"ident" => "iso88591",
		"enumid" => 1,
		"name" => "ISO-8859-1",
		"file" => "mappings/8859-1.TXT",
	),
	array(
		"ident" => "iso88595",
		"enumid" => 5,
		"name" => "ISO-8859-5",
		"file" => "mappings/8859-5.TXT",
	),
	array(
		"ident" => "iso885915",
		"enumid" => 3,
		"name" => "ISO-8859-15",
		"file" => "mappings/8859-15.TXT",
	),
	array(
		"ident" => "win1252",
		"enumid" => 2,
		"enumident" => "cp1252",
		"name" => "Windows-1252",
		"file" => "mappings/CP1252.TXT",
	),
	array(
		"ident" => "win1251",
		"enumid" => 4,
		"enumident" => "cp1251", /* was "cp1252" (copy-paste slip; key is not read below) */
		"name" => "Windows-1251",
		"file" => "mappings/CP1251.TXT",
	),
	array(
		"ident" => "koi8r",
		"enumid" => 8,
		"name" => "KOI8-R",
		"file" => "mappings/KOI8-R.TXT",
	),
	array(
		"ident" => "cp866",
		"enumid" => 6,
		"name" => "CP-866",
		"file" => "mappings/CP866.TXT",
	),
	array(
		"ident" => "macroman",
		"enumid" => 7,
		"name" => "MacRoman",
		"file" => "mappings/ROMAN.TXT",
	),
);

/* Every stage-2 table emitted so far and, at the same index, the C symbol it
 * was emitted under. An encoding whose 64-entry block is identical to an
 * earlier one reuses that symbol instead of emitting a duplicate. */
$prevStage2 = array();
$prevStage2Names = array();

foreach ($encodings as $e) {
	echo "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

	/* process file: keep the lines of the form "0xXX<TAB>0xYYYY" */
	$map = array();
	$lines = explode("\n", html_table_gen_read_file($e['file']));
	foreach ($lines as $l) {
		if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
			$map[] = array($matches[1], $matches[2]);
	}

	/* charset byte => Unicode code point */
	$mappy = array();
	foreach ($map as $v) {
		$mappy[hexdec($v[0])] = hexdec($v[1]);
	}

	$mstable = array("ident" => $e['ident']);
	/* calculate two-stage tables: 4 blocks of 64 bytes; NULL = no mapping */
	for ($i = 0; $i < 4; $i++) {
		for ($j = 0; $j < 64; $j++) {
			$cp = $i << 6 | $j;
			$mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
		}
	}

	echo "/* {{{ Stage 2 tables for {$e['name']} */\n\n";

	/* C symbol of the stage-2 table used for each of the 4 blocks */
	$s2tables_names = array();
	for ($i = 0; $i < 4; $i++) {
		/* strict comparison: a NULL (unmapped) slot must not match 0 (U+0000) */
		$found = array_keys($prevStage2, $mstable[$i], true);
		if ($found !== array()) {
			$s2tables_names[$i] = $prevStage2Names[$found[0]];
			continue;
		}

		$s2tables_names[$i] = "enc_to_uni_s2_{$e['ident']}_" .
			sprintf("%02X", $i << 6);

		echo "static const enc_to_uni_stage2 {$s2tables_names[$i]} = { {\n";
		for ($j = 0; $j < 64; $j++) {
			if ($j == 0) echo "\t";
			elseif ($j % 6 == 0) echo "\n\t";
			else echo " ";
			if ($mstable[$i][$j] !== NULL)
				echo sprintf("0x%04X,", $mstable[$i][$j]);
			else
				echo "0xFFFF,"; /* special value; indicates no mapping */
		}
		echo "\n} };\n\n";

		$prevStage2[] = $mstable[$i];
		$prevStage2Names[] = $s2tables_names[$i];
	}

	echo "/* end of stage 2 tables for {$e['name']} }}} */\n\n";

	echo "/* {{{ Stage 1 table for {$e['name']} */\n";

	echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&{$s2tables_names[0]},
\t&{$s2tables_names[1]},
\t&{$s2tables_names[2]},
\t&{$s2tables_names[3]} }
};
";

	echo "/* end of stage 1 table for {$e['name']} }}} */\n\n";
}

/* Index of the stage-1 tables by enumid; slots not claimed by any encoding
 * keep their numeric placeholder from range() and are emitted as NULL. */
$maxencnum = max(array_map(function($e) { return $e['enumid']; }, $encodings));
$a = range(0, $maxencnum);
foreach ($encodings as $e) {
	$a[$e['enumid']] = $e['ident'];
}

echo
"/* {{{ Index of tables for encoding conversion */
static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

foreach ($a as $k => $v) {
	if (is_numeric($v))
		echo "\tNULL,\n";
	else
		echo "\t&enc_to_uni_$v,\n";
}

echo
"};
/* }}} */\n";

$t = <<<CODE

/* Definitions for mappings *from* Unicode */

typedef struct {
	unsigned short un_code_point; /* we don't need bigger */
	unsigned char cs_code; /* currently, we only have maps to single-byte encodings */
} uni_to_enc;


CODE;

echo $t;

/* Encodings that get "Unicode -> charset byte" tables; only bytes inside the
 * inclusive "range" are included in the emitted table. */
$encodings = array(
	array(
		"ident" => "iso885915",
		"name" => "ISO-8859-15",
		"file" => "mappings/8859-15.TXT",
		"range" => array(0xA4, 0xBE),
	),
	array(
		"ident" => "win1252",
		"name" => "Windows-1252",
		"file" => "mappings/CP1252.TXT",
		"range" => array(0x80, 0x9F),
	),
	array(
		"ident" => "win1251",
		"name" => "Windows-1251",
		"file" => "mappings/CP1251.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "koi8r",
		"name" => "KOI8-R",
		"file" => "mappings/KOI8-R.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "cp866",
		"name" => "CP-866",
		"file" => "mappings/CP866.TXT",
		"range" => array(0x80, 0xFF),
	),
	array(
		"ident" => "macroman",
		"name" => "MacRoman",
		"file" => "mappings/ROMAN.TXT",
		"range" => array(0x80, 0xFF),
	),
);

foreach ($encodings as $e) {
	echo "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

	/* process file: "0xXX<TAB>0xYYYY ... # description" */
	$map = array();
	$lines = explode("\n", html_table_gen_read_file($e['file']));
	foreach ($lines as $l) {
		if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
			$map[] = array($matches[1], $matches[2], rtrim($matches[3]));
	}

	/* Unicode code point => array(charset byte, lower-cased description) */
	$mappy = array();
	foreach ($map as $v) {
		$byte = hexdec($v[0]);
		if ($byte >= $e['range'][0] && $byte <= $e['range'][1])
			$mappy[hexdec($v[1])] = array($byte, strtolower($v[2]));
	}
	ksort($mappy); /* rows are emitted in ascending code point order */

	echo "static const uni_to_enc unimap_{$e['ident']}[] = {\n";

	foreach ($mappy as $k => $v) {
		echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
			$v[1], " */\n";
	}
	echo "};\n";

	/* NOTE(review): this closing marker opens a fold ("{{{") instead of
	 * closing one ("}}}"); it is emitted verbatim so that the generated
	 * header does not change. */
	echo "/* {{{ end of mappings *from* Unicode for {$e['name']} */\n\n";
}

/* First entity list. The code from `again:` on runs once per list; the script
 * tail sets $data/$pass2/$name/$ident for the next list and jumps back here.
 * $pass2 starts as false, which the checks below treat as "first pass". */
$data = html_table_gen_read_file("ents_html5.txt");
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:

$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* Table should be organized with a leading row telling the size of
 * the table and the default entity (maybe NULL) and the rest being
 * normal rows ordered by code point so that we can do a binary search */
typedef union {
	struct {
		unsigned size; /* number of remaining entries in the table */
		const char *default_entity;
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		unsigned second_cp; /* second code point */
		const char *entity;
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

if (!$pass2)
	echo $t;

/* Parse the entity list. Each row becomes array(name, cp1[, cp2]) with the
 * code points kept as the hex strings found in the file. */
$dp = array();

foreach (explode("\n", $data) as $l) {
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $l, $matches)) {
		$dp[] = array($matches[1], $matches[2], $matches[3]);
	} else if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $l, $matches)) {
		$dp[] = array($matches[1], $matches[2]);
	}
}

$origdp = $dp; /* file order, used later for the entity -> codepoint hash table */

usort($dp, function($a, $b) { return hexdec($a[1])-hexdec($b[1]); });

/* First code points that start at least one two-code-point entity */
$multicp_rows = array();
foreach ($dp as $el) {
	if (count($el) == 3) {
		$multicp_rows[$el[1]] = array();
	}
}

/* For those code points: second code point => entity, plus "default" for
 * the entity that maps to the first code point alone */
foreach ($dp as $el) {
	if (array_key_exists($el[1], $multicp_rows)) {
		if (count($el) == 3)
			$multicp_rows[$el[1]][$el[2]] = $el[0];
		else
			$multicp_rows[$el[1]]["default"] = $el[0];
	}
}

if ($pass2 < 2)
	echo "/* {{{ Start of $name multi-stage table for codepoint -> entity */", "\n\n";
else
	echo "/* {{{ Start of $name table for codepoint -> entity */", "\n\n";

if (empty($multicp_rows))
	goto skip_multicp;

ksort($multicp_rows);
foreach ($multicp_rows as &$v) { ksort($v); }
unset($v);

echo "/* {{{ Start of double code point tables for $name */", "\n\n";

foreach ($multicp_rows as $k => $v) {
	echo "static const entity_multicodepoint_row multi_cp_{$ident}_",
		sprintf("%05s", $k), "[] = {", "\n";
	if (array_key_exists("default", $v)) {
		if ($v['default'] == 'GT') /* hack to make > translate to &gt; not &GT; */
			$v['default'] = "gt";
		echo "\t{ {", sprintf("%02d", count($v) - 1),
			",\t\t", sprintf("\"%-21s", $v["default"].'",'), "\t",
			sprintf("% 2d", strlen($v["default"])), '} },', "\n";
	} else {
		echo "\t{ {", sprintf("%02d", count($v)),
			",\t\t", sprintf("%-22s", 'NULL'), ",\t0} },\n";
	}
	unset($v["default"]);
	foreach ($v as $l => $w) {
		echo "\t{ {", sprintf("0x%05s", $l), ",\t", sprintf("\"%-21s", $w.'",'), "\t",
			sprintf("% 2d", strlen($w)), '} },', "\n";
	}
	echo "};\n";
}
echo "\n/* End of double code point tables }}} */", "\n\n";

skip_multicp:

if ($pass2 < 2)
	echo "/* {{{ Stage 3 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage3_row empty_stage3_table[] = {
	/* 64 elements */
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
	{0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } }, {0, { {NULL, 0} } },
};

CODE;

if (!$pass2)
	echo $t;

/* $mstable[stage1][stage2][stage3] => entity name, or "" when the code point
 * has a double-code-point table and must be looked up there */
$mstable = array();
foreach ($dp as $el) {
	$s1 = (hexdec($el[1]) & 0xFFF000) >> 12;
	$s2 = (hexdec($el[1]) & 0xFC0) >> 6;
	$s3 = hexdec($el[1]) & 0x3F;
	if (array_key_exists($el[1], $multicp_rows)) {
		$mstable[$s1][$s2][$s3] = "";
	} else {
		$mstable[$s1][$s2][$s3] = $el[0];
	}
}

/* Emit one stage-3 table for every 64-code-point block holding an entity */
for ($i = 0; $i < 0x1E; $i++) {
	for ($k = 0; $k < 64; $k++) {
		$any3 = false;
		$col3 = array();
		for ($l = 0; $l < 64; $l++) {
			if (isset($mstable[$i][$k][$l])) {
				$any3 = true;
				$col3[$l] = $mstable[$i][$k][$l];
			} else {
				$col3[$l] = null;
			}
		}
		if ($any3) {
			echo "static const entity_stage3_row stage3_table_{$ident}_",
				sprintf("%02X%03X", $i, $k << 6), "[] = {\n";
			foreach ($col3 as $y => $z) {
				if ($y == 0) echo "\t";
				elseif ($y % 4 == 0) echo "\n\t";
				else echo " ";
				if ($z === NULL)
					echo "{0, { {NULL, 0} } },";
				elseif ($z === "QUOT") /* hack to translate " into &quot;, not &QUOT; */
					echo "{0, { {\"quot\", 4} } },";
				elseif ($z !== "")
					echo "{0, { {\"$z\", ", strlen($z), "} } },";
				else
					echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
						($i << 12) | ($k << 6) | $y ), "} } },";

			}
			echo "\n};\n\n";
		}
	}
}

if ($pass2 < 2)
	echo "/* end of stage 3 Tables for $name }}} */", "\n\n";

/* the basic entity lists (passes 2 and 3) get no stage 1/2 tables */
if ($pass2 > 1)
	goto hashtables;

echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

$t = <<<CODE
static const entity_stage2_row empty_stage2_table[] = {
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
	empty_stage3_table, empty_stage3_table, empty_stage3_table, empty_stage3_table,
};

CODE;

if (!$pass2)
	echo $t;

for ($i = 0; $i < 0x1E; $i++) {
	$any = false;
	for ($k = 0; $k < 64; $k++) {
		if (isset($mstable[$i][$k]))
			$any = true;
	}
	if ($any) {
		echo "static const entity_stage2_row stage2_table_{$ident}_",
			sprintf("%02X000", $i), "[] = {\n";
		for ($k = 0; $k < 64; $k++) {
			if ($k == 0) echo "\t";
			elseif ($k % 4 == 0) echo "\n\t";
			else echo " ";
			if (isset($mstable[$i][$k])) {
				echo sprintf("stage3_table_{$ident}_%05X", ($i << 12) | ($k << 6)), ",";
			} else {
				echo "empty_stage3_table", ",";
			}
		}
		echo "\n};\n\n";
	}
}

echo "/* end of stage 2 tables for $name }}} */", "\n\n";

echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
	if (isset($mstable[$i]))
		echo "\t", sprintf("stage2_table_{$ident}_%02X000", $i), ",\n";
	else
		echo "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";

/* commented-out; this enabled binary search, which turned out to be
 * significantly slower than the hash tables for html 5 entities */
//echo
//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";

//$t = <<<CODE
//typedef struct {
//	const char *entity;
//	unsigned short entity_len;
//	unsigned int codepoint1;
//	unsigned int codepoint2;
//} entity_cp_map;
//
//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
//
//static const entity_cp_map html5_ent_cp_map[] = {
//
//CODE;
//echo $t;
//
//$dp = $origdp;
//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
//	return $d==0?strcmp($a[0], $b[0]):$d; });
//
//$k = 0;
//foreach ($dp as $o) {
//	if ($k == 0) echo "\t";
//	elseif ($k % 3 == 0) echo "\n\t";
//	else echo " ";
//	if (isset($o[2]))
//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
//			hexdec($o[1]), hexdec($o[2]));
//	else
//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
//			hexdec($o[1]));
//
//	if (isset($o[2])) {
//		$entlen = strlen($o[0]) + 2;
//		$utf8len = strlen(
//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
//		if ($utf8len > $entlen*1.2) {
//			die("violated assumption for traverse_for_entities");
//		}
//	}
//
//	$k++;
//}
//echo "\n};\n\n";
//
//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
//
//echo
//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";

hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */", "\n\n";

/* C types for the hash tables; echoed by the code that follows, first pass only */
$t = <<<CODE
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

/* The C typedefs held in $t are shared by all passes: emit them only once */
if (!$pass2)
	echo $t;

/**
 * Hashes an entity name: starts at 5381, then for every byte computes
 * hash * 33 + byte (written as (hash << 5) + hash), truncated to 32 bits.
 *
 * NOTE(review): presumably this has to stay in sync with the hash function
 * the C consumer of ent_ht_* uses for lookups -- confirm before changing.
 *
 * @param string $str entity name (without & and ;)
 * @return int hash value in the range 0..0xFFFFFFFF
 */
function hashfun($str)
{
	$hash = 5381;
	$len = strlen($str);

	for ($pos = 0; $pos < $len; $pos++) {
		$hash = (int)(((int)(((int)($hash << 5)) + $hash)) + ord($str[$pos]))
			& 0xFFFFFFFF;
	}

	return $hash;
}

/**
 * Number of bytes UTF-8 needs to encode one code point.
 *
 * Replaces the former mb_convert_encoding(..., "HTML-ENTITIES") round trip,
 * which is deprecated as of PHP 8.2 (its notice would end up in the generated
 * header, since this script writes to stdout) and needed ext/mbstring.
 *
 * @param int $cp Unicode code point
 * @return int 1 to 4
 */
function html_table_gen_utf8_len($cp)
{
	if ($cp < 0x80)
		return 1;
	if ($cp < 0x800)
		return 2;
	if ($cp < 0x10000)
		return 3;
	return 4;
}

/* Bucket count: smallest power of two >= 1.5 * number of entities, minimum 16 */
$numelems = (int) max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
$mask = $numelems - 1;

/* bucket index => list of entity rows (name, cp1[, cp2]) in file order */
$hashes = array();
foreach ($origdp as $e) {
	$hashes[hashfun($e[0]) & $mask][] = $e;

	if (isset($e[2])) {
		/* Sanity check for two-code-point entities: the UTF-8 expansion may
		 * be at most 1.2 times as long as "&name;" */
		$entlen = strlen($e[0]) + 2;
		$utf8len = html_table_gen_utf8_len(hexdec($e[1]))
			+ html_table_gen_utf8_len(hexdec($e[2]));
		if ($utf8len > $entlen*1.2) {
			/* report on stderr with a failure status; stdout is the header */
			file_put_contents("php://stderr",
				"violated assumption for traverse_for_entities\n");
			exit(1);
		}
	}
}

/* One NULL-terminated array per non-empty bucket */
for ($i = 0; $i < $numelems; $i++) {
	if (empty($hashes[$i]))
		continue;

	echo "static const entity_cp_map ht_bucket_{$ident}_", sprintf("%03X", $i) ,"[] = {";
	foreach ($hashes[$i] as $h) {
		if (isset($h[2])) {
			echo sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
				$h[0], strlen($h[0]), hexdec($h[1]), hexdec($h[2]));
		} else {
			echo sprintf(' {"%s", %d, 0x%05X, 0},',
				$h[0], strlen($h[0]), hexdec($h[1]));
		}
	}
	echo " {NULL, 0, 0, 0} };\n";
}
echo "\n";

/* Bucket index; empty slots point at the shared ht_bucket_empty */
echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";

for ($i = 0; $i < $numelems; $i++) {
	if ($i == 0) echo "\t";
	elseif ($i % 4 == 0) echo "\n\t";
	else echo " ";
	if (empty($hashes[$i]))
		echo "ht_bucket_empty,";
	else
		echo "ht_bucket_{$ident}_", sprintf("%03X", $i), ",";
}
echo "\n};\n\n";

echo
"static const entity_ht ent_ht_{$ident} = {
	", sprintf("0x%X", $numelems), ",
	ht_buckets_{$ident}
};\n\n";

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";

/* Entity list to process after the current pass: array(file, name, ident).
 * $pass2 is false on the first pass, which (int) turns into 0; after pass 3
 * there is no entry and the script falls through to the epilogue. */
$nextPasses = array(
	0 => array("ents_html401.txt",    "HTML 4.01",                  "html4"),
	1 => array("ents_basic.txt",      "Basic entities (no apos)",   "be_noapos"),
	2 => array("ents_basic_apos.txt", "Basic entities (with apos)", "be_apos"),
);
$curPass = (int) $pass2;
if (isset($nextPasses[$curPass])) {
	list($nextFile, $name, $ident) = $nextPasses[$curPass];
	$data = file_get_contents($nextFile);
	if ($data === false) {
		/* do not generate empty tables from a missing input file */
		file_put_contents("php://stderr",
			"html_table_gen: unable to read input file '$nextFile'\n");
		exit(1);
	}
	$pass2 = $curPass + 1;
	goto again;
}

echo "#endif /* HTML_TABLES_H */\n";