1#!/usr/bin/env php
2<?php
3/*
4   +----------------------------------------------------------------------+
5   | Copyright (c) The PHP Group                                          |
6   +----------------------------------------------------------------------+
7   | This source file is subject to version 3.01 of the PHP license,      |
8   | that is bundled with this package in the file LICENSE, and is        |
9   | available through the world-wide-web at the following url:           |
10   | https://www.php.net/license/3_01.txt                                 |
11   | If you did not receive a copy of the PHP license and are unable to   |
12   | obtain it through the world-wide-web, please send a note to          |
13   | license@php.net so we can mail you a copy immediately.               |
14   +----------------------------------------------------------------------+
15   | Authors: Gustavo Lopes  <cataphract@php.net>                         |
16   +----------------------------------------------------------------------+
17*/
18
19/* This file prints to stdout the contents of ext/standard/html_tables.h */
20/* put together with glue; have patience */
21
/* Fixed prologue of the generated header: license banner, include guard,
 * the entity_charset enum with its classification macros, the charset name
 * map and the types/macros used by the to-Unicode tables. The text is
 * written to stdout verbatim.
 *
 * ENT_ENC_TO_UNI_STAGE1 parenthesizes its argument, like every other macro
 * emitted by this script, so that an expression argument (e.g. "a | b")
 * is masked as a whole instead of being split by operator precedence. */
$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | https://www.php.net/license/3_01.txt                                 |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	uint32_t codeset_len;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		sizeof("ISO-8859-1")-1,		cs_8859_1 },
	{ "ISO8859-1",		sizeof("ISO8859-1")-1,		cs_8859_1 },
	{ "ISO-8859-15",	sizeof("ISO-8859-15")-1,	cs_8859_15 },
	{ "ISO8859-15",		sizeof("ISO8859-15")-1,		cs_8859_15 },
	{ "utf-8",			sizeof("utf-8")-1,			cs_utf_8 },
	{ "cp1252", 		sizeof("cp1252")-1, 		cs_cp1252 },
	{ "Windows-1252",	sizeof("Windows-1252")-1,	cs_cp1252 },
	{ "1252",			sizeof("1252")-1,			cs_cp1252 },
	{ "BIG5",			sizeof("BIG5")-1,			cs_big5 },
	{ "950",			sizeof("950")-1,			cs_big5 },
	{ "GB2312",			sizeof("GB2312")-1,			cs_gb2312 },
	{ "936",			sizeof("936")-1,			cs_gb2312 },
	{ "BIG5-HKSCS",		sizeof("BIG5-HKSCS")-1,		cs_big5hkscs },
	{ "Shift_JIS",		sizeof("Shift_JIS")-1,		cs_sjis },
	{ "SJIS",			sizeof("SJIS")-1,			cs_sjis },
	{ "932",			sizeof("932")-1,			cs_sjis },
	{ "SJIS-win",		sizeof("SJIS-win")-1,		cs_sjis },
	{ "CP932",			sizeof("CP932")-1,			cs_sjis },
	{ "EUCJP",			sizeof("EUCJP")-1,			cs_eucjp },
	{ "EUC-JP",			sizeof("EUC-JP")-1,			cs_eucjp },
	{ "eucJP-win",		sizeof("eucJP-win")-1,		cs_eucjp },
	{ "KOI8-R",			sizeof("KOI8-R")-1,			cs_koi8r },
	{ "koi8-ru",		sizeof("koi8-ru")-1,		cs_koi8r },
	{ "koi8r",			sizeof("koi8r")-1,			cs_koi8r },
	{ "cp1251",			sizeof("cp1251")-1,			cs_cp1251 },
	{ "Windows-1251",	sizeof("Windows-1251")-1,	cs_cp1251 },
	{ "win-1251",		sizeof("win-1251")-1,		cs_cp1251 },
	{ "iso8859-5",		sizeof("iso8859-5")-1,		cs_8859_5 },
	{ "iso-8859-5",		sizeof("iso-8859-5")-1,		cs_8859_5 },
	{ "cp866",			sizeof("cp866")-1,			cs_cp866 },
	{ "866",			sizeof("866")-1,			cs_cp866 },
	{ "ibm866",			sizeof("ibm866")-1,			cs_cp866 },
	{ "MacRoman",		sizeof("MacRoman")-1,		cs_macroman }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
    unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
    const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 (only single-byte encodings supported) */
#define ENT_ENC_TO_UNI_STAGE1(k) (((k) & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

echo $t;
121
/* Single-byte encodings for which mappings *to* Unicode are generated.
 *   ident     - suffix used in the names of the emitted C tables
 *   enumid    - slot of the encoding in the emitted enc_to_uni_index array;
 *               it has to agree with the position of the matching constant
 *               in the entity_charset enum printed in the header prologue
 *   enumident - enum constant suffix where it differs from ident (not read
 *               by the code visible in this script; kept as documentation)
 *   name      - human readable name used in the emitted comments
 *   file      - mapping file, relative to the current working directory */
$encodings = array(
    array(
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ),
    array(
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ),
    array(
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ),
    array(
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ),
    array(
        "ident" => "win1251",
        "enumid" => 4,
        /* was "cp1252", copied from the entry above; enumid 4 is cs_cp1251 */
        "enumident" => "cp1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ),
    array(
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ),
    array(
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ),
    array(
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ),
);
174
/* Emit, for every encoding in $encodings, the two-stage tables that map a
 * byte of that encoding to a Unicode code point.
 *
 * $prevStage2 holds the stage 2 tables (64 entries each) emitted so far and
 * $prevStage2Names holds, at the same index, the C symbol each one was
 * emitted under. An encoding whose block is identical to an earlier one
 * references that symbol instead of emitting a duplicate. Recording the
 * symbol itself replaces the former "$encodings[$index/5]" lookup, which
 * indexed an array with a fractional float. */
$prevStage2 = array();
$prevStage2Names = array();

foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file */
    $contents = file_get_contents($e['file']);
    if ($contents === false) {
        /* stdout is the generated header: report on stderr and stop instead
         * of emitting tables that contain nothing but "no mapping" markers */
        fwrite(STDERR, "html_table_gen: cannot read {$e['file']}\n");
        exit(1);
    }

    /* byte value => code point, taken from lines of the form
     * "0xXX<TAB>0xYYYY"; a later line for the same byte wins */
    $mappy = array();
    foreach (explode("\n", $contents) as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
            $mappy[hexdec($matches[1])] = hexdec($matches[2]);
    }

    /* calculate two-stage tables: the two high bits of the byte select the
     * stage 2 table, the six low bits the entry inside it */
    $mstable = array();
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
        }
    }

    echo
"/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    $s2tables_names = array();
    for ($i = 0; $i < 4; $i++) {
        /* strict search: a loose one would treat "no mapping" (NULL) and a
         * mapping to U+0000 as equal */
        $found = array_search($mstable[$i], $prevStage2, true);
        if ($found !== false) {
            $s2tables_names[$i] = $prevStage2Names[$found];
            continue;
        }

        $s2tables_names[$i] =
            sprintf("enc_to_uni_s2_%s_%02X", $e['ident'], $i << 6);

        echo "static const enc_to_uni_stage2 {$s2tables_names[$i]} = { {\n";
        for ($j = 0; $j < 64; $j++) {
            /* six values per output line */
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== NULL)
                printf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[] = $mstable[$i];
        $prevStage2Names[] = $s2tables_names[$i];
    }

    echo
"/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo
"/* {{{ Stage 1 table for {$e['name']} */\n";

    echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&{$s2tables_names[0]},
\t&{$s2tables_names[1]},
\t&{$s2tables_names[2]},
\t&{$s2tables_names[3]} }
};
";

    echo
"/* end of stage 1 table for {$e['name']} }}} */\n\n";
}
247
/* Emit enc_to_uni_index: one slot per enum id from 0 up to the highest
 * "enumid" in $encodings. A slot claimed by an encoding points at that
 * encoding's stage 1 table; a slot nobody claimed keeps its numeric
 * placeholder from range() and is emitted as NULL. */
$highestEnumId = max(array_map(function ($enc) { return $enc['enumid']; }, $encodings));
$indexSlots = range(0, $highestEnumId);
foreach ($encodings as $enc) {
    $indexSlots[$enc['enumid']] = $enc['ident'];
}

$indexText = "/* {{{ Index of tables for encoding conversion */\n"
    . "static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";
foreach ($indexSlots as $slot) {
    $indexText .= is_numeric($slot) ? "\tNULL,\n" : "\t&enc_to_uni_$slot,\n";
}

echo $indexText, "};\n/* }}} */\n";
266
/* C type of one row of the from-Unicode tables: a code point together with
 * the byte it maps to in the target encoding. Assembled line by line; the
 * two empty trailing elements produce the blank lines after the typedef. */
$t = implode("\n", array(
    "",
    "/* Definitions for mappings *from* Unicode */",
    "",
    "typedef struct {",
    "\tunsigned short un_code_point; /* we don't need bigger */",
    "\tunsigned char cs_code; /* currently, we only have maps to single-byte encodings */",
    "} uni_to_enc;",
    "",
    "",
));

print $t;
280
/* Encodings for which mappings *from* Unicode are generated. Each compact
 * row below is (ident, name, mapping file, first byte, last byte); the loop
 * expands it into the associative form the generator reads, where "range"
 * is the inclusive range of byte values that get an entry in the table. */
$encodings = array();
foreach (array(
    array("iso885915", "ISO-8859-15",  "mappings/8859-15.TXT", 0xA4, 0xBE),
    array("win1252",   "Windows-1252", "mappings/CP1252.TXT",  0x80, 0x9F),
    array("win1251",   "Windows-1251", "mappings/CP1251.TXT",  0x80, 0xFF),
    array("koi8r",     "KOI8-R",       "mappings/KOI8-R.TXT",  0x80, 0xFF),
    array("cp866",     "CP-866",       "mappings/CP866.TXT",   0x80, 0xFF),
    array("macroman",  "MacRoman",     "mappings/ROMAN.TXT",   0x80, 0xFF),
) as $encRow) {
    $encodings[] = array(
        "ident" => $encRow[0],
        "name" => $encRow[1],
        "file" => $encRow[2],
        "range" => array($encRow[3], $encRow[4]),
    );
}
319
/* Emit, for every encoding in $encodings, a table of { code point, byte }
 * rows sorted by code point. Only bytes inside the encoding's "range" get
 * a row. The trailing comment of each row is the lower-cased text that
 * follows '#' on the corresponding line of the mapping file. */
foreach ($encodings as $e) {
    echo
"/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file */
    $contents = file_get_contents($e['file']);
    if ($contents === false) {
        /* stdout is the generated header: report on stderr and stop instead
         * of emitting an empty table */
        fwrite(STDERR, "html_table_gen: cannot read {$e['file']}\n");
        exit(1);
    }

    /* code point => array(byte, description); when several bytes map to
     * the same code point the last one in the file wins */
    $mappy = array();
    foreach (explode("\n", $contents) as $l) {
        if (!preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
            continue;
        $byte = hexdec($matches[1]);
        if ($byte >= $e['range'][0] && $byte <= $e['range'][1])
            $mappy[hexdec($matches[2])] = array($byte, strtolower(rtrim($matches[3])));
    }
    ksort($mappy);

    echo
"static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    /* closing fold marker; it used to be written as a second opening
     * "{{{", which left the fold opened above unbalanced */
    echo
"/* end of mappings *from* Unicode for {$e['name']} }}} */\n\n";
}
351
/* Set-up of the first entity pass (HTML 5). Everything after the "again"
 * label works on $data (contents of the entity list), $name (used in the
 * emitted comments), $ident (used in the emitted C identifiers) and $pass2
 * (false during this first pass). */
$data = file_get_contents("ents_html5.txt");
if ($data === false) {
    /* stdout is the generated header: report on stderr and stop instead of
     * emitting empty entity tables */
    fwrite(STDERR, "html_table_gen: cannot read ents_html5.txt\n");
    exit(1);
}
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:
357
/* C macros and type definitions shared by all codepoint -> entity tables.
 * Kept as a nowdoc so the text reaches the header exactly as written. */
$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* The default entity may be NULL. Binary search is still possible while
   is senseless as there are just two rows (see also find_entity_for_char()). */
typedef union {
	struct {
		const char *default_entity;
		unsigned size; /* number of remaining entries in the table */
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		const char *entity;
		unsigned second_cp; /* second code point */
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

/* the definitions go into the header once, during the first pass only */
if (!$pass2) {
    print $t;
}
419
/* Parse the entity list in $data. Each accepted line yields
 * array(name, first code point) or array(name, first code point, second
 * code point); code points stay hexadecimal strings. */
$dp = array();
$twoCodePointLine = '/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i';
$oneCodePointLine = '/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i';

foreach (explode("\n", $data) as $l) {
	/* try the longer form first; the capture groups become the row */
	if (preg_match($twoCodePointLine, $l, $matches)
			|| preg_match($oneCodePointLine, $l, $matches)) {
		$dp[] = array_slice($matches, 1);
	}
}

/* keep the file order for the hash tables, sort a copy by first code point */
$origdp = $dp;

usort($dp, function ($lhs, $rhs) {
	return hexdec($lhs[1]) - hexdec($rhs[1]);
});

/* Group the entities whose first code point also starts a two code point
 * entity: second code point => name, plus "default" => name for an entity
 * that consists of the first code point alone. */
$multicp_rows = array();
foreach ($dp as $el) {
	if (count($el) == 3) {
		$multicp_rows[$el[1]] = array();
	}
}

foreach ($dp as $el) {
	if (!array_key_exists($el[1], $multicp_rows)) {
		continue;
	}
	$slot = (count($el) == 3) ? $el[2] : "default";
	$multicp_rows[$el[1]][$slot] = $el[0];
}
450
/* Opening comment of the codepoint -> entity section; only the first two
 * passes produce a multi-stage table. */
$tableKind = ($pass2 < 2) ? "multi-stage table" : "table";
echo "/* {{{ Start of $name $tableKind for codepoint -> entity */\n\n";

/* Double code point tables: one per first code point that starts at least
 * one two code point entity. Nothing is emitted when there are none. */
if (!empty($multicp_rows)) {
	ksort($multicp_rows);
	foreach (array_keys($multicp_rows) as $firstCp) {
		ksort($multicp_rows[$firstCp]);
	}

	echo "/* {{{ Start of double code point tables for $name */\n\n";

	foreach ($multicp_rows as $firstCp => $row) {
		printf("static const entity_multicodepoint_row multi_cp_%s_%05s[] = {\n",
			$ident, $firstCp);

		/* leading row: the entity for the first code point alone (or NULL)
		 * and the number of rows that follow it */
		if (array_key_exists("default", $row)) {
			/* hack to make > translate to &gt; not GT; */
			$defaultEntity = ($row['default'] == 'GT') ? "gt" : $row['default'];
			printf("\t{ {\"%-21s\t%02d,\t\t% 2d} },\n",
				$defaultEntity . '",', count($row) - 1, strlen($defaultEntity));
			unset($row["default"]);
		} else {
			printf("\t{ {%-22s\t%02d,\t\t0} },\n", 'NULL,', count($row));
		}

		/* one row per second code point */
		foreach ($row as $secondCp => $entity) {
			printf("\t{ {\"%-21s\t0x%05s,\t% 2d} },\n",
				$entity . '",', $secondCp, strlen($entity));
		}
		echo "};\n";
	}
	echo "\n/* End of double code point tables }}} */\n\n";
}
489
if ($pass2 < 2) {
    echo "/* {{{ Stage 3 Tables for $name */\n\n";
}

/* Shared all-empty stage 3 table: 64 "no entity" rows, written as 16 lines
 * of four rows each. */
$emptyStage3Line = "\t" . implode(" ", array_fill(0, 4, "{0, { {NULL, 0} } },")) . "\n";
$t = "static const entity_stage3_row empty_stage3_table[] = {\n"
    . "\t/* 64 elements */\n"
    . str_repeat($emptyStage3Line, 16)
    . "};\n";

/* defined once, during the first pass only */
if (!$pass2) {
    echo $t;
}
518
/* Spread the entities over the three stages of their first code point:
 * $mstable[stage 1][stage 2][stage 3] is the entity name, or "" when the
 * code point has a double code point table. Entities are visited in the
 * order of $dp, so a later entity for the same code point replaces an
 * earlier one. */
$mstable = array();
foreach ($dp as $el) {
	$codePoint = hexdec($el[1]);
	$mstable[($codePoint & 0xFFF000) >> 12][($codePoint & 0xFC0) >> 6][$codePoint & 0x3F] =
		array_key_exists($el[1], $multicp_rows) ? "" : $el[0];
}

/* One stage 3 table (64 rows, four per output line) for every block of 64
 * code points that contains at least one entity. */
for ($i = 0; $i < 0x1E; $i++) {
	for ($k = 0; $k < 64; $k++) {
		if (!isset($mstable[$i][$k])) {
			continue;
		}

		printf("static const entity_stage3_row stage3_table_%s_%02X%03X[] = {\n",
			$ident, $i, $k << 6);

		for ($y = 0; $y < 64; $y++) {
			echo ($y == 0) ? "\t" : (($y % 4 == 0) ? "\n\t" : " ");

			$z = isset($mstable[$i][$k][$y]) ? $mstable[$i][$k][$y] : null;
			if ($z === null) {
				/* no entity for this code point */
				echo "{0, { {NULL, 0} } },";
			} elseif ($z === "QUOT") {
				/* hack to translate " into &quot;, not &QUOT; */
				echo "{0, { {\"quot\", 4} } },";
			} elseif ($z !== "") {
				echo "{0, { {\"$z\", ", strlen($z), "} } },";
			} else {
				/* ambiguous: refer to the double code point table */
				printf("{1, { {(void *)multi_cp_%s_%05X, 0} } },",
					$ident, ($i << 12) | ($k << 6) | $y);
			}
		}
		echo "\n};\n\n";
	}
}

if ($pass2 < 2) {
	echo "/* end of stage 3 Tables for $name }}} */\n\n";
}
568
/* the later passes have no stage 2 / stage 1 tables; continue with the
 * hash tables */
if ($pass2 > 1) {
    goto hashtables;
}

echo "/* {{{ Stage 2 Tables for $name */\n\n";

/* Shared stage 2 table whose 64 entries all point at the empty stage 3
 * table, written as 16 lines of four entries each. */
$emptyStage2Line = "\t" . implode(" ", array_fill(0, 4, "empty_stage3_table,")) . "\n";
$t = "static const entity_stage2_row empty_stage2_table[] = {\n"
    . str_repeat($emptyStage2Line, 16)
    . "};\n";

/* defined once, during the first pass only */
if (!$pass2) {
    echo $t;
}

/* One stage 2 table for every stage 1 index that has entities; each of its
 * 64 entries is either a stage 3 table of this pass or the empty table. */
for ($i = 0; $i < 0x1E; $i++) {
	if (!isset($mstable[$i])) {
		continue;
	}

	printf("static const entity_stage2_row stage2_table_%s_%02X000[] = {\n", $ident, $i);
	for ($k = 0; $k < 64; $k++) {
		echo ($k == 0) ? "\t" : (($k % 4 == 0) ? "\n\t" : " ");
		if (isset($mstable[$i][$k])) {
			printf("stage3_table_%s_%05X,", $ident, ($i << 12) | ($k << 6));
		} else {
			echo "empty_stage3_table,";
		}
	}
	echo "\n};\n\n";
}

echo "/* end of stage 2 tables for $name }}} */\n\n";

/* Stage 1 table: 0x1E entries, the empty stage 2 table where there are no
 * entities. */
echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($i = 0; $i < 0x1E; $i++) {
	echo isset($mstable[$i])
		? sprintf("\tstage2_table_%s_%02X000,\n", $ident, $i)
		: "\tempty_stage2_table,\n";
}
echo "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
637
638/* commented-out; this enabled binary search, which turned out to be
639 * significantly slower than the hash tables for html 5 entities */
640//echo
641//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
642
643//$t = <<<CODE
644//typedef struct {
645//	const char *entity;
646//	unsigned short entity_len;
647//	unsigned int codepoint1;
648//	unsigned int codepoint2;
649//} entity_cp_map;
650//
651//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
652//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
653//
654//static const entity_cp_map html5_ent_cp_map[] = {
655//
656//CODE;
657//echo $t;
658//
659//$dp = $origdp;
660//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
661//	return $d==0?strcmp($a[0], $b[0]):$d; });
662//
663//$k = 0;
664//foreach ($dp as $o) {
665//	if ($k == 0) echo "\t";
666//	elseif ($k % 3 == 0) echo "\n\t";
667//	else echo " ";
668//	if (isset($o[2]))
669//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
670//			hexdec($o[1]), hexdec($o[2]));
671//	else
672//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
673//			hexdec($o[1]));
674//
675//	if (isset($o[2])) {
676//		$entlen = strlen($o[0]) + 2;
677//		$utf8len = strlen(
678//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
679//		if ($utf8len > $entlen*1.2) {
680//			die("violated assumption for traverse_for_entities");
681//		}
682//	}
683//
684//	$k++;
685//}
686//echo "\n};\n\n";
687//
688//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
689//
690//echo
691//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";
692
hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */\n\n";

/* C types of the entity -> codepoint hash tables and the shared bucket that
 * holds nothing but the terminating row. */
$t = <<<'CODE'
typedef struct {
	const char *entity;
	unsigned short entity_len;
	unsigned int codepoint1;
	unsigned int codepoint2;
} entity_cp_map;

typedef const entity_cp_map *entity_ht_bucket;

typedef struct {
	unsigned num_elems; /* power of 2 */
	const entity_ht_bucket *buckets; /* .num_elems elements */
} entity_ht;

static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };

CODE;

/* defined once, during the first pass only */
if (!$pass2) {
    print $t;
}
719
/* Hash of $str used to distribute the entity names over the buckets: starts
 * at 5381, multiplies by 33 (shift left by 5, plus itself) and adds the byte
 * value for every byte, keeping the low 32 bits after each step.
 * Returns 5381 for the empty string. */
function hashfun($str)
{
	$hash = 5381;

	for ($pos = 0, $len = strlen($str); $pos < $len; $pos++) {
		$times32 = (int)($hash << 5);
		$times33 = (int)($times32 + $hash);
		$hash = (int)($times33 + ord($str[$pos])) & 0xFFFFFFFF;
	}

	return $hash;
}
734
/* Number of bytes of the UTF-8 encoding of a code point. Computed directly
 * so that the check below needs neither mbstring nor its "HTML-ENTITIES"
 * pseudo-encoding, which is deprecated as of PHP 8.2. */
$utf8SeqLen = function ($codePoint) {
	if ($codePoint < 0x80) return 1;
	if ($codePoint < 0x800) return 2;
	if ($codePoint < 0x10000) return 3;
	return 4;
};

/* Bucket count: the smallest power of two that is at least 1.5 times the
 * number of entities, and never below 16. An integer, so the mask below is
 * one as well. */
$numelems = (int) max(pow(2, ceil(log(1.5*count($origdp))/log(2))), 16);
$mask = $numelems - 1;

/* bucket index => entities (in file order) whose name hashes to it */
$hashes = array();
foreach ($origdp as $e) {
	$hashes[hashfun($e[0]) & $mask][] = $e;
	if (isset($e[2])) {
		/* A two code point entity must not decode to much more text than
		 * "&name;" itself occupies. */
		$entlen = strlen($e[0]) + 2;
		$utf8len = $utf8SeqLen(hexdec($e[1])) + $utf8SeqLen(hexdec($e[2]));
		if ($utf8len > $entlen*1.2) {
			/* stdout is the generated header: report on stderr and fail */
			fwrite(STDERR, "violated assumption for traverse_for_entities\n");
			exit(1);
		}
	}
}
749
/* Emit the entity -> codepoint hash table of this pass: one array per
 * non-empty bucket (each closed by a {NULL, 0, 0, 0} row), then the array of
 * bucket pointers (four per line), then the entity_ht that ties them
 * together. Both texts are collected in one walk over the buckets and
 * written out in that order. */
$bucketDefs = "";
$bucketIndex = "";
for ($i = 0; $i < $numelems; $i++) {
	$bucketIndex .= ($i == 0) ? "\t" : (($i % 4 == 0) ? "\n\t" : " ");

	if (empty($hashes[$i])) {
		$bucketIndex .= "ht_bucket_empty,";
		continue;
	}

	$bucketName = sprintf("ht_bucket_%s_%03X", $ident, $i);
	$bucketIndex .= "$bucketName,";

	$bucketDefs .= "static const entity_cp_map {$bucketName}[] = {";
	foreach ($hashes[$i] as $h) {
		/* entities with a single code point get a plain 0 as second one */
		$secondCp = isset($h[2]) ? sprintf("0x%05X", hexdec($h[2])) : "0";
		$bucketDefs .= sprintf(' {"%s", %d, 0x%05X, %s},',
			$h[0], strlen($h[0]), hexdec($h[1]), $secondCp);
	}
	$bucketDefs .= " {NULL, 0, 0, 0} };\n";
}

echo $bucketDefs, "\n";

echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n",
	$bucketIndex, "\n};\n\n";

printf("static const entity_ht ent_ht_%s = {\n\t0x%X,\n\tht_buckets_%s\n};\n\n",
	$ident, $numelems, $ident);

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";
789
/* After the pass numbered by the key (0 stands for the first pass, during
 * which $pass2 is false) continue with:
 * array(new $pass2, entity list file, $name, $ident). After the last pass
 * there is no entry and the header is closed. */
$nextPasses = array(
    0 => array(1, "ents_html401.txt", "HTML 4.01", "html4"),
    1 => array(2, "ents_basic.txt", "Basic entities (no apos)", "be_noapos"),
    2 => array(3, "ents_basic_apos.txt", "Basic entities (with apos)", "be_apos"),
);

$finishedPass = (int) $pass2;
if (isset($nextPasses[$finishedPass])) {
    list($pass2, $nextFile, $name, $ident) = $nextPasses[$finishedPass];

    $data = file_get_contents($nextFile);
    if ($data === false) {
        /* stdout is the generated header: report on stderr and stop instead
         * of emitting empty entity tables for this pass */
        fwrite(STDERR, "html_table_gen: cannot read $nextFile\n");
        exit(1);
    }
    goto again;
}

echo "#endif /* HTML_TABLES_H */\n";
811