1#!/usr/bin/env php
2<?php
3/*
4   +----------------------------------------------------------------------+
5   | PHP Version 7                                                        |
6   +----------------------------------------------------------------------+
7   | Copyright (c) The PHP Group                                          |
8   +----------------------------------------------------------------------+
9   | This source file is subject to version 3.01 of the PHP license,      |
10   | that is bundled with this package in the file LICENSE, and is        |
11   | available through the world-wide-web at the following url:           |
12   | http://www.php.net/license/3_01.txt                                  |
13   | If you did not receive a copy of the PHP license and are unable to   |
14   | obtain it through the world-wide-web, please send a note to          |
15   | license@php.net so we can mail you a copy immediately.               |
16   +----------------------------------------------------------------------+
17   | Authors: Gustavo Lopes  <cataphract@php.net>                         |
18   +----------------------------------------------------------------------+
19*/
20
21/* This file prints to stdout the contents of ext/standard/html_tables.h */
22/* put together with glue; have patience */
23
/* Static prologue of the generated header: licence banner, include guard,
 * "generated file" notice, the entity_charset enum with its classification
 * macros, the charset-name lookup table, and the types/macros used by the
 * two-stage single-byte -> Unicode tables emitted further below.
 *
 * The text is emitted verbatim. ENT_ENC_TO_UNI_STAGE1 parenthesises its
 * argument (as ENT_ENC_TO_UNI_STAGE2 already does) so that an expression
 * argument such as `a | b` cannot regroup against the `& 0xC0`. */
$t = <<<CODE
/*
   +----------------------------------------------------------------------+
   | PHP Version 7                                                        |
   +----------------------------------------------------------------------+
   | Copyright (c) The PHP Group                                          |
   +----------------------------------------------------------------------+
   | This source file is subject to version 3.01 of the PHP license,      |
   | that is bundled with this package in the file LICENSE, and is        |
   | available through the world-wide-web at the following url:           |
   | http://www.php.net/license/3_01.txt                                  |
   | If you did not receive a copy of the PHP license and are unable to   |
   | obtain it through the world-wide-web, please send a note to          |
   | license@php.net so we can mail you a copy immediately.               |
   +----------------------------------------------------------------------+
*/

#ifndef HTML_TABLES_H
#define HTML_TABLES_H

/**************************************************************************
***************************************************************************
**        THIS FILE IS AUTOMATICALLY GENERATED. DO NOT MODIFY IT.        **
***************************************************************************
** Please change html_tables/html_table_gen.php instead and then         **
** run it in order to generate this file                                 **
***************************************************************************
**************************************************************************/

enum entity_charset { cs_utf_8, cs_8859_1, cs_cp1252, cs_8859_15, cs_cp1251,
					  cs_8859_5, cs_cp866, cs_macroman, cs_koi8r, cs_big5,
					  cs_gb2312, cs_big5hkscs, cs_sjis, cs_eucjp,
					  cs_numelems /* used to count the number of charsets */
					};
#define CHARSET_UNICODE_COMPAT(cs)	((cs) <= cs_8859_1)
#define CHARSET_SINGLE_BYTE(cs)		((cs) > cs_utf_8 && (cs) < cs_big5)
#define CHARSET_PARTIAL_SUPPORT(cs)	((cs) >= cs_big5)

static const struct {
	const char *codeset;
	uint32_t codeset_len;
	enum entity_charset charset;
} charset_map[] = {
	{ "ISO-8859-1",		sizeof("ISO-8859-1")-1,		cs_8859_1 },
	{ "ISO8859-1",		sizeof("ISO8859-1")-1,		cs_8859_1 },
	{ "ISO-8859-15",	sizeof("ISO-8859-15")-1,	cs_8859_15 },
	{ "ISO8859-15",		sizeof("ISO8859-15")-1,		cs_8859_15 },
	{ "utf-8",			sizeof("utf-8")-1,			cs_utf_8 },
	{ "cp1252", 		sizeof("cp1252")-1, 		cs_cp1252 },
	{ "Windows-1252",	sizeof("Windows-1252")-1,	cs_cp1252 },
	{ "1252",			sizeof("1252")-1,			cs_cp1252 },
	{ "BIG5",			sizeof("BIG5")-1,			cs_big5 },
	{ "950",			sizeof("950")-1,			cs_big5 },
	{ "GB2312",			sizeof("GB2312")-1,			cs_gb2312 },
	{ "936",			sizeof("936")-1,			cs_gb2312 },
	{ "BIG5-HKSCS",		sizeof("BIG5-HKSCS")-1,		cs_big5hkscs },
	{ "Shift_JIS",		sizeof("Shift_JIS")-1,		cs_sjis },
	{ "SJIS",			sizeof("SJIS")-1,			cs_sjis },
	{ "932",			sizeof("932")-1,			cs_sjis },
	{ "SJIS-win",		sizeof("SJIS-win")-1,		cs_sjis },
	{ "CP932",			sizeof("CP932")-1,			cs_sjis },
	{ "EUCJP",			sizeof("EUCJP")-1,			cs_eucjp },
	{ "EUC-JP",			sizeof("EUC-JP")-1,			cs_eucjp },
	{ "eucJP-win",		sizeof("eucJP-win")-1,		cs_eucjp },
	{ "KOI8-R",			sizeof("KOI8-R")-1,			cs_koi8r },
	{ "koi8-ru",		sizeof("koi8-ru")-1,		cs_koi8r },
	{ "koi8r",			sizeof("koi8r")-1,			cs_koi8r },
	{ "cp1251",			sizeof("cp1251")-1,			cs_cp1251 },
	{ "Windows-1251",	sizeof("Windows-1251")-1,	cs_cp1251 },
	{ "win-1251",		sizeof("win-1251")-1,		cs_cp1251 },
	{ "iso8859-5",		sizeof("iso8859-5")-1,		cs_8859_5 },
	{ "iso-8859-5",		sizeof("iso-8859-5")-1,		cs_8859_5 },
	{ "cp866",			sizeof("cp866")-1,			cs_cp866 },
	{ "866",			sizeof("866")-1,			cs_cp866 },
	{ "ibm866",			sizeof("ibm866")-1,			cs_cp866 },
	{ "MacRoman",		sizeof("MacRoman")-1,		cs_macroman }
};

/* longest entity name length excluding & and ; */
#define LONGEST_ENTITY_LENGTH 31

/* Definitions for mappings *to* Unicode.
 * The origin charset must have at most 256 code points.
 * The multi-byte encodings are not supported */
typedef struct {
    unsigned short uni_cp[64];
} enc_to_uni_stage2;

typedef struct {
    const enc_to_uni_stage2 *inner[4];
} enc_to_uni;

/* bits 7-8 bits (only single bytes encodings supported )*/
#define ENT_ENC_TO_UNI_STAGE1(k) (((k) & 0xC0) >> 6)
/* bits 1-6 */
#define ENT_ENC_TO_UNI_STAGE2(k) ((k) & 0x3F)


CODE;

echo $t;
125
/* Single-byte encodings for which "to Unicode" tables are generated.
 *
 *   ident     - suffix used in the generated C identifiers
 *   enumid    - slot in enc_to_uni_index[]; the values line up with the
 *               position of the charset in the entity_charset enum emitted
 *               in the header prologue
 *   enumident - enum constant name without the "cs_" prefix; informational
 *               only, nothing in this script reads it
 *   name      - human-readable name used in generated comments
 *   file      - mapping table to parse, relative to the working directory
 */
$encodings = array(
    array(
        "ident" => "iso88591",
        "enumid" => 1,
        "name" => "ISO-8859-1",
        "file" => "mappings/8859-1.TXT",
    ),
    array(
        "ident" => "iso88595",
        "enumid" => 5,
        "name" => "ISO-8859-5",
        "file" => "mappings/8859-5.TXT",
    ),
    array(
        "ident" => "iso885915",
        "enumid" => 3,
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
    ),
    array(
        "ident" => "win1252",
        "enumid" => 2,
        "enumident" => "cp1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
    ),
    array(
        "ident" => "win1251",
        "enumid" => 4,
        /* was "cp1252": copy-paste slip from the entry above */
        "enumident" => "cp1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
    ),
    array(
        "ident" => "koi8r",
        "enumid" => 8,
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
    ),
    array(
        "ident" => "cp866",
        "enumid" => 6,
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
    ),
    array(
        "ident" => "macroman",
        "enumid" => 7,
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
    ),
);
178
/* Emit, for every configured encoding, the two-stage byte -> Unicode tables.
 *
 * $prevStage2 remembers every 64-entry stage 2 block already written and
 * $prevStage2Names (same indexes) the C identifier it was written under, so
 * that an encoding whose block is identical to an earlier one references the
 * existing table instead of emitting a duplicate. */
$prevStage2 = array();
$prevStage2Names = array();

foreach ($encodings as $e) {
    echo "/* {{{ Mappings *to* Unicode for {$e['name']} */\n\n";

    /* process file; a failed read must not silently produce empty tables
     * (and a PHP warning inside the generated header) */
    $contents = file_get_contents($e['file']);
    if ($contents === false) {
        fwrite(STDERR, "html_table_gen: cannot read mapping file {$e['file']}\n");
        exit(1);
    }

    /* byte value => Unicode code point */
    $mappy = array();
    foreach (explode("\n", $contents) as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})/i", $l, $matches))
            $mappy[hexdec($matches[1])] = hexdec($matches[2]);
    }

    $mstable = array("ident" => $e['ident']);
    /* calculate two-stage tables: 4 blocks (top two bits of the byte) of
     * 64 entries (low six bits); NULL marks "no mapping" */
    for ($i = 0; $i < 4; $i++) {
        for ($j = 0; $j < 64; $j++) {
            $cp = $i << 6 | $j;
            $mstable[$i][$j] = isset($mappy[$cp]) ? $mappy[$cp] : NULL;
        }
    }

    echo "/* {{{ Stage 2 tables for {$e['name']} */\n\n";

    /* C identifier of the stage 2 table used for each of the 4 blocks */
    $s2tables_names = array();
    for ($i = 0; $i < 4; $i++) {
        /* strict search: a loose one would treat NULL (no mapping) and
         * 0 (mapping to U+0000) as the same entry */
        $seen = array_search($mstable[$i], $prevStage2, true);
        if ($seen !== false) {
            $s2tables_names[$i] = $prevStage2Names[$seen];
            continue;
        }

        $tableName = "enc_to_uni_s2_{$e['ident']}_" . sprintf("%02X", $i << 6);
        $s2tables_names[$i] = $tableName;

        echo "static const enc_to_uni_stage2 {$tableName} = { {\n";
        for ($j = 0; $j < 64; $j++) {
            if ($j == 0) echo "\t";
            elseif ($j % 6 == 0) echo "\n\t";
            else echo " ";
            if ($mstable[$i][$j] !== NULL)
                echo sprintf("0x%04X,", $mstable[$i][$j]);
            else
                echo "0xFFFF,"; /* special value; indicates no mapping */
        }
        echo "\n} };\n\n";

        $prevStage2[] = $mstable[$i];
        $prevStage2Names[] = $tableName;
    }

    echo "/* end of stage 2 tables for {$e['name']} }}} */\n\n";

    echo "/* {{{ Stage 1 table for {$e['name']} */\n";

    echo
"static const enc_to_uni enc_to_uni_{$e['ident']} = { {
\t&{$s2tables_names[0]},
\t&{$s2tables_names[1]},
\t&{$s2tables_names[2]},
\t&{$s2tables_names[3]} }
};
";

    echo "/* end of stage 1 table for {$e['name']} }}} */\n\n";
}
251
/* Emit enc_to_uni_index[]: one slot per enum id from 0 up to the highest
 * "enumid" configured above. A slot points at the encoding's stage 1 table,
 * or is NULL when no encoding claims that id. */
$identByEnumId = array();
foreach ($encodings as $enc) {
    $identByEnumId[$enc['enumid']] = $enc['ident'];
}
$highestEnumId = max(array_keys($identByEnumId));

echo "/* {{{ Index of tables for encoding conversion */\n",
     "static const enc_to_uni *const enc_to_uni_index[cs_numelems] = {\n";

for ($slot = 0; $slot <= $highestEnumId; $slot++) {
    /* a numeric ident is indistinguishable from an unclaimed slot */
    $claimed = isset($identByEnumId[$slot]) && !is_numeric($identByEnumId[$slot]);
    if ($claimed) {
        echo "\t&enc_to_uni_" . $identByEnumId[$slot] . ",\n";
    } else {
        echo "\tNULL,\n";
    }
}

echo "};\n", "/* }}} */\n";
270
/* Type used by the "from Unicode" tables: one (code point, byte) pair.
 * The text starts with a blank line and ends with two newlines. */
$t = implode("\n", array(
    "",
    "/* Definitions for mappings *from* Unicode */",
    "",
    "typedef struct {",
    "\tunsigned short un_code_point; /* we don't need bigger */",
    "\tunsigned char cs_code; /* currently, we only have maps to single-byte encodings */",
    "} uni_to_enc;",
    "",
    "",
));

echo $t;
284
/* Encodings for which "from Unicode" tables are generated.
 *
 *   ident - suffix of the generated unimap_<ident>[] array
 *   name  - human-readable name used in generated comments
 *   file  - mapping table to parse, relative to the working directory
 *   range - inclusive [low, high] range of byte values included in the table
 */
$encodings = array(
    array(
        "ident" => "iso885915",
        "name" => "ISO-8859-15",
        "file" => "mappings/8859-15.TXT",
        "range" => array(0xA4, 0xBE),
    ),
    array(
        "ident" => "win1252",
        "name" => "Windows-1252",
        "file" => "mappings/CP1252.TXT",
        "range" => array(0x80, 0x9F),
    ),
    array(
        "ident" => "win1251",
        "name" => "Windows-1251",
        "file" => "mappings/CP1251.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "koi8r",
        "name" => "KOI8-R",
        "file" => "mappings/KOI8-R.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "cp866",
        "name" => "CP-866",
        "file" => "mappings/CP866.TXT",
        "range" => array(0x80, 0xFF),
    ),
    array(
        "ident" => "macroman",
        "name" => "MacRoman",
        "file" => "mappings/ROMAN.TXT",
        "range" => array(0x80, 0xFF),
    ),
);

/* For each encoding emit unimap_<ident>[], sorted by Unicode code point. */
foreach ($encodings as $e) {
    echo "/* {{{ Mappings *from* Unicode for {$e['name']} */\n";

    /* process file; abort rather than emit an empty table plus a PHP warning */
    $contents = file_get_contents($e['file']);
    if ($contents === false) {
        fwrite(STDERR, "html_table_gen: cannot read mapping file {$e['file']}\n");
        exit(1);
    }

    /* each entry: byte (hex), code point (hex), trailing "# ..." description */
    $map = array();
    foreach (explode("\n", $contents) as $l) {
        if (preg_match("/^0x([0-9A-Z]{2})\t0x([0-9A-Z]{2,})\s+#\s*(.*)$/i", $l, $matches))
            $map[] = array($matches[1], $matches[2], rtrim($matches[3]));
    }

    /* code point => (byte, lower-cased description), restricted to the
     * configured byte range */
    $mappy = array();
    foreach ($map as $v) {
        $byte = hexdec($v[0]);
        if ($byte >= $e['range'][0] && $byte <= $e['range'][1])
            $mappy[hexdec($v[1])] = array($byte, strtolower($v[2]));
    }
    ksort($mappy);

    echo "static const uni_to_enc unimap_{$e['ident']}[] = {\n";

    foreach ($mappy as $k => $v) {
        echo "\t{ ", sprintf("0x%04X", $k), ", ", sprintf("0x%02X", $v[0]), " },\t/* ",
            $v[1], " */\n";
    }
    echo "};\n";

    /* closing fold marker (previously emitted a second opening "{{{") */
    echo "/* end of mappings *from* Unicode for {$e['name']} }}} */\n\n";
}
355
/* Entity-table generation runs as a sequence of passes over different entity
 * lists; the code from "again:" onwards is re-entered via goto for each one.
 * $pass2 is false on the first (HTML5) pass and 1, 2, 3 on the later ones.
 * $name is used in generated comments, $ident in generated C identifiers. */
$data = file_get_contents("ents_html5.txt");
if ($data === false) {
    /* otherwise empty tables (and a PHP warning) would end up in the header */
    fwrite(STDERR, "html_table_gen: cannot read ents_html5.txt\n");
    exit(1);
}
$pass2 = false;
$name = "HTML5";
$ident = "html5";
again:

/* Shared C type definitions for the codepoint -> entity tables; emitted only
 * on the first pass (nowdoc: no interpolation takes place). */
$t = <<<'CODE'
/* HTML 5 has many more named entities.
 * Some of them map to two unicode code points, not one.
 * We're going to use a three-stage table (with an extra one for the entities
 * with two code points). */

#define ENT_STAGE1_INDEX(k) (((k) & 0xFFF000) >> 12) /* > 1D, we have no mapping */
#define ENT_STAGE2_INDEX(k) (((k) & 0xFC0) >> 6)
#define ENT_STAGE3_INDEX(k) ((k) & 0x3F)
#define ENT_CODE_POINT_FROM_STAGES(i,j,k) (((i) << 12) | ((j) << 6) | (k))

/* The default entity may be NULL. Binary search is still possible while
   is senseless as there are just two rows (see also find_entity_for_char()). */
typedef union {
	struct {
		const char *default_entity;
		unsigned size; /* number of remaining entries in the table */
		unsigned short default_entity_len;
	} leading_entry;
	struct {
		const char *entity;
		unsigned second_cp; /* second code point */
		unsigned short entity_len;
	} normal_entry;
} entity_multicodepoint_row;

/* blocks of these should start at code points k where k % 0xFC0 == 0 */
typedef struct {
	char ambiguous; /* if 0 look into entity */
	union {
		struct {
			const char *entity; /* may be NULL */
			unsigned short entity_len;
		} ent;
		const entity_multicodepoint_row *multicodepoint_table;
	} data;
} entity_stage3_row;

/* Calculate k & 0x3F Use as offset */
typedef const entity_stage3_row *entity_stage2_row; /* 64 elements */

/* Calculate k & 0xFC0 >> 6. Use as offset */
typedef const entity_stage3_row *const *entity_stage1_row; /* 64 elements */

/* For stage 1, Calculate k & 0xFFF000 >> 3*4.
 * If larger than 1D, we have no mapping. Otherwise lookup that index */

typedef struct {
	const entity_stage1_row *ms_table;
	/* for tables with only basic entities, this member is to be accessed
	 * directly for better performance: */
	const entity_stage3_row *table;
} entity_table_opt;

/* Replaced "GT" > "gt" and "QUOT" > "quot" for consistency's sake. */


CODE;

if (!$pass2)
    echo $t;
423
/* Parse the current entity list ($data).
 *
 * Each accepted line yields array(entity, codepoint-hex) or, for entities
 * that expand to two code points, array(entity, first-hex, second-hex).
 * $origdp keeps file order; $dp is sorted by (first) code point.
 * $multicp_rows maps every first code point that has at least one two-code-
 * point entity to array(second-hex => entity, ..., "default" => entity),
 * where "default" is the single-code-point entity for it, if any. */
$dp = array();

foreach (explode("\n", $data) as $line) {
	$m = array();
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+) ([a-f0-9]+)/i', $line, $m)) {
		$dp[] = array($m[1], $m[2], $m[3]);
		continue;
	}
	if (preg_match('/^(#?[a-z0-9]+)\s+([a-f0-9]+)/i', $line, $m)) {
		$dp[] = array($m[1], $m[2]);
	}
}

$origdp = $dp;

usort($dp, function($lhs, $rhs) {
	return hexdec($lhs[1]) - hexdec($rhs[1]);
});

/* first pass: find the code points that need a multi-code-point row ... */
$multicp_rows = array();
foreach ($dp as $entry) {
	if (count($entry) != 3) {
		continue;
	}
	$multicp_rows[$entry[1]] = array();
}

/* ... second pass: fill those rows, including their single-code-point default */
foreach ($dp as $entry) {
	$first = $entry[1];
	if (!key_exists($first, $multicp_rows)) {
		continue;
	}
	$slot = (count($entry) == 3) ? $entry[2] : "default";
	$multicp_rows[$first][$slot] = $entry[0];
}
454
/* Opening fold marker for this pass; the first two passes are labelled
 * "multi-stage table", the later ones just "table". */
$tableKind = ($pass2 < 2) ? "multi-stage table" : "table";
echo "/* {{{ Start of $name $tableKind for codepoint -> entity */", "\n\n";

/* Emit one multi_cp_<ident>_<cp>[] array per first code point that has
 * two-code-point entities. Nothing is written when there are none. */
if (!empty($multicp_rows)) {
	ksort($multicp_rows);
	foreach (array_keys($multicp_rows) as $firstCp) {
		ksort($multicp_rows[$firstCp]);
	}

	echo "/* {{{ Start of double code point tables for $name */", "\n\n";

	foreach ($multicp_rows as $firstCp => $row) {
		$hasDefault = key_exists("default", $row);
		$default = $hasDefault ? $row["default"] : null;
		unset($row["default"]);
		/* number of (second code point => entity) rows after the leading one */
		$remaining = count($row);

		echo "static const entity_multicodepoint_row multi_cp_{$ident}_"
			. sprintf("%05s", $firstCp) . "[] = {" . "\n";

		/* leading entry: default entity (or NULL), row count, entity length */
		if ($hasDefault) {
			if ($default == 'GT') { /* hack to make > translate to &gt; not GT; */
				$default = "gt";
			}
			echo "\t{ {" . sprintf("\"%-21s", $default . '",')
				. "\t" . sprintf("%02d", $remaining) . ",\t\t"
				. sprintf("% 2d", strlen($default)) . '} },' . "\n";
		} else {
			echo "\t{ {" . sprintf("%-22s", 'NULL,')
				. "\t" . sprintf("%02d", $remaining) . ",\t\t0} },\n";
		}

		/* normal entries: entity, second code point, entity length */
		foreach ($row as $secondCp => $entity) {
			echo "\t{ {" . sprintf("\"%-21s", $entity . '",')
				. "\t" . sprintf("0x%05s", $secondCp) . ",\t"
				. sprintf("% 2d", strlen($entity)) . '} },' . "\n";
		}
		echo "};\n";
	}
	echo "\n/* End of double code point tables }}} */", "\n\n";
}
493
if ($pass2 < 2) {
    echo "/* {{{ Stage 3 Tables for $name */", "\n\n";
}

/* empty_stage3_table: 64 "no entity" rows, laid out as 16 lines of 4 cells.
 * Shared by every pass, so it is only written on the first one. */
$emptyCell = "{0, { {NULL, 0} } },";
$emptyLine = "\t" . implode(" ", array_fill(0, 4, $emptyCell)) . "\n";

$t = "static const entity_stage3_row empty_stage3_table[] = {\n"
   . "\t/* 64 elements */\n"
   . str_repeat($emptyLine, 16)
   . "};\n";

if (!$pass2) {
    echo $t;
}
522
/* Bucket every entity by the three index stages of its code point:
 * bits 12 and up, bits 6-11, bits 0-5. Code points that have a
 * multi-code-point row are stored as "" so that the stage 3 cell points at
 * that row instead of naming a single entity. */
$mstable = array();
foreach ($dp as $entry) {
	$cp = hexdec($entry[1]);
	$value = key_exists($entry[1], $multicp_rows) ? "" : $entry[0];
	$mstable[($cp & 0xFFF000) >> 12][($cp & 0xFC0) >> 6][$cp & 0x3F] = $value;
}

/* One 64-cell stage 3 table per populated (stage 1, stage 2) pair,
 * four cells per output line. */
for ($s1 = 0; $s1 < 0x1E; $s1++) {
	for ($s2 = 0; $s2 < 64; $s2++) {
		if (!isset($mstable[$s1][$s2])) {
			continue; /* nothing in this block of 64 code points */
		}

		echo "static const entity_stage3_row stage3_table_{$ident}_",
			sprintf("%02X%03X", $s1, $s2 << 6), "[] = {\n";

		for ($s3 = 0; $s3 < 64; $s3++) {
			if ($s3 == 0) {
				echo "\t";
			} else {
				echo ($s3 % 4 == 0) ? "\n\t" : " ";
			}

			$entity = isset($mstable[$s1][$s2][$s3]) ? $mstable[$s1][$s2][$s3] : null;

			if ($entity === "") {
				/* ambiguous: look the second code point up in the multi_cp row */
				echo "{1, { {(void *)", sprintf("multi_cp_{$ident}_%05X",
					($s1 << 12) | ($s2 << 6) | $s3), ", 0} } },";
			} elseif ($entity === null) {
				echo "{0, { {NULL, 0} } },";
			} elseif ($entity === "QUOT") {
				/* hack to translate " into &quot;, not &QUOT; */
				echo "{0, { {\"quot\", 4} } },";
			} else {
				echo '{0, { {"' . $entity . '", ', strlen($entity), "} } },";
			}
		}
		echo "\n};\n\n";
	}
}

if ($pass2 < 2) {
	echo "/* end of stage 3 Tables for $name }}} */", "\n\n";
}
572
/* The basic-entity passes have no stage 1/2 tables; jump to the hash table. */
if ($pass2 > 1) {
    goto hashtables;
}

echo "/* {{{ Stage 2 Tables for $name */", "\n\n";

/* empty_stage2_table: 64 pointers to empty_stage3_table, 16 lines of 4.
 * Shared by every pass, so it is only written on the first one. */
$emptyLine = "\t" . implode(" ", array_fill(0, 4, "empty_stage3_table,")) . "\n";

$t = "static const entity_stage2_row empty_stage2_table[] = {\n"
   . str_repeat($emptyLine, 16)
   . "};\n";

if (!$pass2) {
    echo $t;
}
603
/* One stage 2 table (64 pointers, four per line) for every stage 1 index
 * that has at least one populated stage 3 table. */
for ($s1 = 0; $s1 < 0x1E; $s1++) {
	if (empty($mstable[$s1])) {
		continue;
	}

	$cells = array();
	for ($s2 = 0; $s2 < 64; $s2++) {
		if (isset($mstable[$s1][$s2])) {
			$cells[] = sprintf("stage3_table_{$ident}_%05X", ($s1 << 12) | ($s2 << 6)) . ",";
		} else {
			$cells[] = "empty_stage3_table" . ",";
		}
	}

	$lines = array();
	foreach (array_chunk($cells, 4) as $group) {
		$lines[] = "\t" . implode(" ", $group);
	}

	echo "static const entity_stage2_row stage2_table_{$ident}_",
		sprintf("%02X000", $s1), "[] = {\n",
		implode("\n", $lines),
		"\n};\n\n";
}

echo "/* end of stage 2 tables for $name }}} */", "\n\n";

/* Stage 1: one pointer per index 0x00..0x1D. */
echo "static const entity_stage1_row entity_ms_table_{$ident}[] = {\n";
for ($s1 = 0; $s1 < 0x1E; $s1++) {
	$target = isset($mstable[$s1])
		? sprintf("stage2_table_{$ident}_%02X000", $s1)
		: "empty_stage2_table";
	echo "\t", $target, ",\n";
}
echo "};\n\n";

echo "/* end of $name multi-stage table for codepoint -> entity }}} */\n\n";
641
642/* commented-out; this enabled binary search, which turned out to be
643 * significantly slower than the hash tables for html 5 entities */
644//echo
645//"/* {{{ HTML 5 tables for entity -> codepoint */", "\n\n";
646
647//$t = <<<CODE
648//typedef struct {
649//	const char *entity;
650//	unsigned short entity_len;
651//	unsigned int codepoint1;
652//	unsigned int codepoint2;
653//} entity_cp_map;
654//
655//#define ENTITY_CP_MAP_CMP(l, lsize, r, rsize) \
656//	( ((lsize)==(rsize)) ? (memcmp((l), (r), (lsize))) : ((lsize)-(rsize)) )
657//
658//static const entity_cp_map html5_ent_cp_map[] = {
659//
660//CODE;
661//echo $t;
662//
663//$dp = $origdp;
664//usort($dp, function($a, $b) { $d = strlen($a[0])-strlen($b[0]);
665//	return $d==0?strcmp($a[0], $b[0]):$d; });
666//
667//$k = 0;
668//foreach ($dp as $o) {
669//	if ($k == 0) echo "\t";
670//	elseif ($k % 3 == 0) echo "\n\t";
671//	else echo " ";
672//	if (isset($o[2]))
673//		echo sprintf('{"%s", %d, 0x%X, 0x%X},', $o[0], strlen($o[0]),
674//			hexdec($o[1]), hexdec($o[2]));
675//	else
676//		echo sprintf('{"%s", %d, 0x%X, 0},', $o[0], strlen($o[0]),
677//			hexdec($o[1]));
678//
679//	if (isset($o[2])) {
680//		$entlen = strlen($o[0]) + 2;
681//		$utf8len = strlen(
682//			mb_convert_encoding("&#x{$o[1]};&#x{$o[2]};", "UTF-8", "HTML-ENTITIES"));
683//		if ($utf8len > $entlen*1.2) {
684//			die("violated assumption for traverse_for_entities");
685//		}
686//	}
687//
688//	$k++;
689//}
690//echo "\n};\n\n";
691//
692//echo "static const size_t html5_ent_cp_map_size = $k;\n\n";
693//
694//echo
695//"/* end of HTML 5 tables for entity -> codepoint }}} */\n\n";
696
hashtables:

echo "/* {{{ $name hash table for entity -> codepoint */", "\n\n";

/* C types for the entity -> code point hash tables plus the shared empty
 * bucket; only written on the first pass. */
$t = implode("\n", array(
    'typedef struct {',
    "\tconst char *entity;",
    "\tunsigned short entity_len;",
    "\tunsigned int codepoint1;",
    "\tunsigned int codepoint2;",
    '} entity_cp_map;',
    '',
    'typedef const entity_cp_map *entity_ht_bucket;',
    '',
    'typedef struct {',
    "\tunsigned num_elems; /* power of 2 */",
    "\tconst entity_ht_bucket *buckets; /* .num_elems elements */",
    '} entity_ht;',
    '',
    'static const entity_cp_map ht_bucket_empty[] = { {NULL, 0, 0, 0} };',
    '',
));

if (!$pass2) {
    echo $t;
}
723
/**
 * Hash an entity name: start at 5381, then for every byte compute
 * hash * 33 + byte (as shift-and-add), truncated to 32 bits.
 *
 * @param string $str entity name
 * @return int hash value in the range 0 .. 0xFFFFFFFF
 */
function hashfun($str)
{
	$hash = 5381;
	$length = strlen($str);

	for ($offset = 0; $offset < $length; $offset++) {
		$shifted = (int)($hash << 5);
		$times33 = (int)($shifted + $hash);
		$hash = (int)($times33 + ord($str[$offset])) & 0xFFFFFFFF;
	}

	return $hash;
}
738
/* Distribute the entities (file order) over the hash buckets.
 *
 * The bucket count is the smallest power of two that is >= 1.5 times the
 * number of entities, with a minimum of 16, so "hash & ($numelems - 1)"
 * selects a bucket. pow() returns a float here; keep the count an integer. */
$numelems = (int) max(pow(2, ceil(log(1.5*count($origdp))/log(2))),16);
$mask = $numelems - 1;

/* Number of bytes UTF-8 needs for one code point. Computed arithmetically so
 * the script neither depends on mbstring nor on its deprecated
 * "HTML-ENTITIES" pseudo-encoding. */
$utf8_bytes = function($cp) {
	if ($cp < 0x80)    return 1;
	if ($cp < 0x800)   return 2;
	if ($cp < 0x10000) return 3;
	return 4;
};

$hashes = array();
foreach ($origdp as $e) {
	$hashes[hashfun($e[0]) & $mask][] = $e;
	if (isset($e[2])) {
		/* Sanity check for two-code-point entities: the UTF-8 expansion
		 * must not exceed 1.2 times the length of "&name;". */
		$entlen = strlen($e[0]) + 2;
		$utf8len = $utf8_bytes(hexdec($e[1])) + $utf8_bytes(hexdec($e[2]));
		if ($utf8len > $entlen*1.2) {
			/* report on stderr with a failing status instead of writing
			 * the message into the generated header and exiting with 0 */
			fwrite(STDERR, "violated assumption for traverse_for_entities\n");
			exit(1);
		}
	}
}
753
/* Bucket arrays: every non-empty bucket becomes a NULL-terminated
 * entity_cp_map array named ht_bucket_<ident>_<index>. */
for ($bucket = 0; $bucket < $numelems; $bucket++) {
	if (empty($hashes[$bucket])) {
		continue;
	}

	$rows = "";
	foreach ($hashes[$bucket] as $entry) {
		$rows .= isset($entry[2])
			? sprintf(' {"%s", %d, 0x%05X, 0x%05X},',
				$entry[0], strlen($entry[0]), hexdec($entry[1]), hexdec($entry[2]))
			: sprintf(' {"%s", %d, 0x%05X, 0},',
				$entry[0], strlen($entry[0]), hexdec($entry[1]));
	}

	echo "static const entity_cp_map ht_bucket_{$ident}_" . sprintf("%03X", $bucket)
		. "[] = {" . $rows . " {NULL, 0, 0, 0} };\n";
}
echo "\n";

/* Bucket index: one pointer per slot, four per line; empty slots share
 * ht_bucket_empty. */
$cells = array();
for ($bucket = 0; $bucket < $numelems; $bucket++) {
	$cells[] = empty($hashes[$bucket])
		? "ht_bucket_empty,"
		: "ht_bucket_{$ident}_" . sprintf("%03X", $bucket) . ",";
}
$lines = array();
foreach (array_chunk($cells, 4) as $group) {
	$lines[] = "\t" . implode(" ", $group);
}

echo "static const entity_cp_map *const ht_buckets_{$ident}[] = {\n";
echo implode("\n", $lines);
echo "\n};\n\n";

/* The hash table descriptor: bucket count and bucket index. */
echo "static const entity_ht ent_ht_{$ident} = {\n\t"
	. sprintf("0x%X", $numelems)
	. ",\n\tht_buckets_{$ident}\n};\n\n";

echo "/* end of $name hash table for entity -> codepoint }}} */\n\n";
793
/* Chain to the next entity list, keyed by the pass that just finished
 * (false/0 = HTML5, 1 = HTML 4.01, 2 = basic without apos). After the last
 * pass (3) execution falls through to the include-guard terminator. */
$followingPass = array(
    0 => array("ents_html401.txt",    1, "HTML 4.01",                  "html4"),
    1 => array("ents_basic.txt",      2, "Basic entities (no apos)",   "be_noapos"),
    2 => array("ents_basic_apos.txt", 3, "Basic entities (with apos)", "be_apos"),
);

$finished = (int) $pass2;
if (isset($followingPass[$finished])) {
    $nextFile = $followingPass[$finished][0];

    $data = file_get_contents($nextFile);
    if ($data === false) {
        /* otherwise empty tables (and a PHP warning) would end up in the header */
        fwrite(STDERR, "html_table_gen: cannot read {$nextFile}\n");
        exit(1);
    }

    $pass2 = $followingPass[$finished][1];
    $name  = $followingPass[$finished][2];
    $ident = $followingPass[$finished][3];
    goto again;
}

echo "#endif /* HTML_TABLES_H */\n";
815