From 16daa8e860e393ff39613b908550b0982a2210f2 Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Wed, 29 Nov 2023 21:29:31 +0100 Subject: [PATCH 5/6] Shrink size of static binary search tree This also makes it more efficient on the data cache. --- source/lexbor/core/sbst.h | 10 +++++----- source/lexbor/html/tokenizer/state.c | 2 +- utils/lexbor/html/tmp/tokenizer_res.h | 2 +- utils/lexbor/html/tokenizer_entities_bst.py | 8 ++++---- utils/lexbor/lexbor/LXB.py | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/source/lexbor/core/sbst.h b/source/lexbor/core/sbst.h index b0fbc54..40e0e91 100644 --- a/source/lexbor/core/sbst.h +++ b/source/lexbor/core/sbst.h @@ -19,12 +19,12 @@ extern "C" { typedef struct { lxb_char_t key; - void *value; - size_t value_len; + lxb_char_t value[6]; + unsigned char value_len; - size_t left; - size_t right; - size_t next; + unsigned short left; + unsigned short right; + unsigned short next; } lexbor_sbst_entry_static_t; diff --git a/source/lexbor/html/tokenizer/state.c b/source/lexbor/html/tokenizer/state.c index 70ca391..2f3414f 100644 --- a/source/lexbor/html/tokenizer/state.c +++ b/source/lexbor/html/tokenizer/state.c @@ -1815,7 +1815,7 @@ lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz, goto done; } - if (entry->value != NULL) { + if (entry->value[0] != 0) { tkz->entity_end = (tkz->pos + (data - begin)) - tkz->start; tkz->entity_match = entry; } diff --git a/utils/lexbor/html/tmp/tokenizer_res.h b/utils/lexbor/html/tmp/tokenizer_res.h index b3701d5..73ab66e 100644 --- a/utils/lexbor/html/tmp/tokenizer_res.h +++ b/utils/lexbor/html/tmp/tokenizer_res.h @@ -6,7 +6,7 @@ /* * Caution!!! Important!!! - * This file generated by the script + * This file is generated by the script * "utils/lexbor/html/tokenizer_entities_bst.py"! * Do not change this file! */ diff --git a/utils/lexbor/html/tokenizer_entities_bst.py b/utils/lexbor/html/tokenizer_entities_bst.py index ee7dcb4..7cd1335 100755 --- a/utils/lexbor/html/tokenizer_entities_bst.py +++ b/utils/lexbor/html/tokenizer_entities_bst.py @@ -1,6 +1,6 @@ import json -import sys, re, os +import sys, os # Find and append run script run dir to module search path ABS_PATH = os.path.dirname(os.path.abspath(__file__)) @@ -62,7 +62,7 @@ def entities_bst_create_layer(name, entry, index): def entities_bst_create(index): bst = {} - bst[0] = ["\0", 0, 0, 0, "NULL"] + bst[0] = ["\0", 0, 0, 0, "{0}"] begin = 1 idx = end = entities_bst_create_tree(index, bst, begin) @@ -114,7 +114,7 @@ def entities_bst_create_tree(index, bst, idx): assert len(index[ split[0] ]['values']) < 2, 'Double values' if len(index[ split[0] ]['values']) == 0: - value = "NULL" + value = "{0}" else: value = '"{}"'.format(toHex(index[ split[0] ]['values'][0]['characters'])) @@ -210,5 +210,5 @@ def entities_bst_print(bst): if __name__ == "__main__": entities_bst("tmp/tokenizer_res.h", - "../../../source/lexbor/html/tokenizer_res.h", + "../../../source/lexbor/html/tokenizer/res.h", "data/entities.json"); diff --git a/utils/lexbor/lexbor/LXB.py b/utils/lexbor/lexbor/LXB.py index 2370c66..c41e645 100755 --- a/utils/lexbor/lexbor/LXB.py +++ b/utils/lexbor/lexbor/LXB.py @@ -27,7 +27,7 @@ class Temp: for line in fh: for name in self.patterns: - line = re.sub(name, '\n'.join(self.patterns[name]), line) + line = line.replace(name, '\n'.join(self.patterns[name])) self.buffer.append(line) fh.close() -- 2.44.0