From 16daa8e860e393ff39613b908550b0982a2210f2 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Wed, 29 Nov 2023 21:29:31 +0100
Subject: [PATCH 5/6] Shrink size of static binary search tree

This also makes it more efficient on the data cache.
---
 source/lexbor/core/sbst.h                   | 10 +++++-----
 source/lexbor/html/tokenizer/state.c        |  2 +-
 utils/lexbor/html/tmp/tokenizer_res.h       |  2 +-
 utils/lexbor/html/tokenizer_entities_bst.py |  8 ++++----
 utils/lexbor/lexbor/LXB.py                  |  2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/source/lexbor/core/sbst.h b/source/lexbor/core/sbst.h
index b0fbc54..40e0e91 100644
--- a/source/lexbor/core/sbst.h
+++ b/source/lexbor/core/sbst.h
@@ -19,12 +19,12 @@ extern "C" {
 typedef struct {
     lxb_char_t key;
 
-    void *value;
-    size_t value_len;
+    lxb_char_t value[6];
+    unsigned char value_len;
 
-    size_t left;
-    size_t right;
-    size_t next;
+    unsigned short left;
+    unsigned short right;
+    unsigned short next;
 }
 lexbor_sbst_entry_static_t;
 
diff --git a/source/lexbor/html/tokenizer/state.c b/source/lexbor/html/tokenizer/state.c
index 70ca391..2f3414f 100644
--- a/source/lexbor/html/tokenizer/state.c
+++ b/source/lexbor/html/tokenizer/state.c
@@ -1815,7 +1815,7 @@ lxb_html_tokenizer_state_char_ref_named(lxb_html_tokenizer_t *tkz,
         goto done;
     }
 
-    if (entry->value != NULL) {
+    if (entry->value[0] != 0) {
         tkz->entity_end = (tkz->pos + (data - begin)) - tkz->start;
         tkz->entity_match = entry;
     }
diff --git a/utils/lexbor/html/tmp/tokenizer_res.h b/utils/lexbor/html/tmp/tokenizer_res.h
index b3701d5..73ab66e 100644
--- a/utils/lexbor/html/tmp/tokenizer_res.h
+++ b/utils/lexbor/html/tmp/tokenizer_res.h
@@ -6,7 +6,7 @@
 
 /*
  * Caution!!! Important!!!
- * This file generated by the script
+ * This file is generated by the script
  * "utils/lexbor/html/tokenizer_entities_bst.py"!
  * Do not change this file!
  */
diff --git a/utils/lexbor/html/tokenizer_entities_bst.py b/utils/lexbor/html/tokenizer_entities_bst.py
index ee7dcb4..7cd1335 100755
--- a/utils/lexbor/html/tokenizer_entities_bst.py
+++ b/utils/lexbor/html/tokenizer_entities_bst.py
@@ -1,6 +1,6 @@
 
 import json
-import sys, re, os
+import sys, os
 
 # Find and append run script run dir to module search path
 ABS_PATH = os.path.dirname(os.path.abspath(__file__))
@@ -62,7 +62,7 @@ def entities_bst_create_layer(name, entry, index):
 
 def entities_bst_create(index):
     bst = {}
-    bst[0] = ["\0", 0, 0, 0, "NULL"]
+    bst[0] = ["\0", 0, 0, 0, "{0}"]
 
     begin = 1
     idx = end = entities_bst_create_tree(index, bst, begin)
@@ -114,7 +114,7 @@ def entities_bst_create_tree(index, bst, idx):
     assert len(index[ split[0] ]['values']) < 2, 'Double values'
 
     if len(index[ split[0] ]['values']) == 0:
-        value = "NULL"
+        value = "{0}"
     else:
         value = '"{}"'.format(toHex(index[ split[0] ]['values'][0]['characters']))
 
@@ -210,5 +210,5 @@ def entities_bst_print(bst):
 
 if __name__ == "__main__":
     entities_bst("tmp/tokenizer_res.h",
-                 "../../../source/lexbor/html/tokenizer_res.h",
+                 "../../../source/lexbor/html/tokenizer_res.h",
                  "data/entities.json");
diff --git a/utils/lexbor/lexbor/LXB.py b/utils/lexbor/lexbor/LXB.py
index 2370c66..c41e645 100755
--- a/utils/lexbor/lexbor/LXB.py
+++ b/utils/lexbor/lexbor/LXB.py
@@ -27,7 +27,7 @@ class Temp:
 
         for line in fh:
             for name in self.patterns:
-                line = re.sub(name, '\n'.join(self.patterns[name]), line)
+                line = line.replace(name, '\n'.join(self.patterns[name]))
             self.buffer.append(line)
         fh.close()
 
-- 
2.44.0
