1From 7c7c35d8ea9f65f081564b3ad1bfe9f0db33dd69 Mon Sep 17 00:00:00 2001
2From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
3Date: Sat, 26 Aug 2023 15:08:59 +0200
4Subject: [PATCH 1/6] Expose line and column information for use in PHP
5
6---
7 source/lexbor/dom/interfaces/node.h  |  2 ++
8 source/lexbor/html/token.h           |  2 ++
9 source/lexbor/html/tokenizer.c       | 24 +++++++++++++++++++++++-
10 source/lexbor/html/tokenizer.h       |  2 ++
11 source/lexbor/html/tokenizer/state.h |  2 ++
12 source/lexbor/html/tree.c            | 11 +++++++++++
13 source/lexbor/html/tree/error.c      |  5 +++--
14 source/lexbor/html/tree/error.h      |  5 +++--
15 8 files changed, 48 insertions(+), 5 deletions(-)
16
17diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h
18index c37b790..8ac218b 100644
19--- a/source/lexbor/dom/interfaces/node.h
20+++ b/source/lexbor/dom/interfaces/node.h
21@@ -58,6 +58,8 @@ struct lxb_dom_node {
22
23     lxb_dom_node_type_t    type;
24
25+    size_t                 line;
26+
27 #ifdef LXB_DOM_NODE_USER_VARIABLES
28     LXB_DOM_NODE_USER_VARIABLES
29 #endif /* LXB_DOM_NODE_USER_VARIABLES */
30diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h
31index 79accd0..0b7f4fd 100644
32--- a/source/lexbor/html/token.h
33+++ b/source/lexbor/html/token.h
34@@ -33,6 +33,8 @@ enum lxb_html_token_type {
35 typedef struct {
36     const lxb_char_t      *begin;
37     const lxb_char_t      *end;
38+    size_t                line;
39+    size_t                column;
40
41     const lxb_char_t      *text_start;
42     const lxb_char_t      *text_end;
43diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c
44index 741bced..0bd9aec 100644
45--- a/source/lexbor/html/tokenizer.c
46+++ b/source/lexbor/html/tokenizer.c
47@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz)
48
49     tkz->pos = tkz->start;
50     tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE;
51+    /* current_line & current_column already initialized by calloc (zero-based) */
52
53     tkz->tree = NULL;
54     tkz->tags = NULL;
55@@ -152,6 +153,8 @@ lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to,
56     tkz_to->start = tkz_from->start;
57     tkz_to->end = tkz_from->end;
58     tkz_to->pos = tkz_to->start;
59+    tkz_to->current_line = tkz_from->current_line;
60+    tkz_to->current_column = tkz_from->current_column;
61
62     return LXB_STATUS_OK;
63 }
64@@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
65     tkz->last = end;
66
67     while (data < end) {
68-        data = tkz->state(tkz, data, end);
69+        size_t current_column = tkz->current_column;
70+        const lxb_char_t *new_data = tkz->state(tkz, data, end);
71+        while (data < new_data) {
72+            /* Bytes < 0x80 only ever encode ASCII in UTF-8, so a '\n' byte uniquely identifies a newline. */
73+            if (*data == '\n') {
74+                tkz->current_line++;
75+                current_column = 0;
76+            } else {
77+                /* Count code points, not bytes: UTF-8 continuation bytes have the form 10XXXXXX, i.e. (byte & 0xC0) == 0x80.
78+                 * Hex masks are used on purpose: 0b... binary literals are a compiler extension before C23. */
79+                if ((*data & 0xC0) == 0x80) {
80+                    /* Continuation byte, do nothing */
81+                } else {
82+                    /* First byte for a codepoint */
83+                    current_column++;
84+                }
85+            }
86+            data++;
87+        }
88+        tkz->current_column = current_column;
89     }
90
91     return tkz->status;
92diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h
93index ba9602f..74bb55e 100644
94--- a/source/lexbor/html/tokenizer.h
95+++ b/source/lexbor/html/tokenizer.h
96@@ -73,6 +73,8 @@ struct lxb_html_tokenizer {
97     const lxb_char_t                 *end;
98     const lxb_char_t                 *begin;
99     const lxb_char_t                 *last;
100+    size_t                           current_line;
101+    size_t                           current_column;
102
103     /* Entities */
104     const lexbor_sbst_entry_static_t *entity;
105diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h
106index 0892846..77b86ac 100644
107--- a/source/lexbor/html/tokenizer/state.h
108+++ b/source/lexbor/html/tokenizer/state.h
109@@ -90,6 +90,8 @@ extern "C" {
110     do {                                                                       \
111         tkz->pos = tkz->start;                                                 \
112         tkz->token->begin = v_begin;                                           \
113+        tkz->token->line = tkz->current_line;                                  \
114+        tkz->token->column = tkz->current_column;                              \
115     }                                                                          \
116     while (0)
117
118diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c
119index 0f067e4..bdec6a5 100644
120--- a/source/lexbor/html/tree.c
121+++ b/source/lexbor/html/tree.c
122@@ -434,6 +434,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree,
123         return NULL;
124     }
125
126+    node->line = token->line;
127+    /* We only expose line number in PHP DOM */
128+
129     lxb_status_t status;
130     lxb_dom_element_t *element = lxb_dom_interface_element(node);
131
132@@ -770,6 +773,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree,
133
134     lxb_dom_interface_text(text)->char_data.data = *str;
135
136+    if (tree->tkz_ref) {
137+        text->line = tree->tkz_ref->token->line;
138+        /* We only expose line number in PHP DOM */
139+    }
140+
141     if (ret_node != NULL) {
142         *ret_node = text;
143     }
144@@ -809,6 +817,9 @@ lxb_html_tree_insert_comment(lxb_html_tree_t *tree,
145         return NULL;
146     }
147
148+    node->line = token->line;
149+    /* We only expose line number in PHP DOM */
150+
151     tree->status = lxb_html_token_make_text(token, &comment->char_data.data,
152                                             tree->document->dom_document.text);
153     if (tree->status != LXB_STATUS_OK) {
154diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c
155index e6e43f4..88ad8c4 100644
156--- a/source/lexbor/html/tree/error.c
157+++ b/source/lexbor/html/tree/error.c
158@@ -21,8 +21,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors,
159     }
160
161     entry->id = id;
162-    entry->begin = token->begin;
163-    entry->end = token->end;
164+    entry->line = token->line;
165+    entry->column = token->column;
166+    entry->length = token->end - token->begin;
167
168     return entry;
169 }
170diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h
171index 2fd06cb..ed1859f 100644
172--- a/source/lexbor/html/tree/error.h
173+++ b/source/lexbor/html/tree/error.h
174@@ -97,8 +97,9 @@ lxb_html_tree_error_id_t;
175
176 typedef struct {
177     lxb_html_tree_error_id_t id;
178-    const lxb_char_t         *begin;
179-    const lxb_char_t         *end;
180+    size_t                   line;
181+    size_t                   column;
182+    size_t                   length;
183 }
184 lxb_html_tree_error_t;
185
186--
1872.44.0
188
189