1From 7c7c35d8ea9f65f081564b3ad1bfe9f0db33dd69 Mon Sep 17 00:00:00 2001 2From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> 3Date: Sat, 26 Aug 2023 15:08:59 +0200 4Subject: [PATCH 1/6] Expose line and column information for use in PHP 5 6--- 7 source/lexbor/dom/interfaces/node.h | 2 ++ 8 source/lexbor/html/token.h | 2 ++ 9 source/lexbor/html/tokenizer.c | 24 +++++++++++++++++++++++- 10 source/lexbor/html/tokenizer.h | 2 ++ 11 source/lexbor/html/tokenizer/state.h | 2 ++ 12 source/lexbor/html/tree.c | 11 +++++++++++ 13 source/lexbor/html/tree/error.c | 5 +++-- 14 source/lexbor/html/tree/error.h | 5 +++-- 15 8 files changed, 48 insertions(+), 5 deletions(-) 16 17diff --git a/source/lexbor/dom/interfaces/node.h b/source/lexbor/dom/interfaces/node.h 18index c37b790..8ac218b 100644 19--- a/source/lexbor/dom/interfaces/node.h 20+++ b/source/lexbor/dom/interfaces/node.h 21@@ -58,6 +58,8 @@ struct lxb_dom_node { 22 23 lxb_dom_node_type_t type; 24 25+ size_t line; 26+ 27 #ifdef LXB_DOM_NODE_USER_VARIABLES 28 LXB_DOM_NODE_USER_VARIABLES 29 #endif /* LXB_DOM_NODE_USER_VARIABLES */ 30diff --git a/source/lexbor/html/token.h b/source/lexbor/html/token.h 31index 79accd0..0b7f4fd 100644 32--- a/source/lexbor/html/token.h 33+++ b/source/lexbor/html/token.h 34@@ -33,6 +33,8 @@ enum lxb_html_token_type { 35 typedef struct { 36 const lxb_char_t *begin; 37 const lxb_char_t *end; 38+ size_t line; 39+ size_t column; 40 41 const lxb_char_t *text_start; 42 const lxb_char_t *text_end; 43diff --git a/source/lexbor/html/tokenizer.c b/source/lexbor/html/tokenizer.c 44index 741bced..0bd9aec 100644 45--- a/source/lexbor/html/tokenizer.c 46+++ b/source/lexbor/html/tokenizer.c 47@@ -91,6 +91,7 @@ lxb_html_tokenizer_init(lxb_html_tokenizer_t *tkz) 48 49 tkz->pos = tkz->start; 50 tkz->end = tkz->start + LXB_HTML_TKZ_TEMP_SIZE; 51+ /* current_line & current_column already initialized by calloc (zero-based) */ 52 53 tkz->tree = NULL; 54 tkz->tags = NULL; 55@@ -152,6 +153,8 @@ 
lxb_html_tokenizer_inherit(lxb_html_tokenizer_t *tkz_to, 56 tkz_to->start = tkz_from->start; 57 tkz_to->end = tkz_from->end; 58 tkz_to->pos = tkz_to->start; 59+ tkz_to->current_line = tkz_from->current_line; 60+ tkz_to->current_column = tkz_from->current_column; 61 62 return LXB_STATUS_OK; 63 } 64@@ -312,7 +315,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data, 65 tkz->last = end; 66 67 while (data < end) { 68- data = tkz->state(tkz, data, end); 69+ size_t current_column = tkz->current_column; 70+ const lxb_char_t *new_data = tkz->state(tkz, data, end); 71+ while (data < new_data) { 72+ /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */ 73+ if (*data == '\n') { 74+ tkz->current_line++; 75+ current_column = 0; 76+ } else { 77+ /* Other characters can be mapped back to the unicode codepoint offset because UTF-8 is a prefix code. 78+ * Continuation bytes start with 0b10XXXXXX so we can skip those to only get the start of an encoded code point. 
*/ 79+ if ((*data & 0xC0) == 0x80) { 80+ /* Continuation byte, do nothing */ 81+ } else { 82+ /* First byte for a codepoint */ 83+ current_column++; 84+ } 85+ } 86+ data++; 87+ } 88+ tkz->current_column = current_column; 89 } 90 91 return tkz->status; 92diff --git a/source/lexbor/html/tokenizer.h b/source/lexbor/html/tokenizer.h 93index ba9602f..74bb55e 100644 94--- a/source/lexbor/html/tokenizer.h 95+++ b/source/lexbor/html/tokenizer.h 96@@ -73,6 +73,8 @@ struct lxb_html_tokenizer { 97 const lxb_char_t *end; 98 const lxb_char_t *begin; 99 const lxb_char_t *last; 100+ size_t current_line; 101+ size_t current_column; 102 103 /* Entities */ 104 const lexbor_sbst_entry_static_t *entity; 105diff --git a/source/lexbor/html/tokenizer/state.h b/source/lexbor/html/tokenizer/state.h 106index 0892846..77b86ac 100644 107--- a/source/lexbor/html/tokenizer/state.h 108+++ b/source/lexbor/html/tokenizer/state.h 109@@ -90,6 +90,8 @@ extern "C" { 110 do { \ 111 tkz->pos = tkz->start; \ 112 tkz->token->begin = v_begin; \ 113+ tkz->token->line = tkz->current_line; \ 114+ tkz->token->column = tkz->current_column; \ 115 } \ 116 while (0) 117 118diff --git a/source/lexbor/html/tree.c b/source/lexbor/html/tree.c 119index 0f067e4..bdec6a5 100644 120--- a/source/lexbor/html/tree.c 121+++ b/source/lexbor/html/tree.c 122@@ -434,6 +434,9 @@ lxb_html_tree_create_element_for_token(lxb_html_tree_t *tree, 123 return NULL; 124 } 125 126+ node->line = token->line; 127+ /* We only expose line number in PHP DOM */ 128+ 129 lxb_status_t status; 130 lxb_dom_element_t *element = lxb_dom_interface_element(node); 131 132@@ -770,6 +773,11 @@ lxb_html_tree_insert_character_for_data(lxb_html_tree_t *tree, 133 134 lxb_dom_interface_text(text)->char_data.data = *str; 135 136+ if (tree->tkz_ref) { 137+ text->line = tree->tkz_ref->token->line; 138+ /* We only expose line number in PHP DOM */ 139+ } 140+ 141 if (ret_node != NULL) { 142 *ret_node = text; 143 } 144@@ -809,6 +817,9 @@ 
lxb_html_tree_insert_comment(lxb_html_tree_t *tree, 145 return NULL; 146 } 147 148+ node->line = token->line; 149+ /* We only expose line number in PHP DOM */ 150+ 151 tree->status = lxb_html_token_make_text(token, &comment->char_data.data, 152 tree->document->dom_document.text); 153 if (tree->status != LXB_STATUS_OK) { 154diff --git a/source/lexbor/html/tree/error.c b/source/lexbor/html/tree/error.c 155index e6e43f4..88ad8c4 100644 156--- a/source/lexbor/html/tree/error.c 157+++ b/source/lexbor/html/tree/error.c 158@@ -21,8 +21,9 @@ lxb_html_tree_error_add(lexbor_array_obj_t *parse_errors, 159 } 160 161 entry->id = id; 162- entry->begin = token->begin; 163- entry->end = token->end; 164+ entry->line = token->line; 165+ entry->column = token->column; 166+ entry->length = token->end - token->begin; 167 168 return entry; 169 } 170diff --git a/source/lexbor/html/tree/error.h b/source/lexbor/html/tree/error.h 171index 2fd06cb..ed1859f 100644 172--- a/source/lexbor/html/tree/error.h 173+++ b/source/lexbor/html/tree/error.h 174@@ -97,8 +97,9 @@ lxb_html_tree_error_id_t; 175 176 typedef struct { 177 lxb_html_tree_error_id_t id; 178- const lxb_char_t *begin; 179- const lxb_char_t *end; 180+ size_t line; 181+ size_t column; 182+ size_t length; 183 } 184 lxb_html_tree_error_t; 185 186-- 1872.44.0 188 189