xref: /php-src/ext/dom/lexbor/lexbor/encoding/base.h (revision f0934090)
1 /*
2  * Copyright (C) 2019 Alexander Borisov
3  *
4  * Author: Alexander Borisov <borisov@lexbor.com>
5  */
6 
7 #ifndef LEXBOR_ENCODING_BASE_H
8 #define LEXBOR_ENCODING_BASE_H
9 
10 #ifdef __cplusplus
11 extern "C" {
12 #endif
13 
14 #include "lexbor/core/base.h"
15 #include "lexbor/encoding/const.h"
16 
17 
/*
 * Version of the encoding module as separate numeric components.
 */
#define LXB_ENCODING_VERSION_MAJOR 2
#define LXB_ENCODING_VERSION_MINOR 0
#define LXB_ENCODING_VERSION_PATCH 1

/*
 * The three components above joined with "." through adjacent string literal
 * concatenation.
 *
 * NOTE(review): LEXBOR_STRINGIZE is defined outside this header; presumably it
 * is the usual two-level stringizing macro, in which case this expands to
 * "2.0.1". Confirm against its definition.
 */
#define LXB_ENCODING_VERSION_STRING \
    LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_MAJOR) "." \
    LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_MINOR) "." \
    LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_PATCH)
26 
27 
/*
 * Replacement data in byte form: the three bytes EF BF BD, which is the UTF-8
 * encoding of U+FFFD. The pointer refers to a string literal, so the bytes
 * must not be written through it despite the non-const cast.
 */
#define LXB_ENCODING_REPLACEMENT_BYTES ((lxb_char_t *) "\xEF\xBF\xBD")

/*
 * Replacement data in code point form: the address of an unnamed, read-only
 * lxb_codepoint_t holding LXB_ENCODING_REPLACEMENT_CODEPOINT, together with
 * its element count.
 *
 * The buffer is a compound literal. When the macro is expanded inside a
 * function the object lives only until the enclosing block ends, so the
 * pointer must not be kept beyond that block.
 */
#define LXB_ENCODING_REPLACEMENT_BUFFER_LEN 1
#define LXB_ENCODING_REPLACEMENT_BUFFER \
    (&(const lxb_codepoint_t) {LXB_ENCODING_REPLACEMENT_CODEPOINT})
33 
34 
/*
 * Code point related constants.
 *
 * 0x10FFFF is the highest valid code point (inclusive). The error marker is
 * numerically above that limit, so it cannot be confused with a real code
 * point.
 */
enum {
    /* Matches the three bytes in LXB_ENCODING_REPLACEMENT_BYTES. */
    LXB_ENCODING_REPLACEMENT_SIZE      = 3,

    /* U+FFFD. */
    LXB_ENCODING_REPLACEMENT_CODEPOINT = 0xFFFD,

    /* Highest valid code point. */
    LXB_ENCODING_MAX_CODEPOINT         = 0x10FFFF,

    /* Out-of-range value used to signal an error. */
    LXB_ENCODING_ERROR_CODEPOINT       = 0x1FFFFF
};
44 
/*
 * Status values of the LXB_ENCODING_ENCODE_* family: zero for success,
 * negative values for failures.
 *
 * NOTE(review): presumably these are what lxb_encoding_encode_single_f
 * callbacks return on failure (their int8_t return type can hold them);
 * confirm against the encoder implementations.
 */
enum {
    LXB_ENCODING_ENCODE_OK           =  0,
    LXB_ENCODING_ENCODE_ERROR        = -1,
    LXB_ENCODING_ENCODE_SMALL_BUFFER = -2
};
50 
51 enum {
52     LXB_ENCODING_DECODE_MAX_CODEPOINT = LXB_ENCODING_MAX_CODEPOINT,
53     LXB_ENCODING_DECODE_ERROR         = LXB_ENCODING_ERROR_CODEPOINT,
54     LXB_ENCODING_DECODE_CONTINUE      = 0x2FFFFF
55 };
56 
/*
 * State identifiers for decoding ISO-2022-JP, numbered consecutively from
 * zero.
 *
 * NOTE(review): the names appear to follow the decoder states of the WHATWG
 * Encoding Standard (ASCII, Roman, Katakana, lead byte, trail byte, escape
 * start, escape), plus a library-specific UNSET value; confirm against the
 * decoder implementation.
 */
enum {
    LXB_ENCODING_DECODE_2022_JP_ASCII        = 0,
    LXB_ENCODING_DECODE_2022_JP_ROMAN        = 1,
    LXB_ENCODING_DECODE_2022_JP_KATAKANA     = 2,
    LXB_ENCODING_DECODE_2022_JP_LEAD         = 3,
    LXB_ENCODING_DECODE_2022_JP_TRAIL        = 4,
    LXB_ENCODING_DECODE_2022_JP_ESCAPE_START = 5,
    LXB_ENCODING_DECODE_2022_JP_ESCAPE       = 6,
    LXB_ENCODING_DECODE_2022_JP_UNSET        = 7
};
67 
/*
 * State identifiers for encoding ISO-2022-JP, numbered consecutively from
 * zero.
 *
 * NOTE(review): presumably stored in lxb_encoding_encode_t.state by the
 * ISO-2022-JP encoder; confirm against the encoder implementation.
 */
enum {
    LXB_ENCODING_ENCODE_2022_JP_ASCII   = 0,
    LXB_ENCODING_ENCODE_2022_JP_ROMAN   = 1,
    LXB_ENCODING_ENCODE_2022_JP_JIS0208 = 2
};
73 
74 typedef struct {
75     unsigned   need;
76     lxb_char_t lower;
77     lxb_char_t upper;
78 }
79 lxb_encoding_ctx_utf_8_t;
80 
81 typedef struct {
82     lxb_char_t first;
83     lxb_char_t second;
84     lxb_char_t third;
85 }
86 lxb_encoding_ctx_gb18030_t;
87 
88 typedef struct {
89     lxb_char_t lead;
90     bool       is_jis0212;
91 }
92 lxb_encoding_ctx_euc_jp_t;
93 
94 typedef struct {
95     lxb_char_t lead;
96     lxb_char_t prepand;
97     unsigned   state;
98     unsigned   out_state;
99     bool       out_flag;
100 }
101 lxb_encoding_ctx_2022_jp_t;
102 
/*
 * Forward declaration: the context structures below hold pointers to
 * struct lxb_encoding_data before the structure itself is defined.
 */
struct lxb_encoding_data;
typedef struct lxb_encoding_data lxb_encoding_data_t;
104 
105 typedef struct {
106     const lxb_encoding_data_t *encoding_data;
107 
108     /* Out buffer */
109     lxb_codepoint_t           *buffer_out;
110     size_t                    buffer_length;
111     size_t                    buffer_used;
112 
113     /*
114      * Bad code points will be replaced to user code point.
115      * If replace_to == 0 stop parsing and return error ot user.
116      */
117     const lxb_codepoint_t     *replace_to;
118     size_t                    replace_len;
119 
120     /* Not for users */
121     lxb_codepoint_t           codepoint;
122     lxb_codepoint_t           second_codepoint;
123     bool                      prepend;
124     bool                      have_error;
125 
126     lxb_status_t              status;
127 
128     union {
129         lxb_encoding_ctx_utf_8_t   utf_8;
130         lxb_encoding_ctx_gb18030_t gb18030;
131         unsigned                   lead;
132         lxb_encoding_ctx_euc_jp_t  euc_jp;
133         lxb_encoding_ctx_2022_jp_t iso_2022_jp;
134     } u;
135 }
136 lxb_encoding_decode_t;
137 
138 typedef struct {
139     const lxb_encoding_data_t *encoding_data;
140 
141     /* Out buffer */
142     lxb_char_t                *buffer_out;
143     size_t                    buffer_length;
144     size_t                    buffer_used;
145 
146     /*
147      * Bad code points will be replaced to user bytes.
148      * If replace_to == NULL stop parsing and return error ot user.
149      */
150     const lxb_char_t          *replace_to;
151     size_t                    replace_len;
152 
153     unsigned                  state;
154 }
155 lxb_encoding_encode_t;
156 
/*
 * Why can't I pass a char ** to a function which expects a const char **?
 * http://c-faq.com/ansi/constmismatch.html
 *
 * Short answer: use an explicit cast to (const char **).
 *
 * For example:
 *     lxb_encoding_decode_t ctx = {0};
 *     const lxb_encoding_data_t *enc;
 *
 *     lxb_char_t *data = (lxb_char_t *) "\x81\x30\x84\x36";
 *
 *     enc = lxb_encoding_data(LXB_ENCODING_GB18030);
 *
 *     enc->decode(&ctx, (const lxb_char_t **) &data, data + 4);
 */
173 typedef lxb_status_t
174 (*lxb_encoding_encode_f)(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cp,
175                          const lxb_codepoint_t *end);
176 
177 typedef lxb_status_t
178 (*lxb_encoding_decode_f)(lxb_encoding_decode_t *ctx,
179                          const lxb_char_t **data, const lxb_char_t *end);
180 
181 typedef int8_t
182 (*lxb_encoding_encode_single_f)(lxb_encoding_encode_t *ctx, lxb_char_t **data,
183                                 const lxb_char_t *end, lxb_codepoint_t cp);
184 
185 typedef lxb_codepoint_t
186 (*lxb_encoding_decode_single_f)(lxb_encoding_decode_t *ctx,
187                                 const lxb_char_t **data, const lxb_char_t *end);
188 
189 struct lxb_encoding_data {
190     lxb_encoding_t               encoding;
191     lxb_encoding_encode_f        encode;
192     lxb_encoding_decode_f        decode;
193     lxb_encoding_encode_single_f encode_single;
194     lxb_encoding_decode_single_f decode_single;
195     lxb_char_t                   *name;
196 };
197 
198 typedef struct {
199     lxb_char_t      name[4];
200     unsigned        size;
201     lxb_codepoint_t codepoint;
202 }
203 lxb_encoding_single_index_t;
204 
205 typedef lxb_encoding_single_index_t lxb_encoding_multi_index_t;
206 
207 typedef struct {
208     unsigned        index;
209     lxb_codepoint_t codepoint;
210 }
211 lxb_encoding_range_index_t;
212 
213 
214 #ifdef __cplusplus
215 } /* extern "C" */
216 #endif
217 
218 #endif /* LEXBOR_ENCODING_BASE_H */
219