1 /* 2 * Copyright (C) 2019 Alexander Borisov 3 * 4 * Author: Alexander Borisov <borisov@lexbor.com> 5 */ 6 7 #ifndef LEXBOR_ENCODING_BASE_H 8 #define LEXBOR_ENCODING_BASE_H 9 10 #ifdef __cplusplus 11 extern "C" { 12 #endif 13 14 #include "lexbor/core/base.h" 15 #include "lexbor/encoding/const.h" 16 17 18 #define LXB_ENCODING_VERSION_MAJOR 2 19 #define LXB_ENCODING_VERSION_MINOR 0 20 #define LXB_ENCODING_VERSION_PATCH 1 21 22 #define LXB_ENCODING_VERSION_STRING \ 23 LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_MAJOR) "." \ 24 LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_MINOR) "." \ 25 LEXBOR_STRINGIZE(LXB_ENCODING_VERSION_PATCH) 26 27 28 #define LXB_ENCODING_REPLACEMENT_BYTES ((lxb_char_t *) "\xEF\xBF\xBD") 29 30 #define LXB_ENCODING_REPLACEMENT_BUFFER_LEN 1 31 #define LXB_ENCODING_REPLACEMENT_BUFFER \ 32 (&(const lxb_codepoint_t) {LXB_ENCODING_REPLACEMENT_CODEPOINT}) 33 34 35 /* 36 * In UTF-8 0x10FFFF value is maximum (inclusive) 37 */ 38 enum { 39 LXB_ENCODING_REPLACEMENT_SIZE = 0x03, 40 LXB_ENCODING_REPLACEMENT_CODEPOINT = 0xFFFD, 41 LXB_ENCODING_MAX_CODEPOINT = 0x10FFFF, 42 LXB_ENCODING_ERROR_CODEPOINT = 0x1FFFFF 43 }; 44 45 enum { 46 LXB_ENCODING_ENCODE_OK = 0x00, 47 LXB_ENCODING_ENCODE_ERROR = -0x01, 48 LXB_ENCODING_ENCODE_SMALL_BUFFER = -0x02 49 }; 50 51 enum { 52 LXB_ENCODING_DECODE_MAX_CODEPOINT = LXB_ENCODING_MAX_CODEPOINT, 53 LXB_ENCODING_DECODE_ERROR = LXB_ENCODING_ERROR_CODEPOINT, 54 LXB_ENCODING_DECODE_CONTINUE = 0x2FFFFF 55 }; 56 57 enum { 58 LXB_ENCODING_DECODE_2022_JP_ASCII = 0x00, 59 LXB_ENCODING_DECODE_2022_JP_ROMAN, 60 LXB_ENCODING_DECODE_2022_JP_KATAKANA, 61 LXB_ENCODING_DECODE_2022_JP_LEAD, 62 LXB_ENCODING_DECODE_2022_JP_TRAIL, 63 LXB_ENCODING_DECODE_2022_JP_ESCAPE_START, 64 LXB_ENCODING_DECODE_2022_JP_ESCAPE, 65 LXB_ENCODING_DECODE_2022_JP_UNSET 66 }; 67 68 enum { 69 LXB_ENCODING_ENCODE_2022_JP_ASCII = 0x00, 70 LXB_ENCODING_ENCODE_2022_JP_ROMAN, 71 LXB_ENCODING_ENCODE_2022_JP_JIS0208 72 }; 73 74 typedef struct { 75 unsigned need; 76 lxb_char_t lower; 77 lxb_char_t upper; 78 } 79 lxb_encoding_ctx_utf_8_t; 80 81 typedef struct { 82 lxb_char_t first; 83 lxb_char_t second; 84 lxb_char_t third; 85 } 86 lxb_encoding_ctx_gb18030_t; 87 88 typedef struct { 89 lxb_char_t lead; 90 bool is_jis0212; 91 } 92 lxb_encoding_ctx_euc_jp_t; 93 94 typedef struct { 95 lxb_char_t lead; 96 lxb_char_t prepand; 97 unsigned state; 98 unsigned out_state; 99 bool out_flag; 100 } 101 lxb_encoding_ctx_2022_jp_t; 102 103 typedef struct lxb_encoding_data lxb_encoding_data_t; 104 105 typedef struct { 106 const lxb_encoding_data_t *encoding_data; 107 108 /* Out buffer */ 109 lxb_codepoint_t *buffer_out; 110 size_t buffer_length; 111 size_t buffer_used; 112 113 /* 114 * Bad code points will be replaced to user code point. 115 * If replace_to == 0 stop parsing and return error ot user. 116 */ 117 const lxb_codepoint_t *replace_to; 118 size_t replace_len; 119 120 /* Not for users */ 121 lxb_codepoint_t codepoint; 122 lxb_codepoint_t second_codepoint; 123 bool prepend; 124 bool have_error; 125 126 lxb_status_t status; 127 128 union { 129 lxb_encoding_ctx_utf_8_t utf_8; 130 lxb_encoding_ctx_gb18030_t gb18030; 131 unsigned lead; 132 lxb_encoding_ctx_euc_jp_t euc_jp; 133 lxb_encoding_ctx_2022_jp_t iso_2022_jp; 134 } u; 135 } 136 lxb_encoding_decode_t; 137 138 typedef struct { 139 const lxb_encoding_data_t *encoding_data; 140 141 /* Out buffer */ 142 lxb_char_t *buffer_out; 143 size_t buffer_length; 144 size_t buffer_used; 145 146 /* 147 * Bad code points will be replaced to user bytes. 148 * If replace_to == NULL stop parsing and return error ot user. 149 */ 150 const lxb_char_t *replace_to; 151 size_t replace_len; 152 153 unsigned state; 154 } 155 lxb_encoding_encode_t; 156 157 /* 158 * Why can't I pass a char ** to a function which expects a const char **? 159 * http://c-faq.com/ansi/constmismatch.html 160 * 161 * Short answer: use cast (const char **). 162 * 163 * For example: 164 * lxb_encoding_ctx_t ctx = {0}; 165 * const lxb_encoding_data_t *enc; 166 * 167 * lxb_char_t *data = (lxb_char_t *) "\x81\x30\x84\x36"; 168 * 169 * enc = lxb_encoding_data(LXB_ENCODING_GB18030); 170 * 171 * enc->decode(&ctx, (const lxb_char_t **) &data, data + 4); 172 */ 173 typedef lxb_status_t 174 (*lxb_encoding_encode_f)(lxb_encoding_encode_t *ctx, const lxb_codepoint_t **cp, 175 const lxb_codepoint_t *end); 176 177 typedef lxb_status_t 178 (*lxb_encoding_decode_f)(lxb_encoding_decode_t *ctx, 179 const lxb_char_t **data, const lxb_char_t *end); 180 181 typedef int8_t 182 (*lxb_encoding_encode_single_f)(lxb_encoding_encode_t *ctx, lxb_char_t **data, 183 const lxb_char_t *end, lxb_codepoint_t cp); 184 185 typedef lxb_codepoint_t 186 (*lxb_encoding_decode_single_f)(lxb_encoding_decode_t *ctx, 187 const lxb_char_t **data, const lxb_char_t *end); 188 189 struct lxb_encoding_data { 190 lxb_encoding_t encoding; 191 lxb_encoding_encode_f encode; 192 lxb_encoding_decode_f decode; 193 lxb_encoding_encode_single_f encode_single; 194 lxb_encoding_decode_single_f decode_single; 195 lxb_char_t *name; 196 }; 197 198 typedef struct { 199 lxb_char_t name[4]; 200 unsigned size; 201 lxb_codepoint_t codepoint; 202 } 203 lxb_encoding_single_index_t; 204 205 typedef lxb_encoding_single_index_t lxb_encoding_multi_index_t; 206 207 typedef struct { 208 unsigned index; 209 lxb_codepoint_t codepoint; 210 } 211 lxb_encoding_range_index_t; 212 213 214 #ifdef __cplusplus 215 } /* extern "C" */ 216 #endif 217 218 #endif /* LEXBOR_ENCODING_BASE_H */ 219