1 /*
2 * "streamable kanji code filter and converter"
3 * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4 *
5 * LICENSE NOTICES
6 *
7 * This file is part of "streamable kanji code filter and converter",
8 * which is distributed under the terms of GNU Lesser General Public
9 * License (version 2) as published by the Free Software Foundation.
10 *
11 * This software is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with "streamable kanji code filter and converter";
18 * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19 * Suite 330, Boston, MA 02111-1307 USA
20 *
21 * The author of this file:
22 *
23 */
24 /*
 * The source code included in this file was separated from mbfilter.h
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.h is included in this package.
28 *
29 */
30
31 #ifndef MBFL_ENCODING_H
32 #define MBFL_ENCODING_H
33
34 #include "mbfl_defs.h"
35 #include "mbfl_consts.h"
36 #include "zend.h"
37
/*
 * Numeric identifiers for the encodings known to this library.
 *
 * `mbfl_no_encoding_invalid` is explicitly -1; every following enumerator
 * takes the next consecutive value in declaration order, starting from 0
 * for `mbfl_no_encoding_pass`. Inserting or reordering entries therefore
 * changes the numeric value of everything that follows.
 *
 * `mbfl_no_encoding_charset_min` and `mbfl_no_encoding_charset_max` are
 * positioned before and after the run of entries from `ucs4` to `cp50222`.
 * NOTE(review): presumably they are range markers rather than real
 * encodings -- confirm against the code that compares against them.
 */
enum mbfl_no_encoding {
	mbfl_no_encoding_invalid = -1,
	mbfl_no_encoding_pass,
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_base64,
	mbfl_no_encoding_uuencode,
	mbfl_no_encoding_html_ent,
	mbfl_no_encoding_qprint,
	mbfl_no_encoding_7bit,
	mbfl_no_encoding_8bit,
	mbfl_no_encoding_charset_min, /* positional marker; see header comment */
	mbfl_no_encoding_ucs4,
	mbfl_no_encoding_ucs4be,
	mbfl_no_encoding_ucs4le,
	mbfl_no_encoding_ucs2,
	mbfl_no_encoding_ucs2be,
	mbfl_no_encoding_ucs2le,
	mbfl_no_encoding_utf32,
	mbfl_no_encoding_utf32be,
	mbfl_no_encoding_utf32le,
	mbfl_no_encoding_utf16,
	mbfl_no_encoding_utf16be,
	mbfl_no_encoding_utf16le,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_utf8_docomo,
	mbfl_no_encoding_utf8_kddi_a,
	mbfl_no_encoding_utf8_kddi_b,
	mbfl_no_encoding_utf8_sb,
	mbfl_no_encoding_utf7,
	mbfl_no_encoding_utf7imap,
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_euc_jp,
	mbfl_no_encoding_eucjp2004,
	mbfl_no_encoding_sjis,
	mbfl_no_encoding_eucjp_win,
	mbfl_no_encoding_sjis_docomo,
	mbfl_no_encoding_sjis_kddi,
	mbfl_no_encoding_sjis_sb,
	mbfl_no_encoding_sjis_mac,
	mbfl_no_encoding_sjis2004,
	mbfl_no_encoding_cp932,
	mbfl_no_encoding_sjiswin,
	mbfl_no_encoding_cp51932,
	mbfl_no_encoding_jis,
	mbfl_no_encoding_2022jp,
	mbfl_no_encoding_2022jp_2004,
	mbfl_no_encoding_2022jp_kddi,
	mbfl_no_encoding_2022jpms,
	mbfl_no_encoding_gb18030,
	mbfl_no_encoding_cp1252,
	mbfl_no_encoding_cp1254,
	mbfl_no_encoding_8859_1,
	mbfl_no_encoding_8859_2,
	mbfl_no_encoding_8859_3,
	mbfl_no_encoding_8859_4,
	mbfl_no_encoding_8859_5,
	mbfl_no_encoding_8859_6,
	mbfl_no_encoding_8859_7,
	mbfl_no_encoding_8859_8,
	mbfl_no_encoding_8859_9,
	mbfl_no_encoding_8859_10,
	mbfl_no_encoding_8859_13,
	mbfl_no_encoding_8859_14,
	mbfl_no_encoding_8859_15,
	mbfl_no_encoding_euc_cn,
	mbfl_no_encoding_cp936,
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_big5,
	mbfl_no_encoding_cp950,
	mbfl_no_encoding_euc_kr,
	mbfl_no_encoding_2022kr,
	mbfl_no_encoding_uhc,
	mbfl_no_encoding_hz,
	mbfl_no_encoding_cp1251,
	mbfl_no_encoding_cp866,
	mbfl_no_encoding_koi8r,
	mbfl_no_encoding_koi8u,
	mbfl_no_encoding_8859_16,
	mbfl_no_encoding_armscii8,
	mbfl_no_encoding_cp850,
	mbfl_no_encoding_cp50220,
	mbfl_no_encoding_cp50221,
	mbfl_no_encoding_cp50222,
	mbfl_no_encoding_charset_max /* positional marker; see header comment */
};
123
/* Only pointers to the filter are needed here, so a forward declaration
 * is enough; the full definition lives elsewhere. */
struct _mbfl_convert_filter;

/*
 * Table of function pointers describing one conversion filter, tagged with
 * the pair of encoding identifiers (`from`, `to`) it applies to.
 *
 * NOTE(review): the per-callback notes below are inferred from the member
 * names and signatures only; the implementations are not in this header.
 */
struct mbfl_convert_vtbl {
	enum mbfl_no_encoding from; /* source encoding identifier */
	enum mbfl_no_encoding to;   /* destination encoding identifier */
	/* presumably sets up per-filter state */
	void (*filter_ctor)(struct _mbfl_convert_filter *filter);
	/* presumably tears down per-filter state */
	void (*filter_dtor)(struct _mbfl_convert_filter *filter);
	/* takes one int `c` plus the filter and returns an int status */
	int (*filter_function)(int c, struct _mbfl_convert_filter *filter);
	/* presumably emits any pending output; returns an int status */
	int (*filter_flush)(struct _mbfl_convert_filter *filter);
	/* presumably copies filter state from `src` to `dest` */
	void (*filter_copy)(struct _mbfl_convert_filter *src, struct _mbfl_convert_filter *dest);
};
134
/*
 * Growable output buffer used while producing converted text. The bytes are
 * written directly into the character area of a `zend_string` allocation so
 * that the finished buffer can be handed back as a string (see
 * `mb_convert_buf_result_raw`).
 */
typedef struct {
	/* Next write position; `mb_convert_buf_init` points it at ZSTR_VAL(str) */
	unsigned char *out;
	/* End of the writable area; `mb_convert_buf_init` sets it to out + initsize */
	unsigned char *limit;
	/* Zeroed by `mb_convert_buf_init`; presumably converter-specific state -- confirm */
	uint32_t state;
	/* Zeroed by `mb_convert_buf_init`; presumably a count of conversion errors -- confirm */
	uint32_t errors;
	/* Set from the `repl_char` argument of `mb_convert_buf_init` */
	uint32_t replacement_char;
	/* Set from the `err_mode` argument of `mb_convert_buf_init`; compared against
	 * MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 in `mb_convert_buf_result` */
	unsigned int error_mode;
	/* Allocation backing the buffer; its header fields are only filled in by
	 * `mb_convert_buf_result_raw` */
	zend_string *str;
} mb_convert_buf;
144
/*
 * Signature of a decoder: reads from `*in` / `*in_len` and writes uint32_t
 * values into `out`, which has room for `out_len` entries.
 * NOTE(review): presumably `*in` and `*in_len` are advanced past the consumed
 * input, the return value is the number of entries written, and `state`
 * carries decoder state between calls -- confirm against an implementation.
 */
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
/*
 * Signature of an encoder: consumes `in_len` uint32_t values from `in` and
 * appends output to the `mb_convert_buf`.
 * NOTE(review): `end` presumably marks the final chunk of input -- confirm.
 */
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
/* Signature of a validity check over `in_len` bytes at `in`; returns a bool. */
typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len);

/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
 * the buffer must be at least this size (to work with all supported text encodings) */
#define MBSTRING_MIN_WCHAR_BUFSIZE 5
152
/*
 * Prepare `buf` for use: allocate backing storage with room for `initsize`
 * output bytes, point the write cursor at its start, clear `state` and
 * `errors`, and record the replacement character and error mode.
 *
 * The allocation is sized with _ZSTR_STRUCT_SIZE but the zend_string header
 * is not initialized here; `mb_convert_buf_result_raw` does that.
 */
static inline void mb_convert_buf_init(mb_convert_buf *buf, size_t initsize, uint32_t repl_char, unsigned int err_mode)
{
	buf->errors = 0;
	buf->state = 0;

	zend_string *storage = emalloc(_ZSTR_STRUCT_SIZE(initsize));
	unsigned char *start = (unsigned char*)ZSTR_VAL(storage);

	buf->str = storage;
	buf->out = start;
	buf->limit = start + initsize;

	buf->replacement_char = repl_char;
	buf->error_mode = err_mode;
}
162
/*
 * Make sure at least `needed` more bytes can be written at `out`.
 *
 * `out` and `limit` are the caller's local copies of the write cursor and end
 * pointer (lvalues); `buf` is the `mb_convert_buf *` owning the storage.
 * If fewer than `needed` bytes remain, the backing allocation is grown by
 * max(half the current capacity, needed) bytes, and `out`, `limit` and
 * `(buf)->str` are updated to refer to the new allocation. `(buf)->out` and
 * `(buf)->limit` are NOT touched; use MB_CONVERT_BUF_STORE for that.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and is safe under a brace-less `if`/`else`. Invoke it with a
 * trailing semicolon.
 */
#define MB_CONVERT_BUF_ENSURE(buf, out, limit, needed) \
	do { \
		ZEND_ASSERT((out) <= (limit)); \
		if (((limit) - (out)) < (needed)) { \
			size_t oldsize = (limit) - (unsigned char*)ZSTR_VAL((buf)->str); \
			/* Take the cursor offset before reallocating: the old pointer \
			 * value must not be used once erealloc has returned */ \
			size_t ensure_out_offset = (out) - (unsigned char*)ZSTR_VAL((buf)->str); \
			size_t newsize = oldsize + MAX(oldsize >> 1, (needed)); \
			zend_string *newstr = erealloc((buf)->str, _ZSTR_STRUCT_SIZE(newsize)); \
			(buf)->str = newstr; \
			(out) = (unsigned char*)ZSTR_VAL(newstr) + ensure_out_offset; \
			(limit) = (unsigned char*)ZSTR_VAL(newstr) + newsize; \
		} \
	} while (0)
173
/*
 * Write the caller's local cursor (`_out`) and end pointer (`_limit`) back
 * into `buf`. Wrapped in do { } while (0) so that both assignments stay
 * together when the macro is used under a brace-less `if`/`else`; invoke it
 * with a trailing semicolon.
 */
#define MB_CONVERT_BUF_STORE(buf, _out, _limit) \
	do { \
		(buf)->out = (_out); \
		(buf)->limit = (_limit); \
	} while (0)
175
/*
 * Copy the write cursor and end pointer out of `buf` into the caller's local
 * variables `_out` and `_limit` (both must be lvalues). Wrapped in
 * do { } while (0) so that both assignments stay together when the macro is
 * used under a brace-less `if`/`else`; invoke it with a trailing semicolon.
 */
#define MB_CONVERT_BUF_LOAD(buf, _out, _limit) \
	do { \
		(_out) = (buf)->out; \
		(_limit) = (buf)->limit; \
	} while (0)
177
/*
 * Report `bad_cp` through `mb_illegal_output(bad_cp, conv_fn, buf)`.
 *
 * The caller's local `out` / `limit` are stored into `buf` before the call
 * and reloaded afterwards, so any change the call makes to `(buf)->out` or
 * `(buf)->limit` is picked up by the caller's locals.
 * NOTE(review): presumably `mb_illegal_output` may append to or reallocate
 * the buffer -- its definition is not in this header.
 *
 * Wrapped in do { } while (0) so the three steps stay together under a
 * brace-less `if`/`else`; invoke it with a trailing semicolon.
 */
#define MB_CONVERT_ERROR(buf, out, limit, bad_cp, conv_fn) \
	do { \
		MB_CONVERT_BUF_STORE(buf, out, limit); \
		mb_illegal_output(bad_cp, conv_fn, buf); \
		MB_CONVERT_BUF_LOAD(buf, out, limit); \
	} while (0)
182
/*
 * Store the single byte `c` at `out` and return the position just past it.
 * No bounds check is done here; the caller must already have made room.
 */
static inline unsigned char* mb_convert_buf_add(unsigned char *out, char c)
{
	out[0] = c;
	return out + 1;
}
188
/*
 * Store the two bytes `c1`, `c2` (in that order) at `out` and return the
 * position just past them. No bounds check is done here; the caller must
 * already have made room.
 */
static inline unsigned char* mb_convert_buf_add2(unsigned char *out, char c1, char c2)
{
	out[0] = c1;
	out[1] = c2;
	return out + 2;
}
195
/*
 * Store the three bytes `c1`, `c2`, `c3` (in that order) at `out` and return
 * the position just past them. No bounds check is done here; the caller must
 * already have made room.
 */
static inline unsigned char* mb_convert_buf_add3(unsigned char *out, char c1, char c2, char c3)
{
	out[0] = c1;
	out[1] = c2;
	out[2] = c3;
	return out + 3;
}
203
/*
 * Store the four bytes `c1` .. `c4` (in that order) at `out` and return the
 * position just past them. No bounds check is done here; the caller must
 * already have made room.
 */
static inline unsigned char* mb_convert_buf_add4(unsigned char *out, char c1, char c2, char c3, char c4)
{
	out[0] = c1;
	out[1] = c2;
	out[2] = c3;
	out[3] = c4;
	return out + 4;
}
212
/*
 * Copy the NUL-terminated string `s` to `out`, excluding the terminator, and
 * return the position just past the last byte written. No bounds check is
 * done here; the caller must already have made room for the whole string.
 */
static inline unsigned char* mb_convert_buf_appends(unsigned char *out, const char *s)
{
	for (; *s != '\0'; s++, out++) {
		*out = *s;
	}
	return out;
}
220
/*
 * Copy exactly `n` bytes from `s` to `out` (embedded NUL bytes included) and
 * return the position just past the last byte written. No bounds check is
 * done here; the caller must already have made room for `n` bytes.
 */
static inline unsigned char* mb_convert_buf_appendn(unsigned char *out, const char *s, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		out[i] = s[i];
	}
	return out + n;
}
228
mb_convert_buf_result_raw(mb_convert_buf * buf)229 static inline zend_string* mb_convert_buf_result_raw(mb_convert_buf *buf)
230 {
231 ZEND_ASSERT(buf->out <= buf->limit);
232 zend_string *ret = buf->str;
233 /* See `zend_string_alloc` in zend_string.h */
234 GC_SET_REFCOUNT(ret, 1);
235 GC_TYPE_INFO(ret) = GC_STRING;
236 ZSTR_H(ret) = 0;
237 ZSTR_LEN(ret) = buf->out - (unsigned char*)ZSTR_VAL(ret);
238 *(buf->out) = '\0';
239 return ret;
240 }
241
/*
 * Descriptor for one supported encoding.
 *
 * NOTE(review): the member notes below are based on member names and types
 * only; the tables that populate this struct are not in this header.
 */
typedef struct {
	enum mbfl_no_encoding no_encoding; /* numeric identifier of this encoding */
	const char *name;                  /* encoding name */
	const char *mime_name;             /* presumably the MIME charset name, if any */
	const char **aliases;              /* presumably a list of alternative names -- termination convention not visible here */
	const unsigned char *mblen_table;  /* presumably a byte-length lookup table -- confirm */
	unsigned int flag;                 /* flag bits; their meaning is defined elsewhere */
	const struct mbfl_convert_vtbl *input_filter;  /* filter table used for input */
	const struct mbfl_convert_vtbl *output_filter; /* filter table used for output */
	mb_to_wchar_fn to_wchar;           /* decoder callback (see `mb_to_wchar_fn`) */
	mb_from_wchar_fn from_wchar;       /* encoder callback (see `mb_from_wchar_fn`) */
	mb_check_fn check;                 /* validity-check callback (see `mb_check_fn`) */
} mbfl_encoding;

/* Defined elsewhere; declared here because `mb_convert_buf_result` compares
 * against its address. */
extern const mbfl_encoding mbfl_encoding_utf8;
257
mb_convert_buf_result(mb_convert_buf * buf,const mbfl_encoding * enc)258 static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl_encoding *enc)
259 {
260 zend_string *ret = mb_convert_buf_result_raw(buf);
261 if (enc == &mbfl_encoding_utf8 && buf->error_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
262 GC_ADD_FLAGS(ret, IS_STR_VALID_UTF8);
263 }
264 return ret;
265 }
266
267 /* Used if we initialize an `mb_convert_buf` but then discover we don't actually
268 * want to return `zend_string` */
mb_convert_buf_free(mb_convert_buf * buf)269 static inline void mb_convert_buf_free(mb_convert_buf *buf)
270 {
271 efree(buf->str);
272 }
273
/* Number of bytes written so far: the distance from the start of the
 * backing string's character data to the current write cursor. */
static inline size_t mb_convert_buf_len(mb_convert_buf *buf)
{
	unsigned char *start = (unsigned char*)ZSTR_VAL(buf->str);
	return (size_t)(buf->out - start);
}
278
/* Move the write cursor to `len` bytes past the start of the backing
 * string's character data. The new position must not exceed `buf->limit`
 * (checked with ZEND_ASSERT after the cursor has been updated). */
static inline void mb_convert_buf_reset(mb_convert_buf *buf, size_t len)
{
	unsigned char *start = (unsigned char*)ZSTR_VAL(buf->str);

	buf->out = start + len;
	ZEND_ASSERT(buf->out <= buf->limit);
}
284
/*
 * Encoding lookup API. The definitions are not in this header.
 * NOTE(review): the one-line descriptions below are inferred from the
 * function names and signatures -- confirm against the implementations,
 * in particular what is returned when nothing matches.
 */

/* Presumably looks up an encoding descriptor by name. */
MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name);
/* Presumably looks up an encoding descriptor by numeric identifier. */
MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding);
/* Returns an array of encoding descriptor pointers; termination convention not visible here. */
MBFLAPI extern const mbfl_encoding **mbfl_get_supported_encodings(void);
/* Presumably maps a numeric identifier to the encoding's name. */
MBFLAPI extern const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding);
/* Presumably returns the preferred MIME name for `encoding`. */
MBFLAPI extern const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding);
290
291 #endif /* MBFL_ENCODING_H */
292