1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this file:
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter.h
 * by Moriyoshi Koizumi <moriyoshi@php.net> on 20 Dec 2002. The file
 * mbfilter.h is included in this package.
28  *
29  */
30 
31 #ifndef MBFL_ENCODING_H
32 #define MBFL_ENCODING_H
33 
34 #include "mbfl_defs.h"
35 #include "mbfl_consts.h"
36 #include "zend.h"
37 
/* Numeric identifiers for every encoding known to this library.
 *
 * Apart from the `invalid` sentinel, the values are assigned sequentially
 * starting at zero, so the order of the enumerators below is significant:
 * inserting or reordering entries renumbers everything after them. */
enum mbfl_no_encoding {
	/* Sentinel meaning "not an encoding" */
	mbfl_no_encoding_invalid = -1,

	/* Entries preceding the `charset_min` marker */
	mbfl_no_encoding_pass = 0,
	mbfl_no_encoding_wchar,
	mbfl_no_encoding_base64,
	mbfl_no_encoding_uuencode,
	mbfl_no_encoding_html_ent,
	mbfl_no_encoding_qprint,
	mbfl_no_encoding_7bit,
	mbfl_no_encoding_8bit,

	/* Marker; presumably the lower bound of the character set range
	 * (confirm against the code which uses it) */
	mbfl_no_encoding_charset_min,

	mbfl_no_encoding_ucs4,
	mbfl_no_encoding_ucs4be,
	mbfl_no_encoding_ucs4le,
	mbfl_no_encoding_ucs2,
	mbfl_no_encoding_ucs2be,
	mbfl_no_encoding_ucs2le,
	mbfl_no_encoding_utf32,
	mbfl_no_encoding_utf32be,
	mbfl_no_encoding_utf32le,
	mbfl_no_encoding_utf16,
	mbfl_no_encoding_utf16be,
	mbfl_no_encoding_utf16le,
	mbfl_no_encoding_utf8,
	mbfl_no_encoding_utf8_docomo,
	mbfl_no_encoding_utf8_kddi_a,
	mbfl_no_encoding_utf8_kddi_b,
	mbfl_no_encoding_utf8_sb,
	mbfl_no_encoding_utf7,
	mbfl_no_encoding_utf7imap,
	mbfl_no_encoding_ascii,
	mbfl_no_encoding_euc_jp,
	mbfl_no_encoding_eucjp2004,
	mbfl_no_encoding_sjis,
	mbfl_no_encoding_eucjp_win,
	mbfl_no_encoding_sjis_docomo,
	mbfl_no_encoding_sjis_kddi,
	mbfl_no_encoding_sjis_sb,
	mbfl_no_encoding_sjis_mac,
	mbfl_no_encoding_sjis2004,
	mbfl_no_encoding_cp932,
	mbfl_no_encoding_sjiswin,
	mbfl_no_encoding_cp51932,
	mbfl_no_encoding_jis,
	mbfl_no_encoding_2022jp,
	mbfl_no_encoding_2022jp_2004,
	mbfl_no_encoding_2022jp_kddi,
	mbfl_no_encoding_2022jpms,
	mbfl_no_encoding_gb18030,
	mbfl_no_encoding_gb18030_2022,
	mbfl_no_encoding_cp1252,
	mbfl_no_encoding_cp1254,
	mbfl_no_encoding_8859_1,
	mbfl_no_encoding_8859_2,
	mbfl_no_encoding_8859_3,
	mbfl_no_encoding_8859_4,
	mbfl_no_encoding_8859_5,
	mbfl_no_encoding_8859_6,
	mbfl_no_encoding_8859_7,
	mbfl_no_encoding_8859_8,
	mbfl_no_encoding_8859_9,
	mbfl_no_encoding_8859_10,
	mbfl_no_encoding_8859_13,
	mbfl_no_encoding_8859_14,
	mbfl_no_encoding_8859_15,
	mbfl_no_encoding_euc_cn,
	mbfl_no_encoding_cp936,
	mbfl_no_encoding_euc_tw,
	mbfl_no_encoding_big5,
	mbfl_no_encoding_cp950,
	mbfl_no_encoding_euc_kr,
	mbfl_no_encoding_2022kr,
	mbfl_no_encoding_uhc,
	mbfl_no_encoding_hz,
	mbfl_no_encoding_cp1251,
	mbfl_no_encoding_cp866,
	mbfl_no_encoding_koi8r,
	mbfl_no_encoding_koi8u,
	mbfl_no_encoding_8859_16,
	mbfl_no_encoding_armscii8,
	mbfl_no_encoding_cp850,
	mbfl_no_encoding_cp50220,
	mbfl_no_encoding_cp50221,
	mbfl_no_encoding_cp50222,

	/* Marker; presumably the upper bound of the character set range
	 * (confirm against the code which uses it) */
	mbfl_no_encoding_charset_max
};
124 
/* Forward declaration only; the filter object itself is defined outside this header. */
struct _mbfl_convert_filter;

/* Table of callbacks for one conversion filter, tagged with a pair of encoding
 * identifiers.
 * NOTE(review): the per-member descriptions below are inferred from the member
 * names and signatures; confirm them against the filter implementations. */
struct mbfl_convert_vtbl {
	/* Encoding identifiers; presumably the encoding converted from and the one converted to */
	enum mbfl_no_encoding from;
	enum mbfl_no_encoding to;
	/* Presumably called when a filter is set up and when it is torn down */
	void (*filter_ctor)(struct _mbfl_convert_filter *filter);
	void (*filter_dtor)(struct _mbfl_convert_filter *filter);
	/* Takes one int `c` plus the filter; presumably processes a single input unit */
	int (*filter_function)(int c, struct _mbfl_convert_filter *filter);
	/* Presumably emits whatever output is still pending in the filter */
	int (*filter_flush)(struct _mbfl_convert_filter *filter);
	/* Presumably copies the state of `src` into `dest` */
	void (*filter_copy)(struct _mbfl_convert_filter *src, struct _mbfl_convert_filter *dest);
};
135 
/* Growable output buffer used while building a `zend_string`.
 *
 * The bytes are written straight into the character storage of `str`. The helper
 * functions and macros in this header advance `out`, grow the allocation when
 * `out` gets too close to `limit`, and finally turn `str` into a finished string. */
typedef struct {
	/* Next position to write to (starts at the beginning of the string storage) */
	unsigned char *out;
	/* End of the usable capacity; `out` must never be advanced past this */
	unsigned char *limit;
	/* Zeroed by `mb_convert_buf_init`; presumably scratch state for the converter in use */
	uint32_t state;
	/* Zeroed by `mb_convert_buf_init`; presumably a count of conversion errors */
	uint32_t errors;
	/* Set from the `repl_char` argument of `mb_convert_buf_init` */
	uint32_t replacement_char;
	/* Set from the `err_mode` argument of `mb_convert_buf_init`; compared against
	 * MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8 by `mb_convert_buf_result` */
	unsigned int error_mode;
	/* Backing allocation; its header fields are only filled in by `mb_convert_buf_result_raw` */
	zend_string *str;
} mb_convert_buf;
145 
/* Function pointer types stored in `mbfl_encoding`.
 * NOTE(review): the descriptions are inferred from the names and signatures only;
 * confirm them against the per-encoding implementations. */

/* Decoder: takes an input pointer and remaining length (both by reference, so
 * presumably updated as input is consumed), an output array of `out_len` wchars,
 * and a caller-held `state`. Presumably returns the number of wchars written. */
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
/* Encoder: takes `in_len` wchars and an `mb_convert_buf` to write into.
 * `end` presumably marks the final chunk of input. */
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
/* Presumably reports whether the `in_len` bytes at `in` are valid in the encoding */
typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len);
/* Presumably extracts part of a string without splitting a multi-byte character;
 * the exact meaning of `from`, `len` and `end` is not visible from this header */
typedef zend_string* (*mb_cut_fn)(unsigned char *str, size_t from, size_t len, unsigned char *end);
150 
/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
 * the buffer must be at least this size (to work with all supported text encodings).
 * NOTE(review): the unit is presumably wchar elements (uint32_t), matching the `out_len`
 * parameter of `mb_to_wchar_fn`; confirm against the converters. */
#define MBSTRING_MIN_WCHAR_BUFSIZE 5
154 
/* Set up `buf` with freshly allocated storage for `initsize` bytes of output.
 *
 * The write cursor starts at the beginning of the storage, the state and error
 * counters start at zero, and the replacement character and error mode are
 * recorded as given. The header fields of the underlying string are NOT set
 * here; that happens in `mb_convert_buf_result_raw`. */
static inline void mb_convert_buf_init(mb_convert_buf *buf, size_t initsize, uint32_t repl_char, unsigned int err_mode)
{
	unsigned char *start;

	buf->errors = 0;
	buf->state = 0;

	buf->str = emalloc(_ZSTR_STRUCT_SIZE(initsize));
	start = (unsigned char*)ZSTR_VAL(buf->str);
	buf->out = start;
	buf->limit = start + initsize;

	buf->replacement_char = repl_char;
	buf->error_mode = err_mode;
}
164 
/* Make sure at least `needed` bytes can be written at `out` before reaching `limit`,
 * growing the allocation behind `(buf)->str` if necessary.
 *
 * `out` and `limit` must be modifiable lvalues (the caller's working copies of the
 * cursor and the end-of-capacity pointer); they are re-pointed into the new
 * allocation when the buffer grows. `(buf)->out` and `(buf)->limit` are NOT
 * updated here; use MB_CONVERT_BUF_STORE for that.
 *
 * When growing, the capacity increases by the larger of `needed` and half the
 * old capacity.
 *
 * The expansion is a single statement, so the macro is safe as the body of an
 * unbraced `if`/`else`. The arguments are evaluated more than once; do not pass
 * expressions with side effects. */
#define MB_CONVERT_BUF_ENSURE(buf, out, limit, needed) \
	do { \
		ZEND_ASSERT((out) <= (limit)); \
		if (((limit) - (out)) < (needed)) { \
			size_t oldsize = (limit) - (unsigned char*)ZSTR_VAL((buf)->str); \
			size_t usedsize = (out) - (unsigned char*)ZSTR_VAL((buf)->str); \
			size_t newsize = oldsize + MAX(oldsize >> 1, (needed)); \
			zend_string *newstr = erealloc((buf)->str, _ZSTR_STRUCT_SIZE(newsize)); \
			(out) = (unsigned char*)ZSTR_VAL(newstr) + usedsize; \
			(limit) = (unsigned char*)ZSTR_VAL(newstr) + newsize; \
			(buf)->str = newstr; \
		} \
	} while (0)
175 
/* Write the caller's working cursor `_out` and end-of-capacity pointer `_limit`
 * back into `buf`. Expands to a single statement, so it is safe as the body of
 * an unbraced `if`/`else`. */
#define MB_CONVERT_BUF_STORE(buf, _out, _limit) \
	do { \
		(buf)->out = (_out); \
		(buf)->limit = (_limit); \
	} while (0)
177 
/* Copy the cursor and end-of-capacity pointer held in `buf` into the caller's
 * working variables `_out` and `_limit` (both must be modifiable lvalues).
 * Expands to a single statement, so it is safe as the body of an unbraced
 * `if`/`else`. */
#define MB_CONVERT_BUF_LOAD(buf, _out, _limit) \
	do { \
		(_out) = (buf)->out; \
		(_limit) = (buf)->limit; \
	} while (0)
179 
/* Report a codepoint which cannot be represented in the output.
 *
 * The caller's working pointers are first written back to `buf`, then
 * `mb_illegal_output` (declared elsewhere) is called with `bad_cp`, `conv_fn`
 * and `buf`, and finally the working pointers are reloaded from `buf`. They are
 * reloaded because the call receives `buf` and may therefore change its cursor
 * or storage.
 *
 * Expands to a single statement, so it is safe as the body of an unbraced
 * `if`/`else`. */
#define MB_CONVERT_ERROR(buf, out, limit, bad_cp, conv_fn) \
	do { \
		MB_CONVERT_BUF_STORE(buf, out, limit); \
		mb_illegal_output(bad_cp, conv_fn, buf); \
		MB_CONVERT_BUF_LOAD(buf, out, limit); \
	} while (0)
184 
/* Write the single byte `c` at `out` and return the position just after it.
 * No bounds check is done; the caller must have made room for one byte. */
static inline unsigned char* mb_convert_buf_add(unsigned char *out, char c)
{
	out[0] = c;
	return out + 1;
}
190 
/* Write the two bytes `c1`, `c2` (in that order) at `out` and return the position
 * just after them. No bounds check is done; the caller must have made room. */
static inline unsigned char* mb_convert_buf_add2(unsigned char *out, char c1, char c2)
{
	out[0] = c1;
	out[1] = c2;
	return out + 2;
}
197 
/* Write the three bytes `c1`, `c2`, `c3` (in that order) at `out` and return the
 * position just after them. No bounds check is done; the caller must have made room. */
static inline unsigned char* mb_convert_buf_add3(unsigned char *out, char c1, char c2, char c3)
{
	out[0] = c1;
	out[1] = c2;
	out[2] = c3;
	return out + 3;
}
205 
/* Write the four bytes `c1` .. `c4` (in that order) at `out` and return the
 * position just after them. No bounds check is done; the caller must have made room. */
static inline unsigned char* mb_convert_buf_add4(unsigned char *out, char c1, char c2, char c3, char c4)
{
	out[0] = c1;
	out[1] = c2;
	out[2] = c3;
	out[3] = c4;
	return out + 4;
}
214 
/* Copy the NUL-terminated string `s` to `out`, without its terminator, and return
 * the position just after the last byte written. No bounds check is done; the
 * caller must have made room for the whole string. */
static inline unsigned char* mb_convert_buf_appends(unsigned char *out, const char *s)
{
	for (; *s != '\0'; s++, out++) {
		*out = *s;
	}
	return out;
}
222 
/* Copy exactly `n` bytes from `s` to `out` (NUL bytes included) and return the
 * position just after the last byte written. No bounds check is done; the caller
 * must have made room for `n` bytes. */
static inline unsigned char* mb_convert_buf_appendn(unsigned char *out, const char *s, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		out[i] = s[i];
	}
	return out + n;
}
230 
mb_convert_buf_result_raw(mb_convert_buf * buf)231 static inline zend_string* mb_convert_buf_result_raw(mb_convert_buf *buf)
232 {
233 	ZEND_ASSERT(buf->out <= buf->limit);
234 	zend_string *ret = buf->str;
235 	/* See `zend_string_alloc` in zend_string.h */
236 	GC_SET_REFCOUNT(ret, 1);
237 	GC_TYPE_INFO(ret) = GC_STRING;
238 	ZSTR_H(ret) = 0;
239 	ZSTR_LEN(ret) = buf->out - (unsigned char*)ZSTR_VAL(ret);
240 	*(buf->out) = '\0';
241 	return ret;
242 }
243 
/* Descriptor for one supported encoding: identity, lookup names and the
 * functions and filter tables used to process text in that encoding.
 * NOTE(review): member descriptions marked "presumably" are inferred from the
 * names and types; confirm them against the encoding definitions. */
typedef struct {
	/* Numeric identifier of this encoding */
	enum mbfl_no_encoding no_encoding;
	/* Name of the encoding */
	const char *name;
	/* Presumably the MIME charset name; may differ from `name` */
	const char *mime_name;
	/* Array of alternative names; presumably NULL-terminated (verify before iterating) */
	const char **aliases;
	/* Presumably a lookup table giving character byte lengths, indexed by byte value */
	const unsigned char *mblen_table;
	/* Presumably a bit set of encoding properties; the flag values are not defined in this header */
	unsigned int flag;
	/* Callback tables for the filter-based conversion interface */
	const struct mbfl_convert_vtbl *input_filter;
	const struct mbfl_convert_vtbl *output_filter;
	/* Buffer-based conversion functions; see the typedefs for their signatures */
	mb_to_wchar_fn to_wchar;
	mb_from_wchar_fn from_wchar;
	mb_check_fn check;
	mb_cut_fn cut;
} mbfl_encoding;
258 
/* Descriptor for UTF-8, defined outside this header. It is declared here because
 * `mb_convert_buf_result` compares its `enc` argument against this object's address. */
extern const mbfl_encoding mbfl_encoding_utf8;
260 
mb_convert_buf_result(mb_convert_buf * buf,const mbfl_encoding * enc)261 static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl_encoding *enc)
262 {
263 	zend_string *ret = mb_convert_buf_result_raw(buf);
264 	if (enc == &mbfl_encoding_utf8 && buf->error_mode != MBFL_OUTPUTFILTER_ILLEGAL_MODE_BADUTF8) {
265 		GC_ADD_FLAGS(ret, IS_STR_VALID_UTF8);
266 	}
267 	return ret;
268 }
269 
270 /* Used if we initialize an `mb_convert_buf` but then discover we don't actually
271  * want to return `zend_string` */
mb_convert_buf_free(mb_convert_buf * buf)272 static inline void mb_convert_buf_free(mb_convert_buf *buf)
273 {
274 	efree(buf->str);
275 }
276 
/* Number of bytes written to `buf` so far, i.e. the distance from the start of
 * the string storage to the write cursor stored in `buf->out` */
static inline size_t mb_convert_buf_len(mb_convert_buf *buf)
{
	unsigned char *start = (unsigned char*)ZSTR_VAL(buf->str);
	return buf->out - start;
}
281 
/* Move the write cursor of `buf` to `len` bytes from the start of the string
 * storage. The new position must not lie beyond `buf->limit` (asserted). */
static inline void mb_convert_buf_reset(mb_convert_buf *buf, size_t len)
{
	unsigned char *start = (unsigned char*)ZSTR_VAL(buf->str);

	buf->out = start + len;
	ZEND_ASSERT(buf->out <= buf->limit);
}
287 
/* Encoding lookup functions, implemented outside this header.
 * NOTE(review): the descriptions below are inferred from the names and
 * signatures; confirm return values for unknown names/identifiers (presumably
 * NULL) against the implementation. */

/* Presumably finds an encoding descriptor by name, given as a NUL-terminated string */
MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name);
/* Same lookup, but the name is given with an explicit length */
MBFLAPI extern const mbfl_encoding *mbfl_name2encoding_ex(const char *name, size_t name_len);
/* Presumably finds an encoding descriptor by its numeric identifier */
MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding);
/* Presumably returns the list of all encoding descriptors; how the list is terminated is not visible here */
MBFLAPI extern const mbfl_encoding **mbfl_get_supported_encodings(void);
/* Presumably returns the name of the encoding with the given numeric identifier */
MBFLAPI extern const char *mbfl_no_encoding2name(enum mbfl_no_encoding no_encoding);
/* Presumably returns the MIME name to prefer for `encoding` */
MBFLAPI extern const char *mbfl_encoding_preferred_mime_name(const mbfl_encoding *encoding);
294 
295 #endif /* MBFL_ENCODING_H */
296