1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this part: Marcus Boerger <helly@php.net>
22  *
23  */
/*
 * The source code included in this file was separated from mbfilter.c
 * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
 *
 */
29 
30 #include <string.h>
31 #include "mbfilter.h"
32 #include "mbfilter_htmlent.h"
33 #include "html_entities.h"
34 
/*
 * Classification of the code points 0x00-0xFF for the HTML entity encoder:
 *   0 - no special treatment
 *   1 - written as an entity reference (every value in 0x80-0xFF)
 *   2 - the markup characters '&', '<' and '>'
 * The encoder in this file only tests for the value 1, so entries marked 2
 * are currently forwarded unchanged, exactly like entries marked 0.
 */
static const int htmlentitifieds[256] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 2 /* & */, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2 /* < */, 0, 2 /* > */, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
53 
/* NULL-terminated list of alternative names accepted for this encoding. */
static const char *mbfl_encoding_html_ent_aliases[] = {
	"HTML",
	"html",
	NULL
};
55 
56 const mbfl_encoding mbfl_encoding_html_ent = {
57 	mbfl_no_encoding_html_ent,
58 	"HTML-ENTITIES",
59 	"HTML-ENTITIES",
60 	(const char *(*)[])&mbfl_encoding_html_ent_aliases,
61 	NULL,
62 	MBFL_ENCTYPE_GL_UNSAFE,
63 	&vtbl_html_wchar,
64 	&vtbl_wchar_html
65 };
66 
67 const struct mbfl_convert_vtbl vtbl_wchar_html = {
68 	mbfl_no_encoding_wchar,
69 	mbfl_no_encoding_html_ent,
70 	mbfl_filt_conv_common_ctor,
71 	NULL,
72 	mbfl_filt_conv_html_enc,
73 	mbfl_filt_conv_html_enc_flush,
74 	NULL,
75 };
76 
77 const struct mbfl_convert_vtbl vtbl_html_wchar = {
78 	mbfl_no_encoding_html_ent,
79 	mbfl_no_encoding_wchar,
80 	mbfl_filt_conv_html_dec_ctor,
81 	mbfl_filt_conv_html_dec_dtor,
82 	mbfl_filt_conv_html_dec,
83 	mbfl_filt_conv_html_dec_flush,
84 	mbfl_filt_conv_html_dec_copy,
85 };
86 
87 
/*
 * Evaluate `statement` once; if its value is negative, return -1 from the
 * enclosing function. Only usable inside functions returning int.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
89 
90 /*
91  * any => HTML
92  */
/*
 * Encoder step (wchar => HTML-ENTITIES): write the code point c through
 * filter->output_function, replacing it by an entity reference where needed.
 *
 * - A value below 256 whose htmlentitifieds[] entry is not 1 is forwarded
 *   unchanged.
 * - Any other value is written as "&name;" when mbfl_html_entity_list has an
 *   entry with that code, otherwise as the decimal reference "&#NNN;".
 *
 * Returns c, or -1 as soon as the output function returns a negative value.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const mbfl_html_entity_entry *entry;
	int digits[64];
	int *cursor;
	unsigned int rest;

	/*
	 * For this comparison c is converted to the unsigned type of the sizeof
	 * expression, so a negative c fails the range test and never indexes
	 * the table.
	 */
	if (c < sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]) &&
	    htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return c;
	}

	CK((*filter->output_function)('&', filter->data));

	/* search for a named entity; the list ends at the entry whose name is NULL */
	entry = mbfl_html_entity_list;
	while (entry->name != NULL && entry->code != c) {
		entry++;
	}

	if (entry->name != NULL) {
		const char *name;

		for (name = entry->name; *name != '\0'; name++) {
			CK((*filter->output_function)((int)*name, filter->data));
		}
	} else {
		/* no name known: decimal reference, digits are produced back to front */
		CK((*filter->output_function)('#', filter->data));

		rest = (unsigned int)c;
		cursor = digits + sizeof(digits) / sizeof(digits[0]);
		*(--cursor) = '\0';
		do {
			*(--cursor) = '0' + (int)(rest % 10);
			rest /= 10;
		} while (rest != 0);

		while (*cursor != '\0') {
			CK((*filter->output_function)(*cursor, filter->data));
			cursor++;
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return c;
}
138 
/*
 * Flush handler of the encoder. The encoder buffers nothing itself, so this
 * only resets status/opaque and forwards the flush to the next stage when a
 * flush_function is set. Always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		filter->flush_function(filter->data);
	}
	return 0;
}
150 
151 /*
152  * HTML => any
153  */
/*
 * Size bound for the decoder's entity buffer; the buffer itself is
 * allocated with one extra byte.
 */
#define html_enc_buffer_size 16

/* Characters that may appear between '&' and ';' in an entity reference. */
static const char html_entity_chars[] =
	"#"
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
156 
/*
 * Constructor of the decoder: start outside of any entity (status 0) and
 * allocate the buffer that collects a pending entity in filter->opaque.
 * The buffer is released again by mbfl_filt_conv_html_dec_dtor().
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	filter->status = 0;
	/* one byte more than html_enc_buffer_size is allocated */
	filter->opaque = emalloc(1 + html_enc_buffer_size);
}
162 
/*
 * Destructor of the decoder: release the entity buffer (if one is attached)
 * and leave the filter with status 0 and a NULL opaque pointer.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	filter->status = 0;

	if (filter->opaque != NULL) {
		efree((void*)filter->opaque);
	}
	filter->opaque = NULL;
}
172 
/*
 * Parse the digit run of a numeric character reference.
 *
 * digits, len: the characters between "&#" (or "&#x") and the closing ';'
 * base:        10 or 16
 *
 * Returns the code point, or -1 when the run is empty, contains a character
 * that is not a digit of the given base, or denotes a value above 0x10FFFF.
 * The range is checked after every digit, so the accumulator cannot overflow
 * an int no matter how many digits were buffered.
 */
static int mbfl_filt_conv_html_dec_number(const char *digits, int len, int base)
{
	int value = 0;
	int i;

	if (len <= 0) {
		return -1;
	}

	for (i = 0; i < len; i++) {
		int d = digits[i];

		if (d >= '0' && d <= '9') {
			d -= '0';
		} else if (base == 16 && d >= 'A' && d <= 'F') {
			d = d - 'A' + 10;
		} else if (base == 16 && d >= 'a' && d <= 'f') {
			d = d - 'a' + 10;
		} else {
			return -1;
		}

		value = value * base + d;
		if (value > 0x10FFFF) {
			return -1;
		}
	}

	return value;
}

/*
 * Decoder step (HTML-ENTITIES => wchar): consume one input character.
 *
 * filter->opaque is the entity buffer allocated by the constructor and
 * filter->status is the number of characters currently collected in it
 * (0 means "not inside an entity"; buffer[0] is always the '&').
 *
 * - Outside an entity, '&' starts collecting; everything else is forwarded.
 * - Inside an entity, ';' ends it: a valid numeric ("&#123;", "&#x7B;") or
 *   named reference is replaced by its code point, anything else is written
 *   out verbatim.
 * - Inside an entity, a character that cannot be part of a reference, a
 *   full buffer, or a misplaced '#' aborts collecting and the fragment is
 *   written out verbatim. A '&' in that position starts a new entity.
 *
 * Returns c, or -1 when a CK()-guarded output call returns a negative value.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos, ent = 0;
	const mbfl_html_entity_entry *entity;
	char *buffer = (char*)filter->opaque;

	if (!filter->status) {
		if (c == '&') {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
		return c;
	}

	if (c == ';') {
		/*
		 * buffer[1] has only been written for this entity when at least two
		 * characters were collected; for "&;" it would be a stale or
		 * uninitialized byte, so check the length first.
		 */
		if (filter->status > 1 && buffer[1] == '#') {
			/* numeric entity, hexadecimal when introduced by "&#x" / "&#X" */
			if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
				ent = mbfl_filt_conv_html_dec_number(buffer + 3, filter->status - 3, 16);
			} else {
				ent = mbfl_filt_conv_html_dec_number(buffer + 2, filter->status - 2, 10);
			}

			if (ent >= 0) {
				CK((*filter->output_function)(ent, filter->data));
			} else {
				/* not decodable: pass the reference through unchanged */
				for (pos = 0; pos < filter->status; pos++) {
					CK((*filter->output_function)(buffer[pos], filter->data));
				}
				CK((*filter->output_function)(c, filter->data));
			}
			filter->status = 0;
		} else {
			/* named entity: compare the text after '&' with the entity list */
			buffer[filter->status] = 0;
			for (entity = mbfl_html_entity_list; entity->name != NULL; entity++) {
				if (strcmp(buffer + 1, entity->name) == 0) {
					ent = entity->code;
					break;
				}
			}

			if (ent) {
				/* decoded */
				CK((*filter->output_function)(ent, filter->data));
				filter->status = 0;
			} else {
				/* unknown name: append the ';' and write the text out as is */
				buffer[filter->status++] = ';';
				buffer[filter->status] = 0;
				mbfl_filt_conv_html_dec_flush(filter);
			}
		}
		return c;
	}

	/* add character and check */
	buffer[filter->status++] = (char)c;

	/*
	 * strchr() also matches the terminating NUL of html_entity_chars, so a
	 * NUL input has to be rejected explicitly; otherwise it would be taken
	 * for an entity character and could be dropped from the output.
	 */
	if (c == 0 || !strchr(html_entity_chars, c) ||
	    filter->status + 1 == html_enc_buffer_size ||
	    (c == '#' && filter->status > 2)) {
		/* illegal character or end of buffer */
		if (c == '&') {
			/* the '&' is not part of the aborted fragment */
			filter->status--;
		}
		buffer[filter->status] = 0;
		mbfl_filt_conv_html_dec_flush(filter);
		if (c == '&') {
			/* the flush reset status to 0: the '&' opens a new entity */
			buffer[filter->status++] = '&';
		}
	}

	return c;
}
281 
/*
 * Flush handler of the decoder: write the filter->status characters that
 * are pending in the entity buffer to the output function unchanged, reset
 * status to 0 and forward the flush to the next stage when a flush_function
 * is set.
 *
 * Returns 0 when every output call returned 0, otherwise the last non-zero
 * value returned by the output function.
 * NOTE(review): CK() in this file treats only negative results as failures,
 * while this function records any non-zero result - confirm which contract
 * callers rely on.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *pending = (const unsigned char*)filter->opaque;
	int remaining = filter->status;
	int result = 0;
	int idx;

	/* reset first; the characters are written out from the saved count */
	filter->status = 0;

	for (idx = 0; remaining != 0; remaining--, idx++) {
		int rc = (*filter->output_function)(pending[idx], filter->data);

		if (rc != 0) {
			result = rc;
		}
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return result;
}
305 
/*
 * Copy hook of the decoder: dest becomes a member-wise copy of src, but
 * receives its own freshly allocated entity buffer holding the same bytes,
 * so the two filters do not share filter->opaque.
 * NOTE(review): whatever dest->opaque pointed to before the call is
 * overwritten without being freed - confirm callers pass an unconstructed
 * dest.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	const int buffer_bytes = html_enc_buffer_size + 1;

	*dest = *src;
	dest->opaque = emalloc(buffer_bytes);
	memcpy(dest->opaque, src->opaque, buffer_bytes);
}
312