1 /*
2  * "streamable kanji code filter and converter"
3  * Copyright (c) 1998-2002 HappySize, Inc. All rights reserved.
4  *
5  * LICENSE NOTICES
6  *
7  * This file is part of "streamable kanji code filter and converter",
8  * which is distributed under the terms of GNU Lesser General Public
9  * License (version 2) as published by the Free Software Foundation.
10  *
11  * This software is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with "streamable kanji code filter and converter";
18  * if not, write to the Free Software Foundation, Inc., 59 Temple Place,
19  * Suite 330, Boston, MA  02111-1307  USA
20  *
21  * The author of this part: Marcus Boerger <helly@php.net>
22  *
23  */
24 /*
 * The source code included in this file was separated from mbfilter.c
26  * by moriyoshi koizumi <moriyoshi@php.net> on 4 dec 2002.
27  *
28  */
29 
30 #ifdef HAVE_CONFIG_H
31 #include "config.h"
32 #endif
33 
34 #ifdef HAVE_STRING_H
35 #include <string.h>
36 #endif
37 
38 #ifdef HAVE_STRINGS_H
39 #include <strings.h>
40 #endif
41 
42 #include "mbfilter.h"
43 #include "mbfilter_htmlent.h"
44 #include "html_entities.h"
45 
/*
 * Encoding class for each of the first 256 code points, indexed by code
 * point value:
 *   0 - nothing special
 *   1 - every code point from 0x80 to 0xFF
 *   2 - '&' (0x26), '<' (0x3C) and '>' (0x3E)
 * The encoder in this file only tests for the value 1, so entries marked 0
 * or 2 are both written through unchanged.
 */
static const int htmlentitifieds[256] = {
	/* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x20 */ 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,
	/* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xA0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xB0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xC0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xD0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xE0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 0xF0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
64 
/* NULL-terminated list of alias names referenced by mbfl_encoding_html_ent. */
static const char *mbfl_encoding_html_ent_aliases[] = {
	"HTML",
	"html",
	NULL
};
66 
67 const mbfl_encoding mbfl_encoding_html_ent = {
68 	mbfl_no_encoding_html_ent,
69 	"HTML-ENTITIES",
70 	"HTML-ENTITIES",
71 	(const char *(*)[])&mbfl_encoding_html_ent_aliases,
72 	NULL,
73 	MBFL_ENCTYPE_ENC_STRM | MBFL_ENCTYPE_GL_UNSAFE,
74 	&vtbl_html_wchar,
75 	&vtbl_wchar_html
76 };
77 
78 const struct mbfl_convert_vtbl vtbl_wchar_html = {
79 	mbfl_no_encoding_wchar,
80 	mbfl_no_encoding_html_ent,
81 	mbfl_filt_conv_common_ctor,
82 	mbfl_filt_conv_common_dtor,
83 	mbfl_filt_conv_html_enc,
84 	mbfl_filt_conv_html_enc_flush
85 };
86 
87 const struct mbfl_convert_vtbl vtbl_html_wchar = {
88 	mbfl_no_encoding_html_ent,
89 	mbfl_no_encoding_wchar,
90 	mbfl_filt_conv_html_dec_ctor,
91 	mbfl_filt_conv_html_dec_dtor,
92 	mbfl_filt_conv_html_dec,
93 	mbfl_filt_conv_html_dec_flush,
94 	mbfl_filt_conv_html_dec_copy };
95 
96 
/*
 * Evaluate `statement` once and make the enclosing function return -1 when
 * the result is negative.  Only usable inside functions returning int.
 */
#define CK(statement) \
	do { \
		if ((statement) < 0) { \
			return (-1); \
		} \
	} while (0)
98 
99 /*
100  * any => HTML
101  */
/*
 * Encode one code point as HTML-ENTITIES output.
 *
 * Code points below 256 whose htmlentitifieds[] entry is not 1 are written
 * through unchanged.  Everything else is written as "&name;" when
 * mbfl_html_entity_list has an entry with that code, and as a decimal
 * reference "&#NNN;" otherwise.
 *
 * Returns c, or -1 as soon as filter->output_function reports a negative
 * result.
 */
int mbfl_filt_conv_html_enc(int c, mbfl_convert_filter *filter)
{
	const unsigned int table_len =
		(unsigned int)(sizeof(htmlentitifieds) / sizeof(htmlentitifieds[0]));
	const mbfl_html_entity_entry *entry;
	const char *name = NULL;

	/* A negative c becomes a huge unsigned value and fails the range test. */
	if ((unsigned int)c < table_len && htmlentitifieds[c] != 1) {
		CK((*filter->output_function)(c, filter->data));
		return c;
	}

	CK((*filter->output_function)('&', filter->data));

	/* The entity list is terminated by an entry whose name is NULL. */
	for (entry = mbfl_html_entity_list; entry->name != NULL; entry++) {
		if (entry->code == c) {
			name = entry->name;
			break;
		}
	}

	if (name != NULL) {
		for (; *name != '\0'; name++) {
			CK((*filter->output_function)((int)*name, filter->data));
		}
	} else {
		/* Three decimal digits per byte is always enough room. */
		char digits[3 * sizeof(unsigned int)];
		int ndigits = 0;
		unsigned int value = (unsigned int)c;

		CK((*filter->output_function)('#', filter->data));

		/* Digits come out least significant first; replay them reversed. */
		do {
			digits[ndigits++] = (char)('0' + value % 10);
			value /= 10;
		} while (value != 0);

		while (ndigits > 0) {
			CK((*filter->output_function)(digits[--ndigits], filter->data));
		}
	}

	CK((*filter->output_function)(';', filter->data));
	return c;
}
147 
/*
 * Flush the encoder: clear the filter's status and opaque pointer, then
 * invoke filter->flush_function when one is set.  Always returns 0.
 */
int mbfl_filt_conv_html_enc_flush(mbfl_convert_filter *filter)
{
	filter->opaque = NULL;
	filter->status = 0;

	if (filter->flush_function) {
		(*filter->flush_function)(filter->data);
	}

	return 0;
}
159 
160 /*
161  * HTML => any
162  */
/*
 * Size bound for the decoder's entity buffer; the buffer itself is
 * allocated with one extra byte (html_enc_buffer_size + 1).
 */
#define html_enc_buffer_size	16

/* Characters the decoder accepts between '&' and ';'. */
static const char html_entity_chars[] =
	"#"
	"0123456789"
	"abcdefghijklmnopqrstuvwxyz"
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ";
165 
/*
 * Initialize a decoder filter: allocate the entity buffer
 * (html_enc_buffer_size + 1 bytes) into filter->opaque and start with
 * status 0.  The allocation result is not checked here.
 */
void mbfl_filt_conv_html_dec_ctor(mbfl_convert_filter *filter)
{
	filter->opaque = mbfl_malloc(html_enc_buffer_size + 1);
	filter->status = 0;
}
171 
/*
 * Tear down a decoder filter: release the entity buffer held in
 * filter->opaque (if any) and leave the filter with status 0 and a NULL
 * opaque pointer.
 */
void mbfl_filt_conv_html_dec_dtor(mbfl_convert_filter *filter)
{
	void *entity_buffer = (void*)filter->opaque;

	filter->status = 0;
	filter->opaque = NULL;

	if (entity_buffer) {
		mbfl_free(entity_buffer);
	}
}
181 
/*
 * Convert the digits in buffer[start..end) of a numeric character reference
 * to a code point, reading them in the given base (10 or 16).
 *
 * Returns the code point, or -1 when the range is empty, contains a
 * character that is not a digit of that base, or denotes a value above
 * 0x10FFFF.  The range check runs after every digit, so the accumulator can
 * never overflow an int no matter how many digits were buffered.
 */
static int mbfl_html_dec_numeric(const char *buffer, int start, int end, int base)
{
	int pos, value = 0;

	if (start >= end) {
		return -1;
	}

	for (pos = start; pos < end; pos++) {
		int digit = buffer[pos];

		if (digit >= '0' && digit <= '9') {
			digit -= '0';
		} else if (base == 16 && digit >= 'A' && digit <= 'F') {
			digit = digit - 'A' + 10;
		} else if (base == 16 && digit >= 'a' && digit <= 'f') {
			digit = digit - 'a' + 10;
		} else {
			return -1;
		}

		value = value * base + digit;
		if (value > 0x10FFFF) {
			return -1;
		}
	}

	return value;
}

/*
 * Decode one input character of HTML-ENTITIES text.
 *
 * filter->status is the number of characters of a pending entity held in
 * the buffer at filter->opaque (0 means no entity is pending; buffer[0] is
 * always the '&').  Characters outside an entity are written straight to
 * filter->output_function.  On ';' the pending text is resolved:
 *   - "&#NNN;" and "&#xHHH;" are written as the code point they name,
 *     provided the value does not exceed 0x10FFFF;
 *   - "&name;" is written as the code of the matching entry in
 *     mbfl_html_entity_list;
 *   - anything that cannot be resolved is written back out unchanged.
 * A character not in html_entity_chars, a '#' anywhere but directly after
 * the '&', or a full buffer ends the pending entity and the buffered text
 * is written out unchanged via mbfl_filt_conv_html_dec_flush() (which also
 * invokes the downstream flush function).
 *
 * Returns c, or -1 when a direct call to filter->output_function reports a
 * negative result.
 */
int mbfl_filt_conv_html_dec(int c, mbfl_convert_filter *filter)
{
	int pos, ent = 0;
	const mbfl_html_entity_entry *entity;
	char *buffer = (char*)filter->opaque;

	if (!filter->status) {
		if (c == '&') {
			filter->status = 1;
			buffer[0] = '&';
		} else {
			CK((*filter->output_function)(c, filter->data));
		}
	} else {
		if (c == ';') {
			/* Only look at buffer[1] once something was stored there. */
			if (filter->status > 1 && buffer[1] == '#') {
				/* numeric entity */
				if (filter->status > 2 && (buffer[2] == 'x' || buffer[2] == 'X')) {
					ent = mbfl_html_dec_numeric(buffer, 3, filter->status, 16);
				} else {
					ent = mbfl_html_dec_numeric(buffer, 2, filter->status, 10);
				}
				if (ent >= 0) {
					CK((*filter->output_function)(ent, filter->data));
				} else {
					/* not a valid reference: pass the text through */
					for (pos = 0; pos < filter->status; pos++) {
						CK((*filter->output_function)((unsigned char)buffer[pos], filter->data));
					}
					CK((*filter->output_function)(c, filter->data));
				}
				filter->status = 0;
			} else {
				/* named entity */
				buffer[filter->status] = 0;
				for (entity = mbfl_html_entity_list; entity->name; entity++) {
					if (!strcmp(buffer + 1, entity->name)) {
						ent = entity->code;
						break;
					}
				}
				if (ent) {
					/* decoded */
					CK((*filter->output_function)(ent, filter->data));
					filter->status = 0;
				} else {
					/* failure: emit the text including the ';' */
					buffer[filter->status++] = ';';
					buffer[filter->status] = 0;
					mbfl_filt_conv_html_dec_flush(filter);
				}
			}
		} else {
			/* add character and check */
			buffer[filter->status++] = (char)c;
			if (!strchr(html_entity_chars, c) || filter->status + 1 == html_enc_buffer_size || (c == '#' && filter->status > 2)) {
				/* illegal character or end of buffer */
				if (c == '&') {
					/* keep the '&' out of the flushed text ... */
					filter->status--;
				}
				buffer[filter->status] = 0;
				mbfl_filt_conv_html_dec_flush(filter);
				if (c == '&') {
					/* ... because it starts a new pending entity */
					buffer[filter->status++] = '&';
				}
			}
		}
	}
	return c;
}
290 
/*
 * Flush the decoder: write any pending entity text (the first
 * filter->status bytes of the buffer at filter->opaque) to
 * filter->output_function unchanged, reset status to 0, and then invoke
 * filter->flush_function when one is set.
 *
 * Returns 0 on success, or the last negative result reported by
 * filter->output_function.  Non-negative results are successes: filter
 * functions in this file return the character they were given, and CK()
 * treats only negative values as failures.
 */
int mbfl_filt_conv_html_dec_flush(mbfl_convert_filter *filter)
{
	const unsigned char *buffer = (const unsigned char*)filter->opaque;
	int pending = filter->status;
	int pos;
	int err = 0;

	/* Reset first so the filter is idle whatever the output calls do. */
	filter->status = 0;

	/* flush fragments */
	for (pos = 0; pos < pending; pos++) {
		int e = (*filter->output_function)(buffer[pos], filter->data);
		if (e < 0) {
			err = e;
		}
	}

	if (filter->flush_function != NULL) {
		(*filter->flush_function)(filter->data);
	}

	return err;
}
314 
/*
 * Copy a decoder filter: dest receives a member-wise copy of src and then
 * its own freshly allocated entity buffer holding the same
 * html_enc_buffer_size + 1 bytes as src's buffer.  The allocation result
 * is not checked here.
 */
void mbfl_filt_conv_html_dec_copy(mbfl_convert_filter *src, mbfl_convert_filter *dest)
{
	const int nbytes = html_enc_buffer_size + 1;

	*dest = *src;

	/* Replace the shared pointer so the two filters do not alias. */
	dest->opaque = mbfl_malloc(nbytes);
	memcpy(dest->opaque, src->opaque, nbytes);
}
321