1 /*
2 zip_utf-8.c -- UTF-8 support functions for libzip
3 Copyright (C) 2011-2012 Dieter Baron and Thomas Klausner
4
5 This file is part of libzip, a library to manipulate ZIP archives.
6 The authors can be contacted at <libzip@nih.at>
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions
10 are met:
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13 2. Redistributions in binary form must reproduce the above copyright
14 notice, this list of conditions and the following disclaimer in
15 the documentation and/or other materials provided with the
16 distribution.
17 3. The names of the authors may not be used to endorse or promote
18 products derived from this software without specific prior
19 written permission.
20
21 THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS
22 OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY
25 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
27 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
29 IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
30 OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
31 IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34
35
36 #include "zipint.h"
37
38 #include <stdlib.h>
39
40
41
42 static const zip_uint16_t _cp437_to_unicode[256] = {
43 /* 0x00 - 0x0F */
44 0x2007, 0x263A, 0x263B, 0x2665, 0x2666, 0x2663, 0x2660, 0x2022,
45 0x25D8, 0x25CB, 0x25D9, 0x2642, 0x2640, 0x266A, 0x266B, 0x263C,
46
47 /* 0x10 - 0x1F */
48 0x25BA, 0x25C4, 0x2195, 0x203C, 0x00B6, 0x00A7, 0x25AC, 0x21A8,
49 0x2191, 0x2193, 0x2192, 0x2190, 0x221F, 0x2194, 0x25B2, 0x25BC,
50
51 /* 0x20 - 0x2F */
52 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027,
53 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F,
54
55 /* 0x30 - 0x3F */
56 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
57 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F,
58
59 /* 0x40 - 0x4F */
60 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
61 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F,
62
63 /* 0x50 - 0x5F */
64 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
65 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F,
66
67 /* 0x60 - 0x6F */
68 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067,
69 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F,
70
71 /* 0x70 - 0x7F */
72 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077,
73 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x2302,
74
75 /* 0x80 - 0x8F */
76 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
77 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
78
79 /* 0x90 - 0x9F */
80 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
81 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x20A7, 0x0192,
82
83 /* 0xA0 - 0xAF */
84 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
85 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
86
87 /* 0xB0 - 0xBF */
88 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556,
89 0x2555, 0x2563, 0x2551, 0x2557, 0x255D, 0x255C, 0x255B, 0x2510,
90
91 /* 0xC0 - 0xCF */
92 0x2514, 0x2534, 0x252C, 0x251C, 0x2500, 0x253C, 0x255E, 0x255F,
93 0x255A, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256C, 0x2567,
94
95 /* 0xD0 - 0xDF */
96 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256B,
97 0x256A, 0x2518, 0x250C, 0x2588, 0x2584, 0x258C, 0x2590, 0x2580,
98
99 /* 0xE0 - 0xEF */
100 0x03B1, 0x00DF, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
101 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x221E, 0x03C6, 0x03B5, 0x2229,
102
103 /* 0xF0 - 0xFF */
104 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
105 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x25A0, 0x00A0
106 };
107
/*
  Bit patterns used to classify and build UTF-8 bytes.  A byte B is of
  a given kind when (B & <kind>_MASK) == <kind>_MATCH.
*/

/* continuation byte: 10xxxxxx */
#define UTF_8_CONTINUE_MASK     0xC0
#define UTF_8_CONTINUE_MATCH    0x80

/* lead byte of a 2-byte sequence: 110xxxxx */
#define UTF_8_LEN_2_MASK        0xE0
#define UTF_8_LEN_2_MATCH       0xC0

/* lead byte of a 3-byte sequence: 1110xxxx */
#define UTF_8_LEN_3_MASK        0xF0
#define UTF_8_LEN_3_MATCH       0xE0

/* lead byte of a 4-byte sequence: 11110xxx */
#define UTF_8_LEN_4_MASK        0xF8
#define UTF_8_LEN_4_MATCH       0xF0
116
117
118
119 enum zip_encoding_type
_zip_guess_encoding(struct zip_string * str,enum zip_encoding_type expected_encoding)120 _zip_guess_encoding(struct zip_string *str, enum zip_encoding_type expected_encoding)
121 {
122 enum zip_encoding_type enc;
123 const zip_uint8_t *name;
124 zip_uint32_t i, j, ulen;
125
126 if (str == NULL)
127 return ZIP_ENCODING_ASCII;
128
129 name = str->raw;
130
131 if (str->encoding != ZIP_ENCODING_UNKNOWN)
132 enc = str->encoding;
133 else {
134 enc = ZIP_ENCODING_ASCII;
135 for (i=0; i<str->length; i++) {
136 if ((name[i] > 31 && name[i] < 128) || name[i] == '\r' || name[i] == '\n' || name[i] == '\t')
137 continue;
138
139 enc = ZIP_ENCODING_UTF8_GUESSED;
140 if ((name[i] & UTF_8_LEN_2_MASK) == UTF_8_LEN_2_MATCH)
141 ulen = 1;
142 else if ((name[i] & UTF_8_LEN_3_MASK) == UTF_8_LEN_3_MATCH)
143 ulen = 2;
144 else if ((name[i] & UTF_8_LEN_4_MASK) == UTF_8_LEN_4_MATCH)
145 ulen = 3;
146 else {
147 enc = ZIP_ENCODING_CP437;
148 break;
149 }
150
151 if (i + ulen >= str->length) {
152 enc = ZIP_ENCODING_CP437;
153 break;
154 }
155
156 for (j=1; j<=ulen; j++) {
157 if ((name[i+j] & UTF_8_CONTINUE_MASK) != UTF_8_CONTINUE_MATCH) {
158 enc = ZIP_ENCODING_CP437;
159 goto done;
160 }
161 }
162 i += ulen;
163 }
164 }
165
166 done:
167 str->encoding = enc;
168
169 if (expected_encoding != ZIP_ENCODING_UNKNOWN) {
170 if (expected_encoding == ZIP_ENCODING_UTF8_KNOWN && enc == ZIP_ENCODING_UTF8_GUESSED)
171 str->encoding = enc = ZIP_ENCODING_UTF8_KNOWN;
172
173 if (expected_encoding != enc && enc != ZIP_ENCODING_ASCII)
174 return ZIP_ENCODING_ERROR;
175 }
176
177 return enc;
178 }
179
180
181
182 static zip_uint32_t
_zip_unicode_to_utf8_len(zip_uint32_t codepoint)183 _zip_unicode_to_utf8_len(zip_uint32_t codepoint)
184 {
185 if (codepoint < 0x0080)
186 return 1;
187 if (codepoint < 0x0800)
188 return 2;
189 if (codepoint < 0x10000)
190 return 3;
191 return 4;
192 }
193
194
195
196 static zip_uint32_t
_zip_unicode_to_utf8(zip_uint32_t codepoint,zip_uint8_t * buf)197 _zip_unicode_to_utf8(zip_uint32_t codepoint, zip_uint8_t *buf)
198 {
199 if (codepoint < 0x0080) {
200 buf[0] = codepoint & 0xff;
201 return 1;
202 }
203 if (codepoint < 0x0800) {
204 buf[0] = UTF_8_LEN_2_MATCH | ((codepoint >> 6) & 0x1f);
205 buf[1] = UTF_8_CONTINUE_MATCH | (codepoint & 0x3f);
206 return 2;
207 }
208 if (codepoint < 0x10000) {
209 buf[0] = UTF_8_LEN_3_MATCH | ((codepoint >> 12) & 0x0f);
210 buf[1] = UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f);
211 buf[2] = UTF_8_CONTINUE_MATCH | (codepoint & 0x3f);
212 return 3;
213 }
214 buf[0] = UTF_8_LEN_4_MATCH | ((codepoint >> 18) & 0x07);
215 buf[1] = UTF_8_CONTINUE_MATCH | ((codepoint >> 12) & 0x3f);
216 buf[2] = UTF_8_CONTINUE_MATCH | ((codepoint >> 6) & 0x3f);
217 buf[3] = UTF_8_CONTINUE_MATCH | (codepoint & 0x3f);
218 return 4;
219 }
220
221
222
223 zip_uint8_t *
_zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf,zip_uint32_t len,zip_uint32_t * utf8_lenp,struct zip_error * error)224 _zip_cp437_to_utf8(const zip_uint8_t * const _cp437buf, zip_uint32_t len,
225 zip_uint32_t *utf8_lenp, struct zip_error *error)
226 {
227 zip_uint8_t *cp437buf = (zip_uint8_t *)_cp437buf;
228 zip_uint8_t *utf8buf;
229 zip_uint32_t buflen, i, offset;
230
231 if (len == 0) {
232 if (utf8_lenp)
233 *utf8_lenp = 0;
234 return NULL;
235 }
236
237 buflen = 1;
238 for (i=0; i<len; i++)
239 buflen += _zip_unicode_to_utf8_len(_cp437_to_unicode[cp437buf[i]]);
240
241 if ((utf8buf=(zip_uint8_t*)malloc(buflen)) == NULL) {
242 _zip_error_set(error, ZIP_ER_MEMORY, 0);
243 return NULL;
244 }
245
246 offset = 0;
247 for (i=0; i<len; i++)
248 offset += _zip_unicode_to_utf8(_cp437_to_unicode[cp437buf[i]],
249 utf8buf+offset);
250
251 utf8buf[buflen-1] = 0;
252 if (utf8_lenp)
253 *utf8_lenp = buflen-1;
254 return utf8buf;
255 }
256