xref: /libuv/test/test-idna.c (revision d05744e3)
1 /* Copyright The libuv project and contributors. All rights reserved.
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining a copy
4  * of this software and associated documentation files (the "Software"), to
5  * deal in the Software without restriction, including without limitation the
6  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
7  * sell copies of the Software, and to permit persons to whom the Software is
8  * furnished to do so, subject to the following conditions:
9  *
10  * The above copyright notice and this permission notice shall be included in
11  * all copies or substantial portions of the Software.
12  *
13  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
18  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
19  * IN THE SOFTWARE.
20  */
21 
22 #include "task.h"
23 #define uv__malloc malloc
24 #include "../src/idna.c"
25 #include <string.h>
26 
/* Drives uv__utf8_decode1() over well-formed sequences of one to four bytes
 * and over several kinds of malformed input.  Malformed input is expected to
 * yield (unsigned) -1.  After every call the read cursor is checked as well,
 * so the number of bytes consumed is pinned down together with the result.
 */
TEST_IMPL(utf8_decode1) {
  char buf[32];
  const char* const end = buf + sizeof(buf);
  const char* cur;
  int consumed;

  /* ASCII: a NUL byte followed by DEL. */
  snprintf(buf, sizeof(buf), "%c\x7F", 0x00);
  cur = buf;
  ASSERT_OK(uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 1);
  ASSERT_EQ(127, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 2);

  /* Two-byte sequences: smallest and largest code point. */
  snprintf(buf, sizeof(buf), "%s", "\xC2\x80\xDF\xBF");
  cur = buf;
  ASSERT_EQ(128, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 2);
  ASSERT_EQ(0x7FF, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 4);

  /* Three-byte sequences: smallest and largest code point. */
  snprintf(buf, sizeof(buf), "%s", "\xE0\xA0\x80\xEF\xBF\xBF");
  cur = buf;
  ASSERT_EQ(0x800, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 3);
  ASSERT_EQ(0xFFFF, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 6);

  /* Four-byte sequences: smallest and largest code point. */
  snprintf(buf, sizeof(buf), "%s", "\xF0\x90\x80\x80\xF4\x8F\xBF\xBF");
  cur = buf;
  ASSERT_EQ(0x10000, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 4);
  ASSERT_EQ(0x10FFFF, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 8);

  /* Four-byte sequences beyond U+10FFFF must be rejected. */
  snprintf(buf, sizeof(buf), "%s", "\xF4\x90\xC0\xC0\xF7\xBF\xBF\xBF");
  cur = buf;
  ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 4);
  ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 8);

  /* Overlong encodings must be rejected. */
  snprintf(buf, sizeof(buf), "%s", "\xC0\x80\xC1\x80");
  cur = buf;
  ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 2);
  ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 4);

  /* Encoded surrogates must be rejected. */
  snprintf(buf, sizeof(buf), "%s", "\xED\xA0\x80\xED\xA3\xBF");
  cur = buf;
  ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 3);
  ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&cur, end));
  ASSERT_PTR_EQ(cur, buf + 6);

  /* Bytes 0xF8..0xFF are simply illegal; each call is expected to fail and
   * to step over exactly one byte.
   */
  snprintf(buf, sizeof(buf), "%s", "\xF8\xF9\xFA\xFB\xFC\xFD\xFE\xFF");
  cur = buf;
  consumed = 0;

  while (consumed < 8) {
    ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&cur, end));
    consumed++;
    ASSERT_PTR_EQ(cur, buf + consumed);
  }

  return 0;
}
99 
TEST_IMPL(utf8_decode1_overrun)100 TEST_IMPL(utf8_decode1_overrun) {
101   const char* p;
102   char b[1];
103   char c[1];
104 
105   /* Single byte. */
106   p = b;
107   b[0] = 0x7F;
108   ASSERT_EQ(0x7F, uv__utf8_decode1(&p, b + 1));
109   ASSERT_PTR_EQ(p, b + 1);
110 
111   /* Multi-byte. */
112   p = b;
113   b[0] = 0xC0;
114   ASSERT_EQ((unsigned) -1, uv__utf8_decode1(&p, b + 1));
115   ASSERT_PTR_EQ(p, b + 1);
116 
117   b[0] = 0x7F;
118   ASSERT_EQ(UV_EINVAL, uv__idna_toascii(b, b + 0, c, c + 1));
119   ASSERT_EQ(UV_EINVAL, uv__idna_toascii(b, b + 1, c, c + 1));
120 
121   return 0;
122 }
123 
124 /* Doesn't work on z/OS because that platform uses EBCDIC, not ASCII. */
125 #ifndef __MVS__
126 
/* F(input, err): asserts that uv__idna_toascii() returns `err` when asked to
 * convert the string literal `input` (without its terminating NUL) into a
 * zero-filled 256-byte scratch buffer.
 */
#define F(input, err)                                                         \
  do {                                                                        \
    static const char src[] = "" input "";                                    \
    char out[256] = {0};                                                      \
    ASSERT_EQ(err,                                                            \
              uv__idna_toascii(src, src + sizeof(src) - 1,                    \
                               out, out + sizeof(out)));                      \
  } while (0)
133 
/* T(input, expected): asserts that uv__idna_toascii() turns the string
 * literal `input` into the string literal `expected`.  The returned length
 * is compared with sizeof(expected), which counts the terminating NUL.  The
 * output is then converted a second time and must come out unchanged.
 */
#define T(input, expected)                                                    \
  do {                                                                        \
    static const char src[] = "" input "";                                    \
    char first[256] = {0};                                                    \
    char second[256] = {0};                                                   \
    long len;                                                                 \
    len = uv__idna_toascii(src, src + sizeof(src) - 1,                        \
                           first, first + sizeof(first));                     \
    ASSERT_EQ(len, sizeof(expected));                                         \
    ASSERT_OK(memcmp(first, expected, len));                                  \
    /* Second pass over the first result: it must be a fixed point. */        \
    len = uv__idna_toascii(first, first + strlen(first),                      \
                           second, second + sizeof(second));                  \
    ASSERT_EQ(len, sizeof(expected));                                         \
    ASSERT_OK(memcmp(second, expected, len));                                 \
    ASSERT_OK(memcmp(first, second, sizeof(second)));                         \
  } while (0)
149 
TEST_IMPL(idna_toascii)150 TEST_IMPL(idna_toascii) {
151   /* Illegal inputs. */
152   F("\xC0\x80\xC1\x80", UV_EINVAL);  /* Overlong UTF-8 sequence. */
153   F("\xC0\x80\xC1\x80.com", UV_EINVAL);  /* Overlong UTF-8 sequence. */
154   F("", UV_EINVAL);
155   /* No conversion. */
156   T(".", ".");
157   T(".com", ".com");
158   T("example", "example");
159   T("example-", "example-");
160   T("straße.de", "xn--strae-oqa.de");
161   /* Test cases adapted from punycode.js. Most are from RFC 3492. */
162   T("foo.bar", "foo.bar");
163   T("mañana.com", "xn--maana-pta.com");
164   T("example.com.", "example.com.");
165   T("bücher.com", "xn--bcher-kva.com");
166   T("café.com", "xn--caf-dma.com");
167   T("café.café.com", "xn--caf-dma.xn--caf-dma.com");
168   T("☃-⌘.com", "xn----dqo34k.com");
169   T("퐀☃-⌘.com", "xn----dqo34kn65z.com");
170   T("��.la", "xn--ls8h.la");
171   T("mañana.com", "xn--maana-pta.com");
172   T("mañana。com", "xn--maana-pta.com");
173   T("mañana.com", "xn--maana-pta.com");
174   T("mañana。com", "xn--maana-pta.com");
175   T("ü", "xn--tda");
176   T(".ü", ".xn--tda");
177   T("ü.ü", "xn--tda.xn--tda");
178   T("ü.ü.", "xn--tda.xn--tda.");
179   T("üëäö♥", "xn--4can8av2009b");
180   T("Willst du die Blüthe des frühen, die Früchte des späteren Jahres",
181     "xn--Willst du die Blthe des frhen, "
182     "die Frchte des spteren Jahres-x9e96lkal");
183   T("ليهمابتكلموشعربي؟", "xn--egbpdaj6bu4bxfgehfvwxn");
184   T("他们为什么不说中文", "xn--ihqwcrb4cv8a8dqg056pqjye");
185   T("他們爲什麽不說中文", "xn--ihqwctvzc91f659drss3x8bo0yb");
186   T("Pročprostěnemluvíčesky", "xn--Proprostnemluvesky-uyb24dma41a");
187   T("למההםפשוטלאמדבריםעברית", "xn--4dbcagdahymbxekheh6e0a7fei0b");
188   T("यहलोगहिन्दीक्योंनहींबोलसकतेहैं",
189     "xn--i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd");
190   T("なぜみんな日本語を話してくれないのか",
191     "xn--n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa");
192   T("세계의모든사람들이한국어를이해한다면얼마나좋을까",
193     "xn--989aomsvi5e83db1d2a355cv1e0vak1d"
194     "wrv93d5xbh15a0dt30a5jpsd879ccm6fea98c");
195   T("почемужеонинеговорятпорусски", "xn--b1abfaaepdrnnbgefbadotcwatmq2g4l");
196   T("PorquénopuedensimplementehablarenEspañol",
197     "xn--PorqunopuedensimplementehablarenEspaol-fmd56a");
198   T("TạisaohọkhôngthểchỉnóitiếngViệt",
199     "xn--TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g");
200   T("3年B組金八先生", "xn--3B-ww4c5e180e575a65lsy2b");
201   T("安室奈美恵-with-SUPER-MONKEYS",
202     "xn---with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n");
203   T("Hello-Another-Way-それぞれの場所",
204     "xn--Hello-Another-Way--fc4qua05auwb3674vfr0b");
205   T("ひとつ屋根の下2", "xn--2-u9tlzr9756bt3uc0v");
206   T("MajiでKoiする5秒前", "xn--MajiKoi5-783gue6qz075azm5e");
207   T("パフィーdeルンバ", "xn--de-jg4avhby1noc0d");
208   T("そのスピードで", "xn--d9juau41awczczp");
209   T("-> $1.00 <-", "-> $1.00 <-");
210   /* Test cases from https://unicode.org/reports/tr46/ */
211   T("faß.de", "xn--fa-hia.de");
212   T("βόλος.com", "xn--nxasmm1c.com");
213   T("ශ්‍රී.com", "xn--10cl1a0b660p.com");
214   T("نامه‌ای.com", "xn--mgba3gch31f060k.com");
215   return 0;
216 }
217 
/* Both table helpers are only meant for the idna_toascii test. */
#undef F
#undef T
219 
220 #endif  /* __MVS__ */
221 
/* Smoke test for the WTF-8 to UTF-16 helpers: measures a fixed mixed-script
 * string, checks that the reported length is positive and smaller than the
 * destination array, then performs the conversion.  The converted output
 * itself is not inspected.
 */
TEST_IMPL(wtf8) {
  /* NOTE(review): this literal contains U+FFFD replacement characters, which
   * may be the result of a lossy re-encoding of this file rather than the
   * intended test data - confirm against the upstream source.
   */
  static const char wtf8_text[] = "ᜄȺy��:������'¥3̞[<i$";
  uint16_t utf16[32];
  ssize_t units;

  units = uv_wtf8_length_as_utf16(wtf8_text);
  ASSERT_GT(units, 0);
  ASSERT_LT(units, ARRAY_SIZE(utf16));
  uv_wtf8_to_utf16(wtf8_text, utf16, units);

  return 0;
}
232 }
233