xref: /libuv/src/idna.c (revision 3530bcc3)
1 /* Copyright libuv contributors. All rights reserved.
2  *
3  * Permission to use, copy, modify, and/or distribute this software for any
4  * purpose with or without fee is hereby granted, provided that the above
5  * copyright notice and this permission notice appear in all copies.
6  *
7  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14  */
15 
16 /* Derived from https://github.com/bnoordhuis/punycode
17  * but updated to support IDNA 2008.
18  */
19 
20 #include "uv.h"
21 #include "uv-common.h"
22 #include "idna.h"
23 #include <assert.h>
24 #include <string.h>
25 #include <limits.h> /* UINT_MAX */
26 
27 
/* Decode one WTF-8 encoded code point starting at |*input|.
 *
 * Returns the code point, or -1 when the bytes are not accepted. Encoded
 * surrogates (U+D800..U+DFFF) are passed through rather than rejected.
 *
 * |*input| is only advanced while trailing bytes are being read: it is left
 * untouched for a single-byte result or a rejected lead byte, and otherwise
 * ends up on the last byte that was examined (not one past it).
 */
static int32_t uv__wtf8_decode1(const char** input) {
  uint32_t cp;
  uint8_t lead;
  uint8_t next;

  lead = **input;
  if (lead < 0x80)
    return lead;  /* Plain ASCII. */
  if (lead < 0xC2)
    return -1;  /* Continuation byte, or 0xC0/0xC1 used as a lead byte. */

  cp = lead;

  /* Second byte. */
  *input += 1;
  next = **input;
  if ((next & 0xC0) != 0x80)
    return -1;  /* Not a continuation byte. */
  cp = (cp << 6) | (next & 0x3F);
  if (lead < 0xE0)
    return cp & 0x7FF;  /* Two-byte sequence. */

  /* Third byte. */
  *input += 1;
  next = **input;
  if ((next & 0xC0) != 0x80)
    return -1;  /* Not a continuation byte. */
  cp = (cp << 6) | (next & 0x3F);
  if (lead < 0xF0)
    return cp & 0xFFFF;  /* Three-byte sequence. */

  /* Fourth byte. */
  *input += 1;
  next = **input;
  if ((next & 0xC0) != 0x80)
    return -1;  /* Not a continuation byte. */
  cp = (cp << 6) | (next & 0x3F);

  if (lead > 0xF4)
    return -1;  /* Lead byte can only encode values above U+10FFFF. */

  cp &= 0x1FFFFF;
  if (cp > 0x10FFFF)
    return -1;  /* Code point too large. */

  return cp;  /* Four-byte sequence. */
}
69 
70 
/* Decode the remainder of a multi-byte UTF-8 sequence.
 *
 * |a| is the lead byte, which the caller has already consumed; |*p| points
 * at the byte that follows it and |pe| is one past the end of the input.
 *
 * Returns the decoded code point, or (unsigned) -1 when the sequence is
 * malformed: a byte that cannot start a sequence, input that ends before the
 * sequence is complete, a trailing byte that is not of the form 10xxxxxx, an
 * overlong encoding, a value above U+10FFFF, or a surrogate.
 *
 * |*p| is advanced past every trailing byte the lead byte calls for (or up
 * to |pe| when the input is truncated), also when the sequence is rejected.
 * It is not advanced when the lead byte itself is rejected.
 */
static unsigned uv__utf8_decode1_slow(const char** p,
                                      const char* pe,
                                      unsigned a) {
  unsigned byte;
  unsigned min;
  int need;
  int bad;

  if (a > 0xF7)
    return -1;

  /* Classify the lead byte: number of trailing bytes, smallest code point
   * that legitimately needs this length, and the payload bits of the lead.
   */
  if (a > 0xEF) {
    need = 3;
    min = 0x10000;
    a &= 7;
  } else if (a > 0xDF) {
    need = 2;
    min = 0x800;
    a &= 15;
  } else if (a > 0xBF) {
    need = 1;
    min = 0x80;
    a &= 31;
  } else {
    return -1;  /* Invalid continuation byte. */
  }

  /* Check each trailing byte on its own. Testing only the XOR of their top
   * bits lets pairs of non-continuation bytes cancel each other out.
   */
  bad = 0;
  for (; need > 0; need--) {
    if (*p >= pe)
      return -1;  /* Truncated; never reinterpret as a shorter sequence. */

    byte = (unsigned char) *(*p)++;

    if ((byte & 0xC0) != 0x80)
      bad = 1;  /* Keep consuming so |*p| ends up past the sequence. */

    a = (a << 6) | (byte & 63);
  }

  if (bad)
    return -1;  /* Invalid sequence. */

  if (a < min)
    return -1;  /* Overlong sequence. */

  if (a > 0x10FFFF)
    return -1;  /* Four-byte sequence > U+10FFFF. */

  if (a >= 0xD800 && a <= 0xDFFF)
    return -1;  /* Surrogate pair. */

  return a;
}
136 
137 
/* Decode one UTF-8 code point from [*p, pe) and advance |*p|.
 *
 * The caller must guarantee that at least one byte is available. ASCII is
 * handled inline; anything else is handed to uv__utf8_decode1_slow(), whose
 * return value (including its error value) is passed through unchanged.
 */
unsigned uv__utf8_decode1(const char** p, const char* pe) {
  unsigned lead;

  assert(*p < pe);

  lead = (unsigned char) **p;
  *p += 1;

  if (lead >= 128)
    return uv__utf8_decode1_slow(p, pe, lead);

  return lead;  /* ASCII, common case. */
}
150 
151 
uv__idna_toascii_label(const char * s,const char * se,char ** d,char * de)152 static int uv__idna_toascii_label(const char* s, const char* se,
153                                   char** d, char* de) {
154   static const char alphabet[] = "abcdefghijklmnopqrstuvwxyz0123456789";
155   const char* ss;
156   unsigned c;
157   unsigned h;
158   unsigned k;
159   unsigned n;
160   unsigned m;
161   unsigned q;
162   unsigned t;
163   unsigned x;
164   unsigned y;
165   unsigned bias;
166   unsigned delta;
167   unsigned todo;
168   int first;
169 
170   h = 0;
171   ss = s;
172   todo = 0;
173 
174   /* Note: after this loop we've visited all UTF-8 characters and know
175    * they're legal so we no longer need to check for decode errors.
176    */
177   while (s < se) {
178     c = uv__utf8_decode1(&s, se);
179 
180     if (c == UINT_MAX)
181       return UV_EINVAL;
182 
183     if (c < 128)
184       h++;
185     else
186       todo++;
187   }
188 
189   /* Only write "xn--" when there are non-ASCII characters. */
190   if (todo > 0) {
191     if (*d < de) *(*d)++ = 'x';
192     if (*d < de) *(*d)++ = 'n';
193     if (*d < de) *(*d)++ = '-';
194     if (*d < de) *(*d)++ = '-';
195   }
196 
197   /* Write ASCII characters. */
198   x = 0;
199   s = ss;
200   while (s < se) {
201     c = uv__utf8_decode1(&s, se);
202     assert(c != UINT_MAX);
203 
204     if (c > 127)
205       continue;
206 
207     if (*d < de)
208       *(*d)++ = c;
209 
210     if (++x == h)
211       break;  /* Visited all ASCII characters. */
212   }
213 
214   if (todo == 0)
215     return h;
216 
217   /* Only write separator when we've written ASCII characters first. */
218   if (h > 0)
219     if (*d < de)
220       *(*d)++ = '-';
221 
222   n = 128;
223   bias = 72;
224   delta = 0;
225   first = 1;
226 
227   while (todo > 0) {
228     m = -1;
229     s = ss;
230 
231     while (s < se) {
232       c = uv__utf8_decode1(&s, se);
233       assert(c != UINT_MAX);
234 
235       if (c >= n)
236         if (c < m)
237           m = c;
238     }
239 
240     x = m - n;
241     y = h + 1;
242 
243     if (x > ~delta / y)
244       return UV_E2BIG;  /* Overflow. */
245 
246     delta += x * y;
247     n = m;
248 
249     s = ss;
250     while (s < se) {
251       c = uv__utf8_decode1(&s, se);
252       assert(c != UINT_MAX);
253 
254       if (c < n)
255         if (++delta == 0)
256           return UV_E2BIG;  /* Overflow. */
257 
258       if (c != n)
259         continue;
260 
261       for (k = 36, q = delta; /* empty */; k += 36) {
262         t = 1;
263 
264         if (k > bias)
265           t = k - bias;
266 
267         if (t > 26)
268           t = 26;
269 
270         if (q < t)
271           break;
272 
273         /* TODO(bnoordhuis) Since 1 <= t <= 26 and therefore
274          * 10 <= y <= 35, we can optimize the long division
275          * into a table-based reciprocal multiplication.
276          */
277         x = q - t;
278         y = 36 - t;  /* 10 <= y <= 35 since 1 <= t <= 26. */
279         q = x / y;
280         t = t + x % y;  /* 1 <= t <= 35 because of y. */
281 
282         if (*d < de)
283           *(*d)++ = alphabet[t];
284       }
285 
286       if (*d < de)
287         *(*d)++ = alphabet[q];
288 
289       delta /= 2;
290 
291       if (first) {
292         delta /= 350;
293         first = 0;
294       }
295 
296       /* No overflow check is needed because |delta| was just
297        * divided by 2 and |delta+delta >= delta + delta/h|.
298        */
299       h++;
300       delta += delta / h;
301 
302       for (bias = 0; delta > 35 * 26 / 2; bias += 36)
303         delta /= 35;
304 
305       bias += 36 * delta / (delta + 38);
306       delta = 0;
307       todo--;
308     }
309 
310     delta++;
311     n++;
312   }
313 
314   return 0;
315 }
316 
317 
uv__idna_toascii(const char * s,const char * se,char * d,char * de)318 ssize_t uv__idna_toascii(const char* s, const char* se, char* d, char* de) {
319   const char* si;
320   const char* st;
321   unsigned c;
322   char* ds;
323   int rc;
324 
325   if (s == se)
326     return UV_EINVAL;
327 
328   ds = d;
329 
330   si = s;
331   while (si < se) {
332     st = si;
333     c = uv__utf8_decode1(&si, se);
334 
335     if (c == UINT_MAX)
336       return UV_EINVAL;
337 
338     if (c != '.')
339       if (c != 0x3002)  /* 。 */
340         if (c != 0xFF0E)  /* . */
341           if (c != 0xFF61)  /* 。 */
342             continue;
343 
344     rc = uv__idna_toascii_label(s, st, &d, de);
345 
346     if (rc < 0)
347       return rc;
348 
349     if (d < de)
350       *d++ = '.';
351 
352     s = si;
353   }
354 
355   if (s < se) {
356     rc = uv__idna_toascii_label(s, se, &d, de);
357 
358     if (rc < 0)
359       return rc;
360   }
361 
362   if (d >= de)
363     return UV_EINVAL;
364 
365   *d++ = '\0';
366   return d - ds;  /* Number of bytes written. */
367 }
368 
369 
/* Count the UTF-16 code units needed to hold the NUL-terminated WTF-8
 * string |source_ptr|. The terminator is part of the count.
 *
 * Returns -1 when the input cannot be decoded.
 */
ssize_t uv_wtf8_length_as_utf16(const char* source_ptr) {
  size_t units;
  int32_t cp;

  units = 0;

  for (;;) {
    cp = uv__wtf8_decode1(&source_ptr);
    if (cp < 0)
      return -1;

    /* Code points above the BMP take a surrogate pair. */
    units += (cp > 0xFFFF) ? 2 : 1;

    /* The decoder stops on the last byte of the sequence; step over it and
     * finish once that byte was the terminator.
     */
    if (*source_ptr++ == '\0')
      break;
  }

  return units;
}
385 
386 
/* Convert the NUL-terminated WTF-8 string |source_ptr| to UTF-16, writing
 * the code units, terminator included, to |w_target|.
 *
 * |w_target_len| must be the value uv_wtf8_length_as_utf16() returned for
 * the same input; it is only used to assert that exactly that many code
 * units were produced. The input must already have been validated by that
 * call, invalid input trips an assertion.
 */
void uv_wtf8_to_utf16(const char* source_ptr,
                      uint16_t* w_target,
                      size_t w_target_len) {
  int32_t code_point;

  do {
    code_point = uv__wtf8_decode1(&source_ptr);
    /* uv_wtf8_length_as_utf16 should have been called and checked first. */
    assert(code_point >= 0);
    /* Everything above U+FFFF needs a surrogate pair. That includes U+10000
     * itself, which uv_wtf8_length_as_utf16() counts as two code units.
     */
    if (code_point > 0xFFFF) {
      assert(code_point <= 0x10FFFF);  /* U+10FFFF is the last valid one. */
      *w_target++ = (((code_point - 0x10000) >> 10) + 0xD800);
      *w_target++ = ((code_point - 0x10000) & 0x3FF) + 0xDC00;
      w_target_len -= 2;
    } else {
      *w_target++ = code_point;
      w_target_len -= 1;
    }
    /* The decoder leaves |source_ptr| on the last byte of the sequence. */
  } while (*source_ptr++);

  (void)w_target_len;  /* Only read by the assert below. */
  assert(w_target_len == 0);
}
410 
411 
/* Read one code point from the UTF-16 data at |w_source_ptr|.
 *
 * When the first code unit is a high surrogate, |w_source_len| is not 1, and
 * the following code unit is a low surrogate, the two are combined into a
 * supplementary code point. In every other case the first code unit is
 * returned unchanged, so unpaired surrogates are passed through.
 */
static int32_t uv__get_surrogate_value(const uint16_t* w_source_ptr,
                                       ssize_t w_source_len) {
  uint16_t hi;
  uint16_t lo;

  hi = w_source_ptr[0];

  if (hi < 0xD800 || hi > 0xDBFF)
    return hi;  /* Not a high surrogate. */

  if (w_source_len == 1)
    return hi;  /* No second code unit to look at. */

  lo = w_source_ptr[1];

  if (lo < 0xDC00 || lo > 0xDFFF)
    return hi;  /* Unpaired high surrogate. */

  return 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00);
}
425 
426 
/* Count the bytes needed to encode UTF-16 input as WTF-8, not counting a
 * terminating NUL.
 *
 * A positive |w_source_len| is the number of code units to convert. With a
 * negative |w_source_len| the input is read up to, and not including, the
 * first zero code unit.
 */
size_t uv_utf16_length_as_wtf8(const uint16_t* w_source_ptr,
                               ssize_t w_source_len) {
  size_t total;
  size_t nbytes;
  int32_t cp;
  int units;

  total = 0;

  while (w_source_len != 0) {
    cp = uv__get_surrogate_value(w_source_ptr, w_source_len);
    /* Can be invalid UTF-8 but must be valid WTF-8. */
    assert(cp >= 0);

    if (cp == 0 && w_source_len < 0)
      break;  /* Terminator of NUL-terminated input. */

    if (cp >= 0x10000) {
      /* A surrogate pair was combined: two code units, four bytes. */
      units = 2;
      nbytes = 4;
    } else {
      units = 1;
      if (cp < 0x80)
        nbytes = 1;
      else if (cp < 0x800)
        nbytes = 2;
      else
        nbytes = 3;
    }

    total += nbytes;
    w_source_ptr += units;

    /* A pair is only combined when more than one code unit is left, so
     * this cannot step past zero. Negative lengths are left alone.
     */
    if (w_source_len > 0)
      w_source_len -= units;
  }

  return total;
}
458 
459 
uv_utf16_to_wtf8(const uint16_t * w_source_ptr,ssize_t w_source_len,char ** target_ptr,size_t * target_len_ptr)460 int uv_utf16_to_wtf8(const uint16_t* w_source_ptr,
461                      ssize_t w_source_len,
462                      char** target_ptr,
463                      size_t* target_len_ptr) {
464   size_t target_len;
465   char* target;
466   char* target_end;
467   int32_t code_point;
468 
469   /* If *target_ptr is provided, then *target_len_ptr must be its length
470    * (excluding space for NUL), otherwise we will compute the target_len_ptr
471    * length and may return a new allocation in *target_ptr if target_ptr is
472    * provided. */
473   if (target_ptr == NULL || *target_ptr == NULL) {
474     target_len = uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
475     if (target_len_ptr != NULL)
476       *target_len_ptr = target_len;
477   } else {
478     target_len = *target_len_ptr;
479   }
480 
481   if (target_ptr == NULL)
482     return 0;
483 
484   if (*target_ptr == NULL) {
485     target = uv__malloc(target_len + 1);
486     if (target == NULL) {
487       return UV_ENOMEM;
488     }
489     *target_ptr = target;
490   } else {
491     target = *target_ptr;
492   }
493 
494   target_end = target + target_len;
495 
496   while (target != target_end && w_source_len) {
497     code_point = uv__get_surrogate_value(w_source_ptr, w_source_len);
498     /* Can be invalid UTF-8 but must be valid WTF-8. */
499     assert(code_point >= 0);
500     if (w_source_len < 0 && code_point == 0) {
501       w_source_len = 0;
502       break;
503     }
504     if (code_point < 0x80) {
505       *target++ = code_point;
506     } else if (code_point < 0x800) {
507       *target++ = 0xC0 | (code_point >> 6);
508       if (target == target_end)
509         break;
510       *target++ = 0x80 | (code_point & 0x3F);
511     } else if (code_point < 0x10000) {
512       *target++ = 0xE0 | (code_point >> 12);
513       if (target == target_end)
514         break;
515       *target++ = 0x80 | ((code_point >> 6) & 0x3F);
516       if (target == target_end)
517         break;
518       *target++ = 0x80 | (code_point & 0x3F);
519     } else {
520       *target++ = 0xF0 | (code_point >> 18);
521       if (target == target_end)
522         break;
523       *target++ = 0x80 | ((code_point >> 12) & 0x3F);
524       if (target == target_end)
525         break;
526       *target++ = 0x80 | ((code_point >> 6) & 0x3F);
527       if (target == target_end)
528         break;
529       *target++ = 0x80 | (code_point & 0x3F);
530       /* uv__get_surrogate_value consumed 2 input characters */
531       w_source_ptr++;
532       if (w_source_len > 0)
533         w_source_len--;
534     }
535     target_len = target - *target_ptr;
536     w_source_ptr++;
537     if (w_source_len > 0)
538       w_source_len--;
539   }
540 
541   if (target != target_end && target_len_ptr != NULL)
542     /* Did not fill all of the provided buffer, so update the target_len_ptr
543      * output with the space used. */
544     *target_len_ptr = target - *target_ptr;
545 
546   /* Check if input fit into target exactly. */
547   if (w_source_len < 0 && target == target_end && w_source_ptr[0] == 0)
548     w_source_len = 0;
549 
550   *target++ = '\0';
551 
552   /* Characters remained after filling the buffer, compute the remaining length now. */
553   if (w_source_len) {
554     if (target_len_ptr != NULL)
555       *target_len_ptr = target_len + uv_utf16_length_as_wtf8(w_source_ptr, w_source_len);
556     return UV_ENOBUFS;
557   }
558 
559   return 0;
560 }
561