xref: /curl/lib/urlapi.c (revision 566a6d7b)
1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  * SPDX-License-Identifier: curl
22  *
23  ***************************************************************************/
24 
25 #include "curl_setup.h"
26 
27 #include "urldata.h"
28 #include "urlapi-int.h"
29 #include "strcase.h"
30 #include "url.h"
31 #include "escape.h"
32 #include "curl_ctype.h"
33 #include "inet_pton.h"
34 #include "inet_ntop.h"
35 #include "strdup.h"
36 #include "idn.h"
37 
38 /* The last 3 #include files should be in this order */
39 #include "curl_printf.h"
40 #include "curl_memory.h"
41 #include "memdebug.h"
42 
/* MS-DOS/Windows style drive prefix, eg c: in c:foo
 *
 * Evaluates to non-zero when 'str' starts with an ASCII letter followed by a
 * colon. The second byte is only read when the first one is a letter, so any
 * null-terminated string is safe to pass. The argument is expanded several
 * times and must therefore be free from side effects. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))

/* MS-DOS/Windows style drive prefix, optionally with
 * a '|' instead of ':', followed by a slash or NUL
 *
 * Both forward slash and backslash are accepted after the drive separator.
 * The argument is expanded several times and must therefore be free from
 * side effects. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':' || (str)[1] == '|') && \
   ((str)[2] == '/' || (str)[2] == '\\' || (str)[2] == 0))
56 
/* The scheme is not URL encoded. This is the largest number of characters
   Curl_is_absolute_url() accepts in a scheme name, so a buffer that receives
   a scheme needs room for this many bytes plus a null terminator. */
#define MAX_SCHEME_LEN 40
59 
/*
 * If USE_IPV6 is disabled, we still want to parse IPv6 addresses, so make
 * sure we have _some_ value for AF_INET6 without polluting our fake value
 * everywhere. The fake value is only defined when no AF_INET6 macro exists
 * already.
 */
#if !defined(USE_IPV6) && !defined(AF_INET6)
#define AF_INET6 (AF_INET + 1)
#endif
68 
/* Internal representation of CURLU. Point to URL-encoded strings.
 *
 * All ten string members are released by free_urlhandle(). */
struct Curl_URL {
  char *scheme;
  char *user;
  char *password;
  char *options; /* IMAP only? */
  char *host;
  char *zoneid; /* for numerical IPv6 addresses, the part that follows the
                   '%' within the brackets */
  char *port; /* decimal string, regenerated from the parsed number so that
                 leading zeroes are gone */
  char *path;
  char *query;
  char *fragment;
  unsigned short portnum; /* the numerical version (if 'port' is set) */
  BIT(query_present);    /* to support blank */
  BIT(fragment_present); /* to support blank */
  BIT(guessed_scheme);   /* when a URL without scheme is parsed */
};
86 
/* NOTE(review): presumably the scheme assumed for a URL that lacks one when
   CURLU_DEFAULT_SCHEME is set -- the use is outside this section, confirm */
#define DEFAULT_SCHEME "https"
88 
free_urlhandle(struct Curl_URL * u)89 static void free_urlhandle(struct Curl_URL *u)
90 {
91   free(u->scheme);
92   free(u->user);
93   free(u->password);
94   free(u->options);
95   free(u->host);
96   free(u->zoneid);
97   free(u->port);
98   free(u->path);
99   free(u->query);
100   free(u->fragment);
101 }
102 
/*
 * Return a pointer to the first '/' or '?' found after the "//" that
 * introduces the hostname (or after the start of the string when there is no
 * "//"). The '?' matters for URLs like http://www.example.com?id=2380
 *
 * When neither character is present, the returned pointer is the string's
 * null terminator.
 */
static const char *find_host_sep(const char *url)
{
  const char *authority = strstr(url, "//");
  const char *end = url + strlen(url);
  const char *slash;
  const char *qmark;

  /* step past the double slash if there is one */
  authority = authority ? authority + 2 : url;

  slash = strchr(authority, '/');
  qmark = strchr(authority, '?');

  if(!slash)
    slash = end;
  if(!qmark)
    qmark = end;

  /* whichever comes first ends the hostname */
  return (qmark <= slash) ? qmark : slash;
}
130 
/* convert CURLcode to CURLUcode: CURLE_TOO_LARGE becomes CURLUE_TOO_LARGE and
   every other value, a zero included, becomes CURLUE_OUT_OF_MEMORY. Only use
   it on a CURLcode that is known to hold an error. */
#define cc2cu(x) ((x) == CURLE_TOO_LARGE ? CURLUE_TOO_LARGE :   \
                  CURLUE_OUT_OF_MEMORY)
/*
 * Decide whether a character in a URL must be escaped: true for every byte
 * that none of ISCNTRL(), ISSPACE() and ISGRAPH() accepts.
 */
#define urlchar_needs_escaping(c) (!(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c)))

/* lowercase hexadecimal digits, indexed by nibble value when a byte is
   percent-encoded */
static const char hexdigits[] = "0123456789abcdef";
140 /* urlencode_str() writes data into an output dynbuf and URL-encodes the
141  * spaces in the source URL accordingly.
142  *
143  * URL encoding should be skipped for hostnames, otherwise IDN resolution
144  * will fail.
145  */
urlencode_str(struct dynbuf * o,const char * url,size_t len,bool relative,bool query)146 static CURLUcode urlencode_str(struct dynbuf *o, const char *url,
147                                size_t len, bool relative,
148                                bool query)
149 {
150   /* we must add this with whitespace-replacing */
151   bool left = !query;
152   const unsigned char *iptr;
153   const unsigned char *host_sep = (const unsigned char *) url;
154   CURLcode result;
155 
156   if(!relative)
157     host_sep = (const unsigned char *) find_host_sep(url);
158 
159   for(iptr = (unsigned char *)url;    /* read from here */
160       len; iptr++, len--) {
161 
162     if(iptr < host_sep) {
163       result = Curl_dyn_addn(o, iptr, 1);
164       if(result)
165         return cc2cu(result);
166       continue;
167     }
168 
169     if(*iptr == ' ') {
170       if(left)
171         result = Curl_dyn_addn(o, "%20", 3);
172       else
173         result = Curl_dyn_addn(o, "+", 1);
174       if(result)
175         return cc2cu(result);
176       continue;
177     }
178 
179     if(*iptr == '?')
180       left = FALSE;
181 
182     if(urlchar_needs_escaping(*iptr)) {
183       char out[3]={'%'};
184       out[1] = hexdigits[*iptr >> 4];
185       out[2] = hexdigits[*iptr & 0xf];
186       result = Curl_dyn_addn(o, out, 3);
187     }
188     else
189       result = Curl_dyn_addn(o, iptr, 1);
190     if(result)
191       return cc2cu(result);
192   }
193 
194   return CURLUE_OK;
195 }
196 
197 /*
198  * Returns the length of the scheme if the given URL is absolute (as opposed
199  * to relative). Stores the scheme in the buffer if TRUE and 'buf' is
200  * non-NULL. The buflen must be larger than MAX_SCHEME_LEN if buf is set.
201  *
202  * If 'guess_scheme' is TRUE, it means the URL might be provided without
203  * scheme.
204  */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen,bool guess_scheme)205 size_t Curl_is_absolute_url(const char *url, char *buf, size_t buflen,
206                             bool guess_scheme)
207 {
208   size_t i = 0;
209   DEBUGASSERT(!buf || (buflen > MAX_SCHEME_LEN));
210   (void)buflen; /* only used in debug-builds */
211   if(buf)
212     buf[0] = 0; /* always leave a defined value in buf */
213 #ifdef _WIN32
214   if(guess_scheme && STARTS_WITH_DRIVE_PREFIX(url))
215     return 0;
216 #endif
217   if(ISALPHA(url[0]))
218     for(i = 1; i < MAX_SCHEME_LEN; ++i) {
219       char s = url[i];
220       if(s && (ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') )) {
221         /* RFC 3986 3.1 explains:
222            scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
223         */
224       }
225       else {
226         break;
227       }
228     }
229   if(i && (url[i] == ':') && ((url[i + 1] == '/') || !guess_scheme)) {
230     /* If this does not guess scheme, the scheme always ends with the colon so
231        that this also detects data: URLs etc. In guessing mode, data: could
232        be the hostname "data" with a specified port number. */
233 
234     /* the length of the scheme is the name part only */
235     size_t len = i;
236     if(buf) {
237       Curl_strntolower(buf, url, i);
238       buf[i] = 0;
239     }
240     return len;
241   }
242   return 0;
243 }
244 
/*
 * Concatenate a relative URL to a base URL making it absolute.
 * URL-encodes any spaces.
 *
 * On success CURLE_OK is returned and '*newurl' points to the new URL, which
 * must be freed by the caller. On failure an error code is returned and
 * '*newurl' is left as NULL.
 *
 * Note that this function destroys the 'base' string: it gets cut off in
 * place at the point where the relative part is to be appended.
 */
static CURLcode concat_url(char *base, const char *relurl, char **newurl)
{
  /***
   TRY to append this new path to the old URL
   to the right of the host part. Oh crap, this is doomed to cause
   problems in the future...
  */
  struct dynbuf newest;
  char *protsep;
  char *pathsep;
  bool host_changed = FALSE;
  const char *useurl = relurl;
  CURLcode result = CURLE_OK;
  CURLUcode uc;
  bool skip_slash = FALSE;
  *newurl = NULL;

  /* protsep points to the start of the hostname */
  protsep = strstr(base, "//");
  if(!protsep)
    protsep = base;
  else
    protsep += 2; /* pass the slashes */

  if('/' != relurl[0]) {
    int level = 0; /* number of leading "../" in the relative URL */

    /* First we need to find out if there is a ?-letter in the URL,
       and cut it and the right-side of that off */
    pathsep = strchr(protsep, '?');
    if(pathsep)
      *pathsep = 0;

    /* we have a relative path to append to the last slash if there is one
       available, or the new URL is just a query string (starts with a '?') or
       a fragment (starts with '#') we append the new one at the end of the
       current URL */
    if((useurl[0] != '?') && (useurl[0] != '#')) {
      /* drop the last path segment of the base */
      pathsep = strrchr(protsep, '/');
      if(pathsep)
        *pathsep = 0;

      /* Check if there is any slash after the hostname, and if so, remember
         that position instead */
      pathsep = strchr(protsep, '/');
      if(pathsep)
        protsep = pathsep + 1;
      else
        protsep = NULL; /* the base has no path left */

      /* now deal with one "./" or any amount of "../" in the newurl
         and act accordingly */

      if((useurl[0] == '.') && (useurl[1] == '/'))
        useurl += 2; /* just skip the "./" */

      while((useurl[0] == '.') &&
            (useurl[1] == '.') &&
            (useurl[2] == '/')) {
        level++;
        useurl += 3; /* pass the "../" */
      }

      if(protsep) {
        while(level--) {
          /* cut off one more level from the right of the original URL */
          pathsep = strrchr(protsep, '/');
          if(pathsep)
            *pathsep = 0;
          else {
            /* no more levels to remove, the base path becomes empty */
            *protsep = 0;
            break;
          }
        }
      }
    }
    else
      /* a query or fragment goes directly after the base */
      skip_slash = TRUE;
  }
  else {
    /* We got a new absolute path for this server */

    if(relurl[1] == '/') {
      /* the new URL starts with //, just keep the protocol part from the
         original one */
      *protsep = 0;
      useurl = &relurl[2]; /* we keep the slashes from the original, so we
                              skip the new ones */
      host_changed = TRUE;
    }
    else {
      /* cut off the original URL from the first slash, or deal with URLs
         without slash */
      pathsep = strchr(protsep, '/');
      if(pathsep) {
        /* When people use badly formatted URLs, such as
           "http://www.example.com?dir=/home/daniel" we must not use the first
           slash, if there is a ?-letter before it! */
        char *sep = strchr(protsep, '?');
        if(sep && (sep < pathsep))
          pathsep = sep;
        *pathsep = 0;
      }
      else {
        /* There was no slash. Now, since we might be operating on a badly
           formatted URL, such as "http://www.example.com?id=2380" which does
           not use a slash separator as it is supposed to, we need to check
           for a ?-letter as well! */
        pathsep = strchr(protsep, '?');
        if(pathsep)
          *pathsep = 0;
      }
    }
  }

  Curl_dyn_init(&newest, CURL_MAX_INPUT_LENGTH);

  /* copy over the root URL part */
  result = Curl_dyn_add(&newest, base);
  if(result)
    return result;

  /* check if we need to append a slash: not when the new piece brings its
     own, when the base path was emptied or when a query/fragment follows */
  if(('/' == useurl[0]) || (protsep && !*protsep) || skip_slash)
    ;
  else {
    result = Curl_dyn_addn(&newest, "/", 1);
    if(result)
      return result;
  }

  /* then append the new piece on the right side. It is only treated as
     holding a hostname when the relative URL replaced the host. */
  uc = urlencode_str(&newest, useurl, strlen(useurl), !host_changed,
                     FALSE);
  if(uc)
    return (uc == CURLUE_TOO_LARGE) ? CURLE_TOO_LARGE : CURLE_OUT_OF_MEMORY;

  *newurl = Curl_dyn_ptr(&newest);
  return CURLE_OK;
}
393 
394 /* scan for byte values <= 31, 127 and sometimes space */
junkscan(const char * url,size_t * urllen,unsigned int flags)395 static CURLUcode junkscan(const char *url, size_t *urllen, unsigned int flags)
396 {
397   static const char badbytes[]={
398     /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
399     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
400     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
401     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
402     0x7f, 0x00 /* null-terminate */
403   };
404   size_t n = strlen(url);
405   size_t nfine;
406 
407   if(n > CURL_MAX_INPUT_LENGTH)
408     /* excessive input length */
409     return CURLUE_MALFORMED_INPUT;
410 
411   nfine = strcspn(url, badbytes);
412   if((nfine != n) ||
413      (!(flags & CURLU_ALLOW_SPACE) && strchr(url, ' ')))
414     return CURLUE_MALFORMED_INPUT;
415 
416   *urllen = n;
417   return CURLUE_OK;
418 }
419 
420 /*
421  * parse_hostname_login()
422  *
423  * Parse the login details (username, password and options) from the URL and
424  * strip them out of the hostname
425  *
426  */
parse_hostname_login(struct Curl_URL * u,const char * login,size_t len,unsigned int flags,size_t * offset)427 static CURLUcode parse_hostname_login(struct Curl_URL *u,
428                                       const char *login,
429                                       size_t len,
430                                       unsigned int flags,
431                                       size_t *offset) /* to the hostname */
432 {
433   CURLUcode result = CURLUE_OK;
434   CURLcode ccode;
435   char *userp = NULL;
436   char *passwdp = NULL;
437   char *optionsp = NULL;
438   const struct Curl_handler *h = NULL;
439 
440   /* At this point, we assume all the other special cases have been taken
441    * care of, so the host is at most
442    *
443    *   [user[:password][;options]]@]hostname
444    *
445    * We need somewhere to put the embedded details, so do that first.
446    */
447   char *ptr;
448 
449   DEBUGASSERT(login);
450 
451   *offset = 0;
452   ptr = memchr(login, '@', len);
453   if(!ptr)
454     goto out;
455 
456   /* We will now try to extract the
457    * possible login information in a string like:
458    * ftp://user:password@ftp.my.site:8021/README */
459   ptr++;
460 
461   /* if this is a known scheme, get some details */
462   if(u->scheme)
463     h = Curl_get_scheme_handler(u->scheme);
464 
465   /* We could use the login information in the URL so extract it. Only parse
466      options if the handler says we should. Note that 'h' might be NULL! */
467   ccode = Curl_parse_login_details(login, ptr - login - 1,
468                                    &userp, &passwdp,
469                                    (h && (h->flags & PROTOPT_URLOPTIONS)) ?
470                                    &optionsp : NULL);
471   if(ccode) {
472     result = CURLUE_BAD_LOGIN;
473     goto out;
474   }
475 
476   if(userp) {
477     if(flags & CURLU_DISALLOW_USER) {
478       /* Option DISALLOW_USER is set and URL contains username. */
479       result = CURLUE_USER_NOT_ALLOWED;
480       goto out;
481     }
482     free(u->user);
483     u->user = userp;
484   }
485 
486   if(passwdp) {
487     free(u->password);
488     u->password = passwdp;
489   }
490 
491   if(optionsp) {
492     free(u->options);
493     u->options = optionsp;
494   }
495 
496   /* the hostname starts at this offset */
497   *offset = ptr - login;
498   return CURLUE_OK;
499 
500 out:
501 
502   free(userp);
503   free(passwdp);
504   free(optionsp);
505   u->user = NULL;
506   u->password = NULL;
507   u->options = NULL;
508 
509   return result;
510 }
511 
Curl_parse_port(struct Curl_URL * u,struct dynbuf * host,bool has_scheme)512 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, struct dynbuf *host,
513                                    bool has_scheme)
514 {
515   char *portptr;
516   char *hostname = Curl_dyn_ptr(host);
517   /*
518    * Find the end of an IPv6 address on the ']' ending bracket.
519    */
520   if(hostname[0] == '[') {
521     portptr = strchr(hostname, ']');
522     if(!portptr)
523       return CURLUE_BAD_IPV6;
524     portptr++;
525     /* this is a RFC2732-style specified IP-address */
526     if(*portptr) {
527       if(*portptr != ':')
528         return CURLUE_BAD_PORT_NUMBER;
529     }
530     else
531       portptr = NULL;
532   }
533   else
534     portptr = strchr(hostname, ':');
535 
536   if(portptr) {
537     char *rest = NULL;
538     unsigned long port;
539     size_t keep = portptr - hostname;
540 
541     /* Browser behavior adaptation. If there is a colon with no digits after,
542        just cut off the name there which makes us ignore the colon and just
543        use the default port. Firefox, Chrome and Safari all do that.
544 
545        Do not do it if the URL has no scheme, to make something that looks like
546        a scheme not work!
547     */
548     Curl_dyn_setlen(host, keep);
549     portptr++;
550     if(!*portptr)
551       return has_scheme ? CURLUE_OK : CURLUE_BAD_PORT_NUMBER;
552 
553     if(!ISDIGIT(*portptr))
554       return CURLUE_BAD_PORT_NUMBER;
555 
556     errno = 0;
557     port = strtoul(portptr, &rest, 10);  /* Port number must be decimal */
558 
559     if(errno || (port > 0xffff) || *rest)
560       return CURLUE_BAD_PORT_NUMBER;
561 
562     u->portnum = (unsigned short) port;
563     /* generate a new port number string to get rid of leading zeroes etc */
564     free(u->port);
565     u->port = aprintf("%ld", port);
566     if(!u->port)
567       return CURLUE_OUT_OF_MEMORY;
568   }
569 
570   return CURLUE_OK;
571 }
572 
573 /* this assumes 'hostname' now starts with [ */
ipv6_parse(struct Curl_URL * u,char * hostname,size_t hlen)574 static CURLUcode ipv6_parse(struct Curl_URL *u, char *hostname,
575                             size_t hlen) /* length of hostname */
576 {
577   size_t len;
578   DEBUGASSERT(*hostname == '[');
579   if(hlen < 4) /* '[::]' is the shortest possible valid string */
580     return CURLUE_BAD_IPV6;
581   hostname++;
582   hlen -= 2;
583 
584   /* only valid IPv6 letters are ok */
585   len = strspn(hostname, "0123456789abcdefABCDEF:.");
586 
587   if(hlen != len) {
588     hlen = len;
589     if(hostname[len] == '%') {
590       /* this could now be '%[zone id]' */
591       char zoneid[16];
592       int i = 0;
593       char *h = &hostname[len + 1];
594       /* pass '25' if present and is a URL encoded percent sign */
595       if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
596         h += 2;
597       while(*h && (*h != ']') && (i < 15))
598         zoneid[i++] = *h++;
599       if(!i || (']' != *h))
600         return CURLUE_BAD_IPV6;
601       zoneid[i] = 0;
602       u->zoneid = strdup(zoneid);
603       if(!u->zoneid)
604         return CURLUE_OUT_OF_MEMORY;
605       hostname[len] = ']'; /* insert end bracket */
606       hostname[len + 1] = 0; /* terminate the hostname */
607     }
608     else
609       return CURLUE_BAD_IPV6;
610     /* hostname is fine */
611   }
612 
613   /* Normalize the IPv6 address */
614   {
615     char dest[16]; /* fits a binary IPv6 address */
616     hostname[hlen] = 0; /* end the address there */
617     if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
618       return CURLUE_BAD_IPV6;
619     if(Curl_inet_ntop(AF_INET6, dest, hostname, hlen)) {
620       hlen = strlen(hostname); /* might be shorter now */
621       hostname[hlen + 1] = 0;
622     }
623     hostname[hlen] = ']'; /* restore ending bracket */
624   }
625   return CURLUE_OK;
626 }
627 
hostname_check(struct Curl_URL * u,char * hostname,size_t hlen)628 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname,
629                                 size_t hlen) /* length of hostname */
630 {
631   size_t len;
632   DEBUGASSERT(hostname);
633 
634   if(!hlen)
635     return CURLUE_NO_HOST;
636   else if(hostname[0] == '[')
637     return ipv6_parse(u, hostname, hlen);
638   else {
639     /* letters from the second string are not ok */
640     len = strcspn(hostname, " \r\n\t/:#?!@{}[]\\$\'\"^`*<>=;,+&()%");
641     if(hlen != len)
642       /* hostname with bad content */
643       return CURLUE_BAD_HOSTNAME;
644   }
645   return CURLUE_OK;
646 }
647 
/*
 * Handle partial IPv4 numerical addresses and different bases, like
 * '16843009', '0x7f', '0x7f.1' '0177.1.1.1' etc.
 *
 * If the given input string is syntactically wrong IPv4 or any part for
 * example is too big, this function returns HOST_NAME and leaves the host
 * untouched. A host that starts with '[' gives HOST_IPV6, also untouched.
 *
 * For a valid IPv4 address, the "normalized" version of the input string, in
 * plain quad decimal integers, replaces the host content and HOST_IPV4 is
 * returned. HOST_ERROR is returned if storing that string fails.
 */

#define HOST_ERROR   -1 /* out of memory */

#define HOST_NAME    1
#define HOST_IPV4    2
#define HOST_IPV6    3

static int ipv4_normalize(struct dynbuf *host)
{
  const char *p = Curl_dyn_ptr(host);
  unsigned long parts[4] = {0, 0, 0, 0};
  unsigned int octet[4];
  int last = 0; /* index of the most recently parsed part */

  if(*p == '[')
    return HOST_IPV6;

  errno = 0; /* for strtoul */
  for(;;) {
    char *endp = NULL;
    unsigned long num;

    if(!ISDIGIT(*p))
      /* most importantly this does not allow a leading plus or minus */
      return HOST_NAME;
    num = strtoul(p, &endp, 0);
    if(errno)
      return HOST_NAME;
#if SIZEOF_LONG > 4
    /* a value larger than 32 bits */
    if(num > UINT_MAX)
      return HOST_NAME;
#endif

    parts[last] = num;
    p = endp;

    if(!*p)
      break; /* end of string, all parts are collected */

    /* only a dot may follow a number, and at most three of them */
    if((*p != '.') || (last == 3))
      return HOST_NAME;
    last++;
    p++;
  }

  /* the final part fills up all the bits the previous parts left unused */
  if(last == 0) {
    /* a -- 32 bits */
    octet[0] = (unsigned int)(parts[0] >> 24);
    octet[1] = (unsigned int)((parts[0] >> 16) & 0xff);
    octet[2] = (unsigned int)((parts[0] >> 8) & 0xff);
    octet[3] = (unsigned int)(parts[0] & 0xff);
  }
  else if(last == 1) {
    /* a.b -- 8.24 bits */
    if((parts[0] > 0xff) || (parts[1] > 0xffffff))
      return HOST_NAME;
    octet[0] = (unsigned int)(parts[0]);
    octet[1] = (unsigned int)((parts[1] >> 16) & 0xff);
    octet[2] = (unsigned int)((parts[1] >> 8) & 0xff);
    octet[3] = (unsigned int)(parts[1] & 0xff);
  }
  else if(last == 2) {
    /* a.b.c -- 8.8.16 bits */
    if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xffff))
      return HOST_NAME;
    octet[0] = (unsigned int)(parts[0]);
    octet[1] = (unsigned int)(parts[1]);
    octet[2] = (unsigned int)((parts[2] >> 8) & 0xff);
    octet[3] = (unsigned int)(parts[2] & 0xff);
  }
  else {
    /* a.b.c.d -- 8.8.8.8 bits */
    if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xff) ||
       (parts[3] > 0xff))
      return HOST_NAME;
    octet[0] = (unsigned int)(parts[0]);
    octet[1] = (unsigned int)(parts[1]);
    octet[2] = (unsigned int)(parts[2]);
    octet[3] = (unsigned int)(parts[3]);
  }

  /* replace the host with the dotted quad version */
  Curl_dyn_reset(host);
  if(Curl_dyn_addf(host, "%u.%u.%u.%u",
                   octet[0], octet[1], octet[2], octet[3]))
    return HOST_ERROR;
  return HOST_IPV4;
}
760 
761 /* if necessary, replace the host content with a URL decoded version */
urldecode_host(struct dynbuf * host)762 static CURLUcode urldecode_host(struct dynbuf *host)
763 {
764   char *per = NULL;
765   const char *hostname = Curl_dyn_ptr(host);
766   per = strchr(hostname, '%');
767   if(!per)
768     /* nothing to decode */
769     return CURLUE_OK;
770   else {
771     /* encoded */
772     size_t dlen;
773     char *decoded;
774     CURLcode result = Curl_urldecode(hostname, 0, &decoded, &dlen,
775                                      REJECT_CTRL);
776     if(result)
777       return CURLUE_BAD_HOSTNAME;
778     Curl_dyn_reset(host);
779     result = Curl_dyn_addn(host, decoded, dlen);
780     free(decoded);
781     if(result)
782       return cc2cu(result);
783   }
784 
785   return CURLUE_OK;
786 }
787 
parse_authority(struct Curl_URL * u,const char * auth,size_t authlen,unsigned int flags,struct dynbuf * host,bool has_scheme)788 static CURLUcode parse_authority(struct Curl_URL *u,
789                                  const char *auth, size_t authlen,
790                                  unsigned int flags,
791                                  struct dynbuf *host,
792                                  bool has_scheme)
793 {
794   size_t offset;
795   CURLUcode uc;
796   CURLcode result;
797 
798   /*
799    * Parse the login details and strip them out of the hostname.
800    */
801   uc = parse_hostname_login(u, auth, authlen, flags, &offset);
802   if(uc)
803     goto out;
804 
805   result = Curl_dyn_addn(host, auth + offset, authlen - offset);
806   if(result) {
807     uc = cc2cu(result);
808     goto out;
809   }
810 
811   uc = Curl_parse_port(u, host, has_scheme);
812   if(uc)
813     goto out;
814 
815   if(!Curl_dyn_len(host))
816     return CURLUE_NO_HOST;
817 
818   switch(ipv4_normalize(host)) {
819   case HOST_IPV4:
820     break;
821   case HOST_IPV6:
822     uc = ipv6_parse(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
823     break;
824   case HOST_NAME:
825     uc = urldecode_host(host);
826     if(!uc)
827       uc = hostname_check(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
828     break;
829   case HOST_ERROR:
830     uc = CURLUE_OUT_OF_MEMORY;
831     break;
832   default:
833     uc = CURLUE_BAD_HOSTNAME; /* Bad IPv4 address even */
834     break;
835   }
836 
837 out:
838   return uc;
839 }
840 
841 /* used for HTTP/2 server push */
Curl_url_set_authority(CURLU * u,const char * authority)842 CURLUcode Curl_url_set_authority(CURLU *u, const char *authority)
843 {
844   CURLUcode result;
845   struct dynbuf host;
846 
847   DEBUGASSERT(authority);
848   Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
849 
850   result = parse_authority(u, authority, strlen(authority),
851                            CURLU_DISALLOW_USER, &host, !!u->scheme);
852   if(result)
853     Curl_dyn_free(&host);
854   else {
855     free(u->host);
856     u->host = Curl_dyn_ptr(&host);
857   }
858   return result;
859 }
860 
861 /*
862  * "Remove Dot Segments"
863  * https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
864  */
865 
/*
 * dedotdotify()
 * @unittest: 1395
 *
 * This function gets a null-terminated path with dot and dotdot sequences
 * passed in and strips them off according to the rules in RFC 3986 section
 * 5.2.4. 'clen' is the length of the path.
 *
 * The function handles a query part ('?' + stuff) appended but it expects
 * that fragments ('#' + stuff) have already been cut off.
 *
 * RETURNS
 *
 * Zero for success and '*outp' set to an allocated dedotdotified string, or
 * to NULL if the input is shorter than two bytes or has no dot within 'clen'
 * bytes, as there is then nothing to do.
 *
 * Non-zero if the output buffer could not be allocated. '*outp' is then
 * NULL.
 */
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp);
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp)
{
  char *outptr; /* where the next byte goes in the output */
  const char *endp = &input[clen];
  char *out;

  *outp = NULL;
  /* the path always starts with a slash, and a slash has no dot */
  if((clen < 2) || !memchr(input, '.', clen))
    return 0;

  /* the output never gets longer than the input */
  out = malloc(clen + 1);
  if(!out)
    return 1; /* out of memory */

  *out = 0; /* null-terminates, for inputs like "./" */
  outptr = out;

  do {
    /* set to FALSE when no dot rule matched, so that rule E applies */
    bool dotdot = TRUE;
    if(*input == '.') {
      /*  A. If the input buffer begins with a prefix of "../" or "./", then
          remove that prefix from the input buffer; otherwise, */

      if(!strncmp("./", input, 2)) {
        input += 2;
        clen -= 2;
      }
      else if(!strncmp("../", input, 3)) {
        input += 3;
        clen -= 3;
      }
      /*  D. if the input buffer consists only of "." or "..", then remove
          that from the input buffer; otherwise, */

      else if(!strcmp(".", input) || !strcmp("..", input) ||
              !strncmp(".?", input, 2) || !strncmp("..?", input, 3)) {
        *out = 0;
        break;
      }
      else
        dotdot = FALSE;
    }
    else if(*input == '/') {
      /*  B. if the input buffer begins with a prefix of "/./" or "/.", where
          "."  is a complete path segment, then replace that prefix with "/" in
          the input buffer; otherwise, */
      if(!strncmp("/./", input, 3)) {
        input += 2;
        clen -= 2;
      }
      else if(!strcmp("/.", input) || !strncmp("/.?", input, 3)) {
        /* the path ends here, leave a trailing slash in the output */
        *outptr++ = '/';
        *outptr = 0;
        break;
      }

      /*  C. if the input buffer begins with a prefix of "/../" or "/..",
          where ".." is a complete path segment, then replace that prefix with
          "/" in the input buffer and remove the last segment and its
          preceding "/" (if any) from the output buffer; otherwise, */

      else if(!strncmp("/../", input, 4)) {
        input += 3;
        clen -= 3;
        /* remove the last segment from the output buffer */
        while(outptr > out) {
          outptr--;
          if(*outptr == '/')
            break;
        }
        *outptr = 0; /* null-terminate where it stops */
      }
      else if(!strcmp("/..", input) || !strncmp("/..?", input, 4)) {
        /* remove the last segment from the output buffer */
        while(outptr > out) {
          outptr--;
          if(*outptr == '/')
            break;
        }
        /* the path ends here, leave a trailing slash in the output */
        *outptr++ = '/';
        *outptr = 0; /* null-terminate where it stops */
        break;
      }
      else
        dotdot = FALSE;
    }
    else
      dotdot = FALSE;

    if(!dotdot) {
      /*  E. move the first path segment in the input buffer to the end of
          the output buffer, including the initial "/" character (if any) and
          any subsequent characters up to, but not including, the next "/"
          character or the end of the input buffer. */

      /* the copy also stops at a '?', and relies on the null terminator
         rather than on 'clen' to find the end of the input */
      do {
        *outptr++ = *input++;
        clen--;
      } while(*input && (*input != '/') && (*input != '?'));
      *outptr = 0;
    }

    /* continue until end of path */
  } while(input < endp);

  *outp = out;
  return 0; /* success */
}
991 
parseurl(const char * url,CURLU * u,unsigned int flags)992 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
993 {
994   const char *path;
995   size_t pathlen;
996   char *query = NULL;
997   char *fragment = NULL;
998   char schemebuf[MAX_SCHEME_LEN + 1];
999   size_t schemelen = 0;
1000   size_t urllen;
1001   CURLUcode result = CURLUE_OK;
1002   size_t fraglen = 0;
1003   struct dynbuf host;
1004 
1005   DEBUGASSERT(url);
1006 
1007   Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
1008 
1009   result = junkscan(url, &urllen, flags);
1010   if(result)
1011     goto fail;
1012 
1013   schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf),
1014                                    flags & (CURLU_GUESS_SCHEME|
1015                                             CURLU_DEFAULT_SCHEME));
1016 
1017   /* handle the file: scheme */
1018   if(schemelen && !strcmp(schemebuf, "file")) {
1019     bool uncpath = FALSE;
1020     if(urllen <= 6) {
1021       /* file:/ is not enough to actually be a complete file: URL */
1022       result = CURLUE_BAD_FILE_URL;
1023       goto fail;
1024     }
1025 
1026     /* path has been allocated large enough to hold this */
1027     path = (char *)&url[5];
1028     pathlen = urllen - 5;
1029 
1030     u->scheme = strdup("file");
1031     if(!u->scheme) {
1032       result = CURLUE_OUT_OF_MEMORY;
1033       goto fail;
1034     }
1035 
1036     /* Extra handling URLs with an authority component (i.e. that start with
1037      * "file://")
1038      *
1039      * We allow omitted hostname (e.g. file:/<path>) -- valid according to
1040      * RFC 8089, but not the (current) WHAT-WG URL spec.
1041      */
1042     if(path[0] == '/' && path[1] == '/') {
1043       /* swallow the two slashes */
1044       const char *ptr = &path[2];
1045 
1046       /*
1047        * According to RFC 8089, a file: URL can be reliably dereferenced if:
1048        *
1049        *  o it has no/blank hostname, or
1050        *
1051        *  o the hostname matches "localhost" (case-insensitively), or
1052        *
1053        *  o the hostname is a FQDN that resolves to this machine, or
1054        *
1055        *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
1056        *    Appendix E.3).
1057        *
1058        * For brevity, we only consider URLs with empty, "localhost", or
1059        * "127.0.0.1" hostnames as local, otherwise as an UNC String.
1060        *
1061        * Additionally, there is an exception for URLs with a Windows drive
1062        * letter in the authority (which was accidentally omitted from RFC 8089
1063        * Appendix E, but believe me, it was meant to be there. --MK)
1064        */
1065       if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
1066         /* the URL includes a hostname, it must match "localhost" or
1067            "127.0.0.1" to be valid */
1068         if(checkprefix("localhost/", ptr) ||
1069            checkprefix("127.0.0.1/", ptr)) {
1070           ptr += 9; /* now points to the slash after the host */
1071         }
1072         else {
1073 #if defined(_WIN32)
1074           size_t len;
1075 
1076           /* the hostname, NetBIOS computer name, can not contain disallowed
1077              chars, and the delimiting slash character must be appended to the
1078              hostname */
1079           path = strpbrk(ptr, "/\\:*?\"<>|");
1080           if(!path || *path != '/') {
1081             result = CURLUE_BAD_FILE_URL;
1082             goto fail;
1083           }
1084 
1085           len = path - ptr;
1086           if(len) {
1087             CURLcode code = Curl_dyn_addn(&host, ptr, len);
1088             if(code) {
1089               result = cc2cu(code);
1090               goto fail;
1091             }
1092             uncpath = TRUE;
1093           }
1094 
1095           ptr -= 2; /* now points to the // before the host in UNC */
1096 #else
1097           /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
1098              none */
1099           result = CURLUE_BAD_FILE_URL;
1100           goto fail;
1101 #endif
1102         }
1103       }
1104 
1105       path = ptr;
1106       pathlen = urllen - (ptr - url);
1107     }
1108 
1109     if(!uncpath)
1110       /* no host for file: URLs by default */
1111       Curl_dyn_reset(&host);
1112 
1113 #if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
1114     /* Do not allow Windows drive letters when not in Windows.
1115      * This catches both "file:/c:" and "file:c:" */
1116     if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
1117        STARTS_WITH_URL_DRIVE_PREFIX(path)) {
1118       /* File drive letters are only accepted in MS-DOS/Windows */
1119       result = CURLUE_BAD_FILE_URL;
1120       goto fail;
1121     }
1122 #else
1123     /* If the path starts with a slash and a drive letter, ditch the slash */
1124     if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
1125       /* This cannot be done with strcpy, as the memory chunks overlap! */
1126       path++;
1127       pathlen--;
1128     }
1129 #endif
1130 
1131   }
1132   else {
1133     /* clear path */
1134     const char *schemep = NULL;
1135     const char *hostp;
1136     size_t hostlen;
1137 
1138     if(schemelen) {
1139       int i = 0;
1140       const char *p = &url[schemelen + 1];
1141       while((*p == '/') && (i < 4)) {
1142         p++;
1143         i++;
1144       }
1145 
1146       schemep = schemebuf;
1147       if(!Curl_get_scheme_handler(schemep) &&
1148          !(flags & CURLU_NON_SUPPORT_SCHEME)) {
1149         result = CURLUE_UNSUPPORTED_SCHEME;
1150         goto fail;
1151       }
1152 
1153       if((i < 1) || (i > 3)) {
1154         /* less than one or more than three slashes */
1155         result = CURLUE_BAD_SLASHES;
1156         goto fail;
1157       }
1158       hostp = p; /* hostname starts here */
1159     }
1160     else {
1161       /* no scheme! */
1162 
1163       if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME))) {
1164         result = CURLUE_BAD_SCHEME;
1165         goto fail;
1166       }
1167       if(flags & CURLU_DEFAULT_SCHEME)
1168         schemep = DEFAULT_SCHEME;
1169 
1170       /*
1171        * The URL was badly formatted, let's try without scheme specified.
1172        */
1173       hostp = url;
1174     }
1175 
1176     if(schemep) {
1177       u->scheme = strdup(schemep);
1178       if(!u->scheme) {
1179         result = CURLUE_OUT_OF_MEMORY;
1180         goto fail;
1181       }
1182     }
1183 
1184     /* find the end of the hostname + port number */
1185     hostlen = strcspn(hostp, "/?#");
1186     path = &hostp[hostlen];
1187 
1188     /* this pathlen also contains the query and the fragment */
1189     pathlen = urllen - (path - url);
1190     if(hostlen) {
1191 
1192       result = parse_authority(u, hostp, hostlen, flags, &host, schemelen);
1193       if(result)
1194         goto fail;
1195 
1196       if((flags & CURLU_GUESS_SCHEME) && !schemep) {
1197         const char *hostname = Curl_dyn_ptr(&host);
1198         /* legacy curl-style guess based on hostname */
1199         if(checkprefix("ftp.", hostname))
1200           schemep = "ftp";
1201         else if(checkprefix("dict.", hostname))
1202           schemep = "dict";
1203         else if(checkprefix("ldap.", hostname))
1204           schemep = "ldap";
1205         else if(checkprefix("imap.", hostname))
1206           schemep = "imap";
1207         else if(checkprefix("smtp.", hostname))
1208           schemep = "smtp";
1209         else if(checkprefix("pop3.", hostname))
1210           schemep = "pop3";
1211         else
1212           schemep = "http";
1213 
1214         u->scheme = strdup(schemep);
1215         if(!u->scheme) {
1216           result = CURLUE_OUT_OF_MEMORY;
1217           goto fail;
1218         }
1219         u->guessed_scheme = TRUE;
1220       }
1221     }
1222     else if(flags & CURLU_NO_AUTHORITY) {
1223       /* allowed to be empty. */
1224       if(Curl_dyn_add(&host, "")) {
1225         result = CURLUE_OUT_OF_MEMORY;
1226         goto fail;
1227       }
1228     }
1229     else {
1230       result = CURLUE_NO_HOST;
1231       goto fail;
1232     }
1233   }
1234 
1235   fragment = strchr(path, '#');
1236   if(fragment) {
1237     fraglen = pathlen - (fragment - path);
1238     u->fragment_present = TRUE;
1239     if(fraglen > 1) {
1240       /* skip the leading '#' in the copy but include the terminating null */
1241       if(flags & CURLU_URLENCODE) {
1242         struct dynbuf enc;
1243         Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1244         result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
1245         if(result)
1246           goto fail;
1247         u->fragment = Curl_dyn_ptr(&enc);
1248       }
1249       else {
1250         u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
1251         if(!u->fragment) {
1252           result = CURLUE_OUT_OF_MEMORY;
1253           goto fail;
1254         }
1255       }
1256     }
1257     /* after this, pathlen still contains the query */
1258     pathlen -= fraglen;
1259   }
1260 
1261   query = memchr(path, '?', pathlen);
1262   if(query) {
1263     size_t qlen = fragment ? (size_t)(fragment - query) :
1264       pathlen - (query - path);
1265     pathlen -= qlen;
1266     u->query_present = TRUE;
1267     if(qlen > 1) {
1268       if(flags & CURLU_URLENCODE) {
1269         struct dynbuf enc;
1270         Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1271         /* skip the leading question mark */
1272         result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
1273         if(result)
1274           goto fail;
1275         u->query = Curl_dyn_ptr(&enc);
1276       }
1277       else {
1278         u->query = Curl_memdup0(query + 1, qlen - 1);
1279         if(!u->query) {
1280           result = CURLUE_OUT_OF_MEMORY;
1281           goto fail;
1282         }
1283       }
1284     }
1285     else {
1286       /* single byte query */
1287       u->query = strdup("");
1288       if(!u->query) {
1289         result = CURLUE_OUT_OF_MEMORY;
1290         goto fail;
1291       }
1292     }
1293   }
1294 
1295   if(pathlen && (flags & CURLU_URLENCODE)) {
1296     struct dynbuf enc;
1297     Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1298     result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
1299     if(result)
1300       goto fail;
1301     pathlen = Curl_dyn_len(&enc);
1302     path = u->path = Curl_dyn_ptr(&enc);
1303   }
1304 
1305   if(pathlen <= 1) {
1306     /* there is no path left or just the slash, unset */
1307     path = NULL;
1308   }
1309   else {
1310     if(!u->path) {
1311       u->path = Curl_memdup0(path, pathlen);
1312       if(!u->path) {
1313         result = CURLUE_OUT_OF_MEMORY;
1314         goto fail;
1315       }
1316       path = u->path;
1317     }
1318     else if(flags & CURLU_URLENCODE)
1319       /* it might have encoded more than just the path so cut it */
1320       u->path[pathlen] = 0;
1321 
1322     if(!(flags & CURLU_PATH_AS_IS)) {
1323       /* remove ../ and ./ sequences according to RFC3986 */
1324       char *dedot;
1325       int err = dedotdotify((char *)path, pathlen, &dedot);
1326       if(err) {
1327         result = CURLUE_OUT_OF_MEMORY;
1328         goto fail;
1329       }
1330       if(dedot) {
1331         free(u->path);
1332         u->path = dedot;
1333       }
1334     }
1335   }
1336 
1337   u->host = Curl_dyn_ptr(&host);
1338 
1339   return result;
1340 fail:
1341   Curl_dyn_free(&host);
1342   free_urlhandle(u);
1343   return result;
1344 }
1345 
1346 /*
1347  * Parse the URL and, if successful, replace everything in the Curl_URL struct.
1348  */
parseurl_and_replace(const char * url,CURLU * u,unsigned int flags)1349 static CURLUcode parseurl_and_replace(const char *url, CURLU *u,
1350                                       unsigned int flags)
1351 {
1352   CURLUcode result;
1353   CURLU tmpurl;
1354   memset(&tmpurl, 0, sizeof(tmpurl));
1355   result = parseurl(url, &tmpurl, flags);
1356   if(!result) {
1357     free_urlhandle(u);
1358     *u = tmpurl;
1359   }
1360   return result;
1361 }
1362 
1363 /*
1364  */
curl_url(void)1365 CURLU *curl_url(void)
1366 {
1367   return calloc(1, sizeof(struct Curl_URL));
1368 }
1369 
/*
 * curl_url_cleanup() releases all parts held by the handle and then the
 * handle itself. Passing NULL is a no-op.
 */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
1377 
1378 #define DUP(dest, src, name)                    \
1379   do {                                          \
1380     if(src->name) {                             \
1381       dest->name = strdup(src->name);           \
1382       if(!dest->name)                           \
1383         goto fail;                              \
1384     }                                           \
1385   } while(0)
1386 
curl_url_dup(const CURLU * in)1387 CURLU *curl_url_dup(const CURLU *in)
1388 {
1389   struct Curl_URL *u = calloc(1, sizeof(struct Curl_URL));
1390   if(u) {
1391     DUP(u, in, scheme);
1392     DUP(u, in, user);
1393     DUP(u, in, password);
1394     DUP(u, in, options);
1395     DUP(u, in, host);
1396     DUP(u, in, port);
1397     DUP(u, in, path);
1398     DUP(u, in, query);
1399     DUP(u, in, fragment);
1400     DUP(u, in, zoneid);
1401     u->portnum = in->portnum;
1402     u->fragment_present = in->fragment_present;
1403     u->query_present = in->query_present;
1404   }
1405   return u;
1406 fail:
1407   curl_url_cleanup(u);
1408   return NULL;
1409 }
1410 
curl_url_get(const CURLU * u,CURLUPart what,char ** part,unsigned int flags)1411 CURLUcode curl_url_get(const CURLU *u, CURLUPart what,
1412                        char **part, unsigned int flags)
1413 {
1414   const char *ptr;
1415   CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
1416   char portbuf[7];
1417   bool urldecode = (flags & CURLU_URLDECODE) ? 1 : 0;
1418   bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0;
1419   bool punycode = FALSE;
1420   bool depunyfy = FALSE;
1421   bool plusdecode = FALSE;
1422   (void)flags;
1423   if(!u)
1424     return CURLUE_BAD_HANDLE;
1425   if(!part)
1426     return CURLUE_BAD_PARTPOINTER;
1427   *part = NULL;
1428 
1429   switch(what) {
1430   case CURLUPART_SCHEME:
1431     ptr = u->scheme;
1432     ifmissing = CURLUE_NO_SCHEME;
1433     urldecode = FALSE; /* never for schemes */
1434     if((flags & CURLU_NO_GUESS_SCHEME) && u->guessed_scheme)
1435       return CURLUE_NO_SCHEME;
1436     break;
1437   case CURLUPART_USER:
1438     ptr = u->user;
1439     ifmissing = CURLUE_NO_USER;
1440     break;
1441   case CURLUPART_PASSWORD:
1442     ptr = u->password;
1443     ifmissing = CURLUE_NO_PASSWORD;
1444     break;
1445   case CURLUPART_OPTIONS:
1446     ptr = u->options;
1447     ifmissing = CURLUE_NO_OPTIONS;
1448     break;
1449   case CURLUPART_HOST:
1450     ptr = u->host;
1451     ifmissing = CURLUE_NO_HOST;
1452     punycode = (flags & CURLU_PUNYCODE) ? 1 : 0;
1453     depunyfy = (flags & CURLU_PUNY2IDN) ? 1 : 0;
1454     break;
1455   case CURLUPART_ZONEID:
1456     ptr = u->zoneid;
1457     ifmissing = CURLUE_NO_ZONEID;
1458     break;
1459   case CURLUPART_PORT:
1460     ptr = u->port;
1461     ifmissing = CURLUE_NO_PORT;
1462     urldecode = FALSE; /* never for port */
1463     if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
1464       /* there is no stored port number, but asked to deliver
1465          a default one for the scheme */
1466       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1467       if(h) {
1468         msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1469         ptr = portbuf;
1470       }
1471     }
1472     else if(ptr && u->scheme) {
1473       /* there is a stored port number, but ask to inhibit if
1474          it matches the default one for the scheme */
1475       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1476       if(h && (h->defport == u->portnum) &&
1477          (flags & CURLU_NO_DEFAULT_PORT))
1478         ptr = NULL;
1479     }
1480     break;
1481   case CURLUPART_PATH:
1482     ptr = u->path;
1483     if(!ptr)
1484       ptr = "/";
1485     break;
1486   case CURLUPART_QUERY:
1487     ptr = u->query;
1488     ifmissing = CURLUE_NO_QUERY;
1489     plusdecode = urldecode;
1490     if(ptr && !ptr[0] && !(flags & CURLU_GET_EMPTY))
1491       /* there was a blank query and the user do not ask for it */
1492       ptr = NULL;
1493     break;
1494   case CURLUPART_FRAGMENT:
1495     ptr = u->fragment;
1496     ifmissing = CURLUE_NO_FRAGMENT;
1497     if(!ptr && u->fragment_present && flags & CURLU_GET_EMPTY)
1498       /* there was a blank fragment and the user asks for it */
1499       ptr = "";
1500     break;
1501   case CURLUPART_URL: {
1502     char *url;
1503     char *scheme;
1504     char *options = u->options;
1505     char *port = u->port;
1506     char *allochost = NULL;
1507     bool show_fragment =
1508       u->fragment || (u->fragment_present && flags & CURLU_GET_EMPTY);
1509     bool show_query =
1510       (u->query && u->query[0]) ||
1511       (u->query_present && flags & CURLU_GET_EMPTY);
1512     punycode = (flags & CURLU_PUNYCODE) ? 1 : 0;
1513     depunyfy = (flags & CURLU_PUNY2IDN) ? 1 : 0;
1514     if(u->scheme && strcasecompare("file", u->scheme)) {
1515       url = aprintf("file://%s%s%s",
1516                     u->path,
1517                     show_fragment ? "#": "",
1518                     u->fragment ? u->fragment : "");
1519     }
1520     else if(!u->host)
1521       return CURLUE_NO_HOST;
1522     else {
1523       const struct Curl_handler *h = NULL;
1524       char schemebuf[MAX_SCHEME_LEN + 5];
1525       if(u->scheme)
1526         scheme = u->scheme;
1527       else if(flags & CURLU_DEFAULT_SCHEME)
1528         scheme = (char *) DEFAULT_SCHEME;
1529       else
1530         return CURLUE_NO_SCHEME;
1531 
1532       h = Curl_get_scheme_handler(scheme);
1533       if(!port && (flags & CURLU_DEFAULT_PORT)) {
1534         /* there is no stored port number, but asked to deliver
1535            a default one for the scheme */
1536         if(h) {
1537           msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1538           port = portbuf;
1539         }
1540       }
1541       else if(port) {
1542         /* there is a stored port number, but asked to inhibit if it matches
1543            the default one for the scheme */
1544         if(h && (h->defport == u->portnum) &&
1545            (flags & CURLU_NO_DEFAULT_PORT))
1546           port = NULL;
1547       }
1548 
1549       if(h && !(h->flags & PROTOPT_URLOPTIONS))
1550         options = NULL;
1551 
1552       if(u->host[0] == '[') {
1553         if(u->zoneid) {
1554           /* make it '[ host %25 zoneid ]' */
1555           struct dynbuf enc;
1556           size_t hostlen = strlen(u->host);
1557           Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1558           if(Curl_dyn_addf(&enc, "%.*s%%25%s]", (int)hostlen - 1, u->host,
1559                            u->zoneid))
1560             return CURLUE_OUT_OF_MEMORY;
1561           allochost = Curl_dyn_ptr(&enc);
1562         }
1563       }
1564       else if(urlencode) {
1565         allochost = curl_easy_escape(NULL, u->host, 0);
1566         if(!allochost)
1567           return CURLUE_OUT_OF_MEMORY;
1568       }
1569       else if(punycode) {
1570         if(!Curl_is_ASCII_name(u->host)) {
1571 #ifndef USE_IDN
1572           return CURLUE_LACKS_IDN;
1573 #else
1574           CURLcode result = Curl_idn_decode(u->host, &allochost);
1575           if(result)
1576             return (result == CURLE_OUT_OF_MEMORY) ?
1577               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1578 #endif
1579         }
1580       }
1581       else if(depunyfy) {
1582         if(Curl_is_ASCII_name(u->host) && !strncmp("xn--", u->host, 4)) {
1583 #ifndef USE_IDN
1584           return CURLUE_LACKS_IDN;
1585 #else
1586           CURLcode result = Curl_idn_encode(u->host, &allochost);
1587           if(result)
1588             /* this is the most likely error */
1589             return (result == CURLE_OUT_OF_MEMORY) ?
1590               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1591 #endif
1592         }
1593       }
1594 
1595       if(!(flags & CURLU_NO_GUESS_SCHEME) || !u->guessed_scheme)
1596         msnprintf(schemebuf, sizeof(schemebuf), "%s://", scheme);
1597       else
1598         schemebuf[0] = 0;
1599 
1600       url = aprintf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1601                     schemebuf,
1602                     u->user ? u->user : "",
1603                     u->password ? ":": "",
1604                     u->password ? u->password : "",
1605                     options ? ";" : "",
1606                     options ? options : "",
1607                     (u->user || u->password || options) ? "@": "",
1608                     allochost ? allochost : u->host,
1609                     port ? ":": "",
1610                     port ? port : "",
1611                     u->path ? u->path : "/",
1612                     show_query ? "?": "",
1613                     u->query ? u->query : "",
1614                     show_fragment ? "#": "",
1615                     u->fragment ? u->fragment : "");
1616       free(allochost);
1617     }
1618     if(!url)
1619       return CURLUE_OUT_OF_MEMORY;
1620     *part = url;
1621     return CURLUE_OK;
1622   }
1623   default:
1624     ptr = NULL;
1625     break;
1626   }
1627   if(ptr) {
1628     size_t partlen = strlen(ptr);
1629     size_t i = 0;
1630     *part = Curl_memdup0(ptr, partlen);
1631     if(!*part)
1632       return CURLUE_OUT_OF_MEMORY;
1633     if(plusdecode) {
1634       /* convert + to space */
1635       char *plus = *part;
1636       for(i = 0; i < partlen; ++plus, i++) {
1637         if(*plus == '+')
1638           *plus = ' ';
1639       }
1640     }
1641     if(urldecode) {
1642       char *decoded;
1643       size_t dlen;
1644       /* this unconditional rejection of control bytes is documented
1645          API behavior */
1646       CURLcode res = Curl_urldecode(*part, 0, &decoded, &dlen, REJECT_CTRL);
1647       free(*part);
1648       if(res) {
1649         *part = NULL;
1650         return CURLUE_URLDECODE;
1651       }
1652       *part = decoded;
1653       partlen = dlen;
1654     }
1655     if(urlencode) {
1656       struct dynbuf enc;
1657       CURLUcode uc;
1658       Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1659       uc = urlencode_str(&enc, *part, partlen, TRUE, what == CURLUPART_QUERY);
1660       if(uc)
1661         return uc;
1662       free(*part);
1663       *part = Curl_dyn_ptr(&enc);
1664     }
1665     else if(punycode) {
1666       if(!Curl_is_ASCII_name(u->host)) {
1667 #ifndef USE_IDN
1668         return CURLUE_LACKS_IDN;
1669 #else
1670         char *allochost;
1671         CURLcode result = Curl_idn_decode(*part, &allochost);
1672         if(result)
1673           return (result == CURLE_OUT_OF_MEMORY) ?
1674             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1675         free(*part);
1676         *part = allochost;
1677 #endif
1678       }
1679     }
1680     else if(depunyfy) {
1681       if(Curl_is_ASCII_name(u->host)  && !strncmp("xn--", u->host, 4)) {
1682 #ifndef USE_IDN
1683         return CURLUE_LACKS_IDN;
1684 #else
1685         char *allochost;
1686         CURLcode result = Curl_idn_encode(*part, &allochost);
1687         if(result)
1688           return (result == CURLE_OUT_OF_MEMORY) ?
1689             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1690         free(*part);
1691         *part = allochost;
1692 #endif
1693       }
1694     }
1695 
1696     return CURLUE_OK;
1697   }
1698   else
1699     return ifmissing;
1700 }
1701 
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1702 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1703                        const char *part, unsigned int flags)
1704 {
1705   char **storep = NULL;
1706   bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0;
1707   bool plusencode = FALSE;
1708   bool urlskipslash = FALSE;
1709   bool leadingslash = FALSE;
1710   bool appendquery = FALSE;
1711   bool equalsencode = FALSE;
1712   size_t nalloc;
1713 
1714   if(!u)
1715     return CURLUE_BAD_HANDLE;
1716   if(!part) {
1717     /* setting a part to NULL clears it */
1718     switch(what) {
1719     case CURLUPART_URL:
1720       break;
1721     case CURLUPART_SCHEME:
1722       storep = &u->scheme;
1723       u->guessed_scheme = FALSE;
1724       break;
1725     case CURLUPART_USER:
1726       storep = &u->user;
1727       break;
1728     case CURLUPART_PASSWORD:
1729       storep = &u->password;
1730       break;
1731     case CURLUPART_OPTIONS:
1732       storep = &u->options;
1733       break;
1734     case CURLUPART_HOST:
1735       storep = &u->host;
1736       break;
1737     case CURLUPART_ZONEID:
1738       storep = &u->zoneid;
1739       break;
1740     case CURLUPART_PORT:
1741       u->portnum = 0;
1742       storep = &u->port;
1743       break;
1744     case CURLUPART_PATH:
1745       storep = &u->path;
1746       break;
1747     case CURLUPART_QUERY:
1748       storep = &u->query;
1749       u->query_present = FALSE;
1750       break;
1751     case CURLUPART_FRAGMENT:
1752       storep = &u->fragment;
1753       u->fragment_present = FALSE;
1754       break;
1755     default:
1756       return CURLUE_UNKNOWN_PART;
1757     }
1758     if(storep && *storep) {
1759       Curl_safefree(*storep);
1760     }
1761     else if(!storep) {
1762       free_urlhandle(u);
1763       memset(u, 0, sizeof(struct Curl_URL));
1764     }
1765     return CURLUE_OK;
1766   }
1767 
1768   nalloc = strlen(part);
1769   if(nalloc > CURL_MAX_INPUT_LENGTH)
1770     /* excessive input length */
1771     return CURLUE_MALFORMED_INPUT;
1772 
1773   switch(what) {
1774   case CURLUPART_SCHEME: {
1775     size_t plen = strlen(part);
1776     const char *s = part;
1777     if((plen > MAX_SCHEME_LEN) || (plen < 1))
1778       /* too long or too short */
1779       return CURLUE_BAD_SCHEME;
1780    /* verify that it is a fine scheme */
1781     if(!(flags & CURLU_NON_SUPPORT_SCHEME) && !Curl_get_scheme_handler(part))
1782       return CURLUE_UNSUPPORTED_SCHEME;
1783     storep = &u->scheme;
1784     urlencode = FALSE; /* never */
1785     if(ISALPHA(*s)) {
1786       /* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
1787       while(--plen) {
1788         if(ISALNUM(*s) || (*s == '+') || (*s == '-') || (*s == '.'))
1789           s++; /* fine */
1790         else
1791           return CURLUE_BAD_SCHEME;
1792       }
1793     }
1794     else
1795       return CURLUE_BAD_SCHEME;
1796     u->guessed_scheme = FALSE;
1797     break;
1798   }
1799   case CURLUPART_USER:
1800     storep = &u->user;
1801     break;
1802   case CURLUPART_PASSWORD:
1803     storep = &u->password;
1804     break;
1805   case CURLUPART_OPTIONS:
1806     storep = &u->options;
1807     break;
1808   case CURLUPART_HOST:
1809     storep = &u->host;
1810     Curl_safefree(u->zoneid);
1811     break;
1812   case CURLUPART_ZONEID:
1813     storep = &u->zoneid;
1814     break;
1815   case CURLUPART_PORT:
1816     if(!ISDIGIT(part[0]))
1817       /* not a number */
1818       return CURLUE_BAD_PORT_NUMBER;
1819     else {
1820       char *tmp;
1821       char *endp;
1822       unsigned long port;
1823       errno = 0;
1824       port = strtoul(part, &endp, 10);  /* must be decimal */
1825       if(errno || (port > 0xffff) || *endp)
1826         /* weirdly provided number, not good! */
1827         return CURLUE_BAD_PORT_NUMBER;
1828       tmp = strdup(part);
1829       if(!tmp)
1830         return CURLUE_OUT_OF_MEMORY;
1831       free(u->port);
1832       u->port = tmp;
1833       u->portnum = (unsigned short)port;
1834       return CURLUE_OK;
1835     }
1836   case CURLUPART_PATH:
1837     urlskipslash = TRUE;
1838     leadingslash = TRUE; /* enforce */
1839     storep = &u->path;
1840     break;
1841   case CURLUPART_QUERY:
1842     plusencode = urlencode;
1843     appendquery = (flags & CURLU_APPENDQUERY) ? 1 : 0;
1844     equalsencode = appendquery;
1845     storep = &u->query;
1846     u->query_present = TRUE;
1847     break;
1848   case CURLUPART_FRAGMENT:
1849     storep = &u->fragment;
1850     u->fragment_present = TRUE;
1851     break;
1852   case CURLUPART_URL: {
1853     /*
1854      * Allow a new URL to replace the existing (if any) contents.
1855      *
1856      * If the existing contents is enough for a URL, allow a relative URL to
1857      * replace it.
1858      */
1859     CURLcode result;
1860     CURLUcode uc;
1861     char *oldurl;
1862     char *redired_url;
1863 
1864     if(!nalloc)
1865       /* a blank URL is not a valid URL */
1866       return CURLUE_MALFORMED_INPUT;
1867 
1868     /* if the new thing is absolute or the old one is not
1869      * (we could not get an absolute URL in 'oldurl'),
1870      * then replace the existing with the new. */
1871     if(Curl_is_absolute_url(part, NULL, 0,
1872                             flags & (CURLU_GUESS_SCHEME|
1873                                      CURLU_DEFAULT_SCHEME))
1874        || curl_url_get(u, CURLUPART_URL, &oldurl, flags)) {
1875       return parseurl_and_replace(part, u, flags);
1876     }
1877 
1878     /* apply the relative part to create a new URL
1879      * and replace the existing one with it. */
1880     result = concat_url(oldurl, part, &redired_url);
1881     free(oldurl);
1882     if(result)
1883       return cc2cu(result);
1884 
1885     uc = parseurl_and_replace(redired_url, u, flags);
1886     free(redired_url);
1887     return uc;
1888   }
1889   default:
1890     return CURLUE_UNKNOWN_PART;
1891   }
1892   DEBUGASSERT(storep);
1893   {
1894     const char *newp;
1895     struct dynbuf enc;
1896     Curl_dyn_init(&enc, nalloc * 3 + 1 + leadingslash);
1897 
1898     if(leadingslash && (part[0] != '/')) {
1899       CURLcode result = Curl_dyn_addn(&enc, "/", 1);
1900       if(result)
1901         return cc2cu(result);
1902     }
1903     if(urlencode) {
1904       const unsigned char *i;
1905 
1906       for(i = (const unsigned char *)part; *i; i++) {
1907         CURLcode result;
1908         if((*i == ' ') && plusencode) {
1909           result = Curl_dyn_addn(&enc, "+", 1);
1910           if(result)
1911             return CURLUE_OUT_OF_MEMORY;
1912         }
1913         else if(ISUNRESERVED(*i) ||
1914                 ((*i == '/') && urlskipslash) ||
1915                 ((*i == '=') && equalsencode)) {
1916           if((*i == '=') && equalsencode)
1917             /* only skip the first equals sign */
1918             equalsencode = FALSE;
1919           result = Curl_dyn_addn(&enc, i, 1);
1920           if(result)
1921             return cc2cu(result);
1922         }
1923         else {
1924           char out[3]={'%'};
1925           out[1] = hexdigits[*i >> 4];
1926           out[2] = hexdigits[*i & 0xf];
1927           result = Curl_dyn_addn(&enc, out, 3);
1928           if(result)
1929             return cc2cu(result);
1930         }
1931       }
1932     }
1933     else {
1934       char *p;
1935       CURLcode result = Curl_dyn_add(&enc, part);
1936       if(result)
1937         return cc2cu(result);
1938       p = Curl_dyn_ptr(&enc);
1939       while(*p) {
1940         /* make sure percent encoded are lower case */
1941         if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1942            (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1943           p[1] = Curl_raw_tolower(p[1]);
1944           p[2] = Curl_raw_tolower(p[2]);
1945           p += 3;
1946         }
1947         else
1948           p++;
1949       }
1950     }
1951     newp = Curl_dyn_ptr(&enc);
1952 
1953     if(appendquery && newp) {
1954       /* Append the 'newp' string onto the old query. Add a '&' separator if
1955          none is present at the end of the existing query already */
1956 
1957       size_t querylen = u->query ? strlen(u->query) : 0;
1958       bool addamperand = querylen && (u->query[querylen -1] != '&');
1959       if(querylen) {
1960         struct dynbuf qbuf;
1961         Curl_dyn_init(&qbuf, CURL_MAX_INPUT_LENGTH);
1962 
1963         if(Curl_dyn_addn(&qbuf, u->query, querylen)) /* add original query */
1964           goto nomem;
1965 
1966         if(addamperand) {
1967           if(Curl_dyn_addn(&qbuf, "&", 1))
1968             goto nomem;
1969         }
1970         if(Curl_dyn_add(&qbuf, newp))
1971           goto nomem;
1972         Curl_dyn_free(&enc);
1973         free(*storep);
1974         *storep = Curl_dyn_ptr(&qbuf);
1975         return CURLUE_OK;
1976 nomem:
1977         Curl_dyn_free(&enc);
1978         return CURLUE_OUT_OF_MEMORY;
1979       }
1980     }
1981 
1982     else if(what == CURLUPART_HOST) {
1983       size_t n = Curl_dyn_len(&enc);
1984       if(!n && (flags & CURLU_NO_AUTHORITY)) {
1985         /* Skip hostname check, it is allowed to be empty. */
1986       }
1987       else {
1988         bool bad = FALSE;
1989         if(!n)
1990           bad = TRUE; /* empty hostname is not okay */
1991         else if(!urlencode) {
1992           /* if the host name part was not URL encoded here, it was set ready
1993              URL encoded so we need to decode it to check */
1994           size_t dlen;
1995           char *decoded = NULL;
1996           CURLcode result =
1997             Curl_urldecode(newp, n, &decoded, &dlen, REJECT_CTRL);
1998           if(result || hostname_check(u, decoded, dlen))
1999             bad = TRUE;
2000           free(decoded);
2001         }
2002         else if(hostname_check(u, (char *)newp, n))
2003           bad = TRUE;
2004         if(bad) {
2005           Curl_dyn_free(&enc);
2006           return CURLUE_BAD_HOSTNAME;
2007         }
2008       }
2009     }
2010 
2011     free(*storep);
2012     *storep = (char *)newp;
2013   }
2014   return CURLUE_OK;
2015 }
2016