xref: /curl/lib/urlapi.c (revision fe17c162)
1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  * SPDX-License-Identifier: curl
22  *
23  ***************************************************************************/
24 
25 #include "curl_setup.h"
26 
27 #include "urldata.h"
28 #include "urlapi-int.h"
29 #include "strcase.h"
30 #include "url.h"
31 #include "escape.h"
32 #include "curl_ctype.h"
33 #include "inet_pton.h"
34 #include "inet_ntop.h"
35 #include "strdup.h"
36 #include "idn.h"
37 #include "curl_memrchr.h"
38 
39 /* The last 3 #include files should be in this order */
40 #include "curl_printf.h"
41 #include "curl_memory.h"
42 #include "memdebug.h"
43 
/* MSDOS/Windows style drive prefix, eg c: in c:foo
 *
 * Evaluates to non-zero when 'str' starts with an ASCII letter followed by a
 * colon. The argument is evaluated several times, so it must be free of side
 * effects. It is parenthesized in the expansion so that pointer expressions
 * such as 'p + 1' are indexed correctly. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
49 
/* Drive prefix as it may appear inside a URL path: an ASCII letter, then
 * either ':' or '|', then a slash, a backslash or the end of the string.
 * The argument is evaluated several times and must have no side effects. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  ((((str)[0] >= 'a' && (str)[0] <= 'z') || \
    ((str)[0] >= 'A' && (str)[0] <= 'Z')) && \
   ((str)[1] == ':' || (str)[1] == '|') && \
   ((str)[2] == '/' || (str)[2] == '\\' || (str)[2] == '\0'))
57 
/* The scheme is never URL encoded. Curl_is_absolute_url() accepts a scheme
   of at most this many characters, and a caller-provided scheme buffer must
   be larger than this value. */
#define MAX_SCHEME_LEN 40
60 
/*
 * If USE_IPV6 is disabled, we still want to parse IPv6 addresses, so make
 * sure we have _some_ value for AF_INET6 without polluting our fake value
 * everywhere. ipv6_parse() hands this value to Curl_inet_pton() and
 * Curl_inet_ntop() when it validates a bracketed host.
 */
#if !defined(USE_IPV6) && !defined(AF_INET6)
#define AF_INET6 (AF_INET + 1)
#endif
69 
/* Internal representation of CURLU. Point to URL-encoded strings. Every
   string member below is released by free_urlhandle(). */
struct Curl_URL {
  char *scheme;
  char *user;     /* from the login part of the authority, if any */
  char *password; /* from the login part of the authority, if any */
  char *options; /* IMAP only? Only parsed when the scheme handler has
                    PROTOPT_URLOPTIONS set */
  char *host;
  char *zoneid; /* for numerical IPv6 addresses */
  char *port;   /* decimal string regenerated from the parsed number */
  char *path;
  char *query;
  char *fragment;
  unsigned short portnum; /* the numerical version (if 'port' is set) */
  BIT(query_present);    /* to support blank */
  BIT(fragment_present); /* to support blank */
};
86 
/* NOTE(review): not referenced in the visible part of this file; presumably
   the scheme assumed for scheme-less URLs when CURLU_DEFAULT_SCHEME is set -
   confirm at the use site */
#define DEFAULT_SCHEME "https"
88 
free_urlhandle(struct Curl_URL * u)89 static void free_urlhandle(struct Curl_URL *u)
90 {
91   free(u->scheme);
92   free(u->user);
93   free(u->password);
94   free(u->options);
95   free(u->host);
96   free(u->zoneid);
97   free(u->port);
98   free(u->path);
99   free(u->query);
100   free(u->fragment);
101 }
102 
/*
 * Return a pointer to where the host name ends in 'url': the first '/' or
 * '?' found after the initial "//" (or from the start of the string when
 * there is no "//"), whichever comes first. This covers URLs such as
 * http://www.example.com?id=2380 that lack a slash after the host.
 *
 * When neither character is present, the returned pointer is the
 * terminating null byte of 'url'.
 */
static const char *find_host_sep(const char *url)
{
  const char *end = url + strlen(url);
  const char *host = strstr(url, "//");
  const char *slash;
  const char *qmark;

  /* the host name begins right after the double slash, if there is one */
  host = host ? host + 2 : url;

  slash = strchr(host, '/');
  qmark = strchr(host, '?');

  if(!slash)
    slash = end;
  if(!qmark)
    qmark = end;

  /* the two can only be equal when both point to the end of the string */
  return (qmark <= slash) ? qmark : slash;
}
130 
/* convert CURLcode to CURLUcode: CURLE_TOO_LARGE becomes CURLUE_TOO_LARGE
   and every other value, CURLE_OK included, becomes CURLUE_OUT_OF_MEMORY, so
   only use this on a result that is known to be an error */
#define cc2cu(x) ((x) == CURLE_TOO_LARGE ? CURLUE_TOO_LARGE :   \
                  CURLUE_OUT_OF_MEMORY)
/*
 * Decide whether a character in a URL must be escaped: true for a byte that
 * is neither a control character, whitespace nor a graphic character as
 * judged by the ISCNTRL(), ISSPACE() and ISGRAPH() macros.
 */
#define urlchar_needs_escaping(c) (!(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c)))

/* lowercase hex digits, indexed with a nibble value when percent-encoding */
static const char hexdigits[] = "0123456789abcdef";
139 static const char hexdigits[] = "0123456789abcdef";
140 /* urlencode_str() writes data into an output dynbuf and URL-encodes the
141  * spaces in the source URL accordingly.
142  *
143  * URL encoding should be skipped for host names, otherwise IDN resolution
144  * will fail.
145  */
urlencode_str(struct dynbuf * o,const char * url,size_t len,bool relative,bool query)146 static CURLUcode urlencode_str(struct dynbuf *o, const char *url,
147                                size_t len, bool relative,
148                                bool query)
149 {
150   /* we must add this with whitespace-replacing */
151   bool left = !query;
152   const unsigned char *iptr;
153   const unsigned char *host_sep = (const unsigned char *) url;
154   CURLcode result;
155 
156   if(!relative)
157     host_sep = (const unsigned char *) find_host_sep(url);
158 
159   for(iptr = (unsigned char *)url;    /* read from here */
160       len; iptr++, len--) {
161 
162     if(iptr < host_sep) {
163       result = Curl_dyn_addn(o, iptr, 1);
164       if(result)
165         return cc2cu(result);
166       continue;
167     }
168 
169     if(*iptr == ' ') {
170       if(left)
171         result = Curl_dyn_addn(o, "%20", 3);
172       else
173         result = Curl_dyn_addn(o, "+", 1);
174       if(result)
175         return cc2cu(result);
176       continue;
177     }
178 
179     if(*iptr == '?')
180       left = FALSE;
181 
182     if(urlchar_needs_escaping(*iptr)) {
183       char out[3]={'%'};
184       out[1] = hexdigits[*iptr>>4];
185       out[2] = hexdigits[*iptr & 0xf];
186       result = Curl_dyn_addn(o, out, 3);
187     }
188     else
189       result = Curl_dyn_addn(o, iptr, 1);
190     if(result)
191       return cc2cu(result);
192   }
193 
194   return CURLUE_OK;
195 }
196 
197 /*
198  * Returns the length of the scheme if the given URL is absolute (as opposed
199  * to relative). Stores the scheme in the buffer if TRUE and 'buf' is
200  * non-NULL. The buflen must be larger than MAX_SCHEME_LEN if buf is set.
201  *
202  * If 'guess_scheme' is TRUE, it means the URL might be provided without
203  * scheme.
204  */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen,bool guess_scheme)205 size_t Curl_is_absolute_url(const char *url, char *buf, size_t buflen,
206                             bool guess_scheme)
207 {
208   int i = 0;
209   DEBUGASSERT(!buf || (buflen > MAX_SCHEME_LEN));
210   (void)buflen; /* only used in debug-builds */
211   if(buf)
212     buf[0] = 0; /* always leave a defined value in buf */
213 #ifdef _WIN32
214   if(guess_scheme && STARTS_WITH_DRIVE_PREFIX(url))
215     return 0;
216 #endif
217   if(ISALPHA(url[0]))
218     for(i = 1; i < MAX_SCHEME_LEN; ++i) {
219       char s = url[i];
220       if(s && (ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') )) {
221         /* RFC 3986 3.1 explains:
222            scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
223         */
224       }
225       else {
226         break;
227       }
228     }
229   if(i && (url[i] == ':') && ((url[i + 1] == '/') || !guess_scheme)) {
230     /* If this does not guess scheme, the scheme always ends with the colon so
231        that this also detects data: URLs etc. In guessing mode, data: could
232        be the host name "data" with a specified port number. */
233 
234     /* the length of the scheme is the name part only */
235     size_t len = i;
236     if(buf) {
237       buf[i] = 0;
238       while(i--) {
239         buf[i] = Curl_raw_tolower(url[i]);
240       }
241     }
242     return len;
243   }
244   return 0;
245 }
246 
/*
 * Concatenate a relative URL to a base URL making it absolute.
 * URL-encodes the appended part with urlencode_str().
 *
 * '*newurl' is set to NULL on entry. On success CURLE_OK is returned and
 * '*newurl' points to the resulting string, which must be freed by the
 * caller. On failure the CURLcode from the dynbuf call is returned, or
 * CURLE_TOO_LARGE / CURLE_OUT_OF_MEMORY when the encoding step fails.
 *
 * Note that this function destroys the 'base' string: it is cut off in place
 * by writing null bytes into it.
 */
static CURLcode concat_url(char *base, const char *relurl, char **newurl)
{
  /***
   TRY to append this new path to the old URL
   to the right of the host part. Oh crap, this is doomed to cause
   problems in the future...
  */
  struct dynbuf newest;
  char *protsep;
  char *pathsep;
  bool host_changed = FALSE;
  const char *useurl = relurl;
  CURLcode result = CURLE_OK;
  CURLUcode uc;
  bool skip_slash = FALSE;
  *newurl = NULL;

  /* protsep points to the start of the host name */
  protsep = strstr(base, "//");
  if(!protsep)
    protsep = base;
  else
    protsep += 2; /* pass the slashes */

  if('/' != relurl[0]) {
    /* number of "../" prefixes found in the relative URL */
    int level = 0;

    /* First we need to find out if there's a ?-letter in the URL,
       and cut it and the right-side of that off */
    pathsep = strchr(protsep, '?');
    if(pathsep)
      *pathsep = 0;

    /* we have a relative path to append to the last slash if there's one
       available, or the new URL is just a query string (starts with a '?') or
       a fragment (starts with '#') we append the new one at the end of the
       current URL */
    if((useurl[0] != '?') && (useurl[0] != '#')) {
      /* drop the last path segment of the base */
      pathsep = strrchr(protsep, '/');
      if(pathsep)
        *pathsep = 0;

      /* Check if there's any slash after the host name, and if so, remember
         that position instead. From here on protsep is either the start of
         the base path or NULL when the base has no path left. */
      pathsep = strchr(protsep, '/');
      if(pathsep)
        protsep = pathsep + 1;
      else
        protsep = NULL;

      /* now deal with one "./" or any amount of "../" in the newurl
         and act accordingly */

      if((useurl[0] == '.') && (useurl[1] == '/'))
        useurl += 2; /* just skip the "./" */

      while((useurl[0] == '.') &&
            (useurl[1] == '.') &&
            (useurl[2] == '/')) {
        level++;
        useurl += 3; /* pass the "../" */
      }

      if(protsep) {
        while(level--) {
          /* cut off one more level from the right of the original URL */
          pathsep = strrchr(protsep, '/');
          if(pathsep)
            *pathsep = 0;
          else {
            /* no more levels to remove: empty the path and stop */
            *protsep = 0;
            break;
          }
        }
      }
    }
    else
      /* query or fragment only: glue it directly onto the base */
      skip_slash = TRUE;
  }
  else {
    /* We got a new absolute path for this server */

    if(relurl[1] == '/') {
      /* the new URL starts with //, just keep the protocol part from the
         original one */
      *protsep = 0;
      useurl = &relurl[2]; /* we keep the slashes from the original, so we
                              skip the new ones */
      host_changed = TRUE;
    }
    else {
      /* cut off the original URL from the first slash, or deal with URLs
         without slash */
      pathsep = strchr(protsep, '/');
      if(pathsep) {
        /* When people use badly formatted URLs, such as
           "http://www.example.com?dir=/home/daniel" we must not use the first
           slash, if there's a ?-letter before it! */
        char *sep = strchr(protsep, '?');
        if(sep && (sep < pathsep))
          pathsep = sep;
        *pathsep = 0;
      }
      else {
        /* There was no slash. Now, since we might be operating on a badly
           formatted URL, such as "http://www.example.com?id=2380" which
           doesn't use a slash separator as it is supposed to, we need to check
           for a ?-letter as well! */
        pathsep = strchr(protsep, '?');
        if(pathsep)
          *pathsep = 0;
      }
    }
  }

  Curl_dyn_init(&newest, CURL_MAX_INPUT_LENGTH);

  /* copy over the root url part */
  result = Curl_dyn_add(&newest, base);
  if(result)
    return result;

  /* check if we need to append a slash: not when the new part brings its
     own, when the base path was emptied above or when appending a query or
     fragment */
  if(('/' == useurl[0]) || (protsep && !*protsep) || skip_slash)
    ;
  else {
    result = Curl_dyn_addn(&newest, "/", 1);
    if(result)
      return result;
  }

  /* then append the new piece on the right side. It is only treated as
     relative (encoded from its first byte) when the host was not replaced. */
  uc = urlencode_str(&newest, useurl, strlen(useurl), !host_changed,
                     FALSE);
  if(uc)
    return (uc == CURLUE_TOO_LARGE) ? CURLE_TOO_LARGE : CURLE_OUT_OF_MEMORY;

  *newurl = Curl_dyn_ptr(&newest);
  return CURLE_OK;
}
395 
396 /* scan for byte values <= 31, 127 and sometimes space */
junkscan(const char * url,size_t * urllen,unsigned int flags)397 static CURLUcode junkscan(const char *url, size_t *urllen, unsigned int flags)
398 {
399   static const char badbytes[]={
400     /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
401     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
402     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
403     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
404     0x7f, 0x00 /* null-terminate */
405   };
406   size_t n = strlen(url);
407   size_t nfine;
408 
409   if(n > CURL_MAX_INPUT_LENGTH)
410     /* excessive input length */
411     return CURLUE_MALFORMED_INPUT;
412 
413   nfine = strcspn(url, badbytes);
414   if((nfine != n) ||
415      (!(flags & CURLU_ALLOW_SPACE) && strchr(url, ' ')))
416     return CURLUE_MALFORMED_INPUT;
417 
418   *urllen = n;
419   return CURLUE_OK;
420 }
421 
422 /*
423  * parse_hostname_login()
424  *
425  * Parse the login details (user name, password and options) from the URL and
426  * strip them out of the host name
427  *
428  */
parse_hostname_login(struct Curl_URL * u,const char * login,size_t len,unsigned int flags,size_t * offset)429 static CURLUcode parse_hostname_login(struct Curl_URL *u,
430                                       const char *login,
431                                       size_t len,
432                                       unsigned int flags,
433                                       size_t *offset) /* to the host name */
434 {
435   CURLUcode result = CURLUE_OK;
436   CURLcode ccode;
437   char *userp = NULL;
438   char *passwdp = NULL;
439   char *optionsp = NULL;
440   const struct Curl_handler *h = NULL;
441 
442   /* At this point, we assume all the other special cases have been taken
443    * care of, so the host is at most
444    *
445    *   [user[:password][;options]]@]hostname
446    *
447    * We need somewhere to put the embedded details, so do that first.
448    */
449   char *ptr;
450 
451   DEBUGASSERT(login);
452 
453   *offset = 0;
454   ptr = memchr(login, '@', len);
455   if(!ptr)
456     goto out;
457 
458   /* We will now try to extract the
459    * possible login information in a string like:
460    * ftp://user:password@ftp.my.site:8021/README */
461   ptr++;
462 
463   /* if this is a known scheme, get some details */
464   if(u->scheme)
465     h = Curl_get_scheme_handler(u->scheme);
466 
467   /* We could use the login information in the URL so extract it. Only parse
468      options if the handler says we should. Note that 'h' might be NULL! */
469   ccode = Curl_parse_login_details(login, ptr - login - 1,
470                                    &userp, &passwdp,
471                                    (h && (h->flags & PROTOPT_URLOPTIONS)) ?
472                                    &optionsp:NULL);
473   if(ccode) {
474     result = CURLUE_BAD_LOGIN;
475     goto out;
476   }
477 
478   if(userp) {
479     if(flags & CURLU_DISALLOW_USER) {
480       /* Option DISALLOW_USER is set and url contains username. */
481       result = CURLUE_USER_NOT_ALLOWED;
482       goto out;
483     }
484     free(u->user);
485     u->user = userp;
486   }
487 
488   if(passwdp) {
489     free(u->password);
490     u->password = passwdp;
491   }
492 
493   if(optionsp) {
494     free(u->options);
495     u->options = optionsp;
496   }
497 
498   /* the host name starts at this offset */
499   *offset = ptr - login;
500   return CURLUE_OK;
501 
502 out:
503 
504   free(userp);
505   free(passwdp);
506   free(optionsp);
507   u->user = NULL;
508   u->password = NULL;
509   u->options = NULL;
510 
511   return result;
512 }
513 
Curl_parse_port(struct Curl_URL * u,struct dynbuf * host,bool has_scheme)514 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, struct dynbuf *host,
515                                    bool has_scheme)
516 {
517   char *portptr;
518   char *hostname = Curl_dyn_ptr(host);
519   /*
520    * Find the end of an IPv6 address on the ']' ending bracket.
521    */
522   if(hostname[0] == '[') {
523     portptr = strchr(hostname, ']');
524     if(!portptr)
525       return CURLUE_BAD_IPV6;
526     portptr++;
527     /* this is a RFC2732-style specified IP-address */
528     if(*portptr) {
529       if(*portptr != ':')
530         return CURLUE_BAD_PORT_NUMBER;
531     }
532     else
533       portptr = NULL;
534   }
535   else
536     portptr = strchr(hostname, ':');
537 
538   if(portptr) {
539     char *rest = NULL;
540     unsigned long port;
541     size_t keep = portptr - hostname;
542 
543     /* Browser behavior adaptation. If there's a colon with no digits after,
544        just cut off the name there which makes us ignore the colon and just
545        use the default port. Firefox, Chrome and Safari all do that.
546 
547        Don't do it if the URL has no scheme, to make something that looks like
548        a scheme not work!
549     */
550     Curl_dyn_setlen(host, keep);
551     portptr++;
552     if(!*portptr)
553       return has_scheme ? CURLUE_OK : CURLUE_BAD_PORT_NUMBER;
554 
555     if(!ISDIGIT(*portptr))
556       return CURLUE_BAD_PORT_NUMBER;
557 
558     errno = 0;
559     port = strtoul(portptr, &rest, 10);  /* Port number must be decimal */
560 
561     if(errno || (port > 0xffff) || *rest)
562       return CURLUE_BAD_PORT_NUMBER;
563 
564     u->portnum = (unsigned short) port;
565     /* generate a new port number string to get rid of leading zeroes etc */
566     free(u->port);
567     u->port = aprintf("%ld", port);
568     if(!u->port)
569       return CURLUE_OUT_OF_MEMORY;
570   }
571 
572   return CURLUE_OK;
573 }
574 
/*
 * ipv6_parse()
 *
 * Validate and normalize a bracketed IPv6 host in place. This assumes
 * 'hostname' now starts with [ and that 'hlen' is its length including both
 * brackets. The buffer is modified: a zone id ("%zone" or "%25zone") is cut
 * out and stored in u->zoneid, and the address is replaced with the
 * Curl_inet_ntop() output when that is shorter.
 *
 * Returns CURLUE_OK, CURLUE_BAD_IPV6 or CURLUE_OUT_OF_MEMORY.
 */
static CURLUcode ipv6_parse(struct Curl_URL *u, char *hostname,
                            size_t hlen) /* length of hostname */
{
  size_t len;
  DEBUGASSERT(*hostname == '[');
  if(hlen < 4) /* '[::]' is the shortest possible valid string */
    return CURLUE_BAD_IPV6;
  /* from here on, 'hostname' points past the '[' and 'hlen' is the length
     of what is between the brackets */
  hostname++;
  hlen -= 2;

  /* only valid IPv6 letters are ok */
  len = strspn(hostname, "0123456789abcdefABCDEF:.");

  if(hlen != len) {
    /* something else than address letters is in there; only a zone id is
       accepted */
    hlen = len;
    if(hostname[len] == '%') {
      /* this could now be '%[zone id]' */
      char zoneid[16];
      int i = 0;
      char *h = &hostname[len + 1];
      /* pass '25' if present and is a url encoded percent sign */
      if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
        h += 2;
      /* copy at most 15 bytes of zone id */
      while(*h && (*h != ']') && (i < 15))
        zoneid[i++] = *h++;
      /* an empty zone id, or one not ending at a ']', is not accepted */
      if(!i || (']' != *h))
        return CURLUE_BAD_IPV6;
      zoneid[i] = 0;
      u->zoneid = strdup(zoneid);
      if(!u->zoneid)
        return CURLUE_OUT_OF_MEMORY;
      hostname[len] = ']'; /* insert end bracket */
      hostname[len + 1] = 0; /* terminate the hostname */
    }
    else
      return CURLUE_BAD_IPV6;
    /* hostname is fine */
  }

  /* Check the IPv6 address. */
  {
    char dest[16]; /* fits a binary IPv6 address */
    char norm[MAX_IPADR_LEN];
    hostname[hlen] = 0; /* end the address there */
    if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
      return CURLUE_BAD_IPV6;

    /* check if it can be done shorter */
    if(Curl_inet_ntop(AF_INET6, dest, norm, sizeof(norm)) &&
       (strlen(norm) < hlen)) {
      strcpy(hostname, norm);
      hlen = strlen(norm);
      hostname[hlen + 1] = 0; /* terminate after the bracket added below */
    }
    hostname[hlen] = ']'; /* restore ending bracket */
  }
  return CURLUE_OK;
}
634 
hostname_check(struct Curl_URL * u,char * hostname,size_t hlen)635 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname,
636                                 size_t hlen) /* length of hostname */
637 {
638   size_t len;
639   DEBUGASSERT(hostname);
640 
641   if(!hlen)
642     return CURLUE_NO_HOST;
643   else if(hostname[0] == '[')
644     return ipv6_parse(u, hostname, hlen);
645   else {
646     /* letters from the second string are not ok */
647     len = strcspn(hostname, " \r\n\t/:#?!@{}[]\\$\'\"^`*<>=;,+&()%");
648     if(hlen != len)
649       /* hostname with bad content */
650       return CURLUE_BAD_HOSTNAME;
651   }
652   return CURLUE_OK;
653 }
654 
655 /*
656  * Handle partial IPv4 numerical addresses and different bases, like
657  * '16843009', '0x7f', '0x7f.1' '0177.1.1.1' etc.
658  *
659  * If the given input string is syntactically wrong IPv4 or any part for
660  * example is too big, this function returns HOST_NAME.
661  *
662  * Output the "normalized" version of that input string in plain quad decimal
663  * integers.
664  *
665  * Returns the host type.
666  */
667 
668 #define HOST_ERROR   -1 /* out of memory */
669 #define HOST_BAD     -2 /* bad IPv4 address */
670 
671 #define HOST_NAME    1
672 #define HOST_IPV4    2
673 #define HOST_IPV6    3
674 
ipv4_normalize(struct dynbuf * host)675 static int ipv4_normalize(struct dynbuf *host)
676 {
677   bool done = FALSE;
678   int n = 0;
679   const char *c = Curl_dyn_ptr(host);
680   unsigned long parts[4] = {0, 0, 0, 0};
681   CURLcode result = CURLE_OK;
682 
683   if(*c == '[')
684     return HOST_IPV6;
685 
686   errno = 0; /* for strtoul */
687   while(!done) {
688     char *endp = NULL;
689     unsigned long l;
690     if(!ISDIGIT(*c))
691       /* most importantly this doesn't allow a leading plus or minus */
692       return HOST_NAME;
693     l = strtoul(c, &endp, 0);
694     if(errno)
695       return HOST_NAME;
696 #if SIZEOF_LONG > 4
697     /* a value larger than 32 bits */
698     if(l > UINT_MAX)
699       return HOST_NAME;
700 #endif
701 
702     parts[n] = l;
703     c = endp;
704 
705     switch(*c) {
706     case '.':
707       if(n == 3)
708         return HOST_NAME;
709       n++;
710       c++;
711       break;
712 
713     case '\0':
714       done = TRUE;
715       break;
716 
717     default:
718       return HOST_NAME;
719     }
720   }
721 
722   switch(n) {
723   case 0: /* a -- 32 bits */
724     Curl_dyn_reset(host);
725 
726     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
727                            (unsigned int)(parts[0] >> 24),
728                            (unsigned int)((parts[0] >> 16) & 0xff),
729                            (unsigned int)((parts[0] >> 8) & 0xff),
730                            (unsigned int)(parts[0] & 0xff));
731     break;
732   case 1: /* a.b -- 8.24 bits */
733     if((parts[0] > 0xff) || (parts[1] > 0xffffff))
734       return HOST_NAME;
735     Curl_dyn_reset(host);
736     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
737                            (unsigned int)(parts[0]),
738                            (unsigned int)((parts[1] >> 16) & 0xff),
739                            (unsigned int)((parts[1] >> 8) & 0xff),
740                            (unsigned int)(parts[1] & 0xff));
741     break;
742   case 2: /* a.b.c -- 8.8.16 bits */
743     if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xffff))
744       return HOST_NAME;
745     Curl_dyn_reset(host);
746     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
747                            (unsigned int)(parts[0]),
748                            (unsigned int)(parts[1]),
749                            (unsigned int)((parts[2] >> 8) & 0xff),
750                            (unsigned int)(parts[2] & 0xff));
751     break;
752   case 3: /* a.b.c.d -- 8.8.8.8 bits */
753     if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xff) ||
754        (parts[3] > 0xff))
755       return HOST_NAME;
756     Curl_dyn_reset(host);
757     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
758                            (unsigned int)(parts[0]),
759                            (unsigned int)(parts[1]),
760                            (unsigned int)(parts[2]),
761                            (unsigned int)(parts[3]));
762     break;
763   }
764   if(result)
765     return HOST_ERROR;
766   return HOST_IPV4;
767 }
768 
769 /* if necessary, replace the host content with a URL decoded version */
urldecode_host(struct dynbuf * host)770 static CURLUcode urldecode_host(struct dynbuf *host)
771 {
772   char *per = NULL;
773   const char *hostname = Curl_dyn_ptr(host);
774   per = strchr(hostname, '%');
775   if(!per)
776     /* nothing to decode */
777     return CURLUE_OK;
778   else {
779     /* encoded */
780     size_t dlen;
781     char *decoded;
782     CURLcode result = Curl_urldecode(hostname, 0, &decoded, &dlen,
783                                      REJECT_CTRL);
784     if(result)
785       return CURLUE_BAD_HOSTNAME;
786     Curl_dyn_reset(host);
787     result = Curl_dyn_addn(host, decoded, dlen);
788     free(decoded);
789     if(result)
790       return cc2cu(result);
791   }
792 
793   return CURLUE_OK;
794 }
795 
parse_authority(struct Curl_URL * u,const char * auth,size_t authlen,unsigned int flags,struct dynbuf * host,bool has_scheme)796 static CURLUcode parse_authority(struct Curl_URL *u,
797                                  const char *auth, size_t authlen,
798                                  unsigned int flags,
799                                  struct dynbuf *host,
800                                  bool has_scheme)
801 {
802   size_t offset;
803   CURLUcode uc;
804   CURLcode result;
805 
806   /*
807    * Parse the login details and strip them out of the host name.
808    */
809   uc = parse_hostname_login(u, auth, authlen, flags, &offset);
810   if(uc)
811     goto out;
812 
813   result = Curl_dyn_addn(host, auth + offset, authlen - offset);
814   if(result) {
815     uc = cc2cu(result);
816     goto out;
817   }
818 
819   uc = Curl_parse_port(u, host, has_scheme);
820   if(uc)
821     goto out;
822 
823   if(!Curl_dyn_len(host))
824     return CURLUE_NO_HOST;
825 
826   switch(ipv4_normalize(host)) {
827   case HOST_IPV4:
828     break;
829   case HOST_IPV6:
830     uc = ipv6_parse(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
831     break;
832   case HOST_NAME:
833     uc = urldecode_host(host);
834     if(!uc)
835       uc = hostname_check(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
836     break;
837   case HOST_ERROR:
838     uc = CURLUE_OUT_OF_MEMORY;
839     break;
840   case HOST_BAD:
841   default:
842     uc = CURLUE_BAD_HOSTNAME; /* Bad IPv4 address even */
843     break;
844   }
845 
846 out:
847   return uc;
848 }
849 
850 /* used for HTTP/2 server push */
Curl_url_set_authority(CURLU * u,const char * authority)851 CURLUcode Curl_url_set_authority(CURLU *u, const char *authority)
852 {
853   CURLUcode result;
854   struct dynbuf host;
855 
856   DEBUGASSERT(authority);
857   Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
858 
859   result = parse_authority(u, authority, strlen(authority),
860                            CURLU_DISALLOW_USER, &host, !!u->scheme);
861   if(result)
862     Curl_dyn_free(&host);
863   else {
864     free(u->host);
865     u->host = Curl_dyn_ptr(&host);
866   }
867   return result;
868 }
869 
/*
 * "Remove Dot Segments"
 * https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
 */

/*
 * dedotdotify()
 * @unittest: 1395
 *
 * This function gets a null-terminated path with dot and dotdot sequences
 * passed in and strips them off according to the rules in RFC 3986 section
 * 5.2.4. 'clen' is the length of the input in bytes.
 *
 * The function handles a query part ('?' + stuff) appended but it expects
 * that fragments ('#' + stuff) have already been cut off.
 *
 * RETURNS
 *
 * Zero for success and '*outp' set to an allocated dedotdotified string, or
 * to NULL when the input is shorter than two bytes or holds no dot at all
 * and thus needs no processing. Non-zero when out of memory, with '*outp'
 * set to NULL.
 */
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp);
UNITTEST int dedotdotify(const char *input, size_t clen, char **outp)
{
  char *outptr; /* where the next output byte goes */
  const char *endp = &input[clen];
  char *out;

  *outp = NULL;
  /* the path always starts with a slash, and a slash has no dot */
  if((clen < 2) || !memchr(input, '.', clen))
    return 0;

  /* the output never gets longer than the input */
  out = malloc(clen + 1);
  if(!out)
    return 1; /* out of memory */

  *out = 0; /* null-terminates, for inputs like "./" */
  outptr = out;

  do {
    /* set to FALSE when the input does not start with a dot sequence, in
       which case one segment is copied to the output by step E below */
    bool dotdot = TRUE;
    if(*input == '.') {
      /*  A.  If the input buffer begins with a prefix of "../" or "./", then
          remove that prefix from the input buffer; otherwise, */

      if(!strncmp("./", input, 2)) {
        input += 2;
        clen -= 2;
      }
      else if(!strncmp("../", input, 3)) {
        input += 3;
        clen -= 3;
      }
      /*  D.  if the input buffer consists only of "." or "..", then remove
          that from the input buffer; otherwise, */

      else if(!strcmp(".", input) || !strcmp("..", input) ||
              !strncmp(".?", input, 2) || !strncmp("..?", input, 3)) {
        *out = 0;
        break;
      }
      else
        dotdot = FALSE;
    }
    else if(*input == '/') {
      /*  B.  if the input buffer begins with a prefix of "/./" or "/.", where
          "."  is a complete path segment, then replace that prefix with "/" in
          the input buffer; otherwise, */
      if(!strncmp("/./", input, 3)) {
        input += 2;
        clen -= 2;
      }
      else if(!strcmp("/.", input) || !strncmp("/.?", input, 3)) {
        *outptr++ = '/';
        *outptr = 0;
        break;
      }

      /*  C.  if the input buffer begins with a prefix of "/../" or "/..",
          where ".." is a complete path segment, then replace that prefix with
          "/" in the input buffer and remove the last segment and its
          preceding "/" (if any) from the output buffer; otherwise, */

      else if(!strncmp("/../", input, 4)) {
        input += 3;
        clen -= 3;
        /* remove the last segment from the output buffer */
        while(outptr > out) {
          outptr--;
          if(*outptr == '/')
            break;
        }
        *outptr = 0; /* null-terminate where it stops */
      }
      else if(!strcmp("/..", input) || !strncmp("/..?", input, 4)) {
        /* remove the last segment from the output buffer */
        while(outptr > out) {
          outptr--;
          if(*outptr == '/')
            break;
        }
        *outptr++ = '/';
        *outptr = 0; /* null-terminate where it stops */
        break;
      }
      else
        dotdot = FALSE;
    }
    else
      dotdot = FALSE;

    if(!dotdot) {
      /*  E.  move the first path segment in the input buffer to the end of
          the output buffer, including the initial "/" character (if any) and
          any subsequent characters up to, but not including, the next "/"
          character or the end of the input buffer. A '?' ends the segment
          as well. */

      do {
        *outptr++ = *input++;
        clen--;
      } while(*input && (*input != '/') && (*input != '?'));
      *outptr = 0;
    }

    /* continue until end of path */
  } while(input < endp);

  *outp = out;
  return 0; /* success */
}
1000 
parseurl(const char * url,CURLU * u,unsigned int flags)1001 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
1002 {
1003   const char *path;
1004   size_t pathlen;
1005   char *query = NULL;
1006   char *fragment = NULL;
1007   char schemebuf[MAX_SCHEME_LEN + 1];
1008   size_t schemelen = 0;
1009   size_t urllen;
1010   CURLUcode result = CURLUE_OK;
1011   size_t fraglen = 0;
1012   struct dynbuf host;
1013 
1014   DEBUGASSERT(url);
1015 
1016   Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
1017 
1018   result = junkscan(url, &urllen, flags);
1019   if(result)
1020     goto fail;
1021 
1022   schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf),
1023                                    flags & (CURLU_GUESS_SCHEME|
1024                                             CURLU_DEFAULT_SCHEME));
1025 
1026   /* handle the file: scheme */
1027   if(schemelen && !strcmp(schemebuf, "file")) {
1028     bool uncpath = FALSE;
1029     if(urllen <= 6) {
1030       /* file:/ is not enough to actually be a complete file: URL */
1031       result = CURLUE_BAD_FILE_URL;
1032       goto fail;
1033     }
1034 
1035     /* path has been allocated large enough to hold this */
1036     path = (char *)&url[5];
1037     pathlen = urllen - 5;
1038 
1039     u->scheme = strdup("file");
1040     if(!u->scheme) {
1041       result = CURLUE_OUT_OF_MEMORY;
1042       goto fail;
1043     }
1044 
1045     /* Extra handling URLs with an authority component (i.e. that start with
1046      * "file://")
1047      *
1048      * We allow omitted hostname (e.g. file:/<path>) -- valid according to
1049      * RFC 8089, but not the (current) WHAT-WG URL spec.
1050      */
1051     if(path[0] == '/' && path[1] == '/') {
1052       /* swallow the two slashes */
1053       const char *ptr = &path[2];
1054 
1055       /*
1056        * According to RFC 8089, a file: URL can be reliably dereferenced if:
1057        *
1058        *  o it has no/blank hostname, or
1059        *
1060        *  o the hostname matches "localhost" (case-insensitively), or
1061        *
1062        *  o the hostname is a FQDN that resolves to this machine, or
1063        *
1064        *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
1065        *    Appendix E.3).
1066        *
1067        * For brevity, we only consider URLs with empty, "localhost", or
1068        * "127.0.0.1" hostnames as local, otherwise as an UNC String.
1069        *
1070        * Additionally, there is an exception for URLs with a Windows drive
1071        * letter in the authority (which was accidentally omitted from RFC 8089
1072        * Appendix E, but believe me, it was meant to be there. --MK)
1073        */
1074       if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
1075         /* the URL includes a host name, it must match "localhost" or
1076            "127.0.0.1" to be valid */
1077         if(checkprefix("localhost/", ptr) ||
1078            checkprefix("127.0.0.1/", ptr)) {
1079           ptr += 9; /* now points to the slash after the host */
1080         }
1081         else {
1082 #if defined(_WIN32)
1083           size_t len;
1084 
1085           /* the host name, NetBIOS computer name, can not contain disallowed
1086              chars, and the delimiting slash character must be appended to the
1087              host name */
1088           path = strpbrk(ptr, "/\\:*?\"<>|");
1089           if(!path || *path != '/') {
1090             result = CURLUE_BAD_FILE_URL;
1091             goto fail;
1092           }
1093 
1094           len = path - ptr;
1095           if(len) {
1096             CURLcode code = Curl_dyn_addn(&host, ptr, len);
1097             if(code) {
1098               result = cc2cu(code);
1099               goto fail;
1100             }
1101             uncpath = TRUE;
1102           }
1103 
1104           ptr -= 2; /* now points to the // before the host in UNC */
1105 #else
1106           /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
1107              none */
1108           result = CURLUE_BAD_FILE_URL;
1109           goto fail;
1110 #endif
1111         }
1112       }
1113 
1114       path = ptr;
1115       pathlen = urllen - (ptr - url);
1116     }
1117 
1118     if(!uncpath)
1119       /* no host for file: URLs by default */
1120       Curl_dyn_reset(&host);
1121 
1122 #if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
1123     /* Don't allow Windows drive letters when not in Windows.
1124      * This catches both "file:/c:" and "file:c:" */
1125     if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
1126        STARTS_WITH_URL_DRIVE_PREFIX(path)) {
1127       /* File drive letters are only accepted in MSDOS/Windows */
1128       result = CURLUE_BAD_FILE_URL;
1129       goto fail;
1130     }
1131 #else
1132     /* If the path starts with a slash and a drive letter, ditch the slash */
1133     if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
1134       /* This cannot be done with strcpy, as the memory chunks overlap! */
1135       path++;
1136       pathlen--;
1137     }
1138 #endif
1139 
1140   }
1141   else {
1142     /* clear path */
1143     const char *schemep = NULL;
1144     const char *hostp;
1145     size_t hostlen;
1146 
1147     if(schemelen) {
1148       int i = 0;
1149       const char *p = &url[schemelen + 1];
1150       while((*p == '/') && (i < 4)) {
1151         p++;
1152         i++;
1153       }
1154 
1155       schemep = schemebuf;
1156       if(!Curl_get_scheme_handler(schemep) &&
1157          !(flags & CURLU_NON_SUPPORT_SCHEME)) {
1158         result = CURLUE_UNSUPPORTED_SCHEME;
1159         goto fail;
1160       }
1161 
1162       if((i < 1) || (i > 3)) {
1163         /* less than one or more than three slashes */
1164         result = CURLUE_BAD_SLASHES;
1165         goto fail;
1166       }
1167       hostp = p; /* host name starts here */
1168     }
1169     else {
1170       /* no scheme! */
1171 
1172       if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME))) {
1173         result = CURLUE_BAD_SCHEME;
1174         goto fail;
1175       }
1176       if(flags & CURLU_DEFAULT_SCHEME)
1177         schemep = DEFAULT_SCHEME;
1178 
1179       /*
1180        * The URL was badly formatted, let's try without scheme specified.
1181        */
1182       hostp = url;
1183     }
1184 
1185     if(schemep) {
1186       u->scheme = strdup(schemep);
1187       if(!u->scheme) {
1188         result = CURLUE_OUT_OF_MEMORY;
1189         goto fail;
1190       }
1191     }
1192 
1193     /* find the end of the host name + port number */
1194     hostlen = strcspn(hostp, "/?#");
1195     path = &hostp[hostlen];
1196 
1197     /* this pathlen also contains the query and the fragment */
1198     pathlen = urllen - (path - url);
1199     if(hostlen) {
1200 
1201       result = parse_authority(u, hostp, hostlen, flags, &host, schemelen);
1202       if(result)
1203         goto fail;
1204 
1205       if((flags & CURLU_GUESS_SCHEME) && !schemep) {
1206         const char *hostname = Curl_dyn_ptr(&host);
1207         /* legacy curl-style guess based on host name */
1208         if(checkprefix("ftp.", hostname))
1209           schemep = "ftp";
1210         else if(checkprefix("dict.", hostname))
1211           schemep = "dict";
1212         else if(checkprefix("ldap.", hostname))
1213           schemep = "ldap";
1214         else if(checkprefix("imap.", hostname))
1215           schemep = "imap";
1216         else if(checkprefix("smtp.", hostname))
1217           schemep = "smtp";
1218         else if(checkprefix("pop3.", hostname))
1219           schemep = "pop3";
1220         else
1221           schemep = "http";
1222 
1223         u->scheme = strdup(schemep);
1224         if(!u->scheme) {
1225           result = CURLUE_OUT_OF_MEMORY;
1226           goto fail;
1227         }
1228       }
1229     }
1230     else if(flags & CURLU_NO_AUTHORITY) {
1231       /* allowed to be empty. */
1232       if(Curl_dyn_add(&host, "")) {
1233         result = CURLUE_OUT_OF_MEMORY;
1234         goto fail;
1235       }
1236     }
1237     else {
1238       result = CURLUE_NO_HOST;
1239       goto fail;
1240     }
1241   }
1242 
1243   fragment = strchr(path, '#');
1244   if(fragment) {
1245     fraglen = pathlen - (fragment - path);
1246     u->fragment_present = TRUE;
1247     if(fraglen > 1) {
1248       /* skip the leading '#' in the copy but include the terminating null */
1249       if(flags & CURLU_URLENCODE) {
1250         struct dynbuf enc;
1251         Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1252         result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
1253         if(result)
1254           goto fail;
1255         u->fragment = Curl_dyn_ptr(&enc);
1256       }
1257       else {
1258         u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
1259         if(!u->fragment) {
1260           result = CURLUE_OUT_OF_MEMORY;
1261           goto fail;
1262         }
1263       }
1264     }
1265     /* after this, pathlen still contains the query */
1266     pathlen -= fraglen;
1267   }
1268 
1269   query = memchr(path, '?', pathlen);
1270   if(query) {
1271     size_t qlen = fragment ? (size_t)(fragment - query) :
1272       pathlen - (query - path);
1273     pathlen -= qlen;
1274     u->query_present = TRUE;
1275     if(qlen > 1) {
1276       if(flags & CURLU_URLENCODE) {
1277         struct dynbuf enc;
1278         Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1279         /* skip the leading question mark */
1280         result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
1281         if(result)
1282           goto fail;
1283         u->query = Curl_dyn_ptr(&enc);
1284       }
1285       else {
1286         u->query = Curl_memdup0(query + 1, qlen - 1);
1287         if(!u->query) {
1288           result = CURLUE_OUT_OF_MEMORY;
1289           goto fail;
1290         }
1291       }
1292     }
1293     else {
1294       /* single byte query */
1295       u->query = strdup("");
1296       if(!u->query) {
1297         result = CURLUE_OUT_OF_MEMORY;
1298         goto fail;
1299       }
1300     }
1301   }
1302 
1303   if(pathlen && (flags & CURLU_URLENCODE)) {
1304     struct dynbuf enc;
1305     Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1306     result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
1307     if(result)
1308       goto fail;
1309     pathlen = Curl_dyn_len(&enc);
1310     path = u->path = Curl_dyn_ptr(&enc);
1311   }
1312 
1313   if(pathlen <= 1) {
1314     /* there is no path left or just the slash, unset */
1315     path = NULL;
1316   }
1317   else {
1318     if(!u->path) {
1319       u->path = Curl_memdup0(path, pathlen);
1320       if(!u->path) {
1321         result = CURLUE_OUT_OF_MEMORY;
1322         goto fail;
1323       }
1324       path = u->path;
1325     }
1326     else if(flags & CURLU_URLENCODE)
1327       /* it might have encoded more than just the path so cut it */
1328       u->path[pathlen] = 0;
1329 
1330     if(!(flags & CURLU_PATH_AS_IS)) {
1331       /* remove ../ and ./ sequences according to RFC3986 */
1332       char *dedot;
1333       int err = dedotdotify((char *)path, pathlen, &dedot);
1334       if(err) {
1335         result = CURLUE_OUT_OF_MEMORY;
1336         goto fail;
1337       }
1338       if(dedot) {
1339         free(u->path);
1340         u->path = dedot;
1341       }
1342     }
1343   }
1344 
1345   u->host = Curl_dyn_ptr(&host);
1346 
1347   return result;
1348 fail:
1349   Curl_dyn_free(&host);
1350   free_urlhandle(u);
1351   return result;
1352 }
1353 
1354 /*
1355  * Parse the URL and, if successful, replace everything in the Curl_URL struct.
1356  */
parseurl_and_replace(const char * url,CURLU * u,unsigned int flags)1357 static CURLUcode parseurl_and_replace(const char *url, CURLU *u,
1358                                       unsigned int flags)
1359 {
1360   CURLUcode result;
1361   CURLU tmpurl;
1362   memset(&tmpurl, 0, sizeof(tmpurl));
1363   result = parseurl(url, &tmpurl, flags);
1364   if(!result) {
1365     free_urlhandle(u);
1366     *u = tmpurl;
1367   }
1368   return result;
1369 }
1370 
1371 /*
1372  */
curl_url(void)1373 CURLU *curl_url(void)
1374 {
1375   return calloc(1, sizeof(struct Curl_URL));
1376 }
1377 
/*
 * curl_url_cleanup() releases the parts stored in the handle and then the
 * handle itself. Passing NULL is allowed and does nothing.
 */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
1385 
1386 #define DUP(dest, src, name)                    \
1387   do {                                          \
1388     if(src->name) {                             \
1389       dest->name = strdup(src->name);           \
1390       if(!dest->name)                           \
1391         goto fail;                              \
1392     }                                           \
1393   } while(0)
1394 
curl_url_dup(const CURLU * in)1395 CURLU *curl_url_dup(const CURLU *in)
1396 {
1397   struct Curl_URL *u = calloc(1, sizeof(struct Curl_URL));
1398   if(u) {
1399     DUP(u, in, scheme);
1400     DUP(u, in, user);
1401     DUP(u, in, password);
1402     DUP(u, in, options);
1403     DUP(u, in, host);
1404     DUP(u, in, port);
1405     DUP(u, in, path);
1406     DUP(u, in, query);
1407     DUP(u, in, fragment);
1408     DUP(u, in, zoneid);
1409     u->portnum = in->portnum;
1410     u->fragment_present = in->fragment_present;
1411     u->query_present = in->query_present;
1412   }
1413   return u;
1414 fail:
1415   curl_url_cleanup(u);
1416   return NULL;
1417 }
1418 
curl_url_get(const CURLU * u,CURLUPart what,char ** part,unsigned int flags)1419 CURLUcode curl_url_get(const CURLU *u, CURLUPart what,
1420                        char **part, unsigned int flags)
1421 {
1422   const char *ptr;
1423   CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
1424   char portbuf[7];
1425   bool urldecode = (flags & CURLU_URLDECODE)?1:0;
1426   bool urlencode = (flags & CURLU_URLENCODE)?1:0;
1427   bool punycode = FALSE;
1428   bool depunyfy = FALSE;
1429   bool plusdecode = FALSE;
1430   (void)flags;
1431   if(!u)
1432     return CURLUE_BAD_HANDLE;
1433   if(!part)
1434     return CURLUE_BAD_PARTPOINTER;
1435   *part = NULL;
1436 
1437   switch(what) {
1438   case CURLUPART_SCHEME:
1439     ptr = u->scheme;
1440     ifmissing = CURLUE_NO_SCHEME;
1441     urldecode = FALSE; /* never for schemes */
1442     break;
1443   case CURLUPART_USER:
1444     ptr = u->user;
1445     ifmissing = CURLUE_NO_USER;
1446     break;
1447   case CURLUPART_PASSWORD:
1448     ptr = u->password;
1449     ifmissing = CURLUE_NO_PASSWORD;
1450     break;
1451   case CURLUPART_OPTIONS:
1452     ptr = u->options;
1453     ifmissing = CURLUE_NO_OPTIONS;
1454     break;
1455   case CURLUPART_HOST:
1456     ptr = u->host;
1457     ifmissing = CURLUE_NO_HOST;
1458     punycode = (flags & CURLU_PUNYCODE)?1:0;
1459     depunyfy = (flags & CURLU_PUNY2IDN)?1:0;
1460     break;
1461   case CURLUPART_ZONEID:
1462     ptr = u->zoneid;
1463     ifmissing = CURLUE_NO_ZONEID;
1464     break;
1465   case CURLUPART_PORT:
1466     ptr = u->port;
1467     ifmissing = CURLUE_NO_PORT;
1468     urldecode = FALSE; /* never for port */
1469     if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
1470       /* there's no stored port number, but asked to deliver
1471          a default one for the scheme */
1472       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1473       if(h) {
1474         msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1475         ptr = portbuf;
1476       }
1477     }
1478     else if(ptr && u->scheme) {
1479       /* there is a stored port number, but ask to inhibit if
1480          it matches the default one for the scheme */
1481       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1482       if(h && (h->defport == u->portnum) &&
1483          (flags & CURLU_NO_DEFAULT_PORT))
1484         ptr = NULL;
1485     }
1486     break;
1487   case CURLUPART_PATH:
1488     ptr = u->path;
1489     if(!ptr)
1490       ptr = "/";
1491     break;
1492   case CURLUPART_QUERY:
1493     ptr = u->query;
1494     ifmissing = CURLUE_NO_QUERY;
1495     plusdecode = urldecode;
1496     if(ptr && !ptr[0] && !(flags & CURLU_GET_EMPTY))
1497       /* there was a blank query and the user do not ask for it */
1498       ptr = NULL;
1499     break;
1500   case CURLUPART_FRAGMENT:
1501     ptr = u->fragment;
1502     ifmissing = CURLUE_NO_FRAGMENT;
1503     if(!ptr && u->fragment_present && flags & CURLU_GET_EMPTY)
1504       /* there was a blank fragment and the user asks for it */
1505       ptr = "";
1506     break;
1507   case CURLUPART_URL: {
1508     char *url;
1509     char *scheme;
1510     char *options = u->options;
1511     char *port = u->port;
1512     char *allochost = NULL;
1513     bool show_fragment =
1514       u->fragment || (u->fragment_present && flags & CURLU_GET_EMPTY);
1515     bool show_query =
1516       (u->query && u->query[0]) ||
1517       (u->query_present && flags & CURLU_GET_EMPTY);
1518     punycode = (flags & CURLU_PUNYCODE)?1:0;
1519     depunyfy = (flags & CURLU_PUNY2IDN)?1:0;
1520     if(u->scheme && strcasecompare("file", u->scheme)) {
1521       url = aprintf("file://%s%s%s",
1522                     u->path,
1523                     show_fragment ? "#": "",
1524                     u->fragment ? u->fragment : "");
1525     }
1526     else if(!u->host)
1527       return CURLUE_NO_HOST;
1528     else {
1529       const struct Curl_handler *h = NULL;
1530       if(u->scheme)
1531         scheme = u->scheme;
1532       else if(flags & CURLU_DEFAULT_SCHEME)
1533         scheme = (char *) DEFAULT_SCHEME;
1534       else
1535         return CURLUE_NO_SCHEME;
1536 
1537       h = Curl_get_scheme_handler(scheme);
1538       if(!port && (flags & CURLU_DEFAULT_PORT)) {
1539         /* there's no stored port number, but asked to deliver
1540            a default one for the scheme */
1541         if(h) {
1542           msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1543           port = portbuf;
1544         }
1545       }
1546       else if(port) {
1547         /* there is a stored port number, but asked to inhibit if it matches
1548            the default one for the scheme */
1549         if(h && (h->defport == u->portnum) &&
1550            (flags & CURLU_NO_DEFAULT_PORT))
1551           port = NULL;
1552       }
1553 
1554       if(h && !(h->flags & PROTOPT_URLOPTIONS))
1555         options = NULL;
1556 
1557       if(u->host[0] == '[') {
1558         if(u->zoneid) {
1559           /* make it '[ host %25 zoneid ]' */
1560           struct dynbuf enc;
1561           size_t hostlen = strlen(u->host);
1562           Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1563           if(Curl_dyn_addf(&enc, "%.*s%%25%s]", (int)hostlen - 1, u->host,
1564                            u->zoneid))
1565             return CURLUE_OUT_OF_MEMORY;
1566           allochost = Curl_dyn_ptr(&enc);
1567         }
1568       }
1569       else if(urlencode) {
1570         allochost = curl_easy_escape(NULL, u->host, 0);
1571         if(!allochost)
1572           return CURLUE_OUT_OF_MEMORY;
1573       }
1574       else if(punycode) {
1575         if(!Curl_is_ASCII_name(u->host)) {
1576 #ifndef USE_IDN
1577           return CURLUE_LACKS_IDN;
1578 #else
1579           CURLcode result = Curl_idn_decode(u->host, &allochost);
1580           if(result)
1581             return (result == CURLE_OUT_OF_MEMORY) ?
1582               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1583 #endif
1584         }
1585       }
1586       else if(depunyfy) {
1587         if(Curl_is_ASCII_name(u->host) && !strncmp("xn--", u->host, 4)) {
1588 #ifndef USE_IDN
1589           return CURLUE_LACKS_IDN;
1590 #else
1591           CURLcode result = Curl_idn_encode(u->host, &allochost);
1592           if(result)
1593             /* this is the most likely error */
1594             return (result == CURLE_OUT_OF_MEMORY) ?
1595               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1596 #endif
1597         }
1598       }
1599 
1600       url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1601                     scheme,
1602                     u->user ? u->user : "",
1603                     u->password ? ":": "",
1604                     u->password ? u->password : "",
1605                     options ? ";" : "",
1606                     options ? options : "",
1607                     (u->user || u->password || options) ? "@": "",
1608                     allochost ? allochost : u->host,
1609                     port ? ":": "",
1610                     port ? port : "",
1611                     u->path ? u->path : "/",
1612                     show_query ? "?": "",
1613                     u->query ? u->query : "",
1614                     show_fragment ? "#": "",
1615                     u->fragment? u->fragment : "");
1616       free(allochost);
1617     }
1618     if(!url)
1619       return CURLUE_OUT_OF_MEMORY;
1620     *part = url;
1621     return CURLUE_OK;
1622   }
1623   default:
1624     ptr = NULL;
1625     break;
1626   }
1627   if(ptr) {
1628     size_t partlen = strlen(ptr);
1629     size_t i = 0;
1630     *part = Curl_memdup0(ptr, partlen);
1631     if(!*part)
1632       return CURLUE_OUT_OF_MEMORY;
1633     if(plusdecode) {
1634       /* convert + to space */
1635       char *plus = *part;
1636       for(i = 0; i < partlen; ++plus, i++) {
1637         if(*plus == '+')
1638           *plus = ' ';
1639       }
1640     }
1641     if(urldecode) {
1642       char *decoded;
1643       size_t dlen;
1644       /* this unconditional rejection of control bytes is documented
1645          API behavior */
1646       CURLcode res = Curl_urldecode(*part, 0, &decoded, &dlen, REJECT_CTRL);
1647       free(*part);
1648       if(res) {
1649         *part = NULL;
1650         return CURLUE_URLDECODE;
1651       }
1652       *part = decoded;
1653       partlen = dlen;
1654     }
1655     if(urlencode) {
1656       struct dynbuf enc;
1657       CURLUcode uc;
1658       Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1659       uc = urlencode_str(&enc, *part, partlen, TRUE, what == CURLUPART_QUERY);
1660       if(uc)
1661         return uc;
1662       free(*part);
1663       *part = Curl_dyn_ptr(&enc);
1664     }
1665     else if(punycode) {
1666       if(!Curl_is_ASCII_name(u->host)) {
1667 #ifndef USE_IDN
1668         return CURLUE_LACKS_IDN;
1669 #else
1670         char *allochost;
1671         CURLcode result = Curl_idn_decode(*part, &allochost);
1672         if(result)
1673           return (result == CURLE_OUT_OF_MEMORY) ?
1674             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1675         free(*part);
1676         *part = allochost;
1677 #endif
1678       }
1679     }
1680     else if(depunyfy) {
1681       if(Curl_is_ASCII_name(u->host)  && !strncmp("xn--", u->host, 4)) {
1682 #ifndef USE_IDN
1683         return CURLUE_LACKS_IDN;
1684 #else
1685         char *allochost;
1686         CURLcode result = Curl_idn_encode(*part, &allochost);
1687         if(result)
1688           return (result == CURLE_OUT_OF_MEMORY) ?
1689             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1690         free(*part);
1691         *part = allochost;
1692 #endif
1693       }
1694     }
1695 
1696     return CURLUE_OK;
1697   }
1698   else
1699     return ifmissing;
1700 }
1701 
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1702 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1703                        const char *part, unsigned int flags)
1704 {
1705   char **storep = NULL;
1706   bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
1707   bool plusencode = FALSE;
1708   bool urlskipslash = FALSE;
1709   bool leadingslash = FALSE;
1710   bool appendquery = FALSE;
1711   bool equalsencode = FALSE;
1712   size_t nalloc;
1713 
1714   if(!u)
1715     return CURLUE_BAD_HANDLE;
1716   if(!part) {
1717     /* setting a part to NULL clears it */
1718     switch(what) {
1719     case CURLUPART_URL:
1720       break;
1721     case CURLUPART_SCHEME:
1722       storep = &u->scheme;
1723       break;
1724     case CURLUPART_USER:
1725       storep = &u->user;
1726       break;
1727     case CURLUPART_PASSWORD:
1728       storep = &u->password;
1729       break;
1730     case CURLUPART_OPTIONS:
1731       storep = &u->options;
1732       break;
1733     case CURLUPART_HOST:
1734       storep = &u->host;
1735       break;
1736     case CURLUPART_ZONEID:
1737       storep = &u->zoneid;
1738       break;
1739     case CURLUPART_PORT:
1740       u->portnum = 0;
1741       storep = &u->port;
1742       break;
1743     case CURLUPART_PATH:
1744       storep = &u->path;
1745       break;
1746     case CURLUPART_QUERY:
1747       storep = &u->query;
1748       u->query_present = FALSE;
1749       break;
1750     case CURLUPART_FRAGMENT:
1751       storep = &u->fragment;
1752       u->fragment_present = FALSE;
1753       break;
1754     default:
1755       return CURLUE_UNKNOWN_PART;
1756     }
1757     if(storep && *storep) {
1758       Curl_safefree(*storep);
1759     }
1760     else if(!storep) {
1761       free_urlhandle(u);
1762       memset(u, 0, sizeof(struct Curl_URL));
1763     }
1764     return CURLUE_OK;
1765   }
1766 
1767   nalloc = strlen(part);
1768   if(nalloc > CURL_MAX_INPUT_LENGTH)
1769     /* excessive input length */
1770     return CURLUE_MALFORMED_INPUT;
1771 
1772   switch(what) {
1773   case CURLUPART_SCHEME: {
1774     size_t plen = strlen(part);
1775     const char *s = part;
1776     if((plen > MAX_SCHEME_LEN) || (plen < 1))
1777       /* too long or too short */
1778       return CURLUE_BAD_SCHEME;
1779    /* verify that it is a fine scheme */
1780     if(!(flags & CURLU_NON_SUPPORT_SCHEME) && !Curl_get_scheme_handler(part))
1781       return CURLUE_UNSUPPORTED_SCHEME;
1782     storep = &u->scheme;
1783     urlencode = FALSE; /* never */
1784     if(ISALPHA(*s)) {
1785       /* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
1786       while(--plen) {
1787         if(ISALNUM(*s) || (*s == '+') || (*s == '-') || (*s == '.'))
1788           s++; /* fine */
1789         else
1790           return CURLUE_BAD_SCHEME;
1791       }
1792     }
1793     else
1794       return CURLUE_BAD_SCHEME;
1795     break;
1796   }
1797   case CURLUPART_USER:
1798     storep = &u->user;
1799     break;
1800   case CURLUPART_PASSWORD:
1801     storep = &u->password;
1802     break;
1803   case CURLUPART_OPTIONS:
1804     storep = &u->options;
1805     break;
1806   case CURLUPART_HOST:
1807     storep = &u->host;
1808     Curl_safefree(u->zoneid);
1809     break;
1810   case CURLUPART_ZONEID:
1811     storep = &u->zoneid;
1812     break;
1813   case CURLUPART_PORT:
1814     if(!ISDIGIT(part[0]))
1815       /* not a number */
1816       return CURLUE_BAD_PORT_NUMBER;
1817     else {
1818       char *tmp;
1819       char *endp;
1820       unsigned long port;
1821       errno = 0;
1822       port = strtoul(part, &endp, 10);  /* must be decimal */
1823       if(errno || (port > 0xffff) || *endp)
1824         /* weirdly provided number, not good! */
1825         return CURLUE_BAD_PORT_NUMBER;
1826       tmp = strdup(part);
1827       if(!tmp)
1828         return CURLUE_OUT_OF_MEMORY;
1829       free(u->port);
1830       u->port = tmp;
1831       u->portnum = (unsigned short)port;
1832       return CURLUE_OK;
1833     }
1834   case CURLUPART_PATH:
1835     urlskipslash = TRUE;
1836     leadingslash = TRUE; /* enforce */
1837     storep = &u->path;
1838     break;
1839   case CURLUPART_QUERY:
1840     plusencode = urlencode;
1841     appendquery = (flags & CURLU_APPENDQUERY)?1:0;
1842     equalsencode = appendquery;
1843     storep = &u->query;
1844     u->query_present = TRUE;
1845     break;
1846   case CURLUPART_FRAGMENT:
1847     storep = &u->fragment;
1848     u->fragment_present = TRUE;
1849     break;
1850   case CURLUPART_URL: {
1851     /*
1852      * Allow a new URL to replace the existing (if any) contents.
1853      *
1854      * If the existing contents is enough for a URL, allow a relative URL to
1855      * replace it.
1856      */
1857     CURLcode result;
1858     CURLUcode uc;
1859     char *oldurl;
1860     char *redired_url;
1861 
1862     if(!nalloc)
1863       /* a blank URL is not a valid URL */
1864       return CURLUE_MALFORMED_INPUT;
1865 
1866     /* if the new thing is absolute or the old one is not
1867      * (we could not get an absolute url in 'oldurl'),
1868      * then replace the existing with the new. */
1869     if(Curl_is_absolute_url(part, NULL, 0,
1870                             flags & (CURLU_GUESS_SCHEME|
1871                                      CURLU_DEFAULT_SCHEME))
1872        || curl_url_get(u, CURLUPART_URL, &oldurl, flags)) {
1873       return parseurl_and_replace(part, u, flags);
1874     }
1875 
1876     /* apply the relative part to create a new URL
1877      * and replace the existing one with it. */
1878     result = concat_url(oldurl, part, &redired_url);
1879     free(oldurl);
1880     if(result)
1881       return cc2cu(result);
1882 
1883     uc = parseurl_and_replace(redired_url, u, flags);
1884     free(redired_url);
1885     return uc;
1886   }
1887   default:
1888     return CURLUE_UNKNOWN_PART;
1889   }
1890   DEBUGASSERT(storep);
1891   {
1892     const char *newp;
1893     struct dynbuf enc;
1894     Curl_dyn_init(&enc, nalloc * 3 + 1 + leadingslash);
1895 
1896     if(leadingslash && (part[0] != '/')) {
1897       CURLcode result = Curl_dyn_addn(&enc, "/", 1);
1898       if(result)
1899         return cc2cu(result);
1900     }
1901     if(urlencode) {
1902       const unsigned char *i;
1903 
1904       for(i = (const unsigned char *)part; *i; i++) {
1905         CURLcode result;
1906         if((*i == ' ') && plusencode) {
1907           result = Curl_dyn_addn(&enc, "+", 1);
1908           if(result)
1909             return CURLUE_OUT_OF_MEMORY;
1910         }
1911         else if(ISUNRESERVED(*i) ||
1912                 ((*i == '/') && urlskipslash) ||
1913                 ((*i == '=') && equalsencode)) {
1914           if((*i == '=') && equalsencode)
1915             /* only skip the first equals sign */
1916             equalsencode = FALSE;
1917           result = Curl_dyn_addn(&enc, i, 1);
1918           if(result)
1919             return cc2cu(result);
1920         }
1921         else {
1922           char out[3]={'%'};
1923           out[1] = hexdigits[*i>>4];
1924           out[2] = hexdigits[*i & 0xf];
1925           result = Curl_dyn_addn(&enc, out, 3);
1926           if(result)
1927             return cc2cu(result);
1928         }
1929       }
1930     }
1931     else {
1932       char *p;
1933       CURLcode result = Curl_dyn_add(&enc, part);
1934       if(result)
1935         return cc2cu(result);
1936       p = Curl_dyn_ptr(&enc);
1937       while(*p) {
1938         /* make sure percent encoded are lower case */
1939         if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1940            (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1941           p[1] = Curl_raw_tolower(p[1]);
1942           p[2] = Curl_raw_tolower(p[2]);
1943           p += 3;
1944         }
1945         else
1946           p++;
1947       }
1948     }
1949     newp = Curl_dyn_ptr(&enc);
1950 
1951     if(appendquery && newp) {
1952       /* Append the 'newp' string onto the old query. Add a '&' separator if
1953          none is present at the end of the existing query already */
1954 
1955       size_t querylen = u->query ? strlen(u->query) : 0;
1956       bool addamperand = querylen && (u->query[querylen -1] != '&');
1957       if(querylen) {
1958         struct dynbuf qbuf;
1959         Curl_dyn_init(&qbuf, CURL_MAX_INPUT_LENGTH);
1960 
1961         if(Curl_dyn_addn(&qbuf, u->query, querylen)) /* add original query */
1962           goto nomem;
1963 
1964         if(addamperand) {
1965           if(Curl_dyn_addn(&qbuf, "&", 1))
1966             goto nomem;
1967         }
1968         if(Curl_dyn_add(&qbuf, newp))
1969           goto nomem;
1970         Curl_dyn_free(&enc);
1971         free(*storep);
1972         *storep = Curl_dyn_ptr(&qbuf);
1973         return CURLUE_OK;
1974 nomem:
1975         Curl_dyn_free(&enc);
1976         return CURLUE_OUT_OF_MEMORY;
1977       }
1978     }
1979 
1980     else if(what == CURLUPART_HOST) {
1981       size_t n = Curl_dyn_len(&enc);
1982       if(!n && (flags & CURLU_NO_AUTHORITY)) {
1983         /* Skip hostname check, it's allowed to be empty. */
1984       }
1985       else {
1986         if(!n || hostname_check(u, (char *)newp, n)) {
1987           Curl_dyn_free(&enc);
1988           return CURLUE_BAD_HOSTNAME;
1989         }
1990       }
1991     }
1992 
1993     free(*storep);
1994     *storep = (char *)newp;
1995   }
1996   return CURLUE_OK;
1997 }
1998