xref: /curl/lib/urlapi.c (revision fbf5d507)
1 /***************************************************************************
2  *                                  _   _ ____  _
3  *  Project                     ___| | | |  _ \| |
4  *                             / __| | | | |_) | |
5  *                            | (__| |_| |  _ <| |___
6  *                             \___|\___/|_| \_\_____|
7  *
8  * Copyright (C) Daniel Stenberg, <daniel@haxx.se>, et al.
9  *
10  * This software is licensed as described in the file COPYING, which
11  * you should have received as part of this distribution. The terms
12  * are also available at https://curl.se/docs/copyright.html.
13  *
14  * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15  * copies of the Software, and permit persons to whom the Software is
16  * furnished to do so, under the terms of the COPYING file.
17  *
18  * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19  * KIND, either express or implied.
20  *
21  * SPDX-License-Identifier: curl
22  *
23  ***************************************************************************/
24 
25 #include "curl_setup.h"
26 
27 #include "urldata.h"
28 #include "urlapi-int.h"
29 #include "strcase.h"
30 #include "url.h"
31 #include "escape.h"
32 #include "curl_ctype.h"
33 #include "inet_pton.h"
34 #include "inet_ntop.h"
35 #include "strdup.h"
36 #include "idn.h"
37 
38 /* The last 3 #include files should be in this order */
39 #include "curl_printf.h"
40 #include "curl_memory.h"
41 #include "memdebug.h"
42 
/* MS-DOS/Windows style drive prefix, e.g. "c:" in "c:foo".
 *
 * The argument is parenthesized so that any pointer expression (such as
 * "ptr + 1") can be passed. It is evaluated more than once, so it must be
 * free of side effects. */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  (((('a' <= (str)[0]) && ((str)[0] <= 'z')) || \
    (('A' <= (str)[0]) && ((str)[0] <= 'Z'))) && \
   ((str)[1] == ':'))
48 
/* MS-DOS/Windows style drive prefix the way it may show up in a URL: an
 * ASCII letter, then ':' or '|', then a slash, a backslash or the end of
 * the string. The argument is evaluated several times. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  (((((str)[0] >= 'a') && ((str)[0] <= 'z')) || \
    (((str)[0] >= 'A') && ((str)[0] <= 'Z'))) && \
   (((str)[1] == ':') || ((str)[1] == '|')) && \
   (((str)[2] == '/') || ((str)[2] == '\\') || ((str)[2] == '\0')))
56 
/* Schemes are never URL encoded. This is the upper limit for the length of
   a scheme name accepted by the parser. */
#define MAX_SCHEME_LEN 40

/*
 * IPv6 address literals are parsed even when USE_IPV6 is not defined. Such a
 * build may lack AF_INET6, so give it _some_ value here, local to this file.
 */
#ifndef USE_IPV6
#ifndef AF_INET6
#define AF_INET6 (AF_INET + 1)
#endif
#endif
68 
69 /* Internal representation of CURLU. Point to URL-encoded strings. */
70 struct Curl_URL {
71   char *scheme;
72   char *user;
73   char *password;
74   char *options; /* IMAP only? */
75   char *host;
76   char *zoneid; /* for numerical IPv6 addresses */
77   char *port;
78   char *path;
79   char *query;
80   char *fragment;
81   unsigned short portnum; /* the numerical version (if 'port' is set) */
82   BIT(query_present);    /* to support blank */
83   BIT(fragment_present); /* to support blank */
84   BIT(guessed_scheme);   /* when a URL without scheme is parsed */
85 };
86 
87 #define DEFAULT_SCHEME "https"
88 
free_urlhandle(struct Curl_URL * u)89 static void free_urlhandle(struct Curl_URL *u)
90 {
91   free(u->scheme);
92   free(u->user);
93   free(u->password);
94   free(u->options);
95   free(u->host);
96   free(u->zoneid);
97   free(u->port);
98   free(u->path);
99   free(u->query);
100   free(u->fragment);
101 }
102 
/*
 * Return a pointer to the first '/' or '?' found after the start of the
 * hostname, which covers cases like http://www.example.com?id=2380. When
 * neither is present, the returned pointer is the end of the string.
 */
static const char *find_host_sep(const char *url)
{
  /* the hostname starts right after the first "//", or at the start of the
     string when there is no "//" */
  const char *host = strstr(url, "//");
  host = host ? host + 2 : url;

  return host + strcspn(host, "/?");
}
130 
/* map a CURLcode to a CURLUcode: everything that is not "too large" is
   reported as out of memory */
#define cc2cu(x) (((x) != CURLE_TOO_LARGE) ? CURLUE_OUT_OF_MEMORY :  \
                  CURLUE_TOO_LARGE)

/*
 * A character in a URL must be escaped when none of ISCNTRL(), ISSPACE()
 * and ISGRAPH() is true for it.
 */
#define urlchar_needs_escaping(c) \
  (!ISCNTRL(c) && !ISSPACE(c) && !ISGRAPH(c))

/* digits used when writing a percent-encoded byte */
static const char hexdigits[] = "0123456789abcdef";
140 /* urlencode_str() writes data into an output dynbuf and URL-encodes the
141  * spaces in the source URL accordingly.
142  *
143  * URL encoding should be skipped for hostnames, otherwise IDN resolution
144  * will fail.
145  */
urlencode_str(struct dynbuf * o,const char * url,size_t len,bool relative,bool query)146 static CURLUcode urlencode_str(struct dynbuf *o, const char *url,
147                                size_t len, bool relative,
148                                bool query)
149 {
150   /* we must add this with whitespace-replacing */
151   bool left = !query;
152   const unsigned char *iptr;
153   const unsigned char *host_sep = (const unsigned char *) url;
154   CURLcode result;
155 
156   if(!relative)
157     host_sep = (const unsigned char *) find_host_sep(url);
158 
159   for(iptr = (unsigned char *)url;    /* read from here */
160       len; iptr++, len--) {
161 
162     if(iptr < host_sep) {
163       result = Curl_dyn_addn(o, iptr, 1);
164       if(result)
165         return cc2cu(result);
166       continue;
167     }
168 
169     if(*iptr == ' ') {
170       if(left)
171         result = Curl_dyn_addn(o, "%20", 3);
172       else
173         result = Curl_dyn_addn(o, "+", 1);
174       if(result)
175         return cc2cu(result);
176       continue;
177     }
178 
179     if(*iptr == '?')
180       left = FALSE;
181 
182     if(urlchar_needs_escaping(*iptr)) {
183       char out[3]={'%'};
184       out[1] = hexdigits[*iptr >> 4];
185       out[2] = hexdigits[*iptr & 0xf];
186       result = Curl_dyn_addn(o, out, 3);
187     }
188     else
189       result = Curl_dyn_addn(o, iptr, 1);
190     if(result)
191       return cc2cu(result);
192   }
193 
194   return CURLUE_OK;
195 }
196 
197 /*
198  * Returns the length of the scheme if the given URL is absolute (as opposed
199  * to relative). Stores the scheme in the buffer if TRUE and 'buf' is
200  * non-NULL. The buflen must be larger than MAX_SCHEME_LEN if buf is set.
201  *
202  * If 'guess_scheme' is TRUE, it means the URL might be provided without
203  * scheme.
204  */
Curl_is_absolute_url(const char * url,char * buf,size_t buflen,bool guess_scheme)205 size_t Curl_is_absolute_url(const char *url, char *buf, size_t buflen,
206                             bool guess_scheme)
207 {
208   size_t i = 0;
209   DEBUGASSERT(!buf || (buflen > MAX_SCHEME_LEN));
210   (void)buflen; /* only used in debug-builds */
211   if(buf)
212     buf[0] = 0; /* always leave a defined value in buf */
213 #ifdef _WIN32
214   if(guess_scheme && STARTS_WITH_DRIVE_PREFIX(url))
215     return 0;
216 #endif
217   if(ISALPHA(url[0]))
218     for(i = 1; i < MAX_SCHEME_LEN; ++i) {
219       char s = url[i];
220       if(s && (ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') )) {
221         /* RFC 3986 3.1 explains:
222            scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
223         */
224       }
225       else {
226         break;
227       }
228     }
229   if(i && (url[i] == ':') && ((url[i + 1] == '/') || !guess_scheme)) {
230     /* If this does not guess scheme, the scheme always ends with the colon so
231        that this also detects data: URLs etc. In guessing mode, data: could
232        be the hostname "data" with a specified port number. */
233 
234     /* the length of the scheme is the name part only */
235     size_t len = i;
236     if(buf) {
237       Curl_strntolower(buf, url, i);
238       buf[i] = 0;
239     }
240     return len;
241   }
242   return 0;
243 }
244 
245 /*
246  * Concatenate a relative URL to a base URL making it absolute.
247  * URL-encodes any spaces.
248  * The returned pointer must be freed by the caller unless NULL
249  * (returns NULL on out of memory).
250  *
251  * Note that this function destroys the 'base' string.
252  */
concat_url(char * base,const char * relurl,char ** newurl)253 static CURLcode concat_url(char *base, const char *relurl, char **newurl)
254 {
255   /***
256    TRY to append this new path to the old URL
257    to the right of the host part. Oh crap, this is doomed to cause
258    problems in the future...
259   */
260   struct dynbuf newest;
261   char *protsep;
262   char *pathsep;
263   bool host_changed = FALSE;
264   const char *useurl = relurl;
265   CURLcode result = CURLE_OK;
266   CURLUcode uc;
267   bool skip_slash = FALSE;
268   *newurl = NULL;
269 
270   /* protsep points to the start of the hostname */
271   protsep = strstr(base, "//");
272   if(!protsep)
273     protsep = base;
274   else
275     protsep += 2; /* pass the slashes */
276 
277   if('/' != relurl[0]) {
278     int level = 0;
279 
280     /* First we need to find out if there is a ?-letter in the URL,
281        and cut it and the right-side of that off */
282     pathsep = strchr(protsep, '?');
283     if(pathsep)
284       *pathsep = 0;
285 
286     /* we have a relative path to append to the last slash if there is one
287        available, or the new URL is just a query string (starts with a '?') or
288        a fragment (starts with '#') we append the new one at the end of the
289        current URL */
290     if((useurl[0] != '?') && (useurl[0] != '#')) {
291       pathsep = strrchr(protsep, '/');
292       if(pathsep)
293         *pathsep = 0;
294 
295       /* Check if there is any slash after the hostname, and if so, remember
296          that position instead */
297       pathsep = strchr(protsep, '/');
298       if(pathsep)
299         protsep = pathsep + 1;
300       else
301         protsep = NULL;
302 
303       /* now deal with one "./" or any amount of "../" in the newurl
304          and act accordingly */
305 
306       if((useurl[0] == '.') && (useurl[1] == '/'))
307         useurl += 2; /* just skip the "./" */
308 
309       while((useurl[0] == '.') &&
310             (useurl[1] == '.') &&
311             (useurl[2] == '/')) {
312         level++;
313         useurl += 3; /* pass the "../" */
314       }
315 
316       if(protsep) {
317         while(level--) {
318           /* cut off one more level from the right of the original URL */
319           pathsep = strrchr(protsep, '/');
320           if(pathsep)
321             *pathsep = 0;
322           else {
323             *protsep = 0;
324             break;
325           }
326         }
327       }
328     }
329     else
330       skip_slash = TRUE;
331   }
332   else {
333     /* We got a new absolute path for this server */
334 
335     if(relurl[1] == '/') {
336       /* the new URL starts with //, just keep the protocol part from the
337          original one */
338       *protsep = 0;
339       useurl = &relurl[2]; /* we keep the slashes from the original, so we
340                               skip the new ones */
341       host_changed = TRUE;
342     }
343     else {
344       /* cut off the original URL from the first slash, or deal with URLs
345          without slash */
346       pathsep = strchr(protsep, '/');
347       if(pathsep) {
348         /* When people use badly formatted URLs, such as
349            "http://www.example.com?dir=/home/daniel" we must not use the first
350            slash, if there is a ?-letter before it! */
351         char *sep = strchr(protsep, '?');
352         if(sep && (sep < pathsep))
353           pathsep = sep;
354         *pathsep = 0;
355       }
356       else {
357         /* There was no slash. Now, since we might be operating on a badly
358            formatted URL, such as "http://www.example.com?id=2380" which does
359            not use a slash separator as it is supposed to, we need to check
360            for a ?-letter as well! */
361         pathsep = strchr(protsep, '?');
362         if(pathsep)
363           *pathsep = 0;
364       }
365     }
366   }
367 
368   Curl_dyn_init(&newest, CURL_MAX_INPUT_LENGTH);
369 
370   /* copy over the root URL part */
371   result = Curl_dyn_add(&newest, base);
372   if(result)
373     return result;
374 
375   /* check if we need to append a slash */
376   if(('/' == useurl[0]) || (protsep && !*protsep) || skip_slash)
377     ;
378   else {
379     result = Curl_dyn_addn(&newest, "/", 1);
380     if(result)
381       return result;
382   }
383 
384   /* then append the new piece on the right side */
385   uc = urlencode_str(&newest, useurl, strlen(useurl), !host_changed,
386                      FALSE);
387   if(uc)
388     return (uc == CURLUE_TOO_LARGE) ? CURLE_TOO_LARGE : CURLE_OUT_OF_MEMORY;
389 
390   *newurl = Curl_dyn_ptr(&newest);
391   return CURLE_OK;
392 }
393 
394 /* scan for byte values <= 31, 127 and sometimes space */
junkscan(const char * url,size_t * urllen,unsigned int flags)395 static CURLUcode junkscan(const char *url, size_t *urllen, unsigned int flags)
396 {
397   static const char badbytes[]={
398     /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
399     0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
400     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
401     0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
402     0x7f, 0x00 /* null-terminate */
403   };
404   size_t n = strlen(url);
405   size_t nfine;
406 
407   if(n > CURL_MAX_INPUT_LENGTH)
408     /* excessive input length */
409     return CURLUE_MALFORMED_INPUT;
410 
411   nfine = strcspn(url, badbytes);
412   if((nfine != n) ||
413      (!(flags & CURLU_ALLOW_SPACE) && strchr(url, ' ')))
414     return CURLUE_MALFORMED_INPUT;
415 
416   *urllen = n;
417   return CURLUE_OK;
418 }
419 
420 /*
421  * parse_hostname_login()
422  *
423  * Parse the login details (username, password and options) from the URL and
424  * strip them out of the hostname
425  *
426  */
parse_hostname_login(struct Curl_URL * u,const char * login,size_t len,unsigned int flags,size_t * offset)427 static CURLUcode parse_hostname_login(struct Curl_URL *u,
428                                       const char *login,
429                                       size_t len,
430                                       unsigned int flags,
431                                       size_t *offset) /* to the hostname */
432 {
433   CURLUcode result = CURLUE_OK;
434   CURLcode ccode;
435   char *userp = NULL;
436   char *passwdp = NULL;
437   char *optionsp = NULL;
438   const struct Curl_handler *h = NULL;
439 
440   /* At this point, we assume all the other special cases have been taken
441    * care of, so the host is at most
442    *
443    *   [user[:password][;options]]@]hostname
444    *
445    * We need somewhere to put the embedded details, so do that first.
446    */
447   char *ptr;
448 
449   DEBUGASSERT(login);
450 
451   *offset = 0;
452   ptr = memchr(login, '@', len);
453   if(!ptr)
454     goto out;
455 
456   /* We will now try to extract the
457    * possible login information in a string like:
458    * ftp://user:password@ftp.my.site:8021/README */
459   ptr++;
460 
461   /* if this is a known scheme, get some details */
462   if(u->scheme)
463     h = Curl_get_scheme_handler(u->scheme);
464 
465   /* We could use the login information in the URL so extract it. Only parse
466      options if the handler says we should. Note that 'h' might be NULL! */
467   ccode = Curl_parse_login_details(login, ptr - login - 1,
468                                    &userp, &passwdp,
469                                    (h && (h->flags & PROTOPT_URLOPTIONS)) ?
470                                    &optionsp : NULL);
471   if(ccode) {
472     result = CURLUE_BAD_LOGIN;
473     goto out;
474   }
475 
476   if(userp) {
477     if(flags & CURLU_DISALLOW_USER) {
478       /* Option DISALLOW_USER is set and URL contains username. */
479       result = CURLUE_USER_NOT_ALLOWED;
480       goto out;
481     }
482     free(u->user);
483     u->user = userp;
484   }
485 
486   if(passwdp) {
487     free(u->password);
488     u->password = passwdp;
489   }
490 
491   if(optionsp) {
492     free(u->options);
493     u->options = optionsp;
494   }
495 
496   /* the hostname starts at this offset */
497   *offset = ptr - login;
498   return CURLUE_OK;
499 
500 out:
501 
502   free(userp);
503   free(passwdp);
504   free(optionsp);
505   u->user = NULL;
506   u->password = NULL;
507   u->options = NULL;
508 
509   return result;
510 }
511 
Curl_parse_port(struct Curl_URL * u,struct dynbuf * host,bool has_scheme)512 UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, struct dynbuf *host,
513                                    bool has_scheme)
514 {
515   char *portptr;
516   char *hostname = Curl_dyn_ptr(host);
517   /*
518    * Find the end of an IPv6 address on the ']' ending bracket.
519    */
520   if(hostname[0] == '[') {
521     portptr = strchr(hostname, ']');
522     if(!portptr)
523       return CURLUE_BAD_IPV6;
524     portptr++;
525     /* this is a RFC2732-style specified IP-address */
526     if(*portptr) {
527       if(*portptr != ':')
528         return CURLUE_BAD_PORT_NUMBER;
529     }
530     else
531       portptr = NULL;
532   }
533   else
534     portptr = strchr(hostname, ':');
535 
536   if(portptr) {
537     char *rest = NULL;
538     unsigned long port;
539     size_t keep = portptr - hostname;
540 
541     /* Browser behavior adaptation. If there is a colon with no digits after,
542        just cut off the name there which makes us ignore the colon and just
543        use the default port. Firefox, Chrome and Safari all do that.
544 
545        Do not do it if the URL has no scheme, to make something that looks like
546        a scheme not work!
547     */
548     Curl_dyn_setlen(host, keep);
549     portptr++;
550     if(!*portptr)
551       return has_scheme ? CURLUE_OK : CURLUE_BAD_PORT_NUMBER;
552 
553     if(!ISDIGIT(*portptr))
554       return CURLUE_BAD_PORT_NUMBER;
555 
556     errno = 0;
557     port = strtoul(portptr, &rest, 10);  /* Port number must be decimal */
558 
559     if(errno || (port > 0xffff) || *rest)
560       return CURLUE_BAD_PORT_NUMBER;
561 
562     u->portnum = (unsigned short) port;
563     /* generate a new port number string to get rid of leading zeroes etc */
564     free(u->port);
565     u->port = aprintf("%ld", port);
566     if(!u->port)
567       return CURLUE_OUT_OF_MEMORY;
568   }
569 
570   return CURLUE_OK;
571 }
572 
573 /* this assumes 'hostname' now starts with [ */
ipv6_parse(struct Curl_URL * u,char * hostname,size_t hlen)574 static CURLUcode ipv6_parse(struct Curl_URL *u, char *hostname,
575                             size_t hlen) /* length of hostname */
576 {
577   size_t len;
578   DEBUGASSERT(*hostname == '[');
579   if(hlen < 4) /* '[::]' is the shortest possible valid string */
580     return CURLUE_BAD_IPV6;
581   hostname++;
582   hlen -= 2;
583 
584   /* only valid IPv6 letters are ok */
585   len = strspn(hostname, "0123456789abcdefABCDEF:.");
586 
587   if(hlen != len) {
588     hlen = len;
589     if(hostname[len] == '%') {
590       /* this could now be '%[zone id]' */
591       char zoneid[16];
592       int i = 0;
593       char *h = &hostname[len + 1];
594       /* pass '25' if present and is a URL encoded percent sign */
595       if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
596         h += 2;
597       while(*h && (*h != ']') && (i < 15))
598         zoneid[i++] = *h++;
599       if(!i || (']' != *h))
600         return CURLUE_BAD_IPV6;
601       zoneid[i] = 0;
602       u->zoneid = strdup(zoneid);
603       if(!u->zoneid)
604         return CURLUE_OUT_OF_MEMORY;
605       hostname[len] = ']'; /* insert end bracket */
606       hostname[len + 1] = 0; /* terminate the hostname */
607     }
608     else
609       return CURLUE_BAD_IPV6;
610     /* hostname is fine */
611   }
612 
613   /* Check the IPv6 address. */
614   {
615     char dest[16]; /* fits a binary IPv6 address */
616     char norm[MAX_IPADR_LEN];
617     hostname[hlen] = 0; /* end the address there */
618     if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
619       return CURLUE_BAD_IPV6;
620 
621     /* check if it can be done shorter */
622     if(Curl_inet_ntop(AF_INET6, dest, norm, sizeof(norm)) &&
623        (strlen(norm) < hlen)) {
624       strcpy(hostname, norm);
625       hlen = strlen(norm);
626       hostname[hlen + 1] = 0;
627     }
628     hostname[hlen] = ']'; /* restore ending bracket */
629   }
630   return CURLUE_OK;
631 }
632 
hostname_check(struct Curl_URL * u,char * hostname,size_t hlen)633 static CURLUcode hostname_check(struct Curl_URL *u, char *hostname,
634                                 size_t hlen) /* length of hostname */
635 {
636   size_t len;
637   DEBUGASSERT(hostname);
638 
639   if(!hlen)
640     return CURLUE_NO_HOST;
641   else if(hostname[0] == '[')
642     return ipv6_parse(u, hostname, hlen);
643   else {
644     /* letters from the second string are not ok */
645     len = strcspn(hostname, " \r\n\t/:#?!@{}[]\\$\'\"^`*<>=;,+&()%");
646     if(hlen != len)
647       /* hostname with bad content */
648       return CURLUE_BAD_HOSTNAME;
649   }
650   return CURLUE_OK;
651 }
652 
653 /*
654  * Handle partial IPv4 numerical addresses and different bases, like
655  * '16843009', '0x7f', '0x7f.1' '0177.1.1.1' etc.
656  *
657  * If the given input string is syntactically wrong IPv4 or any part for
658  * example is too big, this function returns HOST_NAME.
659  *
660  * Output the "normalized" version of that input string in plain quad decimal
661  * integers.
662  *
663  * Returns the host type.
664  */
665 
/* the host types ipv4_normalize() can return */
#define HOST_ERROR   (-1) /* out of memory */

#define HOST_NAME    (1)
#define HOST_IPV4    (2)
#define HOST_IPV6    (3)
671 
ipv4_normalize(struct dynbuf * host)672 static int ipv4_normalize(struct dynbuf *host)
673 {
674   bool done = FALSE;
675   int n = 0;
676   const char *c = Curl_dyn_ptr(host);
677   unsigned long parts[4] = {0, 0, 0, 0};
678   CURLcode result = CURLE_OK;
679 
680   if(*c == '[')
681     return HOST_IPV6;
682 
683   errno = 0; /* for strtoul */
684   while(!done) {
685     char *endp = NULL;
686     unsigned long l;
687     if(!ISDIGIT(*c))
688       /* most importantly this does not allow a leading plus or minus */
689       return HOST_NAME;
690     l = strtoul(c, &endp, 0);
691     if(errno)
692       return HOST_NAME;
693 #if SIZEOF_LONG > 4
694     /* a value larger than 32 bits */
695     if(l > UINT_MAX)
696       return HOST_NAME;
697 #endif
698 
699     parts[n] = l;
700     c = endp;
701 
702     switch(*c) {
703     case '.':
704       if(n == 3)
705         return HOST_NAME;
706       n++;
707       c++;
708       break;
709 
710     case '\0':
711       done = TRUE;
712       break;
713 
714     default:
715       return HOST_NAME;
716     }
717   }
718 
719   switch(n) {
720   case 0: /* a -- 32 bits */
721     Curl_dyn_reset(host);
722 
723     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
724                            (unsigned int)(parts[0] >> 24),
725                            (unsigned int)((parts[0] >> 16) & 0xff),
726                            (unsigned int)((parts[0] >> 8) & 0xff),
727                            (unsigned int)(parts[0] & 0xff));
728     break;
729   case 1: /* a.b -- 8.24 bits */
730     if((parts[0] > 0xff) || (parts[1] > 0xffffff))
731       return HOST_NAME;
732     Curl_dyn_reset(host);
733     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
734                            (unsigned int)(parts[0]),
735                            (unsigned int)((parts[1] >> 16) & 0xff),
736                            (unsigned int)((parts[1] >> 8) & 0xff),
737                            (unsigned int)(parts[1] & 0xff));
738     break;
739   case 2: /* a.b.c -- 8.8.16 bits */
740     if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xffff))
741       return HOST_NAME;
742     Curl_dyn_reset(host);
743     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
744                            (unsigned int)(parts[0]),
745                            (unsigned int)(parts[1]),
746                            (unsigned int)((parts[2] >> 8) & 0xff),
747                            (unsigned int)(parts[2] & 0xff));
748     break;
749   case 3: /* a.b.c.d -- 8.8.8.8 bits */
750     if((parts[0] > 0xff) || (parts[1] > 0xff) || (parts[2] > 0xff) ||
751        (parts[3] > 0xff))
752       return HOST_NAME;
753     Curl_dyn_reset(host);
754     result = Curl_dyn_addf(host, "%u.%u.%u.%u",
755                            (unsigned int)(parts[0]),
756                            (unsigned int)(parts[1]),
757                            (unsigned int)(parts[2]),
758                            (unsigned int)(parts[3]));
759     break;
760   }
761   if(result)
762     return HOST_ERROR;
763   return HOST_IPV4;
764 }
765 
766 /* if necessary, replace the host content with a URL decoded version */
urldecode_host(struct dynbuf * host)767 static CURLUcode urldecode_host(struct dynbuf *host)
768 {
769   char *per = NULL;
770   const char *hostname = Curl_dyn_ptr(host);
771   per = strchr(hostname, '%');
772   if(!per)
773     /* nothing to decode */
774     return CURLUE_OK;
775   else {
776     /* encoded */
777     size_t dlen;
778     char *decoded;
779     CURLcode result = Curl_urldecode(hostname, 0, &decoded, &dlen,
780                                      REJECT_CTRL);
781     if(result)
782       return CURLUE_BAD_HOSTNAME;
783     Curl_dyn_reset(host);
784     result = Curl_dyn_addn(host, decoded, dlen);
785     free(decoded);
786     if(result)
787       return cc2cu(result);
788   }
789 
790   return CURLUE_OK;
791 }
792 
parse_authority(struct Curl_URL * u,const char * auth,size_t authlen,unsigned int flags,struct dynbuf * host,bool has_scheme)793 static CURLUcode parse_authority(struct Curl_URL *u,
794                                  const char *auth, size_t authlen,
795                                  unsigned int flags,
796                                  struct dynbuf *host,
797                                  bool has_scheme)
798 {
799   size_t offset;
800   CURLUcode uc;
801   CURLcode result;
802 
803   /*
804    * Parse the login details and strip them out of the hostname.
805    */
806   uc = parse_hostname_login(u, auth, authlen, flags, &offset);
807   if(uc)
808     goto out;
809 
810   result = Curl_dyn_addn(host, auth + offset, authlen - offset);
811   if(result) {
812     uc = cc2cu(result);
813     goto out;
814   }
815 
816   uc = Curl_parse_port(u, host, has_scheme);
817   if(uc)
818     goto out;
819 
820   if(!Curl_dyn_len(host))
821     return CURLUE_NO_HOST;
822 
823   switch(ipv4_normalize(host)) {
824   case HOST_IPV4:
825     break;
826   case HOST_IPV6:
827     uc = ipv6_parse(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
828     break;
829   case HOST_NAME:
830     uc = urldecode_host(host);
831     if(!uc)
832       uc = hostname_check(u, Curl_dyn_ptr(host), Curl_dyn_len(host));
833     break;
834   case HOST_ERROR:
835     uc = CURLUE_OUT_OF_MEMORY;
836     break;
837   default:
838     uc = CURLUE_BAD_HOSTNAME; /* Bad IPv4 address even */
839     break;
840   }
841 
842 out:
843   return uc;
844 }
845 
846 /* used for HTTP/2 server push */
Curl_url_set_authority(CURLU * u,const char * authority)847 CURLUcode Curl_url_set_authority(CURLU *u, const char *authority)
848 {
849   CURLUcode result;
850   struct dynbuf host;
851 
852   DEBUGASSERT(authority);
853   Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
854 
855   result = parse_authority(u, authority, strlen(authority),
856                            CURLU_DISALLOW_USER, &host, !!u->scheme);
857   if(result)
858     Curl_dyn_free(&host);
859   else {
860     free(u->host);
861     u->host = Curl_dyn_ptr(&host);
862   }
863   return result;
864 }
865 
866 /*
867  * "Remove Dot Segments"
868  * https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
869  */
870 
871 /*
872  * dedotdotify()
873  * @unittest: 1395
874  *
875  * This function gets a null-terminated path with dot and dotdot sequences
876  * passed in and strips them off according to the rules in RFC 3986 section
877  * 5.2.4.
878  *
879  * The function handles a query part ('?' + stuff) appended but it expects
880  * that fragments ('#' + stuff) have already been cut off.
881  *
882  * RETURNS
883  *
884  * Zero for success and 'out' set to an allocated dedotdotified string.
885  */
886 UNITTEST int dedotdotify(const char *input, size_t clen, char **outp);
dedotdotify(const char * input,size_t clen,char ** outp)887 UNITTEST int dedotdotify(const char *input, size_t clen, char **outp)
888 {
889   char *outptr;
890   const char *endp = &input[clen];
891   char *out;
892 
893   *outp = NULL;
894   /* the path always starts with a slash, and a slash has not dot */
895   if((clen < 2) || !memchr(input, '.', clen))
896     return 0;
897 
898   out = malloc(clen + 1);
899   if(!out)
900     return 1; /* out of memory */
901 
902   *out = 0; /* null-terminates, for inputs like "./" */
903   outptr = out;
904 
905   do {
906     bool dotdot = TRUE;
907     if(*input == '.') {
908       /*  A. If the input buffer begins with a prefix of "../" or "./", then
909           remove that prefix from the input buffer; otherwise, */
910 
911       if(!strncmp("./", input, 2)) {
912         input += 2;
913         clen -= 2;
914       }
915       else if(!strncmp("../", input, 3)) {
916         input += 3;
917         clen -= 3;
918       }
919       /*  D. if the input buffer consists only of "." or "..", then remove
920           that from the input buffer; otherwise, */
921 
922       else if(!strcmp(".", input) || !strcmp("..", input) ||
923               !strncmp(".?", input, 2) || !strncmp("..?", input, 3)) {
924         *out = 0;
925         break;
926       }
927       else
928         dotdot = FALSE;
929     }
930     else if(*input == '/') {
931       /*  B. if the input buffer begins with a prefix of "/./" or "/.", where
932           "."  is a complete path segment, then replace that prefix with "/" in
933           the input buffer; otherwise, */
934       if(!strncmp("/./", input, 3)) {
935         input += 2;
936         clen -= 2;
937       }
938       else if(!strcmp("/.", input) || !strncmp("/.?", input, 3)) {
939         *outptr++ = '/';
940         *outptr = 0;
941         break;
942       }
943 
944       /*  C. if the input buffer begins with a prefix of "/../" or "/..",
945           where ".." is a complete path segment, then replace that prefix with
946           "/" in the input buffer and remove the last segment and its
947           preceding "/" (if any) from the output buffer; otherwise, */
948 
949       else if(!strncmp("/../", input, 4)) {
950         input += 3;
951         clen -= 3;
952         /* remove the last segment from the output buffer */
953         while(outptr > out) {
954           outptr--;
955           if(*outptr == '/')
956             break;
957         }
958         *outptr = 0; /* null-terminate where it stops */
959       }
960       else if(!strcmp("/..", input) || !strncmp("/..?", input, 4)) {
961         /* remove the last segment from the output buffer */
962         while(outptr > out) {
963           outptr--;
964           if(*outptr == '/')
965             break;
966         }
967         *outptr++ = '/';
968         *outptr = 0; /* null-terminate where it stops */
969         break;
970       }
971       else
972         dotdot = FALSE;
973     }
974     else
975       dotdot = FALSE;
976 
977     if(!dotdot) {
978       /*  E. move the first path segment in the input buffer to the end of
979           the output buffer, including the initial "/" character (if any) and
980           any subsequent characters up to, but not including, the next "/"
981           character or the end of the input buffer. */
982 
983       do {
984         *outptr++ = *input++;
985         clen--;
986       } while(*input && (*input != '/') && (*input != '?'));
987       *outptr = 0;
988     }
989 
990     /* continue until end of path */
991   } while(input < endp);
992 
993   *outp = out;
994   return 0; /* success */
995 }
996 
parseurl(const char * url,CURLU * u,unsigned int flags)997 static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
998 {
999   const char *path;
1000   size_t pathlen;
1001   char *query = NULL;
1002   char *fragment = NULL;
1003   char schemebuf[MAX_SCHEME_LEN + 1];
1004   size_t schemelen = 0;
1005   size_t urllen;
1006   CURLUcode result = CURLUE_OK;
1007   size_t fraglen = 0;
1008   struct dynbuf host;
1009 
1010   DEBUGASSERT(url);
1011 
1012   Curl_dyn_init(&host, CURL_MAX_INPUT_LENGTH);
1013 
1014   result = junkscan(url, &urllen, flags);
1015   if(result)
1016     goto fail;
1017 
1018   schemelen = Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf),
1019                                    flags & (CURLU_GUESS_SCHEME|
1020                                             CURLU_DEFAULT_SCHEME));
1021 
1022   /* handle the file: scheme */
1023   if(schemelen && !strcmp(schemebuf, "file")) {
1024     bool uncpath = FALSE;
1025     if(urllen <= 6) {
1026       /* file:/ is not enough to actually be a complete file: URL */
1027       result = CURLUE_BAD_FILE_URL;
1028       goto fail;
1029     }
1030 
1031     /* path has been allocated large enough to hold this */
1032     path = (char *)&url[5];
1033     pathlen = urllen - 5;
1034 
1035     u->scheme = strdup("file");
1036     if(!u->scheme) {
1037       result = CURLUE_OUT_OF_MEMORY;
1038       goto fail;
1039     }
1040 
1041     /* Extra handling URLs with an authority component (i.e. that start with
1042      * "file://")
1043      *
1044      * We allow omitted hostname (e.g. file:/<path>) -- valid according to
1045      * RFC 8089, but not the (current) WHAT-WG URL spec.
1046      */
1047     if(path[0] == '/' && path[1] == '/') {
1048       /* swallow the two slashes */
1049       const char *ptr = &path[2];
1050 
1051       /*
1052        * According to RFC 8089, a file: URL can be reliably dereferenced if:
1053        *
1054        *  o it has no/blank hostname, or
1055        *
1056        *  o the hostname matches "localhost" (case-insensitively), or
1057        *
1058        *  o the hostname is a FQDN that resolves to this machine, or
1059        *
1060        *  o it is an UNC String transformed to an URI (Windows only, RFC 8089
1061        *    Appendix E.3).
1062        *
1063        * For brevity, we only consider URLs with empty, "localhost", or
1064        * "127.0.0.1" hostnames as local, otherwise as an UNC String.
1065        *
1066        * Additionally, there is an exception for URLs with a Windows drive
1067        * letter in the authority (which was accidentally omitted from RFC 8089
1068        * Appendix E, but believe me, it was meant to be there. --MK)
1069        */
1070       if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
1071         /* the URL includes a hostname, it must match "localhost" or
1072            "127.0.0.1" to be valid */
1073         if(checkprefix("localhost/", ptr) ||
1074            checkprefix("127.0.0.1/", ptr)) {
1075           ptr += 9; /* now points to the slash after the host */
1076         }
1077         else {
1078 #if defined(_WIN32)
1079           size_t len;
1080 
1081           /* the hostname, NetBIOS computer name, can not contain disallowed
1082              chars, and the delimiting slash character must be appended to the
1083              hostname */
1084           path = strpbrk(ptr, "/\\:*?\"<>|");
1085           if(!path || *path != '/') {
1086             result = CURLUE_BAD_FILE_URL;
1087             goto fail;
1088           }
1089 
1090           len = path - ptr;
1091           if(len) {
1092             CURLcode code = Curl_dyn_addn(&host, ptr, len);
1093             if(code) {
1094               result = cc2cu(code);
1095               goto fail;
1096             }
1097             uncpath = TRUE;
1098           }
1099 
1100           ptr -= 2; /* now points to the // before the host in UNC */
1101 #else
1102           /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
1103              none */
1104           result = CURLUE_BAD_FILE_URL;
1105           goto fail;
1106 #endif
1107         }
1108       }
1109 
1110       path = ptr;
1111       pathlen = urllen - (ptr - url);
1112     }
1113 
1114     if(!uncpath)
1115       /* no host for file: URLs by default */
1116       Curl_dyn_reset(&host);
1117 
1118 #if !defined(_WIN32) && !defined(MSDOS) && !defined(__CYGWIN__)
1119     /* Do not allow Windows drive letters when not in Windows.
1120      * This catches both "file:/c:" and "file:c:" */
1121     if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
1122        STARTS_WITH_URL_DRIVE_PREFIX(path)) {
1123       /* File drive letters are only accepted in MS-DOS/Windows */
1124       result = CURLUE_BAD_FILE_URL;
1125       goto fail;
1126     }
1127 #else
1128     /* If the path starts with a slash and a drive letter, ditch the slash */
1129     if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
1130       /* This cannot be done with strcpy, as the memory chunks overlap! */
1131       path++;
1132       pathlen--;
1133     }
1134 #endif
1135 
1136   }
1137   else {
1138     /* clear path */
1139     const char *schemep = NULL;
1140     const char *hostp;
1141     size_t hostlen;
1142 
1143     if(schemelen) {
1144       int i = 0;
1145       const char *p = &url[schemelen + 1];
1146       while((*p == '/') && (i < 4)) {
1147         p++;
1148         i++;
1149       }
1150 
1151       schemep = schemebuf;
1152       if(!Curl_get_scheme_handler(schemep) &&
1153          !(flags & CURLU_NON_SUPPORT_SCHEME)) {
1154         result = CURLUE_UNSUPPORTED_SCHEME;
1155         goto fail;
1156       }
1157 
1158       if((i < 1) || (i > 3)) {
1159         /* less than one or more than three slashes */
1160         result = CURLUE_BAD_SLASHES;
1161         goto fail;
1162       }
1163       hostp = p; /* hostname starts here */
1164     }
1165     else {
1166       /* no scheme! */
1167 
1168       if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME))) {
1169         result = CURLUE_BAD_SCHEME;
1170         goto fail;
1171       }
1172       if(flags & CURLU_DEFAULT_SCHEME)
1173         schemep = DEFAULT_SCHEME;
1174 
1175       /*
1176        * The URL was badly formatted, let's try without scheme specified.
1177        */
1178       hostp = url;
1179     }
1180 
1181     if(schemep) {
1182       u->scheme = strdup(schemep);
1183       if(!u->scheme) {
1184         result = CURLUE_OUT_OF_MEMORY;
1185         goto fail;
1186       }
1187     }
1188 
1189     /* find the end of the hostname + port number */
1190     hostlen = strcspn(hostp, "/?#");
1191     path = &hostp[hostlen];
1192 
1193     /* this pathlen also contains the query and the fragment */
1194     pathlen = urllen - (path - url);
1195     if(hostlen) {
1196 
1197       result = parse_authority(u, hostp, hostlen, flags, &host, schemelen);
1198       if(result)
1199         goto fail;
1200 
1201       if((flags & CURLU_GUESS_SCHEME) && !schemep) {
1202         const char *hostname = Curl_dyn_ptr(&host);
1203         /* legacy curl-style guess based on hostname */
1204         if(checkprefix("ftp.", hostname))
1205           schemep = "ftp";
1206         else if(checkprefix("dict.", hostname))
1207           schemep = "dict";
1208         else if(checkprefix("ldap.", hostname))
1209           schemep = "ldap";
1210         else if(checkprefix("imap.", hostname))
1211           schemep = "imap";
1212         else if(checkprefix("smtp.", hostname))
1213           schemep = "smtp";
1214         else if(checkprefix("pop3.", hostname))
1215           schemep = "pop3";
1216         else
1217           schemep = "http";
1218 
1219         u->scheme = strdup(schemep);
1220         if(!u->scheme) {
1221           result = CURLUE_OUT_OF_MEMORY;
1222           goto fail;
1223         }
1224         u->guessed_scheme = TRUE;
1225       }
1226     }
1227     else if(flags & CURLU_NO_AUTHORITY) {
1228       /* allowed to be empty. */
1229       if(Curl_dyn_add(&host, "")) {
1230         result = CURLUE_OUT_OF_MEMORY;
1231         goto fail;
1232       }
1233     }
1234     else {
1235       result = CURLUE_NO_HOST;
1236       goto fail;
1237     }
1238   }
1239 
1240   fragment = strchr(path, '#');
1241   if(fragment) {
1242     fraglen = pathlen - (fragment - path);
1243     u->fragment_present = TRUE;
1244     if(fraglen > 1) {
1245       /* skip the leading '#' in the copy but include the terminating null */
1246       if(flags & CURLU_URLENCODE) {
1247         struct dynbuf enc;
1248         Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1249         result = urlencode_str(&enc, fragment + 1, fraglen - 1, TRUE, FALSE);
1250         if(result)
1251           goto fail;
1252         u->fragment = Curl_dyn_ptr(&enc);
1253       }
1254       else {
1255         u->fragment = Curl_memdup0(fragment + 1, fraglen - 1);
1256         if(!u->fragment) {
1257           result = CURLUE_OUT_OF_MEMORY;
1258           goto fail;
1259         }
1260       }
1261     }
1262     /* after this, pathlen still contains the query */
1263     pathlen -= fraglen;
1264   }
1265 
1266   query = memchr(path, '?', pathlen);
1267   if(query) {
1268     size_t qlen = fragment ? (size_t)(fragment - query) :
1269       pathlen - (query - path);
1270     pathlen -= qlen;
1271     u->query_present = TRUE;
1272     if(qlen > 1) {
1273       if(flags & CURLU_URLENCODE) {
1274         struct dynbuf enc;
1275         Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1276         /* skip the leading question mark */
1277         result = urlencode_str(&enc, query + 1, qlen - 1, TRUE, TRUE);
1278         if(result)
1279           goto fail;
1280         u->query = Curl_dyn_ptr(&enc);
1281       }
1282       else {
1283         u->query = Curl_memdup0(query + 1, qlen - 1);
1284         if(!u->query) {
1285           result = CURLUE_OUT_OF_MEMORY;
1286           goto fail;
1287         }
1288       }
1289     }
1290     else {
1291       /* single byte query */
1292       u->query = strdup("");
1293       if(!u->query) {
1294         result = CURLUE_OUT_OF_MEMORY;
1295         goto fail;
1296       }
1297     }
1298   }
1299 
1300   if(pathlen && (flags & CURLU_URLENCODE)) {
1301     struct dynbuf enc;
1302     Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1303     result = urlencode_str(&enc, path, pathlen, TRUE, FALSE);
1304     if(result)
1305       goto fail;
1306     pathlen = Curl_dyn_len(&enc);
1307     path = u->path = Curl_dyn_ptr(&enc);
1308   }
1309 
1310   if(pathlen <= 1) {
1311     /* there is no path left or just the slash, unset */
1312     path = NULL;
1313   }
1314   else {
1315     if(!u->path) {
1316       u->path = Curl_memdup0(path, pathlen);
1317       if(!u->path) {
1318         result = CURLUE_OUT_OF_MEMORY;
1319         goto fail;
1320       }
1321       path = u->path;
1322     }
1323     else if(flags & CURLU_URLENCODE)
1324       /* it might have encoded more than just the path so cut it */
1325       u->path[pathlen] = 0;
1326 
1327     if(!(flags & CURLU_PATH_AS_IS)) {
1328       /* remove ../ and ./ sequences according to RFC3986 */
1329       char *dedot;
1330       int err = dedotdotify((char *)path, pathlen, &dedot);
1331       if(err) {
1332         result = CURLUE_OUT_OF_MEMORY;
1333         goto fail;
1334       }
1335       if(dedot) {
1336         free(u->path);
1337         u->path = dedot;
1338       }
1339     }
1340   }
1341 
1342   u->host = Curl_dyn_ptr(&host);
1343 
1344   return result;
1345 fail:
1346   Curl_dyn_free(&host);
1347   free_urlhandle(u);
1348   return result;
1349 }
1350 
1351 /*
1352  * Parse the URL and, if successful, replace everything in the Curl_URL struct.
1353  */
parseurl_and_replace(const char * url,CURLU * u,unsigned int flags)1354 static CURLUcode parseurl_and_replace(const char *url, CURLU *u,
1355                                       unsigned int flags)
1356 {
1357   CURLUcode result;
1358   CURLU tmpurl;
1359   memset(&tmpurl, 0, sizeof(tmpurl));
1360   result = parseurl(url, &tmpurl, flags);
1361   if(!result) {
1362     free_urlhandle(u);
1363     *u = tmpurl;
1364   }
1365   return result;
1366 }
1367 
1368 /*
1369  */
curl_url(void)1370 CURLU *curl_url(void)
1371 {
1372   return calloc(1, sizeof(struct Curl_URL));
1373 }
1374 
/*
 * curl_url_cleanup() frees all parts stored in the handle and then the
 * handle itself. Passing a NULL handle is a no-op.
 */
void curl_url_cleanup(CURLU *u)
{
  if(!u)
    return;

  free_urlhandle(u);
  free(u);
}
1382 
1383 #define DUP(dest, src, name)                    \
1384   do {                                          \
1385     if(src->name) {                             \
1386       dest->name = strdup(src->name);           \
1387       if(!dest->name)                           \
1388         goto fail;                              \
1389     }                                           \
1390   } while(0)
1391 
curl_url_dup(const CURLU * in)1392 CURLU *curl_url_dup(const CURLU *in)
1393 {
1394   struct Curl_URL *u = calloc(1, sizeof(struct Curl_URL));
1395   if(u) {
1396     DUP(u, in, scheme);
1397     DUP(u, in, user);
1398     DUP(u, in, password);
1399     DUP(u, in, options);
1400     DUP(u, in, host);
1401     DUP(u, in, port);
1402     DUP(u, in, path);
1403     DUP(u, in, query);
1404     DUP(u, in, fragment);
1405     DUP(u, in, zoneid);
1406     u->portnum = in->portnum;
1407     u->fragment_present = in->fragment_present;
1408     u->query_present = in->query_present;
1409   }
1410   return u;
1411 fail:
1412   curl_url_cleanup(u);
1413   return NULL;
1414 }
1415 
curl_url_get(const CURLU * u,CURLUPart what,char ** part,unsigned int flags)1416 CURLUcode curl_url_get(const CURLU *u, CURLUPart what,
1417                        char **part, unsigned int flags)
1418 {
1419   const char *ptr;
1420   CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
1421   char portbuf[7];
1422   bool urldecode = (flags & CURLU_URLDECODE) ? 1 : 0;
1423   bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0;
1424   bool punycode = FALSE;
1425   bool depunyfy = FALSE;
1426   bool plusdecode = FALSE;
1427   (void)flags;
1428   if(!u)
1429     return CURLUE_BAD_HANDLE;
1430   if(!part)
1431     return CURLUE_BAD_PARTPOINTER;
1432   *part = NULL;
1433 
1434   switch(what) {
1435   case CURLUPART_SCHEME:
1436     ptr = u->scheme;
1437     ifmissing = CURLUE_NO_SCHEME;
1438     urldecode = FALSE; /* never for schemes */
1439     if((flags & CURLU_NO_GUESS_SCHEME) && u->guessed_scheme)
1440       return CURLUE_NO_SCHEME;
1441     break;
1442   case CURLUPART_USER:
1443     ptr = u->user;
1444     ifmissing = CURLUE_NO_USER;
1445     break;
1446   case CURLUPART_PASSWORD:
1447     ptr = u->password;
1448     ifmissing = CURLUE_NO_PASSWORD;
1449     break;
1450   case CURLUPART_OPTIONS:
1451     ptr = u->options;
1452     ifmissing = CURLUE_NO_OPTIONS;
1453     break;
1454   case CURLUPART_HOST:
1455     ptr = u->host;
1456     ifmissing = CURLUE_NO_HOST;
1457     punycode = (flags & CURLU_PUNYCODE) ? 1 : 0;
1458     depunyfy = (flags & CURLU_PUNY2IDN) ? 1 : 0;
1459     break;
1460   case CURLUPART_ZONEID:
1461     ptr = u->zoneid;
1462     ifmissing = CURLUE_NO_ZONEID;
1463     break;
1464   case CURLUPART_PORT:
1465     ptr = u->port;
1466     ifmissing = CURLUE_NO_PORT;
1467     urldecode = FALSE; /* never for port */
1468     if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
1469       /* there is no stored port number, but asked to deliver
1470          a default one for the scheme */
1471       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1472       if(h) {
1473         msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1474         ptr = portbuf;
1475       }
1476     }
1477     else if(ptr && u->scheme) {
1478       /* there is a stored port number, but ask to inhibit if
1479          it matches the default one for the scheme */
1480       const struct Curl_handler *h = Curl_get_scheme_handler(u->scheme);
1481       if(h && (h->defport == u->portnum) &&
1482          (flags & CURLU_NO_DEFAULT_PORT))
1483         ptr = NULL;
1484     }
1485     break;
1486   case CURLUPART_PATH:
1487     ptr = u->path;
1488     if(!ptr)
1489       ptr = "/";
1490     break;
1491   case CURLUPART_QUERY:
1492     ptr = u->query;
1493     ifmissing = CURLUE_NO_QUERY;
1494     plusdecode = urldecode;
1495     if(ptr && !ptr[0] && !(flags & CURLU_GET_EMPTY))
1496       /* there was a blank query and the user do not ask for it */
1497       ptr = NULL;
1498     break;
1499   case CURLUPART_FRAGMENT:
1500     ptr = u->fragment;
1501     ifmissing = CURLUE_NO_FRAGMENT;
1502     if(!ptr && u->fragment_present && flags & CURLU_GET_EMPTY)
1503       /* there was a blank fragment and the user asks for it */
1504       ptr = "";
1505     break;
1506   case CURLUPART_URL: {
1507     char *url;
1508     char *scheme;
1509     char *options = u->options;
1510     char *port = u->port;
1511     char *allochost = NULL;
1512     bool show_fragment =
1513       u->fragment || (u->fragment_present && flags & CURLU_GET_EMPTY);
1514     bool show_query =
1515       (u->query && u->query[0]) ||
1516       (u->query_present && flags & CURLU_GET_EMPTY);
1517     punycode = (flags & CURLU_PUNYCODE) ? 1 : 0;
1518     depunyfy = (flags & CURLU_PUNY2IDN) ? 1 : 0;
1519     if(u->scheme && strcasecompare("file", u->scheme)) {
1520       url = aprintf("file://%s%s%s",
1521                     u->path,
1522                     show_fragment ? "#": "",
1523                     u->fragment ? u->fragment : "");
1524     }
1525     else if(!u->host)
1526       return CURLUE_NO_HOST;
1527     else {
1528       const struct Curl_handler *h = NULL;
1529       char schemebuf[MAX_SCHEME_LEN + 5];
1530       if(u->scheme)
1531         scheme = u->scheme;
1532       else if(flags & CURLU_DEFAULT_SCHEME)
1533         scheme = (char *) DEFAULT_SCHEME;
1534       else
1535         return CURLUE_NO_SCHEME;
1536 
1537       h = Curl_get_scheme_handler(scheme);
1538       if(!port && (flags & CURLU_DEFAULT_PORT)) {
1539         /* there is no stored port number, but asked to deliver
1540            a default one for the scheme */
1541         if(h) {
1542           msnprintf(portbuf, sizeof(portbuf), "%u", h->defport);
1543           port = portbuf;
1544         }
1545       }
1546       else if(port) {
1547         /* there is a stored port number, but asked to inhibit if it matches
1548            the default one for the scheme */
1549         if(h && (h->defport == u->portnum) &&
1550            (flags & CURLU_NO_DEFAULT_PORT))
1551           port = NULL;
1552       }
1553 
1554       if(h && !(h->flags & PROTOPT_URLOPTIONS))
1555         options = NULL;
1556 
1557       if(u->host[0] == '[') {
1558         if(u->zoneid) {
1559           /* make it '[ host %25 zoneid ]' */
1560           struct dynbuf enc;
1561           size_t hostlen = strlen(u->host);
1562           Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1563           if(Curl_dyn_addf(&enc, "%.*s%%25%s]", (int)hostlen - 1, u->host,
1564                            u->zoneid))
1565             return CURLUE_OUT_OF_MEMORY;
1566           allochost = Curl_dyn_ptr(&enc);
1567         }
1568       }
1569       else if(urlencode) {
1570         allochost = curl_easy_escape(NULL, u->host, 0);
1571         if(!allochost)
1572           return CURLUE_OUT_OF_MEMORY;
1573       }
1574       else if(punycode) {
1575         if(!Curl_is_ASCII_name(u->host)) {
1576 #ifndef USE_IDN
1577           return CURLUE_LACKS_IDN;
1578 #else
1579           CURLcode result = Curl_idn_decode(u->host, &allochost);
1580           if(result)
1581             return (result == CURLE_OUT_OF_MEMORY) ?
1582               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1583 #endif
1584         }
1585       }
1586       else if(depunyfy) {
1587         if(Curl_is_ASCII_name(u->host) && !strncmp("xn--", u->host, 4)) {
1588 #ifndef USE_IDN
1589           return CURLUE_LACKS_IDN;
1590 #else
1591           CURLcode result = Curl_idn_encode(u->host, &allochost);
1592           if(result)
1593             /* this is the most likely error */
1594             return (result == CURLE_OUT_OF_MEMORY) ?
1595               CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1596 #endif
1597         }
1598       }
1599 
1600       if(!(flags & CURLU_NO_GUESS_SCHEME) || !u->guessed_scheme)
1601         msnprintf(schemebuf, sizeof(schemebuf), "%s://", scheme);
1602       else
1603         schemebuf[0] = 0;
1604 
1605       url = aprintf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1606                     schemebuf,
1607                     u->user ? u->user : "",
1608                     u->password ? ":": "",
1609                     u->password ? u->password : "",
1610                     options ? ";" : "",
1611                     options ? options : "",
1612                     (u->user || u->password || options) ? "@": "",
1613                     allochost ? allochost : u->host,
1614                     port ? ":": "",
1615                     port ? port : "",
1616                     u->path ? u->path : "/",
1617                     show_query ? "?": "",
1618                     u->query ? u->query : "",
1619                     show_fragment ? "#": "",
1620                     u->fragment ? u->fragment : "");
1621       free(allochost);
1622     }
1623     if(!url)
1624       return CURLUE_OUT_OF_MEMORY;
1625     *part = url;
1626     return CURLUE_OK;
1627   }
1628   default:
1629     ptr = NULL;
1630     break;
1631   }
1632   if(ptr) {
1633     size_t partlen = strlen(ptr);
1634     size_t i = 0;
1635     *part = Curl_memdup0(ptr, partlen);
1636     if(!*part)
1637       return CURLUE_OUT_OF_MEMORY;
1638     if(plusdecode) {
1639       /* convert + to space */
1640       char *plus = *part;
1641       for(i = 0; i < partlen; ++plus, i++) {
1642         if(*plus == '+')
1643           *plus = ' ';
1644       }
1645     }
1646     if(urldecode) {
1647       char *decoded;
1648       size_t dlen;
1649       /* this unconditional rejection of control bytes is documented
1650          API behavior */
1651       CURLcode res = Curl_urldecode(*part, 0, &decoded, &dlen, REJECT_CTRL);
1652       free(*part);
1653       if(res) {
1654         *part = NULL;
1655         return CURLUE_URLDECODE;
1656       }
1657       *part = decoded;
1658       partlen = dlen;
1659     }
1660     if(urlencode) {
1661       struct dynbuf enc;
1662       CURLUcode uc;
1663       Curl_dyn_init(&enc, CURL_MAX_INPUT_LENGTH);
1664       uc = urlencode_str(&enc, *part, partlen, TRUE, what == CURLUPART_QUERY);
1665       if(uc)
1666         return uc;
1667       free(*part);
1668       *part = Curl_dyn_ptr(&enc);
1669     }
1670     else if(punycode) {
1671       if(!Curl_is_ASCII_name(u->host)) {
1672 #ifndef USE_IDN
1673         return CURLUE_LACKS_IDN;
1674 #else
1675         char *allochost;
1676         CURLcode result = Curl_idn_decode(*part, &allochost);
1677         if(result)
1678           return (result == CURLE_OUT_OF_MEMORY) ?
1679             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1680         free(*part);
1681         *part = allochost;
1682 #endif
1683       }
1684     }
1685     else if(depunyfy) {
1686       if(Curl_is_ASCII_name(u->host)  && !strncmp("xn--", u->host, 4)) {
1687 #ifndef USE_IDN
1688         return CURLUE_LACKS_IDN;
1689 #else
1690         char *allochost;
1691         CURLcode result = Curl_idn_encode(*part, &allochost);
1692         if(result)
1693           return (result == CURLE_OUT_OF_MEMORY) ?
1694             CURLUE_OUT_OF_MEMORY : CURLUE_BAD_HOSTNAME;
1695         free(*part);
1696         *part = allochost;
1697 #endif
1698       }
1699     }
1700 
1701     return CURLUE_OK;
1702   }
1703   else
1704     return ifmissing;
1705 }
1706 
curl_url_set(CURLU * u,CURLUPart what,const char * part,unsigned int flags)1707 CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1708                        const char *part, unsigned int flags)
1709 {
1710   char **storep = NULL;
1711   bool urlencode = (flags & CURLU_URLENCODE) ? 1 : 0;
1712   bool plusencode = FALSE;
1713   bool urlskipslash = FALSE;
1714   bool leadingslash = FALSE;
1715   bool appendquery = FALSE;
1716   bool equalsencode = FALSE;
1717   size_t nalloc;
1718 
1719   if(!u)
1720     return CURLUE_BAD_HANDLE;
1721   if(!part) {
1722     /* setting a part to NULL clears it */
1723     switch(what) {
1724     case CURLUPART_URL:
1725       break;
1726     case CURLUPART_SCHEME:
1727       storep = &u->scheme;
1728       u->guessed_scheme = FALSE;
1729       break;
1730     case CURLUPART_USER:
1731       storep = &u->user;
1732       break;
1733     case CURLUPART_PASSWORD:
1734       storep = &u->password;
1735       break;
1736     case CURLUPART_OPTIONS:
1737       storep = &u->options;
1738       break;
1739     case CURLUPART_HOST:
1740       storep = &u->host;
1741       break;
1742     case CURLUPART_ZONEID:
1743       storep = &u->zoneid;
1744       break;
1745     case CURLUPART_PORT:
1746       u->portnum = 0;
1747       storep = &u->port;
1748       break;
1749     case CURLUPART_PATH:
1750       storep = &u->path;
1751       break;
1752     case CURLUPART_QUERY:
1753       storep = &u->query;
1754       u->query_present = FALSE;
1755       break;
1756     case CURLUPART_FRAGMENT:
1757       storep = &u->fragment;
1758       u->fragment_present = FALSE;
1759       break;
1760     default:
1761       return CURLUE_UNKNOWN_PART;
1762     }
1763     if(storep && *storep) {
1764       Curl_safefree(*storep);
1765     }
1766     else if(!storep) {
1767       free_urlhandle(u);
1768       memset(u, 0, sizeof(struct Curl_URL));
1769     }
1770     return CURLUE_OK;
1771   }
1772 
1773   nalloc = strlen(part);
1774   if(nalloc > CURL_MAX_INPUT_LENGTH)
1775     /* excessive input length */
1776     return CURLUE_MALFORMED_INPUT;
1777 
1778   switch(what) {
1779   case CURLUPART_SCHEME: {
1780     size_t plen = strlen(part);
1781     const char *s = part;
1782     if((plen > MAX_SCHEME_LEN) || (plen < 1))
1783       /* too long or too short */
1784       return CURLUE_BAD_SCHEME;
1785    /* verify that it is a fine scheme */
1786     if(!(flags & CURLU_NON_SUPPORT_SCHEME) && !Curl_get_scheme_handler(part))
1787       return CURLUE_UNSUPPORTED_SCHEME;
1788     storep = &u->scheme;
1789     urlencode = FALSE; /* never */
1790     if(ISALPHA(*s)) {
1791       /* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */
1792       while(--plen) {
1793         if(ISALNUM(*s) || (*s == '+') || (*s == '-') || (*s == '.'))
1794           s++; /* fine */
1795         else
1796           return CURLUE_BAD_SCHEME;
1797       }
1798     }
1799     else
1800       return CURLUE_BAD_SCHEME;
1801     u->guessed_scheme = FALSE;
1802     break;
1803   }
1804   case CURLUPART_USER:
1805     storep = &u->user;
1806     break;
1807   case CURLUPART_PASSWORD:
1808     storep = &u->password;
1809     break;
1810   case CURLUPART_OPTIONS:
1811     storep = &u->options;
1812     break;
1813   case CURLUPART_HOST:
1814     storep = &u->host;
1815     Curl_safefree(u->zoneid);
1816     break;
1817   case CURLUPART_ZONEID:
1818     storep = &u->zoneid;
1819     break;
1820   case CURLUPART_PORT:
1821     if(!ISDIGIT(part[0]))
1822       /* not a number */
1823       return CURLUE_BAD_PORT_NUMBER;
1824     else {
1825       char *tmp;
1826       char *endp;
1827       unsigned long port;
1828       errno = 0;
1829       port = strtoul(part, &endp, 10);  /* must be decimal */
1830       if(errno || (port > 0xffff) || *endp)
1831         /* weirdly provided number, not good! */
1832         return CURLUE_BAD_PORT_NUMBER;
1833       tmp = strdup(part);
1834       if(!tmp)
1835         return CURLUE_OUT_OF_MEMORY;
1836       free(u->port);
1837       u->port = tmp;
1838       u->portnum = (unsigned short)port;
1839       return CURLUE_OK;
1840     }
1841   case CURLUPART_PATH:
1842     urlskipslash = TRUE;
1843     leadingslash = TRUE; /* enforce */
1844     storep = &u->path;
1845     break;
1846   case CURLUPART_QUERY:
1847     plusencode = urlencode;
1848     appendquery = (flags & CURLU_APPENDQUERY) ? 1 : 0;
1849     equalsencode = appendquery;
1850     storep = &u->query;
1851     u->query_present = TRUE;
1852     break;
1853   case CURLUPART_FRAGMENT:
1854     storep = &u->fragment;
1855     u->fragment_present = TRUE;
1856     break;
1857   case CURLUPART_URL: {
1858     /*
1859      * Allow a new URL to replace the existing (if any) contents.
1860      *
1861      * If the existing contents is enough for a URL, allow a relative URL to
1862      * replace it.
1863      */
1864     CURLcode result;
1865     CURLUcode uc;
1866     char *oldurl;
1867     char *redired_url;
1868 
1869     if(!nalloc)
1870       /* a blank URL is not a valid URL */
1871       return CURLUE_MALFORMED_INPUT;
1872 
1873     /* if the new thing is absolute or the old one is not
1874      * (we could not get an absolute URL in 'oldurl'),
1875      * then replace the existing with the new. */
1876     if(Curl_is_absolute_url(part, NULL, 0,
1877                             flags & (CURLU_GUESS_SCHEME|
1878                                      CURLU_DEFAULT_SCHEME))
1879        || curl_url_get(u, CURLUPART_URL, &oldurl, flags)) {
1880       return parseurl_and_replace(part, u, flags);
1881     }
1882 
1883     /* apply the relative part to create a new URL
1884      * and replace the existing one with it. */
1885     result = concat_url(oldurl, part, &redired_url);
1886     free(oldurl);
1887     if(result)
1888       return cc2cu(result);
1889 
1890     uc = parseurl_and_replace(redired_url, u, flags);
1891     free(redired_url);
1892     return uc;
1893   }
1894   default:
1895     return CURLUE_UNKNOWN_PART;
1896   }
1897   DEBUGASSERT(storep);
1898   {
1899     const char *newp;
1900     struct dynbuf enc;
1901     Curl_dyn_init(&enc, nalloc * 3 + 1 + leadingslash);
1902 
1903     if(leadingslash && (part[0] != '/')) {
1904       CURLcode result = Curl_dyn_addn(&enc, "/", 1);
1905       if(result)
1906         return cc2cu(result);
1907     }
1908     if(urlencode) {
1909       const unsigned char *i;
1910 
1911       for(i = (const unsigned char *)part; *i; i++) {
1912         CURLcode result;
1913         if((*i == ' ') && plusencode) {
1914           result = Curl_dyn_addn(&enc, "+", 1);
1915           if(result)
1916             return CURLUE_OUT_OF_MEMORY;
1917         }
1918         else if(ISUNRESERVED(*i) ||
1919                 ((*i == '/') && urlskipslash) ||
1920                 ((*i == '=') && equalsencode)) {
1921           if((*i == '=') && equalsencode)
1922             /* only skip the first equals sign */
1923             equalsencode = FALSE;
1924           result = Curl_dyn_addn(&enc, i, 1);
1925           if(result)
1926             return cc2cu(result);
1927         }
1928         else {
1929           char out[3]={'%'};
1930           out[1] = hexdigits[*i >> 4];
1931           out[2] = hexdigits[*i & 0xf];
1932           result = Curl_dyn_addn(&enc, out, 3);
1933           if(result)
1934             return cc2cu(result);
1935         }
1936       }
1937     }
1938     else {
1939       char *p;
1940       CURLcode result = Curl_dyn_add(&enc, part);
1941       if(result)
1942         return cc2cu(result);
1943       p = Curl_dyn_ptr(&enc);
1944       while(*p) {
1945         /* make sure percent encoded are lower case */
1946         if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1947            (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1948           p[1] = Curl_raw_tolower(p[1]);
1949           p[2] = Curl_raw_tolower(p[2]);
1950           p += 3;
1951         }
1952         else
1953           p++;
1954       }
1955     }
1956     newp = Curl_dyn_ptr(&enc);
1957 
1958     if(appendquery && newp) {
1959       /* Append the 'newp' string onto the old query. Add a '&' separator if
1960          none is present at the end of the existing query already */
1961 
1962       size_t querylen = u->query ? strlen(u->query) : 0;
1963       bool addamperand = querylen && (u->query[querylen -1] != '&');
1964       if(querylen) {
1965         struct dynbuf qbuf;
1966         Curl_dyn_init(&qbuf, CURL_MAX_INPUT_LENGTH);
1967 
1968         if(Curl_dyn_addn(&qbuf, u->query, querylen)) /* add original query */
1969           goto nomem;
1970 
1971         if(addamperand) {
1972           if(Curl_dyn_addn(&qbuf, "&", 1))
1973             goto nomem;
1974         }
1975         if(Curl_dyn_add(&qbuf, newp))
1976           goto nomem;
1977         Curl_dyn_free(&enc);
1978         free(*storep);
1979         *storep = Curl_dyn_ptr(&qbuf);
1980         return CURLUE_OK;
1981 nomem:
1982         Curl_dyn_free(&enc);
1983         return CURLUE_OUT_OF_MEMORY;
1984       }
1985     }
1986 
1987     else if(what == CURLUPART_HOST) {
1988       size_t n = Curl_dyn_len(&enc);
1989       if(!n && (flags & CURLU_NO_AUTHORITY)) {
1990         /* Skip hostname check, it is allowed to be empty. */
1991       }
1992       else {
1993         bool bad = FALSE;
1994         if(!n)
1995           bad = TRUE; /* empty hostname is not okay */
1996         else if(!urlencode) {
1997           /* if the host name part was not URL encoded here, it was set ready
1998              URL encoded so we need to decode it to check */
1999           size_t dlen;
2000           char *decoded = NULL;
2001           CURLcode result =
2002             Curl_urldecode(newp, n, &decoded, &dlen, REJECT_CTRL);
2003           if(result || hostname_check(u, decoded, dlen))
2004             bad = TRUE;
2005           free(decoded);
2006         }
2007         else if(hostname_check(u, (char *)newp, n))
2008           bad = TRUE;
2009         if(bad) {
2010           Curl_dyn_free(&enc);
2011           return CURLUE_BAD_HOSTNAME;
2012         }
2013       }
2014     }
2015 
2016     free(*storep);
2017     *storep = (char *)newp;
2018   }
2019   return CURLUE_OK;
2020 }
2021