1/***************************************************************************
2 * _ _ ____ _
3 * Project ___| | | | _ \| |
4 * / __| | | | |_) | |
5 * | (__| |_| | _ <| |___
6 * \___|\___/|_| \_\_____|
7 *
8 * Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
9 *
10 * This software is licensed as described in the file COPYING, which
11 * you should have received as part of this distribution. The terms
12 * are also available at https://curl.haxx.se/docs/copyright.html.
13 *
14 * You may opt to use, copy, modify, merge, publish, distribute and/or sell
15 * copies of the Software, and permit persons to whom the Software is
16 * furnished to do so, under the terms of the COPYING file.
17 *
18 * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19 * KIND, either express or implied.
20 *
21 ***************************************************************************/
22
23#include "curl_setup.h"
24
25#include "urldata.h"
26#include "urlapi-int.h"
27#include "strcase.h"
28#include "dotdot.h"
29#include "url.h"
30#include "escape.h"
31#include "curl_ctype.h"
32#include "inet_pton.h"
33
34/* The last 3 #include files should be in this order */
35#include "curl_printf.h"
36#include "curl_memory.h"
37#include "memdebug.h"
38
/* MSDOS/Windows style drive prefix, eg c: in c:foo
 *
 * Evaluates to non-zero when 'str' starts with a letter ('a'-'z' or 'A'-'Z')
 * immediately followed by a colon. The argument is parenthesized so that
 * pointer expressions such as 'p + 1' expand correctly. It is evaluated more
 * than once, so it must not have side effects.
 */
#define STARTS_WITH_DRIVE_PREFIX(str) \
  ((('a' <= (str)[0] && (str)[0] <= 'z') || \
    ('A' <= (str)[0] && (str)[0] <= 'Z')) && \
   ((str)[1] == ':'))
44
/* Drive prefix the way it may show up inside a URL: a letter, then ':' or
 * '|', and after that a forward slash, a backslash or the end of the
 * string. The argument is evaluated several times. */
#define STARTS_WITH_URL_DRIVE_PREFIX(str) \
  ((((str)[0] >= 'a' && (str)[0] <= 'z') || \
    ((str)[0] >= 'A' && (str)[0] <= 'Z')) && \
   ((str)[1] == '|' || (str)[1] == ':') && \
   ((str)[2] == '\0' || (str)[2] == '\\' || (str)[2] == '/'))
52
/* Internal representation of CURLU. Point to URL-encoded strings.
 *
 * Every pointer member is either NULL or points to heap memory that
 * free_urlhandle() releases.
 */
struct Curl_URL {
  char *scheme;
  char *user;
  char *password;
  char *options; /* IMAP only? */
  char *host;
  char *zoneid; /* for numerical IPv6 addresses */
  char *port; /* the port number as a decimal string */
  char *path;
  char *query;
  char *fragment;

  char *scratch; /* temporary scratch area, allocated and released by
                    seturl() while it parses */
  char *temppath; /* temporary path pointer, holds a path allocated during
                     parsing until seturl() hands it over to 'path' */
  long portnum; /* the numerical version of 'port' */
};
70
/* the scheme used when CURLU_DEFAULT_SCHEME is set and the URL has none */
#define DEFAULT_SCHEME "https"
72
73static void free_urlhandle(struct Curl_URL *u)
74{
75 free(u->scheme);
76 free(u->user);
77 free(u->password);
78 free(u->options);
79 free(u->host);
80 free(u->zoneid);
81 free(u->port);
82 free(u->path);
83 free(u->query);
84 free(u->fragment);
85 free(u->scratch);
86 free(u->temppath);
87}
88
89/* move the full contents of one handle onto another and
90 free the original */
91static void mv_urlhandle(struct Curl_URL *from,
92 struct Curl_URL *to)
93{
94 free_urlhandle(to);
95 *to = *from;
96 free(from);
97}
98
/*
 * Return a pointer to where the host name part of 'url' ends: the first '/'
 * or '?' found after the "//" that precedes the host name (or from the start
 * of the string when there is no "//"), whichever comes first. Looking for
 * '?' as well covers slash-less cases like http://www.url.com?id=2380
 * When neither character is present, the returned pointer is the string's
 * terminating zero byte.
 */
static const char *find_host_sep(const char *url)
{
  const char *end = url + strlen(url);
  const char *host = strstr(url, "//");
  const char *slash;
  const char *qmark;

  /* the host name starts right after the double slash, if there is one */
  host = host ? host + 2 : url;

  slash = strchr(host, '/');
  if(!slash)
    slash = end;

  qmark = strchr(host, '?');
  if(!qmark)
    qmark = end;

  /* whichever separator shows up first ends the host name */
  return (qmark <= slash) ? qmark : slash;
}
126
127/*
128 * Decide in an encoding-independent manner whether a character in an
129 * URL must be escaped. The same criterion must be used in strlen_url()
130 * and strcpy_url().
131 */
132static bool urlchar_needs_escaping(int c)
133{
134 return !(ISCNTRL(c) || ISSPACE(c) || ISGRAPH(c));
135}
136
137/*
138 * strlen_url() returns the length of the given URL if the spaces within the
139 * URL were properly URL encoded.
140 * URL encoding should be skipped for host names, otherwise IDN resolution
141 * will fail.
142 */
143static size_t strlen_url(const char *url, bool relative)
144{
145 const unsigned char *ptr;
146 size_t newlen = 0;
147 bool left = TRUE; /* left side of the ? */
148 const unsigned char *host_sep = (const unsigned char *) url;
149
150 if(!relative)
151 host_sep = (const unsigned char *) find_host_sep(url);
152
153 for(ptr = (unsigned char *)url; *ptr; ptr++) {
154
155 if(ptr < host_sep) {
156 ++newlen;
157 continue;
158 }
159
160 switch(*ptr) {
161 case '?':
162 left = FALSE;
163 /* FALLTHROUGH */
164 default:
165 if(urlchar_needs_escaping(*ptr))
166 newlen += 2;
167 newlen++;
168 break;
169 case ' ':
170 if(left)
171 newlen += 3;
172 else
173 newlen++;
174 break;
175 }
176 }
177 return newlen;
178}
179
180/* strcpy_url() copies a url to a output buffer and URL-encodes the spaces in
181 * the source URL accordingly.
182 * URL encoding should be skipped for host names, otherwise IDN resolution
183 * will fail.
184 */
185static void strcpy_url(char *output, const char *url, bool relative)
186{
187 /* we must add this with whitespace-replacing */
188 bool left = TRUE;
189 const unsigned char *iptr;
190 char *optr = output;
191 const unsigned char *host_sep = (const unsigned char *) url;
192
193 if(!relative)
194 host_sep = (const unsigned char *) find_host_sep(url);
195
196 for(iptr = (unsigned char *)url; /* read from here */
197 *iptr; /* until zero byte */
198 iptr++) {
199
200 if(iptr < host_sep) {
201 *optr++ = *iptr;
202 continue;
203 }
204
205 switch(*iptr) {
206 case '?':
207 left = FALSE;
208 /* FALLTHROUGH */
209 default:
210 if(urlchar_needs_escaping(*iptr)) {
211 msnprintf(optr, 4, "%%%02x", *iptr);
212 optr += 3;
213 }
214 else
215 *optr++=*iptr;
216 break;
217 case ' ':
218 if(left) {
219 *optr++='%'; /* add a '%' */
220 *optr++='2'; /* add a '2' */
221 *optr++='0'; /* add a '0' */
222 }
223 else
224 *optr++='+'; /* add a '+' here */
225 break;
226 }
227 }
228 *optr = 0; /* zero terminate output buffer */
229
230}
231
232/*
233 * Returns true if the given URL is absolute (as opposed to relative) within
234 * the buffer size. Returns the scheme in the buffer if TRUE and 'buf' is
235 * non-NULL.
236 */
237bool Curl_is_absolute_url(const char *url, char *buf, size_t buflen)
238{
239 size_t i;
240#ifdef WIN32
241 if(STARTS_WITH_DRIVE_PREFIX(url))
242 return FALSE;
243#endif
244 for(i = 0; i < buflen && url[i]; ++i) {
245 char s = url[i];
246 if((s == ':') && (url[i + 1] == '/')) {
247 if(buf)
248 buf[i] = 0;
249 return TRUE;
250 }
251 /* RFC 3986 3.1 explains:
252 scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
253 */
254 else if(ISALNUM(s) || (s == '+') || (s == '-') || (s == '.') ) {
255 if(buf)
256 buf[i] = (char)TOLOWER(s);
257 }
258 else
259 break;
260 }
261 return FALSE;
262}
263
/*
 * Concatenate a relative URL to a base URL making it absolute.
 * URL-encodes any spaces.
 * The returned pointer must be freed by the caller unless NULL
 * (returns NULL on out of memory).
 *
 * base:   the URL the relative one is resolved against. It is duplicated
 *         and only the copy is cut up, 'base' itself is left untouched.
 * relurl: the relative part. It is handled in one of these ways:
 *         - starts with "//": replaces everything after the "//" of 'base',
 *           host name included
 *         - starts with a single '/': replaces the path (and query) of
 *           'base'
 *         - starts with '?': replaces the query of 'base'
 *         - anything else: replaces the last path segment of 'base', after
 *           a leading "./" was skipped and every leading "../" removed one
 *           more directory level from 'base'
 */
static char *concat_url(const char *base, const char *relurl)
{
  /***
   TRY to append this new path to the old URL
   to the right of the host part. Oh crap, this is doomed to cause
   problems in the future...
  */
  char *newest;
  char *protsep;
  char *pathsep;
  size_t newlen;
  bool host_changed = FALSE;

  const char *useurl = relurl;
  size_t urllen;

  /* we must make our own copy of the URL to play with, as it may
     point to read-only data */
  char *url_clone = strdup(base);

  if(!url_clone)
    return NULL; /* skip out of this NOW */

  /* protsep points to the start of the host name */
  protsep = strstr(url_clone, "//");
  if(!protsep)
    protsep = url_clone;
  else
    protsep += 2; /* pass the slashes */

  if('/' != relurl[0]) {
    int level = 0;

    /* First we need to find out if there's a ?-letter in the URL,
       and cut it and the right-side of that off */
    pathsep = strchr(protsep, '?');
    if(pathsep)
      *pathsep = 0;

    /* we have a relative path to append to the last slash if there's one
       available, or if the new URL is just a query string (starts with a
       '?')  we append the new one at the end of the entire currently worked
       out URL */
    if(useurl[0] != '?') {
      pathsep = strrchr(protsep, '/');
      if(pathsep)
        *pathsep = 0;
    }

    /* Check if there's any slash after the host name, and if so, remember
       that position instead. From here on protsep points to the start of
       the path, or is NULL when the base has no path left. */
    pathsep = strchr(protsep, '/');
    if(pathsep)
      protsep = pathsep + 1;
    else
      protsep = NULL;

    /* now deal with one "./" or any amount of "../" in the newurl
       and act accordingly */

    if((useurl[0] == '.') && (useurl[1] == '/'))
      useurl += 2; /* just skip the "./" */

    while((useurl[0] == '.') &&
          (useurl[1] == '.') &&
          (useurl[2] == '/')) {
      level++;
      useurl += 3; /* pass the "../" */
    }

    if(protsep) {
      while(level--) {
        /* cut off one more level from the right of the original URL */
        pathsep = strrchr(protsep, '/');
        if(pathsep)
          *pathsep = 0;
        else {
          /* no more levels to remove, drop the whole path and stop */
          *protsep = 0;
          break;
        }
      }
    }
  }
  else {
    /* We got a new absolute path for this server */

    if(relurl[1] == '/') {
      /* the new URL starts with //, just keep the protocol part from the
         original one */
      *protsep = 0;
      useurl = &relurl[2]; /* we keep the slashes from the original, so we
                              skip the new ones */
      host_changed = TRUE;
    }
    else {
      /* cut off the original URL from the first slash, or deal with URLs
         without slash */
      pathsep = strchr(protsep, '/');
      if(pathsep) {
        /* When people use badly formatted URLs, such as
           "http://www.url.com?dir=/home/daniel" we must not use the first
           slash, if there's a ?-letter before it! */
        char *sep = strchr(protsep, '?');
        if(sep && (sep < pathsep))
          pathsep = sep;
        *pathsep = 0;
      }
      else {
        /* There was no slash. Now, since we might be operating on a badly
           formatted URL, such as "http://www.url.com?id=2380" which doesn't
           use a slash separator as it is supposed to, we need to check for a
           ?-letter as well! */
        pathsep = strchr(protsep, '?');
        if(pathsep)
          *pathsep = 0;
      }
    }
  }

  /* If the new part contains a space, this is a mighty stupid redirect
     but we still make an effort to do "right". To the left of a '?'
     letter we replace each space with %20 while it is replaced with '+'
     on the right side of the '?' letter.

     When the host was replaced, the new part starts with a host name and
     is therefore not treated as relative, which keeps the host name from
     being URL encoded.
  */
  newlen = strlen_url(useurl, !host_changed);

  urllen = strlen(url_clone);

  newest = malloc(urllen + 1 + /* possible slash */
                  newlen + 1 /* zero byte */);

  if(!newest) {
    free(url_clone); /* don't leak this */
    return NULL;
  }

  /* copy over the root url part */
  memcpy(newest, url_clone, urllen);

  /* check if we need to append a slash: not when the new part brings its
     own leading slash, not when the path of the base was emptied above and
     not when the new part is a query */
  if(('/' == useurl[0]) || (protsep && !*protsep) || ('?' == useurl[0]))
    ;
  else
    newest[urllen++]='/';

  /* then append the new piece on the right side, this also zero terminates
     the result */
  strcpy_url(&newest[urllen], useurl, !host_changed);

  free(url_clone);

  return newest;
}
422
423/*
424 * parse_hostname_login()
425 *
426 * Parse the login details (user name, password and options) from the URL and
427 * strip them out of the host name
428 *
429 */
430static CURLUcode parse_hostname_login(struct Curl_URL *u,
431 const struct Curl_handler *h,
432 char **hostname,
433 unsigned int flags)
434{
435 CURLUcode result = CURLUE_OK;
436 CURLcode ccode;
437 char *userp = NULL;
438 char *passwdp = NULL;
439 char *optionsp = NULL;
440
441 /* At this point, we're hoping all the other special cases have
442 * been taken care of, so conn->host.name is at most
443 * [user[:password][;options]]@]hostname
444 *
445 * We need somewhere to put the embedded details, so do that first.
446 */
447
448 char *ptr = strchr(*hostname, '@');
449 char *login = *hostname;
450
451 if(!ptr)
452 goto out;
453
454 /* We will now try to extract the
455 * possible login information in a string like:
456 * ftp://user:password@ftp.my.site:8021/README */
457 *hostname = ++ptr;
458
459 /* We could use the login information in the URL so extract it. Only parse
460 options if the handler says we should. Note that 'h' might be NULL! */
461 ccode = Curl_parse_login_details(login, ptr - login - 1,
462 &userp, &passwdp,
463 (h && (h->flags & PROTOPT_URLOPTIONS)) ?
464 &optionsp:NULL);
465 if(ccode) {
466 result = CURLUE_MALFORMED_INPUT;
467 goto out;
468 }
469
470 if(userp) {
471 if(flags & CURLU_DISALLOW_USER) {
472 /* Option DISALLOW_USER is set and url contains username. */
473 result = CURLUE_USER_NOT_ALLOWED;
474 goto out;
475 }
476
477 u->user = userp;
478 }
479
480 if(passwdp)
481 u->password = passwdp;
482
483 if(optionsp)
484 u->options = optionsp;
485
486 return CURLUE_OK;
487 out:
488
489 free(userp);
490 free(passwdp);
491 free(optionsp);
492
493 return result;
494}
495
496UNITTEST CURLUcode Curl_parse_port(struct Curl_URL *u, char *hostname)
497{
498 char *portptr = NULL;
499 char endbracket;
500 int len;
501
502 /*
503 * Find the end of an IPv6 address, either on the ']' ending bracket or
504 * a percent-encoded zone index.
505 */
506 if(1 == sscanf(hostname, "[%*45[0123456789abcdefABCDEF:.]%c%n",
507 &endbracket, &len)) {
508 if(']' == endbracket)
509 portptr = &hostname[len];
510 else if('%' == endbracket) {
511 int zonelen = len;
512 if(1 == sscanf(hostname + zonelen, "%*[^]]%c%n", &endbracket, &len)) {
513 if(']' != endbracket)
514 return CURLUE_MALFORMED_INPUT;
515 portptr = &hostname[--zonelen + len + 1];
516 }
517 else
518 return CURLUE_MALFORMED_INPUT;
519 }
520 else
521 return CURLUE_MALFORMED_INPUT;
522
523 /* this is a RFC2732-style specified IP-address */
524 if(portptr && *portptr) {
525 if(*portptr != ':')
526 return CURLUE_MALFORMED_INPUT;
527 }
528 else
529 portptr = NULL;
530 }
531 else
532 portptr = strchr(hostname, ':');
533
534 if(portptr) {
535 char *rest;
536 long port;
537 char portbuf[7];
538
539 /* Browser behavior adaptation. If there's a colon with no digits after,
540 just cut off the name there which makes us ignore the colon and just
541 use the default port. Firefox, Chrome and Safari all do that. */
542 if(!portptr[1]) {
543 *portptr = '\0';
544 return CURLUE_OK;
545 }
546
547 if(!ISDIGIT(portptr[1]))
548 return CURLUE_BAD_PORT_NUMBER;
549
550 port = strtol(portptr + 1, &rest, 10); /* Port number must be decimal */
551
552 if((port <= 0) || (port > 0xffff))
553 /* Single unix standard says port numbers are 16 bits long, but we don't
554 treat port zero as OK. */
555 return CURLUE_BAD_PORT_NUMBER;
556
557 if(rest[0])
558 return CURLUE_BAD_PORT_NUMBER;
559
560 *portptr++ = '\0'; /* cut off the name there */
561 *rest = 0;
562 /* generate a new port number string to get rid of leading zeroes etc */
563 msnprintf(portbuf, sizeof(portbuf), "%ld", port);
564 u->portnum = port;
565 u->port = strdup(portbuf);
566 if(!u->port)
567 return CURLUE_OUT_OF_MEMORY;
568 }
569
570 return CURLUE_OK;
571}
572
573/* scan for byte values < 31 or 127 */
574static CURLUcode junkscan(char *part)
575{
576 if(part) {
577 static const char badbytes[]={
578 /* */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
579 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
580 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
581 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
582 0x7f,
583 0x00 /* zero terminate */
584 };
585 size_t n = strlen(part);
586 size_t nfine = strcspn(part, badbytes);
587 if(nfine != n)
588 /* since we don't know which part is scanned, return a generic error
589 code */
590 return CURLUE_MALFORMED_INPUT;
591 }
592 return CURLUE_OK;
593}
594
/*
 * hostname_check() verifies that 'hostname' is acceptable as a host name.
 *
 * A name starting with '[' must be a bracketed numerical IPv6 address,
 * optionally with a zone id after a '%' (or "%25"). The zone id is then
 * copied to u->zoneid and removed from 'hostname' in place, so that the
 * string ends with the ']' that follows the address. With ENABLE_IPV6 the
 * address is also verified with Curl_inet_pton().
 *
 * Any other name is only checked for spaces.
 *
 * Returns CURLUE_OK if fine, CURLUE_MALFORMED_INPUT for bad content,
 * CURLUE_NO_HOST for a blank name and CURLUE_OUT_OF_MEMORY when the zone id
 * could not be duplicated.
 */
static CURLUcode hostname_check(struct Curl_URL *u, char *hostname)
{
  size_t len;
  size_t hlen = strlen(hostname);

  if(hostname[0] == '[') {
#ifdef ENABLE_IPV6
    char dest[16]; /* fits a binary IPv6 address */
#endif
    const char *l = "0123456789abcdefABCDEF:.";
    if(hlen < 5) /* '[::1]' is the shortest possible valid string */
      return CURLUE_MALFORMED_INPUT;
    /* from here on, hostname points to the first byte after the '[' and
       hlen is the length of what is between the brackets */
    hostname++;
    hlen -= 2;

    if(hostname[hlen] != ']')
      return CURLUE_MALFORMED_INPUT;

    /* only valid letters are ok */
    len = strspn(hostname, l);
    if(hlen != len) {
      /* something other than address letters precedes the end bracket */
      hlen = len;
      if(hostname[len] == '%') {
        /* this could now be '%[zone id]' */
        char zoneid[16];
        int i = 0;
        char *h = &hostname[len + 1];
        /* pass '25' if present and is a url encoded percent sign, which it
           is only considered to be when more zone id data follows it */
        if(!strncmp(h, "25", 2) && h[2] && (h[2] != ']'))
          h += 2;
        /* copy at most 15 bytes of zone id */
        while(*h && (*h != ']') && (i < 15))
          zoneid[i++] = *h++;
        /* a blank zone id is not ok, nor is one that did not end at the
           bracket, which includes one too long to fit the buffer */
        if(!i || (']' != *h))
          return CURLUE_MALFORMED_INPUT;
        zoneid[i] = 0;
        u->zoneid = strdup(zoneid);
        if(!u->zoneid)
          return CURLUE_OUT_OF_MEMORY;
        hostname[len] = ']'; /* insert end bracket */
        hostname[len + 1] = 0; /* terminate the hostname */
      }
      else
        return CURLUE_MALFORMED_INPUT;
      /* hostname is fine */
    }
#ifdef ENABLE_IPV6
    hostname[hlen] = 0; /* end the address there */
    if(1 != Curl_inet_pton(AF_INET6, hostname, dest))
      return CURLUE_MALFORMED_INPUT;
    hostname[hlen] = ']'; /* restore ending bracket */
#endif
  }
  else {
    /* a space anywhere in the name is not ok */
    len = strcspn(hostname, " ");
    if(hlen != len)
      /* hostname with bad content */
      return CURLUE_MALFORMED_INPUT;
  }
  if(!hostname[0])
    return CURLUE_NO_HOST;
  return CURLUE_OK;
}
658
/* non-zero for the characters that end the host name part of a URL. The
   argument is evaluated more than once. */
#define HOSTNAME_END(x) (('/' == (x)) || ('?' == (x)) || ('#' == (x)))
660
661static CURLUcode seturl(const char *url, CURLU *u, unsigned int flags)
662{
663 char *path;
664 bool path_alloced = FALSE;
665 char *hostname;
666 char *query = NULL;
667 char *fragment = NULL;
668 CURLUcode result;
669 bool url_has_scheme = FALSE;
670 char schemebuf[MAX_SCHEME_LEN + 1];
671 char *schemep = NULL;
672 size_t schemelen = 0;
673 size_t urllen;
674 const struct Curl_handler *h = NULL;
675
676 if(!url)
677 return CURLUE_MALFORMED_INPUT;
678
679 /*************************************************************
680 * Parse the URL.
681 ************************************************************/
682 /* allocate scratch area */
683 urllen = strlen(url);
684 if(urllen > CURL_MAX_INPUT_LENGTH)
685 /* excessive input length */
686 return CURLUE_MALFORMED_INPUT;
687
688 path = u->scratch = malloc(urllen * 2 + 2);
689 if(!path)
690 return CURLUE_OUT_OF_MEMORY;
691
692 hostname = &path[urllen + 1];
693 hostname[0] = 0;
694
695 if(Curl_is_absolute_url(url, schemebuf, sizeof(schemebuf))) {
696 url_has_scheme = TRUE;
697 schemelen = strlen(schemebuf);
698 }
699
700 /* handle the file: scheme */
701 if(url_has_scheme && strcasecompare(schemebuf, "file")) {
702 /* path has been allocated large enough to hold this */
703 strcpy(path, &url[5]);
704
705 hostname = NULL; /* no host for file: URLs */
706 u->scheme = strdup("file");
707 if(!u->scheme)
708 return CURLUE_OUT_OF_MEMORY;
709
710 /* Extra handling URLs with an authority component (i.e. that start with
711 * "file://")
712 *
713 * We allow omitted hostname (e.g. file:/<path>) -- valid according to
714 * RFC 8089, but not the (current) WHAT-WG URL spec.
715 */
716 if(path[0] == '/' && path[1] == '/') {
717 /* swallow the two slashes */
718 char *ptr = &path[2];
719
720 /*
721 * According to RFC 8089, a file: URL can be reliably dereferenced if:
722 *
723 * o it has no/blank hostname, or
724 *
725 * o the hostname matches "localhost" (case-insensitively), or
726 *
727 * o the hostname is a FQDN that resolves to this machine.
728 *
729 * For brevity, we only consider URLs with empty, "localhost", or
730 * "127.0.0.1" hostnames as local.
731 *
732 * Additionally, there is an exception for URLs with a Windows drive
733 * letter in the authority (which was accidentally omitted from RFC 8089
734 * Appendix E, but believe me, it was meant to be there. --MK)
735 */
736 if(ptr[0] != '/' && !STARTS_WITH_URL_DRIVE_PREFIX(ptr)) {
737 /* the URL includes a host name, it must match "localhost" or
738 "127.0.0.1" to be valid */
739 if(!checkprefix("localhost/", ptr) &&
740 !checkprefix("127.0.0.1/", ptr)) {
741 /* Invalid file://hostname/, expected localhost or 127.0.0.1 or
742 none */
743 return CURLUE_MALFORMED_INPUT;
744 }
745 ptr += 9; /* now points to the slash after the host */
746 }
747
748 path = ptr;
749 }
750
751#if !defined(MSDOS) && !defined(WIN32) && !defined(__CYGWIN__)
752 /* Don't allow Windows drive letters when not in Windows.
753 * This catches both "file:/c:" and "file:c:" */
754 if(('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) ||
755 STARTS_WITH_URL_DRIVE_PREFIX(path)) {
756 /* File drive letters are only accepted in MSDOS/Windows */
757 return CURLUE_MALFORMED_INPUT;
758 }
759#else
760 /* If the path starts with a slash and a drive letter, ditch the slash */
761 if('/' == path[0] && STARTS_WITH_URL_DRIVE_PREFIX(&path[1])) {
762 /* This cannot be done with strcpy, as the memory chunks overlap! */
763 memmove(path, &path[1], strlen(&path[1]) + 1);
764 }
765#endif
766
767 }
768 else {
769 /* clear path */
770 const char *p;
771 const char *hostp;
772 size_t len;
773 path[0] = 0;
774
775 if(url_has_scheme) {
776 int i = 0;
777 p = &url[schemelen + 1];
778 while(p && (*p == '/') && (i < 4)) {
779 p++;
780 i++;
781 }
782 if((i < 1) || (i>3))
783 /* less than one or more than three slashes */
784 return CURLUE_MALFORMED_INPUT;
785
786 schemep = schemebuf;
787 if(!Curl_builtin_scheme(schemep) &&
788 !(flags & CURLU_NON_SUPPORT_SCHEME))
789 return CURLUE_UNSUPPORTED_SCHEME;
790
791 if(junkscan(schemep))
792 return CURLUE_MALFORMED_INPUT;
793
794 }
795 else {
796 /* no scheme! */
797
798 if(!(flags & (CURLU_DEFAULT_SCHEME|CURLU_GUESS_SCHEME)))
799 return CURLUE_MALFORMED_INPUT;
800 if(flags & CURLU_DEFAULT_SCHEME)
801 schemep = (char *) DEFAULT_SCHEME;
802
803 /*
804 * The URL was badly formatted, let's try without scheme specified.
805 */
806 p = url;
807 }
808 hostp = p; /* host name starts here */
809
810 while(*p && !HOSTNAME_END(*p)) /* find end of host name */
811 p++;
812
813 len = p - hostp;
814 if(len) {
815 memcpy(hostname, hostp, len);
816 hostname[len] = 0;
817 }
818 else {
819 if(!(flags & CURLU_NO_AUTHORITY))
820 return CURLUE_MALFORMED_INPUT;
821 }
822
823 if((flags & CURLU_GUESS_SCHEME) && !schemep) {
824 /* legacy curl-style guess based on host name */
825 if(checkprefix("ftp.", hostname))
826 schemep = (char *)"ftp";
827 else if(checkprefix("dict.", hostname))
828 schemep = (char *)"dict";
829 else if(checkprefix("ldap.", hostname))
830 schemep = (char *)"ldap";
831 else if(checkprefix("imap.", hostname))
832 schemep = (char *)"imap";
833 else if(checkprefix("smtp.", hostname))
834 schemep = (char *)"smtp";
835 else if(checkprefix("pop3.", hostname))
836 schemep = (char *)"pop3";
837 else
838 schemep = (char *)"http";
839 }
840
841 len = strlen(p);
842 memcpy(path, p, len);
843 path[len] = 0;
844
845 u->scheme = strdup(schemep);
846 if(!u->scheme)
847 return CURLUE_OUT_OF_MEMORY;
848 }
849
850 /* if this is a known scheme, get some details */
851 h = Curl_builtin_scheme(u->scheme);
852
853 if(junkscan(path))
854 return CURLUE_MALFORMED_INPUT;
855
856 if((flags & CURLU_URLENCODE) && path[0]) {
857 /* worst case output length is 3x the original! */
858 char *newp = malloc(strlen(path) * 3);
859 if(!newp)
860 return CURLUE_OUT_OF_MEMORY;
861 path_alloced = TRUE;
862 strcpy_url(newp, path, TRUE); /* consider it relative */
863 u->temppath = path = newp;
864 }
865
866 fragment = strchr(path, '#');
867 if(fragment) {
868 *fragment++ = 0;
869 if(fragment[0]) {
870 u->fragment = strdup(fragment);
871 if(!u->fragment)
872 return CURLUE_OUT_OF_MEMORY;
873 }
874 }
875
876 query = strchr(path, '?');
877 if(query) {
878 *query++ = 0;
879 /* done even if the query part is a blank string */
880 u->query = strdup(query);
881 if(!u->query)
882 return CURLUE_OUT_OF_MEMORY;
883 }
884
885 if(!path[0])
886 /* if there's no path left set, unset */
887 path = NULL;
888 else {
889 if(!(flags & CURLU_PATH_AS_IS)) {
890 /* remove ../ and ./ sequences according to RFC3986 */
891 char *newp = Curl_dedotdotify(path);
892 if(!newp)
893 return CURLUE_OUT_OF_MEMORY;
894
895 if(strcmp(newp, path)) {
896 /* if we got a new version */
897 if(path_alloced)
898 Curl_safefree(u->temppath);
899 u->temppath = path = newp;
900 path_alloced = TRUE;
901 }
902 else
903 free(newp);
904 }
905
906 u->path = path_alloced?path:strdup(path);
907 if(!u->path)
908 return CURLUE_OUT_OF_MEMORY;
909 u->temppath = NULL; /* used now */
910 }
911
912 if(hostname) {
913 /*
914 * Parse the login details and strip them out of the host name.
915 */
916 if(junkscan(hostname))
917 return CURLUE_MALFORMED_INPUT;
918
919 result = parse_hostname_login(u, h, &hostname, flags);
920 if(result)
921 return result;
922
923 result = Curl_parse_port(u, hostname);
924 if(result)
925 return result;
926
927 if(0 == strlen(hostname) && (flags & CURLU_NO_AUTHORITY)) {
928 /* Skip hostname check, it's allowed to be empty. */
929 }
930 else {
931 result = hostname_check(u, hostname);
932 if(result)
933 return result;
934 }
935
936 u->host = strdup(hostname);
937 if(!u->host)
938 return CURLUE_OUT_OF_MEMORY;
939 }
940
941 Curl_safefree(u->scratch);
942 Curl_safefree(u->temppath);
943
944 return CURLUE_OK;
945}
946
947/*
948 * Parse the URL and set the relevant members of the Curl_URL struct.
949 */
950static CURLUcode parseurl(const char *url, CURLU *u, unsigned int flags)
951{
952 CURLUcode result = seturl(url, u, flags);
953 if(result) {
954 free_urlhandle(u);
955 memset(u, 0, sizeof(struct Curl_URL));
956 }
957 return result;
958}
959
960/*
961 */
962CURLU *curl_url(void)
963{
964 return calloc(sizeof(struct Curl_URL), 1);
965}
966
967void curl_url_cleanup(CURLU *u)
968{
969 if(u) {
970 free_urlhandle(u);
971 free(u);
972 }
973}
974
/*
 * DUP() copies the string member 'name' from the handle 'src' to the handle
 * 'dest' with strdup(). 'dest' is left untouched when the source member is
 * NULL. If the allocation fails, it jumps to a label named 'fail' that the
 * function using the macro must provide.
 *
 * The body is wrapped in do { } while(0) to make the macro act as a single
 * statement, so that it also works in an unbraced if/else.
 */
#define DUP(dest, src, name)              \
  do {                                    \
    if((src)->name) {                     \
      (dest)->name = strdup((src)->name); \
      if(!(dest)->name)                   \
        goto fail;                        \
    }                                     \
  } while(0)
981
982CURLU *curl_url_dup(CURLU *in)
983{
984 struct Curl_URL *u = calloc(sizeof(struct Curl_URL), 1);
985 if(u) {
986 DUP(u, in, scheme);
987 DUP(u, in, user);
988 DUP(u, in, password);
989 DUP(u, in, options);
990 DUP(u, in, host);
991 DUP(u, in, port);
992 DUP(u, in, path);
993 DUP(u, in, query);
994 DUP(u, in, fragment);
995 u->portnum = in->portnum;
996 }
997 return u;
998 fail:
999 curl_url_cleanup(u);
1000 return NULL;
1001}
1002
1003CURLUcode curl_url_get(CURLU *u, CURLUPart what,
1004 char **part, unsigned int flags)
1005{
1006 char *ptr;
1007 CURLUcode ifmissing = CURLUE_UNKNOWN_PART;
1008 char portbuf[7];
1009 bool urldecode = (flags & CURLU_URLDECODE)?1:0;
1010 bool plusdecode = FALSE;
1011 (void)flags;
1012 if(!u)
1013 return CURLUE_BAD_HANDLE;
1014 if(!part)
1015 return CURLUE_BAD_PARTPOINTER;
1016 *part = NULL;
1017
1018 switch(what) {
1019 case CURLUPART_SCHEME:
1020 ptr = u->scheme;
1021 ifmissing = CURLUE_NO_SCHEME;
1022 urldecode = FALSE; /* never for schemes */
1023 break;
1024 case CURLUPART_USER:
1025 ptr = u->user;
1026 ifmissing = CURLUE_NO_USER;
1027 break;
1028 case CURLUPART_PASSWORD:
1029 ptr = u->password;
1030 ifmissing = CURLUE_NO_PASSWORD;
1031 break;
1032 case CURLUPART_OPTIONS:
1033 ptr = u->options;
1034 ifmissing = CURLUE_NO_OPTIONS;
1035 break;
1036 case CURLUPART_HOST:
1037 ptr = u->host;
1038 ifmissing = CURLUE_NO_HOST;
1039 break;
1040 case CURLUPART_ZONEID:
1041 ptr = u->zoneid;
1042 break;
1043 case CURLUPART_PORT:
1044 ptr = u->port;
1045 ifmissing = CURLUE_NO_PORT;
1046 urldecode = FALSE; /* never for port */
1047 if(!ptr && (flags & CURLU_DEFAULT_PORT) && u->scheme) {
1048 /* there's no stored port number, but asked to deliver
1049 a default one for the scheme */
1050 const struct Curl_handler *h =
1051 Curl_builtin_scheme(u->scheme);
1052 if(h) {
1053 msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1054 ptr = portbuf;
1055 }
1056 }
1057 else if(ptr && u->scheme) {
1058 /* there is a stored port number, but ask to inhibit if
1059 it matches the default one for the scheme */
1060 const struct Curl_handler *h =
1061 Curl_builtin_scheme(u->scheme);
1062 if(h && (h->defport == u->portnum) &&
1063 (flags & CURLU_NO_DEFAULT_PORT))
1064 ptr = NULL;
1065 }
1066 break;
1067 case CURLUPART_PATH:
1068 ptr = u->path;
1069 if(!ptr) {
1070 ptr = u->path = strdup("/");
1071 if(!u->path)
1072 return CURLUE_OUT_OF_MEMORY;
1073 }
1074 break;
1075 case CURLUPART_QUERY:
1076 ptr = u->query;
1077 ifmissing = CURLUE_NO_QUERY;
1078 plusdecode = urldecode;
1079 break;
1080 case CURLUPART_FRAGMENT:
1081 ptr = u->fragment;
1082 ifmissing = CURLUE_NO_FRAGMENT;
1083 break;
1084 case CURLUPART_URL: {
1085 char *url;
1086 char *scheme;
1087 char *options = u->options;
1088 char *port = u->port;
1089 char *allochost = NULL;
1090 if(u->scheme && strcasecompare("file", u->scheme)) {
1091 url = aprintf("file://%s%s%s",
1092 u->path,
1093 u->fragment? "#": "",
1094 u->fragment? u->fragment : "");
1095 }
1096 else if(!u->host)
1097 return CURLUE_NO_HOST;
1098 else {
1099 const struct Curl_handler *h = NULL;
1100 if(u->scheme)
1101 scheme = u->scheme;
1102 else if(flags & CURLU_DEFAULT_SCHEME)
1103 scheme = (char *) DEFAULT_SCHEME;
1104 else
1105 return CURLUE_NO_SCHEME;
1106
1107 h = Curl_builtin_scheme(scheme);
1108 if(!port && (flags & CURLU_DEFAULT_PORT)) {
1109 /* there's no stored port number, but asked to deliver
1110 a default one for the scheme */
1111 if(h) {
1112 msnprintf(portbuf, sizeof(portbuf), "%ld", h->defport);
1113 port = portbuf;
1114 }
1115 }
1116 else if(port) {
1117 /* there is a stored port number, but asked to inhibit if it matches
1118 the default one for the scheme */
1119 if(h && (h->defport == u->portnum) &&
1120 (flags & CURLU_NO_DEFAULT_PORT))
1121 port = NULL;
1122 }
1123
1124 if(h && !(h->flags & PROTOPT_URLOPTIONS))
1125 options = NULL;
1126
1127 if((u->host[0] == '[') && u->zoneid) {
1128 /* make it '[ host %25 zoneid ]' */
1129 size_t hostlen = strlen(u->host);
1130 size_t alen = hostlen + 3 + strlen(u->zoneid) + 1;
1131 allochost = malloc(alen);
1132 if(!allochost)
1133 return CURLUE_OUT_OF_MEMORY;
1134 memcpy(allochost, u->host, hostlen - 1);
1135 msnprintf(&allochost[hostlen - 1], alen - hostlen + 1,
1136 "%%25%s]", u->zoneid);
1137 }
1138
1139 url = aprintf("%s://%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
1140 scheme,
1141 u->user ? u->user : "",
1142 u->password ? ":": "",
1143 u->password ? u->password : "",
1144 options ? ";" : "",
1145 options ? options : "",
1146 (u->user || u->password || options) ? "@": "",
1147 allochost ? allochost : u->host,
1148 port ? ":": "",
1149 port ? port : "",
1150 (u->path && (u->path[0] != '/')) ? "/": "",
1151 u->path ? u->path : "/",
1152 (u->query && u->query[0]) ? "?": "",
1153 (u->query && u->query[0]) ? u->query : "",
1154 u->fragment? "#": "",
1155 u->fragment? u->fragment : "");
1156 free(allochost);
1157 }
1158 if(!url)
1159 return CURLUE_OUT_OF_MEMORY;
1160 *part = url;
1161 return CURLUE_OK;
1162 }
1163 default:
1164 ptr = NULL;
1165 break;
1166 }
1167 if(ptr) {
1168 *part = strdup(ptr);
1169 if(!*part)
1170 return CURLUE_OUT_OF_MEMORY;
1171 if(plusdecode) {
1172 /* convert + to space */
1173 char *plus;
1174 for(plus = *part; *plus; ++plus) {
1175 if(*plus == '+')
1176 *plus = ' ';
1177 }
1178 }
1179 if(urldecode) {
1180 char *decoded;
1181 size_t dlen;
1182 CURLcode res = Curl_urldecode(NULL, *part, 0, &decoded, &dlen, TRUE);
1183 free(*part);
1184 if(res) {
1185 *part = NULL;
1186 return CURLUE_URLDECODE;
1187 }
1188 *part = decoded;
1189 }
1190 return CURLUE_OK;
1191 }
1192 else
1193 return ifmissing;
1194}
1195
1196CURLUcode curl_url_set(CURLU *u, CURLUPart what,
1197 const char *part, unsigned int flags)
1198{
1199 char **storep = NULL;
1200 long port = 0;
1201 bool urlencode = (flags & CURLU_URLENCODE)? 1 : 0;
1202 bool plusencode = FALSE;
1203 bool urlskipslash = FALSE;
1204 bool appendquery = FALSE;
1205 bool equalsencode = FALSE;
1206
1207 if(!u)
1208 return CURLUE_BAD_HANDLE;
1209 if(!part) {
1210 /* setting a part to NULL clears it */
1211 switch(what) {
1212 case CURLUPART_URL:
1213 break;
1214 case CURLUPART_SCHEME:
1215 storep = &u->scheme;
1216 break;
1217 case CURLUPART_USER:
1218 storep = &u->user;
1219 break;
1220 case CURLUPART_PASSWORD:
1221 storep = &u->password;
1222 break;
1223 case CURLUPART_OPTIONS:
1224 storep = &u->options;
1225 break;
1226 case CURLUPART_HOST:
1227 storep = &u->host;
1228 break;
1229 case CURLUPART_ZONEID:
1230 storep = &u->zoneid;
1231 break;
1232 case CURLUPART_PORT:
1233 u->portnum = 0;
1234 storep = &u->port;
1235 break;
1236 case CURLUPART_PATH:
1237 storep = &u->path;
1238 break;
1239 case CURLUPART_QUERY:
1240 storep = &u->query;
1241 break;
1242 case CURLUPART_FRAGMENT:
1243 storep = &u->fragment;
1244 break;
1245 default:
1246 return CURLUE_UNKNOWN_PART;
1247 }
1248 if(storep && *storep) {
1249 free(*storep);
1250 *storep = NULL;
1251 }
1252 return CURLUE_OK;
1253 }
1254
1255 switch(what) {
1256 case CURLUPART_SCHEME:
1257 if(strlen(part) > MAX_SCHEME_LEN)
1258 /* too long */
1259 return CURLUE_MALFORMED_INPUT;
1260 if(!(flags & CURLU_NON_SUPPORT_SCHEME) &&
1261 /* verify that it is a fine scheme */
1262 !Curl_builtin_scheme(part))
1263 return CURLUE_UNSUPPORTED_SCHEME;
1264 storep = &u->scheme;
1265 urlencode = FALSE; /* never */
1266 break;
1267 case CURLUPART_USER:
1268 storep = &u->user;
1269 break;
1270 case CURLUPART_PASSWORD:
1271 storep = &u->password;
1272 break;
1273 case CURLUPART_OPTIONS:
1274 storep = &u->options;
1275 break;
1276 case CURLUPART_HOST:
1277 storep = &u->host;
1278 free(u->zoneid);
1279 u->zoneid = NULL;
1280 break;
1281 case CURLUPART_ZONEID:
1282 storep = &u->zoneid;
1283 break;
1284 case CURLUPART_PORT:
1285 {
1286 char *endp;
1287 urlencode = FALSE; /* never */
1288 port = strtol(part, &endp, 10); /* Port number must be decimal */
1289 if((port <= 0) || (port > 0xffff))
1290 return CURLUE_BAD_PORT_NUMBER;
1291 if(*endp)
1292 /* weirdly provided number, not good! */
1293 return CURLUE_MALFORMED_INPUT;
1294 storep = &u->port;
1295 }
1296 break;
1297 case CURLUPART_PATH:
1298 urlskipslash = TRUE;
1299 storep = &u->path;
1300 break;
1301 case CURLUPART_QUERY:
1302 plusencode = urlencode;
1303 appendquery = (flags & CURLU_APPENDQUERY)?1:0;
1304 equalsencode = appendquery;
1305 storep = &u->query;
1306 break;
1307 case CURLUPART_FRAGMENT:
1308 storep = &u->fragment;
1309 break;
1310 case CURLUPART_URL: {
1311 /*
1312 * Allow a new URL to replace the existing (if any) contents.
1313 *
1314 * If the existing contents is enough for a URL, allow a relative URL to
1315 * replace it.
1316 */
1317 CURLUcode result;
1318 char *oldurl;
1319 char *redired_url;
1320 CURLU *handle2;
1321
1322 if(Curl_is_absolute_url(part, NULL, MAX_SCHEME_LEN + 1)) {
1323 handle2 = curl_url();
1324 if(!handle2)
1325 return CURLUE_OUT_OF_MEMORY;
1326 result = parseurl(part, handle2, flags);
1327 if(!result)
1328 mv_urlhandle(handle2, u);
1329 else
1330 curl_url_cleanup(handle2);
1331 return result;
1332 }
1333 /* extract the full "old" URL to do the redirect on */
1334 result = curl_url_get(u, CURLUPART_URL, &oldurl, flags);
1335 if(result) {
1336 /* couldn't get the old URL, just use the new! */
1337 handle2 = curl_url();
1338 if(!handle2)
1339 return CURLUE_OUT_OF_MEMORY;
1340 result = parseurl(part, handle2, flags);
1341 if(!result)
1342 mv_urlhandle(handle2, u);
1343 else
1344 curl_url_cleanup(handle2);
1345 return result;
1346 }
1347
1348 /* apply the relative part to create a new URL */
1349 redired_url = concat_url(oldurl, part);
1350 free(oldurl);
1351 if(!redired_url)
1352 return CURLUE_OUT_OF_MEMORY;
1353
1354 /* now parse the new URL */
1355 handle2 = curl_url();
1356 if(!handle2) {
1357 free(redired_url);
1358 return CURLUE_OUT_OF_MEMORY;
1359 }
1360 result = parseurl(redired_url, handle2, flags);
1361 free(redired_url);
1362 if(!result)
1363 mv_urlhandle(handle2, u);
1364 else
1365 curl_url_cleanup(handle2);
1366 return result;
1367 }
1368 default:
1369 return CURLUE_UNKNOWN_PART;
1370 }
1371 DEBUGASSERT(storep);
1372 {
1373 const char *newp = part;
1374 size_t nalloc = strlen(part);
1375
1376 if(nalloc > CURL_MAX_INPUT_LENGTH)
1377 /* excessive input length */
1378 return CURLUE_MALFORMED_INPUT;
1379
1380 if(urlencode) {
1381 const unsigned char *i;
1382 char *o;
1383 bool free_part = FALSE;
1384 char *enc = malloc(nalloc * 3 + 1); /* for worst case! */
1385 if(!enc)
1386 return CURLUE_OUT_OF_MEMORY;
1387 if(plusencode) {
1388 /* space to plus */
1389 i = (const unsigned char *)part;
1390 for(o = enc; *i; ++o, ++i)
1391 *o = (*i == ' ') ? '+' : *i;
1392 *o = 0; /* zero terminate */
1393 part = strdup(enc);
1394 if(!part) {
1395 free(enc);
1396 return CURLUE_OUT_OF_MEMORY;
1397 }
1398 free_part = TRUE;
1399 }
1400 for(i = (const unsigned char *)part, o = enc; *i; i++) {
1401 if(Curl_isunreserved(*i) ||
1402 ((*i == '/') && urlskipslash) ||
1403 ((*i == '=') && equalsencode) ||
1404 ((*i == '+') && plusencode)) {
1405 if((*i == '=') && equalsencode)
1406 /* only skip the first equals sign */
1407 equalsencode = FALSE;
1408 *o = *i;
1409 o++;
1410 }
1411 else {
1412 msnprintf(o, 4, "%%%02x", *i);
1413 o += 3;
1414 }
1415 }
1416 *o = 0; /* zero terminate */
1417 newp = enc;
1418 if(free_part)
1419 free((char *)part);
1420 }
1421 else {
1422 char *p;
1423 newp = strdup(part);
1424 if(!newp)
1425 return CURLUE_OUT_OF_MEMORY;
1426 p = (char *)newp;
1427 while(*p) {
1428 /* make sure percent encoded are lower case */
1429 if((*p == '%') && ISXDIGIT(p[1]) && ISXDIGIT(p[2]) &&
1430 (ISUPPER(p[1]) || ISUPPER(p[2]))) {
1431 p[1] = (char)TOLOWER(p[1]);
1432 p[2] = (char)TOLOWER(p[2]);
1433 p += 3;
1434 }
1435 else
1436 p++;
1437 }
1438 }
1439
1440 if(appendquery) {
1441 /* Append the string onto the old query. Add a '&' separator if none is
1442 present at the end of the exsting query already */
1443 size_t querylen = u->query ? strlen(u->query) : 0;
1444 bool addamperand = querylen && (u->query[querylen -1] != '&');
1445 if(querylen) {
1446 size_t newplen = strlen(newp);
1447 char *p = malloc(querylen + addamperand + newplen + 1);
1448 if(!p) {
1449 free((char *)newp);
1450 return CURLUE_OUT_OF_MEMORY;
1451 }
1452 strcpy(p, u->query); /* original query */
1453 if(addamperand)
1454 p[querylen] = '&'; /* ampersand */
1455 strcpy(&p[querylen + addamperand], newp); /* new suffix */
1456 free((char *)newp);
1457 free(*storep);
1458 *storep = p;
1459 return CURLUE_OK;
1460 }
1461 }
1462
1463 if(what == CURLUPART_HOST) {
1464 if(0 == strlen(newp) && (flags & CURLU_NO_AUTHORITY)) {
1465 /* Skip hostname check, it's allowed to be empty. */
1466 }
1467 else {
1468 if(hostname_check(u, (char *)newp)) {
1469 free((char *)newp);
1470 return CURLUE_MALFORMED_INPUT;
1471 }
1472 }
1473 }
1474
1475 free(*storep);
1476 *storep = (char *)newp;
1477 }
1478 /* set after the string, to make it not assigned if the allocation above
1479 fails */
1480 if(port)
1481 u->portnum = port;
1482 return CURLUE_OK;
1483}
1484