| 1 | /* |
| 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
| 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
| 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| 5 | * |
| 6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
| 7 | */ |
| 8 | |
| 9 | /* |
| 10 | * M. Kersten |
| 11 | * Y. Zhang |
| 12 | * The URL module |
| 13 | * The URL module contains a collection of commands to manipulate |
| 14 | * Uniform Resource Locators - a resource on the World Wide Web- |
| 15 | * represented as a string in Monet. The URL can represent |
| 16 | * anything from a file, a directory or a complete movie. |
| 17 | * This module is geared towards manipulation of their name only. |
| 18 | * A complementary module can be used to gain access.[IOgate] |
| 19 | * |
| 20 | * The URL syntax is specified in RFC2396, Uniform Resource Identifiers |
| 21 | * (URI): Generic Syntax. The URL syntax is dependent upon the scheme. |
| 22 | * In general, a URL has the form <scheme>:<scheme-specific-part>. |
 * Thus, accepting a valid URL is a simple process, unless the scheme
 * is known and scheme-specific syntax is checked (e.g., http or ftp
| 25 | * scheme). For the URL module implemented here, we assume some common |
| 26 | * fields of the <scheme-specific-part> that are shared among different |
| 27 | * schemes. |
| 28 | * |
| 29 | * The core of the extension involves several operators to extract |
| 30 | * portions of the URLs for further manipulation. In particular, |
| 31 | * the domain, the server, and the protocol, and the file extension |
| 32 | * can be extracted without copying the complete URL from the heap |
| 33 | * into a string variable first. |
| 34 | * |
| 35 | * The commands provided are based on the corresponding Java class. |
| 36 | * |
| 37 | * A future version should use a special atom, because this may save |
| 38 | * considerable space. Alternatively, break the URL strings into |
| 39 | * components and represent them with a bunch of BATs. An intermediate |
| 40 | * step would be to refine the atom STR, then it would be possible to |
| 41 | * redefine hashing. |
| 42 | */ |
| 43 | |
| 44 | #include "monetdb_config.h" |
| 45 | #include "mal.h" |
| 46 | #include "url.h" |
| 47 | #include "mal_exception.h" |
| 48 | |
| 49 | static char x2c(char *what); |
| 50 | |
| 51 | /* SCHEME "://" AUTHORITY [ PATH ] [ "?" SEARCH ] [ "#" FRAGMENT ] |
| 52 | * AUTHORITY is: [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ] */ |
| 53 | |
/* Skip the scheme at the start of a URI.
 *
 * A scheme is an ASCII letter followed by any number of letters,
 * digits, '+', '-' or '.', and it must be terminated by ':'.
 *
 * input:   pointer to the start of the URI
 * returns: pointer to the character after the ':', or NULL when the
 *          string does not start with a scheme */
static const char *
skip_scheme(const char *uri)
{
	const char *cur = uri;

	/* the first character has to be a letter */
	if (!(('a' <= *cur && *cur <= 'z') || ('A' <= *cur && *cur <= 'Z')))
		return NULL;

	/* consume the remaining scheme characters */
	for (cur++;; cur++) {
		const char c = *cur;

		if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
			continue;
		if (isdigit((unsigned char) c))
			continue;
		if (c == '+' || c == '-' || c == '.')
			continue;
		break;
	}

	/* only a ':' may end the scheme */
	return *cur == ':' ? cur + 1 : NULL;
}
| 71 | |
/* Character classes used by the URI parsers below.  Every macro
 * evaluates its argument more than once, so only pass expressions
 * without side effects. */
#define ishex(c)		isxdigit((unsigned char) (c))
#define isreserved(c)	((c) == ';' || (c) == '/' || (c) == '?' || \
			 (c) == ':' || (c) == '@' || (c) == '&' || \
			 (c) == '=' || (c) == '+' || (c) == '$' || \
			 (c) == ',')
#define isunreserved(c)	(('a' <= (c) && (c) <= 'z') || \
			 ('A' <= (c) && (c) <= 'Z') || \
			 isdigit((unsigned char) (c)) || \
			 (c) == '-' || (c) == '_' || (c) == '.' || \
			 (c) == '!' || (c) == '~' || (c) == '*' || \
			 (c) == '\'' || (c) == '(' || (c) == ')')

/* Skip the authority part ("//" [ USER [ ":" PASSWORD ] "@" ] HOST
 * [ ":" PORT ]) of a URI.
 *
 * input:   result of skip_scheme()
 * returns: pointer to the first character after the authority, or
 *          NULL when the input does not start with "//"
 *
 * Each of the output arguments may be NULL when the caller is not
 * interested.  The values stored point into the URI itself and are
 * not terminated; callers derive lengths from the other pointers.
 *   *userp  start of the user name, or NULL when there is no '@'
 *   *passp  character after the ':' in the user info, or NULL when
 *           there is no '@' or no ':' before it
 *   *hostp  start of the host (after the last '@', if any)
 *   *portp  character after the last ':' that follows the start of
 *           the host, or NULL when the host has no port */
static const char *
skip_authority(const char *uri, const char **userp, const char **passp, const char **hostp, const char **portp)
{
	const char *user = NULL, *pass = NULL, *host = NULL, *port = NULL;

	if (uri[0] != '/' || uri[1] != '/')
		return NULL;

	uri += 2;
	user = host = uri;
	while (isunreserved(*uri) ||
	       (*uri == '%' && ishex(uri[1]) && ishex(uri[2])) ||
	       *uri == ';' || *uri == ':' || *uri == '=' || *uri == '+' || *uri == '$' || *uri == ',' ||
	       *uri == '@') {
		if (*uri == ':') {
			/* as long as no '@' was seen this may be either the
			 * password or the port separator */
			if (user == host)
				pass = uri + 1;
			port = uri + 1;
		} else if (*uri == '@') {
			host = uri + 1;
			/* any ':' seen so far was part of the user info,
			 * so it cannot be the port separator */
			port = NULL;
		}
		uri += *uri == '%' ? 3 : 1;
	}

	if (user == host) {
		/* no "@", so no user info */
		user = NULL;
		pass = NULL;
	}
	if (userp)
		*userp = user;
	if (passp)
		*passp = pass;
	if (portp)
		*portp = port;
	if (hostp)
		*hostp = host;
	return uri;
}
| 128 | |
/* Skip the path part of a URI.
 *
 * input:   result of skip_authority()
 * returns: pointer to the first character after the path; when the
 *          input does not start with '/' this is the input itself
 *
 * The output arguments may be NULL when not wanted.
 *   *basep  start of the last path component, or NULL if no path
 *   *extp   first '.' inside the last component that is not its first
 *           character, or NULL if there is none */
static const char *
skip_path(const char *uri, const char **basep, const char **extp)
{
	const char *last = NULL;	/* start of the final component */
	const char *dot = NULL;		/* first '.' inside that component */

	if (*uri == '/') {
		last = ++uri;
		for (;;) {
			const char c = *uri;

			if (c == '/') {
				/* a new component starts: forget the old extension */
				last = uri + 1;
				dot = NULL;
				uri++;
			} else if (c == '%') {
				/* only complete %HH escapes belong to the path */
				if (!ishex(uri[1]) || !ishex(uri[2]))
					break;
				uri += 3;
			} else if (isunreserved(c) ||
				   c == ':' || c == '@' || c == '&' || c == '=' ||
				   c == '+' || c == '$' || c == ',' || c == ';') {
				if (c == '.' && dot == NULL && uri != last)
					dot = uri;
				uri++;
			} else {
				break;
			}
		}
	}

	if (basep)
		*basep = last;
	if (extp)
		*extp = dot;
	return uri;
}
| 160 | |
/* Skip the search (query) part of a URI, i.e. a '?' followed by
 * reserved characters, unreserved characters and %HH escapes.
 *
 * input:   result of skip_path()
 * returns: pointer to the first character after the search string;
 *          the input itself when it does not start with '?' */
static const char *
skip_search(const char *uri)
{
	if (*uri != '?')
		return uri;

	uri++;
	for (;;) {
		if (*uri == '%') {
			/* an incomplete escape ends the search string */
			if (!(ishex(uri[1]) && ishex(uri[2])))
				break;
			uri += 3;
		} else if (isreserved(*uri) || isunreserved(*uri)) {
			uri++;
		} else {
			break;
		}
	}
	return uri;
}
| 175 | |
/* Tell whether character c has to be escaped by escape_str().
 * Returns 0 for alphanumeric characters and for the marks
 * # - _ . ! ~ * ' ( ) and 1 for everything else. */
static int
needEscape(char c)
{
	switch (c) {
	case '#':
	case '-':
	case '_':
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 0;
	default:
		return !isalnum((unsigned char) c);
	}
}
| 184 | |
| 185 | /* COMMAND "escape": this function applies the URI escaping rules defined in |
| 186 | * section 2 of [RFC 3986] to the string supplied as 's'. |
| 187 | * The effect of the function is to escape a set of identified characters in |
| 188 | * the string. Each such character is replaced in the string by an escape |
| 189 | * sequence, which is formed by encoding the character as a sequence of octets |
| 190 | * in UTF-8, and then reprensenting each of these octets in the form %HH. |
| 191 | * |
| 192 | * All characters are escaped other than: |
| 193 | * [a-z], [A-Z], [0-9], "#", "-", "_", ".", "!", "~", "*", "'", "(", ")" |
| 194 | * |
| 195 | * This function must always generate hexadecimal values using the upper-case |
| 196 | * letters A-F. |
| 197 | * |
| 198 | * SIGNATURE: escape(str) : str; */ |
| 199 | str |
| 200 | escape_str(str *retval, str s) |
| 201 | { |
| 202 | int x, y; |
| 203 | str res; |
| 204 | |
| 205 | if (!s) |
| 206 | throw(ILLARG, "url.escape" , "url missing" ); |
| 207 | |
| 208 | if (!( res = (str) GDKmalloc( strlen(s) * 3 ) )) |
| 209 | throw(MAL, "url.escape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 210 | for (x = 0, y = 0; s[x]; ++x, ++y) { |
| 211 | if (needEscape(s[x])) { |
| 212 | if (s[x] == ' ') { |
| 213 | res[y] = '+'; |
| 214 | } else { |
| 215 | sprintf(res+y, "%%%2x" , (uint8_t) s[x]); |
| 216 | y += 2; |
| 217 | } |
| 218 | } else { |
| 219 | res[y] = s[x]; |
| 220 | } |
| 221 | } |
| 222 | res[y] = '\0'; |
| 223 | |
| 224 | if ((*retval = GDKrealloc(res, strlen(res)+1)) == NULL) { |
| 225 | GDKfree(res); |
| 226 | throw(MAL, "url.escape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 227 | } |
| 228 | return MAL_SUCCEED; |
| 229 | } |
| 230 | |
| 231 | /* COMMAND "unescape": Convert hexadecimal representations to ASCII characters. |
| 232 | * All sequences of the form "% HEX HEX" are unescaped. |
| 233 | * SIGNATURE: unescape(str) : str; */ |
| 234 | str |
| 235 | unescape_str(str *retval, str s) |
| 236 | { |
| 237 | int x, y; |
| 238 | str res; |
| 239 | |
| 240 | if (!s) |
| 241 | throw(ILLARG, "url.escape" , "url missing" ); |
| 242 | |
| 243 | res = (str) GDKmalloc(strlen(s)); |
| 244 | if (!res) |
| 245 | throw(MAL, "url.unescape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 246 | |
| 247 | for (x = 0, y = 0; s[x]; ++x, ++y) { |
| 248 | if (s[x] == '%') { |
| 249 | res[y] = x2c(&s[x + 1]); |
| 250 | x += 2; |
| 251 | } else { |
| 252 | res[y] = s[x]; |
| 253 | } |
| 254 | } |
| 255 | res[y] = '\0'; |
| 256 | |
| 257 | if ((*retval = GDKrealloc(res, strlen(res)+1)) == NULL) { |
| 258 | GDKfree(res); |
| 259 | throw(MAL, "url.unescape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 260 | } |
| 261 | return MAL_SUCCEED; |
| 262 | } |
| 263 | |
| 264 | /* |
| 265 | * Utilities |
| 266 | */ |
| 267 | |
/* Convert the two hexadecimal digits at what[0] and what[1] into the
 * character they encode.  Characters from 'A' upwards are treated as
 * letters (case is folded by clearing bit 0x20), all others as
 * decimal digits; the input is not validated. */
static char
x2c(char *what)
{
	int nibble[2];
	int i;

	for (i = 0; i < 2; i++) {
		if (what[i] >= 'A')
			nibble[i] = ((what[i] & 0xdf) - 'A') + 10;
		else
			nibble[i] = what[i] - '0';
	}
	return (char) (nibble[0] * 16 + nibble[1]);
}
| 278 | |
| 279 | /* |
| 280 | * Wrapping |
| 281 | * Here you find the wrappers around the V4 url library included above. |
| 282 | */ |
| 283 | |
| 284 | ssize_t |
| 285 | URLfromString(const char *src, size_t *len, str *u, bool external) |
| 286 | { |
| 287 | size_t l = strlen(src) + 1; |
| 288 | |
| 289 | if (*len < l || *u == NULL) { |
| 290 | GDKfree(*u); |
| 291 | *u = GDKmalloc(l); |
| 292 | if (*u == NULL) |
| 293 | return -1; |
| 294 | *len = l; |
| 295 | } |
| 296 | |
| 297 | /* actually parse the message for valid url */ |
| 298 | |
| 299 | if (external && strcmp(src, "nil" ) == 0) |
| 300 | strcpy(*u, str_nil); |
| 301 | else |
| 302 | memcpy(*u, src, l); |
| 303 | return (ssize_t) l - 1; |
| 304 | } |
| 305 | |
| 306 | ssize_t |
| 307 | URLtoString(str *s, size_t *len, const char *src, bool external) |
| 308 | { |
| 309 | size_t l = strlen(src); |
| 310 | |
| 311 | if (external) |
| 312 | l += 2; |
| 313 | if (l >= *len || *s == NULL) { |
| 314 | GDKfree(*s); |
| 315 | *s = GDKmalloc(l + 1); |
| 316 | if (*s == NULL) |
| 317 | return -1; |
| 318 | *len = l + 1; |
| 319 | } |
| 320 | |
| 321 | if (external) { |
| 322 | if (GDK_STRNIL(src)) { |
| 323 | strcpy(*s, "nil" ); |
| 324 | return 3; |
| 325 | } |
| 326 | snprintf(*s, l + 1, "\"%s\"" , src); |
| 327 | } else { |
| 328 | strcpy(*s, src); |
| 329 | } |
| 330 | return (ssize_t) l; |
| 331 | } |
| 332 | |
| 333 | /* COMMAND "getAnchor": Extract an anchor (reference) from the URL |
| 334 | * SIGNATURE: getAnchor(url) : str; */ |
| 335 | str |
| 336 | URLgetAnchor(str *retval, url *val) |
| 337 | { |
| 338 | const char *s; |
| 339 | |
| 340 | if (val == NULL || *val == NULL) |
| 341 | throw(ILLARG, "url.getAnchor" , "url missing" ); |
| 342 | if ((s = skip_scheme(*val)) == NULL || |
| 343 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
| 344 | (s = skip_path(s, NULL, NULL)) == NULL || |
| 345 | (s = skip_search(s)) == NULL) |
| 346 | throw(ILLARG, "url.getAnchor" , "bad url" ); |
| 347 | if (*s == '#') |
| 348 | s++; |
| 349 | else |
| 350 | s = str_nil; |
| 351 | if ((*retval = GDKstrdup(s)) == NULL) |
| 352 | throw(MAL, "url.getAnchor" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 353 | return MAL_SUCCEED; |
| 354 | } |
| 355 | |
| 356 | /* COMMAND "getBasename": Extract the base of the last file name of the URL, |
| 357 | * thus, excluding the file extension. |
| 358 | * SIGNATURE: getBasename(str) : str; */ |
| 359 | str |
| 360 | URLgetBasename(str *retval, url *val) |
| 361 | { |
| 362 | const char *s; |
| 363 | const char *b = NULL; |
| 364 | const char *e = NULL; |
| 365 | |
| 366 | if (val == NULL || *val == NULL) |
| 367 | throw(ILLARG, "url.getBasename" , "url missing" ); |
| 368 | if ((s = skip_scheme(*val)) == NULL || |
| 369 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
| 370 | (s = skip_path(s, &b, &e)) == NULL) |
| 371 | throw(ILLARG, "url.getBasename" , "bad url" ); |
| 372 | if (b == NULL) { |
| 373 | *retval = GDKstrdup(str_nil); |
| 374 | } else { |
| 375 | size_t l; |
| 376 | |
| 377 | if (e != NULL) { |
| 378 | l = e - b; |
| 379 | } else { |
| 380 | l = s - b; |
| 381 | } |
| 382 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
| 383 | strcpy_len(*retval, b, l + 1); |
| 384 | } |
| 385 | } |
| 386 | if (*retval == NULL) |
| 387 | throw(MAL, "url.getBasename" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 388 | return MAL_SUCCEED; |
| 389 | } |
| 390 | |
| 391 | /* COMMAND "getContent": Retrieve the file referenced |
| 392 | * SIGNATURE: getContent(str) : str; */ |
| 393 | str |
| 394 | URLgetContent(str *retval, url *Str1) |
| 395 | { |
| 396 | (void) retval; |
| 397 | (void) Str1; |
| 398 | |
| 399 | throw(MAL, "url.getContent" , SQLSTATE(0A000) "Feature not supported" ); |
| 400 | } |
| 401 | |
| 402 | /* COMMAND "getContext": Extract the path context from the URL |
| 403 | * SIGNATURE: getContext(str) : str; */ |
| 404 | str |
| 405 | URLgetContext(str *retval, url *val) |
| 406 | { |
| 407 | const char *s; |
| 408 | const char *p; |
| 409 | |
| 410 | if (val == NULL || *val == NULL) |
| 411 | throw(ILLARG, "url.getContext" , "url missing" ); |
| 412 | if ((s = skip_scheme(*val)) == NULL || |
| 413 | (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
| 414 | (s = skip_path(p, NULL, NULL)) == NULL) |
| 415 | throw(ILLARG, "url.getContext" , "bad url" ); |
| 416 | if (p == s) { |
| 417 | *retval = GDKstrdup(str_nil); |
| 418 | } else if ((*retval = GDKmalloc(s - p + 1)) != NULL) { |
| 419 | strcpy_len(*retval, p, s - p + 1); |
| 420 | } |
| 421 | if (*retval == NULL) |
| 422 | throw(MAL, "url.getContext" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 423 | return MAL_SUCCEED; |
| 424 | } |
| 425 | |
| 426 | /* COMMAND "getExtension": Extract the file extension of the URL |
| 427 | * SIGNATURE: getExtension(str) : str; */ |
| 428 | str |
| 429 | URLgetExtension(str *retval, url *val) |
| 430 | { |
| 431 | const char *s; |
| 432 | const char *e = NULL; |
| 433 | |
| 434 | if (val == NULL || *val == NULL) |
| 435 | throw(ILLARG, "url.getExtension" , "url missing" ); |
| 436 | if ((s = skip_scheme(*val)) == NULL || |
| 437 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
| 438 | (s = skip_path(s, NULL, &e)) == NULL) |
| 439 | throw(ILLARG, "url.getExtension" , "bad url" ); |
| 440 | if (e == NULL) { |
| 441 | *retval = GDKstrdup(str_nil); |
| 442 | } else { |
| 443 | size_t l = s - e; |
| 444 | |
| 445 | assert(*e == '.'); |
| 446 | if ((*retval = GDKmalloc(l)) != NULL) { |
| 447 | strcpy_len(*retval, e + 1, l); |
| 448 | } |
| 449 | } |
| 450 | if (*retval == NULL) |
| 451 | throw(MAL, "url.getExtension" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 452 | return MAL_SUCCEED; |
| 453 | } |
| 454 | |
| 455 | /* COMMAND "getFile": Extract the last file name of the URL |
| 456 | * SIGNATURE: getFile(str) : str; */ |
| 457 | str |
| 458 | URLgetFile(str *retval, url *val) |
| 459 | { |
| 460 | const char *s; |
| 461 | const char *b = NULL; |
| 462 | |
| 463 | if (val == NULL || *val == NULL) |
| 464 | throw(ILLARG, "url.getFile" , "url missing" ); |
| 465 | if ((s = skip_scheme(*val)) == NULL || |
| 466 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
| 467 | (s = skip_path(s, &b, NULL)) == NULL) |
| 468 | throw(ILLARG, "url.getFile" , "bad url" ); |
| 469 | if (b == NULL) { |
| 470 | *retval = GDKstrdup(str_nil); |
| 471 | } else { |
| 472 | size_t l; |
| 473 | |
| 474 | l = s - b; |
| 475 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
| 476 | strcpy_len(*retval, b, l + 1); |
| 477 | } |
| 478 | } |
| 479 | if (*retval == NULL) |
| 480 | throw(MAL, "url.getFile" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 481 | return MAL_SUCCEED; |
| 482 | } |
| 483 | |
| 484 | /* COMMAND "getHost": Extract the server identity from the URL */ |
| 485 | /* SIGNATURE: getHost(str) : str; */ |
| 486 | str |
| 487 | URLgetHost(str *retval, url *val) |
| 488 | { |
| 489 | const char *s; |
| 490 | const char *h = NULL; |
| 491 | const char *p = NULL; |
| 492 | |
| 493 | if (val == NULL || *val == NULL) |
| 494 | throw(ILLARG, "url.getHost" , "url missing" ); |
| 495 | if ((s = skip_scheme(*val)) == NULL || |
| 496 | (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL) |
| 497 | throw(ILLARG, "url.getHost" , "bad url" ); |
| 498 | if (h == NULL) { |
| 499 | *retval = GDKstrdup(str_nil); |
| 500 | } else { |
| 501 | size_t l; |
| 502 | |
| 503 | if (p != NULL) { |
| 504 | l = p - h - 1; |
| 505 | } else { |
| 506 | l = s - h; |
| 507 | } |
| 508 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
| 509 | strcpy_len(*retval, h, l + 1); |
| 510 | } |
| 511 | } |
| 512 | if (*retval == NULL) |
| 513 | throw(MAL, "url.getHost" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 514 | return MAL_SUCCEED; |
| 515 | } |
| 516 | |
| 517 | /* COMMAND "getDomain": Extract the Internet domain from the URL |
| 518 | * SIGNATURE: getDomain(str) : str; */ |
| 519 | str |
| 520 | URLgetDomain(str *retval, url *val) |
| 521 | { |
| 522 | const char *s; |
| 523 | const char *h = NULL; |
| 524 | const char *p = NULL; |
| 525 | |
| 526 | if (val == NULL || *val == NULL) |
| 527 | throw(ILLARG, "url.getDomain" , "url missing" ); |
| 528 | if ((s = skip_scheme(*val)) == NULL || |
| 529 | (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL) |
| 530 | throw(ILLARG, "url.getDomain" , "bad url" ); |
| 531 | if (h == NULL) { |
| 532 | *retval = GDKstrdup(str_nil); |
| 533 | } else { |
| 534 | size_t l; |
| 535 | |
| 536 | if (p != NULL) |
| 537 | p--; |
| 538 | else |
| 539 | p = s; |
| 540 | l = 0; |
| 541 | while (p > h && p[-1] != '.') { |
| 542 | p--; |
| 543 | l++; |
| 544 | } |
| 545 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
| 546 | strcpy_len(*retval, p, l + 1); |
| 547 | } |
| 548 | } |
| 549 | if (*retval == NULL) |
| 550 | throw(MAL, "url.getDomain" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 551 | return MAL_SUCCEED; |
| 552 | } |
| 553 | |
| 554 | /* COMMAND "getPort": Extract the port id from the URL |
| 555 | * SIGNATURE: getPort(str) : str; */ |
| 556 | str |
| 557 | URLgetPort(str *retval, url *val) |
| 558 | { |
| 559 | const char *s; |
| 560 | const char *p = NULL; |
| 561 | |
| 562 | if (val == NULL || *val == NULL) |
| 563 | throw(ILLARG, "url.getPort" , "url missing" ); |
| 564 | if ((s = skip_scheme(*val)) == NULL || |
| 565 | (s = skip_authority(s, NULL, NULL, NULL, &p)) == NULL) |
| 566 | throw(ILLARG, "url.getPort" , "bad url" ); |
| 567 | if (p == NULL) { |
| 568 | *retval = GDKstrdup(str_nil); |
| 569 | } else { |
| 570 | size_t l = s - p; |
| 571 | |
| 572 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
| 573 | strcpy_len(*retval, p, l + 1); |
| 574 | } |
| 575 | } |
| 576 | if (*retval == NULL) |
| 577 | throw(MAL, "url.getPort" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 578 | return MAL_SUCCEED; |
| 579 | } |
| 580 | |
| 581 | /* COMMAND "getProtocol": Extract the protocol from the URL |
| 582 | * SIGNATURE: getProtocol(str) : str; */ |
| 583 | str |
| 584 | URLgetProtocol(str *retval, url *val) |
| 585 | { |
| 586 | const char *s; |
| 587 | size_t l; |
| 588 | |
| 589 | if (val == NULL || *val == NULL) |
| 590 | throw(ILLARG, "url.getProtocol" , "url missing" ); |
| 591 | if ((s = skip_scheme(*val)) == NULL) |
| 592 | throw(ILLARG, "url.getProtocol" , "bad url" ); |
| 593 | l = s - *val; |
| 594 | if ((*retval = GDKmalloc(l)) == NULL) |
| 595 | throw(MAL, "url.getProtocol" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 596 | strcpy_len(*retval, *val, l); |
| 597 | return MAL_SUCCEED; |
| 598 | } |
| 599 | |
| 600 | /* COMMAND "getQuery": Extract the query part from the URL |
| 601 | * SIGNATURE: getQuery(str) : str; */ |
| 602 | str |
| 603 | URLgetQuery(str *retval, url *val) |
| 604 | { |
| 605 | const char *s; |
| 606 | const char *q; |
| 607 | |
| 608 | if (val == NULL || *val == NULL) |
| 609 | throw(ILLARG, "url.getQuery" , "url missing" ); |
| 610 | if ((s = skip_scheme(*val)) == NULL || |
| 611 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
| 612 | (q = skip_path(s, NULL, NULL)) == NULL || |
| 613 | (s = skip_search(q)) == NULL) |
| 614 | throw(ILLARG, "url.getQuery" , "bad url" ); |
| 615 | if (*q == '?') { |
| 616 | size_t l; |
| 617 | |
| 618 | q++; |
| 619 | l = s - q; |
| 620 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
| 621 | strcpy_len(*retval, q, l + 1); |
| 622 | } |
| 623 | } else { |
| 624 | *retval = GDKstrdup(str_nil); |
| 625 | } |
| 626 | if (*retval == NULL) |
| 627 | throw(MAL, "url.getQuery" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 628 | return MAL_SUCCEED; |
| 629 | } |
| 630 | |
| 631 | /* COMMAND "getRobotURL": Extract the location of the robot control file |
| 632 | * SIGNATURE: getRobotURL(str) : str; */ |
| 633 | str |
| 634 | URLgetRobotURL(str *retval, url *val) |
| 635 | { |
| 636 | const char *s; |
| 637 | size_t l; |
| 638 | |
| 639 | if (val == NULL || *val == NULL) |
| 640 | throw(ILLARG, "url.getQuery" , "url missing" ); |
| 641 | if ((s = skip_scheme(*val)) == NULL || |
| 642 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL) |
| 643 | throw(ILLARG, "url.getQuery" , "bad url" ); |
| 644 | l = s - *val; |
| 645 | if ((*retval = GDKmalloc(l + sizeof("/robots.txt" ))) == NULL) |
| 646 | throw(MAL, "url.getQuery" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 647 | sprintf(*retval, "%.*s/robots.txt" , (int) l, *val); |
| 648 | return MAL_SUCCEED; |
| 649 | } |
| 650 | |
| 651 | |
| 652 | /* COMMAND "getUser": Extract the user identity from the URL |
| 653 | * SIGNATURE: getUser(str) : str; */ |
| 654 | str |
| 655 | URLgetUser(str *retval, url *val) |
| 656 | { |
| 657 | const char *s; |
| 658 | const char *p; |
| 659 | const char *u; |
| 660 | |
| 661 | if (val == NULL || *val == NULL) |
| 662 | throw(ILLARG, "url.getUser" , "url missing" ); |
| 663 | if ((s = skip_scheme(*val)) == NULL || |
| 664 | (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
| 665 | (s = skip_path(p, NULL, NULL)) == NULL) |
| 666 | throw(ILLARG, "url.getUser" , "bad url" ); |
| 667 | if (p == s || *p != '/' || p[1] != '~') { |
| 668 | *retval = GDKstrdup(str_nil); |
| 669 | } else { |
| 670 | size_t l; |
| 671 | |
| 672 | u = p + 2; |
| 673 | for (p = u; p < s && *p != '/'; p++) |
| 674 | ; |
| 675 | l = p - u; |
| 676 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
| 677 | strcpy_len(*retval, u, l + 1); |
| 678 | } |
| 679 | } |
| 680 | if (*retval == NULL) |
| 681 | throw(MAL, "url.getUser" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 682 | return MAL_SUCCEED; |
| 683 | } |
| 684 | |
| 685 | /* COMMAND "isaURL": Check conformity of the URL syntax |
| 686 | * SIGNATURE: isaURL(str) : bit; */ |
| 687 | str |
| 688 | URLisaURL(bit *retval, url *val) |
| 689 | { |
| 690 | if (val == NULL || *val == NULL) |
| 691 | throw(ILLARG, "url.isaURL" , "url missing" ); |
| 692 | *retval = skip_scheme(*val) != NULL; |
| 693 | return MAL_SUCCEED; |
| 694 | } |
| 695 | |
| 696 | str |
| 697 | URLnew(url *u, str *val) |
| 698 | { |
| 699 | *u = GDKstrdup(*val); |
| 700 | if (*u == NULL) |
| 701 | throw(MAL, "url.new" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 702 | return MAL_SUCCEED; |
| 703 | } |
| 704 | |
| 705 | str |
| 706 | URLnew3(url *u, str *protocol, str *server, str *file) |
| 707 | { |
| 708 | size_t l; |
| 709 | |
| 710 | l = GDK_STRLEN(*file) + GDK_STRLEN(*server) + GDK_STRLEN(*protocol) + 10; |
| 711 | *u = GDKmalloc(l); |
| 712 | if (*u == NULL) |
| 713 | throw(MAL, "url.newurl" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 714 | snprintf(*u, l, "%s://%s/%s" , *protocol, *server, *file); |
| 715 | return MAL_SUCCEED; |
| 716 | } |
| 717 | |
| 718 | str |
| 719 | URLnew4(url *u, str *protocol, str *server, int *port, str *file) |
| 720 | { |
| 721 | str Protocol = *protocol; |
| 722 | str Server = *server; |
| 723 | str File = *file; |
| 724 | size_t l; |
| 725 | |
| 726 | if (GDK_STRNIL(File)) |
| 727 | File = "" ; |
| 728 | else if (*File == '/') |
| 729 | File++; |
| 730 | if (GDK_STRNIL(Server)) |
| 731 | Server = "" ; |
| 732 | if (GDK_STRNIL(Protocol)) |
| 733 | Protocol = "" ; |
| 734 | l = strlen(File) + strlen(Server) + strlen(Protocol) + 20; |
| 735 | *u = GDKmalloc(l); |
| 736 | if (*u == NULL) |
| 737 | throw(MAL, "url.newurl" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 738 | snprintf(*u, l, "%s://%s:%d/%s" , Protocol, Server, *port, File); |
| 739 | return MAL_SUCCEED; |
| 740 | } |
| 741 | |
| 742 | str URLnoop(url *u, url *val) |
| 743 | { |
| 744 | *u = GDKstrdup(*val); |
| 745 | if (*u == NULL) |
| 746 | throw(MAL, "url.noop" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
| 747 | return MAL_SUCCEED; |
| 748 | } |
| 749 | |