1 | /* |
2 | * This Source Code Form is subject to the terms of the Mozilla Public |
3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5 | * |
6 | * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. |
7 | */ |
8 | |
9 | /* |
10 | * M. Kersten |
11 | * Y. Zhang |
12 | * The URL module |
13 | * The URL module contains a collection of commands to manipulate |
14 | * Uniform Resource Locators - a resource on the World Wide Web- |
15 | * represented as a string in Monet. The URL can represent |
16 | * anything from a file, a directory or a complete movie. |
17 | * This module is geared towards manipulation of their name only. |
18 | * A complementary module can be used to gain access.[IOgate] |
19 | * |
20 | * The URL syntax is specified in RFC2396, Uniform Resource Identifiers |
21 | * (URI): Generic Syntax. The URL syntax is dependent upon the scheme. |
22 | * In general, a URL has the form <scheme>:<scheme-specific-part>. |
 * Thus, accepting a valid URL is a simple process, unless the scheme
 * is known and scheme-specific syntax is checked (e.g., http or ftp
25 | * scheme). For the URL module implemented here, we assume some common |
26 | * fields of the <scheme-specific-part> that are shared among different |
27 | * schemes. |
28 | * |
29 | * The core of the extension involves several operators to extract |
30 | * portions of the URLs for further manipulation. In particular, |
31 | * the domain, the server, and the protocol, and the file extension |
32 | * can be extracted without copying the complete URL from the heap |
33 | * into a string variable first. |
34 | * |
35 | * The commands provided are based on the corresponding Java class. |
36 | * |
37 | * A future version should use a special atom, because this may save |
38 | * considerable space. Alternatively, break the URL strings into |
39 | * components and represent them with a bunch of BATs. An intermediate |
40 | * step would be to refine the atom STR, then it would be possible to |
41 | * redefine hashing. |
42 | */ |
43 | |
44 | #include "monetdb_config.h" |
45 | #include "mal.h" |
46 | #include "url.h" |
47 | #include "mal_exception.h" |
48 | |
49 | static char x2c(char *what); |
50 | |
51 | /* SCHEME "://" AUTHORITY [ PATH ] [ "?" SEARCH ] [ "#" FRAGMENT ] |
52 | * AUTHORITY is: [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ] */ |
53 | |
/* Skip over the scheme of a URI.
 *
 * A scheme is accepted when it starts with an ASCII letter, continues
 * with any number of ASCII letters, digits, '+', '-' or '.', and is
 * terminated by a ':'.
 *
 * uri: pointer to the start of the URI
 *
 * Returns a pointer to the character just past the ':', or NULL when
 * the string does not start with such a scheme. */
static const char *
skip_scheme(const char *uri)
{
	const char *p = uri;

	/* the very first character has to be a letter */
	if (!(('a' <= *p && *p <= 'z') || ('A' <= *p && *p <= 'Z')))
		return NULL;

	/* consume the rest of the scheme name */
	for (p++;; p++) {
		char c = *p;

		if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
			continue;
		if (isdigit((unsigned char) c))
			continue;
		if (c == '+' || c == '-' || c == '.')
			continue;
		break;
	}

	/* only a ':' may end the scheme */
	return *p == ':' ? p + 1 : NULL;
}
71 | |
/* Character classification helpers used by the skip_*() parsers.
 * isreserved() and isunreserved() evaluate their argument several
 * times, so do not pass an expression with side effects. */

/* true if c is a hexadecimal digit */
#define ishex(c)	isxdigit((unsigned char) (c))

/* true if c is one of  ; / ? : @ & = + $ ,  */
#define isreserved(c)						\
	((c) == ';' || (c) == '/' || (c) == '?' || (c) == ':' ||	\
	 (c) == '@' || (c) == '&' || (c) == '=' || (c) == '+' ||	\
	 (c) == '$' || (c) == ',')

/* true if c is an ASCII letter, a digit, or one of  - _ . ! ~ * ' ( )  */
#define isunreserved(c)						\
	(('a' <= (c) && (c) <= 'z') ||				\
	 ('A' <= (c) && (c) <= 'Z') ||				\
	 isdigit((unsigned char) (c)) ||				\
	 (c) == '-' || (c) == '_' || (c) == '.' || (c) == '!' ||	\
	 (c) == '~' || (c) == '*' || (c) == '\'' ||			\
	 (c) == '(' || (c) == ')')
83 | |
/* Skip over the authority part of a URI, i.e.
 *   "//" [ USER [ ":" PASSWORD ] "@" ] HOST [ ":" PORT ]
 *
 * uri:   result of skip_scheme() (pointer just past the scheme's ':')
 * userp: if not NULL, receives the start of USER, or NULL when the
 *        authority contains no "@"
 * passp: if not NULL, receives the start of PASSWORD (just past the
 *        first ":" of the user info), or NULL when there is no "@" or
 *        no ":" before it
 * hostp: if not NULL, receives the start of HOST
 * portp: if not NULL, receives the start of PORT (just past the ":"
 *        that follows the host), or NULL when no port was given
 *
 * All returned pointers point into the input string; nothing is copied.
 *
 * Returns a pointer to the first character after the authority, or
 * NULL (leaving the output arguments untouched) when uri does not
 * start with "//". */
static const char *
skip_authority(const char *uri, const char **userp, const char **passp, const char **hostp, const char **portp)
{
	const char *user, *pass = NULL, *host, *port = NULL;

	if (uri[0] != '/' || uri[1] != '/')
		return NULL;

	uri += 2;
	/* as long as user == host no "@" has been seen */
	user = host = uri;
	while (isunreserved(*uri) ||
	       (*uri == '%' && ishex(uri[1]) && ishex(uri[2])) ||
	       *uri == ';' || *uri == ':' || *uri == '=' || *uri == '+' ||
	       *uri == '$' || *uri == ',' || *uri == '@') {
		if (*uri == ':') {
			/* before an "@" we cannot yet tell whether this
			 * ":" introduces a password or a port, so
			 * remember it as both; the password starts at
			 * the first ":" only */
			if (user == host && pass == NULL)
				pass = uri + 1;
			port = uri + 1;
		} else if (*uri == '@') {
			host = uri + 1;
			/* every ":" seen so far was part of the user
			 * info, not a port separator; without this
			 * reset "user:pass@host" would report the
			 * password as the port */
			port = NULL;
		}
		uri += *uri == '%' ? 3 : 1;
	}

	if (user == host) {
		/* no "@", so no user info */
		user = NULL;
		pass = NULL;
	}
	if (userp)
		*userp = user;
	if (passp)
		*passp = pass;
	if (hostp)
		*hostp = host;
	if (portp)
		*portp = port;
	return uri;
}
128 | |
/* Skip over the path part of a URI.
 *
 * uri:   result of skip_authority()
 * basep: if not NULL, receives the start of the last path component
 *        (the text after the final '/'), or NULL when uri does not
 *        start with '/'
 * extp:  if not NULL, receives a pointer to the first '.' in the last
 *        component that is not the component's first character, or
 *        NULL when there is none
 *
 * Returns a pointer to the first character after the path; this is
 * uri itself when there is no path. */
static const char *
skip_path(const char *uri, const char **basep, const char **extp)
{
	const char *last = NULL;	/* start of the final component */
	const char *dot = NULL;		/* extension within that component */

	if (*uri == '/') {
		last = ++uri;
		for (;;) {
			char c = *uri;

			if (c == '/') {
				/* a new component starts: forget the
				 * extension of the previous one */
				last = uri + 1;
				dot = NULL;
				uri++;
			} else if (c == '%') {
				/* only complete %HH escapes are accepted */
				if (!(ishex(uri[1]) && ishex(uri[2])))
					break;
				uri += 3;
			} else if (isunreserved(c) ||
				   c == ':' || c == '@' || c == '&' ||
				   c == '=' || c == '+' || c == '$' ||
				   c == ',' || c == ';') {
				if (c == '.' && dot == NULL && uri != last)
					dot = uri;
				uri++;
			} else {
				break;
			}
		}
	}

	if (basep)
		*basep = last;
	if (extp)
		*extp = dot;
	return uri;
}
160 | |
/* Skip over the search (query) part of a URI.
 *
 * uri: result of skip_path()
 *
 * When uri starts with '?', returns a pointer to the first character
 * after the '?' that is neither reserved, unreserved, nor the start of
 * a complete %HH escape.  Otherwise returns uri unchanged. */
static const char *
skip_search(const char *uri)
{
	if (*uri != '?')
		return uri;

	uri++;
	for (;;) {
		if (*uri == '%') {
			/* only complete %HH escapes are accepted */
			if (!ishex(uri[1]) || !ishex(uri[2]))
				break;
			uri += 3;
		} else if (isreserved(*uri) || isunreserved(*uri)) {
			uri++;
		} else {
			break;
		}
	}
	return uri;
}
175 | |
/* Tell whether escape_str() has to escape character c.
 *
 * Returns 0 for alphanumeric characters (as judged by isalnum()) and
 * for  # - _ . ! ~ * ' ( )  and 1 for everything else. */
static int
needEscape(char c)
{
	switch (c) {
	case '#':
	case '-':
	case '_':
	case '.':
	case '!':
	case '~':
	case '*':
	case '\'':
	case '(':
	case ')':
		return 0;
	default:
		return !isalnum((unsigned char) c);
	}
}
184 | |
185 | /* COMMAND "escape": this function applies the URI escaping rules defined in |
186 | * section 2 of [RFC 3986] to the string supplied as 's'. |
187 | * The effect of the function is to escape a set of identified characters in |
188 | * the string. Each such character is replaced in the string by an escape |
189 | * sequence, which is formed by encoding the character as a sequence of octets |
190 | * in UTF-8, and then reprensenting each of these octets in the form %HH. |
191 | * |
192 | * All characters are escaped other than: |
193 | * [a-z], [A-Z], [0-9], "#", "-", "_", ".", "!", "~", "*", "'", "(", ")" |
194 | * |
195 | * This function must always generate hexadecimal values using the upper-case |
196 | * letters A-F. |
197 | * |
198 | * SIGNATURE: escape(str) : str; */ |
199 | str |
200 | escape_str(str *retval, str s) |
201 | { |
202 | int x, y; |
203 | str res; |
204 | |
205 | if (!s) |
206 | throw(ILLARG, "url.escape" , "url missing" ); |
207 | |
208 | if (!( res = (str) GDKmalloc( strlen(s) * 3 ) )) |
209 | throw(MAL, "url.escape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
210 | for (x = 0, y = 0; s[x]; ++x, ++y) { |
211 | if (needEscape(s[x])) { |
212 | if (s[x] == ' ') { |
213 | res[y] = '+'; |
214 | } else { |
215 | sprintf(res+y, "%%%2x" , (uint8_t) s[x]); |
216 | y += 2; |
217 | } |
218 | } else { |
219 | res[y] = s[x]; |
220 | } |
221 | } |
222 | res[y] = '\0'; |
223 | |
224 | if ((*retval = GDKrealloc(res, strlen(res)+1)) == NULL) { |
225 | GDKfree(res); |
226 | throw(MAL, "url.escape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
227 | } |
228 | return MAL_SUCCEED; |
229 | } |
230 | |
231 | /* COMMAND "unescape": Convert hexadecimal representations to ASCII characters. |
232 | * All sequences of the form "% HEX HEX" are unescaped. |
233 | * SIGNATURE: unescape(str) : str; */ |
234 | str |
235 | unescape_str(str *retval, str s) |
236 | { |
237 | int x, y; |
238 | str res; |
239 | |
240 | if (!s) |
241 | throw(ILLARG, "url.escape" , "url missing" ); |
242 | |
243 | res = (str) GDKmalloc(strlen(s)); |
244 | if (!res) |
245 | throw(MAL, "url.unescape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
246 | |
247 | for (x = 0, y = 0; s[x]; ++x, ++y) { |
248 | if (s[x] == '%') { |
249 | res[y] = x2c(&s[x + 1]); |
250 | x += 2; |
251 | } else { |
252 | res[y] = s[x]; |
253 | } |
254 | } |
255 | res[y] = '\0'; |
256 | |
257 | if ((*retval = GDKrealloc(res, strlen(res)+1)) == NULL) { |
258 | GDKfree(res); |
259 | throw(MAL, "url.unescape" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
260 | } |
261 | return MAL_SUCCEED; |
262 | } |
263 | |
264 | /* |
265 | * Utilities |
266 | */ |
267 | |
/* Convert two hexadecimal digits to the byte they represent.
 *
 * what: pointer to (at least) two characters; the first is the high
 *       nibble, the second the low nibble.  Upper- and lower-case
 *       letters are both understood.
 *
 * The characters are not validated: callers are expected to pass
 * hexadecimal digits only. */
static char
x2c(char *what)
{
	char value = 0;
	int i;

	for (i = 0; i < 2; i++) {
		char c = what[i];
		int nibble;

		if (c >= 'A')
			nibble = ((c & 0xdf) - 'A') + 10;	/* 0xdf folds to upper case */
		else
			nibble = c - '0';
		value = (char) (value * 16 + nibble);
	}
	return value;
}
278 | |
279 | /* |
280 | * Wrapping |
281 | * Here you find the wrappers around the V4 url library included above. |
282 | */ |
283 | |
284 | ssize_t |
285 | URLfromString(const char *src, size_t *len, str *u, bool external) |
286 | { |
287 | size_t l = strlen(src) + 1; |
288 | |
289 | if (*len < l || *u == NULL) { |
290 | GDKfree(*u); |
291 | *u = GDKmalloc(l); |
292 | if (*u == NULL) |
293 | return -1; |
294 | *len = l; |
295 | } |
296 | |
297 | /* actually parse the message for valid url */ |
298 | |
299 | if (external && strcmp(src, "nil" ) == 0) |
300 | strcpy(*u, str_nil); |
301 | else |
302 | memcpy(*u, src, l); |
303 | return (ssize_t) l - 1; |
304 | } |
305 | |
306 | ssize_t |
307 | URLtoString(str *s, size_t *len, const char *src, bool external) |
308 | { |
309 | size_t l = strlen(src); |
310 | |
311 | if (external) |
312 | l += 2; |
313 | if (l >= *len || *s == NULL) { |
314 | GDKfree(*s); |
315 | *s = GDKmalloc(l + 1); |
316 | if (*s == NULL) |
317 | return -1; |
318 | *len = l + 1; |
319 | } |
320 | |
321 | if (external) { |
322 | if (GDK_STRNIL(src)) { |
323 | strcpy(*s, "nil" ); |
324 | return 3; |
325 | } |
326 | snprintf(*s, l + 1, "\"%s\"" , src); |
327 | } else { |
328 | strcpy(*s, src); |
329 | } |
330 | return (ssize_t) l; |
331 | } |
332 | |
333 | /* COMMAND "getAnchor": Extract an anchor (reference) from the URL |
334 | * SIGNATURE: getAnchor(url) : str; */ |
335 | str |
336 | URLgetAnchor(str *retval, url *val) |
337 | { |
338 | const char *s; |
339 | |
340 | if (val == NULL || *val == NULL) |
341 | throw(ILLARG, "url.getAnchor" , "url missing" ); |
342 | if ((s = skip_scheme(*val)) == NULL || |
343 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
344 | (s = skip_path(s, NULL, NULL)) == NULL || |
345 | (s = skip_search(s)) == NULL) |
346 | throw(ILLARG, "url.getAnchor" , "bad url" ); |
347 | if (*s == '#') |
348 | s++; |
349 | else |
350 | s = str_nil; |
351 | if ((*retval = GDKstrdup(s)) == NULL) |
352 | throw(MAL, "url.getAnchor" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
353 | return MAL_SUCCEED; |
354 | } |
355 | |
356 | /* COMMAND "getBasename": Extract the base of the last file name of the URL, |
357 | * thus, excluding the file extension. |
358 | * SIGNATURE: getBasename(str) : str; */ |
359 | str |
360 | URLgetBasename(str *retval, url *val) |
361 | { |
362 | const char *s; |
363 | const char *b = NULL; |
364 | const char *e = NULL; |
365 | |
366 | if (val == NULL || *val == NULL) |
367 | throw(ILLARG, "url.getBasename" , "url missing" ); |
368 | if ((s = skip_scheme(*val)) == NULL || |
369 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
370 | (s = skip_path(s, &b, &e)) == NULL) |
371 | throw(ILLARG, "url.getBasename" , "bad url" ); |
372 | if (b == NULL) { |
373 | *retval = GDKstrdup(str_nil); |
374 | } else { |
375 | size_t l; |
376 | |
377 | if (e != NULL) { |
378 | l = e - b; |
379 | } else { |
380 | l = s - b; |
381 | } |
382 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
383 | strcpy_len(*retval, b, l + 1); |
384 | } |
385 | } |
386 | if (*retval == NULL) |
387 | throw(MAL, "url.getBasename" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
388 | return MAL_SUCCEED; |
389 | } |
390 | |
391 | /* COMMAND "getContent": Retrieve the file referenced |
392 | * SIGNATURE: getContent(str) : str; */ |
393 | str |
394 | URLgetContent(str *retval, url *Str1) |
395 | { |
396 | (void) retval; |
397 | (void) Str1; |
398 | |
399 | throw(MAL, "url.getContent" , SQLSTATE(0A000) "Feature not supported" ); |
400 | } |
401 | |
402 | /* COMMAND "getContext": Extract the path context from the URL |
403 | * SIGNATURE: getContext(str) : str; */ |
404 | str |
405 | URLgetContext(str *retval, url *val) |
406 | { |
407 | const char *s; |
408 | const char *p; |
409 | |
410 | if (val == NULL || *val == NULL) |
411 | throw(ILLARG, "url.getContext" , "url missing" ); |
412 | if ((s = skip_scheme(*val)) == NULL || |
413 | (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
414 | (s = skip_path(p, NULL, NULL)) == NULL) |
415 | throw(ILLARG, "url.getContext" , "bad url" ); |
416 | if (p == s) { |
417 | *retval = GDKstrdup(str_nil); |
418 | } else if ((*retval = GDKmalloc(s - p + 1)) != NULL) { |
419 | strcpy_len(*retval, p, s - p + 1); |
420 | } |
421 | if (*retval == NULL) |
422 | throw(MAL, "url.getContext" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
423 | return MAL_SUCCEED; |
424 | } |
425 | |
426 | /* COMMAND "getExtension": Extract the file extension of the URL |
427 | * SIGNATURE: getExtension(str) : str; */ |
428 | str |
429 | URLgetExtension(str *retval, url *val) |
430 | { |
431 | const char *s; |
432 | const char *e = NULL; |
433 | |
434 | if (val == NULL || *val == NULL) |
435 | throw(ILLARG, "url.getExtension" , "url missing" ); |
436 | if ((s = skip_scheme(*val)) == NULL || |
437 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
438 | (s = skip_path(s, NULL, &e)) == NULL) |
439 | throw(ILLARG, "url.getExtension" , "bad url" ); |
440 | if (e == NULL) { |
441 | *retval = GDKstrdup(str_nil); |
442 | } else { |
443 | size_t l = s - e; |
444 | |
445 | assert(*e == '.'); |
446 | if ((*retval = GDKmalloc(l)) != NULL) { |
447 | strcpy_len(*retval, e + 1, l); |
448 | } |
449 | } |
450 | if (*retval == NULL) |
451 | throw(MAL, "url.getExtension" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
452 | return MAL_SUCCEED; |
453 | } |
454 | |
455 | /* COMMAND "getFile": Extract the last file name of the URL |
456 | * SIGNATURE: getFile(str) : str; */ |
457 | str |
458 | URLgetFile(str *retval, url *val) |
459 | { |
460 | const char *s; |
461 | const char *b = NULL; |
462 | |
463 | if (val == NULL || *val == NULL) |
464 | throw(ILLARG, "url.getFile" , "url missing" ); |
465 | if ((s = skip_scheme(*val)) == NULL || |
466 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
467 | (s = skip_path(s, &b, NULL)) == NULL) |
468 | throw(ILLARG, "url.getFile" , "bad url" ); |
469 | if (b == NULL) { |
470 | *retval = GDKstrdup(str_nil); |
471 | } else { |
472 | size_t l; |
473 | |
474 | l = s - b; |
475 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
476 | strcpy_len(*retval, b, l + 1); |
477 | } |
478 | } |
479 | if (*retval == NULL) |
480 | throw(MAL, "url.getFile" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
481 | return MAL_SUCCEED; |
482 | } |
483 | |
484 | /* COMMAND "getHost": Extract the server identity from the URL */ |
485 | /* SIGNATURE: getHost(str) : str; */ |
486 | str |
487 | URLgetHost(str *retval, url *val) |
488 | { |
489 | const char *s; |
490 | const char *h = NULL; |
491 | const char *p = NULL; |
492 | |
493 | if (val == NULL || *val == NULL) |
494 | throw(ILLARG, "url.getHost" , "url missing" ); |
495 | if ((s = skip_scheme(*val)) == NULL || |
496 | (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL) |
497 | throw(ILLARG, "url.getHost" , "bad url" ); |
498 | if (h == NULL) { |
499 | *retval = GDKstrdup(str_nil); |
500 | } else { |
501 | size_t l; |
502 | |
503 | if (p != NULL) { |
504 | l = p - h - 1; |
505 | } else { |
506 | l = s - h; |
507 | } |
508 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
509 | strcpy_len(*retval, h, l + 1); |
510 | } |
511 | } |
512 | if (*retval == NULL) |
513 | throw(MAL, "url.getHost" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
514 | return MAL_SUCCEED; |
515 | } |
516 | |
517 | /* COMMAND "getDomain": Extract the Internet domain from the URL |
518 | * SIGNATURE: getDomain(str) : str; */ |
519 | str |
520 | URLgetDomain(str *retval, url *val) |
521 | { |
522 | const char *s; |
523 | const char *h = NULL; |
524 | const char *p = NULL; |
525 | |
526 | if (val == NULL || *val == NULL) |
527 | throw(ILLARG, "url.getDomain" , "url missing" ); |
528 | if ((s = skip_scheme(*val)) == NULL || |
529 | (s = skip_authority(s, NULL, NULL, &h, &p)) == NULL) |
530 | throw(ILLARG, "url.getDomain" , "bad url" ); |
531 | if (h == NULL) { |
532 | *retval = GDKstrdup(str_nil); |
533 | } else { |
534 | size_t l; |
535 | |
536 | if (p != NULL) |
537 | p--; |
538 | else |
539 | p = s; |
540 | l = 0; |
541 | while (p > h && p[-1] != '.') { |
542 | p--; |
543 | l++; |
544 | } |
545 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
546 | strcpy_len(*retval, p, l + 1); |
547 | } |
548 | } |
549 | if (*retval == NULL) |
550 | throw(MAL, "url.getDomain" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
551 | return MAL_SUCCEED; |
552 | } |
553 | |
554 | /* COMMAND "getPort": Extract the port id from the URL |
555 | * SIGNATURE: getPort(str) : str; */ |
556 | str |
557 | URLgetPort(str *retval, url *val) |
558 | { |
559 | const char *s; |
560 | const char *p = NULL; |
561 | |
562 | if (val == NULL || *val == NULL) |
563 | throw(ILLARG, "url.getPort" , "url missing" ); |
564 | if ((s = skip_scheme(*val)) == NULL || |
565 | (s = skip_authority(s, NULL, NULL, NULL, &p)) == NULL) |
566 | throw(ILLARG, "url.getPort" , "bad url" ); |
567 | if (p == NULL) { |
568 | *retval = GDKstrdup(str_nil); |
569 | } else { |
570 | size_t l = s - p; |
571 | |
572 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
573 | strcpy_len(*retval, p, l + 1); |
574 | } |
575 | } |
576 | if (*retval == NULL) |
577 | throw(MAL, "url.getPort" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
578 | return MAL_SUCCEED; |
579 | } |
580 | |
581 | /* COMMAND "getProtocol": Extract the protocol from the URL |
582 | * SIGNATURE: getProtocol(str) : str; */ |
583 | str |
584 | URLgetProtocol(str *retval, url *val) |
585 | { |
586 | const char *s; |
587 | size_t l; |
588 | |
589 | if (val == NULL || *val == NULL) |
590 | throw(ILLARG, "url.getProtocol" , "url missing" ); |
591 | if ((s = skip_scheme(*val)) == NULL) |
592 | throw(ILLARG, "url.getProtocol" , "bad url" ); |
593 | l = s - *val; |
594 | if ((*retval = GDKmalloc(l)) == NULL) |
595 | throw(MAL, "url.getProtocol" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
596 | strcpy_len(*retval, *val, l); |
597 | return MAL_SUCCEED; |
598 | } |
599 | |
600 | /* COMMAND "getQuery": Extract the query part from the URL |
601 | * SIGNATURE: getQuery(str) : str; */ |
602 | str |
603 | URLgetQuery(str *retval, url *val) |
604 | { |
605 | const char *s; |
606 | const char *q; |
607 | |
608 | if (val == NULL || *val == NULL) |
609 | throw(ILLARG, "url.getQuery" , "url missing" ); |
610 | if ((s = skip_scheme(*val)) == NULL || |
611 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
612 | (q = skip_path(s, NULL, NULL)) == NULL || |
613 | (s = skip_search(q)) == NULL) |
614 | throw(ILLARG, "url.getQuery" , "bad url" ); |
615 | if (*q == '?') { |
616 | size_t l; |
617 | |
618 | q++; |
619 | l = s - q; |
620 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
621 | strcpy_len(*retval, q, l + 1); |
622 | } |
623 | } else { |
624 | *retval = GDKstrdup(str_nil); |
625 | } |
626 | if (*retval == NULL) |
627 | throw(MAL, "url.getQuery" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
628 | return MAL_SUCCEED; |
629 | } |
630 | |
631 | /* COMMAND "getRobotURL": Extract the location of the robot control file |
632 | * SIGNATURE: getRobotURL(str) : str; */ |
633 | str |
634 | URLgetRobotURL(str *retval, url *val) |
635 | { |
636 | const char *s; |
637 | size_t l; |
638 | |
639 | if (val == NULL || *val == NULL) |
640 | throw(ILLARG, "url.getQuery" , "url missing" ); |
641 | if ((s = skip_scheme(*val)) == NULL || |
642 | (s = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL) |
643 | throw(ILLARG, "url.getQuery" , "bad url" ); |
644 | l = s - *val; |
645 | if ((*retval = GDKmalloc(l + sizeof("/robots.txt" ))) == NULL) |
646 | throw(MAL, "url.getQuery" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
647 | sprintf(*retval, "%.*s/robots.txt" , (int) l, *val); |
648 | return MAL_SUCCEED; |
649 | } |
650 | |
651 | |
652 | /* COMMAND "getUser": Extract the user identity from the URL |
653 | * SIGNATURE: getUser(str) : str; */ |
654 | str |
655 | URLgetUser(str *retval, url *val) |
656 | { |
657 | const char *s; |
658 | const char *p; |
659 | const char *u; |
660 | |
661 | if (val == NULL || *val == NULL) |
662 | throw(ILLARG, "url.getUser" , "url missing" ); |
663 | if ((s = skip_scheme(*val)) == NULL || |
664 | (p = skip_authority(s, NULL, NULL, NULL, NULL)) == NULL || |
665 | (s = skip_path(p, NULL, NULL)) == NULL) |
666 | throw(ILLARG, "url.getUser" , "bad url" ); |
667 | if (p == s || *p != '/' || p[1] != '~') { |
668 | *retval = GDKstrdup(str_nil); |
669 | } else { |
670 | size_t l; |
671 | |
672 | u = p + 2; |
673 | for (p = u; p < s && *p != '/'; p++) |
674 | ; |
675 | l = p - u; |
676 | if ((*retval = GDKmalloc(l + 1)) != NULL) { |
677 | strcpy_len(*retval, u, l + 1); |
678 | } |
679 | } |
680 | if (*retval == NULL) |
681 | throw(MAL, "url.getUser" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
682 | return MAL_SUCCEED; |
683 | } |
684 | |
685 | /* COMMAND "isaURL": Check conformity of the URL syntax |
686 | * SIGNATURE: isaURL(str) : bit; */ |
687 | str |
688 | URLisaURL(bit *retval, url *val) |
689 | { |
690 | if (val == NULL || *val == NULL) |
691 | throw(ILLARG, "url.isaURL" , "url missing" ); |
692 | *retval = skip_scheme(*val) != NULL; |
693 | return MAL_SUCCEED; |
694 | } |
695 | |
696 | str |
697 | URLnew(url *u, str *val) |
698 | { |
699 | *u = GDKstrdup(*val); |
700 | if (*u == NULL) |
701 | throw(MAL, "url.new" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
702 | return MAL_SUCCEED; |
703 | } |
704 | |
705 | str |
706 | URLnew3(url *u, str *protocol, str *server, str *file) |
707 | { |
708 | size_t l; |
709 | |
710 | l = GDK_STRLEN(*file) + GDK_STRLEN(*server) + GDK_STRLEN(*protocol) + 10; |
711 | *u = GDKmalloc(l); |
712 | if (*u == NULL) |
713 | throw(MAL, "url.newurl" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
714 | snprintf(*u, l, "%s://%s/%s" , *protocol, *server, *file); |
715 | return MAL_SUCCEED; |
716 | } |
717 | |
718 | str |
719 | URLnew4(url *u, str *protocol, str *server, int *port, str *file) |
720 | { |
721 | str Protocol = *protocol; |
722 | str Server = *server; |
723 | str File = *file; |
724 | size_t l; |
725 | |
726 | if (GDK_STRNIL(File)) |
727 | File = "" ; |
728 | else if (*File == '/') |
729 | File++; |
730 | if (GDK_STRNIL(Server)) |
731 | Server = "" ; |
732 | if (GDK_STRNIL(Protocol)) |
733 | Protocol = "" ; |
734 | l = strlen(File) + strlen(Server) + strlen(Protocol) + 20; |
735 | *u = GDKmalloc(l); |
736 | if (*u == NULL) |
737 | throw(MAL, "url.newurl" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
738 | snprintf(*u, l, "%s://%s:%d/%s" , Protocol, Server, *port, File); |
739 | return MAL_SUCCEED; |
740 | } |
741 | |
742 | str URLnoop(url *u, url *val) |
743 | { |
744 | *u = GDKstrdup(*val); |
745 | if (*u == NULL) |
746 | throw(MAL, "url.noop" , SQLSTATE(HY001) MAL_MALLOC_FAIL); |
747 | return MAL_SUCCEED; |
748 | } |
749 | |