tool_urlglob.c source code [Curl/src/tool_urlglob.c]

1	/***************************************************************************
2	* _ _ ____ _
3	* Project ___\| \| \| \| _ \\| \|
4	* / __\| \| \| \| \|_) \| \|
5	* \| (__\| \|_\| \| _ <\| \|___
6	* \___\|\___/\|_\| \_\_____\|
7	*
8	* Copyright (C) 1998 - 2019, Daniel Stenberg, <daniel@haxx.se>, et al.
9	*
10	* This software is licensed as described in the file COPYING, which
11	* you should have received as part of this distribution. The terms
12	* are also available at https://curl.haxx.se/docs/copyright.html.
13	*
14	* You may opt to use, copy, modify, merge, publish, distribute and/or sell
15	* copies of the Software, and permit persons to whom the Software is
16	* furnished to do so, under the terms of the COPYING file.
17	*
18	* This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
19	* KIND, either express or implied.
20	*
21	***************************************************************************/
22	#include "tool_setup.h"
23
24	#define ENABLE_CURLX_PRINTF
25	/* use our own printf() functions */
26	#include "curlx.h"
27	#include "tool_cfgable.h"
28	#include "tool_doswin.h"
29	#include "tool_urlglob.h"
30	#include "tool_vms.h"
31
32	#include "memdebug.h" /* keep this as LAST include */
33
/*
 * GLOBERROR() stores an error description and the URL column it refers to
 * in the object pointed to by a variable named 'glob', which must be in
 * scope where the macro is used, and then evaluates to 'code'.
 *
 * The arguments and the whole expansion are parenthesized so the macro
 * behaves as a single expression wherever it is used (return statements,
 * initializers, function arguments).
 */
#define GLOBERROR(string, column, code) \
  (glob->error = (string), glob->pos = (column), (code))
36
37	static CURLcode glob_fixed(URLGlob glob, char* *fixed, size_t len)
38	{
39	URLPattern *pat = &glob->pattern[glob->size];
40	pat->type = UPTSet;
41	pat->content.Set.size = `1`;
42	pat->content.Set.ptr_s = `0`;
43	pat->globindex = -`1`;
44
45	pat->content.Set.elements = malloc(sizeof(char *));
46
47	if(!pat->content.Set.elements)
48	return GLOBERROR("out of memory", `0`, CURLE_OUT_OF_MEMORY);
49
50	pat->content.Set.elements[`0`] = malloc(len + `1`);
51	if(!pat->content.Set.elements[`0`])
52	return GLOBERROR("out of memory", `0`, CURLE_OUT_OF_MEMORY);
53
54	memcpy(pat->content.Set.elements[`0`], fixed, len);
55	pat->content.Set.elements[`0`][len] = `0`;
56
57	return CURLE_OK;
58	}
59
/* multiply
 *
 * Multiplies *amount by 'with' and checks for overflow.
 *
 * The arithmetic is done in unsigned long, so 'with' is converted to
 * unsigned long first. A zero 'with' sets *amount to zero.
 *
 * Returns 0 on success with the product stored in *amount, or 1 if the
 * product does not fit, in which case *amount is left untouched.
 */
static int multiply(unsigned long *amount, long with)
{
  unsigned long factor = (unsigned long)with;
  unsigned long sum;

  if(!factor) {
    *amount = 0;
    return 0;
  }

  /* unsigned multiplication wraps on overflow; dividing the (possibly
     wrapped) product by the factor reveals whether it did */
  sum = *amount * factor;
  if(sum / factor != *amount)
    return 1; /* didn't fit, bail out */

  *amount = sum;
  return 0;
}
76
77	static CURLcode glob_set(URLGlob glob, char* **patternp,
78	size_t posp, unsigned* long *amount,
79	int globindex)
80	{
81	/ processes a set expression with the point behind the opening '{'*
82	','-separated elements are collected until the next closing '}'
83	*/
84	URLPattern *pat;
85	bool done = FALSE;
86	char *buf = glob->glob_buffer;
87	char pattern = patternp;
88	char *opattern = pattern;
89	size_t opos = *posp-`1`;
90
91	pat = &glob->pattern[glob->size];
92	/ patterns 0,1,2,... correspond to size=1,3,5,... /
93	pat->type = UPTSet;
94	pat->content.Set.size = `0`;
95	pat->content.Set.ptr_s = `0`;
96	pat->content.Set.elements = NULL;
97	pat->globindex = globindex;
98
99	while(!done) {
100	switch (*pattern) {
101	case `'\0'`: / URL ended while set was still open /
102	return GLOBERROR("unmatched brace", opos, CURLE_URL_MALFORMAT);
103
104	case `'{'`:
105	case `'['`: / no nested expressions at this time /
106	return GLOBERROR("nested brace", *posp, CURLE_URL_MALFORMAT);
107
108	case `'}'`: / set element completed /
109	if(opattern == pattern)
110	return GLOBERROR("empty string within braces", *posp,
111	CURLE_URL_MALFORMAT);
112
113	/ add 1 to size since it'll be incremented below /
114	if(multiply(amount, pat->content.Set.size + `1`))
115	return GLOBERROR("range overflow", `0`, CURLE_URL_MALFORMAT);
116
117	/ FALLTHROUGH /
118	case `','`:
119
120	*buf = `'\0'`;
121	if(pat->content.Set.elements) {
122	char **new_arr = realloc(pat->content.Set.elements,
123	(pat->content.Set.size + `1`) * sizeof(char *));
124	if(!new_arr)
125	return GLOBERROR("out of memory", `0`, CURLE_OUT_OF_MEMORY);
126
127	pat->content.Set.elements = new_arr;
128	}
129	else
130	pat->content.Set.elements = malloc(sizeof(char *));
131
132	if(!pat->content.Set.elements)
133	return GLOBERROR("out of memory", `0`, CURLE_OUT_OF_MEMORY);
134
135	pat->content.Set.elements[pat->content.Set.size] =
136	strdup(glob->glob_buffer);
137	if(!pat->content.Set.elements[pat->content.Set.size])
138	return GLOBERROR("out of memory", `0`, CURLE_OUT_OF_MEMORY);
139	++pat->content.Set.size;
140
141	if(*pattern == `'}'`) {
142	pattern++; / pass the closing brace /
143	done = TRUE;
144	continue;
145	}
146
147	buf = glob->glob_buffer;
148	++pattern;
149	++(*posp);
150	break;
151
152	case `']'`: / illegal closing bracket /
153	return GLOBERROR("unexpected close bracket", *posp, CURLE_URL_MALFORMAT);
154
155	case `'\\'`: / escaped character, skip '\' /
156	if(pattern[`1`]) {
157	++pattern;
158	++(*posp);
159	}
160	/ FALLTHROUGH /
161	default:
162	buf++ = pattern++; / copy character to set element /
163	++(*posp);
164	}
165	}
166
167	patternp = pattern; /* return with the new position /
168	return CURLE_OK;
169	}
170
171	static CURLcode glob_range(URLGlob glob, char* **patternp,
172	size_t posp, unsigned* long *amount,
173	int globindex)
174	{
175	/ processes a range expression with the point behind the opening '['*
176	- char range: e.g. "a-z]", "B-Q]"
177	- num range: e.g. "0-9]", "17-2000]"
178	- num range with leading zeros: e.g. "001-999]"
179	expression is checked for well-formedness and collected until the next ']'
180	*/
181	URLPattern *pat;
182	int rc;
183	char pattern = patternp;
184	char *c;
185
186	pat = &glob->pattern[glob->size];
187	pat->globindex = globindex;
188
189	if(ISALPHA(*pattern)) {
190	/ character range detected /
191	char min_c;
192	char max_c;
193	char end_c;
194	unsigned long step = `1`;
195
196	pat->type = UPTCharRange;
197
198	rc = sscanf(pattern, "%c-%c%c", &min_c, &max_c, &end_c);
199
200	if(rc == `3`) {
201	if(end_c == `':'`) {
202	char *endp;
203	errno = `0`;
204	step = strtoul(&pattern[`4`], &endp, `10`);
205	if(errno \|\| &pattern[`4`] == endp \|\| *endp != `']'`)
206	step = `0`;
207	else
208	pattern = endp + `1`;
209	}
210	else if(end_c != `']'`)
211	/ then this is wrong /
212	rc = `0`;
213	else
214	/ end_c == ']' /
215	pattern += `4`;
216	}
217
218	posp += (pattern - patternp);
219
220	if(rc != `3` \|\| !step \|\| step > (unsigned)INT_MAX \|\|
221	(min_c == max_c && step != `1`) \|\|
222	(min_c != max_c && (min_c > max_c \|\| step > (unsigned)(max_c - min_c) \|\|
223	(max_c - min_c) > (`'z'` - `'a'`))))
224	/ the pattern is not well-formed /
225	return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT);
226
227	/ if there was a ":[num]" thing, use that as step or else use 1 /
228	pat->content.CharRange.step = (int)step;
229	pat->content.CharRange.ptr_c = pat->content.CharRange.min_c = min_c;
230	pat->content.CharRange.max_c = max_c;
231
232	if(multiply(amount, ((pat->content.CharRange.max_c -
233	pat->content.CharRange.min_c) /
234	pat->content.CharRange.step + `1`)))
235	return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT);
236	}
237	else if(ISDIGIT(*pattern)) {
238	/ numeric range detected /
239	unsigned long min_n;
240	unsigned long max_n = `0`;
241	unsigned long step_n = `0`;
242	char *endp;
243
244	pat->type = UPTNumRange;
245	pat->content.NumRange.padlength = `0`;
246
247	if(*pattern == `'0'`) {
248	/ leading zero specified, count them! /
249	c = pattern;
250	while(ISDIGIT(*c)) {
251	c++;
252	++pat->content.NumRange.padlength; / padding length is set for all*
253	instances of this pattern /*
254	}
255	}
256
257	errno = `0`;
258	min_n = strtoul(pattern, &endp, `10`);
259	if(errno \|\| (endp == pattern))
260	endp = NULL;
261	else {
262	if(*endp != `'-'`)
263	endp = NULL;
264	else {
265	pattern = endp + `1`;
266	while(pattern && ISBLANK(pattern))
267	pattern++;
268	if(!ISDIGIT(*pattern)) {
269	endp = NULL;
270	goto fail;
271	}
272	errno = `0`;
273	max_n = strtoul(pattern, &endp, `10`);
274	if(errno)
275	/ overflow /
276	endp = NULL;
277	else if(*endp == `':'`) {
278	pattern = endp + `1`;
279	errno = `0`;
280	step_n = strtoul(pattern, &endp, `10`);
281	if(errno)
282	/ over/underflow situation /
283	endp = NULL;
284	}
285	else
286	step_n = `1`;
287	if(endp && (*endp == `']'`)) {
288	pattern = endp + `1`;
289	}
290	else
291	endp = NULL;
292	}
293	}
294
295	fail:
296	posp += (pattern - patternp);
297
298	if(!endp \|\| !step_n \|\|
299	(min_n == max_n && step_n != `1`) \|\|
300	(min_n != max_n && (min_n > max_n \|\| step_n > (max_n - min_n))))
301	/ the pattern is not well-formed /
302	return GLOBERROR("bad range", *posp, CURLE_URL_MALFORMAT);
303
304	/ typecasting to ints are fine here since we make sure above that we*
305	are within 31 bits /*
306	pat->content.NumRange.ptr_n = pat->content.NumRange.min_n = min_n;
307	pat->content.NumRange.max_n = max_n;
308	pat->content.NumRange.step = step_n;
309
310	if(multiply(amount, ((pat->content.NumRange.max_n -
311	pat->content.NumRange.min_n) /
312	pat->content.NumRange.step + `1`)))
313	return GLOBERROR("range overflow", *posp, CURLE_URL_MALFORMAT);
314	}
315	else
316	return GLOBERROR("bad range specification", *posp, CURLE_URL_MALFORMAT);
317
318	*patternp = pattern;
319	return CURLE_OK;
320	}
321
322	static bool peek_ipv6(const char str, size_t skip)
323	{
324	/*
325	* Scan for a potential IPv6 literal.
326	* - Valid globs contain a hyphen and <= 1 colon.
327	* - IPv6 literals contain no hyphens and >= 2 colons.
328	*/
329	size_t i = `0`;
330	size_t colons = `0`;
331	if(str[i++] != `'['`) {
332	return FALSE;
333	}
334	for(;;) {
335	const char c = str[i++];
336	if(ISALNUM(c) \|\| c == `'.'` \|\| c == `'%'`) {
337	/ ok /
338	}
339	else if(c == `':'`) {
340	colons++;
341	}
342	else if(c == `']'`) {
343	*skip = i;
344	return colons >= `2` ? TRUE : FALSE;
345	}
346	else {
347	return FALSE;
348	}
349	}
350	}
351
352	static CURLcode glob_parse(URLGlob glob, char* *pattern,
353	size_t pos, unsigned long *amount)
354	{
355	/ processes a literal string component of a URL*
356	special characters '{' and '[' branch to set/range processing functions
357	*/
358	CURLcode res = CURLE_OK;
359	int globindex = `0`; / count "actual" globs /
360
361	*amount = `1`;
362
363	while(*pattern && !res) {
364	char *buf = glob->glob_buffer;
365	size_t sublen = `0`;
366	while(pattern && pattern != `'{'`) {
367	if(*pattern == `'['`) {
368	/ skip over IPv6 literals and [] /
369	size_t skip = `0`;
370	if(!peek_ipv6(pattern, &skip) && (pattern[`1`] == `']'`))
371	skip = `2`;
372	if(skip) {
373	memcpy(buf, pattern, skip);
374	buf += skip;
375	pattern += skip;
376	sublen += skip;
377	continue;
378	}
379	break;
380	}
381	if(pattern == `'}'` \|\| pattern == `']'`)
382	return GLOBERROR("unmatched close brace/bracket", pos,
383	CURLE_URL_MALFORMAT);
384
385	/ only allow \ to escape known "special letters" /
386	if(*pattern == `'\\'` &&
387	((pattern + `1`) == `'{'` \|\| (pattern + `1`) == `'['` \|\|
388	(pattern + `1`) == `'}'` \|\| (pattern + `1`) == `']'`) ) {
389
390	/ escape character, skip '\' /
391	++pattern;
392	++pos;
393	}
394	buf++ = pattern++; / copy character to literal /
395	++pos;
396	sublen++;
397	}
398	if(sublen) {
399	/ we got a literal string, add it as a single-item list /
400	*buf = `'\0'`;
401	res = glob_fixed(glob, glob->glob_buffer, sublen);
402	}
403	else {
404	switch (*pattern) {
405	case `'\0'`: / done /
406	break;
407
408	case `'{'`:
409	/ process set pattern /
410	pattern++;
411	pos++;
412	res = glob_set(glob, &pattern, &pos, amount, globindex++);
413	break;
414
415	case `'['`:
416	/ process range pattern /
417	pattern++;
418	pos++;
419	res = glob_range(glob, &pattern, &pos, amount, globindex++);
420	break;
421	}
422	}
423
424	if(++glob->size >= GLOB_PATTERN_NUM)
425	return GLOBERROR("too many globs", pos, CURLE_URL_MALFORMAT);
426	}
427	return res;
428	}
429
430	CURLcode glob_url(URLGlob *glob, char* url, unsigned* long *urlnum,
431	FILE *error)
432	{
433	/*
434	* We can deal with any-size, just make a buffer with the same length
435	* as the specified URL!
436	*/
437	URLGlob *glob_expand;
438	unsigned long amount = `0`;
439	char *glob_buffer;
440	CURLcode res;
441
442	*glob = NULL;
443
444	glob_buffer = malloc(strlen(url) + `1`);
445	if(!glob_buffer)
446	return CURLE_OUT_OF_MEMORY;
447	glob_buffer[`0`] = `0`;
448
449	glob_expand = calloc(`1`, sizeof(URLGlob));
450	if(!glob_expand) {
451	Curl_safefree(glob_buffer);
452	return CURLE_OUT_OF_MEMORY;
453	}
454	glob_expand->urllen = strlen(url);
455	glob_expand->glob_buffer = glob_buffer;
456
457	res = glob_parse(glob_expand, url, `1`, &amount);
458	if(!res)
459	*urlnum = amount;
460	else {
461	if(error && glob_expand->error) {
462	char text[`512`];
463	const char *t;
464	if(glob_expand->pos) {
465	msnprintf(text, sizeof(text), "%s in URL position %zu:\n%s\n%*s^",
466	glob_expand->error,
467	glob_expand->pos, url, glob_expand->pos - `1`, " ");
468	t = text;
469	}
470	else
471	t = glob_expand->error;
472
473	/ send error description to the error-stream /
474	fprintf(error, "curl: (%d) %s\n", res, t);
475	}
476	/ it failed, we cleanup /
477	glob_cleanup(glob_expand);
478	*urlnum = `1`;
479	return res;
480	}
481
482	*glob = glob_expand;
483	return CURLE_OK;
484	}
485
486	void glob_cleanup(URLGlob* glob)
487	{
488	size_t i;
489	int elem;
490
491	if(!glob)
492	return;
493
494	for(i = `0`; i < glob->size; i++) {
495	if((glob->pattern[i].type == UPTSet) &&
496	(glob->pattern[i].content.Set.elements)) {
497	for(elem = glob->pattern[i].content.Set.size - `1`;
498	elem >= `0`;
499	--elem) {
500	Curl_safefree(glob->pattern[i].content.Set.elements[elem]);
501	}
502	Curl_safefree(glob->pattern[i].content.Set.elements);
503	}
504	}
505	Curl_safefree(glob->glob_buffer);
506	Curl_safefree(glob);
507	}
508
509	CURLcode glob_next_url(char *globbed, URLGlob glob)
510	{
511	URLPattern *pat;
512	size_t i;
513	size_t len;
514	size_t buflen = glob->urllen + `1`;
515	char *buf = glob->glob_buffer;
516
517	*globbed = NULL;
518
519	if(!glob->beenhere)
520	glob->beenhere = `1`;
521	else {
522	bool carry = TRUE;
523
524	/ implement a counter over the index ranges of all patterns, starting*
525	with the rightmost pattern /*
526	for(i = `0`; carry && (i < glob->size); i++) {
527	carry = FALSE;
528	pat = &glob->pattern[glob->size - `1` - i];
529	switch(pat->type) {
530	case UPTSet:
531	if((pat->content.Set.elements) &&
532	(++pat->content.Set.ptr_s == pat->content.Set.size)) {
533	pat->content.Set.ptr_s = `0`;
534	carry = TRUE;
535	}
536	break;
537	case UPTCharRange:
538	pat->content.CharRange.ptr_c =
539	(char)(pat->content.CharRange.step +
540	(int)((unsigned char)pat->content.CharRange.ptr_c));
541	if(pat->content.CharRange.ptr_c > pat->content.CharRange.max_c) {
542	pat->content.CharRange.ptr_c = pat->content.CharRange.min_c;
543	carry = TRUE;
544	}
545	break;
546	case UPTNumRange:
547	pat->content.NumRange.ptr_n += pat->content.NumRange.step;
548	if(pat->content.NumRange.ptr_n > pat->content.NumRange.max_n) {
549	pat->content.NumRange.ptr_n = pat->content.NumRange.min_n;
550	carry = TRUE;
551	}
552	break;
553	default:
554	printf("internal error: invalid pattern type (%d)\n", (int)pat->type);
555	return CURLE_FAILED_INIT;
556	}
557	}
558	if(carry) { / first pattern ptr has run into overflow, done! /
559	return CURLE_OK;
560	}
561	}
562
563	for(i = `0`; i < glob->size; ++i) {
564	pat = &glob->pattern[i];
565	switch(pat->type) {
566	case UPTSet:
567	if(pat->content.Set.elements) {
568	msnprintf(buf, buflen, "%s",
569	pat->content.Set.elements[pat->content.Set.ptr_s]);
570	len = strlen(buf);
571	buf += len;
572	buflen -= len;
573	}
574	break;
575	case UPTCharRange:
576	if(buflen) {
577	*buf++ = pat->content.CharRange.ptr_c;
578	*buf = `'\0'`;
579	buflen--;
580	}
581	break;
582	case UPTNumRange:
583	msnprintf(buf, buflen, "%0*lu",
584	pat->content.NumRange.padlength,
585	pat->content.NumRange.ptr_n);
586	len = strlen(buf);
587	buf += len;
588	buflen -= len;
589	break;
590	default:
591	printf("internal error: invalid pattern type (%d)\n", (int)pat->type);
592	return CURLE_FAILED_INIT;
593	}
594	}
595
596	*globbed = strdup(glob->glob_buffer);
597	if(!*globbed)
598	return CURLE_OUT_OF_MEMORY;
599
600	return CURLE_OK;
601	}
602
603	CURLcode glob_match_url(char *result, char* filename, URLGlob glob)
604	{
605	char *target;
606	size_t allocsize;
607	char numbuf[`18`];
608	char appendthis = (char* *)"";
609	size_t appendlen = `0`;
610	size_t stringlen = `0`;
611
612	*result = NULL;
613
614	/ We cannot use the glob_buffer for storage here since the filename may*
615	* be longer than the URL we use. We allocate a good start size, then
616	* we need to realloc in case of need.
617	*/
618	allocsize = strlen(filename) + `1`; / make it at least one byte to store the*
619	trailing zero /*
620	target = malloc(allocsize);
621	if(!target)
622	return CURLE_OUT_OF_MEMORY;
623
624	while(*filename) {
625	if(*filename == `'#'` && ISDIGIT(filename[`1`])) {
626	char *ptr = filename;
627	unsigned long num = strtoul(&filename[`1`], &filename, `10`);
628	URLPattern *pat = NULL;
629
630	if(num < glob->size) {
631	unsigned long i;
632	num--; / make it zero based /
633	/ find the correct glob entry /
634	for(i = `0`; i<glob->size; i++) {
635	if(glob->pattern[i].globindex == (int)num) {
636	pat = &glob->pattern[i];
637	break;
638	}
639	}
640	}
641
642	if(pat) {
643	switch(pat->type) {
644	case UPTSet:
645	if(pat->content.Set.elements) {
646	appendthis = pat->content.Set.elements[pat->content.Set.ptr_s];
647	appendlen =
648	strlen(pat->content.Set.elements[pat->content.Set.ptr_s]);
649	}
650	break;
651	case UPTCharRange:
652	numbuf[`0`] = pat->content.CharRange.ptr_c;
653	numbuf[`1`] = `0`;
654	appendthis = numbuf;
655	appendlen = `1`;
656	break;
657	case UPTNumRange:
658	msnprintf(numbuf, sizeof(numbuf), "%0*lu",
659	pat->content.NumRange.padlength,
660	pat->content.NumRange.ptr_n);
661	appendthis = numbuf;
662	appendlen = strlen(numbuf);
663	break;
664	default:
665	fprintf(stderr, "internal error: invalid pattern type (%d)\n",
666	(int)pat->type);
667	Curl_safefree(target);
668	return CURLE_FAILED_INIT;
669	}
670	}
671	else {
672	/ #[num] out of range, use the #[num] in the output /
673	filename = ptr;
674	appendthis = filename++;
675	appendlen = `1`;
676	}
677	}
678	else {
679	appendthis = filename++;
680	appendlen = `1`;
681	}
682	if(appendlen + stringlen >= allocsize) {
683	char *newstr;
684	/ we append a single byte to allow for the trailing byte to be appended*
685	at the end of this function outside the while() loop /*
686	allocsize = (appendlen + stringlen) * `2`;
687	newstr = realloc(target, allocsize + `1`);
688	if(!newstr) {
689	Curl_safefree(target);
690	return CURLE_OUT_OF_MEMORY;
691	}
692	target = newstr;
693	}
694	memcpy(&target[stringlen], appendthis, appendlen);
695	stringlen += appendlen;
696	}
697	target[stringlen]= `'\0'`;
698
699	#if defined(MSDOS) \|\| defined(WIN32)
700	{
701	char *sanitized;
702	SANITIZEcode sc = sanitize_file_name(&sanitized, target,
703	(SANITIZE_ALLOW_PATH \|
704	SANITIZE_ALLOW_RESERVED));
705	Curl_safefree(target);
706	if(sc)
707	return CURLE_URL_MALFORMAT;
708	target = sanitized;
709	}
710	#endif /* MSDOS \|\| WIN32 */
711
712	*result = target;
713	return CURLE_OK;
714	}
715

Browse the source code of Curl/src/tool_urlglob.c