string.c source code [MuPDF/source/fitz/string.c]

1	#include "mupdf/fitz.h"
2
3	#include <string.h>
4	#include <errno.h>
5	#include <math.h>
6	#include <float.h>
7	#include <stdlib.h>
8
9	static inline int
10	fz_tolower(int c)
11	{
12	if (c >= `'A'` && c <= `'Z'`)
13	return c + `32`;
14	return c;
15	}
16
17	/*
18	Return strlen(s), if that is less than maxlen, or maxlen if
19	there is no null byte ('\0') among the first maxlen bytes.
20	*/
21	size_t
22	fz_strnlen(const char *s, size_t n)
23	{
24	const char *p = memchr(s, `0`, n);
25	return p ? p - s : n;
26	}
27
28	int
29	fz_strncasecmp(const char a, const* char b, int* n)
30	{
31	if (!n--)
32	return `0`;
33	for (; a && b && n && (a == b \|\| fz_tolower(a) == fz_tolower(b)); a++, b++, n--)
34	;
35	return fz_tolower(a) - fz_tolower(b);
36	}
37
38	/*
39	Case insensitive (ASCII only) string comparison.
40	*/
41	int
42	fz_strcasecmp(const char a, const* char *b)
43	{
44	while (fz_tolower(a) == fz_tolower(b))
45	{
46	if (*a++ == `0`)
47	return `0`;
48	b++;
49	}
50	return fz_tolower(a) - fz_tolower(b);
51	}
52
53	/*
54	Given a pointer to a C string (or a pointer to NULL) break
55	it at the first occurrence of a delimiter char (from a given set).
56
57	stringp: Pointer to a C string pointer (or NULL). Updated on exit to
58	point to the first char of the string after the delimiter that was
59	found. The string pointed to by stringp will be corrupted by this
60	call (as the found delimiter will be overwritten by 0).
61
62	delim: A C string of acceptable delimiter characters.
63
64	Returns a pointer to a C string containing the chars of stringp up
65	to the first delimiter char (or the end of the string), or NULL.
66	*/
67	char *
68	fz_strsep(char *stringp, const* char *delim)
69	{
70	char ret = stringp;
71	if (!ret) return NULL;
72	if ((stringp = strpbrk(stringp, delim)) != NULL)
73	((stringp)++) = `'\0'`;
74	return ret;
75	}
76
77	/*
78	Copy at most n-1 chars of a string into a destination
79	buffer with null termination, returning the real length of the
80	initial string (excluding terminator).
81
82	dst: Destination buffer, at least n bytes long.
83
84	src: C string (non-NULL).
85
86	n: Size of dst buffer in bytes.
87
88	Returns the length (excluding terminator) of src.
89	*/
90	size_t
91	fz_strlcpy(char dst, const* char *src, size_t siz)
92	{
93	register char *d = dst;
94	register const char *s = src;
95	register size_t n = siz;
96
97	/ Copy as many bytes as will fit /
98	if (n != `0` && --n != `0`) {
99	do {
100	if ((d++ = s++) == `0`)
101	break;
102	} while (--n != `0`);
103	}
104
105	/ Not enough room in dst, add NUL and traverse rest of src /
106	if (n == `0`) {
107	if (siz != `0`)
108	d = `'\0'`; /* NUL-terminate dst /
109	while (*s++)
110	;
111	}
112
113	return(s - src - `1`); / count does not include NUL /
114	}
115
116	/*
117	Concatenate 2 strings, with a maximum length.
118
119	dst: pointer to first string in a buffer of n bytes.
120
121	src: pointer to string to concatenate.
122
123	n: Size (in bytes) of buffer that dst is in.
124
125	Returns the real length that a concatenated dst + src would have been
126	(not including terminator).
127	*/
128	size_t
129	fz_strlcat(char dst, const* char *src, size_t siz)
130	{
131	register char *d = dst;
132	register const char *s = src;
133	register size_t n = siz;
134	size_t dlen;
135
136	/ Find the end of dst and adjust bytes left but don't go past end /
137	while (*d != `'\0'` && n-- != `0`)
138	d++;
139	dlen = d - dst;
140	n = siz - dlen;
141
142	if (n == `0`)
143	return dlen + strlen(s);
144	while (*s != `'\0'`) {
145	if (n != `1`) {
146	d++ = s;
147	n--;
148	}
149	s++;
150	}
151	*d = `'\0'`;
152
153	return dlen + (s - src); / count does not include NUL /
154	}
155
156	/*
157	extract the directory component from a path.
158	*/
159	void
160	fz_dirname(char dir, const* char *path, size_t n)
161	{
162	size_t i;
163
164	if (!path \|\| !path[`0`])
165	{
166	fz_strlcpy(dir, ".", n);
167	return;
168	}
169
170	fz_strlcpy(dir, path, n);
171
172	i = strlen(dir);
173	for(; dir[i] == `'/'`; --i) if (!i) { fz_strlcpy(dir, "/", n); return; }
174	for(; dir[i] != `'/'`; --i) if (!i) { fz_strlcpy(dir, ".", n); return; }
175	for(; dir[i] == `'/'`; --i) if (!i) { fz_strlcpy(dir, "/", n); return; }
176	dir[i+`1`] = `0`;
177	}
178
179	static inline int ishex(int a)
180	{
181	return (a >= `'A'` && a <= `'F'`) \|\|
182	(a >= `'a'` && a <= `'f'`) \|\|
183	(a >= `'0'` && a <= `'9'`);
184	}
185
186	static inline int tohex(int c)
187	{
188	if (c >= `'0'` && c <= `'9'`) return c - `'0'`;
189	if (c >= `'a'` && c <= `'f'`) return c - `'a'` + `0xA`;
190	if (c >= `'A'` && c <= `'F'`) return c - `'A'` + `0xA`;
191	return `0`;
192	}
193
194	/*
195	decode url escapes.
196	*/
197	char *
198	fz_urldecode(char *url)
199	{
200	char *s = url;
201	char *p = url;
202	while (*s)
203	{
204	int c = (unsigned char) *s++;
205	if (c == `'%'` && ishex(s[`0`]) && ishex(s[`1`]))
206	{
207	int a = tohex(*s++);
208	int b = tohex(*s++);
209	*p++ = a << `4` \| b;
210	}
211	else
212	{
213	*p++ = c;
214	}
215	}
216	*p = `0`;
217	return url;
218	}
219
220	/*
221	create output file name using a template.
222
223	If the path contains %[0-9]d, the first such pattern will be replaced*
224	with the page number. If the template does not contain such a pattern, the page
225	number will be inserted before the filename extension. If the template does not have
226	a filename extension, the page number will be added to the end.
227	*/
228	void
229	fz_format_output_path(fz_context ctx, char* path, size_t size, const* char fmt, int* page)
230	{
231	const char s, p;
232	char num[`40`];
233	int i, n;
234	int z = `0`;
235
236	for (i = `0`; page; page /= `10`)
237	num[i++] = `'0'` + page % `10`;
238	num[i] = `0`;
239
240	s = p = strchr(fmt, `'%'`);
241	if (p)
242	{
243	++p;
244	while (p >= `'0'` && p <= `'9'`)
245	z = z * `10` + (*p++ - `'0'`);
246	}
247	if (p && *p == `'d'`)
248	{
249	++p;
250	}
251	else
252	{
253	s = p = strrchr(fmt, `'.'`);
254	if (!p)
255	s = p = fmt + strlen(fmt);
256	}
257
258	if (z < `1`)
259	z = `1`;
260	while (i < z && i < sizeof num)
261	num[i++] = `'0'`;
262	n = s - fmt;
263	if (n + i + strlen(p) >= size)
264	fz_throw(ctx, FZ_ERROR_GENERIC, "path name buffer overflow");
265	memcpy(path, fmt, n);
266	while (i > `0`)
267	path[n++] = num[--i];
268	fz_strlcpy(path + n, p, size - n);
269	}
270
271	#define SEP(x) ((x)=='/' \|\| (x) == 0)
272
273	/*
274	rewrite path to the shortest string that names the same path.
275
276	Eliminates multiple and trailing slashes, interprets "." and "..".
277	Overwrites the string in place.
278	*/
279	char *
280	fz_cleanname(char *name)
281	{
282	char p, q, *dotdot;
283	int rooted;
284
285	rooted = name[`0`] == `'/'`;
286
287	/*
288	* invariants:
289	* p points at beginning of path element we're considering.
290	* q points just past the last path element we wrote (no slash).
291	* dotdot points just past the point where .. cannot backtrack
292	* any further (no slash).
293	*/
294	p = q = dotdot = name + rooted;
295	while (*p)
296	{
297	if(p[`0`] == `'/'`) / null element /
298	p++;
299	else if (p[`0`] == `'.'` && SEP(p[`1`]))
300	p += `1`; / don't count the separator in case it is nul /
301	else if (p[`0`] == `'.'` && p[`1`] == `'.'` && SEP(p[`2`]))
302	{
303	p += `2`;
304	if (q > dotdot) / can backtrack /
305	{
306	while(--q > dotdot && *q != `'/'`)
307	;
308	}
309	else if (!rooted) / /.. is / but ./../ is .. /
310	{
311	if (q != name)
312	*q++ = `'/'`;
313	*q++ = `'.'`;
314	*q++ = `'.'`;
315	dotdot = q;
316	}
317	}
318	else / real path element /
319	{
320	if (q != name+rooted)
321	*q++ = `'/'`;
322	while ((q = p) != `'/'` && *q != `0`)
323	p++, q++;
324	}
325	}
326
327	if (q == name) / empty string is really "." /
328	*q++ = `'.'`;
329	*q = `'\0'`;
330	return name;
331	}
332
333	enum
334	{
335	UTFmax = `4`, / maximum bytes per rune /
336	Runesync = `0x80`, / cannot represent part of a UTF sequence (<) /
337	Runeself = `0x80`, / rune and UTF sequences are the same (<) /
338	Runeerror = `0xFFFD`, / decoding error in UTF /
339	Runemax = `0x10FFFF`, / maximum rune value /
340	};
341
342	enum
343	{
344	Bit1 = `7`,
345	Bitx = `6`,
346	Bit2 = `5`,
347	Bit3 = `4`,
348	Bit4 = `3`,
349	Bit5 = `2`,
350
351	T1 = ((`1`<<(Bit1+`1`))-`1`) ^ `0xFF`, / 0000 0000 /
352	Tx = ((`1`<<(Bitx+`1`))-`1`) ^ `0xFF`, / 1000 0000 /
353	T2 = ((`1`<<(Bit2+`1`))-`1`) ^ `0xFF`, / 1100 0000 /
354	T3 = ((`1`<<(Bit3+`1`))-`1`) ^ `0xFF`, / 1110 0000 /
355	T4 = ((`1`<<(Bit4+`1`))-`1`) ^ `0xFF`, / 1111 0000 /
356	T5 = ((`1`<<(Bit5+`1`))-`1`) ^ `0xFF`, / 1111 1000 /
357
358	Rune1 = (`1`<<(Bit1+`0`Bitx))-`1`, /* 0000 0000 0111 1111 /
359	Rune2 = (`1`<<(Bit2+`1`Bitx))-`1`, /* 0000 0111 1111 1111 /
360	Rune3 = (`1`<<(Bit3+`2`Bitx))-`1`, /* 1111 1111 1111 1111 /
361	Rune4 = (`1`<<(Bit4+`3`Bitx))-`1`, /* 0001 1111 1111 1111 1111 1111 /
362
363	Maskx = (`1`<<Bitx)-`1`, / 0011 1111 /
364	Testx = Maskx ^ `0xFF`, / 1100 0000 /
365
366	Bad = Runeerror,
367	};
368
369	/*
370	UTF8 decode a single rune from a sequence of chars.
371
372	rune: Pointer to an int to assign the decoded 'rune' to.
373
374	str: Pointer to a UTF8 encoded string.
375
376	Returns the number of bytes consumed.
377	*/
378	int
379	fz_chartorune(int rune, const* char *str)
380	{
381	int c, c1, c2, c3;
382	int l;
383
384	/*
385	* one character sequence
386	* 00000-0007F => T1
387	*/
388	c = (const* unsigned char*)str;
389	if(c < Tx) {
390	*rune = c;
391	return `1`;
392	}
393
394	/*
395	* two character sequence
396	* 0080-07FF => T2 Tx
397	*/
398	c1 = (const* unsigned char*)(str+`1`) ^ Tx;
399	if(c1 & Testx)
400	goto bad;
401	if(c < T3) {
402	if(c < T2)
403	goto bad;
404	l = ((c << Bitx) \| c1) & Rune2;
405	if(l <= Rune1)
406	goto bad;
407	*rune = l;
408	return `2`;
409	}
410
411	/*
412	* three character sequence
413	* 0800-FFFF => T3 Tx Tx
414	*/
415	c2 = (const* unsigned char*)(str+`2`) ^ Tx;
416	if(c2 & Testx)
417	goto bad;
418	if(c < T4) {
419	l = ((((c << Bitx) \| c1) << Bitx) \| c2) & Rune3;
420	if(l <= Rune2)
421	goto bad;
422	*rune = l;
423	return `3`;
424	}
425
426	/*
427	* four character sequence (21-bit value)
428	* 10000-1FFFFF => T4 Tx Tx Tx
429	*/
430	c3 = (const* unsigned char*)(str+`3`) ^ Tx;
431	if (c3 & Testx)
432	goto bad;
433	if (c < T5) {
434	l = ((((((c << Bitx) \| c1) << Bitx) \| c2) << Bitx) \| c3) & Rune4;
435	if (l <= Rune3)
436	goto bad;
437	*rune = l;
438	return `4`;
439	}
440	/*
441	* Support for 5-byte or longer UTF-8 would go here, but
442	* since we don't have that, we'll just fall through to bad.
443	*/
444
445	/*
446	* bad decoding
447	*/
448	bad:
449	*rune = Bad;
450	return `1`;
451	}
452
453	/*
454	UTF8 encode a rune to a sequence of chars.
455
456	str: Pointer to a place to put the UTF8 encoded character.
457
458	rune: Pointer to a 'rune'.
459
460	Returns the number of bytes the rune took to output.
461	*/
462	int
463	fz_runetochar(char str, int* rune)
464	{
465	/ Runes are signed, so convert to unsigned for range check. /
466	unsigned int c = (unsigned int)rune;
467
468	/*
469	* one character sequence
470	* 00000-0007F => 00-7F
471	*/
472	if(c <= Rune1) {
473	str[`0`] = c;
474	return `1`;
475	}
476
477	/*
478	* two character sequence
479	* 0080-07FF => T2 Tx
480	*/
481	if(c <= Rune2) {
482	str[`0`] = T2 \| (c >> `1`*Bitx);
483	str[`1`] = Tx \| (c & Maskx);
484	return `2`;
485	}
486
487	/*
488	* If the Rune is out of range, convert it to the error rune.
489	* Do this test here because the error rune encodes to three bytes.
490	* Doing it earlier would duplicate work, since an out of range
491	* Rune wouldn't have fit in one or two bytes.
492	*/
493	if (c > Runemax)
494	c = Runeerror;
495
496	/*
497	* three character sequence
498	* 0800-FFFF => T3 Tx Tx
499	*/
500	if (c <= Rune3) {
501	str[`0`] = T3 \| (c >> `2`*Bitx);
502	str[`1`] = Tx \| ((c >> `1`*Bitx) & Maskx);
503	str[`2`] = Tx \| (c & Maskx);
504	return `3`;
505	}
506
507	/*
508	* four character sequence (21-bit value)
509	* 10000-1FFFFF => T4 Tx Tx Tx
510	*/
511	str[`0`] = T4 \| (c >> `3`*Bitx);
512	str[`1`] = Tx \| ((c >> `2`*Bitx) & Maskx);
513	str[`2`] = Tx \| ((c >> `1`*Bitx) & Maskx);
514	str[`3`] = Tx \| (c & Maskx);
515	return `4`;
516	}
517
518	/*
519	Count how many chars are required to represent a rune.
520
521	rune: The rune to encode.
522
523	Returns the number of bytes required to represent this run in UTF8.
524	*/
525	int
526	fz_runelen(int c)
527	{
528	char str[`10`];
529	return fz_runetochar(str, c);
530	}
531
532	/*
533	Count how many runes the UTF-8 encoded string
534	consists of.
535
536	s: The UTF-8 encoded, NUL-terminated text string.
537
538	Returns the number of runes in the string.
539	*/
540	int
541	fz_utflen(const char *s)
542	{
543	int c, n, rune;
544	n = `0`;
545	for(;;) {
546	c = (const* unsigned char*)s;
547	if(c < Runeself) {
548	if(c == `0`)
549	return n;
550	s++;
551	} else
552	s += fz_chartorune(&rune, s);
553	n++;
554	}
555	return `0`;
556	}
557
558	/*
559	Range checking atof
560	*/
561	float fz_atof(const char *s)
562	{
563	float result;
564
565	if (s == NULL)
566	return `0`;
567
568	errno = `0`;
569	result = fz_strtof(s, NULL);
570	if ((errno == ERANGE && result == `0`) \|\| isnan(result))
571	/ Return 1.0 on underflow, as it's a small known value that won't cause a divide by 0. /
572	return `1`;
573	result = fz_clamp(result, -FLT_MAX, FLT_MAX);
574	return result;
575	}
576
577	/*
578	atoi that copes with NULL
579	*/
580	int fz_atoi(const char *s)
581	{
582	if (s == NULL)
583	return `0`;
584	return atoi(s);
585	}
586
587	int64_t fz_atoi64(const char *s)
588	{
589	if (s == NULL)
590	return `0`;
591	return atoll(s);
592	}
593
594	/*
595	Check and parse string into page ranges:
596	( ','? ([0-9]+\|'N') ( '-' ([0-9]+\|N) )? )+
597	*/
598	int fz_is_page_range(fz_context ctx, const* char *s)
599	{
600	/ TODO: check the actual syntax... /
601	while (*s)
602	{
603	if ((s < `'0'` \|\| s > `'9'`) && s != `'N'` && s != `'-'` && *s != `','`)
604	return `0`;
605	s++;
606	}
607	return `1`;
608	}
609
610	const char fz_parse_page_range(fz_context ctx, const char s, int* a, int* b, int* n)
611	{
612	if (!s \|\| !s[`0`])
613	return NULL;
614
615	if (s[`0`] == `','`)
616	s += `1`;
617
618	if (s[`0`] == `'N'`)
619	{
620	*a = n;
621	s += `1`;
622	}
623	else
624	a = strtol(s, (char***)&s, `10`);
625
626	if (s[`0`] == `'-'`)
627	{
628	if (s[`1`] == `'N'`)
629	{
630	*b = n;
631	s += `2`;
632	}
633	else
634	b = strtol(s+`1`, (char***)&s, `10`);
635	}
636	else
637	b = a;
638
639	a = fz_clampi(a, `1`, n);
640	b = fz_clampi(b, `1`, n);
641
642	return s;
643	}
644
645	/ memmem from musl /
646
647	#define MAX(a,b) ((a)>(b)?(a):(b))
648
649	#define BITOP(a,b,op) \
650	((a)[(size_t)(b)/(8sizeof (a))] op (size_t)1<<((size_t)(b)%(8sizeof (a))))
651
652	static char twobyte_memmem(const* unsigned char h, size_t k, const* unsigned char *n)
653	{
654	uint16_t nw = n[`0`]<<`8` \| n[`1`], hw = h[`0`]<<`8` \| h[`1`];
655	for (h++, k--; k; k--, hw = hw<<`8` \| *++h)
656	if (hw == nw) return (char *)h-`1`;
657	return `0`;
658	}
659
660	static char threebyte_memmem(const* unsigned char h, size_t k, const* unsigned char *n)
661	{
662	uint32_t nw = n[`0`]<<`24` \| n[`1`]<<`16` \| n[`2`]<<`8`;
663	uint32_t hw = h[`0`]<<`24` \| h[`1`]<<`16` \| h[`2`]<<`8`;
664	for (h+=`2`, k-=`2`; k; k--, hw = (hw\|*++h)<<`8`)
665	if (hw == nw) return (char *)h-`2`;
666	return `0`;
667	}
668
669	static char fourbyte_memmem(const* unsigned char h, size_t k, const* unsigned char *n)
670	{
671	uint32_t nw = n[`0`]<<`24` \| n[`1`]<<`16` \| n[`2`]<<`8` \| n[`3`];
672	uint32_t hw = h[`0`]<<`24` \| h[`1`]<<`16` \| h[`2`]<<`8` \| h[`3`];
673	for (h+=`3`, k-=`3`; k; k--, hw = hw<<`8` \| *++h)
674	if (hw == nw) return (char *)h-`3`;
675	return `0`;
676	}
677
678	static char twoway_memmem(const* unsigned char h, const* unsigned char z, const* unsigned char *n, size_t l)
679	{
680	size_t i, ip, jp, k, p, ms, p0, mem, mem0;
681	size_t byteset[`32` / sizeof(size_t)] = { `0` };
682	size_t shift[`256`];
683
684	/ Computing length of needle and fill shift table /
685	for (i=`0`; i<l; i++)
686	BITOP(byteset, n[i], \|=), shift[n[i]] = i+`1`;
687
688	/ Compute maximal suffix /
689	ip = -`1`; jp = `0`; k = p = `1`;
690	while (jp+k<l) {
691	if (n[ip+k] == n[jp+k]) {
692	if (k == p) {
693	jp += p;
694	k = `1`;
695	} else k++;
696	} else if (n[ip+k] > n[jp+k]) {
697	jp += k;
698	k = `1`;
699	p = jp - ip;
700	} else {
701	ip = jp++;
702	k = p = `1`;
703	}
704	}
705	ms = ip;
706	p0 = p;
707
708	/ And with the opposite comparison /
709	ip = -`1`; jp = `0`; k = p = `1`;
710	while (jp+k<l) {
711	if (n[ip+k] == n[jp+k]) {
712	if (k == p) {
713	jp += p;
714	k = `1`;
715	} else k++;
716	} else if (n[ip+k] < n[jp+k]) {
717	jp += k;
718	k = `1`;
719	p = jp - ip;
720	} else {
721	ip = jp++;
722	k = p = `1`;
723	}
724	}
725	if (ip+`1` > ms+`1`) ms = ip;
726	else p = p0;
727
728	/ Periodic needle? /
729	if (memcmp(n, n+p, ms+`1`)) {
730	mem0 = `0`;
731	p = MAX(ms, l-ms-`1`) + `1`;
732	} else mem0 = l-p;
733	mem = `0`;
734
735	/ Search loop /
736	for (;;) {
737	/ If remainder of haystack is shorter than needle, done /
738	if (z-h < l) return `0`;
739
740	/ Check last byte first; advance by shift on mismatch /
741	if (BITOP(byteset, h[l-`1`], &)) {
742	k = l-shift[h[l-`1`]];
743	if (k) {
744	if (mem0 && mem && k < p) k = l-p;
745	h += k;
746	mem = `0`;
747	continue;
748	}
749	} else {
750	h += l;
751	mem = `0`;
752	continue;
753	}
754
755	/ Compare right half /
756	for (k=MAX(ms+`1`,mem); k<l && n[k] == h[k]; k++);
757	if (k < l) {
758	h += k-ms;
759	mem = `0`;
760	continue;
761	}
762	/ Compare left half /
763	for (k=ms+`1`; k>mem && n[k-`1`] == h[k-`1`]; k--);
764	if (k <= mem) return (char *)h;
765	h += p;
766	mem = mem0;
767	}
768	}
769
770	/*
771	Find the start of the first occurrence of the substring needle in haystack.
772	*/
773	void fz_memmem(const* void h0, size_t k, const* void *n0, size_t l)
774	{
775	const unsigned char h = h0, n = n0;
776
777	/ Return immediately on empty needle /
778	if (!l) return (void *)h;
779
780	/ Return immediately when needle is longer than haystack /
781	if (k<l) return `0`;
782
783	/ Use faster algorithms for short needles /
784	h = memchr(h0, *n, k);
785	if (!h \|\| l==`1`) return (void *)h;
786	k -= h - (const unsigned char *)h0;
787	if (k<l) return `0`;
788	if (l==`2`) return twobyte_memmem(h, k, n);
789	if (l==`3`) return threebyte_memmem(h, k, n);
790	if (l==`4`) return fourbyte_memmem(h, k, n);
791
792	return twoway_memmem(h, h+k, n, l);
793	}
794

Browse the source code of MuPDF/source/fitz/string.c