1#include "mupdf/fitz.h"
2
3#include <string.h>
4#include <errno.h>
5#include <math.h>
6#include <float.h>
7#include <stdlib.h>
8
9static inline int
10fz_tolower(int c)
11{
12 if (c >= 'A' && c <= 'Z')
13 return c + 32;
14 return c;
15}
16
17/*
18 Return strlen(s), if that is less than maxlen, or maxlen if
19 there is no null byte ('\0') among the first maxlen bytes.
20*/
21size_t
22fz_strnlen(const char *s, size_t n)
23{
24 const char *p = memchr(s, 0, n);
25 return p ? p - s : n;
26}
27
28int
29fz_strncasecmp(const char *a, const char *b, int n)
30{
31 if (!n--)
32 return 0;
33 for (; *a && *b && n && (*a == *b || fz_tolower(*a) == fz_tolower(*b)); a++, b++, n--)
34 ;
35 return fz_tolower(*a) - fz_tolower(*b);
36}
37
38/*
39 Case insensitive (ASCII only) string comparison.
40*/
41int
42fz_strcasecmp(const char *a, const char *b)
43{
44 while (fz_tolower(*a) == fz_tolower(*b))
45 {
46 if (*a++ == 0)
47 return 0;
48 b++;
49 }
50 return fz_tolower(*a) - fz_tolower(*b);
51}
52
53/*
54 Given a pointer to a C string (or a pointer to NULL) break
55 it at the first occurrence of a delimiter char (from a given set).
56
57 stringp: Pointer to a C string pointer (or NULL). Updated on exit to
58 point to the first char of the string after the delimiter that was
59 found. The string pointed to by stringp will be corrupted by this
60 call (as the found delimiter will be overwritten by 0).
61
62 delim: A C string of acceptable delimiter characters.
63
64 Returns a pointer to a C string containing the chars of stringp up
65 to the first delimiter char (or the end of the string), or NULL.
66*/
67char *
68fz_strsep(char **stringp, const char *delim)
69{
70 char *ret = *stringp;
71 if (!ret) return NULL;
72 if ((*stringp = strpbrk(*stringp, delim)) != NULL)
73 *((*stringp)++) = '\0';
74 return ret;
75}
76
77/*
78 Copy at most n-1 chars of a string into a destination
79 buffer with null termination, returning the real length of the
80 initial string (excluding terminator).
81
82 dst: Destination buffer, at least n bytes long.
83
84 src: C string (non-NULL).
85
86 n: Size of dst buffer in bytes.
87
88 Returns the length (excluding terminator) of src.
89*/
90size_t
91fz_strlcpy(char *dst, const char *src, size_t siz)
92{
93 register char *d = dst;
94 register const char *s = src;
95 register size_t n = siz;
96
97 /* Copy as many bytes as will fit */
98 if (n != 0 && --n != 0) {
99 do {
100 if ((*d++ = *s++) == 0)
101 break;
102 } while (--n != 0);
103 }
104
105 /* Not enough room in dst, add NUL and traverse rest of src */
106 if (n == 0) {
107 if (siz != 0)
108 *d = '\0'; /* NUL-terminate dst */
109 while (*s++)
110 ;
111 }
112
113 return(s - src - 1); /* count does not include NUL */
114}
115
116/*
117 Concatenate 2 strings, with a maximum length.
118
119 dst: pointer to first string in a buffer of n bytes.
120
121 src: pointer to string to concatenate.
122
123 n: Size (in bytes) of buffer that dst is in.
124
125 Returns the real length that a concatenated dst + src would have been
126 (not including terminator).
127*/
128size_t
129fz_strlcat(char *dst, const char *src, size_t siz)
130{
131 register char *d = dst;
132 register const char *s = src;
133 register size_t n = siz;
134 size_t dlen;
135
136 /* Find the end of dst and adjust bytes left but don't go past end */
137 while (*d != '\0' && n-- != 0)
138 d++;
139 dlen = d - dst;
140 n = siz - dlen;
141
142 if (n == 0)
143 return dlen + strlen(s);
144 while (*s != '\0') {
145 if (n != 1) {
146 *d++ = *s;
147 n--;
148 }
149 s++;
150 }
151 *d = '\0';
152
153 return dlen + (s - src); /* count does not include NUL */
154}
155
156/*
157 extract the directory component from a path.
158*/
159void
160fz_dirname(char *dir, const char *path, size_t n)
161{
162 size_t i;
163
164 if (!path || !path[0])
165 {
166 fz_strlcpy(dir, ".", n);
167 return;
168 }
169
170 fz_strlcpy(dir, path, n);
171
172 i = strlen(dir);
173 for(; dir[i] == '/'; --i) if (!i) { fz_strlcpy(dir, "/", n); return; }
174 for(; dir[i] != '/'; --i) if (!i) { fz_strlcpy(dir, ".", n); return; }
175 for(; dir[i] == '/'; --i) if (!i) { fz_strlcpy(dir, "/", n); return; }
176 dir[i+1] = 0;
177}
178
179static inline int ishex(int a)
180{
181 return (a >= 'A' && a <= 'F') ||
182 (a >= 'a' && a <= 'f') ||
183 (a >= '0' && a <= '9');
184}
185
186static inline int tohex(int c)
187{
188 if (c >= '0' && c <= '9') return c - '0';
189 if (c >= 'a' && c <= 'f') return c - 'a' + 0xA;
190 if (c >= 'A' && c <= 'F') return c - 'A' + 0xA;
191 return 0;
192}
193
194/*
195 decode url escapes.
196*/
197char *
198fz_urldecode(char *url)
199{
200 char *s = url;
201 char *p = url;
202 while (*s)
203 {
204 int c = (unsigned char) *s++;
205 if (c == '%' && ishex(s[0]) && ishex(s[1]))
206 {
207 int a = tohex(*s++);
208 int b = tohex(*s++);
209 *p++ = a << 4 | b;
210 }
211 else
212 {
213 *p++ = c;
214 }
215 }
216 *p = 0;
217 return url;
218}
219
220/*
221 create output file name using a template.
222
223 If the path contains %[0-9]*d, the first such pattern will be replaced
224 with the page number. If the template does not contain such a pattern, the page
225 number will be inserted before the filename extension. If the template does not have
226 a filename extension, the page number will be added to the end.
227*/
228void
229fz_format_output_path(fz_context *ctx, char *path, size_t size, const char *fmt, int page)
230{
231 const char *s, *p;
232 char num[40];
233 int i, n;
234 int z = 0;
235
236 for (i = 0; page; page /= 10)
237 num[i++] = '0' + page % 10;
238 num[i] = 0;
239
240 s = p = strchr(fmt, '%');
241 if (p)
242 {
243 ++p;
244 while (*p >= '0' && *p <= '9')
245 z = z * 10 + (*p++ - '0');
246 }
247 if (p && *p == 'd')
248 {
249 ++p;
250 }
251 else
252 {
253 s = p = strrchr(fmt, '.');
254 if (!p)
255 s = p = fmt + strlen(fmt);
256 }
257
258 if (z < 1)
259 z = 1;
260 while (i < z && i < sizeof num)
261 num[i++] = '0';
262 n = s - fmt;
263 if (n + i + strlen(p) >= size)
264 fz_throw(ctx, FZ_ERROR_GENERIC, "path name buffer overflow");
265 memcpy(path, fmt, n);
266 while (i > 0)
267 path[n++] = num[--i];
268 fz_strlcpy(path + n, p, size - n);
269}
270
271#define SEP(x) ((x)=='/' || (x) == 0)
272
273/*
274 rewrite path to the shortest string that names the same path.
275
276 Eliminates multiple and trailing slashes, interprets "." and "..".
277 Overwrites the string in place.
278*/
279char *
280fz_cleanname(char *name)
281{
282 char *p, *q, *dotdot;
283 int rooted;
284
285 rooted = name[0] == '/';
286
287 /*
288 * invariants:
289 * p points at beginning of path element we're considering.
290 * q points just past the last path element we wrote (no slash).
291 * dotdot points just past the point where .. cannot backtrack
292 * any further (no slash).
293 */
294 p = q = dotdot = name + rooted;
295 while (*p)
296 {
297 if(p[0] == '/') /* null element */
298 p++;
299 else if (p[0] == '.' && SEP(p[1]))
300 p += 1; /* don't count the separator in case it is nul */
301 else if (p[0] == '.' && p[1] == '.' && SEP(p[2]))
302 {
303 p += 2;
304 if (q > dotdot) /* can backtrack */
305 {
306 while(--q > dotdot && *q != '/')
307 ;
308 }
309 else if (!rooted) /* /.. is / but ./../ is .. */
310 {
311 if (q != name)
312 *q++ = '/';
313 *q++ = '.';
314 *q++ = '.';
315 dotdot = q;
316 }
317 }
318 else /* real path element */
319 {
320 if (q != name+rooted)
321 *q++ = '/';
322 while ((*q = *p) != '/' && *q != 0)
323 p++, q++;
324 }
325 }
326
327 if (q == name) /* empty string is really "." */
328 *q++ = '.';
329 *q = '\0';
330 return name;
331}
332
333enum
334{
335 UTFmax = 4, /* maximum bytes per rune */
336 Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */
337 Runeself = 0x80, /* rune and UTF sequences are the same (<) */
338 Runeerror = 0xFFFD, /* decoding error in UTF */
339 Runemax = 0x10FFFF, /* maximum rune value */
340};
341
342enum
343{
344 Bit1 = 7,
345 Bitx = 6,
346 Bit2 = 5,
347 Bit3 = 4,
348 Bit4 = 3,
349 Bit5 = 2,
350
351 T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
352 Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
353 T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
354 T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
355 T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
356 T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
357
358 Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */
359 Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */
360 Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */
361 Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0001 1111 1111 1111 1111 1111 */
362
363 Maskx = (1<<Bitx)-1, /* 0011 1111 */
364 Testx = Maskx ^ 0xFF, /* 1100 0000 */
365
366 Bad = Runeerror,
367};
368
369/*
370 UTF8 decode a single rune from a sequence of chars.
371
372 rune: Pointer to an int to assign the decoded 'rune' to.
373
374 str: Pointer to a UTF8 encoded string.
375
376 Returns the number of bytes consumed.
377*/
378int
379fz_chartorune(int *rune, const char *str)
380{
381 int c, c1, c2, c3;
382 int l;
383
384 /*
385 * one character sequence
386 * 00000-0007F => T1
387 */
388 c = *(const unsigned char*)str;
389 if(c < Tx) {
390 *rune = c;
391 return 1;
392 }
393
394 /*
395 * two character sequence
396 * 0080-07FF => T2 Tx
397 */
398 c1 = *(const unsigned char*)(str+1) ^ Tx;
399 if(c1 & Testx)
400 goto bad;
401 if(c < T3) {
402 if(c < T2)
403 goto bad;
404 l = ((c << Bitx) | c1) & Rune2;
405 if(l <= Rune1)
406 goto bad;
407 *rune = l;
408 return 2;
409 }
410
411 /*
412 * three character sequence
413 * 0800-FFFF => T3 Tx Tx
414 */
415 c2 = *(const unsigned char*)(str+2) ^ Tx;
416 if(c2 & Testx)
417 goto bad;
418 if(c < T4) {
419 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
420 if(l <= Rune2)
421 goto bad;
422 *rune = l;
423 return 3;
424 }
425
426 /*
427 * four character sequence (21-bit value)
428 * 10000-1FFFFF => T4 Tx Tx Tx
429 */
430 c3 = *(const unsigned char*)(str+3) ^ Tx;
431 if (c3 & Testx)
432 goto bad;
433 if (c < T5) {
434 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
435 if (l <= Rune3)
436 goto bad;
437 *rune = l;
438 return 4;
439 }
440 /*
441 * Support for 5-byte or longer UTF-8 would go here, but
442 * since we don't have that, we'll just fall through to bad.
443 */
444
445 /*
446 * bad decoding
447 */
448bad:
449 *rune = Bad;
450 return 1;
451}
452
453/*
454 UTF8 encode a rune to a sequence of chars.
455
456 str: Pointer to a place to put the UTF8 encoded character.
457
458 rune: Pointer to a 'rune'.
459
460 Returns the number of bytes the rune took to output.
461*/
462int
463fz_runetochar(char *str, int rune)
464{
465 /* Runes are signed, so convert to unsigned for range check. */
466 unsigned int c = (unsigned int)rune;
467
468 /*
469 * one character sequence
470 * 00000-0007F => 00-7F
471 */
472 if(c <= Rune1) {
473 str[0] = c;
474 return 1;
475 }
476
477 /*
478 * two character sequence
479 * 0080-07FF => T2 Tx
480 */
481 if(c <= Rune2) {
482 str[0] = T2 | (c >> 1*Bitx);
483 str[1] = Tx | (c & Maskx);
484 return 2;
485 }
486
487 /*
488 * If the Rune is out of range, convert it to the error rune.
489 * Do this test here because the error rune encodes to three bytes.
490 * Doing it earlier would duplicate work, since an out of range
491 * Rune wouldn't have fit in one or two bytes.
492 */
493 if (c > Runemax)
494 c = Runeerror;
495
496 /*
497 * three character sequence
498 * 0800-FFFF => T3 Tx Tx
499 */
500 if (c <= Rune3) {
501 str[0] = T3 | (c >> 2*Bitx);
502 str[1] = Tx | ((c >> 1*Bitx) & Maskx);
503 str[2] = Tx | (c & Maskx);
504 return 3;
505 }
506
507 /*
508 * four character sequence (21-bit value)
509 * 10000-1FFFFF => T4 Tx Tx Tx
510 */
511 str[0] = T4 | (c >> 3*Bitx);
512 str[1] = Tx | ((c >> 2*Bitx) & Maskx);
513 str[2] = Tx | ((c >> 1*Bitx) & Maskx);
514 str[3] = Tx | (c & Maskx);
515 return 4;
516}
517
518/*
519 Count how many chars are required to represent a rune.
520
521 rune: The rune to encode.
522
523 Returns the number of bytes required to represent this run in UTF8.
524*/
525int
526fz_runelen(int c)
527{
528 char str[10];
529 return fz_runetochar(str, c);
530}
531
532/*
533 Count how many runes the UTF-8 encoded string
534 consists of.
535
536 s: The UTF-8 encoded, NUL-terminated text string.
537
538 Returns the number of runes in the string.
539*/
540int
541fz_utflen(const char *s)
542{
543 int c, n, rune;
544 n = 0;
545 for(;;) {
546 c = *(const unsigned char*)s;
547 if(c < Runeself) {
548 if(c == 0)
549 return n;
550 s++;
551 } else
552 s += fz_chartorune(&rune, s);
553 n++;
554 }
555 return 0;
556}
557
558/*
559 Range checking atof
560*/
561float fz_atof(const char *s)
562{
563 float result;
564
565 if (s == NULL)
566 return 0;
567
568 errno = 0;
569 result = fz_strtof(s, NULL);
570 if ((errno == ERANGE && result == 0) || isnan(result))
571 /* Return 1.0 on underflow, as it's a small known value that won't cause a divide by 0. */
572 return 1;
573 result = fz_clamp(result, -FLT_MAX, FLT_MAX);
574 return result;
575}
576
577/*
578 atoi that copes with NULL
579*/
580int fz_atoi(const char *s)
581{
582 if (s == NULL)
583 return 0;
584 return atoi(s);
585}
586
587int64_t fz_atoi64(const char *s)
588{
589 if (s == NULL)
590 return 0;
591 return atoll(s);
592}
593
594/*
595 Check and parse string into page ranges:
596 ( ','? ([0-9]+|'N') ( '-' ([0-9]+|N) )? )+
597*/
598int fz_is_page_range(fz_context *ctx, const char *s)
599{
600 /* TODO: check the actual syntax... */
601 while (*s)
602 {
603 if ((*s < '0' || *s > '9') && *s != 'N' && *s != '-' && *s != ',')
604 return 0;
605 s++;
606 }
607 return 1;
608}
609
610const char *fz_parse_page_range(fz_context *ctx, const char *s, int *a, int *b, int n)
611{
612 if (!s || !s[0])
613 return NULL;
614
615 if (s[0] == ',')
616 s += 1;
617
618 if (s[0] == 'N')
619 {
620 *a = n;
621 s += 1;
622 }
623 else
624 *a = strtol(s, (char**)&s, 10);
625
626 if (s[0] == '-')
627 {
628 if (s[1] == 'N')
629 {
630 *b = n;
631 s += 2;
632 }
633 else
634 *b = strtol(s+1, (char**)&s, 10);
635 }
636 else
637 *b = *a;
638
639 *a = fz_clampi(*a, 1, n);
640 *b = fz_clampi(*b, 1, n);
641
642 return s;
643}
644
645/* memmem from musl */
646
647#define MAX(a,b) ((a)>(b)?(a):(b))
648
649#define BITOP(a,b,op) \
650 ((a)[(size_t)(b)/(8*sizeof *(a))] op (size_t)1<<((size_t)(b)%(8*sizeof *(a))))
651
652static char *twobyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
653{
654 uint16_t nw = n[0]<<8 | n[1], hw = h[0]<<8 | h[1];
655 for (h++, k--; k; k--, hw = hw<<8 | *++h)
656 if (hw == nw) return (char *)h-1;
657 return 0;
658}
659
660static char *threebyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
661{
662 uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8;
663 uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8;
664 for (h+=2, k-=2; k; k--, hw = (hw|*++h)<<8)
665 if (hw == nw) return (char *)h-2;
666 return 0;
667}
668
669static char *fourbyte_memmem(const unsigned char *h, size_t k, const unsigned char *n)
670{
671 uint32_t nw = n[0]<<24 | n[1]<<16 | n[2]<<8 | n[3];
672 uint32_t hw = h[0]<<24 | h[1]<<16 | h[2]<<8 | h[3];
673 for (h+=3, k-=3; k; k--, hw = hw<<8 | *++h)
674 if (hw == nw) return (char *)h-3;
675 return 0;
676}
677
678static char *twoway_memmem(const unsigned char *h, const unsigned char *z, const unsigned char *n, size_t l)
679{
680 size_t i, ip, jp, k, p, ms, p0, mem, mem0;
681 size_t byteset[32 / sizeof(size_t)] = { 0 };
682 size_t shift[256];
683
684 /* Computing length of needle and fill shift table */
685 for (i=0; i<l; i++)
686 BITOP(byteset, n[i], |=), shift[n[i]] = i+1;
687
688 /* Compute maximal suffix */
689 ip = -1; jp = 0; k = p = 1;
690 while (jp+k<l) {
691 if (n[ip+k] == n[jp+k]) {
692 if (k == p) {
693 jp += p;
694 k = 1;
695 } else k++;
696 } else if (n[ip+k] > n[jp+k]) {
697 jp += k;
698 k = 1;
699 p = jp - ip;
700 } else {
701 ip = jp++;
702 k = p = 1;
703 }
704 }
705 ms = ip;
706 p0 = p;
707
708 /* And with the opposite comparison */
709 ip = -1; jp = 0; k = p = 1;
710 while (jp+k<l) {
711 if (n[ip+k] == n[jp+k]) {
712 if (k == p) {
713 jp += p;
714 k = 1;
715 } else k++;
716 } else if (n[ip+k] < n[jp+k]) {
717 jp += k;
718 k = 1;
719 p = jp - ip;
720 } else {
721 ip = jp++;
722 k = p = 1;
723 }
724 }
725 if (ip+1 > ms+1) ms = ip;
726 else p = p0;
727
728 /* Periodic needle? */
729 if (memcmp(n, n+p, ms+1)) {
730 mem0 = 0;
731 p = MAX(ms, l-ms-1) + 1;
732 } else mem0 = l-p;
733 mem = 0;
734
735 /* Search loop */
736 for (;;) {
737 /* If remainder of haystack is shorter than needle, done */
738 if (z-h < l) return 0;
739
740 /* Check last byte first; advance by shift on mismatch */
741 if (BITOP(byteset, h[l-1], &)) {
742 k = l-shift[h[l-1]];
743 if (k) {
744 if (mem0 && mem && k < p) k = l-p;
745 h += k;
746 mem = 0;
747 continue;
748 }
749 } else {
750 h += l;
751 mem = 0;
752 continue;
753 }
754
755 /* Compare right half */
756 for (k=MAX(ms+1,mem); k<l && n[k] == h[k]; k++);
757 if (k < l) {
758 h += k-ms;
759 mem = 0;
760 continue;
761 }
762 /* Compare left half */
763 for (k=ms+1; k>mem && n[k-1] == h[k-1]; k--);
764 if (k <= mem) return (char *)h;
765 h += p;
766 mem = mem0;
767 }
768}
769
770/*
771 Find the start of the first occurrence of the substring needle in haystack.
772*/
773void *fz_memmem(const void *h0, size_t k, const void *n0, size_t l)
774{
775 const unsigned char *h = h0, *n = n0;
776
777 /* Return immediately on empty needle */
778 if (!l) return (void *)h;
779
780 /* Return immediately when needle is longer than haystack */
781 if (k<l) return 0;
782
783 /* Use faster algorithms for short needles */
784 h = memchr(h0, *n, k);
785 if (!h || l==1) return (void *)h;
786 k -= h - (const unsigned char *)h0;
787 if (k<l) return 0;
788 if (l==2) return twobyte_memmem(h, k, n);
789 if (l==3) return threebyte_memmem(h, k, n);
790 if (l==4) return fourbyte_memmem(h, k, n);
791
792 return twoway_memmem(h, h+k, n, l);
793}
794