1#include "mupdf/fitz.h"
2#include "mupdf/pdf.h"
3
4#include <string.h>
5
/*
	Case-label lists.

	Each macro expands to a character constant followed by further
	"case" labels, so it is used as:

		case IS_WHITE:

	with the leading "case" and the trailing colon supplied at the
	point of use.
*/

/* Contiguous character ranges. */
#define RANGE_0_7 \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
#define RANGE_0_9 \
	RANGE_0_7:case'8':case'9'
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'

/* Characters that can start a number: a sign, a decimal point or a digit. */
#define IS_NUMBER \
	'+':case'-':case'.':case RANGE_0_9

/* The six bytes this lexer treats as white space: NUL, TAB, LF, FF, CR, SPACE. */
#define IS_WHITE \
	'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'

/* Hexadecimal digits, either case. */
#define IS_HEX \
	RANGE_0_9:case RANGE_A_F:case RANGE_a_f

/* Bytes this lexer treats as delimiters, ending a name, keyword or number. */
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':case'{':\
	case'}':case'/':case'%'
29
/* #define DUMP_LEXER_STREAM */
#ifdef DUMP_LEXER_STREAM
/*
	Debugging variant of lex_byte: read one byte from the stream and
	echo it to the context's stdout before returning it.

	Bytes in the range 32..127 are echoed as-is, EOF is echoed as
	"<EOF>", and anything else as a two digit hex code in angle
	brackets.
*/
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	const int byte = fz_read_byte(ctx, stm);
	fz_output *out = fz_stdout(ctx);

	if (byte >= 32 && byte < 128)
		fz_write_printf(ctx, out, "%c", byte);
	else if (byte != EOF)
		fz_write_printf(ctx, out, "<%02x>", byte);
	else
		fz_write_printf(ctx, out, "<EOF>");

	return byte;
}
#else
/* Normal builds: lex_byte is just fz_read_byte. */
#define lex_byte(C,S) fz_read_byte(C,S)
#endif
47
/*
	Return 1 if ch is one of the six white space bytes recognised by
	the lexer (NUL, TAB, LF, FF, CR, SPACE), otherwise 0.
*/
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':
	case '\011':
	case '\012':
	case '\014':
	case '\015':
	case '\040':
		return 1;
	default:
		return 0;
	}
}
58
/*
	Return 1 if ch lies in the range ' ' .. '~' inclusive, otherwise 0.
	Locale independent.
*/
static inline int fz_isprint(int ch)
{
	return !(ch < ' ' || ch > '~');
}
63
/*
	Convert a hexadecimal digit (either case) to its value 0..15.
	Any other input yields 0.
*/
static inline int unhex(int ch)
{
	int value = 0;

	if ('0' <= ch && ch <= '9')
		value = ch - '0';
	else if ('a' <= ch && ch <= 'f')
		value = 0xA + (ch - 'a');
	else if ('A' <= ch && ch <= 'F')
		value = 0xA + (ch - 'A');

	return value;
}
71
72static void
73lex_white(fz_context *ctx, fz_stream *f)
74{
75 int c;
76 do {
77 c = lex_byte(ctx, f);
78 } while ((c <= 32) && (iswhite(c)));
79 if (c != EOF)
80 fz_unread_byte(ctx, f);
81}
82
83static void
84lex_comment(fz_context *ctx, fz_stream *f)
85{
86 int c;
87 do {
88 c = lex_byte(ctx, f);
89 } while ((c != '\012') && (c != '\015') && (c != EOF));
90}
91
/*
	Fast(ish) but inaccurate strtof, with Adobe overflow handling.

	Accepted syntax: any number of '-' signs (one or more makes the
	result negative), then any number of '+' signs, then decimal
	digits, optionally followed by '.' and more decimal digits.
	Parsing stops at the first character that does not fit; there is
	no error reporting and no exponent support.

	s: zero terminated string to parse.

	Returns the parsed value.

	Overflow of the integer part is deliberately ignored: the digits
	are accumulated with wrap-around. According to the original
	authors' tests Acrobat handles overflows in the same way. The
	accumulation is done in unsigned arithmetic so that the wrap-around
	is well defined (signed overflow is undefined behaviour in C). The
	conversion back to int is implementation-defined for values above
	INT_MAX; common compilers wrap modulo 2^N, giving the same result
	that wrapping int arithmetic would have produced.
*/
static float acrobat_compatible_atof(const char *s)
{
	unsigned int whole = 0;
	int neg = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* Wraps on overflow, by design (see above). */
		whole = whole * 10u + (unsigned int)(*s - '0');
		++s;
	}

	if (*s == '.')
	{
		float v = (float)(int)whole;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}

	/* No fractional part: negate in unsigned arithmetic so that a
	 * wrapped value of INT_MIN cannot trigger signed overflow. */
	return (float)(int)(neg ? 0u - whole : whole);
}
138
/*
	Fast but inaccurate atoi.

	Accepted syntax: any number of '-' signs (one or more makes the
	result negative), then any number of '+' signs, then decimal
	digits. Parsing stops at the first non-digit; there is no error
	reporting.

	s: zero terminated string to parse.

	Returns the parsed value. Overflow is deliberately ignored: the
	digits are accumulated with wrap-around. The accumulation and the
	negation are done in unsigned arithmetic so that the wrap-around
	is well defined (signed overflow is undefined behaviour in C). The
	final conversion to int is implementation-defined for values above
	INT_MAX; common compilers wrap modulo 2^N.
*/
static int fast_atoi(const char *s)
{
	unsigned int acc = 0;
	int neg = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. */
		acc = acc * 10u + (unsigned int)(*s - '0');
		++s;
	}

	return (int)(neg ? 0u - acc : acc);
}
164
165static int
166lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c)
167{
168 char *s = buf->scratch;
169 char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */
170 char *isreal = (c == '.' ? s : NULL);
171 int neg = (c == '-');
172 int isbad = 0;
173
174 *s++ = c;
175
176 c = lex_byte(ctx, f);
177
178 /* skip extra '-' signs at start of number */
179 if (neg)
180 {
181 while (c == '-')
182 c = lex_byte(ctx, f);
183 }
184
185 while (s < e)
186 {
187 switch (c)
188 {
189 case IS_WHITE:
190 case IS_DELIM:
191 fz_unread_byte(ctx, f);
192 goto end;
193 case EOF:
194 goto end;
195 case '.':
196 if (isreal)
197 isbad = 1;
198 isreal = s;
199 *s++ = c;
200 break;
201 case RANGE_0_9:
202 *s++ = c;
203 break;
204 default:
205 isbad = 1;
206 *s++ = c;
207 break;
208 }
209 c = lex_byte(ctx, f);
210 }
211
212end:
213 *s = '\0';
214 if (isbad)
215 return PDF_TOK_ERROR;
216 if (isreal)
217 {
218 /* We'd like to use the fastest possible atof
219 * routine, but we'd rather match acrobats
220 * handling of broken numbers. As such, we
221 * spot common broken cases and call an
222 * acrobat compatible routine where required. */
223 if (neg > 1 || isreal - buf->scratch >= 10)
224 buf->f = acrobat_compatible_atof(buf->scratch);
225 else
226 buf->f = fz_atof(buf->scratch);
227 return PDF_TOK_REAL;
228 }
229 else
230 {
231 buf->i = fast_atoi(buf->scratch);
232 return PDF_TOK_INT;
233 }
234}
235
236static void
237lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
238{
239 char *s = lb->scratch;
240 char *e = s + fz_mini(127, lb->size);
241 int c;
242
243 while (1)
244 {
245 if (s == e)
246 {
247 if (e - lb->scratch < 127)
248 {
249 s += pdf_lexbuf_grow(ctx, lb);
250 e = lb->scratch + fz_mini(127, lb->size);
251 }
252 else
253 {
254 /* truncate names that are too long */
255 fz_warn(ctx, "name is too long");
256 *s = 0;
257 lb->len = s - lb->scratch;
258 s = NULL;
259 }
260 }
261 c = lex_byte(ctx, f);
262 switch (c)
263 {
264 case IS_WHITE:
265 case IS_DELIM:
266 fz_unread_byte(ctx, f);
267 goto end;
268 case EOF:
269 goto end;
270 case '#':
271 {
272 int hex[2];
273 int i;
274 for (i = 0; i < 2; i++)
275 {
276 c = fz_peek_byte(ctx, f);
277 switch (c)
278 {
279 case RANGE_0_9:
280 if (i == 1 && c == '0' && hex[0] == 0)
281 goto illegal;
282 hex[i] = lex_byte(ctx, f) - '0';
283 break;
284 case RANGE_a_f:
285 hex[i] = lex_byte(ctx, f) - 'a' + 10;
286 break;
287 case RANGE_A_F:
288 hex[i] = lex_byte(ctx, f) - 'A' + 10;
289 break;
290 default:
291 case EOF:
292 goto illegal;
293 }
294 }
295 if (s) *s++ = (hex[0] << 4) + hex[1];
296 break;
297illegal:
298 if (i == 1)
299 fz_unread_byte(ctx, f);
300 if (s) *s++ = '#';
301 continue;
302 }
303 default:
304 if (s) *s++ = c;
305 break;
306 }
307 }
308end:
309 if (s)
310 {
311 *s = '\0';
312 lb->len = s - lb->scratch;
313 }
314}
315
316static int
317lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
318{
319 char *s = lb->scratch;
320 char *e = s + lb->size;
321 int bal = 1;
322 int oct;
323 int c;
324
325 while (1)
326 {
327 if (s == e)
328 {
329 s += pdf_lexbuf_grow(ctx, lb);
330 e = lb->scratch + lb->size;
331 }
332 c = lex_byte(ctx, f);
333 switch (c)
334 {
335 case EOF:
336 return PDF_TOK_ERROR;
337 case '(':
338 bal++;
339 *s++ = c;
340 break;
341 case ')':
342 bal --;
343 if (bal == 0)
344 goto end;
345 *s++ = c;
346 break;
347 case '\\':
348 c = lex_byte(ctx, f);
349 switch (c)
350 {
351 case EOF:
352 return PDF_TOK_ERROR;
353 case 'n':
354 *s++ = '\n';
355 break;
356 case 'r':
357 *s++ = '\r';
358 break;
359 case 't':
360 *s++ = '\t';
361 break;
362 case 'b':
363 *s++ = '\b';
364 break;
365 case 'f':
366 *s++ = '\f';
367 break;
368 case '(':
369 *s++ = '(';
370 break;
371 case ')':
372 *s++ = ')';
373 break;
374 case '\\':
375 *s++ = '\\';
376 break;
377 case RANGE_0_7:
378 oct = c - '0';
379 c = lex_byte(ctx, f);
380 if (c >= '0' && c <= '7')
381 {
382 oct = oct * 8 + (c - '0');
383 c = lex_byte(ctx, f);
384 if (c >= '0' && c <= '7')
385 oct = oct * 8 + (c - '0');
386 else if (c != EOF)
387 fz_unread_byte(ctx, f);
388 }
389 else if (c != EOF)
390 fz_unread_byte(ctx, f);
391 *s++ = oct;
392 break;
393 case '\n':
394 break;
395 case '\r':
396 c = lex_byte(ctx, f);
397 if ((c != '\n') && (c != EOF))
398 fz_unread_byte(ctx, f);
399 break;
400 default:
401 *s++ = c;
402 }
403 break;
404 default:
405 *s++ = c;
406 break;
407 }
408 }
409end:
410 lb->len = s - lb->scratch;
411 return PDF_TOK_STRING;
412}
413
414static int
415lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
416{
417 char *s = lb->scratch;
418 char *e = s + lb->size;
419 int a = 0, x = 0;
420 int c;
421
422 while (1)
423 {
424 if (s == e)
425 {
426 s += pdf_lexbuf_grow(ctx, lb);
427 e = lb->scratch + lb->size;
428 }
429 c = lex_byte(ctx, f);
430 switch (c)
431 {
432 case IS_WHITE:
433 break;
434 default:
435 fz_warn(ctx, "invalid character in hex string");
436 /* fall through */
437 case IS_HEX:
438 if (x)
439 {
440 *s++ = a * 16 + unhex(c);
441 x = !x;
442 }
443 else
444 {
445 a = unhex(c);
446 x = !x;
447 }
448 break;
449 case '>':
450 if (x)
451 {
452 *s++ = a * 16; /* pad truncated string with '0' */
453 }
454 goto end;
455 case EOF:
456 return PDF_TOK_ERROR;
457 }
458 }
459end:
460 lb->len = s - lb->scratch;
461 return PDF_TOK_STRING;
462}
463
464static pdf_token
465pdf_token_from_keyword(char *key)
466{
467 switch (*key)
468 {
469 case 'R':
470 if (!strcmp(key, "R")) return PDF_TOK_R;
471 break;
472 case 't':
473 if (!strcmp(key, "true")) return PDF_TOK_TRUE;
474 if (!strcmp(key, "trailer")) return PDF_TOK_TRAILER;
475 break;
476 case 'f':
477 if (!strcmp(key, "false")) return PDF_TOK_FALSE;
478 break;
479 case 'n':
480 if (!strcmp(key, "null")) return PDF_TOK_NULL;
481 break;
482 case 'o':
483 if (!strcmp(key, "obj")) return PDF_TOK_OBJ;
484 break;
485 case 'e':
486 if (!strcmp(key, "endobj")) return PDF_TOK_ENDOBJ;
487 if (!strcmp(key, "endstream")) return PDF_TOK_ENDSTREAM;
488 break;
489 case 's':
490 if (!strcmp(key, "stream")) return PDF_TOK_STREAM;
491 if (!strcmp(key, "startxref")) return PDF_TOK_STARTXREF;
492 break;
493 case 'x':
494 if (!strcmp(key, "xref")) return PDF_TOK_XREF;
495 break;
496 }
497
498 while (*key)
499 {
500 if (!fz_isprint(*key))
501 return PDF_TOK_ERROR;
502 ++key;
503 }
504
505 return PDF_TOK_KEYWORD;
506}
507
508void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size)
509{
510 lb->size = lb->base_size = size;
511 lb->len = 0;
512 lb->scratch = &lb->buffer[0];
513}
514
515void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb)
516{
517 if (lb && lb->size != lb->base_size)
518 fz_free(ctx, lb->scratch);
519}
520
521ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb)
522{
523 char *old = lb->scratch;
524 int newsize = lb->size * 2;
525 if (lb->size == lb->base_size)
526 {
527 lb->scratch = fz_malloc(ctx, newsize);
528 memcpy(lb->scratch, lb->buffer, lb->size);
529 }
530 else
531 {
532 lb->scratch = fz_realloc(ctx, lb->scratch, newsize);
533 }
534 lb->size = newsize;
535 return lb->scratch - old;
536}
537
538pdf_token
539pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
540{
541 while (1)
542 {
543 int c = lex_byte(ctx, f);
544 switch (c)
545 {
546 case EOF:
547 return PDF_TOK_EOF;
548 case IS_WHITE:
549 lex_white(ctx, f);
550 break;
551 case '%':
552 lex_comment(ctx, f);
553 break;
554 case '/':
555 lex_name(ctx, f, buf);
556 return PDF_TOK_NAME;
557 case '(':
558 return lex_string(ctx, f, buf);
559 case ')':
560 return PDF_TOK_ERROR;
561 case '<':
562 c = lex_byte(ctx, f);
563 if (c == '<')
564 return PDF_TOK_OPEN_DICT;
565 if (c != EOF)
566 fz_unread_byte(ctx, f);
567 return lex_hex_string(ctx, f, buf);
568 case '>':
569 c = lex_byte(ctx, f);
570 if (c == '>')
571 return PDF_TOK_CLOSE_DICT;
572 if (c != EOF)
573 fz_unread_byte(ctx, f);
574 return PDF_TOK_ERROR;
575 case '[':
576 return PDF_TOK_OPEN_ARRAY;
577 case ']':
578 return PDF_TOK_CLOSE_ARRAY;
579 case '{':
580 return PDF_TOK_OPEN_BRACE;
581 case '}':
582 return PDF_TOK_CLOSE_BRACE;
583 case IS_NUMBER:
584 return lex_number(ctx, f, buf, c);
585 default: /* isregular: !isdelim && !iswhite && c != EOF */
586 fz_unread_byte(ctx, f);
587 lex_name(ctx, f, buf);
588 return pdf_token_from_keyword(buf->scratch);
589 }
590 }
591}
592
593pdf_token
594pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf)
595{
596 while (1)
597 {
598 int c = lex_byte(ctx, f);
599 switch (c)
600 {
601 case EOF:
602 return PDF_TOK_EOF;
603 case IS_WHITE:
604 lex_white(ctx, f);
605 break;
606 case '%':
607 lex_comment(ctx, f);
608 break;
609 case '/':
610 lex_name(ctx, f, buf);
611 return PDF_TOK_NAME;
612 case '(':
613 return PDF_TOK_ERROR; /* no strings allowed */
614 case ')':
615 return PDF_TOK_ERROR; /* no strings allowed */
616 case '<':
617 c = lex_byte(ctx, f);
618 if (c == '<')
619 return PDF_TOK_OPEN_DICT;
620 if (c != EOF)
621 fz_unread_byte(ctx, f);
622 return PDF_TOK_ERROR; /* no strings allowed */
623 case '>':
624 c = lex_byte(ctx, f);
625 if (c == '>')
626 return PDF_TOK_CLOSE_DICT;
627 if (c != EOF)
628 fz_unread_byte(ctx, f);
629 return PDF_TOK_ERROR;
630 case '[':
631 return PDF_TOK_OPEN_ARRAY;
632 case ']':
633 return PDF_TOK_CLOSE_ARRAY;
634 case '{':
635 return PDF_TOK_OPEN_BRACE;
636 case '}':
637 return PDF_TOK_CLOSE_BRACE;
638 case IS_NUMBER:
639 return lex_number(ctx, f, buf, c);
640 default: /* isregular: !isdelim && !iswhite && c != EOF */
641 fz_unread_byte(ctx, f);
642 lex_name(ctx, f, buf);
643 return pdf_token_from_keyword(buf->scratch);
644 }
645 }
646}
647
648/*
649 print a lexed token to a buffer, growing if necessary
650*/
651void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf)
652{
653 switch (tok)
654 {
655 case PDF_TOK_NAME:
656 fz_append_printf(ctx, fzbuf, "/%s", buf->scratch);
657 break;
658 case PDF_TOK_STRING:
659 if (buf->len >= buf->size)
660 pdf_lexbuf_grow(ctx, buf);
661 buf->scratch[buf->len] = 0;
662 fz_append_pdf_string(ctx, fzbuf, buf->scratch);
663 break;
664 case PDF_TOK_OPEN_DICT:
665 fz_append_string(ctx, fzbuf, "<<");
666 break;
667 case PDF_TOK_CLOSE_DICT:
668 fz_append_string(ctx, fzbuf, ">>");
669 break;
670 case PDF_TOK_OPEN_ARRAY:
671 fz_append_byte(ctx, fzbuf, '[');
672 break;
673 case PDF_TOK_CLOSE_ARRAY:
674 fz_append_byte(ctx, fzbuf, ']');
675 break;
676 case PDF_TOK_OPEN_BRACE:
677 fz_append_byte(ctx, fzbuf, '{');
678 break;
679 case PDF_TOK_CLOSE_BRACE:
680 fz_append_byte(ctx, fzbuf, '}');
681 break;
682 case PDF_TOK_INT:
683 fz_append_printf(ctx, fzbuf, "%ld", buf->i);
684 break;
685 case PDF_TOK_REAL:
686 fz_append_printf(ctx, fzbuf, "%g", buf->f);
687 break;
688 default:
689 fz_append_data(ctx, fzbuf, buf->scratch, buf->len);
690 break;
691 }
692}
693