1 | #include "mupdf/fitz.h" |
2 | #include "mupdf/pdf.h" |
3 | |
4 | #include <string.h> |
5 | |
/*
 * Character-class helpers for switch statements.
 *
 * Each macro expands to a character constant followed by further
 * "case" labels, so it has to be written directly after the keyword
 * "case" and be followed by a colon, e.g. "case IS_WHITE:".
 */

/* Characters that may start a number: sign, decimal point or digit. */
#define IS_NUMBER \
	'+':case'-':case'.':case'0':case'1':case'2':case'3':\
	case'4':case'5':case'6':case'7':case'8':case'9'
/* NUL, tab, line feed, form feed, carriage return and space. */
#define IS_WHITE \
	'\x00':case'\x09':case'\x0a':case'\x0c':case'\x0d':case'\x20'
/* Hexadecimal digits in either letter case. */
#define IS_HEX \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':\
	case'7':case'8':case'9':case'A':case'B':case'C':\
	case'D':case'E':case'F':case'a':case'b':case'c':\
	case'd':case'e':case'f'
/* Delimiter characters; these end a number or a name. */
#define IS_DELIM \
	'(':case')':case'<':case'>':case'[':case']':case'{':\
	case'}':case'/':case'%'

/* Decimal digits. */
#define RANGE_0_9 \
	'0':case'1':case'2':case'3':case'4':case'5':\
	case'6':case'7':case'8':case'9'
/* Lower-case hexadecimal letters. */
#define RANGE_a_f \
	'a':case'b':case'c':case'd':case'e':case'f'
/* Upper-case hexadecimal letters. */
#define RANGE_A_F \
	'A':case'B':case'C':case'D':case'E':case'F'
/* Octal digits. */
#define RANGE_0_7 \
	'0':case'1':case'2':case'3':case'4':case'5':case'6':case'7'
29 | |
/* #define DUMP_LEXER_STREAM */
#ifdef DUMP_LEXER_STREAM
/*
 * Debug build of lex_byte: reads one byte from the stream and echoes it
 * to stdout before handing it back. Bytes in the range 32..127 are
 * echoed verbatim, other bytes as "<xx>" hex, and end of file as "<EOF>".
 */
static inline int lex_byte(fz_context *ctx, fz_stream *stm)
{
	int byte = fz_read_byte(ctx, stm);

	if (byte == EOF)
	{
		fz_write_printf(ctx, fz_stdout(ctx), "<EOF>");
		return byte;
	}

	if (byte < 32 || byte >= 128)
		fz_write_printf(ctx, fz_stdout(ctx), "<%02x>", byte);
	else
		fz_write_printf(ctx, fz_stdout(ctx), "%c", byte);

	return byte;
}
#else
/* Normal build: lex_byte is simply fz_read_byte. */
#define lex_byte(C,S) fz_read_byte(C,S)
#endif
47 | |
/*
 * Return 1 if ch is one of the six whitespace bytes recognised by the
 * lexer (NUL, tab, line feed, form feed, carriage return, space),
 * otherwise 0.
 */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case 0x00:
	case 0x09:
	case 0x0a:
	case 0x0c:
	case 0x0d:
	case 0x20:
		return 1;
	default:
		return 0;
	}
}
58 | |
/* Return 1 if ch lies in the range ' ' .. '~' inclusive, otherwise 0. */
static inline int fz_isprint(int ch)
{
	return !(ch < ' ' || ch > '~');
}
63 | |
/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Any character that is not a hex digit yields 0.
 */
static inline int unhex(int ch)
{
	int offset;

	if (ch >= 'a' && ch <= 'f')
		offset = 'a' - 0xA;
	else if (ch >= 'A' && ch <= 'F')
		offset = 'A' - 0xA;
	else if (ch >= '0' && ch <= '9')
		offset = '0';
	else
		return 0;

	return ch - offset;
}
71 | |
72 | static void |
73 | lex_white(fz_context *ctx, fz_stream *f) |
74 | { |
75 | int c; |
76 | do { |
77 | c = lex_byte(ctx, f); |
78 | } while ((c <= 32) && (iswhite(c))); |
79 | if (c != EOF) |
80 | fz_unread_byte(ctx, f); |
81 | } |
82 | |
83 | static void |
84 | (fz_context *ctx, fz_stream *f) |
85 | { |
86 | int c; |
87 | do { |
88 | c = lex_byte(ctx, f); |
89 | } while ((c != '\012') && (c != '\015') && (c != EOF)); |
90 | } |
91 | |
/*
 * Fast(ish) but inaccurate strtof, with Adobe overflow handling.
 *
 * s: NUL-terminated text of the form [-...][+...]digits[.digits].
 *    Any number of leading '-' signs (one or more makes the result
 *    negative) may be followed by any number of '+' signs. Parsing
 *    stops at the first character that does not fit this pattern.
 *
 * Returns the parsed value. The integer part is accumulated in an
 * int-sized counter that wraps around on overflow rather than
 * saturating.
 */
static float acrobat_compatible_atof(char *s)
{
	int neg = 0;
	/* Unsigned accumulator: wrap-around is well defined for unsigned
	 * arithmetic, whereas signed overflow is undefined behaviour. */
	unsigned int u = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here.
		 * Tests show that Acrobat handles overflows
		 * in exactly the same way we do:
		 * 123450000000000000000678 is read as 678.
		 */
		u = u * 10u + (unsigned int)(*s - '0');
		++s;
	}

	if (*s == '.')
	{
		/* Converting an out-of-range unsigned value to int is
		 * implementation-defined; it is expected to wrap. */
		float v = (int)u;
		float n = 0;
		float d = 1;
		++s;
		while (*s >= '0' && *s <= '9')
		{
			n = 10 * n + (*s - '0');
			d = 10 * d;
			++s;
		}
		v += n / d;
		return neg ? -v : v;
	}
	else
	{
		/* Negate while still unsigned so that the most negative
		 * int cannot trigger signed overflow. */
		return (int)(neg ? 0u - u : u);
	}
}
138 | |
/*
 * Fast but inaccurate atoi.
 *
 * s: NUL-terminated text of the form [-...][+...]digits. Any number of
 *    leading '-' signs (one or more makes the result negative) may be
 *    followed by any number of '+' signs. Parsing stops at the first
 *    non-digit.
 *
 * Returns the parsed value. Overflow is deliberately not detected; the
 * value wraps around modulo the width of int.
 */
static int fast_atoi(char *s)
{
	int neg = 0;
	/* Unsigned accumulator: wrap-around is well defined for unsigned
	 * arithmetic, whereas signed overflow is undefined behaviour. */
	unsigned int u = 0;

	while (*s == '-')
	{
		neg = 1;
		++s;
	}
	while (*s == '+')
	{
		++s;
	}

	while (*s >= '0' && *s <= '9')
	{
		/* We deliberately ignore overflow here. */
		u = u * 10u + (unsigned int)(*s - '0');
		++s;
	}

	/* Negate while still unsigned so that the most negative int cannot
	 * trigger signed overflow. The final conversion to int is
	 * implementation-defined for out-of-range values and is expected
	 * to wrap. */
	return (int)(neg ? 0u - u : u);
}
164 | |
165 | static int |
166 | lex_number(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf, int c) |
167 | { |
168 | char *s = buf->scratch; |
169 | char *e = buf->scratch + buf->size - 1; /* leave space for zero terminator */ |
170 | char *isreal = (c == '.' ? s : NULL); |
171 | int neg = (c == '-'); |
172 | int isbad = 0; |
173 | |
174 | *s++ = c; |
175 | |
176 | c = lex_byte(ctx, f); |
177 | |
178 | /* skip extra '-' signs at start of number */ |
179 | if (neg) |
180 | { |
181 | while (c == '-') |
182 | c = lex_byte(ctx, f); |
183 | } |
184 | |
185 | while (s < e) |
186 | { |
187 | switch (c) |
188 | { |
189 | case IS_WHITE: |
190 | case IS_DELIM: |
191 | fz_unread_byte(ctx, f); |
192 | goto end; |
193 | case EOF: |
194 | goto end; |
195 | case '.': |
196 | if (isreal) |
197 | isbad = 1; |
198 | isreal = s; |
199 | *s++ = c; |
200 | break; |
201 | case RANGE_0_9: |
202 | *s++ = c; |
203 | break; |
204 | default: |
205 | isbad = 1; |
206 | *s++ = c; |
207 | break; |
208 | } |
209 | c = lex_byte(ctx, f); |
210 | } |
211 | |
212 | end: |
213 | *s = '\0'; |
214 | if (isbad) |
215 | return PDF_TOK_ERROR; |
216 | if (isreal) |
217 | { |
218 | /* We'd like to use the fastest possible atof |
219 | * routine, but we'd rather match acrobats |
220 | * handling of broken numbers. As such, we |
221 | * spot common broken cases and call an |
222 | * acrobat compatible routine where required. */ |
223 | if (neg > 1 || isreal - buf->scratch >= 10) |
224 | buf->f = acrobat_compatible_atof(buf->scratch); |
225 | else |
226 | buf->f = fz_atof(buf->scratch); |
227 | return PDF_TOK_REAL; |
228 | } |
229 | else |
230 | { |
231 | buf->i = fast_atoi(buf->scratch); |
232 | return PDF_TOK_INT; |
233 | } |
234 | } |
235 | |
/*
 * Lex a name (or keyword) into lb->scratch.
 *
 * Bytes are read until whitespace, a delimiter or end of file; a
 * terminating whitespace/delimiter byte is pushed back. "#xx" hex
 * escapes are decoded to a single byte. On normal completion the
 * result is NUL-terminated and lb->len holds its length.
 *
 * At most 127 bytes are kept. When that limit is reached a warning is
 * issued, the text is terminated and lb->len recorded at once, and the
 * rest of the name is still consumed from the stream but discarded.
 */
static void
lex_name(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb)
{
	char *s = lb->scratch;
	char *e = s + fz_mini(127, lb->size);
	int c;

	while (1)
	{
		if (s == e)
		{
			if (e - lb->scratch < 127)
			{
				/* Buffer is smaller than the name limit: enlarge it.
				 * pdf_lexbuf_grow returns how far scratch moved. */
				s += pdf_lexbuf_grow(ctx, lb);
				e = lb->scratch + fz_mini(127, lb->size);
			}
			else
			{
				/* truncate names that are too long */
				fz_warn(ctx, "name is too long");
				*s = 0;
				lb->len = s - lb->scratch;
				/* A NULL cursor means "discard": every later store
				 * is guarded by "if (s)". */
				s = NULL;
			}
		}
		c = lex_byte(ctx, f);
		switch (c)
		{
		case IS_WHITE:
		case IS_DELIM:
			fz_unread_byte(ctx, f);
			goto end;
		case EOF:
			goto end;
		case '#':
		{
			int hex[2];
			int i;
			for (i = 0; i < 2; i++)
			{
				/* Peek first, so a non-hex character is left in the
				 * stream untouched. */
				c = fz_peek_byte(ctx, f);
				switch (c)
				{
				case RANGE_0_9:
					/* "#00" is not accepted as an escape. */
					if (i == 1 && c == '0' && hex[0] == 0)
						goto illegal;
					hex[i] = lex_byte(ctx, f) - '0';
					break;
				case RANGE_a_f:
					hex[i] = lex_byte(ctx, f) - 'a' + 10;
					break;
				case RANGE_A_F:
					hex[i] = lex_byte(ctx, f) - 'A' + 10;
					break;
				default:
				case EOF:
					goto illegal;
				}
			}
			if (s) *s++ = (hex[0] << 4) + hex[1];
			break;
illegal:
			/* Not a valid escape: keep the '#' literally. If the first
			 * digit had already been consumed, push it back so that it
			 * is read again as an ordinary character. */
			if (i == 1)
				fz_unread_byte(ctx, f);
			if (s) *s++ = '#';
			continue;
		}
		default:
			if (s) *s++ = c;
			break;
		}
	}
end:
	/* After truncation s is NULL and the terminator and length have
	 * already been written. */
	if (s)
	{
		*s = '\0';
		lb->len = s - lb->scratch;
	}
}
315 | |
316 | static int |
317 | lex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) |
318 | { |
319 | char *s = lb->scratch; |
320 | char *e = s + lb->size; |
321 | int bal = 1; |
322 | int oct; |
323 | int c; |
324 | |
325 | while (1) |
326 | { |
327 | if (s == e) |
328 | { |
329 | s += pdf_lexbuf_grow(ctx, lb); |
330 | e = lb->scratch + lb->size; |
331 | } |
332 | c = lex_byte(ctx, f); |
333 | switch (c) |
334 | { |
335 | case EOF: |
336 | return PDF_TOK_ERROR; |
337 | case '(': |
338 | bal++; |
339 | *s++ = c; |
340 | break; |
341 | case ')': |
342 | bal --; |
343 | if (bal == 0) |
344 | goto end; |
345 | *s++ = c; |
346 | break; |
347 | case '\\': |
348 | c = lex_byte(ctx, f); |
349 | switch (c) |
350 | { |
351 | case EOF: |
352 | return PDF_TOK_ERROR; |
353 | case 'n': |
354 | *s++ = '\n'; |
355 | break; |
356 | case 'r': |
357 | *s++ = '\r'; |
358 | break; |
359 | case 't': |
360 | *s++ = '\t'; |
361 | break; |
362 | case 'b': |
363 | *s++ = '\b'; |
364 | break; |
365 | case 'f': |
366 | *s++ = '\f'; |
367 | break; |
368 | case '(': |
369 | *s++ = '('; |
370 | break; |
371 | case ')': |
372 | *s++ = ')'; |
373 | break; |
374 | case '\\': |
375 | *s++ = '\\'; |
376 | break; |
377 | case RANGE_0_7: |
378 | oct = c - '0'; |
379 | c = lex_byte(ctx, f); |
380 | if (c >= '0' && c <= '7') |
381 | { |
382 | oct = oct * 8 + (c - '0'); |
383 | c = lex_byte(ctx, f); |
384 | if (c >= '0' && c <= '7') |
385 | oct = oct * 8 + (c - '0'); |
386 | else if (c != EOF) |
387 | fz_unread_byte(ctx, f); |
388 | } |
389 | else if (c != EOF) |
390 | fz_unread_byte(ctx, f); |
391 | *s++ = oct; |
392 | break; |
393 | case '\n': |
394 | break; |
395 | case '\r': |
396 | c = lex_byte(ctx, f); |
397 | if ((c != '\n') && (c != EOF)) |
398 | fz_unread_byte(ctx, f); |
399 | break; |
400 | default: |
401 | *s++ = c; |
402 | } |
403 | break; |
404 | default: |
405 | *s++ = c; |
406 | break; |
407 | } |
408 | } |
409 | end: |
410 | lb->len = s - lb->scratch; |
411 | return PDF_TOK_STRING; |
412 | } |
413 | |
414 | static int |
415 | lex_hex_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *lb) |
416 | { |
417 | char *s = lb->scratch; |
418 | char *e = s + lb->size; |
419 | int a = 0, x = 0; |
420 | int c; |
421 | |
422 | while (1) |
423 | { |
424 | if (s == e) |
425 | { |
426 | s += pdf_lexbuf_grow(ctx, lb); |
427 | e = lb->scratch + lb->size; |
428 | } |
429 | c = lex_byte(ctx, f); |
430 | switch (c) |
431 | { |
432 | case IS_WHITE: |
433 | break; |
434 | default: |
435 | fz_warn(ctx, "invalid character in hex string" ); |
436 | /* fall through */ |
437 | case IS_HEX: |
438 | if (x) |
439 | { |
440 | *s++ = a * 16 + unhex(c); |
441 | x = !x; |
442 | } |
443 | else |
444 | { |
445 | a = unhex(c); |
446 | x = !x; |
447 | } |
448 | break; |
449 | case '>': |
450 | if (x) |
451 | { |
452 | *s++ = a * 16; /* pad truncated string with '0' */ |
453 | } |
454 | goto end; |
455 | case EOF: |
456 | return PDF_TOK_ERROR; |
457 | } |
458 | } |
459 | end: |
460 | lb->len = s - lb->scratch; |
461 | return PDF_TOK_STRING; |
462 | } |
463 | |
464 | static pdf_token |
465 | pdf_token_from_keyword(char *key) |
466 | { |
467 | switch (*key) |
468 | { |
469 | case 'R': |
470 | if (!strcmp(key, "R" )) return PDF_TOK_R; |
471 | break; |
472 | case 't': |
473 | if (!strcmp(key, "true" )) return PDF_TOK_TRUE; |
474 | if (!strcmp(key, "trailer" )) return PDF_TOK_TRAILER; |
475 | break; |
476 | case 'f': |
477 | if (!strcmp(key, "false" )) return PDF_TOK_FALSE; |
478 | break; |
479 | case 'n': |
480 | if (!strcmp(key, "null" )) return PDF_TOK_NULL; |
481 | break; |
482 | case 'o': |
483 | if (!strcmp(key, "obj" )) return PDF_TOK_OBJ; |
484 | break; |
485 | case 'e': |
486 | if (!strcmp(key, "endobj" )) return PDF_TOK_ENDOBJ; |
487 | if (!strcmp(key, "endstream" )) return PDF_TOK_ENDSTREAM; |
488 | break; |
489 | case 's': |
490 | if (!strcmp(key, "stream" )) return PDF_TOK_STREAM; |
491 | if (!strcmp(key, "startxref" )) return PDF_TOK_STARTXREF; |
492 | break; |
493 | case 'x': |
494 | if (!strcmp(key, "xref" )) return PDF_TOK_XREF; |
495 | break; |
496 | } |
497 | |
498 | while (*key) |
499 | { |
500 | if (!fz_isprint(*key)) |
501 | return PDF_TOK_ERROR; |
502 | ++key; |
503 | } |
504 | |
505 | return PDF_TOK_KEYWORD; |
506 | } |
507 | |
508 | void pdf_lexbuf_init(fz_context *ctx, pdf_lexbuf *lb, int size) |
509 | { |
510 | lb->size = lb->base_size = size; |
511 | lb->len = 0; |
512 | lb->scratch = &lb->buffer[0]; |
513 | } |
514 | |
515 | void pdf_lexbuf_fin(fz_context *ctx, pdf_lexbuf *lb) |
516 | { |
517 | if (lb && lb->size != lb->base_size) |
518 | fz_free(ctx, lb->scratch); |
519 | } |
520 | |
521 | ptrdiff_t pdf_lexbuf_grow(fz_context *ctx, pdf_lexbuf *lb) |
522 | { |
523 | char *old = lb->scratch; |
524 | int newsize = lb->size * 2; |
525 | if (lb->size == lb->base_size) |
526 | { |
527 | lb->scratch = fz_malloc(ctx, newsize); |
528 | memcpy(lb->scratch, lb->buffer, lb->size); |
529 | } |
530 | else |
531 | { |
532 | lb->scratch = fz_realloc(ctx, lb->scratch, newsize); |
533 | } |
534 | lb->size = newsize; |
535 | return lb->scratch - old; |
536 | } |
537 | |
538 | pdf_token |
539 | pdf_lex(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) |
540 | { |
541 | while (1) |
542 | { |
543 | int c = lex_byte(ctx, f); |
544 | switch (c) |
545 | { |
546 | case EOF: |
547 | return PDF_TOK_EOF; |
548 | case IS_WHITE: |
549 | lex_white(ctx, f); |
550 | break; |
551 | case '%': |
552 | lex_comment(ctx, f); |
553 | break; |
554 | case '/': |
555 | lex_name(ctx, f, buf); |
556 | return PDF_TOK_NAME; |
557 | case '(': |
558 | return lex_string(ctx, f, buf); |
559 | case ')': |
560 | return PDF_TOK_ERROR; |
561 | case '<': |
562 | c = lex_byte(ctx, f); |
563 | if (c == '<') |
564 | return PDF_TOK_OPEN_DICT; |
565 | if (c != EOF) |
566 | fz_unread_byte(ctx, f); |
567 | return lex_hex_string(ctx, f, buf); |
568 | case '>': |
569 | c = lex_byte(ctx, f); |
570 | if (c == '>') |
571 | return PDF_TOK_CLOSE_DICT; |
572 | if (c != EOF) |
573 | fz_unread_byte(ctx, f); |
574 | return PDF_TOK_ERROR; |
575 | case '[': |
576 | return PDF_TOK_OPEN_ARRAY; |
577 | case ']': |
578 | return PDF_TOK_CLOSE_ARRAY; |
579 | case '{': |
580 | return PDF_TOK_OPEN_BRACE; |
581 | case '}': |
582 | return PDF_TOK_CLOSE_BRACE; |
583 | case IS_NUMBER: |
584 | return lex_number(ctx, f, buf, c); |
585 | default: /* isregular: !isdelim && !iswhite && c != EOF */ |
586 | fz_unread_byte(ctx, f); |
587 | lex_name(ctx, f, buf); |
588 | return pdf_token_from_keyword(buf->scratch); |
589 | } |
590 | } |
591 | } |
592 | |
593 | pdf_token |
594 | pdf_lex_no_string(fz_context *ctx, fz_stream *f, pdf_lexbuf *buf) |
595 | { |
596 | while (1) |
597 | { |
598 | int c = lex_byte(ctx, f); |
599 | switch (c) |
600 | { |
601 | case EOF: |
602 | return PDF_TOK_EOF; |
603 | case IS_WHITE: |
604 | lex_white(ctx, f); |
605 | break; |
606 | case '%': |
607 | lex_comment(ctx, f); |
608 | break; |
609 | case '/': |
610 | lex_name(ctx, f, buf); |
611 | return PDF_TOK_NAME; |
612 | case '(': |
613 | return PDF_TOK_ERROR; /* no strings allowed */ |
614 | case ')': |
615 | return PDF_TOK_ERROR; /* no strings allowed */ |
616 | case '<': |
617 | c = lex_byte(ctx, f); |
618 | if (c == '<') |
619 | return PDF_TOK_OPEN_DICT; |
620 | if (c != EOF) |
621 | fz_unread_byte(ctx, f); |
622 | return PDF_TOK_ERROR; /* no strings allowed */ |
623 | case '>': |
624 | c = lex_byte(ctx, f); |
625 | if (c == '>') |
626 | return PDF_TOK_CLOSE_DICT; |
627 | if (c != EOF) |
628 | fz_unread_byte(ctx, f); |
629 | return PDF_TOK_ERROR; |
630 | case '[': |
631 | return PDF_TOK_OPEN_ARRAY; |
632 | case ']': |
633 | return PDF_TOK_CLOSE_ARRAY; |
634 | case '{': |
635 | return PDF_TOK_OPEN_BRACE; |
636 | case '}': |
637 | return PDF_TOK_CLOSE_BRACE; |
638 | case IS_NUMBER: |
639 | return lex_number(ctx, f, buf, c); |
640 | default: /* isregular: !isdelim && !iswhite && c != EOF */ |
641 | fz_unread_byte(ctx, f); |
642 | lex_name(ctx, f, buf); |
643 | return pdf_token_from_keyword(buf->scratch); |
644 | } |
645 | } |
646 | } |
647 | |
648 | /* |
649 | print a lexed token to a buffer, growing if necessary |
650 | */ |
651 | void pdf_append_token(fz_context *ctx, fz_buffer *fzbuf, int tok, pdf_lexbuf *buf) |
652 | { |
653 | switch (tok) |
654 | { |
655 | case PDF_TOK_NAME: |
656 | fz_append_printf(ctx, fzbuf, "/%s" , buf->scratch); |
657 | break; |
658 | case PDF_TOK_STRING: |
659 | if (buf->len >= buf->size) |
660 | pdf_lexbuf_grow(ctx, buf); |
661 | buf->scratch[buf->len] = 0; |
662 | fz_append_pdf_string(ctx, fzbuf, buf->scratch); |
663 | break; |
664 | case PDF_TOK_OPEN_DICT: |
665 | fz_append_string(ctx, fzbuf, "<<" ); |
666 | break; |
667 | case PDF_TOK_CLOSE_DICT: |
668 | fz_append_string(ctx, fzbuf, ">>" ); |
669 | break; |
670 | case PDF_TOK_OPEN_ARRAY: |
671 | fz_append_byte(ctx, fzbuf, '['); |
672 | break; |
673 | case PDF_TOK_CLOSE_ARRAY: |
674 | fz_append_byte(ctx, fzbuf, ']'); |
675 | break; |
676 | case PDF_TOK_OPEN_BRACE: |
677 | fz_append_byte(ctx, fzbuf, '{'); |
678 | break; |
679 | case PDF_TOK_CLOSE_BRACE: |
680 | fz_append_byte(ctx, fzbuf, '}'); |
681 | break; |
682 | case PDF_TOK_INT: |
683 | fz_append_printf(ctx, fzbuf, "%ld" , buf->i); |
684 | break; |
685 | case PDF_TOK_REAL: |
686 | fz_append_printf(ctx, fzbuf, "%g" , buf->f); |
687 | break; |
688 | default: |
689 | fz_append_data(ctx, fzbuf, buf->scratch, buf->len); |
690 | break; |
691 | } |
692 | } |
693 | |