1#include "mupdf/fitz.h"
2#include "mupdf/pdf.h"
3
4#include <string.h>
5
6/* Scan file for objects and reconstruct xref table */
7
/* One object located by the raw file scan in pdf_repair_xref; the
 * collected entries are later turned into 'n' xref entries. */
8struct entry
9{
10 int num; /* object number (the integer two tokens before 'obj') */
11 int gen; /* generation number, clamped to 0..65535 by the scanner */
12 int64_t ofs; /* file offset at which the object-number token was lexed */
13 int64_t stm_ofs; /* stream data offset reported by pdf_repair_obj, 0 if no 'stream' keyword was seen */
14 int stm_len; /* stream length found by scanning for 'endstream', or -1 when no length was measured */
15};
16
17static void add_root(fz_context *ctx, pdf_obj *obj, pdf_obj ***roots, int *num_roots, int *max_roots)
18{
19 if (*num_roots == *max_roots)
20 {
21 int new_max_roots = *max_roots * 2;
22 if (new_max_roots == 0)
23 new_max_roots = 4;
24 *roots = fz_realloc_array(ctx, *roots, new_max_roots, pdf_obj*);
25 *max_roots = new_max_roots;
26 }
27 (*roots)[(*num_roots)++] = pdf_keep_obj(ctx, obj);
28}
29
30int
31pdf_repair_obj(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf, int64_t *stmofsp, int *stmlenp, pdf_obj **encrypt, pdf_obj **id, pdf_obj **page, int64_t *tmpofs, pdf_obj **root)
32{
33 fz_stream *file = doc->file;
34 pdf_token tok;
35 int stm_len;
36
37 *stmofsp = 0;
38 if (stmlenp)
39 *stmlenp = -1;
40
41 stm_len = 0;
42
43 /* On entry to this function, we know that we've just seen
44 * '<int> <int> obj'. We expect the next thing we see to be a
45 * pdf object. Regardless of the type of thing we meet next
46 * we only need to fully parse it if it is a dictionary. */
47 tok = pdf_lex(ctx, file, buf);
48
49 if (tok == PDF_TOK_OPEN_DICT)
50 {
51 pdf_obj *obj, *dict = NULL;
52
53 fz_try(ctx)
54 {
55 dict = pdf_parse_dict(ctx, doc, file, buf);
56 }
57 fz_catch(ctx)
58 {
59 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
60 /* Don't let a broken object at EOF overwrite a good one */
61 if (file->eof)
62 fz_rethrow(ctx);
63 /* Silently swallow the error */
64 dict = pdf_new_dict(ctx, NULL, 2);
65 }
66
67 /* We must be careful not to try to resolve any indirections
68 * here. We have just read dict, so we know it to be a non
69 * indirected dictionary. Before we look at any values that
70 * we get back from looking up in it, we need to check they
71 * aren't indirected. */
72
73 if (encrypt || id || root)
74 {
75 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
76 if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(XRef)))
77 {
78 if (encrypt)
79 {
80 obj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
81 if (obj)
82 {
83 pdf_drop_obj(ctx, *encrypt);
84 *encrypt = pdf_keep_obj(ctx, obj);
85 }
86 }
87
88 if (id)
89 {
90 obj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
91 if (obj)
92 {
93 pdf_drop_obj(ctx, *id);
94 *id = pdf_keep_obj(ctx, obj);
95 }
96 }
97
98 if (root)
99 *root = pdf_keep_obj(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Root)));
100 }
101 }
102
103 obj = pdf_dict_get(ctx, dict, PDF_NAME(Length));
104 if (!pdf_is_indirect(ctx, obj) && pdf_is_int(ctx, obj))
105 stm_len = pdf_to_int(ctx, obj);
106
107 if (doc->file_reading_linearly && page)
108 {
109 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
110 if (!pdf_is_indirect(ctx, obj) && pdf_name_eq(ctx, obj, PDF_NAME(Page)))
111 {
112 pdf_drop_obj(ctx, *page);
113 *page = pdf_keep_obj(ctx, dict);
114 }
115 }
116
117 pdf_drop_obj(ctx, dict);
118 }
119
120 while ( tok != PDF_TOK_STREAM &&
121 tok != PDF_TOK_ENDOBJ &&
122 tok != PDF_TOK_ERROR &&
123 tok != PDF_TOK_EOF &&
124 tok != PDF_TOK_INT )
125 {
126 *tmpofs = fz_tell(ctx, file);
127 if (*tmpofs < 0)
128 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
129 tok = pdf_lex(ctx, file, buf);
130 }
131
132 if (tok == PDF_TOK_STREAM)
133 {
134 int c = fz_read_byte(ctx, file);
135 if (c == '\r') {
136 c = fz_peek_byte(ctx, file);
137 if (c == '\n')
138 fz_read_byte(ctx, file);
139 }
140
141 *stmofsp = fz_tell(ctx, file);
142 if (*stmofsp < 0)
143 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot seek in file");
144
145 if (stm_len > 0)
146 {
147 fz_seek(ctx, file, *stmofsp + stm_len, 0);
148 fz_try(ctx)
149 {
150 tok = pdf_lex(ctx, file, buf);
151 }
152 fz_catch(ctx)
153 {
154 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
155 fz_warn(ctx, "cannot find endstream token, falling back to scanning");
156 }
157 if (tok == PDF_TOK_ENDSTREAM)
158 goto atobjend;
159 fz_seek(ctx, file, *stmofsp, 0);
160 }
161
162 (void)fz_read(ctx, file, (unsigned char *) buf->scratch, 9);
163
164 while (memcmp(buf->scratch, "endstream", 9) != 0)
165 {
166 c = fz_read_byte(ctx, file);
167 if (c == EOF)
168 break;
169 memmove(&buf->scratch[0], &buf->scratch[1], 8);
170 buf->scratch[8] = c;
171 }
172
173 if (stmlenp)
174 *stmlenp = fz_tell(ctx, file) - *stmofsp - 9;
175
176atobjend:
177 *tmpofs = fz_tell(ctx, file);
178 if (*tmpofs < 0)
179 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
180 tok = pdf_lex(ctx, file, buf);
181 if (tok != PDF_TOK_ENDOBJ)
182 fz_warn(ctx, "object missing 'endobj' token");
183 else
184 {
185 /* Read another token as we always return the next one */
186 *tmpofs = fz_tell(ctx, file);
187 if (*tmpofs < 0)
188 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
189 tok = pdf_lex(ctx, file, buf);
190 }
191 }
192 return tok;
193}
194
195static void
196pdf_repair_obj_stm(fz_context *ctx, pdf_document *doc, int stm_num)
197{
198 pdf_obj *obj;
199 fz_stream *stm = NULL;
200 pdf_token tok;
201 int i, n, count;
202 pdf_lexbuf buf;
203
204 fz_var(stm);
205
206 pdf_lexbuf_init(ctx, &buf, PDF_LEXBUF_SMALL);
207
208 fz_try(ctx)
209 {
210 obj = pdf_load_object(ctx, doc, stm_num);
211
212 count = pdf_dict_get_int(ctx, obj, PDF_NAME(N));
213
214 pdf_drop_obj(ctx, obj);
215
216 stm = pdf_open_stream_number(ctx, doc, stm_num);
217
218 for (i = 0; i < count; i++)
219 {
220 pdf_xref_entry *entry;
221
222 tok = pdf_lex(ctx, stm, &buf);
223 if (tok != PDF_TOK_INT)
224 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", stm_num);
225
226 n = buf.i;
227 if (n < 0)
228 {
229 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
230 continue;
231 }
232 else if (n >= pdf_xref_len(ctx, doc))
233 {
234 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", n, i);
235 continue;
236 }
237
238 entry = pdf_get_populating_xref_entry(ctx, doc, n);
239 entry->ofs = stm_num;
240 entry->gen = i;
241 entry->num = n;
242 entry->stm_ofs = 0;
243 pdf_drop_obj(ctx, entry->obj);
244 entry->obj = NULL;
245 entry->type = 'o';
246
247 tok = pdf_lex(ctx, stm, &buf);
248 if (tok != PDF_TOK_INT)
249 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", stm_num);
250 }
251 }
252 fz_always(ctx)
253 {
254 fz_drop_stream(ctx, stm);
255 pdf_lexbuf_fin(ctx, &buf);
256 }
257 fz_catch(ctx)
258 {
259 fz_rethrow(ctx);
260 }
261}
262
263static void
264orphan_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
265{
266 if (doc->orphans_count == doc->orphans_max)
267 {
268 int new_max = (doc->orphans_max ? doc->orphans_max*2 : 32);
269
270 fz_try(ctx)
271 {
272 doc->orphans = fz_realloc_array(ctx, doc->orphans, new_max, pdf_obj*);
273 doc->orphans_max = new_max;
274 }
275 fz_catch(ctx)
276 {
277 pdf_drop_obj(ctx, obj);
278 fz_rethrow(ctx);
279 }
280 }
281 doc->orphans[doc->orphans_count++] = obj;
282}
283
/* Return 1 if c is NUL, tab, line feed, form feed, carriage return or
 * space, and 0 for any other value. */
static int is_white(int c)
{
	switch (c)
	{
	case '\x00':
	case '\x09':
	case '\x0a':
	case '\x0c':
	case '\x0d':
	case '\x20':
		return 1;
	default:
		return 0;
	}
}
288
/*
 * Rebuild the document's xref table by scanning the whole file.
 *
 * Throws if a repair was already attempted on this document. Otherwise
 * marks the document dirty, freezes updates, forgets the existing xref
 * and lexes the file from the start:
 *  - the last two integers seen are remembered as candidate object and
 *    generation numbers, together with their file offsets;
 *  - an 'obj' token makes pdf_repair_obj skim the object body, after
 *    which the object is appended to a temporary list;
 *  - a bare dictionary is treated as a possible trailer and its
 *    Encrypt, ID, Root and Info values are collected;
 *  - EOF ends the scan.
 *
 * Throws "no objects found" when the list is empty. Otherwise a solid
 * xref from 0 to the highest object number is populated from the list,
 * stream lengths measured during the scan are written back for
 * unencrypted documents, the free entries are chained together, and a
 * new trailer is created with Size, Root, Info, Encrypt and ID.
 */
289void
290pdf_repair_xref(fz_context *ctx, pdf_document *doc)
291{
292 pdf_obj *dict, *obj = NULL;
293 pdf_obj *length;
294
295 pdf_obj *encrypt = NULL;
296 pdf_obj *id = NULL;
297 pdf_obj **roots = NULL;
298 pdf_obj *info = NULL;
299
300 struct entry *list = NULL;
301 int listlen;
302 int listcap;
303 int maxnum = 0;
304
305 int num = 0;
306 int gen = 0;
307 int64_t tmpofs, stm_ofs, numofs = 0, genofs = 0;
308 int stm_len;
309 pdf_token tok;
310 int next;
311 int i;
312 size_t j, n;
313 int c;
314 pdf_lexbuf *buf = &doc->lexbuf.base;
315 int num_roots = 0;
316 int max_roots = 0;
317
318 fz_var(encrypt);
319 fz_var(id);
320 fz_var(roots);
321 fz_var(num_roots);
322 fz_var(max_roots);
323 fz_var(info);
324 fz_var(list);
325 fz_var(obj);
326
327 fz_warn(ctx, "repairing PDF document");
328
 /* Repair is attempted at most once per document. */
329 if (doc->repair_attempted)
330 fz_throw(ctx, FZ_ERROR_GENERIC, "Repair failed already - not trying again");
331 doc->repair_attempted = 1;
332
333 doc->dirty = 1;
334 doc->freeze_updates = 1; /* Can't support incremental update after repair */
335
336 pdf_forget_xref(ctx, doc);
337
338 fz_seek(ctx, doc->file, 0, 0);
339
340 fz_try(ctx)
341 {
342 pdf_xref_entry *entry;
343 listlen = 0;
344 listcap = 1024;
345 list = fz_malloc_array(ctx, listcap, struct entry);
346
347 /* look for '%PDF' version marker within first kilobyte of file */
348 n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, fz_mini(buf->size, 1024));
349
 /* Rewind; if a marker is found below, seek again to just past it. */
350 fz_seek(ctx, doc->file, 0, 0);
351 if (n >= 4)
352 {
353 for (j = 0; j < n - 4; j++)
354 {
355 if (memcmp(&buf->scratch[j], "%PDF", 4) == 0)
356 {
357 fz_seek(ctx, doc->file, (int64_t)(j + 8), 0); /* skip "%PDF-X.Y" */
358 break;
359 }
360 }
361 }
362
363 /* skip comment line after version marker since some generators
364 * forget to terminate the comment with a newline */
365 c = fz_read_byte(ctx, doc->file);
366 while (c >= 0 && (c == ' ' || c == '%'))
367 c = fz_read_byte(ctx, doc->file);
368 fz_unread_byte(ctx, doc->file);
369
 /* Main scan: one token per iteration, unless pdf_repair_obj has
  * already produced the next token (see have_next_token). */
370 while (1)
371 {
372 tmpofs = fz_tell(ctx, doc->file);
373 if (tmpofs < 0)
374 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");
375
376 fz_try(ctx)
377 tok = pdf_lex_no_string(ctx, doc->file, buf);
378 fz_catch(ctx)
379 {
380 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
381 fz_warn(ctx, "skipping ahead to next token");
 /* Resynchronise on the next whitespace byte, or stop at EOF. */
382 do
383 c = fz_read_byte(ctx, doc->file);
384 while (c != EOF && !is_white(c));
385 if (c == EOF)
386 tok = PDF_TOK_EOF;
387 else
388 continue;
389 }
390
391 /* If we have the next token already, then we'll jump
392 * back here, rather than going through the top of
393 * the loop. */
394 have_next_token:
395
396 if (tok == PDF_TOK_INT)
397 {
398 if (buf->i < 0)
399 {
400 num = 0;
401 gen = 0;
402 continue;
403 }
 /* Shift the two-integer window: the previous integer becomes
  * the candidate object number, the new one the generation. */
404 numofs = genofs;
405 num = gen;
406 genofs = tmpofs;
407 gen = buf->i;
408 }
409
410 else if (tok == PDF_TOK_OBJ)
411 {
412 pdf_obj *root = NULL;
413
414 fz_try(ctx)
415 {
416 stm_len = 0;
417 stm_ofs = 0;
418 tok = pdf_repair_obj(ctx, doc, buf, &stm_ofs, &stm_len, &encrypt, &id, NULL, &tmpofs, &root);
419 if (root)
420 add_root(ctx, root, &roots, &num_roots, &max_roots);
421 }
422 fz_always(ctx)
423 {
424 pdf_drop_obj(ctx, root);
425 }
426 fz_catch(ctx)
427 {
428 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
429 /* If we haven't seen a root yet, there is nothing
430 * we can do, but give up. Otherwise, we'll make
431 * do. */
432 if (!roots)
433 fz_rethrow(ctx);
434 fz_warn(ctx, "cannot parse object (%d %d R) - ignoring rest of file", num, gen);
435 break;
436 }
437
438 if (num <= 0 || num > PDF_MAX_OBJECT_NUMBER)
439 {
440 fz_warn(ctx, "ignoring object with invalid object number (%d %d R)", num, gen);
441 goto have_next_token;
442 }
443
444 gen = fz_clampi(gen, 0, 65535);
445
 /* Grow the list by half when only one free slot remains. */
446 if (listlen + 1 == listcap)
447 {
448 listcap = (listcap * 3) / 2;
449 list = fz_realloc_array(ctx, list, listcap, struct entry);
450 }
451
452 list[listlen].num = num;
453 list[listlen].gen = gen;
454 list[listlen].ofs = numofs;
455 list[listlen].stm_ofs = stm_ofs;
456 list[listlen].stm_len = stm_len;
457 listlen ++;
458
459 if (num > maxnum)
460 maxnum = num;
461
 /* pdf_repair_obj returned the token after the object; process it
  * without lexing again. */
462 goto have_next_token;
463 }
464
465 /* If we find a dictionary it is probably the trailer,
466 * but could be a stream (or bogus) dictionary caused
467 * by a corrupt file. */
468 else if (tok == PDF_TOK_OPEN_DICT)
469 {
470 pdf_obj *dictobj;
471
472 fz_try(ctx)
473 {
474 dict = pdf_parse_dict(ctx, doc, doc->file, buf);
475 }
476 fz_catch(ctx)
477 {
478 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
479 /* If this was the real trailer dict
480 * it was broken, in which case we are
481 * in trouble. Keep going though in
482 * case this was just a bogus dict. */
483 continue;
484 }
485
486 fz_try(ctx)
487 {
488 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Encrypt));
489 if (dictobj)
490 {
491 pdf_drop_obj(ctx, encrypt);
492 encrypt = pdf_keep_obj(ctx, dictobj);
493 }
494
 /* Replace a previously found ID only when no encryption dictionary
  * is known yet, or when this dictionary carries its own Encrypt. */
495 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(ID));
496 if (dictobj && (!id || !encrypt || pdf_dict_get(ctx, dict, PDF_NAME(Encrypt))))
497 {
498 pdf_drop_obj(ctx, id);
499 id = pdf_keep_obj(ctx, dictobj);
500 }
501
502 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Root));
503 if (dictobj)
504 add_root(ctx, dictobj, &roots, &num_roots, &max_roots);
505
506 dictobj = pdf_dict_get(ctx, dict, PDF_NAME(Info));
507 if (dictobj)
508 {
509 pdf_drop_obj(ctx, info);
510 info = pdf_keep_obj(ctx, dictobj);
511 }
512 }
513 fz_always(ctx)
514 pdf_drop_obj(ctx, dict);
515 fz_catch(ctx)
516 fz_rethrow(ctx);
517 }
518
519 else if (tok == PDF_TOK_EOF)
520 {
521 break;
522 }
523
524 else
525 {
 /* Any other token breaks an '<int> <int> obj' sequence. */
526 num = 0;
527 gen = 0;
528 }
529 }
530
531 if (listlen == 0)
532 fz_throw(ctx, FZ_ERROR_GENERIC, "no objects found");
533
534 /* make xref reasonable */
535
536 /*
537 Dummy access to entry to assure sufficient space in the xref table
538 and avoid repeated reallocs in the loop
539 */
540 /* Ensure that the first xref table is a 'solid' one from
541 * 0 to maxnum. */
542 pdf_ensure_solid_xref(ctx, doc, maxnum);
543
 /* Start by marking every entry without a cached object as free;
  * the entries found by the scan are filled in by the next loop. */
544 for (i = 1; i < maxnum; i++)
545 {
546 entry = pdf_get_populating_xref_entry(ctx, doc, i);
547 if (entry->obj != NULL)
548 continue;
549 entry->type = 'f';
550 entry->ofs = 0;
551 entry->gen = 0;
552 entry->num = 0;
553
554 entry->stm_ofs = 0;
555 }
556
 /* Entries are applied in scan order, so when an object number was
  * seen more than once the last occurrence in the file wins. */
557 for (i = 0; i < listlen; i++)
558 {
559 entry = pdf_get_populating_xref_entry(ctx, doc, list[i].num);
560 entry->type = 'n';
561 entry->ofs = list[i].ofs;
562 entry->gen = list[i].gen;
563 entry->num = list[i].num;
564
565 entry->stm_ofs = list[i].stm_ofs;
566
567 /* correct stream length for unencrypted documents */
568 if (!encrypt && list[i].stm_len >= 0)
569 {
570 pdf_obj *old_obj = NULL;
571 dict = pdf_load_object(ctx, doc, list[i].num);
572
573 fz_try(ctx)
574 {
575 length = pdf_new_int(ctx, list[i].stm_len);
576 pdf_dict_get_put_drop(ctx, dict, PDF_NAME(Length), length, &old_obj);
 /* The replaced Length value is handed to the document's orphan
  * list rather than being dropped here. */
577 if (old_obj)
578 orphan_object(ctx, doc, old_obj);
579 }
580 fz_always(ctx)
581 pdf_drop_obj(ctx, dict);
582 fz_catch(ctx)
583 fz_rethrow(ctx);
584 }
585 }
586
587 entry = pdf_get_populating_xref_entry(ctx, doc, 0);
588 entry->type = 'f';
589 entry->ofs = 0;
590 entry->gen = 65535;
591 entry->num = 0;
592 entry->stm_ofs = 0;
593
 /* Chain the free entries: walking from the highest object number
  * down, each free entry's ofs is set to the next higher-numbered
  * free entry (0 for the highest one) and its generation is bumped
  * unless it is already 65535. */
594 next = 0;
595 for (i = pdf_xref_len(ctx, doc) - 1; i >= 0; i--)
596 {
597 entry = pdf_get_populating_xref_entry(ctx, doc, i);
598 if (entry->type == 'f')
599 {
600 entry->ofs = next;
601 if (entry->gen < 65535)
602 entry->gen ++;
603 next = i;
604 }
605 }
606
607 /* create a repaired trailer, Root will be added later */
608
609 obj = pdf_new_dict(ctx, doc, 5);
610 /* During repair there is only a single xref section */
611 pdf_set_populating_xref_trailer(ctx, doc, obj);
612 pdf_drop_obj(ctx, obj);
613 obj = NULL;
614
615 obj = pdf_new_int(ctx, maxnum + 1);
616 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Size), obj);
617 pdf_drop_obj(ctx, obj);
618 obj = NULL;
619
 /* Use the most recently collected root for which pdf_is_dict is
  * true; if none qualifies the loop stops at index 0 and the first
  * collected root is used. */
620 if (roots)
621 {
622 for (i = num_roots-1; i > 0; i--)
623 {
624 if (pdf_is_dict(ctx, roots[i]))
625 break;
626 }
627 if (i >= 0)
628 {
629 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), roots[i]);
630 }
631 }
632 if (info)
633 {
634 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), info);
635 pdf_drop_obj(ctx, info);
636 info = NULL;
637 }
638
639 if (encrypt)
640 {
641 if (pdf_is_indirect(ctx, encrypt))
642 {
643 /* create new reference with non-NULL xref pointer */
644 obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, encrypt), pdf_to_gen(ctx, encrypt));
645 pdf_drop_obj(ctx, encrypt);
646 encrypt = obj;
647 obj = NULL;
648 }
649 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt), encrypt);
650 pdf_drop_obj(ctx, encrypt);
651 encrypt = NULL;
652 }
653
654 if (id)
655 {
656 if (pdf_is_indirect(ctx, id))
657 {
658 /* create new reference with non-NULL xref pointer */
659 obj = pdf_new_indirect(ctx, doc, pdf_to_num(ctx, id), pdf_to_gen(ctx, id));
660 pdf_drop_obj(ctx, id);
661 id = obj;
662 obj = NULL;
663 }
664 pdf_dict_put(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID), id);
665 pdf_drop_obj(ctx, id);
666 id = NULL;
667 }
668
 /* Success path: the scan list is released here; the catch block
  * below releases it when an error occurred earlier. */
669 fz_free(ctx, list);
670 }
671 fz_always(ctx)
672 {
673 for (i = 0; i < num_roots; i++)
674 pdf_drop_obj(ctx, roots[i]);
675 fz_free(ctx, roots);
676 }
677 fz_catch(ctx)
678 {
679 pdf_drop_obj(ctx, encrypt);
680 pdf_drop_obj(ctx, id);
681 pdf_drop_obj(ctx, obj);
682 pdf_drop_obj(ctx, info);
683 fz_free(ctx, list);
684 fz_rethrow(ctx);
685 }
686}
687
688void
689pdf_repair_obj_stms(fz_context *ctx, pdf_document *doc)
690{
691 pdf_obj *dict;
692 int i;
693 int xref_len = pdf_xref_len(ctx, doc);
694
695 for (i = 0; i < xref_len; i++)
696 {
697 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
698
699 if (entry->stm_ofs)
700 {
701 dict = pdf_load_object(ctx, doc, i);
702 fz_try(ctx)
703 {
704 if (pdf_name_eq(ctx, pdf_dict_get(ctx, dict, PDF_NAME(Type)), PDF_NAME(ObjStm)))
705 pdf_repair_obj_stm(ctx, doc, i);
706 }
707 fz_catch(ctx)
708 {
709 fz_warn(ctx, "ignoring broken object stream (%d 0 R)", i);
710 }
711 pdf_drop_obj(ctx, dict);
712 }
713 }
714
715 /* Ensure that streamed objects reside inside a known non-streamed object */
716 for (i = 0; i < xref_len; i++)
717 {
718 pdf_xref_entry *entry = pdf_get_populating_xref_entry(ctx, doc, i);
719
720 if (entry->type == 'o' && pdf_get_populating_xref_entry(ctx, doc, entry->ofs)->type != 'n')
721 fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to non-object-stream: %d (%d 0 R)", (int)entry->ofs, i);
722 }
723}
724