1#include "mupdf/fitz.h"
2#include "mupdf/pdf.h"
3
4#include <assert.h>
5#include <limits.h>
6#include <string.h>
7
8#undef DEBUG_PROGESSIVE_ADVANCE
9
10#ifdef DEBUG_PROGESSIVE_ADVANCE
11#define DEBUGMESS(A) do { fz_warn A; } while (0)
12#else
13#define DEBUGMESS(A) do { } while (0)
14#endif
15
/* ASCII decimal digit test. The argument is parenthesised so that
 * expressions containing low-precedence operators (e.g. `x & 0xff`)
 * expand correctly. The argument is still evaluated twice, so do not
 * pass expressions with side effects. */
#define isdigit(c) ((c) >= '0' && (c) <= '9')
17
/* Returns 1 when ch is one of the six bytes this file treats as
 * whitespace (NUL, TAB, LF, FF, CR, SPACE), and 0 otherwise. */
static inline int iswhite(int ch)
{
	switch (ch)
	{
	case '\000':
	case '\011':
	case '\012':
	case '\014':
	case '\015':
	case '\040':
		return 1;
	default:
		return 0;
	}
}
24
25/*
26 * xref tables
27 */
28
29static void pdf_drop_xref_sections_imp(fz_context *ctx, pdf_document *doc, pdf_xref *xref_sections, int num_xref_sections)
30{
31 pdf_unsaved_sig *usig;
32 int x, e;
33
34 for (x = 0; x < num_xref_sections; x++)
35 {
36 pdf_xref *xref = &xref_sections[x];
37 pdf_xref_subsec *sub = xref->subsec;
38
39 while (sub != NULL)
40 {
41 pdf_xref_subsec *next_sub = sub->next;
42 for (e = 0; e < sub->len; e++)
43 {
44 pdf_xref_entry *entry = &sub->table[e];
45 if (entry->obj)
46 {
47 pdf_drop_obj(ctx, entry->obj);
48 fz_drop_buffer(ctx, entry->stm_buf);
49 }
50 }
51 fz_free(ctx, sub->table);
52 fz_free(ctx, sub);
53 sub = next_sub;
54 }
55
56 pdf_drop_obj(ctx, xref->pre_repair_trailer);
57 pdf_drop_obj(ctx, xref->trailer);
58
59 while ((usig = xref->unsaved_sigs) != NULL)
60 {
61 xref->unsaved_sigs = usig->next;
62 pdf_drop_obj(ctx, usig->field);
63 usig->signer->drop(usig->signer);
64 fz_free(ctx, usig);
65 }
66 }
67
68 fz_free(ctx, xref_sections);
69}
70
71static void pdf_drop_xref_sections(fz_context *ctx, pdf_document *doc)
72{
73 pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);
74 pdf_drop_xref_sections_imp(ctx, doc, doc->xref_sections, doc->num_xref_sections);
75
76 doc->saved_xref_sections = NULL;
77 doc->saved_num_xref_sections = 0;
78 doc->xref_sections = NULL;
79 doc->num_xref_sections = 0;
80 doc->num_incremental_sections = 0;
81}
82
83static void
84extend_xref_index(fz_context *ctx, pdf_document *doc, int newlen)
85{
86 int i;
87
88 doc->xref_index = fz_realloc_array(ctx, doc->xref_index, newlen, int);
89 for (i = doc->max_xref_len; i < newlen; i++)
90 {
91 doc->xref_index[i] = 0;
92 }
93 doc->max_xref_len = newlen;
94}
95
96/* This is only ever called when we already have an incremental
97 * xref. This means there will only be 1 subsec, and it will be
98 * a complete subsec. */
99static void pdf_resize_xref(fz_context *ctx, pdf_document *doc, int newlen)
100{
101 int i;
102 pdf_xref *xref = &doc->xref_sections[doc->xref_base];
103 pdf_xref_subsec *sub;
104
105 assert(xref != NULL);
106 sub = xref->subsec;
107 assert(sub->next == NULL && sub->start == 0 && sub->len == xref->num_objects);
108 assert(newlen > xref->num_objects);
109
110 sub->table = fz_realloc_array(ctx, sub->table, newlen, pdf_xref_entry);
111 for (i = xref->num_objects; i < newlen; i++)
112 {
113 sub->table[i].type = 0;
114 sub->table[i].ofs = 0;
115 sub->table[i].gen = 0;
116 sub->table[i].num = 0;
117 sub->table[i].stm_ofs = 0;
118 sub->table[i].stm_buf = NULL;
119 sub->table[i].obj = NULL;
120 }
121 xref->num_objects = newlen;
122 sub->len = newlen;
123 if (doc->max_xref_len < newlen)
124 extend_xref_index(ctx, doc, newlen);
125}
126
127static void pdf_populate_next_xref_level(fz_context *ctx, pdf_document *doc)
128{
129 pdf_xref *xref;
130 doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, doc->num_xref_sections + 1, pdf_xref);
131 doc->num_xref_sections++;
132
133 xref = &doc->xref_sections[doc->num_xref_sections - 1];
134 xref->subsec = NULL;
135 xref->num_objects = 0;
136 xref->trailer = NULL;
137 xref->pre_repair_trailer = NULL;
138 xref->unsaved_sigs = NULL;
139 xref->unsaved_sigs_end = NULL;
140}
141
142pdf_obj *pdf_trailer(fz_context *ctx, pdf_document *doc)
143{
144 /* Return the document's final trailer */
145 pdf_xref *xref = &doc->xref_sections[0];
146
147 return xref ? xref->trailer : NULL;
148}
149
150void pdf_set_populating_xref_trailer(fz_context *ctx, pdf_document *doc, pdf_obj *trailer)
151{
152 /* Update the trailer of the xref section being populated */
153 pdf_xref *xref = &doc->xref_sections[doc->num_xref_sections - 1];
154 if (xref->trailer)
155 {
156 pdf_drop_obj(ctx, xref->pre_repair_trailer);
157 xref->pre_repair_trailer = xref->trailer;
158 }
159 xref->trailer = pdf_keep_obj(ctx, trailer);
160}
161
162int pdf_xref_len(fz_context *ctx, pdf_document *doc)
163{
164 return doc->max_xref_len;
165}
166
167/* Ensure that the given xref has a single subsection
168 * that covers the entire range. */
169static void
170ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num, int which)
171{
172 pdf_xref *xref = &doc->xref_sections[which];
173 pdf_xref_subsec *sub = xref->subsec;
174 pdf_xref_subsec *new_sub;
175
176 if (num < xref->num_objects)
177 num = xref->num_objects;
178
179 if (sub != NULL && sub->next == NULL && sub->start == 0 && sub->len >= num)
180 return;
181
182 new_sub = fz_malloc_struct(ctx, pdf_xref_subsec);
183 fz_try(ctx)
184 {
185 new_sub->table = fz_calloc(ctx, num, sizeof(pdf_xref_entry));
186 new_sub->start = 0;
187 new_sub->len = num;
188 new_sub->next = NULL;
189 }
190 fz_catch(ctx)
191 {
192 fz_free(ctx, new_sub);
193 fz_rethrow(ctx);
194 }
195
196 /* Move objects over to the new subsection and destroy the old
197 * ones */
198 sub = xref->subsec;
199 while (sub != NULL)
200 {
201 pdf_xref_subsec *next = sub->next;
202 int i;
203
204 for (i = 0; i < sub->len; i++)
205 {
206 new_sub->table[i+sub->start] = sub->table[i];
207 }
208 fz_free(ctx, sub->table);
209 fz_free(ctx, sub);
210 sub = next;
211 }
212 xref->num_objects = num;
213 xref->subsec = new_sub;
214 if (doc->max_xref_len < num)
215 extend_xref_index(ctx, doc, num);
216}
217
218/* Used while reading the individual xref sections from a file */
219pdf_xref_entry *pdf_get_populating_xref_entry(fz_context *ctx, pdf_document *doc, int num)
220{
221 /* Return an entry within the xref currently being populated */
222 pdf_xref *xref;
223 pdf_xref_subsec *sub;
224
225 if (doc->num_xref_sections == 0)
226 {
227 doc->xref_sections = fz_malloc_struct(ctx, pdf_xref);
228 doc->num_xref_sections = 1;
229 }
230
231 /* Prevent accidental heap underflow */
232 if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
233 fz_throw(ctx, FZ_ERROR_GENERIC, "object number out of range (%d)", num);
234
235 /* Return the pointer to the entry in the last section. */
236 xref = &doc->xref_sections[doc->num_xref_sections-1];
237
238 for (sub = xref->subsec; sub != NULL; sub = sub->next)
239 {
240 if (num >= sub->start && num < sub->start + sub->len)
241 return &sub->table[num-sub->start];
242 }
243
244 /* We've been asked for an object that's not in a subsec. */
245 ensure_solid_xref(ctx, doc, num+1, doc->num_xref_sections-1);
246 xref = &doc->xref_sections[doc->num_xref_sections-1];
247 sub = xref->subsec;
248
249 return &sub->table[num-sub->start];
250}
251
252/* Used after loading a document to access entries */
253/* This will never throw anything, or return NULL if it is
254 * only asked to return objects in range within a 'solid'
255 * xref. */
256pdf_xref_entry *pdf_get_xref_entry(fz_context *ctx, pdf_document *doc, int i)
257{
258 pdf_xref *xref = NULL;
259 pdf_xref_subsec *sub;
260 int j;
261
262 if (i < 0)
263 fz_throw(ctx, FZ_ERROR_GENERIC, "Negative object number requested");
264
265 if (i <= doc->max_xref_len)
266 j = doc->xref_index[i];
267 else
268 j = 0;
269
270 /* We may be accessing an earlier version of the document using xref_base
271 * and j may be an index into a later xref section */
272 if (doc->xref_base > j)
273 j = doc->xref_base;
274
275 /* Find the first xref section where the entry is defined. */
276 for (; j < doc->num_xref_sections; j++)
277 {
278 xref = &doc->xref_sections[j];
279
280 if (i < xref->num_objects)
281 {
282 for (sub = xref->subsec; sub != NULL; sub = sub->next)
283 {
284 pdf_xref_entry *entry;
285
286 if (i < sub->start || i >= sub->start + sub->len)
287 continue;
288
289 entry = &sub->table[i - sub->start];
290 if (entry->type)
291 {
292 /* Don't update xref_index if xref_base may have
293 * influenced the value of j */
294 if (doc->xref_base == 0)
295 doc->xref_index[i] = j;
296 return entry;
297 }
298 }
299 }
300 }
301
302 /* Didn't find the entry in any section. Return the entry from
303 * the final section. */
304 doc->xref_index[i] = 0;
305 if (xref == NULL || i < xref->num_objects)
306 {
307 xref = &doc->xref_sections[doc->xref_base];
308 for (sub = xref->subsec; sub != NULL; sub = sub->next)
309 {
310 if (i >= sub->start && i < sub->start + sub->len)
311 return &sub->table[i - sub->start];
312 }
313 }
314
315 /* At this point, we solidify the xref. This ensures that we
316 * can return a pointer. This is the only case where this function
317 * might throw an exception, and it will never happen when we are
318 * working within a 'solid' xref. */
319 ensure_solid_xref(ctx, doc, i+1, 0);
320 xref = &doc->xref_sections[0];
321 sub = xref->subsec;
322 return &sub->table[i - sub->start];
323}
324
/*
	Ensure we have an incremental xref section where we can store
	updated versions of indirect objects. This is a new xref section
	consisting of a single xref subsection.

	When a new section is needed it is inserted at index 0 of
	doc->xref_sections (existing sections shift up by one), given a
	zeroed table with as many entries as the previous section 0, and a
	copy of that section's trailer. Every xref_index slot is then
	incremented to account for the shift. On failure the partially
	built pieces are released and the exception is rethrown.
*/
static void ensure_incremental_xref(fz_context *ctx, pdf_document *doc)
{
	/* If there are as yet no incremental sections, or if the most recent
	 * one has been used to sign a signature field, then we need a new one.
	 * After a signing, any further document changes require a new increment */
	if ((doc->num_incremental_sections == 0 || doc->xref_sections[0].unsaved_sigs != NULL)
		&& !doc->disallow_new_increments)
	{
		pdf_xref *xref = &doc->xref_sections[0];
		pdf_xref *pxref;
		/* Allocated outside the try block; freed in the catch below on failure. */
		pdf_xref_entry *new_table = fz_calloc(ctx, xref->num_objects, sizeof(pdf_xref_entry));
		pdf_xref_subsec *sub = NULL;
		pdf_obj *trailer = NULL;
		int i;

		fz_var(trailer);
		fz_var(sub);
		fz_try(ctx)
		{
			sub = fz_malloc_struct(ctx, pdf_xref_subsec);
			trailer = xref->trailer ? pdf_copy_dict(ctx, xref->trailer) : NULL;
			doc->xref_sections = fz_realloc_array(ctx, doc->xref_sections, doc->num_xref_sections + 1, pdf_xref);
			/* The array may have moved; refresh the pointers, then
			 * shift all existing sections up by one slot. */
			xref = &doc->xref_sections[0];
			pxref = &doc->xref_sections[1];
			memmove(pxref, xref, doc->num_xref_sections * sizeof(pdf_xref));
			/* xref->num_objects is already correct */
			xref->subsec = sub;
			/* Ownership of sub has passed to the section. */
			sub = NULL;
			xref->trailer = trailer;
			xref->pre_repair_trailer = NULL;
			xref->unsaved_sigs = NULL;
			xref->unsaved_sigs_end = NULL;
			xref->subsec->next = NULL;
			xref->subsec->len = xref->num_objects;
			xref->subsec->start = 0;
			xref->subsec->table = new_table;
			doc->num_xref_sections++;
			doc->num_incremental_sections++;
		}
		fz_catch(ctx)
		{
			fz_free(ctx, sub);
			fz_free(ctx, new_table);
			pdf_drop_obj(ctx, trailer);
			fz_rethrow(ctx);
		}

		/* Update the xref_index: every cached section number is now
		 * one higher because of the section inserted at the front. */
		for (i = 0; i < doc->max_xref_len; i++)
		{
			doc->xref_index[i]++;
		}
	}
}
384
385/* Used when altering a document */
386static pdf_xref_entry *pdf_get_incremental_xref_entry(fz_context *ctx, pdf_document *doc, int i)
387{
388 pdf_xref *xref;
389 pdf_xref_subsec *sub;
390
391 /* Make a new final xref section if we haven't already */
392 ensure_incremental_xref(ctx, doc);
393
394 xref = &doc->xref_sections[doc->xref_base];
395 if (i >= xref->num_objects)
396 pdf_resize_xref(ctx, doc, i + 1);
397
398 sub = xref->subsec;
399 assert(sub != NULL && sub->next == NULL);
400 assert(i >= sub->start && i < sub->start + sub->len);
401 doc->xref_index[i] = 0;
402 return &sub->table[i - sub->start];
403}
404
405int pdf_xref_is_incremental(fz_context *ctx, pdf_document *doc, int num)
406{
407 pdf_xref *xref = &doc->xref_sections[doc->xref_base];
408 pdf_xref_subsec *sub = xref->subsec;
409
410 assert(sub != NULL && sub->next == NULL && sub->len == xref->num_objects && sub->start == 0);
411
412 return num < xref->num_objects && sub->table[num].type;
413}
414
415void pdf_xref_store_unsaved_signature(fz_context *ctx, pdf_document *doc, pdf_obj *field, pdf_pkcs7_signer *signer)
416{
417 pdf_xref *xref = &doc->xref_sections[0];
418 pdf_unsaved_sig *unsaved_sig;
419
420 /* Record details within the document structure so that contents
421 * and byte_range can be updated with their correct values at
422 * saving time */
423 unsaved_sig = fz_malloc_struct(ctx, pdf_unsaved_sig);
424 unsaved_sig->field = pdf_keep_obj(ctx, field);
425 unsaved_sig->signer = signer->keep(signer);
426 unsaved_sig->next = NULL;
427 if (xref->unsaved_sigs_end == NULL)
428 xref->unsaved_sigs_end = &xref->unsaved_sigs;
429
430 *xref->unsaved_sigs_end = unsaved_sig;
431 xref->unsaved_sigs_end = &unsaved_sig->next;
432}
433
434int pdf_xref_obj_is_unsaved_signature(pdf_document *doc, pdf_obj *obj)
435{
436 int i;
437 for (i = 0; i < doc->num_incremental_sections; i++)
438 {
439 pdf_xref *xref = &doc->xref_sections[i];
440 pdf_unsaved_sig *usig;
441
442 for (usig = xref->unsaved_sigs; usig; usig = usig->next)
443 {
444 if (usig->field == obj)
445 return 1;
446 }
447 }
448
449 return 0;
450}
451
452/* Ensure that the current populating xref has a single subsection
453 * that covers the entire range. */
454void pdf_ensure_solid_xref(fz_context *ctx, pdf_document *doc, int num)
455{
456 if (doc->num_xref_sections == 0)
457 pdf_populate_next_xref_level(ctx, doc);
458
459 ensure_solid_xref(ctx, doc, num, doc->num_xref_sections-1);
460}
461
462/* Ensure that an object has been cloned into the incremental xref section */
463void pdf_xref_ensure_incremental_object(fz_context *ctx, pdf_document *doc, int num)
464{
465 pdf_xref_entry *new_entry, *old_entry;
466 pdf_xref_subsec *sub = NULL;
467 int i;
468
469 /* Make sure we have created an xref section for incremental updates */
470 ensure_incremental_xref(ctx, doc);
471
472 /* Search for the section that contains this object */
473 for (i = doc->xref_index[num]; i < doc->num_xref_sections; i++)
474 {
475 pdf_xref *xref = &doc->xref_sections[i];
476
477 if (num < 0 && num >= xref->num_objects)
478 break;
479 for (sub = xref->subsec; sub != NULL; sub = sub->next)
480 {
481 if (sub->start <= num && num < sub->start + sub->len && sub->table[num - sub->start].type)
482 break;
483 }
484 if (sub != NULL)
485 break;
486 }
487 /* sub == NULL implies we did not find it */
488
489 /* If we don't find it, or it's already in the incremental section, return */
490 if (i == 0 || sub == NULL)
491 return;
492
493 /* Move the object to the incremental section */
494 doc->xref_index[num] = 0;
495 old_entry = &sub->table[num - sub->start];
496 new_entry = pdf_get_incremental_xref_entry(ctx, doc, num);
497 *new_entry = *old_entry;
498 if (i < doc->num_incremental_sections)
499 {
500 /* old entry is incremental and may have changes.
501 * Better keep a copy. We must override the old entry with
502 * the copy because the caller may be holding a reference to
503 * the original and expect it to end up in the new entry */
504 old_entry->obj = pdf_deep_copy_obj(ctx, old_entry->obj);
505 }
506 else
507 {
508 old_entry->obj = NULL;
509 }
510 old_entry->stm_buf = NULL;
511}
512
513void pdf_replace_xref(fz_context *ctx, pdf_document *doc, pdf_xref_entry *entries, int n)
514{
515 int *xref_index = NULL;
516 pdf_xref *xref = NULL;
517 pdf_xref_subsec *sub;
518
519 fz_var(xref_index);
520 fz_var(xref);
521
522 fz_try(ctx)
523 {
524 xref_index = fz_calloc(ctx, n, sizeof(int));
525 xref = fz_malloc_struct(ctx, pdf_xref);
526 sub = fz_malloc_struct(ctx, pdf_xref_subsec);
527 }
528 fz_catch(ctx)
529 {
530 fz_free(ctx, xref);
531 fz_free(ctx, xref_index);
532 fz_rethrow(ctx);
533 }
534
535 sub->table = entries;
536 sub->start = 0;
537 sub->len = n;
538
539 xref->subsec = sub;
540 xref->num_objects = n;
541 xref->trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));
542
543 /* The new table completely replaces the previous separate sections */
544 pdf_drop_xref_sections(ctx, doc);
545
546 doc->xref_sections = xref;
547 doc->num_xref_sections = 1;
548 doc->num_incremental_sections = 0;
549 doc->xref_base = 0;
550 doc->disallow_new_increments = 0;
551 doc->max_xref_len = n;
552
553 fz_free(ctx, doc->xref_index);
554 doc->xref_index = xref_index;
555}
556
557void pdf_forget_xref(fz_context *ctx, pdf_document *doc)
558{
559 pdf_obj *trailer = pdf_keep_obj(ctx, pdf_trailer(ctx, doc));
560
561 if (doc->saved_xref_sections)
562 pdf_drop_xref_sections_imp(ctx, doc, doc->saved_xref_sections, doc->saved_num_xref_sections);
563
564 doc->saved_xref_sections = doc->xref_sections;
565 doc->saved_num_xref_sections = doc->num_xref_sections;
566
567 doc->startxref = 0;
568 doc->num_xref_sections = 0;
569 doc->num_incremental_sections = 0;
570 doc->xref_base = 0;
571 doc->disallow_new_increments = 0;
572
573 fz_try(ctx)
574 {
575 pdf_get_populating_xref_entry(ctx, doc, 0);
576 }
577 fz_catch(ctx)
578 {
579 pdf_drop_obj(ctx, trailer);
580 fz_rethrow(ctx);
581 }
582
583 /* Set the trailer of the final xref section. */
584 doc->xref_sections[0].trailer = trailer;
585}
586
587/*
588 * magic version tag and startxref
589 */
590
591static void
592pdf_load_version(fz_context *ctx, pdf_document *doc)
593{
594 char buf[20];
595
596 fz_seek(ctx, doc->file, 0, SEEK_SET);
597 fz_read_line(ctx, doc->file, buf, sizeof buf);
598 if (strlen(buf) < 5 || memcmp(buf, "%PDF-", 5) != 0)
599 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize version marker");
600
601 doc->version = 10 * (fz_atof(buf+5) + 0.05f);
602 if (doc->version < 10 || doc->version > 17)
603 if (doc->version != 20)
604 fz_warn(ctx, "unknown PDF version: %d.%d", doc->version / 10, doc->version % 10);
605}
606
607static void
608pdf_read_start_xref(fz_context *ctx, pdf_document *doc)
609{
610 unsigned char buf[1024];
611 size_t i, n;
612 int64_t t;
613
614 fz_seek(ctx, doc->file, 0, SEEK_END);
615
616 doc->file_size = fz_tell(ctx, doc->file);
617
618 t = fz_maxi64(0, doc->file_size - (int64_t)sizeof buf);
619 fz_seek(ctx, doc->file, t, SEEK_SET);
620
621 n = fz_read(ctx, doc->file, buf, sizeof buf);
622 if (n < 9)
623 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find startxref");
624
625 i = n - 9;
626 do
627 {
628 if (memcmp(buf + i, "startxref", 9) == 0)
629 {
630 i += 9;
631 while (i < n && iswhite(buf[i]))
632 i ++;
633 doc->startxref = 0;
634 while (i < n && isdigit(buf[i]))
635 {
636 if (doc->startxref >= INT64_MAX/10)
637 fz_throw(ctx, FZ_ERROR_GENERIC, "startxref too large");
638 doc->startxref = doc->startxref * 10 + (buf[i++] - '0');
639 }
640 if (doc->startxref != 0)
641 return;
642 break;
643 }
644 } while (i-- > 0);
645
646 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find startxref");
647}
648
649static void
650fz_skip_space(fz_context *ctx, fz_stream *stm)
651{
652 do
653 {
654 int c = fz_peek_byte(ctx, stm);
655 if (c == EOF || c > 32)
656 return;
657 (void)fz_read_byte(ctx, stm);
658 }
659 while (1);
660}
661
662static int fz_skip_string(fz_context *ctx, fz_stream *stm, const char *str)
663{
664 while (*str)
665 {
666 int c = fz_peek_byte(ctx, stm);
667 if (c == EOF || c != *str++)
668 return 1;
669 (void)fz_read_byte(ctx, stm);
670 }
671 return 0;
672}
673
674/*
675 * trailer dictionary
676 */
677
/* Peek at the trailer of an old-style (table) xref located at the
 * current file position and return its Size entry.
 *
 * The xref subsections are skipped over rather than parsed: for each
 * subsection header the entry width (19 or 20 bytes) is probed from the
 * first entry and the file position is advanced past all entries. The
 * trailer dictionary is then parsed, Size is range checked, and the
 * file position recorded on entry is restored before returning.
 *
 * Throws on a missing xref marker, malformed subsection headers or
 * entries, a missing trailer, or an out-of-range Size. */
static int
pdf_xref_size_from_old_trailer(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
{
	int len;
	char *s;
	int64_t t;
	pdf_token tok;
	int c;
	int size = 0;
	int64_t ofs;
	pdf_obj *trailer = NULL;
	size_t n;

	fz_var(trailer);

	/* Record the current file read offset so that we can reinstate it */
	ofs = fz_tell(ctx, doc->file);

	fz_skip_space(ctx, doc->file);
	if (fz_skip_string(ctx, doc->file, "xref"))
		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
	fz_skip_space(ctx, doc->file);

	/* One iteration per subsection; a non-digit ends the table. */
	while (1)
	{
		c = fz_peek_byte(ctx, doc->file);
		if (!isdigit(c))
			break;

		fz_read_line(ctx, doc->file, buf->scratch, buf->size);
		s = buf->scratch;
		fz_strsep(&s, " "); /* ignore start */
		if (!s)
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length missing");
		len = fz_atoi(fz_strsep(&s, " "));
		if (len < 0)
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection length must be positive");

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
			fz_seek(ctx, doc->file, -(2 + (int)strlen(s)), SEEK_CUR);

		t = fz_tell(ctx, doc->file);
		if (t < 0)
			fz_throw(ctx, FZ_ERROR_GENERIC, "cannot tell in file");

		/* Spec says xref entries should be 20 bytes, but it's not infrequent
		 * to see 19, in particular for some PCLm drivers. Cope. */
		if (len > 0)
		{
			n = fz_read(ctx, doc->file, (unsigned char *)buf->scratch, 20);
			if (n < 19)
				fz_throw(ctx, FZ_ERROR_GENERIC, "malformed xref table");
			/* A printable 20th byte is treated as the start of the
			 * next entry, i.e. entries are 19 bytes wide. */
			if (n == 20 && buf->scratch[19] > 32)
				n = 19;
		}
		else
			n = 20;

		/* Guard the offset computation below against overflow. */
		if (len > (int64_t)((INT64_MAX - t) / n))
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref has too many entries");

		/* Jump past all entries of this subsection. */
		fz_seek(ctx, doc->file, t + n * len, SEEK_SET);
	}

	fz_try(ctx)
	{
		tok = pdf_lex(ctx, doc->file, buf);
		if (tok != PDF_TOK_TRAILER)
			fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer marker");

		tok = pdf_lex(ctx, doc->file, buf);
		if (tok != PDF_TOK_OPEN_DICT)
			fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer dictionary");

		trailer = pdf_parse_dict(ctx, doc, doc->file, buf);

		size = pdf_dict_get_int(ctx, trailer, PDF_NAME(Size));
		if (size < 0 || size > PDF_MAX_OBJECT_NUMBER + 1)
			fz_throw(ctx, FZ_ERROR_GENERIC, "trailer Size entry out of range");
	}
	fz_always(ctx)
	{
		/* The trailer was only needed for its Size entry. */
		pdf_drop_obj(ctx, trailer);
	}
	fz_catch(ctx)
	{
		fz_rethrow(ctx);
	}

	/* Restore the read position recorded on entry (success path only). */
	fz_seek(ctx, doc->file, ofs, SEEK_SET);

	return size;
}
772
/* Return a pointer to the table slot for object 'start' within the xref
 * section currently being populated (the last one), such that the
 * caller can fill 'len' consecutive entries from it. Depending on how
 * the requested range relates to existing subsections, an existing
 * subsection is reused, a new one is created, or the whole section is
 * solidified. */
static pdf_xref_entry *
pdf_xref_find_subsection(fz_context *ctx, pdf_document *doc, int start, int len)
{
	pdf_xref *xref = &doc->xref_sections[doc->num_xref_sections-1];
	pdf_xref_subsec *sub;
	int num_objects;

	/* Different cases here. Case 1) We might be asking for a
	 * subsection (or a subset of a subsection) that we already
	 * have - Just return it. Case 2) We might be asking for a
	 * completely new subsection - Create it and return it.
	 * Case 3) We might have an overlapping one - Create a 'solid'
	 * subsection and return that. */

	/* Sanity check */
	for (sub = xref->subsec; sub != NULL; sub = sub->next)
	{
		if (start >= sub->start && start + len <= sub->start + sub->len)
			return &sub->table[start-sub->start]; /* Case 1 */
		if (start + len > sub->start && start <= sub->start + sub->len)
			break; /* Case 3 */
	}

	/* The section must end up covering at least start + len objects. */
	num_objects = xref->num_objects;
	if (num_objects < start + len)
		num_objects = start + len;

	if (sub == NULL)
	{
		/* Case 2 */
		sub = fz_malloc_struct(ctx, pdf_xref_subsec);
		fz_try(ctx)
		{
			sub->table = fz_calloc(ctx, len, sizeof(pdf_xref_entry));
			sub->start = start;
			sub->len = len;
			/* Prepend the new subsection to the section's list. */
			sub->next = xref->subsec;
			xref->subsec = sub;
		}
		fz_catch(ctx)
		{
			fz_free(ctx, sub);
			fz_rethrow(ctx);
		}
		xref->num_objects = num_objects;
		if (doc->max_xref_len < num_objects)
			extend_xref_index(ctx, doc, num_objects);
	}
	else
	{
		/* Case 3 */
		ensure_solid_xref(ctx, doc, num_objects, doc->num_xref_sections-1);
		xref = &doc->xref_sections[doc->num_xref_sections-1];
		sub = xref->subsec;
	}
	return &sub->table[start-sub->start];
}
830
/* Read an old-style (table) xref at the current file position into the
 * xref section being populated, and return the parsed trailer
 * dictionary that follows it.
 *
 * Each subsection header gives a start object number and a length;
 * each entry is parsed as offset, generation and a type character
 * ('f', 'n' or 'o'). Entries whose slot already has a type are read
 * from the file but not overwritten. Throws on malformed input. */
static pdf_obj *
pdf_read_old_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
{
	int start, len, c, i, xref_len, carried;
	fz_stream *file = doc->file;
	pdf_xref_entry *table;
	pdf_token tok;
	size_t n;
	char *s, *e;

	/* Size from the trailer; used below only to warn about subsections
	 * that extend past it. */
	xref_len = pdf_xref_size_from_old_trailer(ctx, doc, buf);

	fz_skip_space(ctx, doc->file);
	if (fz_skip_string(ctx, doc->file, "xref"))
		fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find xref marker");
	fz_skip_space(ctx, doc->file);

	/* One iteration per subsection; a non-digit ends the table. */
	while (1)
	{
		c = fz_peek_byte(ctx, file);
		if (!isdigit(c))
			break;

		fz_read_line(ctx, file, buf->scratch, buf->size);
		s = buf->scratch;
		start = fz_atoi(fz_strsep(&s, " "));
		len = fz_atoi(fz_strsep(&s, " "));

		/* broken pdfs where the section is not on a separate line */
		if (s && *s != '\0')
		{
			fz_warn(ctx, "broken xref subsection. proceeding anyway.");
			fz_seek(ctx, file, -(2 + (int)strlen(s)), SEEK_CUR);
		}

		if (start < 0 || start > PDF_MAX_OBJECT_NUMBER
				|| len < 0 || len > PDF_MAX_OBJECT_NUMBER
				|| start + len - 1 > PDF_MAX_OBJECT_NUMBER)
		{
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range");
		}
		/* broken pdfs where size in trailer undershoots entries in xref sections */
		if (start + len > xref_len)
		{
			fz_warn(ctx, "broken xref subsection, proceeding anyway.");
		}

		table = pdf_xref_find_subsection(ctx, doc, start, len);

		/* Xref entries SHOULD be 20 bytes long, but we see 19 byte
		 * ones more frequently than we'd like (e.g. PCLm drivers).
		 * Cope with this by 'carrying' data forward. */
		carried = 0;
		for (i = 0; i < len; i++)
		{
			pdf_xref_entry *entry = &table[i];
			/* Top the buffer up to 20 bytes, after any carried byte. */
			n = fz_read(ctx, file, (unsigned char *) buf->scratch + carried, 20-carried);
			if (n != 20-carried)
				fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected EOF in xref table");
			n += carried;
			buf->scratch[n] = '\0';
			/* Only fill slots that have not been defined already. */
			if (!entry->type)
			{
				s = buf->scratch;
				e = s + n;

				entry->num = start + i;

				/* broken pdfs where line start with white space */
				while (s < e && iswhite(*s))
					s++;

				if (s == e || !isdigit(*s))
					fz_throw(ctx, FZ_ERROR_GENERIC, "xref offset missing");
				while (s < e && isdigit(*s))
					entry->ofs = entry->ofs * 10 + *s++ - '0';

				while (s < e && iswhite(*s))
					s++;
				if (s == e || !isdigit(*s))
					fz_throw(ctx, FZ_ERROR_GENERIC, "xref generation number missing");
				while (s < e && isdigit(*s))
					entry->gen = entry->gen * 10 + *s++ - '0';

				while (s < e && iswhite(*s))
					s++;
				if (s == e || (*s != 'f' && *s != 'n' && *s != 'o'))
					fz_throw(ctx, FZ_ERROR_GENERIC, "unexpected xref type: 0x%x (%d %d R)", s == e ? 0 : *s, entry->num, entry->gen);
				entry->type = *s++;

				/* If the last byte of our buffer isn't an EOL (or space), carry one byte forward */
				/* NOTE(review): carried is only recomputed for entries
				 * parsed in this branch; confirm that is intended for
				 * slots that were already defined. */
				carried = buf->scratch[19] > 32;
				if (carried)
					buf->scratch[0] = buf->scratch[19];
			}
		}
		/* A byte carried past the last entry belongs to what follows;
		 * push it back onto the stream. */
		if (carried)
			fz_unread_byte(ctx, file);
	}

	tok = pdf_lex(ctx, file, buf);
	if (tok != PDF_TOK_TRAILER)
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer marker");

	tok = pdf_lex(ctx, file, buf);
	if (tok != PDF_TOK_OPEN_DICT)
		fz_throw(ctx, FZ_ERROR_GENERIC, "expected trailer dictionary");

	doc->has_old_style_xrefs = 1;

	return pdf_parse_dict(ctx, doc, file, buf);
}
943
944static void
945pdf_read_new_xref_section(fz_context *ctx, pdf_document *doc, fz_stream *stm, int i0, int i1, int w0, int w1, int w2)
946{
947 pdf_xref_entry *table;
948 int i, n;
949
950 if (i0 < 0 || i0 > PDF_MAX_OBJECT_NUMBER || i1 < 0 || i1 > PDF_MAX_OBJECT_NUMBER || i0 + i1 - 1 > PDF_MAX_OBJECT_NUMBER)
951 fz_throw(ctx, FZ_ERROR_GENERIC, "xref subsection object numbers are out of range");
952
953 table = pdf_xref_find_subsection(ctx, doc, i0, i1);
954 for (i = i0; i < i0 + i1; i++)
955 {
956 pdf_xref_entry *entry = &table[i-i0];
957 int a = 0;
958 int64_t b = 0;
959 int c = 0;
960
961 if (fz_is_eof(ctx, stm))
962 fz_throw(ctx, FZ_ERROR_GENERIC, "truncated xref stream");
963
964 for (n = 0; n < w0; n++)
965 a = (a << 8) + fz_read_byte(ctx, stm);
966 for (n = 0; n < w1; n++)
967 b = (b << 8) + fz_read_byte(ctx, stm);
968 for (n = 0; n < w2; n++)
969 c = (c << 8) + fz_read_byte(ctx, stm);
970
971 if (!entry->type)
972 {
973 int t = w0 ? a : 1;
974 entry->type = t == 0 ? 'f' : t == 1 ? 'n' : t == 2 ? 'o' : 0;
975 entry->ofs = w1 ? b : 0;
976 entry->gen = w2 ? c : 0;
977 entry->num = i;
978 }
979 }
980
981 doc->has_xref_streams = 1;
982}
983
/* Read an xref stream object at the current file position into the xref
 * section being populated, and return its dictionary (which doubles as
 * the trailer). The caller owns the returned reference.
 *
 * The stream dictionary must contain Size and W; Index is optional and
 * defaults to the single range [0, Size). The xref stream object itself
 * is also recorded in the populating section as an 'n' entry.
 *
 * Entered with file locked, remains locked throughout. */
static pdf_obj *
pdf_read_new_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
{
	fz_stream *stm = NULL;
	pdf_obj *trailer = NULL;
	pdf_obj *index = NULL;
	pdf_obj *obj = NULL;
	int gen, num = 0;
	int64_t ofs, stm_ofs;
	int size, w0, w1, w2;
	int t;

	fz_var(trailer);
	fz_var(stm);

	fz_try(ctx)
	{
		/* Remember where the object starts so it can be recorded in
		 * its own xref entry below. */
		ofs = fz_tell(ctx, doc->file);
		trailer = pdf_parse_ind_obj(ctx, doc, doc->file, buf, &num, &gen, &stm_ofs, NULL);
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(ctx, trailer);
		fz_rethrow(ctx);
	}

	fz_try(ctx)
	{
		pdf_xref_entry *entry;

		obj = pdf_dict_get(ctx, trailer, PDF_NAME(Size));
		if (!obj)
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing Size entry (%d 0 R)", num);

		size = pdf_to_int(ctx, obj);

		obj = pdf_dict_get(ctx, trailer, PDF_NAME(W));
		if (!obj)
			fz_throw(ctx, FZ_ERROR_GENERIC, "xref stream missing W entry (%d R)", num);
		w0 = pdf_array_get_int(ctx, obj, 0);
		w1 = pdf_array_get_int(ctx, obj, 1);
		w2 = pdf_array_get_int(ctx, obj, 2);

		if (w0 < 0)
			fz_warn(ctx, "xref stream objects have corrupt type");
		if (w1 < 0)
			fz_warn(ctx, "xref stream objects have corrupt offset");
		if (w2 < 0)
			fz_warn(ctx, "xref stream objects have corrupt generation");

		/* Negative field widths are treated as zero-width fields. */
		w0 = w0 < 0 ? 0 : w0;
		w1 = w1 < 0 ? 0 : w1;
		w2 = w2 < 0 ? 0 : w2;

		index = pdf_dict_get(ctx, trailer, PDF_NAME(Index));

		stm = pdf_open_stream_with_offset(ctx, doc, num, trailer, stm_ofs);

		if (!index)
		{
			/* No Index: a single range covering objects 0..size-1. */
			pdf_read_new_xref_section(ctx, doc, stm, 0, size, w0, w1, w2);
		}
		else
		{
			/* Index is read as (first object, count) pairs. */
			int n = pdf_array_len(ctx, index);
			for (t = 0; t < n; t += 2)
			{
				int i0 = pdf_array_get_int(ctx, index, t + 0);
				int i1 = pdf_array_get_int(ctx, index, t + 1);
				pdf_read_new_xref_section(ctx, doc, stm, i0, i1, w0, w1, w2);
			}
		}
		/* Record the xref stream object itself, caching its dictionary. */
		entry = pdf_get_populating_xref_entry(ctx, doc, num);
		entry->ofs = ofs;
		entry->gen = gen;
		entry->num = num;
		entry->stm_ofs = stm_ofs;
		pdf_drop_obj(ctx, entry->obj);
		entry->obj = pdf_keep_obj(ctx, trailer);
		entry->type = 'n';
	}
	fz_always(ctx)
	{
		fz_drop_stream(ctx, stm);
	}
	fz_catch(ctx)
	{
		pdf_drop_obj(ctx, trailer);
		fz_rethrow(ctx);
	}

	return trailer;
}
1078
1079static pdf_obj *
1080pdf_read_xref(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf)
1081{
1082 pdf_obj *trailer;
1083 int c;
1084
1085 fz_seek(ctx, doc->file, ofs, SEEK_SET);
1086
1087 while (iswhite(fz_peek_byte(ctx, doc->file)))
1088 fz_read_byte(ctx, doc->file);
1089
1090 c = fz_peek_byte(ctx, doc->file);
1091 if (c == 'x')
1092 trailer = pdf_read_old_xref(ctx, doc, buf);
1093 else if (isdigit(c))
1094 trailer = pdf_read_new_xref(ctx, doc, buf);
1095 else
1096 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot recognize xref format");
1097
1098 return trailer;
1099}
1100
/* Read the xref section at ofs into the section being populated and
 * install its trailer there. If the trailer has a non-zero XRefStm
 * entry, the xref stream it points to is read as well (its own trailer
 * is discarded). Returns the trailer's Prev offset when it is an
 * integer, or 0 when there is none. Throws on a negative XRefStm offset
 * or a non-positive Prev offset. */
static int64_t
read_xref_section(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf)
{
	pdf_obj *trailer = NULL;
	pdf_obj *prevobj;
	int64_t xrefstmofs = 0;
	int64_t prevofs = 0;

	trailer = pdf_read_xref(ctx, doc, ofs, buf);
	fz_try(ctx)
	{
		/* The section keeps its own reference to the trailer. */
		pdf_set_populating_xref_trailer(ctx, doc, trailer);

		/* FIXME: do we overwrite free entries properly? */
		/* FIXME: Does this work properly with progression? */
		xrefstmofs = pdf_to_int64(ctx, pdf_dict_get(ctx, trailer, PDF_NAME(XRefStm)));
		if (xrefstmofs)
		{
			if (xrefstmofs < 0)
				fz_throw(ctx, FZ_ERROR_GENERIC, "negative xref stream offset");

			/*
				Read the XRefStm stream, but throw away the resulting trailer. We do not
				follow any Prev tag therein, as specified on Page 108 of the PDF reference
				1.7
			*/
			pdf_drop_obj(ctx, pdf_read_xref(ctx, doc, xrefstmofs, buf));
		}

		prevobj = pdf_dict_get(ctx, trailer, PDF_NAME(Prev));
		if (pdf_is_int(ctx, prevobj))
		{
			prevofs = pdf_to_int64(ctx, prevobj);
			if (prevofs <= 0)
				fz_throw(ctx, FZ_ERROR_GENERIC, "invalid offset for previous xref section");
		}
	}
	/* Release the local trailer reference on both success and failure. */
	fz_always(ctx)
		pdf_drop_obj(ctx, trailer);
	fz_catch(ctx)
		fz_rethrow(ctx);

	return prevofs;
}
1145
1146static void
1147pdf_read_xref_sections(fz_context *ctx, pdf_document *doc, int64_t ofs, pdf_lexbuf *buf, int read_previous)
1148{
1149 int i, len, cap;
1150 int64_t *offsets;
1151
1152 len = 0;
1153 cap = 10;
1154 offsets = fz_malloc_array(ctx, cap, int64_t);
1155
1156 fz_try(ctx)
1157 {
1158 while(ofs)
1159 {
1160 for (i = 0; i < len; i ++)
1161 {
1162 if (offsets[i] == ofs)
1163 break;
1164 }
1165 if (i < len)
1166 {
1167 fz_warn(ctx, "ignoring xref section recursion at offset %d", (int)ofs);
1168 break;
1169 }
1170 if (len == cap)
1171 {
1172 cap *= 2;
1173 offsets = fz_realloc_array(ctx, offsets, cap, int64_t);
1174 }
1175 offsets[len++] = ofs;
1176
1177 pdf_populate_next_xref_level(ctx, doc);
1178 ofs = read_xref_section(ctx, doc, ofs, buf);
1179 if (!read_previous)
1180 break;
1181 }
1182 }
1183 fz_always(ctx)
1184 {
1185 fz_free(ctx, offsets);
1186 }
1187 fz_catch(ctx)
1188 {
1189 fz_rethrow(ctx);
1190 }
1191}
1192
1193static void
1194pdf_prime_xref_index(fz_context *ctx, pdf_document *doc)
1195{
1196 int i, j;
1197 int *idx = doc->xref_index;
1198
1199 for (i = doc->num_xref_sections-1; i >= 0; i--)
1200 {
1201 pdf_xref *xref = &doc->xref_sections[i];
1202 pdf_xref_subsec *subsec = xref->subsec;
1203 while (subsec != NULL)
1204 {
1205 int start = subsec->start;
1206 int end = subsec->start + subsec->len;
1207 for (j = start; j < end; j++)
1208 {
1209 char t = subsec->table[j-start].type;
1210 if (t != 0 && t != 'f')
1211 idx[j] = i;
1212 }
1213
1214 subsec = subsec->next;
1215 }
1216 }
1217}
1218
1219/*
1220 * load xref tables from pdf
1221 *
1222 * File locked on entry, throughout and on exit.
1223 */
1224
1225static void
1226pdf_load_xref(fz_context *ctx, pdf_document *doc, pdf_lexbuf *buf)
1227{
1228 int i;
1229 int xref_len;
1230 pdf_xref_entry *entry;
1231
1232 pdf_read_start_xref(ctx, doc);
1233
1234 pdf_read_xref_sections(ctx, doc, doc->startxref, buf, 1);
1235
1236 if (pdf_xref_len(ctx, doc) == 0)
1237 fz_throw(ctx, FZ_ERROR_GENERIC, "found xref was empty");
1238
1239 pdf_prime_xref_index(ctx, doc);
1240
1241 entry = pdf_get_xref_entry(ctx, doc, 0);
1242 /* broken pdfs where first object is missing */
1243 if (!entry->type)
1244 {
1245 entry->type = 'f';
1246 entry->gen = 65535;
1247 entry->num = 0;
1248 }
1249 /* broken pdfs where first object is not free */
1250 else if (entry->type != 'f')
1251 fz_warn(ctx, "first object in xref is not free");
1252
1253 /* broken pdfs where object offsets are out of range */
1254 xref_len = pdf_xref_len(ctx, doc);
1255 for (i = 0; i < xref_len; i++)
1256 {
1257 entry = pdf_get_xref_entry(ctx, doc, i);
1258 if (entry->type == 'n')
1259 {
1260 /* Special case code: "0000000000 * n" means free,
1261 * according to some producers (inc Quartz) */
1262 if (entry->ofs == 0)
1263 entry->type = 'f';
1264 else if (entry->ofs <= 0 || entry->ofs >= doc->file_size)
1265 fz_throw(ctx, FZ_ERROR_GENERIC, "object offset out of range: %d (%d 0 R)", (int)entry->ofs, i);
1266 }
1267 if (entry->type == 'o')
1268 {
1269 /* Read this into a local variable here, because pdf_get_xref_entry
1270 * may solidify the xref, hence invalidating "entry", meaning we
1271 * need a stashed value for the throw. */
1272 int64_t ofs = entry->ofs;
1273 if (ofs <= 0 || ofs >= xref_len || pdf_get_xref_entry(ctx, doc, ofs)->type != 'n')
1274 fz_throw(ctx, FZ_ERROR_GENERIC, "invalid reference to an objstm that does not exist: %d (%d 0 R)", (int)ofs, i);
1275 }
1276 }
1277}
1278
1279static void
1280pdf_load_linear(fz_context *ctx, pdf_document *doc)
1281{
1282 pdf_obj *dict = NULL;
1283 pdf_obj *hint = NULL;
1284 pdf_obj *o;
1285 int num, gen, lin, len;
1286 int64_t stmofs;
1287
1288 fz_var(dict);
1289 fz_var(hint);
1290
1291 fz_try(ctx)
1292 {
1293 pdf_xref_entry *entry;
1294
1295 dict = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, &num, &gen, &stmofs, NULL);
1296 if (!pdf_is_dict(ctx, dict))
1297 fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary");
1298 o = pdf_dict_get(ctx, dict, PDF_NAME(Linearized));
1299 if (o == NULL)
1300 fz_throw(ctx, FZ_ERROR_GENERIC, "Failed to read linearized dictionary");
1301 lin = pdf_to_int(ctx, o);
1302 if (lin != 1)
1303 fz_throw(ctx, FZ_ERROR_GENERIC, "Unexpected version of Linearized tag (%d)", lin);
1304 len = pdf_dict_get_int(ctx, dict, PDF_NAME(L));
1305 if (len != doc->file_length)
1306 fz_throw(ctx, FZ_ERROR_GENERIC, "File has been updated since linearization");
1307
1308 pdf_read_xref_sections(ctx, doc, fz_tell(ctx, doc->file), &doc->lexbuf.base, 0);
1309
1310 doc->linear_page_count = pdf_dict_get_int(ctx, dict, PDF_NAME(N));
1311 doc->linear_page_refs = fz_realloc_array(ctx, doc->linear_page_refs, doc->linear_page_count, pdf_obj *);
1312 memset(doc->linear_page_refs, 0, doc->linear_page_count * sizeof(pdf_obj*));
1313 doc->linear_obj = dict;
1314 doc->linear_pos = fz_tell(ctx, doc->file);
1315 doc->linear_page1_obj_num = pdf_dict_get_int(ctx, dict, PDF_NAME(O));
1316 doc->linear_page_refs[0] = pdf_new_indirect(ctx, doc, doc->linear_page1_obj_num, 0);
1317 doc->linear_page_num = 0;
1318 hint = pdf_dict_get(ctx, dict, PDF_NAME(H));
1319 doc->hint_object_offset = pdf_array_get_int(ctx, hint, 0);
1320 doc->hint_object_length = pdf_array_get_int(ctx, hint, 1);
1321
1322 entry = pdf_get_populating_xref_entry(ctx, doc, 0);
1323 entry->type = 'f';
1324 }
1325 fz_catch(ctx)
1326 {
1327 pdf_drop_obj(ctx, dict);
1328 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1329 /* Drop back to non linearized reading mode */
1330 doc->file_reading_linearly = 0;
1331 }
1332}
1333
1334/*
1335 * Initialize and load xref tables.
1336 * If password is not null, try to decrypt.
1337 */
1338
1339static void
1340pdf_init_document(fz_context *ctx, pdf_document *doc)
1341{
1342 pdf_obj *encrypt, *id;
1343 pdf_obj *dict = NULL;
1344 pdf_obj *obj;
1345 pdf_obj *nobj = NULL;
1346 int i, repaired = 0;
1347
1348 fz_var(dict);
1349 fz_var(nobj);
1350
1351 fz_try(ctx)
1352 {
1353 /* Check to see if we should work in progressive mode */
1354 if (doc->file->progressive)
1355 {
1356 doc->file_reading_linearly = 1;
1357 fz_seek(ctx, doc->file, 0, SEEK_END);
1358 doc->file_length = fz_tell(ctx, doc->file);
1359 if (doc->file_length < 0)
1360 doc->file_length = 0;
1361 fz_seek(ctx, doc->file, 0, SEEK_SET);
1362 }
1363
1364 pdf_load_version(ctx, doc);
1365
1366 /* Try to load the linearized file if we are in progressive
1367 * mode. */
1368 if (doc->file_reading_linearly)
1369 pdf_load_linear(ctx, doc);
1370
1371 /* If we aren't in progressive mode (or the linear load failed
1372 * and has set us back to non-progressive mode), load normally.
1373 */
1374 if (!doc->file_reading_linearly)
1375 pdf_load_xref(ctx, doc, &doc->lexbuf.base);
1376 }
1377 fz_catch(ctx)
1378 {
1379 pdf_drop_xref_sections(ctx, doc);
1380 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1381 fz_warn(ctx, "trying to repair broken xref");
1382 repaired = 1;
1383 }
1384
1385 fz_try(ctx)
1386 {
1387 int hasroot, hasinfo;
1388
1389 if (repaired)
1390 {
1391 /* pdf_repair_xref may access xref_index, so reset it properly */
1392 if (doc->xref_index)
1393 memset(doc->xref_index, 0, sizeof(int) * doc->max_xref_len);
1394 pdf_repair_xref(ctx, doc);
1395 pdf_prime_xref_index(ctx, doc);
1396 }
1397
1398 encrypt = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Encrypt));
1399 id = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(ID));
1400 if (pdf_is_dict(ctx, encrypt))
1401 doc->crypt = pdf_new_crypt(ctx, encrypt, id);
1402
1403 /* Allow lazy clients to read encrypted files with a blank password */
1404 pdf_authenticate_password(ctx, doc, "");
1405
1406 if (repaired)
1407 {
1408 int xref_len = pdf_xref_len(ctx, doc);
1409 pdf_repair_obj_stms(ctx, doc);
1410
1411 hasroot = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)) != NULL);
1412 hasinfo = (pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info)) != NULL);
1413
1414 for (i = 1; i < xref_len && !hasinfo && !hasroot; ++i)
1415 {
1416 pdf_xref_entry *entry = pdf_get_xref_entry(ctx, doc, i);
1417 if (entry->type == 0 || entry->type == 'f')
1418 continue;
1419
1420 fz_try(ctx)
1421 {
1422 dict = pdf_load_object(ctx, doc, i);
1423 }
1424 fz_catch(ctx)
1425 {
1426 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1427 fz_warn(ctx, "ignoring broken object (%d 0 R)", i);
1428 continue;
1429 }
1430
1431 if (!hasroot)
1432 {
1433 obj = pdf_dict_get(ctx, dict, PDF_NAME(Type));
1434 if (pdf_name_eq(ctx, obj, PDF_NAME(Catalog)))
1435 {
1436 nobj = pdf_new_indirect(ctx, doc, i, 0);
1437 pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), nobj);
1438 hasroot = 1;
1439 }
1440 }
1441
1442 if (!hasinfo)
1443 {
1444 if (pdf_dict_get(ctx, dict, PDF_NAME(Creator)) || pdf_dict_get(ctx, dict, PDF_NAME(Producer)))
1445 {
1446 nobj = pdf_new_indirect(ctx, doc, i, 0);
1447 pdf_dict_put_drop(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info), nobj);
1448 hasinfo = 1;
1449 }
1450 }
1451
1452 pdf_drop_obj(ctx, dict);
1453 dict = NULL;
1454 }
1455
1456 /* ensure that strings are not used in their repaired, non-decrypted form */
1457 if (doc->crypt)
1458 pdf_clear_xref(ctx, doc);
1459 }
1460 }
1461 fz_catch(ctx)
1462 {
1463 pdf_drop_obj(ctx, dict);
1464 fz_rethrow(ctx);
1465 }
1466
1467 fz_try(ctx)
1468 {
1469 pdf_read_ocg(ctx, doc);
1470 }
1471 fz_catch(ctx)
1472 {
1473 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1474 fz_warn(ctx, "Ignoring broken Optional Content configuration");
1475 }
1476
1477 fz_try(ctx)
1478 {
1479 const char *version_str;
1480 obj = pdf_dict_getl(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root), PDF_NAME(Version), NULL);
1481 version_str = pdf_to_name(ctx, obj);
1482 if (*version_str)
1483 {
1484 int version = 10 * (fz_atof(version_str) + 0.05f);
1485 if (version > doc->version)
1486 doc->version = version;
1487 }
1488 }
1489 fz_catch(ctx) { }
1490}
1491
1492static void
1493pdf_drop_document_imp(fz_context *ctx, pdf_document *doc)
1494{
1495 int i;
1496
1497 fz_defer_reap_start(ctx);
1498
1499 /* Type3 glyphs in the glyph cache can contain pdf_obj pointers
1500 * that we are about to destroy. Simplest solution is to bin the
1501 * glyph cache at this point. */
1502 fz_try(ctx)
1503 fz_purge_glyph_cache(ctx);
1504 fz_catch(ctx)
1505 {
1506 /* Swallow error, but continue dropping */
1507 }
1508
1509 pdf_drop_js(ctx, doc->js);
1510
1511 pdf_drop_xref_sections(ctx, doc);
1512 fz_free(ctx, doc->xref_index);
1513
1514 fz_drop_stream(ctx, doc->file);
1515 pdf_drop_crypt(ctx, doc->crypt);
1516
1517 pdf_drop_obj(ctx, doc->linear_obj);
1518 if (doc->linear_page_refs)
1519 {
1520 for (i=0; i < doc->linear_page_count; i++)
1521 pdf_drop_obj(ctx, doc->linear_page_refs[i]);
1522
1523 fz_free(ctx, doc->linear_page_refs);
1524 }
1525
1526 fz_free(ctx, doc->hint_page);
1527 fz_free(ctx, doc->hint_shared_ref);
1528 fz_free(ctx, doc->hint_shared);
1529 fz_free(ctx, doc->hint_obj_offsets);
1530
1531 for (i=0; i < doc->num_type3_fonts; i++)
1532 {
1533 fz_try(ctx)
1534 fz_decouple_type3_font(ctx, doc->type3_fonts[i], (void *)doc);
1535 fz_always(ctx)
1536 fz_drop_font(ctx, doc->type3_fonts[i]);
1537 fz_catch(ctx)
1538 {
1539 /* Swallow error, but continue dropping */
1540 }
1541 }
1542
1543 fz_free(ctx, doc->type3_fonts);
1544
1545 pdf_drop_ocg(ctx, doc);
1546
1547 pdf_empty_store(ctx, doc);
1548
1549 pdf_lexbuf_fin(ctx, &doc->lexbuf.base);
1550
1551 pdf_drop_resource_tables(ctx, doc);
1552
1553 fz_drop_colorspace(ctx, doc->oi);
1554
1555 for (i = 0; i < doc->orphans_count; i++)
1556 pdf_drop_obj(ctx, doc->orphans[i]);
1557
1558 fz_free(ctx, doc->orphans);
1559
1560 fz_free(ctx, doc->rev_page_map);
1561
1562 fz_defer_reap_end(ctx);
1563}
1564
1565/*
1566 Closes and frees an opened PDF document.
1567
1568 The resource store in the context associated with pdf_document
1569 is emptied.
1570*/
1571void
1572pdf_drop_document(fz_context *ctx, pdf_document *doc)
1573{
1574 fz_drop_document(ctx, &doc->super);
1575}
1576
1577pdf_document *
1578pdf_keep_document(fz_context *ctx, pdf_document *doc)
1579{
1580 return (pdf_document *)fz_keep_document(ctx, &doc->super);
1581}
1582
1583/*
1584 * compressed object streams
1585 */
1586
1587static pdf_xref_entry *
1588pdf_load_obj_stm(fz_context *ctx, pdf_document *doc, int num, pdf_lexbuf *buf, int target)
1589{
1590 fz_stream *stm = NULL;
1591 pdf_obj *objstm = NULL;
1592 int *numbuf = NULL;
1593 int64_t *ofsbuf = NULL;
1594
1595 pdf_obj *obj;
1596 int64_t first;
1597 int count;
1598 int i;
1599 pdf_token tok;
1600 pdf_xref_entry *ret_entry = NULL;
1601 int xref_len;
1602 int found;
1603
1604 fz_var(numbuf);
1605 fz_var(ofsbuf);
1606 fz_var(objstm);
1607 fz_var(stm);
1608
1609 fz_try(ctx)
1610 {
1611 objstm = pdf_load_object(ctx, doc, num);
1612
1613 if (pdf_obj_marked(ctx, objstm))
1614 fz_throw(ctx, FZ_ERROR_GENERIC, "recursive object stream lookup");
1615 }
1616 fz_catch(ctx)
1617 {
1618 pdf_drop_obj(ctx, objstm);
1619 fz_rethrow(ctx);
1620 }
1621
1622 fz_try(ctx)
1623 {
1624 pdf_mark_obj(ctx, objstm);
1625
1626 count = pdf_dict_get_int(ctx, objstm, PDF_NAME(N));
1627 first = pdf_dict_get_int(ctx, objstm, PDF_NAME(First));
1628
1629 if (count < 0 || count > PDF_MAX_OBJECT_NUMBER)
1630 fz_throw(ctx, FZ_ERROR_GENERIC, "number of objects in object stream out of range");
1631 if (first < 0 || first > PDF_MAX_OBJECT_NUMBER
1632 || count < 0 || count > PDF_MAX_OBJECT_NUMBER
1633 || first + count - 1 > PDF_MAX_OBJECT_NUMBER)
1634 fz_throw(ctx, FZ_ERROR_GENERIC, "object stream object numbers are out of range");
1635
1636 numbuf = fz_calloc(ctx, count, sizeof(*numbuf));
1637 ofsbuf = fz_calloc(ctx, count, sizeof(*ofsbuf));
1638
1639 xref_len = pdf_xref_len(ctx, doc);
1640
1641 found = 0;
1642
1643 stm = pdf_open_stream_number(ctx, doc, num);
1644 for (i = 0; i < count; i++)
1645 {
1646 tok = pdf_lex(ctx, stm, buf);
1647 if (tok != PDF_TOK_INT)
1648 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num);
1649 numbuf[found] = buf->i;
1650
1651 tok = pdf_lex(ctx, stm, buf);
1652 if (tok != PDF_TOK_INT)
1653 fz_throw(ctx, FZ_ERROR_GENERIC, "corrupt object stream (%d 0 R)", num);
1654 ofsbuf[found] = buf->i;
1655
1656 if (numbuf[found] <= 0 || numbuf[found] >= xref_len)
1657 fz_warn(ctx, "object stream object out of range, skipping");
1658 else
1659 found++;
1660 }
1661
1662 for (i = 0; i < found; i++)
1663 {
1664 pdf_xref_entry *entry;
1665
1666 fz_seek(ctx, stm, first + ofsbuf[i], SEEK_SET);
1667
1668 obj = pdf_parse_stm_obj(ctx, doc, stm, buf);
1669
1670 entry = pdf_get_xref_entry(ctx, doc, numbuf[i]);
1671
1672 pdf_set_obj_parent(ctx, obj, numbuf[i]);
1673
1674 if (entry->type == 'o' && entry->ofs == num)
1675 {
1676 /* If we already have an entry for this object,
1677 * we'd like to drop it and use the new one -
1678 * but this means that anyone currently holding
1679 * a pointer to the old one will be left with a
1680 * stale pointer. Instead, we drop the new one
1681 * and trust that the old one is correct. */
1682 if (entry->obj)
1683 {
1684 if (pdf_objcmp(ctx, entry->obj, obj))
1685 fz_warn(ctx, "Encountered new definition for object %d - keeping the original one", numbuf[i]);
1686 pdf_drop_obj(ctx, obj);
1687 }
1688 else
1689 {
1690 entry->obj = obj;
1691 fz_drop_buffer(ctx, entry->stm_buf);
1692 entry->stm_buf = NULL;
1693 }
1694 if (numbuf[i] == target)
1695 ret_entry = entry;
1696 }
1697 else
1698 {
1699 pdf_drop_obj(ctx, obj);
1700 }
1701 }
1702 }
1703 fz_always(ctx)
1704 {
1705 fz_drop_stream(ctx, stm);
1706 fz_free(ctx, ofsbuf);
1707 fz_free(ctx, numbuf);
1708 pdf_unmark_obj(ctx, objstm);
1709 pdf_drop_obj(ctx, objstm);
1710 }
1711 fz_catch(ctx)
1712 {
1713 fz_rethrow(ctx);
1714 }
1715 return ret_entry;
1716}
1717
1718/*
1719 * object loading
1720 */
1721static int
1722pdf_obj_read(fz_context *ctx, pdf_document *doc, int64_t *offset, int *nump, pdf_obj **page)
1723{
1724 pdf_lexbuf *buf = &doc->lexbuf.base;
1725 int num, gen, tok;
1726 int64_t numofs, genofs, stmofs, tmpofs, newtmpofs;
1727 int xref_len;
1728 pdf_xref_entry *entry;
1729
1730 numofs = *offset;
1731 fz_seek(ctx, doc->file, numofs, SEEK_SET);
1732
1733 /* We expect to read 'num' here */
1734 tok = pdf_lex(ctx, doc->file, buf);
1735 genofs = fz_tell(ctx, doc->file);
1736 if (tok != PDF_TOK_INT)
1737 {
1738 /* Failed! */
1739 DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, *offset));
1740 *offset = genofs;
1741 return tok == PDF_TOK_EOF;
1742 }
1743 *nump = num = buf->i;
1744
1745 /* We expect to read 'gen' here */
1746 tok = pdf_lex(ctx, doc->file, buf);
1747 tmpofs = fz_tell(ctx, doc->file);
1748 if (tok != PDF_TOK_INT)
1749 {
1750 /* Failed! */
1751 DEBUGMESS((ctx, "skipping unexpected data after \"%d\" (tok=%d) at %d", num, tok, *offset));
1752 *offset = tmpofs;
1753 return tok == PDF_TOK_EOF;
1754 }
1755 gen = buf->i;
1756
1757 /* We expect to read 'obj' here */
1758 do
1759 {
1760 tmpofs = fz_tell(ctx, doc->file);
1761 tok = pdf_lex(ctx, doc->file, buf);
1762 if (tok == PDF_TOK_OBJ)
1763 break;
1764 if (tok != PDF_TOK_INT)
1765 {
1766 DEBUGMESS((ctx, "skipping unexpected data (tok=%d) at %d", tok, tmpofs));
1767 *offset = fz_tell(ctx, doc->file);
1768 return tok == PDF_TOK_EOF;
1769 }
1770 DEBUGMESS((ctx, "skipping unexpected int %d at %d", num, numofs));
1771 *nump = num = gen;
1772 numofs = genofs;
1773 gen = buf->i;
1774 genofs = tmpofs;
1775 }
1776 while (1);
1777
1778 /* Now we read the actual object */
1779 xref_len = pdf_xref_len(ctx, doc);
1780
1781 /* When we are reading a progressive file, we typically see:
1782 * File Header
1783 * obj m (Linearization params)
1784 * xref #1 (refers to objects m-n)
1785 * obj m+1
1786 * ...
1787 * obj n
1788 * obj 1
1789 * ...
1790 * obj n-1
1791 * xref #2
1792 *
1793 * The linearisation params are read elsewhere, hence
1794 * whenever we read an object it should just go into the
1795 * previous xref.
1796 */
1797 tok = pdf_repair_obj(ctx, doc, buf, &stmofs, NULL, NULL, NULL, page, &newtmpofs, NULL);
1798
1799 do /* So we can break out of it */
1800 {
1801 if (num <= 0 || num >= xref_len)
1802 {
1803 fz_warn(ctx, "Not a valid object number (%d %d obj)", num, gen);
1804 break;
1805 }
1806 if (gen != 0)
1807 {
1808 fz_warn(ctx, "Unexpected non zero generation number in linearized file");
1809 }
1810 entry = pdf_get_populating_xref_entry(ctx, doc, num);
1811 if (entry->type != 0)
1812 {
1813 DEBUGMESS((ctx, "Duplicate object found (%d %d obj)", num, gen));
1814 break;
1815 }
1816 if (page && *page)
1817 {
1818 DEBUGMESS((ctx, "Successfully read object %d @ %d - and found page %d!", num, numofs, doc->linear_page_num));
1819 if (!entry->obj)
1820 entry->obj = pdf_keep_obj(ctx, *page);
1821
1822 if (doc->linear_page_refs[doc->linear_page_num] == NULL)
1823 doc->linear_page_refs[doc->linear_page_num] = pdf_new_indirect(ctx, doc, num, gen);
1824 }
1825 else
1826 {
1827 DEBUGMESS((ctx, "Successfully read object %d @ %d", num, numofs));
1828 }
1829 entry->type = 'n';
1830 entry->gen = gen; // XXX: was 0
1831 entry->num = num;
1832 entry->ofs = numofs;
1833 entry->stm_ofs = stmofs;
1834 }
1835 while (0);
1836 if (page && *page)
1837 doc->linear_page_num++;
1838
1839 if (tok == PDF_TOK_ENDOBJ)
1840 {
1841 *offset = fz_tell(ctx, doc->file);
1842 }
1843 else
1844 {
1845 *offset = newtmpofs;
1846 }
1847 return 0;
1848}
1849
1850static void
1851pdf_load_hinted_page(fz_context *ctx, pdf_document *doc, int pagenum)
1852{
1853 pdf_obj *page = NULL;
1854
1855 if (!doc->hints_loaded || !doc->linear_page_refs)
1856 return;
1857
1858 if (doc->linear_page_refs[pagenum])
1859 return;
1860
1861 fz_var(page);
1862
1863 fz_try(ctx)
1864 {
1865 int num = doc->hint_page[pagenum].number;
1866 page = pdf_load_object(ctx, doc, num);
1867 if (pdf_name_eq(ctx, PDF_NAME(Page), pdf_dict_get(ctx, page, PDF_NAME(Type))))
1868 {
1869 /* We have found the page object! */
1870 DEBUGMESS((ctx, "LoadHintedPage pagenum=%d num=%d", pagenum, num));
1871 doc->linear_page_refs[pagenum] = pdf_new_indirect(ctx, doc, num, 0);
1872 }
1873 }
1874 fz_always(ctx)
1875 pdf_drop_obj(ctx, page);
1876 fz_catch(ctx)
1877 {
1878 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1879 /* Silently swallow the error and proceed as normal */
1880 }
1881}
1882
1883static int
1884read_hinted_object(fz_context *ctx, pdf_document *doc, int num)
1885{
1886 /* Try to find the object using our hint table. Find the closest
1887 * object <= the one we want that has a hint and read forward from
1888 * there. */
1889 int expected = num;
1890 int curr_pos;
1891 int64_t start, offset;
1892
1893 while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
1894 expected--;
1895 if (expected != num)
1896 DEBUGMESS((ctx, "object %d is unhinted, will search forward from %d", expected, num));
1897 if (expected == 0) /* No hints found, just bail */
1898 return 0;
1899
1900 curr_pos = fz_tell(ctx, doc->file);
1901 offset = doc->hint_obj_offsets[expected];
1902
1903 fz_var(expected);
1904
1905 fz_try(ctx)
1906 {
1907 int found;
1908
1909 /* Try to read forward from there */
1910 do
1911 {
1912 start = offset;
1913 DEBUGMESS((ctx, "Searching for object %d @ %d", expected, offset));
1914 pdf_obj_read(ctx, doc, &offset, &found, 0);
1915 DEBUGMESS((ctx, "Found object %d - next will be @ %d", found, offset));
1916 if (found <= expected)
1917 {
1918 /* We found the right one (or one earlier than
1919 * we expected). Update the hints. */
1920 doc->hint_obj_offsets[expected] = offset;
1921 doc->hint_obj_offsets[found] = start;
1922 doc->hint_obj_offsets[found+1] = offset;
1923 /* Retry with the next one */
1924 expected = found+1;
1925 }
1926 else
1927 {
1928 /* We found one later than we expected. */
1929 doc->hint_obj_offsets[expected] = 0;
1930 doc->hint_obj_offsets[found] = start;
1931 doc->hint_obj_offsets[found+1] = offset;
1932 while (doc->hint_obj_offsets[expected] == 0 && expected > 0)
1933 expected--;
1934 if (expected == 0) /* No hints found, we give up */
1935 break;
1936 }
1937 }
1938 while (found != num);
1939 }
1940 fz_always(ctx)
1941 {
1942 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
1943 }
1944 fz_catch(ctx)
1945 {
1946 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
1947 /* FIXME: Currently we ignore the hint. Perhaps we should
1948 * drop back to non-hinted operation here. */
1949 doc->hint_obj_offsets[expected] = 0;
1950 fz_rethrow(ctx);
1951 }
1952 return expected != 0;
1953}
1954
1955pdf_obj *
1956pdf_load_unencrypted_object(fz_context *ctx, pdf_document *doc, int num)
1957{
1958 pdf_xref_entry *x;
1959
1960 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
1961 fz_throw(ctx, FZ_ERROR_GENERIC, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
1962
1963 x = pdf_get_xref_entry(ctx, doc, num);
1964 if (x->type == 'n')
1965 {
1966 fz_seek(ctx, doc->file, x->ofs, SEEK_SET);
1967 return pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base, NULL, NULL, NULL, NULL);
1968 }
1969 return NULL;
1970}
1971
1972pdf_xref_entry *
1973pdf_cache_object(fz_context *ctx, pdf_document *doc, int num)
1974{
1975 pdf_xref_entry *x;
1976 int rnum, rgen, try_repair;
1977
1978 fz_var(try_repair);
1979
1980 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
1981 fz_throw(ctx, FZ_ERROR_GENERIC, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
1982
1983object_updated:
1984 try_repair = 0;
1985 rnum = num;
1986
1987 x = pdf_get_xref_entry(ctx, doc, num);
1988
1989 if (x->obj != NULL)
1990 return x;
1991
1992 if (x->type == 'f')
1993 {
1994 x->obj = PDF_NULL;
1995 }
1996 else if (x->type == 'n')
1997 {
1998 fz_seek(ctx, doc->file, x->ofs, SEEK_SET);
1999
2000 fz_try(ctx)
2001 {
2002 x->obj = pdf_parse_ind_obj(ctx, doc, doc->file, &doc->lexbuf.base,
2003 &rnum, &rgen, &x->stm_ofs, &try_repair);
2004 }
2005 fz_catch(ctx)
2006 {
2007 if (!try_repair || fz_caught(ctx) == FZ_ERROR_TRYLATER)
2008 fz_rethrow(ctx);
2009 }
2010
2011 if (!try_repair && rnum != num)
2012 {
2013 pdf_drop_obj(ctx, x->obj);
2014 x->type = 'f';
2015 x->ofs = -1;
2016 x->gen = 0;
2017 x->num = 0;
2018 x->stm_ofs = 0;
2019 x->obj = NULL;
2020 try_repair = (doc->repair_attempted == 0);
2021 }
2022
2023 if (try_repair)
2024 {
2025 fz_try(ctx)
2026 {
2027 pdf_repair_xref(ctx, doc);
2028 pdf_prime_xref_index(ctx, doc);
2029 pdf_repair_obj_stms(ctx, doc);
2030 }
2031 fz_catch(ctx)
2032 {
2033 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2034 if (rnum == num)
2035 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot parse object (%d 0 R)", num);
2036 else
2037 fz_throw(ctx, FZ_ERROR_GENERIC, "found object (%d 0 R) instead of (%d 0 R)", rnum, num);
2038 }
2039 goto object_updated;
2040 }
2041
2042 if (doc->crypt)
2043 pdf_crypt_obj(ctx, doc->crypt, x->obj, x->num, x->gen);
2044 }
2045 else if (x->type == 'o')
2046 {
2047 if (!x->obj)
2048 {
2049 x = pdf_load_obj_stm(ctx, doc, x->ofs, &doc->lexbuf.base, num);
2050 if (x == NULL)
2051 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot load object stream containing object (%d 0 R)", num);
2052 if (!x->obj)
2053 fz_throw(ctx, FZ_ERROR_GENERIC, "object (%d 0 R) was not found in its object stream", num);
2054 }
2055 }
2056 else if (doc->hint_obj_offsets && read_hinted_object(ctx, doc, num))
2057 {
2058 goto object_updated;
2059 }
2060 else if (doc->file_length && doc->linear_pos < doc->file_length)
2061 {
2062 fz_throw(ctx, FZ_ERROR_TRYLATER, "cannot find object in xref (%d 0 R) - not loaded yet?", num);
2063 }
2064 else
2065 {
2066 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot find object in xref (%d 0 R)", num);
2067 }
2068
2069 pdf_set_obj_parent(ctx, x->obj, num);
2070 return x;
2071}
2072
2073pdf_obj *
2074pdf_load_object(fz_context *ctx, pdf_document *doc, int num)
2075{
2076 pdf_xref_entry *entry = pdf_cache_object(ctx, doc, num);
2077 return pdf_keep_obj(ctx, entry->obj);
2078}
2079
2080pdf_obj *
2081pdf_resolve_indirect(fz_context *ctx, pdf_obj *ref)
2082{
2083 if (pdf_is_indirect(ctx, ref))
2084 {
2085 pdf_document *doc = pdf_get_indirect_document(ctx, ref);
2086 int num = pdf_to_num(ctx, ref);
2087 pdf_xref_entry *entry;
2088
2089 if (!doc)
2090 return NULL;
2091 if (num <= 0)
2092 {
2093 fz_warn(ctx, "invalid indirect reference (%d 0 R)", num);
2094 return NULL;
2095 }
2096
2097 fz_try(ctx)
2098 entry = pdf_cache_object(ctx, doc, num);
2099 fz_catch(ctx)
2100 {
2101 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2102 fz_warn(ctx, "cannot load object (%d 0 R) into cache", num);
2103 return NULL;
2104 }
2105
2106 ref = entry->obj;
2107 }
2108 return ref;
2109}
2110
2111pdf_obj *
2112pdf_resolve_indirect_chain(fz_context *ctx, pdf_obj *ref)
2113{
2114 int sanity = 10;
2115
2116 while (pdf_is_indirect(ctx, ref))
2117 {
2118 if (--sanity == 0)
2119 {
2120 fz_warn(ctx, "too many indirections (possible indirection cycle involving %d 0 R)", pdf_to_num(ctx, ref));
2121 return NULL;
2122 }
2123
2124 ref = pdf_resolve_indirect(ctx, ref);
2125 }
2126
2127 return ref;
2128}
2129
2130int
2131pdf_count_objects(fz_context *ctx, pdf_document *doc)
2132{
2133 return pdf_xref_len(ctx, doc);
2134}
2135
2136/*
2137 Allocate a slot in the xref table and return a fresh unused object number.
2138*/
2139int
2140pdf_create_object(fz_context *ctx, pdf_document *doc)
2141{
2142 /* TODO: reuse free object slots by properly linking free object chains in the ofs field */
2143 pdf_xref_entry *entry;
2144 int num = pdf_xref_len(ctx, doc);
2145
2146 if (num > PDF_MAX_OBJECT_NUMBER)
2147 fz_throw(ctx, FZ_ERROR_GENERIC, "too many objects stored in pdf");
2148
2149 entry = pdf_get_incremental_xref_entry(ctx, doc, num);
2150 entry->type = 'f';
2151 entry->ofs = -1;
2152 entry->gen = 0;
2153 entry->num = num;
2154 entry->stm_ofs = 0;
2155 entry->stm_buf = NULL;
2156 entry->obj = NULL;
2157 return num;
2158}
2159
2160/*
2161 Remove object from xref table, marking the slot as free.
2162*/
2163void
2164pdf_delete_object(fz_context *ctx, pdf_document *doc, int num)
2165{
2166 pdf_xref_entry *x;
2167
2168 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2169 {
2170 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2171 return;
2172 }
2173
2174 x = pdf_get_incremental_xref_entry(ctx, doc, num);
2175
2176 fz_drop_buffer(ctx, x->stm_buf);
2177 pdf_drop_obj(ctx, x->obj);
2178
2179 x->type = 'f';
2180 x->ofs = 0;
2181 x->gen += 1;
2182 x->num = 0;
2183 x->stm_ofs = 0;
2184 x->stm_buf = NULL;
2185 x->obj = NULL;
2186}
2187
2188/*
2189 Replace object in xref table with the passed in object.
2190*/
2191void
2192pdf_update_object(fz_context *ctx, pdf_document *doc, int num, pdf_obj *newobj)
2193{
2194 pdf_xref_entry *x;
2195
2196 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2197 {
2198 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2199 return;
2200 }
2201
2202 if (!newobj)
2203 {
2204 pdf_delete_object(ctx, doc, num);
2205 return;
2206 }
2207
2208 x = pdf_get_incremental_xref_entry(ctx, doc, num);
2209
2210 pdf_drop_obj(ctx, x->obj);
2211
2212 x->type = 'n';
2213 x->ofs = 0;
2214 x->obj = pdf_keep_obj(ctx, newobj);
2215
2216 pdf_set_obj_parent(ctx, newobj, num);
2217}
2218
2219/*
2220 Replace stream contents for object in xref table with the passed in buffer.
2221
2222 The buffer contents must match the /Filter setting if 'compressed' is true.
2223 If 'compressed' is false, the /Filter and /DecodeParms entries are deleted.
2224 The /Length entry is updated.
2225*/
2226void
2227pdf_update_stream(fz_context *ctx, pdf_document *doc, pdf_obj *obj, fz_buffer *newbuf, int compressed)
2228{
2229 int num;
2230 pdf_xref_entry *x;
2231
2232 if (pdf_is_indirect(ctx, obj))
2233 num = pdf_to_num(ctx, obj);
2234 else
2235 num = pdf_obj_parent_num(ctx, obj);
2236 if (num <= 0 || num >= pdf_xref_len(ctx, doc))
2237 {
2238 fz_warn(ctx, "object out of range (%d 0 R); xref size %d", num, pdf_xref_len(ctx, doc));
2239 return;
2240 }
2241
2242 x = pdf_get_xref_entry(ctx, doc, num);
2243
2244 fz_drop_buffer(ctx, x->stm_buf);
2245 x->stm_buf = fz_keep_buffer(ctx, newbuf);
2246
2247 pdf_dict_put_int(ctx, obj, PDF_NAME(Length), (int)fz_buffer_storage(ctx, newbuf, NULL));
2248 if (!compressed)
2249 {
2250 pdf_dict_del(ctx, obj, PDF_NAME(Filter));
2251 pdf_dict_del(ctx, obj, PDF_NAME(DecodeParms));
2252 }
2253}
2254
2255int
2256pdf_lookup_metadata(fz_context *ctx, pdf_document *doc, const char *key, char *buf, int size)
2257{
2258 if (!strcmp(key, "format"))
2259 return (int)fz_snprintf(buf, size, "PDF %d.%d", doc->version/10, doc->version % 10);
2260
2261 if (!strcmp(key, "encryption"))
2262 {
2263 if (doc->crypt)
2264 return (int)fz_snprintf(buf, size, "Standard V%d R%d %d-bit %s",
2265 pdf_crypt_version(ctx, doc->crypt),
2266 pdf_crypt_revision(ctx, doc->crypt),
2267 pdf_crypt_length(ctx, doc->crypt),
2268 pdf_crypt_method(ctx, doc->crypt));
2269 else
2270 return (int)fz_strlcpy(buf, "None", size);
2271 }
2272
2273 if (strstr(key, "info:") == key)
2274 {
2275 pdf_obj *info;
2276 const char *s;
2277 int n;
2278
2279 info = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Info));
2280 if (!info)
2281 return -1;
2282
2283 info = pdf_dict_gets(ctx, info, key + 5);
2284 if (!info)
2285 return -1;
2286
2287 s = pdf_to_text_string(ctx, info);
2288 n = (int)fz_strlcpy(buf, s, size);
2289 return n;
2290 }
2291
2292 return -1;
2293}
2294
2295/*
2296 Initializers for the fz_document interface.
2297
2298 The functions are split across two files to allow calls to a
2299 version of the constructor that does not link in the interpreter.
2300 The interpreter references the built-in font and cmap resources
2301 which are quite big. Not linking those into the mutool binary
2302 saves roughly 6MB of space.
2303*/
2304
2305static pdf_document *
2306pdf_new_document(fz_context *ctx, fz_stream *file)
2307{
2308 pdf_document *doc = fz_new_derived_document(ctx, pdf_document);
2309
2310 doc->super.drop_document = (fz_document_drop_fn*)pdf_drop_document_imp;
2311 doc->super.get_output_intent = (fz_document_output_intent_fn*)pdf_document_output_intent;
2312 doc->super.needs_password = (fz_document_needs_password_fn*)pdf_needs_password;
2313 doc->super.authenticate_password = (fz_document_authenticate_password_fn*)pdf_authenticate_password;
2314 doc->super.has_permission = (fz_document_has_permission_fn*)pdf_has_permission;
2315 doc->super.load_outline = (fz_document_load_outline_fn*)pdf_load_outline;
2316 doc->super.resolve_link = (fz_document_resolve_link_fn*)pdf_resolve_link;
2317 doc->super.count_pages = (fz_document_count_pages_fn*)pdf_count_pages;
2318 doc->super.load_page = (fz_document_load_page_fn*)pdf_load_page;
2319 doc->super.lookup_metadata = (fz_document_lookup_metadata_fn*)pdf_lookup_metadata;
2320
2321 pdf_lexbuf_init(ctx, &doc->lexbuf.base, PDF_LEXBUF_LARGE);
2322 doc->file = fz_keep_stream(ctx, file);
2323
2324 return doc;
2325}
2326
2327/*
2328 Opens a PDF document.
2329
2330 Same as pdf_open_document, but takes a stream instead of a
2331 filename to locate the PDF document to open. Increments the
2332 reference count of the stream. See fz_open_file,
2333 fz_open_file_w or fz_open_fd for opening a stream, and
2334 fz_drop_stream for closing an open stream.
2335*/
2336pdf_document *
2337pdf_open_document_with_stream(fz_context *ctx, fz_stream *file)
2338{
2339 pdf_document *doc = pdf_new_document(ctx, file);
2340 fz_try(ctx)
2341 {
2342 pdf_init_document(ctx, doc);
2343 }
2344 fz_catch(ctx)
2345 {
2346 int caught = fz_caught(ctx);
2347 fz_drop_document(ctx, &doc->super);
2348 fz_throw(ctx, caught, "Failed to open doc from stream");
2349 }
2350 return doc;
2351}
2352
2353/*
2354 Open a PDF document.
2355
2356 Open a PDF document by reading its cross reference table, so
	MuPDF can locate PDF objects inside the file. Upon a broken
2358 cross reference table or other parse errors MuPDF will restart
2359 parsing the file from the beginning to try to rebuild a
2360 (hopefully correct) cross reference table to allow further
2361 processing of the file.
2362
2363 The returned pdf_document should be used when calling most
2364 other PDF functions. Note that it wraps the context, so those
2365 functions implicitly get access to the global state in
2366 context.
2367
2368 filename: a path to a file as it would be given to open(2).
2369*/
2370pdf_document *
2371pdf_open_document(fz_context *ctx, const char *filename)
2372{
2373 fz_stream *file = NULL;
2374 pdf_document *doc = NULL;
2375
2376 fz_var(file);
2377 fz_var(doc);
2378
2379 fz_try(ctx)
2380 {
2381 file = fz_open_file(ctx, filename);
2382 doc = pdf_new_document(ctx, file);
2383 pdf_init_document(ctx, doc);
2384 }
2385 fz_always(ctx)
2386 {
2387 fz_drop_stream(ctx, file);
2388 }
2389 fz_catch(ctx)
2390 {
2391 fz_drop_document(ctx, &doc->super);
2392 fz_rethrow(ctx);
2393 }
2394 return doc;
2395}
2396
2397static void
2398pdf_load_hints(fz_context *ctx, pdf_document *doc, int objnum)
2399{
2400 fz_stream *stream = NULL;
2401 pdf_obj *dict;
2402
2403 fz_var(stream);
2404 fz_var(dict);
2405
2406 fz_try(ctx)
2407 {
2408 int i, j, least_num_page_objs, page_obj_num_bits;
2409 int least_page_len, page_len_num_bits, shared_hint_offset;
2410 /* int least_page_offset, page_offset_num_bits; */
2411 /* int least_content_stream_len, content_stream_len_num_bits; */
2412 int num_shared_obj_num_bits, shared_obj_num_bits;
2413 /* int numerator_bits, denominator_bits; */
2414 int shared;
2415 int shared_obj_num, shared_obj_offset, shared_obj_count_page1;
2416 int shared_obj_count_total;
2417 int least_shared_group_len, shared_group_len_num_bits;
2418 int max_object_num = pdf_xref_len(ctx, doc);
2419
2420 stream = pdf_open_stream_number(ctx, doc, objnum);
2421 dict = pdf_get_xref_entry(ctx, doc, objnum)->obj;
2422 if (dict == NULL || !pdf_is_dict(ctx, dict))
2423 fz_throw(ctx, FZ_ERROR_GENERIC, "malformed hint object");
2424
2425 shared_hint_offset = pdf_dict_get_int(ctx, dict, PDF_NAME(S));
2426
2427 /* Malloc the structures (use realloc to cope with the fact we
2428 * may try this several times before enough data is loaded) */
2429 doc->hint_page = fz_realloc_array(ctx, doc->hint_page, doc->linear_page_count+1, pdf_hint_page);
2430 memset(doc->hint_page, 0, sizeof(*doc->hint_page) * (doc->linear_page_count+1));
2431 doc->hint_obj_offsets = fz_realloc_array(ctx, doc->hint_obj_offsets, max_object_num, int64_t);
2432 memset(doc->hint_obj_offsets, 0, sizeof(*doc->hint_obj_offsets) * max_object_num);
2433 doc->hint_obj_offsets_max = max_object_num;
2434
2435 /* Read the page object hints table: Header first */
2436 least_num_page_objs = fz_read_bits(ctx, stream, 32);
2437 /* The following is sometimes a lie, but we read this version,
2438 * as other table values are built from it. In
2439 * pdf_reference17.pdf, this points to 2 objects before the
2440 * first pages page object. */
2441 doc->hint_page[0].offset = fz_read_bits(ctx, stream, 32);
2442 if (doc->hint_page[0].offset > doc->hint_object_offset)
2443 doc->hint_page[0].offset += doc->hint_object_length;
2444 page_obj_num_bits = fz_read_bits(ctx, stream, 16);
2445 least_page_len = fz_read_bits(ctx, stream, 32);
2446 page_len_num_bits = fz_read_bits(ctx, stream, 16);
2447 /* least_page_offset = */ (void) fz_read_bits(ctx, stream, 32);
2448 /* page_offset_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
2449 /* least_content_stream_len = */ (void) fz_read_bits(ctx, stream, 32);
2450 /* content_stream_len_num_bits = */ (void) fz_read_bits(ctx, stream, 16);
2451 num_shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2452 shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2453 /* numerator_bits = */ (void) fz_read_bits(ctx, stream, 16);
2454 /* denominator_bits = */ (void) fz_read_bits(ctx, stream, 16);
2455
2456 /* Item 1: Page object numbers */
2457 doc->hint_page[0].number = doc->linear_page1_obj_num;
2458 /* We don't care about the number of objects in the first page */
2459 (void)fz_read_bits(ctx, stream, page_obj_num_bits);
2460 j = 1;
2461 for (i = 1; i < doc->linear_page_count; i++)
2462 {
2463 int delta_page_objs = fz_read_bits(ctx, stream, page_obj_num_bits);
2464
2465 doc->hint_page[i].number = j;
2466 j += least_num_page_objs + delta_page_objs;
2467 }
2468 doc->hint_page[i].number = j; /* Not a real page object */
2469 fz_sync_bits(ctx, stream);
2470 /* Item 2: Page lengths */
2471 j = doc->hint_page[0].offset;
2472 for (i = 0; i < doc->linear_page_count; i++)
2473 {
2474 int delta_page_len = fz_read_bits(ctx, stream, page_len_num_bits);
2475 int old = j;
2476
2477 doc->hint_page[i].offset = j;
2478 j += least_page_len + delta_page_len;
2479 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2480 j += doc->hint_object_length;
2481 }
2482 doc->hint_page[i].offset = j;
2483 fz_sync_bits(ctx, stream);
2484 /* Item 3: Shared references */
2485 shared = 0;
2486 for (i = 0; i < doc->linear_page_count; i++)
2487 {
2488 int num_shared_objs = fz_read_bits(ctx, stream, num_shared_obj_num_bits);
2489 doc->hint_page[i].index = shared;
2490 shared += num_shared_objs;
2491 }
2492 doc->hint_page[i].index = shared;
2493 doc->hint_shared_ref = fz_realloc_array(ctx, doc->hint_shared_ref, shared, int);
2494 memset(doc->hint_shared_ref, 0, sizeof(*doc->hint_shared_ref) * shared);
2495 fz_sync_bits(ctx, stream);
2496 /* Item 4: Shared references */
2497 for (i = 0; i < shared; i++)
2498 {
2499 int ref = fz_read_bits(ctx, stream, shared_obj_num_bits);
2500 doc->hint_shared_ref[i] = ref;
2501 }
2502 /* Skip items 5,6,7 as we don't use them */
2503
2504 fz_seek(ctx, stream, shared_hint_offset, SEEK_SET);
2505
2506 /* Read the shared object hints table: Header first */
2507 shared_obj_num = fz_read_bits(ctx, stream, 32);
2508 shared_obj_offset = fz_read_bits(ctx, stream, 32);
2509 if (shared_obj_offset > doc->hint_object_offset)
2510 shared_obj_offset += doc->hint_object_length;
2511 shared_obj_count_page1 = fz_read_bits(ctx, stream, 32);
2512 shared_obj_count_total = fz_read_bits(ctx, stream, 32);
2513 shared_obj_num_bits = fz_read_bits(ctx, stream, 16);
2514 least_shared_group_len = fz_read_bits(ctx, stream, 32);
2515 shared_group_len_num_bits = fz_read_bits(ctx, stream, 16);
2516
2517 /* Sanity check the references in Item 4 above to ensure we
2518 * don't access out of range with malicious files. */
2519 for (i = 0; i < shared; i++)
2520 {
2521 if (doc->hint_shared_ref[i] >= shared_obj_count_total)
2522 {
2523 fz_throw(ctx, FZ_ERROR_GENERIC, "malformed hint stream (shared refs)");
2524 }
2525 }
2526
2527 doc->hint_shared = fz_realloc_array(ctx, doc->hint_shared, shared_obj_count_total+1, pdf_hint_shared);
2528 memset(doc->hint_shared, 0, sizeof(*doc->hint_shared) * (shared_obj_count_total+1));
2529
2530 /* Item 1: Shared references */
2531 j = doc->hint_page[0].offset;
2532 for (i = 0; i < shared_obj_count_page1; i++)
2533 {
2534 int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
2535 int old = j;
2536 doc->hint_shared[i].offset = j;
2537 j += off + least_shared_group_len;
2538 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2539 j += doc->hint_object_length;
2540 }
2541 /* FIXME: We would have problems recreating the length of the
2542 * last page 1 shared reference group. But we'll never need
2543 * to, so ignore it. */
2544 j = shared_obj_offset;
2545 for (; i < shared_obj_count_total; i++)
2546 {
2547 int off = fz_read_bits(ctx, stream, shared_group_len_num_bits);
2548 int old = j;
2549 doc->hint_shared[i].offset = j;
2550 j += off + least_shared_group_len;
2551 if (old <= doc->hint_object_offset && j > doc->hint_object_offset)
2552 j += doc->hint_object_length;
2553 }
2554 doc->hint_shared[i].offset = j;
2555 fz_sync_bits(ctx, stream);
2556 /* Item 2: Signature flags: read these just so we can skip */
2557 for (i = 0; i < shared_obj_count_total; i++)
2558 {
2559 doc->hint_shared[i].number = fz_read_bits(ctx, stream, 1);
2560 }
2561 fz_sync_bits(ctx, stream);
2562 /* Item 3: Signatures: just skip */
2563 for (i = 0; i < shared_obj_count_total; i++)
2564 {
2565 if (doc->hint_shared[i].number)
2566 {
2567 (void) fz_read_bits(ctx, stream, 128);
2568 }
2569 }
2570 fz_sync_bits(ctx, stream);
2571 /* Item 4: Shared object object numbers */
2572 j = doc->linear_page1_obj_num; /* FIXME: This is a lie! */
2573 for (i = 0; i < shared_obj_count_page1; i++)
2574 {
2575 doc->hint_shared[i].number = j;
2576 j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
2577 }
2578 j = shared_obj_num;
2579 for (; i < shared_obj_count_total; i++)
2580 {
2581 doc->hint_shared[i].number = j;
2582 j += fz_read_bits(ctx, stream, shared_obj_num_bits) + 1;
2583 }
2584 doc->hint_shared[i].number = j;
2585
2586 /* Now, actually use the data we have gathered. */
2587 for (i = 0 /*shared_obj_count_page1*/; i < shared_obj_count_total; i++)
2588 {
2589 doc->hint_obj_offsets[doc->hint_shared[i].number] = doc->hint_shared[i].offset;
2590 }
2591 for (i = 0; i < doc->linear_page_count; i++)
2592 {
2593 doc->hint_obj_offsets[doc->hint_page[i].number] = doc->hint_page[i].offset;
2594 }
2595 }
2596 fz_always(ctx)
2597 {
2598 fz_drop_stream(ctx, stream);
2599 }
2600 fz_catch(ctx)
2601 {
2602 fz_rethrow_if(ctx, FZ_ERROR_TRYLATER);
2603 /* Don't try to load hints again */
2604 doc->hints_loaded = 1;
2605 /* We won't use the linearized object anymore. */
2606 doc->file_reading_linearly = 0;
2607 /* Any other error becomes a TRYLATER */
2608 fz_throw(ctx, FZ_ERROR_TRYLATER, "malformed hints object");
2609 }
2610 doc->hints_loaded = 1;
2611}
2612
2613static void
2614pdf_load_hint_object(fz_context *ctx, pdf_document *doc)
2615{
2616 pdf_lexbuf *buf = &doc->lexbuf.base;
2617 int64_t curr_pos;
2618
2619 curr_pos = fz_tell(ctx, doc->file);
2620 fz_seek(ctx, doc->file, doc->hint_object_offset, SEEK_SET);
2621 fz_try(ctx)
2622 {
2623 while (1)
2624 {
2625 pdf_obj *page = NULL;
2626 int64_t tmpofs;
2627 int num, tok;
2628
2629 tok = pdf_lex(ctx, doc->file, buf);
2630 if (tok != PDF_TOK_INT)
2631 break;
2632 num = buf->i;
2633 tok = pdf_lex(ctx, doc->file, buf);
2634 if (tok != PDF_TOK_INT)
2635 break;
2636 /* Ignore gen = buf->i */
2637 tok = pdf_lex(ctx, doc->file, buf);
2638 if (tok != PDF_TOK_OBJ)
2639 break;
2640 (void)pdf_repair_obj(ctx, doc, buf, &tmpofs, NULL, NULL, NULL, &page, &tmpofs, NULL);
2641 pdf_load_hints(ctx, doc, num);
2642 }
2643 }
2644 fz_always(ctx)
2645 {
2646 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
2647 }
2648 fz_catch(ctx)
2649 {
2650 fz_rethrow(ctx);
2651 }
2652}
2653
2654pdf_obj *pdf_progressive_advance(fz_context *ctx, pdf_document *doc, int pagenum)
2655{
2656 pdf_lexbuf *buf = &doc->lexbuf.base;
2657 int curr_pos;
2658 pdf_obj *page = NULL;
2659
2660 pdf_load_hinted_page(ctx, doc, pagenum);
2661
2662 if (pagenum < 0 || pagenum >= doc->linear_page_count)
2663 fz_throw(ctx, FZ_ERROR_GENERIC, "page load out of range (%d of %d)", pagenum, doc->linear_page_count);
2664
2665 if (doc->linear_pos == doc->file_length)
2666 return doc->linear_page_refs[pagenum];
2667
2668 /* Only load hints once, and then only after we have got page 0 */
2669 if (pagenum > 0 && !doc->hints_loaded && doc->hint_object_offset > 0 && doc->linear_pos >= doc->hint_object_offset)
2670 {
2671 /* Found hint object */
2672 pdf_load_hint_object(ctx, doc);
2673 }
2674
2675 DEBUGMESS((ctx, "continuing to try to advance from %d", doc->linear_pos));
2676 curr_pos = fz_tell(ctx, doc->file);
2677
2678 fz_var(page);
2679
2680 fz_try(ctx)
2681 {
2682 int eof;
2683 do
2684 {
2685 int num;
2686 eof = pdf_obj_read(ctx, doc, &doc->linear_pos, &num, &page);
2687 pdf_drop_obj(ctx, page);
2688 page = NULL;
2689 }
2690 while (!eof);
2691
2692 {
2693 pdf_obj *catalog;
2694 pdf_obj *pages;
2695 doc->linear_pos = doc->file_length;
2696 pdf_load_xref(ctx, doc, buf);
2697 catalog = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
2698 pages = pdf_dict_get(ctx, catalog, PDF_NAME(Pages));
2699
2700 if (!pdf_is_dict(ctx, pages))
2701 fz_throw(ctx, FZ_ERROR_GENERIC, "missing page tree");
2702 break;
2703 }
2704 }
2705 fz_always(ctx)
2706 {
2707 fz_seek(ctx, doc->file, curr_pos, SEEK_SET);
2708 }
2709 fz_catch(ctx)
2710 {
2711 pdf_drop_obj(ctx, page);
2712 if (fz_caught(ctx) == FZ_ERROR_TRYLATER)
2713 {
2714 if (doc->linear_page_refs[pagenum] == NULL)
2715 {
2716 /* Still not got a page */
2717 fz_rethrow(ctx);
2718 }
2719 }
2720 else
2721 fz_rethrow(ctx);
2722 }
2723
2724 return doc->linear_page_refs[pagenum];
2725}
2726
2727/*
2728 Down-cast generic fitz objects into pdf specific variants.
2729 Returns NULL if the objects are not from a PDF document.
2730*/
2731pdf_document *pdf_document_from_fz_document(fz_context *ctx, fz_document *ptr)
2732{
2733 return (pdf_document *)((ptr && ptr->count_pages == (fz_document_count_pages_fn*)pdf_count_pages) ? ptr : NULL);
2734}
2735
2736pdf_page *pdf_page_from_fz_page(fz_context *ctx, fz_page *ptr)
2737{
2738 return (pdf_page *)((ptr && ptr->bound_page == (fz_page_bound_page_fn*)pdf_bound_page) ? ptr : NULL);
2739}
2740
2741/*
2742 down-cast a fz_document to a pdf_document.
2743 Returns NULL if underlying document is not PDF
2744*/
2745pdf_document *pdf_specifics(fz_context *ctx, fz_document *doc)
2746{
2747 return pdf_document_from_fz_document(ctx, doc);
2748}
2749
2750pdf_obj *
2751pdf_add_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
2752{
2753 pdf_document *orig_doc;
2754 int num;
2755
2756 orig_doc = pdf_get_bound_document(ctx, obj);
2757 if (orig_doc && orig_doc != doc)
2758 fz_throw(ctx, FZ_ERROR_GENERIC, "tried to add an object belonging to a different document");
2759 if (pdf_is_indirect(ctx, obj))
2760 return pdf_keep_obj(ctx, obj);
2761 num = pdf_create_object(ctx, doc);
2762 pdf_update_object(ctx, doc, num, obj);
2763 return pdf_new_indirect(ctx, doc, num, 0);
2764}
2765
2766pdf_obj *
2767pdf_add_object_drop(fz_context *ctx, pdf_document *doc, pdf_obj *obj)
2768{
2769 pdf_obj *ind = NULL;
2770 fz_try(ctx)
2771 ind = pdf_add_object(ctx, doc, obj);
2772 fz_always(ctx)
2773 pdf_drop_obj(ctx, obj);
2774 fz_catch(ctx)
2775 fz_rethrow(ctx);
2776 return ind;
2777}
2778
2779pdf_obj *
2780pdf_add_new_dict(fz_context *ctx, pdf_document *doc, int initial)
2781{
2782 return pdf_add_object_drop(ctx, doc, pdf_new_dict(ctx, doc, initial));
2783}
2784
2785pdf_obj *
2786pdf_add_new_array(fz_context *ctx, pdf_document *doc, int initial)
2787{
2788 return pdf_add_object_drop(ctx, doc, pdf_new_array(ctx, doc, initial));
2789}
2790
2791pdf_obj *
2792pdf_add_stream(fz_context *ctx, pdf_document *doc, fz_buffer *buf, pdf_obj *obj, int compressed)
2793{
2794 pdf_obj *ind;
2795 if (!obj)
2796 ind = pdf_add_new_dict(ctx, doc, 4);
2797 else
2798 ind = pdf_add_object(ctx, doc, obj);
2799 fz_try(ctx)
2800 pdf_update_stream(ctx, doc, ind, buf, compressed);
2801 fz_catch(ctx)
2802 {
2803 pdf_drop_obj(ctx, ind);
2804 fz_rethrow(ctx);
2805 }
2806 return ind;
2807}
2808
2809pdf_document *pdf_create_document(fz_context *ctx)
2810{
2811 pdf_document *doc;
2812 pdf_obj *root;
2813 pdf_obj *pages;
2814 pdf_obj *trailer = NULL;
2815
2816 fz_var(trailer);
2817
2818 doc = pdf_new_document(ctx, NULL);
2819 fz_try(ctx)
2820 {
2821 doc->version = 14;
2822 doc->file_size = 0;
2823 doc->startxref = 0;
2824 doc->num_xref_sections = 0;
2825 doc->num_incremental_sections = 0;
2826 doc->xref_base = 0;
2827 doc->disallow_new_increments = 0;
2828 pdf_get_populating_xref_entry(ctx, doc, 0);
2829
2830 trailer = pdf_new_dict(ctx, doc, 2);
2831 pdf_dict_put_int(ctx, trailer, PDF_NAME(Size), 3);
2832 pdf_dict_put_drop(ctx, trailer, PDF_NAME(Root), root = pdf_add_new_dict(ctx, doc, 2));
2833 pdf_dict_put(ctx, root, PDF_NAME(Type), PDF_NAME(Catalog));
2834 pdf_dict_put_drop(ctx, root, PDF_NAME(Pages), pages = pdf_add_new_dict(ctx, doc, 3));
2835 pdf_dict_put(ctx, pages, PDF_NAME(Type), PDF_NAME(Pages));
2836 pdf_dict_put_int(ctx, pages, PDF_NAME(Count), 0);
2837 pdf_dict_put_array(ctx, pages, PDF_NAME(Kids), 1);
2838
2839 /* Set the trailer of the final xref section. */
2840 doc->xref_sections[0].trailer = trailer;
2841 }
2842 fz_catch(ctx)
2843 {
2844 pdf_drop_obj(ctx, trailer);
2845 fz_drop_document(ctx, &doc->super);
2846 fz_rethrow(ctx);
2847 }
2848 return doc;
2849}
2850
/* NULL-terminated list of file extensions listed in pdf_document_handler. */
static const char *pdf_extensions[] = { "pdf", "pclm", "ai", NULL };
2858
/* NULL-terminated list of MIME types listed in pdf_document_handler. */
static const char *pdf_mimetypes[] = { "application/pdf", "application/PCLm", NULL };
2865
/*
	Document handler describing PDF support: the two ways of opening a
	PDF document plus the extensions and MIME types listed above.
	The initializer is positional, so the order of entries matters.
*/
fz_document_handler pdf_document_handler =
{
	/* First member left unset.
	 * NOTE(review): presumably an optional content recogniser - confirm
	 * against the fz_document_handler declaration. */
	NULL,
	/* Open a document given a filename. */
	(fz_document_open_fn*)pdf_open_document,
	/* Open a document given an already opened stream. */
	(fz_document_open_with_stream_fn*)pdf_open_document_with_stream,
	/* NULL-terminated extension list. */
	pdf_extensions,
	/* NULL-terminated MIME type list. */
	pdf_mimetypes
};
2874
2875void pdf_mark_xref(fz_context *ctx, pdf_document *doc)
2876{
2877 int x, e;
2878
2879 for (x = 0; x < doc->num_xref_sections; x++)
2880 {
2881 pdf_xref *xref = &doc->xref_sections[x];
2882 pdf_xref_subsec *sub;
2883
2884 for (sub = xref->subsec; sub != NULL; sub = sub->next)
2885 {
2886 for (e = 0; e < sub->len; e++)
2887 {
2888 pdf_xref_entry *entry = &sub->table[e];
2889 if (entry->obj)
2890 {
2891 entry->marked = 1;
2892 }
2893 }
2894 }
2895 }
2896}
2897
2898void pdf_clear_xref(fz_context *ctx, pdf_document *doc)
2899{
2900 int x, e;
2901
2902 for (x = 0; x < doc->num_xref_sections; x++)
2903 {
2904 pdf_xref *xref = &doc->xref_sections[x];
2905 pdf_xref_subsec *sub;
2906
2907 for (sub = xref->subsec; sub != NULL; sub = sub->next)
2908 {
2909 for (e = 0; e < sub->len; e++)
2910 {
2911 pdf_xref_entry *entry = &sub->table[e];
2912 /* We cannot drop objects if the stream
2913 * buffer has been updated */
2914 if (entry->obj != NULL && entry->stm_buf == NULL)
2915 {
2916 if (pdf_obj_refs(ctx, entry->obj) == 1)
2917 {
2918 pdf_drop_obj(ctx, entry->obj);
2919 entry->obj = NULL;
2920 }
2921 }
2922 }
2923 }
2924 }
2925}
2926
2927void pdf_clear_xref_to_mark(fz_context *ctx, pdf_document *doc)
2928{
2929 int x, e;
2930
2931 for (x = 0; x < doc->num_xref_sections; x++)
2932 {
2933 pdf_xref *xref = &doc->xref_sections[x];
2934 pdf_xref_subsec *sub;
2935
2936 for (sub = xref->subsec; sub != NULL; sub = sub->next)
2937 {
2938 for (e = 0; e < sub->len; e++)
2939 {
2940 pdf_xref_entry *entry = &sub->table[e];
2941
2942 /* We cannot drop objects if the stream buffer has
2943 * been updated */
2944 if (entry->obj != NULL && entry->stm_buf == NULL)
2945 {
2946 if (!entry->marked && pdf_obj_refs(ctx, entry->obj) == 1)
2947 {
2948 pdf_drop_obj(ctx, entry->obj);
2949 entry->obj = NULL;
2950 }
2951 }
2952 }
2953 }
2954 }
2955}
2956