1#include "mupdf/fitz.h"
2#include "mupdf/pdf.h"
3
4#include <string.h>
5
/*
 * State shared by the clean operation: the document being processed and
 * the context it was opened with.
 */
typedef struct globals_s
{
	pdf_document *doc; /* document opened by pdf_clean_file */
	fz_context *ctx; /* context passed in to pdf_clean_file */
} globals;
11
12static int
13string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list)
14{
15 int n = pdf_array_len(ctx, names_list);
16 int i;
17 char *str = pdf_to_str_buf(ctx, p);
18
19 for (i = 0; i < n ; i += 2)
20 {
21 if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str))
22 return 1;
23 }
24 return 0;
25}
26
27/*
28 * Recreate page tree to only retain specified pages.
29 */
30
31static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page)
32{
33 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-1);
34
35 pdf_flatten_inheritable_page_items(ctx, pageref);
36
37 pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parent);
38
39 /* Store page object in new kids array */
40 pdf_array_push(ctx, kids, pageref);
41}
42
43static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount)
44{
45 int i;
46 int num = pdf_to_num(ctx, obj);
47
48 if (num == 0)
49 return 0;
50 for (i = 0; i < pagecount; i++)
51 {
52 if (page_object_nums[i] == num)
53 return 1;
54 }
55 return 0;
56}
57
58static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list)
59{
60 pdf_obj *p;
61
62 p = pdf_dict_get(ctx, o, PDF_NAME(A));
63 if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo)) &&
64 !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME(D)), names_list))
65 return 0;
66
67 p = pdf_dict_get(ctx, o, PDF_NAME(Dest));
68 if (p == NULL)
69 {}
70 else if (pdf_is_string(ctx, p))
71 return string_in_names_list(ctx, p, names_list);
72 else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count))
73 return 0;
74
75 return 1;
76}
77
78static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list);
79
80static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast)
81{
82 pdf_obj *prev = NULL;
83 pdf_obj *first = NULL;
84 pdf_obj *current;
85 int count = 0;
86
87 for (current = outlines; current != NULL; )
88 {
89 int nc;
90
91 /* Strip any children to start with. This takes care of
92 * First/Last/Count for us. */
93 nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list);
94
95 if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list))
96 {
97 if (nc == 0)
98 {
99 /* Outline with invalid dest and no children. Drop it by
100 * pulling the next one in here. */
101 pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next));
102 if (next == NULL)
103 {
104 /* There is no next one to pull in */
105 if (prev != NULL)
106 pdf_dict_del(ctx, prev, PDF_NAME(Next));
107 }
108 else if (prev != NULL)
109 {
110 pdf_dict_put(ctx, prev, PDF_NAME(Next), next);
111 pdf_dict_put(ctx, next, PDF_NAME(Prev), prev);
112 }
113 else
114 {
115 pdf_dict_del(ctx, next, PDF_NAME(Prev));
116 }
117 current = next;
118 }
119 else
120 {
121 /* Outline with invalid dest, but children. Just drop the dest. */
122 pdf_dict_del(ctx, current, PDF_NAME(Dest));
123 pdf_dict_del(ctx, current, PDF_NAME(A));
124 current = pdf_dict_get(ctx, current, PDF_NAME(Next));
125 }
126 }
127 else
128 {
129 /* Keep this one */
130 if (first == NULL)
131 first = current;
132 prev = current;
133 current = pdf_dict_get(ctx, current, PDF_NAME(Next));
134 count++;
135 }
136 }
137
138 *pfirst = first;
139 *plast = prev;
140
141 return count;
142}
143
144static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list)
145{
146 int nc;
147 pdf_obj *first;
148 pdf_obj *last;
149
150 if (outlines == NULL)
151 return 0;
152
153 first = pdf_dict_get(ctx, outlines, PDF_NAME(First));
154 if (first == NULL)
155 nc = 0;
156 else
157 nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last);
158
159 if (nc == 0)
160 {
161 pdf_dict_del(ctx, outlines, PDF_NAME(First));
162 pdf_dict_del(ctx, outlines, PDF_NAME(Last));
163 pdf_dict_del(ctx, outlines, PDF_NAME(Count));
164 }
165 else
166 {
167 int old_count = pdf_dict_get_int(ctx, outlines, PDF_NAME(Count));
168 pdf_dict_put(ctx, outlines, PDF_NAME(First), first);
169 pdf_dict_put(ctx, outlines, PDF_NAME(Last), last);
170 pdf_dict_put_int(ctx, outlines, PDF_NAME(Count), old_count > 0 ? nc : -nc);
171 }
172
173 return nc;
174}
175
176static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv)
177{
178 pdf_obj *oldroot, *root, *pages, *kids, *countobj, *olddests;
179 pdf_document *doc = glo->doc;
180 int argidx = 0;
181 pdf_obj *names_list = NULL;
182 pdf_obj *outlines;
183 pdf_obj *ocproperties;
184 int pagecount;
185 int i;
186 int *page_object_nums;
187
188 /* Keep only pages/type and (reduced) dest entries to avoid
189 * references to unretained pages */
190 oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
191 pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages));
192 olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests));
193 outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines));
194 ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties));
195
196 root = pdf_new_dict(ctx, doc, 3);
197 pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type)));
198 pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)));
199 if (outlines)
200 pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines);
201 if (ocproperties)
202 pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties);
203
204 pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root);
205
206 /* Create a new kids array with only the pages we want to keep */
207 kids = pdf_new_array(ctx, doc, 1);
208
209 /* Retain pages specified */
210 while (argc - argidx)
211 {
212 int page, spage, epage;
213 const char *pagelist = argv[argidx];
214
215 pagecount = pdf_count_pages(ctx, doc);
216
217 while ((pagelist = fz_parse_page_range(ctx, pagelist, &spage, &epage, pagecount)))
218 {
219 if (spage < epage)
220 for (page = spage; page <= epage; ++page)
221 retainpage(ctx, doc, pages, kids, page);
222 else
223 for (page = spage; page >= epage; --page)
224 retainpage(ctx, doc, pages, kids, page);
225 }
226
227 argidx++;
228 }
229
230 /* Update page count and kids array */
231 countobj = pdf_new_int(ctx, pdf_array_len(ctx, kids));
232 pdf_dict_put_drop(ctx, pages, PDF_NAME(Count), countobj);
233 pdf_dict_put_drop(ctx, pages, PDF_NAME(Kids), kids);
234
235 pagecount = pdf_count_pages(ctx, doc);
236 page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums));
237 for (i = 0; i < pagecount; i++)
238 {
239 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
240 page_object_nums[i] = pdf_to_num(ctx, pageref);
241 }
242
243 /* If we had an old Dests tree (now reformed as an olddests
244 * dictionary), keep any entries in there that point to
245 * valid pages. This may mean we keep more than we need, but
246 * it's safe at least. */
247 if (olddests)
248 {
249 pdf_obj *names = pdf_new_dict(ctx, doc, 1);
250 pdf_obj *dests = pdf_new_dict(ctx, doc, 1);
251 int len = pdf_dict_len(ctx, olddests);
252
253 names_list = pdf_new_array(ctx, doc, 32);
254
255 for (i = 0; i < len; i++)
256 {
257 pdf_obj *key = pdf_dict_get_key(ctx, olddests, i);
258 pdf_obj *val = pdf_dict_get_val(ctx, olddests, i);
259 pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D));
260
261 dest = pdf_array_get(ctx, dest ? dest : val, 0);
262 if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount))
263 {
264 pdf_obj *key_str = pdf_new_string(ctx, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key)));
265 pdf_array_push_drop(ctx, names_list, key_str);
266 pdf_array_push(ctx, names_list, val);
267 }
268 }
269
270 pdf_dict_put(ctx, dests, PDF_NAME(Names), names_list);
271 pdf_dict_put(ctx, names, PDF_NAME(Dests), dests);
272 pdf_dict_put(ctx, root, PDF_NAME(Names), names);
273
274 pdf_drop_obj(ctx, names);
275 pdf_drop_obj(ctx, dests);
276 pdf_drop_obj(ctx, olddests);
277 }
278
279 /* Edit each pages /Annot list to remove any links that point to nowhere. */
280 for (i = 0; i < pagecount; i++)
281 {
282 pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
283
284 pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots));
285
286 int len = pdf_array_len(ctx, annots);
287 int j;
288
289 for (j = 0; j < len; j++)
290 {
291 pdf_obj *o = pdf_array_get(ctx, annots, j);
292
293 if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link)))
294 continue;
295
296 if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list))
297 {
298 /* Remove this annotation */
299 pdf_array_delete(ctx, annots, j);
300 len--;
301 j--;
302 }
303 }
304 }
305
306 if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == 0)
307 {
308 pdf_dict_del(ctx, root, PDF_NAME(Outlines));
309 }
310
311 fz_free(ctx, page_object_nums);
312 pdf_drop_obj(ctx, names_list);
313 pdf_drop_obj(ctx, root);
314}
315
316/* Read infile, and write selected pages to outfile with the given options. */
317void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_write_options *opts, char *argv[], int argc)
318{
319 globals glo = { 0 };
320
321 glo.ctx = ctx;
322
323 fz_try(ctx)
324 {
325 glo.doc = pdf_open_document(ctx, infile);
326 if (pdf_needs_password(ctx, glo.doc))
327 if (!pdf_authenticate_password(ctx, glo.doc, password))
328 fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
329
330 /* Only retain the specified subset of the pages */
331 if (argc)
332 retainpages(ctx, &glo, argc, argv);
333
334 pdf_save_document(ctx, glo.doc, outfile, opts);
335 }
336 fz_always(ctx)
337 {
338 pdf_drop_document(ctx, glo.doc);
339 }
340 fz_catch(ctx)
341 {
342 fz_rethrow(ctx);
343 }
344}
345