1 | #include "mupdf/fitz.h" |
2 | #include "mupdf/pdf.h" |
3 | |
4 | #include <string.h> |
5 | |
6 | typedef struct globals_s |
7 | { |
8 | pdf_document *doc; |
9 | fz_context *ctx; |
10 | } globals; |
11 | |
12 | static int |
13 | string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) |
14 | { |
15 | int n = pdf_array_len(ctx, names_list); |
16 | int i; |
17 | char *str = pdf_to_str_buf(ctx, p); |
18 | |
19 | for (i = 0; i < n ; i += 2) |
20 | { |
21 | if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str)) |
22 | return 1; |
23 | } |
24 | return 0; |
25 | } |
26 | |
/*
 * Rebuild the page tree so that it retains only the specified pages.
 */
30 | |
31 | static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page) |
32 | { |
33 | pdf_obj * = pdf_lookup_page_obj(ctx, doc, page-1); |
34 | |
35 | pdf_flatten_inheritable_page_items(ctx, pageref); |
36 | |
37 | pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parent); |
38 | |
39 | /* Store page object in new kids array */ |
40 | pdf_array_push(ctx, kids, pageref); |
41 | } |
42 | |
43 | static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount) |
44 | { |
45 | int i; |
46 | int num = pdf_to_num(ctx, obj); |
47 | |
48 | if (num == 0) |
49 | return 0; |
50 | for (i = 0; i < pagecount; i++) |
51 | { |
52 | if (page_object_nums[i] == num) |
53 | return 1; |
54 | } |
55 | return 0; |
56 | } |
57 | |
58 | static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list) |
59 | { |
60 | pdf_obj *p; |
61 | |
62 | p = pdf_dict_get(ctx, o, PDF_NAME(A)); |
63 | if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo)) && |
64 | !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME(D)), names_list)) |
65 | return 0; |
66 | |
67 | p = pdf_dict_get(ctx, o, PDF_NAME(Dest)); |
68 | if (p == NULL) |
69 | {} |
70 | else if (pdf_is_string(ctx, p)) |
71 | return string_in_names_list(ctx, p, names_list); |
72 | else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count)) |
73 | return 0; |
74 | |
75 | return 1; |
76 | } |
77 | |
78 | static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list); |
79 | |
80 | static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast) |
81 | { |
82 | pdf_obj *prev = NULL; |
83 | pdf_obj *first = NULL; |
84 | pdf_obj *current; |
85 | int count = 0; |
86 | |
87 | for (current = outlines; current != NULL; ) |
88 | { |
89 | int nc; |
90 | |
91 | /* Strip any children to start with. This takes care of |
92 | * First/Last/Count for us. */ |
93 | nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list); |
94 | |
95 | if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list)) |
96 | { |
97 | if (nc == 0) |
98 | { |
99 | /* Outline with invalid dest and no children. Drop it by |
100 | * pulling the next one in here. */ |
101 | pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next)); |
102 | if (next == NULL) |
103 | { |
104 | /* There is no next one to pull in */ |
105 | if (prev != NULL) |
106 | pdf_dict_del(ctx, prev, PDF_NAME(Next)); |
107 | } |
108 | else if (prev != NULL) |
109 | { |
110 | pdf_dict_put(ctx, prev, PDF_NAME(Next), next); |
111 | pdf_dict_put(ctx, next, PDF_NAME(Prev), prev); |
112 | } |
113 | else |
114 | { |
115 | pdf_dict_del(ctx, next, PDF_NAME(Prev)); |
116 | } |
117 | current = next; |
118 | } |
119 | else |
120 | { |
121 | /* Outline with invalid dest, but children. Just drop the dest. */ |
122 | pdf_dict_del(ctx, current, PDF_NAME(Dest)); |
123 | pdf_dict_del(ctx, current, PDF_NAME(A)); |
124 | current = pdf_dict_get(ctx, current, PDF_NAME(Next)); |
125 | } |
126 | } |
127 | else |
128 | { |
129 | /* Keep this one */ |
130 | if (first == NULL) |
131 | first = current; |
132 | prev = current; |
133 | current = pdf_dict_get(ctx, current, PDF_NAME(Next)); |
134 | count++; |
135 | } |
136 | } |
137 | |
138 | *pfirst = first; |
139 | *plast = prev; |
140 | |
141 | return count; |
142 | } |
143 | |
144 | static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list) |
145 | { |
146 | int nc; |
147 | pdf_obj *first; |
148 | pdf_obj *last; |
149 | |
150 | if (outlines == NULL) |
151 | return 0; |
152 | |
153 | first = pdf_dict_get(ctx, outlines, PDF_NAME(First)); |
154 | if (first == NULL) |
155 | nc = 0; |
156 | else |
157 | nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last); |
158 | |
159 | if (nc == 0) |
160 | { |
161 | pdf_dict_del(ctx, outlines, PDF_NAME(First)); |
162 | pdf_dict_del(ctx, outlines, PDF_NAME(Last)); |
163 | pdf_dict_del(ctx, outlines, PDF_NAME(Count)); |
164 | } |
165 | else |
166 | { |
167 | int old_count = pdf_dict_get_int(ctx, outlines, PDF_NAME(Count)); |
168 | pdf_dict_put(ctx, outlines, PDF_NAME(First), first); |
169 | pdf_dict_put(ctx, outlines, PDF_NAME(Last), last); |
170 | pdf_dict_put_int(ctx, outlines, PDF_NAME(Count), old_count > 0 ? nc : -nc); |
171 | } |
172 | |
173 | return nc; |
174 | } |
175 | |
176 | static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) |
177 | { |
178 | pdf_obj *oldroot, *root, *pages, *kids, *countobj, *olddests; |
179 | pdf_document *doc = glo->doc; |
180 | int argidx = 0; |
181 | pdf_obj *names_list = NULL; |
182 | pdf_obj *outlines; |
183 | pdf_obj *ocproperties; |
184 | int pagecount; |
185 | int i; |
186 | int *page_object_nums; |
187 | |
188 | /* Keep only pages/type and (reduced) dest entries to avoid |
189 | * references to unretained pages */ |
190 | oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)); |
191 | pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)); |
192 | olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests)); |
193 | outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines)); |
194 | ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties)); |
195 | |
196 | root = pdf_new_dict(ctx, doc, 3); |
197 | pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type))); |
198 | pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages))); |
199 | if (outlines) |
200 | pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines); |
201 | if (ocproperties) |
202 | pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties); |
203 | |
204 | pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); |
205 | |
206 | /* Create a new kids array with only the pages we want to keep */ |
207 | kids = pdf_new_array(ctx, doc, 1); |
208 | |
209 | /* Retain pages specified */ |
210 | while (argc - argidx) |
211 | { |
212 | int page, spage, epage; |
213 | const char *pagelist = argv[argidx]; |
214 | |
215 | pagecount = pdf_count_pages(ctx, doc); |
216 | |
217 | while ((pagelist = fz_parse_page_range(ctx, pagelist, &spage, &epage, pagecount))) |
218 | { |
219 | if (spage < epage) |
220 | for (page = spage; page <= epage; ++page) |
221 | retainpage(ctx, doc, pages, kids, page); |
222 | else |
223 | for (page = spage; page >= epage; --page) |
224 | retainpage(ctx, doc, pages, kids, page); |
225 | } |
226 | |
227 | argidx++; |
228 | } |
229 | |
230 | /* Update page count and kids array */ |
231 | countobj = pdf_new_int(ctx, pdf_array_len(ctx, kids)); |
232 | pdf_dict_put_drop(ctx, pages, PDF_NAME(Count), countobj); |
233 | pdf_dict_put_drop(ctx, pages, PDF_NAME(Kids), kids); |
234 | |
235 | pagecount = pdf_count_pages(ctx, doc); |
236 | page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums)); |
237 | for (i = 0; i < pagecount; i++) |
238 | { |
239 | pdf_obj * = pdf_lookup_page_obj(ctx, doc, i); |
240 | page_object_nums[i] = pdf_to_num(ctx, pageref); |
241 | } |
242 | |
243 | /* If we had an old Dests tree (now reformed as an olddests |
244 | * dictionary), keep any entries in there that point to |
245 | * valid pages. This may mean we keep more than we need, but |
246 | * it's safe at least. */ |
247 | if (olddests) |
248 | { |
249 | pdf_obj *names = pdf_new_dict(ctx, doc, 1); |
250 | pdf_obj *dests = pdf_new_dict(ctx, doc, 1); |
251 | int len = pdf_dict_len(ctx, olddests); |
252 | |
253 | names_list = pdf_new_array(ctx, doc, 32); |
254 | |
255 | for (i = 0; i < len; i++) |
256 | { |
257 | pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); |
258 | pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); |
259 | pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D)); |
260 | |
261 | dest = pdf_array_get(ctx, dest ? dest : val, 0); |
262 | if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount)) |
263 | { |
264 | pdf_obj *key_str = pdf_new_string(ctx, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); |
265 | pdf_array_push_drop(ctx, names_list, key_str); |
266 | pdf_array_push(ctx, names_list, val); |
267 | } |
268 | } |
269 | |
270 | pdf_dict_put(ctx, dests, PDF_NAME(Names), names_list); |
271 | pdf_dict_put(ctx, names, PDF_NAME(Dests), dests); |
272 | pdf_dict_put(ctx, root, PDF_NAME(Names), names); |
273 | |
274 | pdf_drop_obj(ctx, names); |
275 | pdf_drop_obj(ctx, dests); |
276 | pdf_drop_obj(ctx, olddests); |
277 | } |
278 | |
279 | /* Edit each pages /Annot list to remove any links that point to nowhere. */ |
280 | for (i = 0; i < pagecount; i++) |
281 | { |
282 | pdf_obj * = pdf_lookup_page_obj(ctx, doc, i); |
283 | |
284 | pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots)); |
285 | |
286 | int len = pdf_array_len(ctx, annots); |
287 | int j; |
288 | |
289 | for (j = 0; j < len; j++) |
290 | { |
291 | pdf_obj *o = pdf_array_get(ctx, annots, j); |
292 | |
293 | if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link))) |
294 | continue; |
295 | |
296 | if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list)) |
297 | { |
298 | /* Remove this annotation */ |
299 | pdf_array_delete(ctx, annots, j); |
300 | len--; |
301 | j--; |
302 | } |
303 | } |
304 | } |
305 | |
306 | if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == 0) |
307 | { |
308 | pdf_dict_del(ctx, root, PDF_NAME(Outlines)); |
309 | } |
310 | |
311 | fz_free(ctx, page_object_nums); |
312 | pdf_drop_obj(ctx, names_list); |
313 | pdf_drop_obj(ctx, root); |
314 | } |
315 | |
316 | /* Read infile, and write selected pages to outfile with the given options. */ |
317 | void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_write_options *opts, char *argv[], int argc) |
318 | { |
319 | globals glo = { 0 }; |
320 | |
321 | glo.ctx = ctx; |
322 | |
323 | fz_try(ctx) |
324 | { |
325 | glo.doc = pdf_open_document(ctx, infile); |
326 | if (pdf_needs_password(ctx, glo.doc)) |
327 | if (!pdf_authenticate_password(ctx, glo.doc, password)) |
328 | fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s" , infile); |
329 | |
330 | /* Only retain the specified subset of the pages */ |
331 | if (argc) |
332 | retainpages(ctx, &glo, argc, argv); |
333 | |
334 | pdf_save_document(ctx, glo.doc, outfile, opts); |
335 | } |
336 | fz_always(ctx) |
337 | { |
338 | pdf_drop_document(ctx, glo.doc); |
339 | } |
340 | fz_catch(ctx) |
341 | { |
342 | fz_rethrow(ctx); |
343 | } |
344 | } |
345 | |