pdf-clean-file.c source code [MuPDF/source/pdf/pdf-clean-file.c]

1	#include "mupdf/fitz.h"
2	#include "mupdf/pdf.h"
3
4	#include <string.h>
5
6	typedef struct globals_s
7	{
8	pdf_document *doc;
9	fz_context *ctx;
10	} globals;
11
12	static int
13	string_in_names_list(fz_context ctx, pdf_obj p, pdf_obj *names_list)
14	{
15	int n = pdf_array_len(ctx, names_list);
16	int i;
17	char *str = pdf_to_str_buf(ctx, p);
18
19	for (i = `0`; i < n ; i += `2`)
20	{
21	if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str))
22	return `1`;
23	}
24	return `0`;
25	}
26
27	/*
28	* Recreate page tree to only retain specified pages.
29	*/
30
31	static void retainpage(fz_context ctx, pdf_document doc, pdf_obj parent, pdf_obj kids, int page)
32	{
33	pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, page-`1`);
34
35	pdf_flatten_inheritable_page_items(ctx, pageref);
36
37	pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parent);
38
39	/ Store page object in new kids array /
40	pdf_array_push(ctx, kids, pageref);
41	}
42
43	static int dest_is_valid_page(fz_context ctx, pdf_obj obj, int page_object_nums, int* pagecount)
44	{
45	int i;
46	int num = pdf_to_num(ctx, obj);
47
48	if (num == `0`)
49	return `0`;
50	for (i = `0`; i < pagecount; i++)
51	{
52	if (page_object_nums[i] == num)
53	return `1`;
54	}
55	return `0`;
56	}
57
58	static int dest_is_valid(fz_context ctx, pdf_obj o, int page_count, int page_object_nums, pdf_obj names_list)
59	{
60	pdf_obj *p;
61
62	p = pdf_dict_get(ctx, o, PDF_NAME(A));
63	if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo)) &&
64	!string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME(D)), names_list))
65	return `0`;
66
67	p = pdf_dict_get(ctx, o, PDF_NAME(Dest));
68	if (p == NULL)
69	{}
70	else if (pdf_is_string(ctx, p))
71	return string_in_names_list(ctx, p, names_list);
72	else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, `0`), page_object_nums, page_count))
73	return `0`;
74
75	return `1`;
76	}
77
78	static int strip_outlines(fz_context ctx, pdf_document doc, pdf_obj outlines, int* page_count, int page_object_nums, pdf_obj names_list);
79
80	static int strip_outline(fz_context ctx, pdf_document doc, pdf_obj outlines, int* page_count, int page_object_nums, pdf_obj names_list, pdf_obj pfirst, pdf_obj plast)
81	{
82	pdf_obj *prev = NULL;
83	pdf_obj *first = NULL;
84	pdf_obj *current;
85	int count = `0`;
86
87	for (current = outlines; current != NULL; )
88	{
89	int nc;
90
91	/ Strip any children to start with. This takes care of*
92	* First/Last/Count for us. */
93	nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list);
94
95	if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list))
96	{
97	if (nc == `0`)
98	{
99	/ Outline with invalid dest and no children. Drop it by*
100	* pulling the next one in here. */
101	pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next));
102	if (next == NULL)
103	{
104	/ There is no next one to pull in /
105	if (prev != NULL)
106	pdf_dict_del(ctx, prev, PDF_NAME(Next));
107	}
108	else if (prev != NULL)
109	{
110	pdf_dict_put(ctx, prev, PDF_NAME(Next), next);
111	pdf_dict_put(ctx, next, PDF_NAME(Prev), prev);
112	}
113	else
114	{
115	pdf_dict_del(ctx, next, PDF_NAME(Prev));
116	}
117	current = next;
118	}
119	else
120	{
121	/ Outline with invalid dest, but children. Just drop the dest. /
122	pdf_dict_del(ctx, current, PDF_NAME(Dest));
123	pdf_dict_del(ctx, current, PDF_NAME(A));
124	current = pdf_dict_get(ctx, current, PDF_NAME(Next));
125	}
126	}
127	else
128	{
129	/ Keep this one /
130	if (first == NULL)
131	first = current;
132	prev = current;
133	current = pdf_dict_get(ctx, current, PDF_NAME(Next));
134	count++;
135	}
136	}
137
138	*pfirst = first;
139	*plast = prev;
140
141	return count;
142	}
143
144	static int strip_outlines(fz_context ctx, pdf_document doc, pdf_obj outlines, int* page_count, int page_object_nums, pdf_obj names_list)
145	{
146	int nc;
147	pdf_obj *first;
148	pdf_obj *last;
149
150	if (outlines == NULL)
151	return `0`;
152
153	first = pdf_dict_get(ctx, outlines, PDF_NAME(First));
154	if (first == NULL)
155	nc = `0`;
156	else
157	nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last);
158
159	if (nc == `0`)
160	{
161	pdf_dict_del(ctx, outlines, PDF_NAME(First));
162	pdf_dict_del(ctx, outlines, PDF_NAME(Last));
163	pdf_dict_del(ctx, outlines, PDF_NAME(Count));
164	}
165	else
166	{
167	int old_count = pdf_dict_get_int(ctx, outlines, PDF_NAME(Count));
168	pdf_dict_put(ctx, outlines, PDF_NAME(First), first);
169	pdf_dict_put(ctx, outlines, PDF_NAME(Last), last);
170	pdf_dict_put_int(ctx, outlines, PDF_NAME(Count), old_count > `0` ? nc : -nc);
171	}
172
173	return nc;
174	}
175
176	static void retainpages(fz_context ctx, globals glo, int argc, char **argv)
177	{
178	pdf_obj oldroot, root, pages, kids, countobj, olddests;
179	pdf_document *doc = glo->doc;
180	int argidx = `0`;
181	pdf_obj *names_list = NULL;
182	pdf_obj *outlines;
183	pdf_obj *ocproperties;
184	int pagecount;
185	int i;
186	int *page_object_nums;
187
188	/ Keep only pages/type and (reduced) dest entries to avoid*
189	* references to unretained pages */
190	oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root));
191	pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages));
192	olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests));
193	outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines));
194	ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties));
195
196	root = pdf_new_dict(ctx, doc, `3`);
197	pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type)));
198	pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)));
199	if (outlines)
200	pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines);
201	if (ocproperties)
202	pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties);
203
204	pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root);
205
206	/ Create a new kids array with only the pages we want to keep /
207	kids = pdf_new_array(ctx, doc, `1`);
208
209	/ Retain pages specified /
210	while (argc - argidx)
211	{
212	int page, spage, epage;
213	const char *pagelist = argv[argidx];
214
215	pagecount = pdf_count_pages(ctx, doc);
216
217	while ((pagelist = fz_parse_page_range(ctx, pagelist, &spage, &epage, pagecount)))
218	{
219	if (spage < epage)
220	for (page = spage; page <= epage; ++page)
221	retainpage(ctx, doc, pages, kids, page);
222	else
223	for (page = spage; page >= epage; --page)
224	retainpage(ctx, doc, pages, kids, page);
225	}
226
227	argidx++;
228	}
229
230	/ Update page count and kids array /
231	countobj = pdf_new_int(ctx, pdf_array_len(ctx, kids));
232	pdf_dict_put_drop(ctx, pages, PDF_NAME(Count), countobj);
233	pdf_dict_put_drop(ctx, pages, PDF_NAME(Kids), kids);
234
235	pagecount = pdf_count_pages(ctx, doc);
236	page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums));
237	for (i = `0`; i < pagecount; i++)
238	{
239	pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
240	page_object_nums[i] = pdf_to_num(ctx, pageref);
241	}
242
243	/ If we had an old Dests tree (now reformed as an olddests*
244	* dictionary), keep any entries in there that point to
245	* valid pages. This may mean we keep more than we need, but
246	* it's safe at least. */
247	if (olddests)
248	{
249	pdf_obj *names = pdf_new_dict(ctx, doc, `1`);
250	pdf_obj *dests = pdf_new_dict(ctx, doc, `1`);
251	int len = pdf_dict_len(ctx, olddests);
252
253	names_list = pdf_new_array(ctx, doc, `32`);
254
255	for (i = `0`; i < len; i++)
256	{
257	pdf_obj *key = pdf_dict_get_key(ctx, olddests, i);
258	pdf_obj *val = pdf_dict_get_val(ctx, olddests, i);
259	pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D));
260
261	dest = pdf_array_get(ctx, dest ? dest : val, `0`);
262	if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount))
263	{
264	pdf_obj *key_str = pdf_new_string(ctx, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key)));
265	pdf_array_push_drop(ctx, names_list, key_str);
266	pdf_array_push(ctx, names_list, val);
267	}
268	}
269
270	pdf_dict_put(ctx, dests, PDF_NAME(Names), names_list);
271	pdf_dict_put(ctx, names, PDF_NAME(Dests), dests);
272	pdf_dict_put(ctx, root, PDF_NAME(Names), names);
273
274	pdf_drop_obj(ctx, names);
275	pdf_drop_obj(ctx, dests);
276	pdf_drop_obj(ctx, olddests);
277	}
278
279	/ Edit each pages /Annot list to remove any links that point to nowhere. /
280	for (i = `0`; i < pagecount; i++)
281	{
282	pdf_obj *pageref = pdf_lookup_page_obj(ctx, doc, i);
283
284	pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots));
285
286	int len = pdf_array_len(ctx, annots);
287	int j;
288
289	for (j = `0`; j < len; j++)
290	{
291	pdf_obj *o = pdf_array_get(ctx, annots, j);
292
293	if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link)))
294	continue;
295
296	if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list))
297	{
298	/ Remove this annotation /
299	pdf_array_delete(ctx, annots, j);
300	len--;
301	j--;
302	}
303	}
304	}
305
306	if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == `0`)
307	{
308	pdf_dict_del(ctx, root, PDF_NAME(Outlines));
309	}
310
311	fz_free(ctx, page_object_nums);
312	pdf_drop_obj(ctx, names_list);
313	pdf_drop_obj(ctx, root);
314	}
315
316	/ Read infile, and write selected pages to outfile with the given options. /
317	void pdf_clean_file(fz_context ctx, char* infile, char* outfile, char* password, pdf_write_options opts, char argv[], int* argc)
318	{
319	globals glo = { `0` };
320
321	glo.ctx = ctx;
322
323	fz_try(ctx)
324	{
325	glo.doc = pdf_open_document(ctx, infile);
326	if (pdf_needs_password(ctx, glo.doc))
327	if (!pdf_authenticate_password(ctx, glo.doc, password))
328	fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
329
330	/ Only retain the specified subset of the pages /
331	if (argc)
332	retainpages(ctx, &glo, argc, argv);
333
334	pdf_save_document(ctx, glo.doc, outfile, opts);
335	}
336	fz_always(ctx)
337	{
338	pdf_drop_document(ctx, glo.doc);
339	}
340	fz_catch(ctx)
341	{
342	fz_rethrow(ctx);
343	}
344	}
345

Browse the source code of MuPDF/source/pdf/pdf-clean-file.c