pdf-clean.c source code [MuPDF/source/pdf/pdf-clean.c]

1	#include "mupdf/fitz.h"
2	#include "mupdf/pdf.h"
3
4	static void
5	pdf_clean_stream_object(fz_context ctx, pdf_document doc, pdf_obj obj, pdf_obj orig_res, fz_cookie cookie, int* own_res,
6	pdf_text_filter_fn text_filter, pdf_after_text_object_fn after_text, void *arg,
7	int sanitize, int ascii)
8	{
9	pdf_processor *proc_buffer = NULL;
10	pdf_processor *proc_filter = NULL;
11	pdf_obj *res = NULL;
12	pdf_obj *ref;
13	fz_buffer *buffer;
14
15	if (!obj)
16	return;
17
18	fz_var(res);
19	fz_var(proc_buffer);
20	fz_var(proc_filter);
21
22	buffer = fz_new_buffer(ctx, `1024`);
23
24	fz_try(ctx)
25	{
26	pdf_obj *sp = pdf_dict_get(ctx, obj, PDF_NAME(StructParents));
27	int structparents = -`1`;
28	if (pdf_is_number(ctx, sp))
29	structparents = pdf_to_int(ctx, sp);
30
31	if (own_res)
32	{
33	pdf_obj *r = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
34	if (r)
35	orig_res = r;
36	}
37
38	res = pdf_new_dict(ctx, doc, `1`);
39
40	proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
41	proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, orig_res, res, text_filter, after_text, arg);
42
43	pdf_process_contents(ctx, proc_filter, doc, orig_res, obj, cookie);
44	pdf_close_processor(ctx, proc_filter);
45	pdf_close_processor(ctx, proc_buffer);
46
47	pdf_update_stream(ctx, doc, obj, buffer, `0`);
48
49	if (own_res)
50	{
51	ref = pdf_add_object(ctx, doc, res);
52	pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref);
53	}
54	}
55	fz_always(ctx)
56	{
57	pdf_drop_processor(ctx, proc_filter);
58	pdf_drop_processor(ctx, proc_buffer);
59	fz_drop_buffer(ctx, buffer);
60	pdf_drop_obj(ctx, res);
61	}
62	fz_catch(ctx)
63	{
64	fz_rethrow(ctx);
65	}
66	}
67
68	static void
69	pdf_clean_type3(fz_context ctx, pdf_document doc, pdf_obj obj, pdf_obj orig_res, fz_cookie cookie, int* sanitize, int ascii)
70	{
71	pdf_processor *proc_buffer = NULL;
72	pdf_processor *proc_filter = NULL;
73	pdf_obj *res = NULL;
74	pdf_obj *ref;
75	pdf_obj *charprocs;
76	int i, l;
77
78	fz_var(res);
79	fz_var(proc_buffer);
80	fz_var(proc_filter);
81
82	fz_try(ctx)
83	{
84	res = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
85	if (res)
86	orig_res = res;
87	res = NULL;
88
89	res = pdf_new_dict(ctx, doc, `1`);
90
91	charprocs = pdf_dict_get(ctx, obj, PDF_NAME(CharProcs));
92	l = pdf_dict_len(ctx, charprocs);
93
94	for (i = `0`; i < l; i++)
95	{
96	pdf_obj *val = pdf_dict_get_val(ctx, charprocs, i);
97	fz_buffer *buffer = fz_new_buffer(ctx, `1024`);
98	fz_try(ctx)
99	{
100	proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
101	if (sanitize)
102	{
103	proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res);
104	pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
105	pdf_close_processor(ctx, proc_filter);
106	}
107	else
108	{
109	pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
110	}
111	pdf_close_processor(ctx, proc_buffer);
112
113	pdf_update_stream(ctx, doc, val, buffer, `0`);
114	}
115	fz_always(ctx)
116	{
117	pdf_drop_processor(ctx, proc_filter);
118	pdf_drop_processor(ctx, proc_buffer);
119	fz_drop_buffer(ctx, buffer);
120	}
121	fz_catch(ctx)
122	{
123	fz_rethrow(ctx);
124	}
125	}
126
127	/ ProcSet - no cleaning possible. Inherit this from the old dict. /
128	pdf_dict_put(ctx, res, PDF_NAME(ProcSet), pdf_dict_get(ctx, orig_res, PDF_NAME(ProcSet)));
129
130	ref = pdf_add_object(ctx, doc, res);
131	pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref);
132	}
133	fz_always(ctx)
134	{
135	pdf_drop_obj(ctx, res);
136	}
137	fz_catch(ctx)
138	{
139	fz_rethrow(ctx);
140	}
141	}
142
143	/*
144	Clean a loaded pages rendering operations,
145	with an optional post processing step.
146
147	Firstly, this filters the PDF operators used to avoid (some cases
148	of) repetition, and leaves the page in a balanced state with an
149	unchanged top level matrix etc. At the same time, the resources
150	used by the page contents are collected.
151
152	Next, the resources themselves are cleaned (as appropriate) in the
153	same way.
154
155	Next, an optional post processing stage is called.
156
157	Finally, the page contents and resources in the documents page tree
158	are replaced by these processed versions.
159
160	Annotations remain unaffected.
161
162	page: A page loaded by pdf_load_page.
163
164	cookie: A pointer to an optional fz_cookie structure that can be used
165	to track progress, collect errors etc.
166	*/
167	void pdf_clean_page_contents(fz_context ctx, pdf_document doc, pdf_page page, fz_cookie cookie, pdf_page_contents_process_fn proc_fn, void* arg, int* sanitize, int ascii)
168	{
169	pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, sanitize, ascii);
170	}
171
172	/*
173	Performs the same task as
174	pdf_clean_page_contents, but with an optional text filter
175	function.
176
177	text_filter: Function to assess whether a given character
178	should be kept (return 0) or removed (return 1).
179
180	after_text: Function called after each text object is closed
181	to allow other output to be sent.
182
183	arg: Opaque value to be passed to callback functions.
184	*/
185	void pdf_filter_page_contents(fz_context ctx, pdf_document doc, pdf_page page, fz_cookie cookie,
186	pdf_page_contents_process_fn proc_fn, pdf_text_filter_fn text_filter, pdf_after_text_object_fn after_text, void* *proc_arg,
187	int sanitize, int ascii)
188	{
189	pdf_processor *proc_buffer = NULL;
190	pdf_processor *proc_filter = NULL;
191	pdf_obj *new_obj = NULL;
192	pdf_obj *new_ref = NULL;
193	pdf_obj *res = NULL;
194	pdf_obj *obj;
195	pdf_obj *contents;
196	pdf_obj *resources;
197	fz_buffer *buffer;
198
199	fz_var(new_obj);
200	fz_var(new_ref);
201	fz_var(res);
202	fz_var(proc_buffer);
203	fz_var(proc_filter);
204
205	buffer = fz_new_buffer(ctx, `1024`);
206
207	fz_try(ctx)
208	{
209	pdf_obj *sp = pdf_dict_get(ctx, page->obj, PDF_NAME(StructParents));
210	int structparents = -`1`;
211	if (pdf_is_number(ctx, sp))
212	structparents = pdf_to_int(ctx, sp);
213	contents = pdf_page_contents(ctx, page);
214	resources = pdf_page_resources(ctx, page);
215
216	proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
217	if (sanitize)
218	{
219	res = pdf_new_dict(ctx, doc, `1`);
220	proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, resources, res, text_filter, after_text, proc_arg);
221	pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie);
222	pdf_close_processor(ctx, proc_filter);
223	}
224	else
225	{
226	res = pdf_keep_obj(ctx, resources);
227	pdf_process_contents(ctx, proc_buffer, doc, resources, contents, cookie);
228	}
229	pdf_close_processor(ctx, proc_buffer);
230
231	/ Deal with page content stream. /
232
233	if (pdf_is_array(ctx, contents))
234	{
235	/ create a new object to replace the array /
236	new_obj = pdf_new_dict(ctx, doc, `1`);
237	new_ref = pdf_add_object(ctx, doc, new_obj);
238	contents = new_ref;
239	pdf_dict_put(ctx, page->obj, PDF_NAME(Contents), contents);
240	}
241	else
242	{
243	pdf_dict_del(ctx, contents, PDF_NAME(Filter));
244	pdf_dict_del(ctx, contents, PDF_NAME(DecodeParms));
245	}
246
247	pdf_update_stream(ctx, doc, contents, buffer, `0`);
248
249	/ Now deal with resources. The spec allows for Type3 fonts and form*
250	* XObjects to omit a resource dictionary and look in the parent.
251	* Avoid that by flattening here as part of the cleaning. This could
252	* conceivably cause changes in rendering, but we don't care. */
253
254	/ ExtGState /
255	obj = pdf_dict_get(ctx, res, PDF_NAME(ExtGState));
256	if (obj)
257	{
258	int i, l;
259
260	l = pdf_dict_len(ctx, obj);
261	for (i = `0`; i < l; i++)
262	{
263	pdf_obj *o = pdf_dict_get(ctx, pdf_dict_get_val(ctx, obj, i), PDF_NAME(SMask));
264	if (!o)
265	continue;
266	o = pdf_dict_get(ctx, o, PDF_NAME(G));
267	if (!o)
268	continue;
269	/ Transparency group XObject /
270	pdf_clean_stream_object(ctx, doc, o, resources, cookie, `1`, text_filter, after_text, proc_arg, sanitize, ascii);
271	}
272	}
273
274	/ Pattern /
275	obj = pdf_dict_get(ctx, res, PDF_NAME(Pattern));
276	if (obj)
277	{
278	int i, l;
279	l = pdf_dict_len(ctx, obj);
280	for (i = `0`; i < l; i++)
281	{
282	pdf_obj *pat_res;
283	pdf_obj *pat = pdf_dict_get_val(ctx, obj, i);
284	if (!pat)
285	continue;
286	pat_res = pdf_dict_get(ctx, pat, PDF_NAME(Resources));
287	if (pat_res == NULL)
288	pat_res = resources;
289	if (pdf_dict_get_int(ctx, pat, PDF_NAME(PatternType)) == `1`)
290	pdf_clean_stream_object(ctx, doc, pat, pat_res, cookie, `0`, text_filter, after_text, proc_arg, sanitize, ascii);
291	}
292	}
293
294	/ XObject /
295	obj = pdf_dict_get(ctx, res, PDF_NAME(XObject));
296	if (obj)
297	{
298	int i, l;
299	l = pdf_dict_len(ctx, obj);
300	for (i = `0`; i < l; i++)
301	{
302	pdf_obj *xobj_res;
303	pdf_obj *xobj = pdf_dict_get_val(ctx, obj, i);
304	if (!xobj)
305	continue;
306	xobj_res = pdf_dict_get(ctx, xobj, PDF_NAME(Resources));
307	if (xobj_res == NULL)
308	xobj_res = resources;
309	if (pdf_name_eq(ctx, PDF_NAME(Form), pdf_dict_get(ctx, xobj, PDF_NAME(Subtype))))
310	pdf_clean_stream_object(ctx, doc, xobj, xobj_res, cookie, `1`, text_filter, after_text, proc_arg, sanitize, ascii);
311	}
312	}
313
314	/ Font /
315	obj = pdf_dict_get(ctx, res, PDF_NAME(Font));
316	if (obj)
317	{
318	int i, l;
319	l = pdf_dict_len(ctx, obj);
320	for (i = `0`; i < l; i++)
321	{
322	pdf_obj *o = pdf_dict_get_val(ctx, obj, i);
323	if (!o)
324	continue;
325	if (pdf_name_eq(ctx, PDF_NAME(Type3), pdf_dict_get(ctx, o, PDF_NAME(Subtype))))
326	pdf_clean_type3(ctx, doc, o, resources, cookie, sanitize, ascii);
327	}
328	}
329
330	/ ProcSet - no cleaning possible. Inherit this from the old dict. /
331	obj = pdf_dict_get(ctx, resources, PDF_NAME(ProcSet));
332	if (obj)
333	pdf_dict_put(ctx, res, PDF_NAME(ProcSet), obj);
334
335	/ ColorSpace - no cleaning possible. /
336	/ Properties - no cleaning possible. /
337
338	if (proc_fn)
339	(*proc_fn)(ctx, buffer, res, proc_arg);
340
341	/ Update resource dictionary /
342	if (sanitize)
343	{
344	pdf_dict_put(ctx, page->obj, PDF_NAME(Resources), res);
345	}
346	}
347	fz_always(ctx)
348	{
349	pdf_drop_processor(ctx, proc_filter);
350	pdf_drop_processor(ctx, proc_buffer);
351	fz_drop_buffer(ctx, buffer);
352	pdf_drop_obj(ctx, new_obj);
353	pdf_drop_obj(ctx, new_ref);
354	pdf_drop_obj(ctx, res);
355	}
356	fz_catch(ctx)
357	{
358	fz_rethrow(ctx);
359	}
360	}
361
362	/*
363	Clean a loaded annotations rendering operations,
364	with an optional post processing step.
365
366	Each appearance stream in the annotation is processed.
367
368	Firstly, this filters the PDF operators used to avoid (some cases
369	of) repetition, and leaves the page in a balanced state with an
370	unchanged top level matrix etc. At the same time, the resources
371	used by the page contents are collected.
372
373	Next, the resources themselves are cleaned (as appropriate) in the
374	same way.
375
376	Next, an optional post processing stage is called.
377
378	Finally, the updated stream of operations is reinserted into the
379	appearance stream.
380
381	annot: An annotation loaded by pdf_load_annot.
382
383	cookie: A pointer to an optional fz_cookie structure that can be used
384	to track progress, collect errors etc.
385	*/
386	void pdf_clean_annot_contents(fz_context ctx, pdf_document doc, pdf_annot annot, fz_cookie cookie, pdf_page_contents_process_fn proc_fn, void* proc_arg, int* sanitize, int ascii)
387	{
388	pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, sanitize, ascii);
389	}
390
391	/*
392	Performs the same task as
393	pdf_clean_annot_contents, but with an optional text filter
394	function.
395
396	text_filter: Function to assess whether a given character
397	should be kept (return 0) or removed (return 1).
398
399	after_text: Function called after each text object is closed
400	to allow other output to be sent.
401
402	arg: Opaque value to be passed to callback functions.
403	*/
404	void pdf_filter_annot_contents(fz_context ctx, pdf_document doc, pdf_annot annot, fz_cookie cookie,
405	pdf_page_contents_process_fn proc, pdf_text_filter_fn text_filter, pdf_after_text_object_fn after_text, void* arg, int* sanitize, int ascii)
406	{
407	pdf_obj *ap;
408	int i, n;
409
410	ap = pdf_dict_get(ctx, annot->obj, PDF_NAME(AP));
411	if (ap == NULL)
412	return;
413
414	n = pdf_dict_len(ctx, ap);
415	for (i = `0`; i < n; i++)
416	{
417	pdf_obj *v = pdf_dict_get_val(ctx, ap, i);
418
419	if (v == NULL)
420	continue;
421
422	pdf_clean_stream_object(ctx, doc, v, NULL, cookie, `1`, text_filter, after_text, arg, sanitize, ascii);
423	}
424	}
425
426	static void
427	pdf_redact_end_page(fz_context ctx, fz_buffer buf, pdf_obj res, void* *opaque)
428	{
429	pdf_page *page = opaque;
430	pdf_annot *annot;
431	pdf_obj *qp;
432	int i, n;
433
434	fz_append_string(ctx, buf, "0 g\n");
435
436	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
437	{
438	if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
439	{
440	qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
441	n = pdf_array_len(ctx, qp);
442	if (n > `0`)
443	{
444	for (i = `0`; i < n; i += `8`)
445	{
446	fz_quad q = pdf_to_quad(ctx, qp, i);
447	fz_append_printf(ctx, buf, "%g %g m\n", q.ll.x, q.ll.y);
448	fz_append_printf(ctx, buf, "%g %g l\n", q.lr.x, q.lr.y);
449	fz_append_printf(ctx, buf, "%g %g l\n", q.ur.x, q.ur.y);
450	fz_append_printf(ctx, buf, "%g %g l\n", q.ul.x, q.ul.y);
451	fz_append_string(ctx, buf, "f\n");
452	}
453	}
454	else
455	{
456	fz_rect r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
457	fz_append_printf(ctx, buf, "%g %g m\n", r.x0, r.y0);
458	fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y0);
459	fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y1);
460	fz_append_printf(ctx, buf, "%g %g l\n", r.x0, r.y1);
461	fz_append_string(ctx, buf, "f\n");
462	}
463	}
464	}
465	}
466
467	static int
468	pdf_redact_text_filter(fz_context ctx, void* opaque, int* ucsbuf, int* ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox)
469	{
470	pdf_page *page = opaque;
471	pdf_annot *annot;
472	pdf_obj *qp;
473	fz_rect r;
474	fz_quad q;
475	int i, n;
476
477	trm = fz_concat(trm, ctm);
478
479	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
480	{
481	if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
482	{
483	qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
484	n = pdf_array_len(ctx, qp);
485	if (n > `0`)
486	{
487	for (i = `0`; i < n; i += `8`)
488	{
489	q = pdf_to_quad(ctx, qp, i);
490	if (fz_is_point_inside_quad(fz_make_point(trm.e, trm.f), q))
491	return `1`;
492	}
493	}
494	else
495	{
496	r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
497	if (fz_is_point_inside_rect(fz_make_point(trm.e, trm.f), r))
498	return `1`;
499	}
500	}
501	}
502
503	return `0`;
504	}
505
506	int
507	pdf_redact_page(fz_context ctx, pdf_document doc, pdf_page page, pdf_redact_options opts)
508	{
509	pdf_annot *annot;
510	int has_redactions = `0`;
511	int no_black_boxes = `0`;
512
513	if (opts)
514	{
515	no_black_boxes = opts->no_black_boxes;
516	}
517
518	for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
519	if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
520	has_redactions = `1`;
521
522	if (has_redactions)
523	{
524	pdf_filter_page_contents(ctx, doc, page, NULL,
525	no_black_boxes ? NULL : pdf_redact_end_page,
526	pdf_redact_text_filter,
527	NULL,
528	page,
529	`1`, `1`);
530	}
531
532	annot = pdf_first_annot(ctx, page);
533	while (annot)
534	{
535	if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
536	{
537	pdf_delete_annot(ctx, page, annot);
538	annot = pdf_first_annot(ctx, page);
539	}
540	else
541	{
542	annot = pdf_next_annot(ctx, annot);
543	}
544	}
545
546	doc->redacted = has_redactions;
547
548	return has_redactions;
549	}
550

Browse the source code of MuPDF/source/pdf/pdf-clean.c