1#include "mupdf/fitz.h"
2#include "mupdf/pdf.h"
3
4static void
5pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res,
6 pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg,
7 int sanitize, int ascii)
8{
9 pdf_processor *proc_buffer = NULL;
10 pdf_processor *proc_filter = NULL;
11 pdf_obj *res = NULL;
12 pdf_obj *ref;
13 fz_buffer *buffer;
14
15 if (!obj)
16 return;
17
18 fz_var(res);
19 fz_var(proc_buffer);
20 fz_var(proc_filter);
21
22 buffer = fz_new_buffer(ctx, 1024);
23
24 fz_try(ctx)
25 {
26 pdf_obj *sp = pdf_dict_get(ctx, obj, PDF_NAME(StructParents));
27 int structparents = -1;
28 if (pdf_is_number(ctx, sp))
29 structparents = pdf_to_int(ctx, sp);
30
31 if (own_res)
32 {
33 pdf_obj *r = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
34 if (r)
35 orig_res = r;
36 }
37
38 res = pdf_new_dict(ctx, doc, 1);
39
40 proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
41 proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, orig_res, res, text_filter, after_text, arg);
42
43 pdf_process_contents(ctx, proc_filter, doc, orig_res, obj, cookie);
44 pdf_close_processor(ctx, proc_filter);
45 pdf_close_processor(ctx, proc_buffer);
46
47 pdf_update_stream(ctx, doc, obj, buffer, 0);
48
49 if (own_res)
50 {
51 ref = pdf_add_object(ctx, doc, res);
52 pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref);
53 }
54 }
55 fz_always(ctx)
56 {
57 pdf_drop_processor(ctx, proc_filter);
58 pdf_drop_processor(ctx, proc_buffer);
59 fz_drop_buffer(ctx, buffer);
60 pdf_drop_obj(ctx, res);
61 }
62 fz_catch(ctx)
63 {
64 fz_rethrow(ctx);
65 }
66}
67
68static void
69pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int sanitize, int ascii)
70{
71 pdf_processor *proc_buffer = NULL;
72 pdf_processor *proc_filter = NULL;
73 pdf_obj *res = NULL;
74 pdf_obj *ref;
75 pdf_obj *charprocs;
76 int i, l;
77
78 fz_var(res);
79 fz_var(proc_buffer);
80 fz_var(proc_filter);
81
82 fz_try(ctx)
83 {
84 res = pdf_dict_get(ctx, obj, PDF_NAME(Resources));
85 if (res)
86 orig_res = res;
87 res = NULL;
88
89 res = pdf_new_dict(ctx, doc, 1);
90
91 charprocs = pdf_dict_get(ctx, obj, PDF_NAME(CharProcs));
92 l = pdf_dict_len(ctx, charprocs);
93
94 for (i = 0; i < l; i++)
95 {
96 pdf_obj *val = pdf_dict_get_val(ctx, charprocs, i);
97 fz_buffer *buffer = fz_new_buffer(ctx, 1024);
98 fz_try(ctx)
99 {
100 proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
101 if (sanitize)
102 {
103 proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res);
104 pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
105 pdf_close_processor(ctx, proc_filter);
106 }
107 else
108 {
109 pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie);
110 }
111 pdf_close_processor(ctx, proc_buffer);
112
113 pdf_update_stream(ctx, doc, val, buffer, 0);
114 }
115 fz_always(ctx)
116 {
117 pdf_drop_processor(ctx, proc_filter);
118 pdf_drop_processor(ctx, proc_buffer);
119 fz_drop_buffer(ctx, buffer);
120 }
121 fz_catch(ctx)
122 {
123 fz_rethrow(ctx);
124 }
125 }
126
127 /* ProcSet - no cleaning possible. Inherit this from the old dict. */
128 pdf_dict_put(ctx, res, PDF_NAME(ProcSet), pdf_dict_get(ctx, orig_res, PDF_NAME(ProcSet)));
129
130 ref = pdf_add_object(ctx, doc, res);
131 pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref);
132 }
133 fz_always(ctx)
134 {
135 pdf_drop_obj(ctx, res);
136 }
137 fz_catch(ctx)
138 {
139 fz_rethrow(ctx);
140 }
141}
142
143/*
144 Clean a loaded pages rendering operations,
145 with an optional post processing step.
146
147 Firstly, this filters the PDF operators used to avoid (some cases
148 of) repetition, and leaves the page in a balanced state with an
149 unchanged top level matrix etc. At the same time, the resources
150 used by the page contents are collected.
151
152 Next, the resources themselves are cleaned (as appropriate) in the
153 same way.
154
155 Next, an optional post processing stage is called.
156
157 Finally, the page contents and resources in the documents page tree
158 are replaced by these processed versions.
159
160 Annotations remain unaffected.
161
162 page: A page loaded by pdf_load_page.
163
164 cookie: A pointer to an optional fz_cookie structure that can be used
165 to track progress, collect errors etc.
166*/
167void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int sanitize, int ascii)
168{
169 pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, sanitize, ascii);
170}
171
172/*
173 Performs the same task as
174 pdf_clean_page_contents, but with an optional text filter
175 function.
176
177 text_filter: Function to assess whether a given character
178 should be kept (return 0) or removed (return 1).
179
180 after_text: Function called after each text object is closed
181 to allow other output to be sent.
182
183 arg: Opaque value to be passed to callback functions.
184*/
185void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie,
186 pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg,
187 int sanitize, int ascii)
188{
189 pdf_processor *proc_buffer = NULL;
190 pdf_processor *proc_filter = NULL;
191 pdf_obj *new_obj = NULL;
192 pdf_obj *new_ref = NULL;
193 pdf_obj *res = NULL;
194 pdf_obj *obj;
195 pdf_obj *contents;
196 pdf_obj *resources;
197 fz_buffer *buffer;
198
199 fz_var(new_obj);
200 fz_var(new_ref);
201 fz_var(res);
202 fz_var(proc_buffer);
203 fz_var(proc_filter);
204
205 buffer = fz_new_buffer(ctx, 1024);
206
207 fz_try(ctx)
208 {
209 pdf_obj *sp = pdf_dict_get(ctx, page->obj, PDF_NAME(StructParents));
210 int structparents = -1;
211 if (pdf_is_number(ctx, sp))
212 structparents = pdf_to_int(ctx, sp);
213 contents = pdf_page_contents(ctx, page);
214 resources = pdf_page_resources(ctx, page);
215
216 proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii);
217 if (sanitize)
218 {
219 res = pdf_new_dict(ctx, doc, 1);
220 proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, resources, res, text_filter, after_text, proc_arg);
221 pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie);
222 pdf_close_processor(ctx, proc_filter);
223 }
224 else
225 {
226 res = pdf_keep_obj(ctx, resources);
227 pdf_process_contents(ctx, proc_buffer, doc, resources, contents, cookie);
228 }
229 pdf_close_processor(ctx, proc_buffer);
230
231 /* Deal with page content stream. */
232
233 if (pdf_is_array(ctx, contents))
234 {
235 /* create a new object to replace the array */
236 new_obj = pdf_new_dict(ctx, doc, 1);
237 new_ref = pdf_add_object(ctx, doc, new_obj);
238 contents = new_ref;
239 pdf_dict_put(ctx, page->obj, PDF_NAME(Contents), contents);
240 }
241 else
242 {
243 pdf_dict_del(ctx, contents, PDF_NAME(Filter));
244 pdf_dict_del(ctx, contents, PDF_NAME(DecodeParms));
245 }
246
247 pdf_update_stream(ctx, doc, contents, buffer, 0);
248
249 /* Now deal with resources. The spec allows for Type3 fonts and form
250 * XObjects to omit a resource dictionary and look in the parent.
251 * Avoid that by flattening here as part of the cleaning. This could
252 * conceivably cause changes in rendering, but we don't care. */
253
254 /* ExtGState */
255 obj = pdf_dict_get(ctx, res, PDF_NAME(ExtGState));
256 if (obj)
257 {
258 int i, l;
259
260 l = pdf_dict_len(ctx, obj);
261 for (i = 0; i < l; i++)
262 {
263 pdf_obj *o = pdf_dict_get(ctx, pdf_dict_get_val(ctx, obj, i), PDF_NAME(SMask));
264 if (!o)
265 continue;
266 o = pdf_dict_get(ctx, o, PDF_NAME(G));
267 if (!o)
268 continue;
269 /* Transparency group XObject */
270 pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii);
271 }
272 }
273
274 /* Pattern */
275 obj = pdf_dict_get(ctx, res, PDF_NAME(Pattern));
276 if (obj)
277 {
278 int i, l;
279 l = pdf_dict_len(ctx, obj);
280 for (i = 0; i < l; i++)
281 {
282 pdf_obj *pat_res;
283 pdf_obj *pat = pdf_dict_get_val(ctx, obj, i);
284 if (!pat)
285 continue;
286 pat_res = pdf_dict_get(ctx, pat, PDF_NAME(Resources));
287 if (pat_res == NULL)
288 pat_res = resources;
289 if (pdf_dict_get_int(ctx, pat, PDF_NAME(PatternType)) == 1)
290 pdf_clean_stream_object(ctx, doc, pat, pat_res, cookie, 0, text_filter, after_text, proc_arg, sanitize, ascii);
291 }
292 }
293
294 /* XObject */
295 obj = pdf_dict_get(ctx, res, PDF_NAME(XObject));
296 if (obj)
297 {
298 int i, l;
299 l = pdf_dict_len(ctx, obj);
300 for (i = 0; i < l; i++)
301 {
302 pdf_obj *xobj_res;
303 pdf_obj *xobj = pdf_dict_get_val(ctx, obj, i);
304 if (!xobj)
305 continue;
306 xobj_res = pdf_dict_get(ctx, xobj, PDF_NAME(Resources));
307 if (xobj_res == NULL)
308 xobj_res = resources;
309 if (pdf_name_eq(ctx, PDF_NAME(Form), pdf_dict_get(ctx, xobj, PDF_NAME(Subtype))))
310 pdf_clean_stream_object(ctx, doc, xobj, xobj_res, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii);
311 }
312 }
313
314 /* Font */
315 obj = pdf_dict_get(ctx, res, PDF_NAME(Font));
316 if (obj)
317 {
318 int i, l;
319 l = pdf_dict_len(ctx, obj);
320 for (i = 0; i < l; i++)
321 {
322 pdf_obj *o = pdf_dict_get_val(ctx, obj, i);
323 if (!o)
324 continue;
325 if (pdf_name_eq(ctx, PDF_NAME(Type3), pdf_dict_get(ctx, o, PDF_NAME(Subtype))))
326 pdf_clean_type3(ctx, doc, o, resources, cookie, sanitize, ascii);
327 }
328 }
329
330 /* ProcSet - no cleaning possible. Inherit this from the old dict. */
331 obj = pdf_dict_get(ctx, resources, PDF_NAME(ProcSet));
332 if (obj)
333 pdf_dict_put(ctx, res, PDF_NAME(ProcSet), obj);
334
335 /* ColorSpace - no cleaning possible. */
336 /* Properties - no cleaning possible. */
337
338 if (proc_fn)
339 (*proc_fn)(ctx, buffer, res, proc_arg);
340
341 /* Update resource dictionary */
342 if (sanitize)
343 {
344 pdf_dict_put(ctx, page->obj, PDF_NAME(Resources), res);
345 }
346 }
347 fz_always(ctx)
348 {
349 pdf_drop_processor(ctx, proc_filter);
350 pdf_drop_processor(ctx, proc_buffer);
351 fz_drop_buffer(ctx, buffer);
352 pdf_drop_obj(ctx, new_obj);
353 pdf_drop_obj(ctx, new_ref);
354 pdf_drop_obj(ctx, res);
355 }
356 fz_catch(ctx)
357 {
358 fz_rethrow(ctx);
359 }
360}
361
362/*
363 Clean a loaded annotations rendering operations,
364 with an optional post processing step.
365
366 Each appearance stream in the annotation is processed.
367
368 Firstly, this filters the PDF operators used to avoid (some cases
369 of) repetition, and leaves the page in a balanced state with an
370 unchanged top level matrix etc. At the same time, the resources
371 used by the page contents are collected.
372
373 Next, the resources themselves are cleaned (as appropriate) in the
374 same way.
375
376 Next, an optional post processing stage is called.
377
378 Finally, the updated stream of operations is reinserted into the
379 appearance stream.
380
381 annot: An annotation loaded by pdf_load_annot.
382
383 cookie: A pointer to an optional fz_cookie structure that can be used
384 to track progress, collect errors etc.
385*/
386void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int sanitize, int ascii)
387{
388 pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, sanitize, ascii);
389}
390
391/*
392 Performs the same task as
393 pdf_clean_annot_contents, but with an optional text filter
394 function.
395
396 text_filter: Function to assess whether a given character
397 should be kept (return 0) or removed (return 1).
398
399 after_text: Function called after each text object is closed
400 to allow other output to be sent.
401
402 arg: Opaque value to be passed to callback functions.
403*/
404void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie,
405 pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int sanitize, int ascii)
406{
407 pdf_obj *ap;
408 int i, n;
409
410 ap = pdf_dict_get(ctx, annot->obj, PDF_NAME(AP));
411 if (ap == NULL)
412 return;
413
414 n = pdf_dict_len(ctx, ap);
415 for (i = 0; i < n; i++)
416 {
417 pdf_obj *v = pdf_dict_get_val(ctx, ap, i);
418
419 if (v == NULL)
420 continue;
421
422 pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, text_filter, after_text, arg, sanitize, ascii);
423 }
424}
425
426static void
427pdf_redact_end_page(fz_context *ctx, fz_buffer *buf, pdf_obj *res, void *opaque)
428{
429 pdf_page *page = opaque;
430 pdf_annot *annot;
431 pdf_obj *qp;
432 int i, n;
433
434 fz_append_string(ctx, buf, "0 g\n");
435
436 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
437 {
438 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
439 {
440 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
441 n = pdf_array_len(ctx, qp);
442 if (n > 0)
443 {
444 for (i = 0; i < n; i += 8)
445 {
446 fz_quad q = pdf_to_quad(ctx, qp, i);
447 fz_append_printf(ctx, buf, "%g %g m\n", q.ll.x, q.ll.y);
448 fz_append_printf(ctx, buf, "%g %g l\n", q.lr.x, q.lr.y);
449 fz_append_printf(ctx, buf, "%g %g l\n", q.ur.x, q.ur.y);
450 fz_append_printf(ctx, buf, "%g %g l\n", q.ul.x, q.ul.y);
451 fz_append_string(ctx, buf, "f\n");
452 }
453 }
454 else
455 {
456 fz_rect r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
457 fz_append_printf(ctx, buf, "%g %g m\n", r.x0, r.y0);
458 fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y0);
459 fz_append_printf(ctx, buf, "%g %g l\n", r.x1, r.y1);
460 fz_append_printf(ctx, buf, "%g %g l\n", r.x0, r.y1);
461 fz_append_string(ctx, buf, "f\n");
462 }
463 }
464 }
465}
466
467static int
468pdf_redact_text_filter(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox)
469{
470 pdf_page *page = opaque;
471 pdf_annot *annot;
472 pdf_obj *qp;
473 fz_rect r;
474 fz_quad q;
475 int i, n;
476
477 trm = fz_concat(trm, ctm);
478
479 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
480 {
481 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
482 {
483 qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints));
484 n = pdf_array_len(ctx, qp);
485 if (n > 0)
486 {
487 for (i = 0; i < n; i += 8)
488 {
489 q = pdf_to_quad(ctx, qp, i);
490 if (fz_is_point_inside_quad(fz_make_point(trm.e, trm.f), q))
491 return 1;
492 }
493 }
494 else
495 {
496 r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect));
497 if (fz_is_point_inside_rect(fz_make_point(trm.e, trm.f), r))
498 return 1;
499 }
500 }
501 }
502
503 return 0;
504}
505
506int
507pdf_redact_page(fz_context *ctx, pdf_document *doc, pdf_page *page, pdf_redact_options *opts)
508{
509 pdf_annot *annot;
510 int has_redactions = 0;
511 int no_black_boxes = 0;
512
513 if (opts)
514 {
515 no_black_boxes = opts->no_black_boxes;
516 }
517
518 for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot))
519 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
520 has_redactions = 1;
521
522 if (has_redactions)
523 {
524 pdf_filter_page_contents(ctx, doc, page, NULL,
525 no_black_boxes ? NULL : pdf_redact_end_page,
526 pdf_redact_text_filter,
527 NULL,
528 page,
529 1, 1);
530 }
531
532 annot = pdf_first_annot(ctx, page);
533 while (annot)
534 {
535 if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact))
536 {
537 pdf_delete_annot(ctx, page, annot);
538 annot = pdf_first_annot(ctx, page);
539 }
540 else
541 {
542 annot = pdf_next_annot(ctx, annot);
543 }
544 }
545
546 doc->redacted = has_redactions;
547
548 return has_redactions;
549}
550