1 | #include "mupdf/fitz.h" |
2 | #include "mupdf/pdf.h" |
3 | |
4 | static void |
5 | pdf_clean_stream_object(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int own_res, |
6 | pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, |
7 | int sanitize, int ascii) |
8 | { |
9 | pdf_processor *proc_buffer = NULL; |
10 | pdf_processor *proc_filter = NULL; |
11 | pdf_obj *res = NULL; |
12 | pdf_obj *ref; |
13 | fz_buffer *buffer; |
14 | |
15 | if (!obj) |
16 | return; |
17 | |
18 | fz_var(res); |
19 | fz_var(proc_buffer); |
20 | fz_var(proc_filter); |
21 | |
22 | buffer = fz_new_buffer(ctx, 1024); |
23 | |
24 | fz_try(ctx) |
25 | { |
26 | pdf_obj *sp = pdf_dict_get(ctx, obj, PDF_NAME(StructParents)); |
27 | int structparents = -1; |
28 | if (pdf_is_number(ctx, sp)) |
29 | structparents = pdf_to_int(ctx, sp); |
30 | |
31 | if (own_res) |
32 | { |
33 | pdf_obj *r = pdf_dict_get(ctx, obj, PDF_NAME(Resources)); |
34 | if (r) |
35 | orig_res = r; |
36 | } |
37 | |
38 | res = pdf_new_dict(ctx, doc, 1); |
39 | |
40 | proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii); |
41 | proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, orig_res, res, text_filter, after_text, arg); |
42 | |
43 | pdf_process_contents(ctx, proc_filter, doc, orig_res, obj, cookie); |
44 | pdf_close_processor(ctx, proc_filter); |
45 | pdf_close_processor(ctx, proc_buffer); |
46 | |
47 | pdf_update_stream(ctx, doc, obj, buffer, 0); |
48 | |
49 | if (own_res) |
50 | { |
51 | ref = pdf_add_object(ctx, doc, res); |
52 | pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref); |
53 | } |
54 | } |
55 | fz_always(ctx) |
56 | { |
57 | pdf_drop_processor(ctx, proc_filter); |
58 | pdf_drop_processor(ctx, proc_buffer); |
59 | fz_drop_buffer(ctx, buffer); |
60 | pdf_drop_obj(ctx, res); |
61 | } |
62 | fz_catch(ctx) |
63 | { |
64 | fz_rethrow(ctx); |
65 | } |
66 | } |
67 | |
68 | static void |
69 | pdf_clean_type3(fz_context *ctx, pdf_document *doc, pdf_obj *obj, pdf_obj *orig_res, fz_cookie *cookie, int sanitize, int ascii) |
70 | { |
71 | pdf_processor *proc_buffer = NULL; |
72 | pdf_processor *proc_filter = NULL; |
73 | pdf_obj *res = NULL; |
74 | pdf_obj *ref; |
75 | pdf_obj *charprocs; |
76 | int i, l; |
77 | |
78 | fz_var(res); |
79 | fz_var(proc_buffer); |
80 | fz_var(proc_filter); |
81 | |
82 | fz_try(ctx) |
83 | { |
84 | res = pdf_dict_get(ctx, obj, PDF_NAME(Resources)); |
85 | if (res) |
86 | orig_res = res; |
87 | res = NULL; |
88 | |
89 | res = pdf_new_dict(ctx, doc, 1); |
90 | |
91 | charprocs = pdf_dict_get(ctx, obj, PDF_NAME(CharProcs)); |
92 | l = pdf_dict_len(ctx, charprocs); |
93 | |
94 | for (i = 0; i < l; i++) |
95 | { |
96 | pdf_obj *val = pdf_dict_get_val(ctx, charprocs, i); |
97 | fz_buffer *buffer = fz_new_buffer(ctx, 1024); |
98 | fz_try(ctx) |
99 | { |
100 | proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii); |
101 | if (sanitize) |
102 | { |
103 | proc_filter = pdf_new_filter_processor(ctx, doc, proc_buffer, orig_res, res); |
104 | pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie); |
105 | pdf_close_processor(ctx, proc_filter); |
106 | } |
107 | else |
108 | { |
109 | pdf_process_contents(ctx, proc_filter, doc, orig_res, val, cookie); |
110 | } |
111 | pdf_close_processor(ctx, proc_buffer); |
112 | |
113 | pdf_update_stream(ctx, doc, val, buffer, 0); |
114 | } |
115 | fz_always(ctx) |
116 | { |
117 | pdf_drop_processor(ctx, proc_filter); |
118 | pdf_drop_processor(ctx, proc_buffer); |
119 | fz_drop_buffer(ctx, buffer); |
120 | } |
121 | fz_catch(ctx) |
122 | { |
123 | fz_rethrow(ctx); |
124 | } |
125 | } |
126 | |
127 | /* ProcSet - no cleaning possible. Inherit this from the old dict. */ |
128 | pdf_dict_put(ctx, res, PDF_NAME(ProcSet), pdf_dict_get(ctx, orig_res, PDF_NAME(ProcSet))); |
129 | |
130 | ref = pdf_add_object(ctx, doc, res); |
131 | pdf_dict_put_drop(ctx, obj, PDF_NAME(Resources), ref); |
132 | } |
133 | fz_always(ctx) |
134 | { |
135 | pdf_drop_obj(ctx, res); |
136 | } |
137 | fz_catch(ctx) |
138 | { |
139 | fz_rethrow(ctx); |
140 | } |
141 | } |
142 | |
143 | /* |
144 | Clean a loaded pages rendering operations, |
145 | with an optional post processing step. |
146 | |
147 | Firstly, this filters the PDF operators used to avoid (some cases |
148 | of) repetition, and leaves the page in a balanced state with an |
149 | unchanged top level matrix etc. At the same time, the resources |
150 | used by the page contents are collected. |
151 | |
152 | Next, the resources themselves are cleaned (as appropriate) in the |
153 | same way. |
154 | |
155 | Next, an optional post processing stage is called. |
156 | |
157 | Finally, the page contents and resources in the documents page tree |
158 | are replaced by these processed versions. |
159 | |
160 | Annotations remain unaffected. |
161 | |
162 | page: A page loaded by pdf_load_page. |
163 | |
164 | cookie: A pointer to an optional fz_cookie structure that can be used |
165 | to track progress, collect errors etc. |
166 | */ |
167 | void pdf_clean_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *arg, int sanitize, int ascii) |
168 | { |
169 | pdf_filter_page_contents(ctx, doc, page, cookie, proc_fn, NULL, NULL, arg, sanitize, ascii); |
170 | } |
171 | |
172 | /* |
173 | Performs the same task as |
174 | pdf_clean_page_contents, but with an optional text filter |
175 | function. |
176 | |
177 | text_filter: Function to assess whether a given character |
178 | should be kept (return 0) or removed (return 1). |
179 | |
180 | after_text: Function called after each text object is closed |
181 | to allow other output to be sent. |
182 | |
183 | arg: Opaque value to be passed to callback functions. |
184 | */ |
185 | void pdf_filter_page_contents(fz_context *ctx, pdf_document *doc, pdf_page *page, fz_cookie *cookie, |
186 | pdf_page_contents_process_fn *proc_fn, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *proc_arg, |
187 | int sanitize, int ascii) |
188 | { |
189 | pdf_processor *proc_buffer = NULL; |
190 | pdf_processor *proc_filter = NULL; |
191 | pdf_obj *new_obj = NULL; |
192 | pdf_obj *new_ref = NULL; |
193 | pdf_obj *res = NULL; |
194 | pdf_obj *obj; |
195 | pdf_obj *contents; |
196 | pdf_obj *resources; |
197 | fz_buffer *buffer; |
198 | |
199 | fz_var(new_obj); |
200 | fz_var(new_ref); |
201 | fz_var(res); |
202 | fz_var(proc_buffer); |
203 | fz_var(proc_filter); |
204 | |
205 | buffer = fz_new_buffer(ctx, 1024); |
206 | |
207 | fz_try(ctx) |
208 | { |
209 | pdf_obj *sp = pdf_dict_get(ctx, page->obj, PDF_NAME(StructParents)); |
210 | int structparents = -1; |
211 | if (pdf_is_number(ctx, sp)) |
212 | structparents = pdf_to_int(ctx, sp); |
213 | contents = pdf_page_contents(ctx, page); |
214 | resources = pdf_page_resources(ctx, page); |
215 | |
216 | proc_buffer = pdf_new_buffer_processor(ctx, buffer, ascii); |
217 | if (sanitize) |
218 | { |
219 | res = pdf_new_dict(ctx, doc, 1); |
220 | proc_filter = pdf_new_filter_processor_with_text_filter(ctx, doc, structparents, proc_buffer, resources, res, text_filter, after_text, proc_arg); |
221 | pdf_process_contents(ctx, proc_filter, doc, resources, contents, cookie); |
222 | pdf_close_processor(ctx, proc_filter); |
223 | } |
224 | else |
225 | { |
226 | res = pdf_keep_obj(ctx, resources); |
227 | pdf_process_contents(ctx, proc_buffer, doc, resources, contents, cookie); |
228 | } |
229 | pdf_close_processor(ctx, proc_buffer); |
230 | |
231 | /* Deal with page content stream. */ |
232 | |
233 | if (pdf_is_array(ctx, contents)) |
234 | { |
235 | /* create a new object to replace the array */ |
236 | new_obj = pdf_new_dict(ctx, doc, 1); |
237 | new_ref = pdf_add_object(ctx, doc, new_obj); |
238 | contents = new_ref; |
239 | pdf_dict_put(ctx, page->obj, PDF_NAME(Contents), contents); |
240 | } |
241 | else |
242 | { |
243 | pdf_dict_del(ctx, contents, PDF_NAME(Filter)); |
244 | pdf_dict_del(ctx, contents, PDF_NAME(DecodeParms)); |
245 | } |
246 | |
247 | pdf_update_stream(ctx, doc, contents, buffer, 0); |
248 | |
249 | /* Now deal with resources. The spec allows for Type3 fonts and form |
250 | * XObjects to omit a resource dictionary and look in the parent. |
251 | * Avoid that by flattening here as part of the cleaning. This could |
252 | * conceivably cause changes in rendering, but we don't care. */ |
253 | |
254 | /* ExtGState */ |
255 | obj = pdf_dict_get(ctx, res, PDF_NAME(ExtGState)); |
256 | if (obj) |
257 | { |
258 | int i, l; |
259 | |
260 | l = pdf_dict_len(ctx, obj); |
261 | for (i = 0; i < l; i++) |
262 | { |
263 | pdf_obj *o = pdf_dict_get(ctx, pdf_dict_get_val(ctx, obj, i), PDF_NAME(SMask)); |
264 | if (!o) |
265 | continue; |
266 | o = pdf_dict_get(ctx, o, PDF_NAME(G)); |
267 | if (!o) |
268 | continue; |
269 | /* Transparency group XObject */ |
270 | pdf_clean_stream_object(ctx, doc, o, resources, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii); |
271 | } |
272 | } |
273 | |
274 | /* Pattern */ |
275 | obj = pdf_dict_get(ctx, res, PDF_NAME(Pattern)); |
276 | if (obj) |
277 | { |
278 | int i, l; |
279 | l = pdf_dict_len(ctx, obj); |
280 | for (i = 0; i < l; i++) |
281 | { |
282 | pdf_obj *pat_res; |
283 | pdf_obj *pat = pdf_dict_get_val(ctx, obj, i); |
284 | if (!pat) |
285 | continue; |
286 | pat_res = pdf_dict_get(ctx, pat, PDF_NAME(Resources)); |
287 | if (pat_res == NULL) |
288 | pat_res = resources; |
289 | if (pdf_dict_get_int(ctx, pat, PDF_NAME(PatternType)) == 1) |
290 | pdf_clean_stream_object(ctx, doc, pat, pat_res, cookie, 0, text_filter, after_text, proc_arg, sanitize, ascii); |
291 | } |
292 | } |
293 | |
294 | /* XObject */ |
295 | obj = pdf_dict_get(ctx, res, PDF_NAME(XObject)); |
296 | if (obj) |
297 | { |
298 | int i, l; |
299 | l = pdf_dict_len(ctx, obj); |
300 | for (i = 0; i < l; i++) |
301 | { |
302 | pdf_obj *xobj_res; |
303 | pdf_obj *xobj = pdf_dict_get_val(ctx, obj, i); |
304 | if (!xobj) |
305 | continue; |
306 | xobj_res = pdf_dict_get(ctx, xobj, PDF_NAME(Resources)); |
307 | if (xobj_res == NULL) |
308 | xobj_res = resources; |
309 | if (pdf_name_eq(ctx, PDF_NAME(Form), pdf_dict_get(ctx, xobj, PDF_NAME(Subtype)))) |
310 | pdf_clean_stream_object(ctx, doc, xobj, xobj_res, cookie, 1, text_filter, after_text, proc_arg, sanitize, ascii); |
311 | } |
312 | } |
313 | |
314 | /* Font */ |
315 | obj = pdf_dict_get(ctx, res, PDF_NAME(Font)); |
316 | if (obj) |
317 | { |
318 | int i, l; |
319 | l = pdf_dict_len(ctx, obj); |
320 | for (i = 0; i < l; i++) |
321 | { |
322 | pdf_obj *o = pdf_dict_get_val(ctx, obj, i); |
323 | if (!o) |
324 | continue; |
325 | if (pdf_name_eq(ctx, PDF_NAME(Type3), pdf_dict_get(ctx, o, PDF_NAME(Subtype)))) |
326 | pdf_clean_type3(ctx, doc, o, resources, cookie, sanitize, ascii); |
327 | } |
328 | } |
329 | |
330 | /* ProcSet - no cleaning possible. Inherit this from the old dict. */ |
331 | obj = pdf_dict_get(ctx, resources, PDF_NAME(ProcSet)); |
332 | if (obj) |
333 | pdf_dict_put(ctx, res, PDF_NAME(ProcSet), obj); |
334 | |
335 | /* ColorSpace - no cleaning possible. */ |
336 | /* Properties - no cleaning possible. */ |
337 | |
338 | if (proc_fn) |
339 | (*proc_fn)(ctx, buffer, res, proc_arg); |
340 | |
341 | /* Update resource dictionary */ |
342 | if (sanitize) |
343 | { |
344 | pdf_dict_put(ctx, page->obj, PDF_NAME(Resources), res); |
345 | } |
346 | } |
347 | fz_always(ctx) |
348 | { |
349 | pdf_drop_processor(ctx, proc_filter); |
350 | pdf_drop_processor(ctx, proc_buffer); |
351 | fz_drop_buffer(ctx, buffer); |
352 | pdf_drop_obj(ctx, new_obj); |
353 | pdf_drop_obj(ctx, new_ref); |
354 | pdf_drop_obj(ctx, res); |
355 | } |
356 | fz_catch(ctx) |
357 | { |
358 | fz_rethrow(ctx); |
359 | } |
360 | } |
361 | |
362 | /* |
363 | Clean a loaded annotations rendering operations, |
364 | with an optional post processing step. |
365 | |
366 | Each appearance stream in the annotation is processed. |
367 | |
368 | Firstly, this filters the PDF operators used to avoid (some cases |
369 | of) repetition, and leaves the page in a balanced state with an |
370 | unchanged top level matrix etc. At the same time, the resources |
371 | used by the page contents are collected. |
372 | |
373 | Next, the resources themselves are cleaned (as appropriate) in the |
374 | same way. |
375 | |
376 | Next, an optional post processing stage is called. |
377 | |
378 | Finally, the updated stream of operations is reinserted into the |
379 | appearance stream. |
380 | |
381 | annot: An annotation loaded by pdf_load_annot. |
382 | |
383 | cookie: A pointer to an optional fz_cookie structure that can be used |
384 | to track progress, collect errors etc. |
385 | */ |
386 | void pdf_clean_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, pdf_page_contents_process_fn *proc_fn, void *proc_arg, int sanitize, int ascii) |
387 | { |
388 | pdf_filter_annot_contents(ctx, doc, annot, cookie, proc_fn, NULL, NULL, proc_arg, sanitize, ascii); |
389 | } |
390 | |
391 | /* |
392 | Performs the same task as |
393 | pdf_clean_annot_contents, but with an optional text filter |
394 | function. |
395 | |
396 | text_filter: Function to assess whether a given character |
397 | should be kept (return 0) or removed (return 1). |
398 | |
399 | after_text: Function called after each text object is closed |
400 | to allow other output to be sent. |
401 | |
402 | arg: Opaque value to be passed to callback functions. |
403 | */ |
404 | void pdf_filter_annot_contents(fz_context *ctx, pdf_document *doc, pdf_annot *annot, fz_cookie *cookie, |
405 | pdf_page_contents_process_fn *proc, pdf_text_filter_fn *text_filter, pdf_after_text_object_fn *after_text, void *arg, int sanitize, int ascii) |
406 | { |
407 | pdf_obj *ap; |
408 | int i, n; |
409 | |
410 | ap = pdf_dict_get(ctx, annot->obj, PDF_NAME(AP)); |
411 | if (ap == NULL) |
412 | return; |
413 | |
414 | n = pdf_dict_len(ctx, ap); |
415 | for (i = 0; i < n; i++) |
416 | { |
417 | pdf_obj *v = pdf_dict_get_val(ctx, ap, i); |
418 | |
419 | if (v == NULL) |
420 | continue; |
421 | |
422 | pdf_clean_stream_object(ctx, doc, v, NULL, cookie, 1, text_filter, after_text, arg, sanitize, ascii); |
423 | } |
424 | } |
425 | |
426 | static void |
427 | pdf_redact_end_page(fz_context *ctx, fz_buffer *buf, pdf_obj *res, void *opaque) |
428 | { |
429 | pdf_page *page = opaque; |
430 | pdf_annot *annot; |
431 | pdf_obj *qp; |
432 | int i, n; |
433 | |
434 | fz_append_string(ctx, buf, "0 g\n" ); |
435 | |
436 | for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot)) |
437 | { |
438 | if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact)) |
439 | { |
440 | qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints)); |
441 | n = pdf_array_len(ctx, qp); |
442 | if (n > 0) |
443 | { |
444 | for (i = 0; i < n; i += 8) |
445 | { |
446 | fz_quad q = pdf_to_quad(ctx, qp, i); |
447 | fz_append_printf(ctx, buf, "%g %g m\n" , q.ll.x, q.ll.y); |
448 | fz_append_printf(ctx, buf, "%g %g l\n" , q.lr.x, q.lr.y); |
449 | fz_append_printf(ctx, buf, "%g %g l\n" , q.ur.x, q.ur.y); |
450 | fz_append_printf(ctx, buf, "%g %g l\n" , q.ul.x, q.ul.y); |
451 | fz_append_string(ctx, buf, "f\n" ); |
452 | } |
453 | } |
454 | else |
455 | { |
456 | fz_rect r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)); |
457 | fz_append_printf(ctx, buf, "%g %g m\n" , r.x0, r.y0); |
458 | fz_append_printf(ctx, buf, "%g %g l\n" , r.x1, r.y0); |
459 | fz_append_printf(ctx, buf, "%g %g l\n" , r.x1, r.y1); |
460 | fz_append_printf(ctx, buf, "%g %g l\n" , r.x0, r.y1); |
461 | fz_append_string(ctx, buf, "f\n" ); |
462 | } |
463 | } |
464 | } |
465 | } |
466 | |
467 | static int |
468 | pdf_redact_text_filter(fz_context *ctx, void *opaque, int *ucsbuf, int ucslen, fz_matrix trm, fz_matrix ctm, fz_rect bbox) |
469 | { |
470 | pdf_page *page = opaque; |
471 | pdf_annot *annot; |
472 | pdf_obj *qp; |
473 | fz_rect r; |
474 | fz_quad q; |
475 | int i, n; |
476 | |
477 | trm = fz_concat(trm, ctm); |
478 | |
479 | for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot)) |
480 | { |
481 | if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact)) |
482 | { |
483 | qp = pdf_dict_get(ctx, annot->obj, PDF_NAME(QuadPoints)); |
484 | n = pdf_array_len(ctx, qp); |
485 | if (n > 0) |
486 | { |
487 | for (i = 0; i < n; i += 8) |
488 | { |
489 | q = pdf_to_quad(ctx, qp, i); |
490 | if (fz_is_point_inside_quad(fz_make_point(trm.e, trm.f), q)) |
491 | return 1; |
492 | } |
493 | } |
494 | else |
495 | { |
496 | r = pdf_dict_get_rect(ctx, annot->obj, PDF_NAME(Rect)); |
497 | if (fz_is_point_inside_rect(fz_make_point(trm.e, trm.f), r)) |
498 | return 1; |
499 | } |
500 | } |
501 | } |
502 | |
503 | return 0; |
504 | } |
505 | |
506 | int |
507 | pdf_redact_page(fz_context *ctx, pdf_document *doc, pdf_page *page, pdf_redact_options *opts) |
508 | { |
509 | pdf_annot *annot; |
510 | int has_redactions = 0; |
511 | int no_black_boxes = 0; |
512 | |
513 | if (opts) |
514 | { |
515 | no_black_boxes = opts->no_black_boxes; |
516 | } |
517 | |
518 | for (annot = pdf_first_annot(ctx, page); annot; annot = pdf_next_annot(ctx, annot)) |
519 | if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact)) |
520 | has_redactions = 1; |
521 | |
522 | if (has_redactions) |
523 | { |
524 | pdf_filter_page_contents(ctx, doc, page, NULL, |
525 | no_black_boxes ? NULL : pdf_redact_end_page, |
526 | pdf_redact_text_filter, |
527 | NULL, |
528 | page, |
529 | 1, 1); |
530 | } |
531 | |
532 | annot = pdf_first_annot(ctx, page); |
533 | while (annot) |
534 | { |
535 | if (pdf_dict_get(ctx, annot->obj, PDF_NAME(Subtype)) == PDF_NAME(Redact)) |
536 | { |
537 | pdf_delete_annot(ctx, page, annot); |
538 | annot = pdf_first_annot(ctx, page); |
539 | } |
540 | else |
541 | { |
542 | annot = pdf_next_annot(ctx, annot); |
543 | } |
544 | } |
545 | |
546 | doc->redacted = has_redactions; |
547 | |
548 | return has_redactions; |
549 | } |
550 | |