| 1 | #include "mupdf/fitz.h" |
| 2 | #include "mupdf/pdf.h" |
| 3 | |
| 4 | #include <string.h> |
| 5 | |
| 6 | typedef struct globals_s |
| 7 | { |
| 8 | pdf_document *doc; |
| 9 | fz_context *ctx; |
| 10 | } globals; |
| 11 | |
| 12 | static int |
| 13 | string_in_names_list(fz_context *ctx, pdf_obj *p, pdf_obj *names_list) |
| 14 | { |
| 15 | int n = pdf_array_len(ctx, names_list); |
| 16 | int i; |
| 17 | char *str = pdf_to_str_buf(ctx, p); |
| 18 | |
| 19 | for (i = 0; i < n ; i += 2) |
| 20 | { |
| 21 | if (!strcmp(pdf_to_str_buf(ctx, pdf_array_get(ctx, names_list, i)), str)) |
| 22 | return 1; |
| 23 | } |
| 24 | return 0; |
| 25 | } |
| 26 | |
| 27 | /* |
| 28 | * Recreate page tree to only retain specified pages. |
| 29 | */ |
| 30 | |
| 31 | static void retainpage(fz_context *ctx, pdf_document *doc, pdf_obj *parent, pdf_obj *kids, int page) |
| 32 | { |
| 33 | pdf_obj * = pdf_lookup_page_obj(ctx, doc, page-1); |
| 34 | |
| 35 | pdf_flatten_inheritable_page_items(ctx, pageref); |
| 36 | |
| 37 | pdf_dict_put(ctx, pageref, PDF_NAME(Parent), parent); |
| 38 | |
| 39 | /* Store page object in new kids array */ |
| 40 | pdf_array_push(ctx, kids, pageref); |
| 41 | } |
| 42 | |
| 43 | static int dest_is_valid_page(fz_context *ctx, pdf_obj *obj, int *page_object_nums, int pagecount) |
| 44 | { |
| 45 | int i; |
| 46 | int num = pdf_to_num(ctx, obj); |
| 47 | |
| 48 | if (num == 0) |
| 49 | return 0; |
| 50 | for (i = 0; i < pagecount; i++) |
| 51 | { |
| 52 | if (page_object_nums[i] == num) |
| 53 | return 1; |
| 54 | } |
| 55 | return 0; |
| 56 | } |
| 57 | |
| 58 | static int dest_is_valid(fz_context *ctx, pdf_obj *o, int page_count, int *page_object_nums, pdf_obj *names_list) |
| 59 | { |
| 60 | pdf_obj *p; |
| 61 | |
| 62 | p = pdf_dict_get(ctx, o, PDF_NAME(A)); |
| 63 | if (pdf_name_eq(ctx, pdf_dict_get(ctx, p, PDF_NAME(S)), PDF_NAME(GoTo)) && |
| 64 | !string_in_names_list(ctx, pdf_dict_get(ctx, p, PDF_NAME(D)), names_list)) |
| 65 | return 0; |
| 66 | |
| 67 | p = pdf_dict_get(ctx, o, PDF_NAME(Dest)); |
| 68 | if (p == NULL) |
| 69 | {} |
| 70 | else if (pdf_is_string(ctx, p)) |
| 71 | return string_in_names_list(ctx, p, names_list); |
| 72 | else if (!dest_is_valid_page(ctx, pdf_array_get(ctx, p, 0), page_object_nums, page_count)) |
| 73 | return 0; |
| 74 | |
| 75 | return 1; |
| 76 | } |
| 77 | |
| 78 | static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list); |
| 79 | |
| 80 | static int strip_outline(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list, pdf_obj **pfirst, pdf_obj **plast) |
| 81 | { |
| 82 | pdf_obj *prev = NULL; |
| 83 | pdf_obj *first = NULL; |
| 84 | pdf_obj *current; |
| 85 | int count = 0; |
| 86 | |
| 87 | for (current = outlines; current != NULL; ) |
| 88 | { |
| 89 | int nc; |
| 90 | |
| 91 | /* Strip any children to start with. This takes care of |
| 92 | * First/Last/Count for us. */ |
| 93 | nc = strip_outlines(ctx, doc, current, page_count, page_object_nums, names_list); |
| 94 | |
| 95 | if (!dest_is_valid(ctx, current, page_count, page_object_nums, names_list)) |
| 96 | { |
| 97 | if (nc == 0) |
| 98 | { |
| 99 | /* Outline with invalid dest and no children. Drop it by |
| 100 | * pulling the next one in here. */ |
| 101 | pdf_obj *next = pdf_dict_get(ctx, current, PDF_NAME(Next)); |
| 102 | if (next == NULL) |
| 103 | { |
| 104 | /* There is no next one to pull in */ |
| 105 | if (prev != NULL) |
| 106 | pdf_dict_del(ctx, prev, PDF_NAME(Next)); |
| 107 | } |
| 108 | else if (prev != NULL) |
| 109 | { |
| 110 | pdf_dict_put(ctx, prev, PDF_NAME(Next), next); |
| 111 | pdf_dict_put(ctx, next, PDF_NAME(Prev), prev); |
| 112 | } |
| 113 | else |
| 114 | { |
| 115 | pdf_dict_del(ctx, next, PDF_NAME(Prev)); |
| 116 | } |
| 117 | current = next; |
| 118 | } |
| 119 | else |
| 120 | { |
| 121 | /* Outline with invalid dest, but children. Just drop the dest. */ |
| 122 | pdf_dict_del(ctx, current, PDF_NAME(Dest)); |
| 123 | pdf_dict_del(ctx, current, PDF_NAME(A)); |
| 124 | current = pdf_dict_get(ctx, current, PDF_NAME(Next)); |
| 125 | } |
| 126 | } |
| 127 | else |
| 128 | { |
| 129 | /* Keep this one */ |
| 130 | if (first == NULL) |
| 131 | first = current; |
| 132 | prev = current; |
| 133 | current = pdf_dict_get(ctx, current, PDF_NAME(Next)); |
| 134 | count++; |
| 135 | } |
| 136 | } |
| 137 | |
| 138 | *pfirst = first; |
| 139 | *plast = prev; |
| 140 | |
| 141 | return count; |
| 142 | } |
| 143 | |
| 144 | static int strip_outlines(fz_context *ctx, pdf_document *doc, pdf_obj *outlines, int page_count, int *page_object_nums, pdf_obj *names_list) |
| 145 | { |
| 146 | int nc; |
| 147 | pdf_obj *first; |
| 148 | pdf_obj *last; |
| 149 | |
| 150 | if (outlines == NULL) |
| 151 | return 0; |
| 152 | |
| 153 | first = pdf_dict_get(ctx, outlines, PDF_NAME(First)); |
| 154 | if (first == NULL) |
| 155 | nc = 0; |
| 156 | else |
| 157 | nc = strip_outline(ctx, doc, first, page_count, page_object_nums, names_list, &first, &last); |
| 158 | |
| 159 | if (nc == 0) |
| 160 | { |
| 161 | pdf_dict_del(ctx, outlines, PDF_NAME(First)); |
| 162 | pdf_dict_del(ctx, outlines, PDF_NAME(Last)); |
| 163 | pdf_dict_del(ctx, outlines, PDF_NAME(Count)); |
| 164 | } |
| 165 | else |
| 166 | { |
| 167 | int old_count = pdf_dict_get_int(ctx, outlines, PDF_NAME(Count)); |
| 168 | pdf_dict_put(ctx, outlines, PDF_NAME(First), first); |
| 169 | pdf_dict_put(ctx, outlines, PDF_NAME(Last), last); |
| 170 | pdf_dict_put_int(ctx, outlines, PDF_NAME(Count), old_count > 0 ? nc : -nc); |
| 171 | } |
| 172 | |
| 173 | return nc; |
| 174 | } |
| 175 | |
| 176 | static void retainpages(fz_context *ctx, globals *glo, int argc, char **argv) |
| 177 | { |
| 178 | pdf_obj *oldroot, *root, *pages, *kids, *countobj, *olddests; |
| 179 | pdf_document *doc = glo->doc; |
| 180 | int argidx = 0; |
| 181 | pdf_obj *names_list = NULL; |
| 182 | pdf_obj *outlines; |
| 183 | pdf_obj *ocproperties; |
| 184 | int pagecount; |
| 185 | int i; |
| 186 | int *page_object_nums; |
| 187 | |
| 188 | /* Keep only pages/type and (reduced) dest entries to avoid |
| 189 | * references to unretained pages */ |
| 190 | oldroot = pdf_dict_get(ctx, pdf_trailer(ctx, doc), PDF_NAME(Root)); |
| 191 | pages = pdf_dict_get(ctx, oldroot, PDF_NAME(Pages)); |
| 192 | olddests = pdf_load_name_tree(ctx, doc, PDF_NAME(Dests)); |
| 193 | outlines = pdf_dict_get(ctx, oldroot, PDF_NAME(Outlines)); |
| 194 | ocproperties = pdf_dict_get(ctx, oldroot, PDF_NAME(OCProperties)); |
| 195 | |
| 196 | root = pdf_new_dict(ctx, doc, 3); |
| 197 | pdf_dict_put(ctx, root, PDF_NAME(Type), pdf_dict_get(ctx, oldroot, PDF_NAME(Type))); |
| 198 | pdf_dict_put(ctx, root, PDF_NAME(Pages), pdf_dict_get(ctx, oldroot, PDF_NAME(Pages))); |
| 199 | if (outlines) |
| 200 | pdf_dict_put(ctx, root, PDF_NAME(Outlines), outlines); |
| 201 | if (ocproperties) |
| 202 | pdf_dict_put(ctx, root, PDF_NAME(OCProperties), ocproperties); |
| 203 | |
| 204 | pdf_update_object(ctx, doc, pdf_to_num(ctx, oldroot), root); |
| 205 | |
| 206 | /* Create a new kids array with only the pages we want to keep */ |
| 207 | kids = pdf_new_array(ctx, doc, 1); |
| 208 | |
| 209 | /* Retain pages specified */ |
| 210 | while (argc - argidx) |
| 211 | { |
| 212 | int page, spage, epage; |
| 213 | const char *pagelist = argv[argidx]; |
| 214 | |
| 215 | pagecount = pdf_count_pages(ctx, doc); |
| 216 | |
| 217 | while ((pagelist = fz_parse_page_range(ctx, pagelist, &spage, &epage, pagecount))) |
| 218 | { |
| 219 | if (spage < epage) |
| 220 | for (page = spage; page <= epage; ++page) |
| 221 | retainpage(ctx, doc, pages, kids, page); |
| 222 | else |
| 223 | for (page = spage; page >= epage; --page) |
| 224 | retainpage(ctx, doc, pages, kids, page); |
| 225 | } |
| 226 | |
| 227 | argidx++; |
| 228 | } |
| 229 | |
| 230 | /* Update page count and kids array */ |
| 231 | countobj = pdf_new_int(ctx, pdf_array_len(ctx, kids)); |
| 232 | pdf_dict_put_drop(ctx, pages, PDF_NAME(Count), countobj); |
| 233 | pdf_dict_put_drop(ctx, pages, PDF_NAME(Kids), kids); |
| 234 | |
| 235 | pagecount = pdf_count_pages(ctx, doc); |
| 236 | page_object_nums = fz_calloc(ctx, pagecount, sizeof(*page_object_nums)); |
| 237 | for (i = 0; i < pagecount; i++) |
| 238 | { |
| 239 | pdf_obj * = pdf_lookup_page_obj(ctx, doc, i); |
| 240 | page_object_nums[i] = pdf_to_num(ctx, pageref); |
| 241 | } |
| 242 | |
| 243 | /* If we had an old Dests tree (now reformed as an olddests |
| 244 | * dictionary), keep any entries in there that point to |
| 245 | * valid pages. This may mean we keep more than we need, but |
| 246 | * it's safe at least. */ |
| 247 | if (olddests) |
| 248 | { |
| 249 | pdf_obj *names = pdf_new_dict(ctx, doc, 1); |
| 250 | pdf_obj *dests = pdf_new_dict(ctx, doc, 1); |
| 251 | int len = pdf_dict_len(ctx, olddests); |
| 252 | |
| 253 | names_list = pdf_new_array(ctx, doc, 32); |
| 254 | |
| 255 | for (i = 0; i < len; i++) |
| 256 | { |
| 257 | pdf_obj *key = pdf_dict_get_key(ctx, olddests, i); |
| 258 | pdf_obj *val = pdf_dict_get_val(ctx, olddests, i); |
| 259 | pdf_obj *dest = pdf_dict_get(ctx, val, PDF_NAME(D)); |
| 260 | |
| 261 | dest = pdf_array_get(ctx, dest ? dest : val, 0); |
| 262 | if (dest_is_valid_page(ctx, dest, page_object_nums, pagecount)) |
| 263 | { |
| 264 | pdf_obj *key_str = pdf_new_string(ctx, pdf_to_name(ctx, key), strlen(pdf_to_name(ctx, key))); |
| 265 | pdf_array_push_drop(ctx, names_list, key_str); |
| 266 | pdf_array_push(ctx, names_list, val); |
| 267 | } |
| 268 | } |
| 269 | |
| 270 | pdf_dict_put(ctx, dests, PDF_NAME(Names), names_list); |
| 271 | pdf_dict_put(ctx, names, PDF_NAME(Dests), dests); |
| 272 | pdf_dict_put(ctx, root, PDF_NAME(Names), names); |
| 273 | |
| 274 | pdf_drop_obj(ctx, names); |
| 275 | pdf_drop_obj(ctx, dests); |
| 276 | pdf_drop_obj(ctx, olddests); |
| 277 | } |
| 278 | |
| 279 | /* Edit each pages /Annot list to remove any links that point to nowhere. */ |
| 280 | for (i = 0; i < pagecount; i++) |
| 281 | { |
| 282 | pdf_obj * = pdf_lookup_page_obj(ctx, doc, i); |
| 283 | |
| 284 | pdf_obj *annots = pdf_dict_get(ctx, pageref, PDF_NAME(Annots)); |
| 285 | |
| 286 | int len = pdf_array_len(ctx, annots); |
| 287 | int j; |
| 288 | |
| 289 | for (j = 0; j < len; j++) |
| 290 | { |
| 291 | pdf_obj *o = pdf_array_get(ctx, annots, j); |
| 292 | |
| 293 | if (!pdf_name_eq(ctx, pdf_dict_get(ctx, o, PDF_NAME(Subtype)), PDF_NAME(Link))) |
| 294 | continue; |
| 295 | |
| 296 | if (!dest_is_valid(ctx, o, pagecount, page_object_nums, names_list)) |
| 297 | { |
| 298 | /* Remove this annotation */ |
| 299 | pdf_array_delete(ctx, annots, j); |
| 300 | len--; |
| 301 | j--; |
| 302 | } |
| 303 | } |
| 304 | } |
| 305 | |
| 306 | if (strip_outlines(ctx, doc, outlines, pagecount, page_object_nums, names_list) == 0) |
| 307 | { |
| 308 | pdf_dict_del(ctx, root, PDF_NAME(Outlines)); |
| 309 | } |
| 310 | |
| 311 | fz_free(ctx, page_object_nums); |
| 312 | pdf_drop_obj(ctx, names_list); |
| 313 | pdf_drop_obj(ctx, root); |
| 314 | } |
| 315 | |
| 316 | /* Read infile, and write selected pages to outfile with the given options. */ |
| 317 | void pdf_clean_file(fz_context *ctx, char *infile, char *outfile, char *password, pdf_write_options *opts, char *argv[], int argc) |
| 318 | { |
| 319 | globals glo = { 0 }; |
| 320 | |
| 321 | glo.ctx = ctx; |
| 322 | |
| 323 | fz_try(ctx) |
| 324 | { |
| 325 | glo.doc = pdf_open_document(ctx, infile); |
| 326 | if (pdf_needs_password(ctx, glo.doc)) |
| 327 | if (!pdf_authenticate_password(ctx, glo.doc, password)) |
| 328 | fz_throw(glo.ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s" , infile); |
| 329 | |
| 330 | /* Only retain the specified subset of the pages */ |
| 331 | if (argc) |
| 332 | retainpages(ctx, &glo, argc, argv); |
| 333 | |
| 334 | pdf_save_document(ctx, glo.doc, outfile, opts); |
| 335 | } |
| 336 | fz_always(ctx) |
| 337 | { |
| 338 | pdf_drop_document(ctx, glo.doc); |
| 339 | } |
| 340 | fz_catch(ctx) |
| 341 | { |
| 342 | fz_rethrow(ctx); |
| 343 | } |
| 344 | } |
| 345 | |