1 | /* |
2 | * pdfextract -- the ultimate way to extract images and fonts from pdfs |
3 | */ |
4 | |
5 | #include "mupdf/fitz.h" |
6 | #include "mupdf/pdf.h" |
7 | |
8 | #include <stdlib.h> |
9 | #include <stdio.h> |
10 | |
11 | static pdf_document *doc = NULL; |
12 | static fz_context *ctx = NULL; |
13 | static int dorgb = 0; |
14 | static int doicc = 1; |
15 | |
/*
 * Print the command-line synopsis to stderr and terminate the process
 * with exit status 1. This function does not return.
 */
static void usage(void)
{
	fputs(
		"usage: mutool extract [options] file.pdf [object numbers]\n"
		"\t-p\tpassword\n"
		"\t-r\tconvert images to rgb\n"
		"\t-N\tdo not use ICC color conversions\n",
		stderr);
	exit(1);
}
24 | |
25 | static int isimage(pdf_obj *obj) |
26 | { |
27 | pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Subtype)); |
28 | return pdf_name_eq(ctx, type, PDF_NAME(Image)); |
29 | } |
30 | |
31 | static int isfontdesc(pdf_obj *obj) |
32 | { |
33 | pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Type)); |
34 | return pdf_name_eq(ctx, type, PDF_NAME(FontDescriptor)); |
35 | } |
36 | |
37 | static void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int dorgb) |
38 | { |
39 | char buf[1024]; |
40 | fz_pixmap *rgb = NULL; |
41 | |
42 | if (!pix) |
43 | return; |
44 | |
45 | if (dorgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx)) |
46 | { |
47 | rgb = fz_convert_pixmap(ctx, pix, fz_device_rgb(ctx), NULL, NULL, fz_default_color_params /* FIXME */, 1); |
48 | pix = rgb; |
49 | } |
50 | |
51 | if (!pix->colorspace || pix->colorspace->type == FZ_COLORSPACE_GRAY || pix->colorspace->type == FZ_COLORSPACE_RGB) |
52 | { |
53 | fz_snprintf(buf, sizeof(buf), "%s.png" , file); |
54 | printf("extracting image %s\n" , buf); |
55 | fz_save_pixmap_as_png(ctx, pix, buf); |
56 | } |
57 | else |
58 | { |
59 | fz_snprintf(buf, sizeof(buf), "%s.pam" , file); |
60 | printf("extracting image %s\n" , buf); |
61 | fz_save_pixmap_as_pam(ctx, pix, buf); |
62 | } |
63 | |
64 | fz_drop_pixmap(ctx, rgb); |
65 | } |
66 | |
67 | static void |
68 | writejpeg(fz_context *ctx, const unsigned char *data, size_t len, const char *file) |
69 | { |
70 | char buf[1024]; |
71 | fz_output *out; |
72 | |
73 | fz_snprintf(buf, sizeof(buf), "%s.jpg" , file); |
74 | |
75 | out = fz_new_output_with_path(ctx, buf, 0); |
76 | fz_try(ctx) |
77 | { |
78 | printf("extracting image %s\n" , buf); |
79 | fz_write_data(ctx, out, data, len); |
80 | fz_close_output(ctx, out); |
81 | } |
82 | fz_always(ctx) |
83 | fz_drop_output(ctx, out); |
84 | fz_catch(ctx) |
85 | fz_rethrow(ctx); |
86 | } |
87 | |
88 | static void saveimage(pdf_obj *ref) |
89 | { |
90 | fz_image *image = NULL; |
91 | fz_pixmap *pix = NULL; |
92 | char buf[32]; |
93 | fz_compressed_buffer *cbuf; |
94 | int type; |
95 | |
96 | fz_var(image); |
97 | fz_var(pix); |
98 | |
99 | fz_try(ctx) |
100 | { |
101 | image = pdf_load_image(ctx, doc, ref); |
102 | cbuf = fz_compressed_image_buffer(ctx, image); |
103 | fz_snprintf(buf, sizeof(buf), "img-%04d" , pdf_to_num(ctx, ref)); |
104 | type = cbuf == NULL ? FZ_IMAGE_UNKNOWN : cbuf->params.type; |
105 | |
106 | if (image->use_colorkey) |
107 | type = FZ_IMAGE_UNKNOWN; |
108 | if (image->use_decode) |
109 | type = FZ_IMAGE_UNKNOWN; |
110 | if (image->mask) |
111 | type = FZ_IMAGE_UNKNOWN; |
112 | if (dorgb) |
113 | { |
114 | enum fz_colorspace_type ctype = fz_colorspace_type(ctx, image->colorspace); |
115 | if (ctype != FZ_COLORSPACE_RGB && ctype != FZ_COLORSPACE_GRAY) |
116 | type = FZ_IMAGE_UNKNOWN; |
117 | } |
118 | |
119 | if (type == FZ_IMAGE_JPEG) |
120 | { |
121 | unsigned char *data; |
122 | size_t len = fz_buffer_storage(ctx, cbuf->buffer, &data); |
123 | writejpeg(ctx, data, len, buf); |
124 | } |
125 | else |
126 | { |
127 | pix = fz_get_pixmap_from_image(ctx, image, NULL, NULL, 0, 0); |
128 | writepixmap(ctx, pix, buf, dorgb); |
129 | } |
130 | } |
131 | fz_always(ctx) |
132 | { |
133 | fz_drop_image(ctx, image); |
134 | fz_drop_pixmap(ctx, pix); |
135 | } |
136 | fz_catch(ctx) |
137 | fz_rethrow(ctx); |
138 | } |
139 | |
140 | static void savefont(pdf_obj *dict) |
141 | { |
142 | char namebuf[1024]; |
143 | fz_buffer *buf; |
144 | pdf_obj *stream = NULL; |
145 | pdf_obj *obj; |
146 | char *ext = "" ; |
147 | fz_output *out; |
148 | const char *fontname = "font" ; |
149 | size_t len; |
150 | unsigned char *data; |
151 | |
152 | obj = pdf_dict_get(ctx, dict, PDF_NAME(FontName)); |
153 | if (obj) |
154 | fontname = pdf_to_name(ctx, obj); |
155 | |
156 | obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile)); |
157 | if (obj) |
158 | { |
159 | stream = obj; |
160 | ext = "pfa" ; |
161 | } |
162 | |
163 | obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile2)); |
164 | if (obj) |
165 | { |
166 | stream = obj; |
167 | ext = "ttf" ; |
168 | } |
169 | |
170 | obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile3)); |
171 | if (obj) |
172 | { |
173 | stream = obj; |
174 | |
175 | obj = pdf_dict_get(ctx, obj, PDF_NAME(Subtype)); |
176 | if (obj && !pdf_is_name(ctx, obj)) |
177 | fz_throw(ctx, FZ_ERROR_GENERIC, "invalid font descriptor subtype" ); |
178 | |
179 | if (pdf_name_eq(ctx, obj, PDF_NAME(Type1C))) |
180 | ext = "cff" ; |
181 | else if (pdf_name_eq(ctx, obj, PDF_NAME(CIDFontType0C))) |
182 | ext = "cid" ; |
183 | else if (pdf_name_eq(ctx, obj, PDF_NAME(OpenType))) |
184 | ext = "otf" ; |
185 | else |
186 | fz_throw(ctx, FZ_ERROR_GENERIC, "unhandled font type '%s'" , pdf_to_name(ctx, obj)); |
187 | } |
188 | |
189 | if (!stream) |
190 | { |
191 | fz_warn(ctx, "unhandled font type" ); |
192 | return; |
193 | } |
194 | |
195 | buf = pdf_load_stream(ctx, stream); |
196 | len = fz_buffer_storage(ctx, buf, &data); |
197 | fz_try(ctx) |
198 | { |
199 | fz_snprintf(namebuf, sizeof(namebuf), "%s-%04d.%s" , fontname, pdf_to_num(ctx, dict), ext); |
200 | printf("extracting font %s\n" , namebuf); |
201 | out = fz_new_output_with_path(ctx, namebuf, 0); |
202 | fz_try(ctx) |
203 | { |
204 | fz_write_data(ctx, out, data, len); |
205 | fz_close_output(ctx, out); |
206 | } |
207 | fz_always(ctx) |
208 | fz_drop_output(ctx, out); |
209 | fz_catch(ctx) |
210 | fz_rethrow(ctx); |
211 | } |
212 | fz_always(ctx) |
213 | fz_drop_buffer(ctx, buf); |
214 | fz_catch(ctx) |
215 | fz_rethrow(ctx); |
216 | } |
217 | |
218 | static void (int num) |
219 | { |
220 | pdf_obj *ref; |
221 | |
222 | if (!doc) |
223 | fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified" ); |
224 | |
225 | fz_try(ctx) |
226 | { |
227 | ref = pdf_new_indirect(ctx, doc, num, 0); |
228 | if (isimage(ref)) |
229 | saveimage(ref); |
230 | if (isfontdesc(ref)) |
231 | savefont(ref); |
232 | } |
233 | fz_always(ctx) |
234 | pdf_drop_obj(ctx, ref); |
235 | fz_catch(ctx) |
236 | fz_warn(ctx, "ignoring object %d" , num); |
237 | } |
238 | |
239 | int pdfextract_main(int argc, char **argv) |
240 | { |
241 | char *infile; |
242 | char *password = "" ; |
243 | int c, o; |
244 | |
245 | while ((c = fz_getopt(argc, argv, "p:rN" )) != -1) |
246 | { |
247 | switch (c) |
248 | { |
249 | case 'p': password = fz_optarg; break; |
250 | case 'r': dorgb++; break; |
251 | case 'N': doicc^=1; break; |
252 | default: usage(); break; |
253 | } |
254 | } |
255 | |
256 | if (fz_optind == argc) |
257 | usage(); |
258 | |
259 | infile = argv[fz_optind++]; |
260 | |
261 | ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED); |
262 | if (!ctx) |
263 | { |
264 | fprintf(stderr, "cannot initialise context\n" ); |
265 | exit(1); |
266 | } |
267 | |
268 | if (doicc) |
269 | fz_enable_icc(ctx); |
270 | else |
271 | fz_disable_icc(ctx); |
272 | |
273 | doc = pdf_open_document(ctx, infile); |
274 | if (pdf_needs_password(ctx, doc)) |
275 | if (!pdf_authenticate_password(ctx, doc, password)) |
276 | fz_throw(ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s" , infile); |
277 | |
278 | if (fz_optind == argc) |
279 | { |
280 | int len = pdf_count_objects(ctx, doc); |
281 | for (o = 1; o < len; o++) |
282 | extractobject(o); |
283 | } |
284 | else |
285 | { |
286 | while (fz_optind < argc) |
287 | { |
288 | extractobject(atoi(argv[fz_optind])); |
289 | fz_optind++; |
290 | } |
291 | } |
292 | |
293 | pdf_drop_document(ctx, doc); |
294 | fz_flush_warnings(ctx); |
295 | fz_drop_context(ctx); |
296 | return 0; |
297 | } |
298 | |