1/*
2 * pdfextract -- the ultimate way to extract images and fonts from pdfs
3 */
4
5#include "mupdf/fitz.h"
6#include "mupdf/pdf.h"
7
8#include <stdlib.h>
9#include <stdio.h>
10
/* Document being processed; assigned in pdfextract_main. */
static pdf_document *doc = NULL;
/* MuPDF context; created in pdfextract_main. */
static fz_context *ctx = NULL;
/* Non-zero once -r has been given (convert images to rgb). */
static int dorgb = 0;
/* ICC color conversions are enabled while this is non-zero; -N toggles it. */
static int doicc = 1;
15
/* Print the command line synopsis to stderr and terminate with status 1. */
static void usage(void)
{
	static const char *const help[] = {
		"usage: mutool extract [options] file.pdf [object numbers]\n",
		"\t-p\tpassword\n",
		"\t-r\tconvert images to rgb\n",
		"\t-N\tdo not use ICC color conversions\n",
	};
	size_t i;

	for (i = 0; i < sizeof(help) / sizeof(help[0]); i++)
		fputs(help[i], stderr);

	exit(1);
}
24
25static int isimage(pdf_obj *obj)
26{
27 pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Subtype));
28 return pdf_name_eq(ctx, type, PDF_NAME(Image));
29}
30
31static int isfontdesc(pdf_obj *obj)
32{
33 pdf_obj *type = pdf_dict_get(ctx, obj, PDF_NAME(Type));
34 return pdf_name_eq(ctx, type, PDF_NAME(FontDescriptor));
35}
36
37static void writepixmap(fz_context *ctx, fz_pixmap *pix, char *file, int dorgb)
38{
39 char buf[1024];
40 fz_pixmap *rgb = NULL;
41
42 if (!pix)
43 return;
44
45 if (dorgb && pix->colorspace && pix->colorspace != fz_device_rgb(ctx))
46 {
47 rgb = fz_convert_pixmap(ctx, pix, fz_device_rgb(ctx), NULL, NULL, fz_default_color_params /* FIXME */, 1);
48 pix = rgb;
49 }
50
51 if (!pix->colorspace || pix->colorspace->type == FZ_COLORSPACE_GRAY || pix->colorspace->type == FZ_COLORSPACE_RGB)
52 {
53 fz_snprintf(buf, sizeof(buf), "%s.png", file);
54 printf("extracting image %s\n", buf);
55 fz_save_pixmap_as_png(ctx, pix, buf);
56 }
57 else
58 {
59 fz_snprintf(buf, sizeof(buf), "%s.pam", file);
60 printf("extracting image %s\n", buf);
61 fz_save_pixmap_as_pam(ctx, pix, buf);
62 }
63
64 fz_drop_pixmap(ctx, rgb);
65}
66
67static void
68writejpeg(fz_context *ctx, const unsigned char *data, size_t len, const char *file)
69{
70 char buf[1024];
71 fz_output *out;
72
73 fz_snprintf(buf, sizeof(buf), "%s.jpg", file);
74
75 out = fz_new_output_with_path(ctx, buf, 0);
76 fz_try(ctx)
77 {
78 printf("extracting image %s\n", buf);
79 fz_write_data(ctx, out, data, len);
80 fz_close_output(ctx, out);
81 }
82 fz_always(ctx)
83 fz_drop_output(ctx, out);
84 fz_catch(ctx)
85 fz_rethrow(ctx);
86}
87
88static void saveimage(pdf_obj *ref)
89{
90 fz_image *image = NULL;
91 fz_pixmap *pix = NULL;
92 char buf[32];
93 fz_compressed_buffer *cbuf;
94 int type;
95
96 fz_var(image);
97 fz_var(pix);
98
99 fz_try(ctx)
100 {
101 image = pdf_load_image(ctx, doc, ref);
102 cbuf = fz_compressed_image_buffer(ctx, image);
103 fz_snprintf(buf, sizeof(buf), "img-%04d", pdf_to_num(ctx, ref));
104 type = cbuf == NULL ? FZ_IMAGE_UNKNOWN : cbuf->params.type;
105
106 if (image->use_colorkey)
107 type = FZ_IMAGE_UNKNOWN;
108 if (image->use_decode)
109 type = FZ_IMAGE_UNKNOWN;
110 if (image->mask)
111 type = FZ_IMAGE_UNKNOWN;
112 if (dorgb)
113 {
114 enum fz_colorspace_type ctype = fz_colorspace_type(ctx, image->colorspace);
115 if (ctype != FZ_COLORSPACE_RGB && ctype != FZ_COLORSPACE_GRAY)
116 type = FZ_IMAGE_UNKNOWN;
117 }
118
119 if (type == FZ_IMAGE_JPEG)
120 {
121 unsigned char *data;
122 size_t len = fz_buffer_storage(ctx, cbuf->buffer, &data);
123 writejpeg(ctx, data, len, buf);
124 }
125 else
126 {
127 pix = fz_get_pixmap_from_image(ctx, image, NULL, NULL, 0, 0);
128 writepixmap(ctx, pix, buf, dorgb);
129 }
130 }
131 fz_always(ctx)
132 {
133 fz_drop_image(ctx, image);
134 fz_drop_pixmap(ctx, pix);
135 }
136 fz_catch(ctx)
137 fz_rethrow(ctx);
138}
139
140static void savefont(pdf_obj *dict)
141{
142 char namebuf[1024];
143 fz_buffer *buf;
144 pdf_obj *stream = NULL;
145 pdf_obj *obj;
146 char *ext = "";
147 fz_output *out;
148 const char *fontname = "font";
149 size_t len;
150 unsigned char *data;
151
152 obj = pdf_dict_get(ctx, dict, PDF_NAME(FontName));
153 if (obj)
154 fontname = pdf_to_name(ctx, obj);
155
156 obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile));
157 if (obj)
158 {
159 stream = obj;
160 ext = "pfa";
161 }
162
163 obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile2));
164 if (obj)
165 {
166 stream = obj;
167 ext = "ttf";
168 }
169
170 obj = pdf_dict_get(ctx, dict, PDF_NAME(FontFile3));
171 if (obj)
172 {
173 stream = obj;
174
175 obj = pdf_dict_get(ctx, obj, PDF_NAME(Subtype));
176 if (obj && !pdf_is_name(ctx, obj))
177 fz_throw(ctx, FZ_ERROR_GENERIC, "invalid font descriptor subtype");
178
179 if (pdf_name_eq(ctx, obj, PDF_NAME(Type1C)))
180 ext = "cff";
181 else if (pdf_name_eq(ctx, obj, PDF_NAME(CIDFontType0C)))
182 ext = "cid";
183 else if (pdf_name_eq(ctx, obj, PDF_NAME(OpenType)))
184 ext = "otf";
185 else
186 fz_throw(ctx, FZ_ERROR_GENERIC, "unhandled font type '%s'", pdf_to_name(ctx, obj));
187 }
188
189 if (!stream)
190 {
191 fz_warn(ctx, "unhandled font type");
192 return;
193 }
194
195 buf = pdf_load_stream(ctx, stream);
196 len = fz_buffer_storage(ctx, buf, &data);
197 fz_try(ctx)
198 {
199 fz_snprintf(namebuf, sizeof(namebuf), "%s-%04d.%s", fontname, pdf_to_num(ctx, dict), ext);
200 printf("extracting font %s\n", namebuf);
201 out = fz_new_output_with_path(ctx, namebuf, 0);
202 fz_try(ctx)
203 {
204 fz_write_data(ctx, out, data, len);
205 fz_close_output(ctx, out);
206 }
207 fz_always(ctx)
208 fz_drop_output(ctx, out);
209 fz_catch(ctx)
210 fz_rethrow(ctx);
211 }
212 fz_always(ctx)
213 fz_drop_buffer(ctx, buf);
214 fz_catch(ctx)
215 fz_rethrow(ctx);
216}
217
218static void extractobject(int num)
219{
220 pdf_obj *ref;
221
222 if (!doc)
223 fz_throw(ctx, FZ_ERROR_GENERIC, "no file specified");
224
225 fz_try(ctx)
226 {
227 ref = pdf_new_indirect(ctx, doc, num, 0);
228 if (isimage(ref))
229 saveimage(ref);
230 if (isfontdesc(ref))
231 savefont(ref);
232 }
233 fz_always(ctx)
234 pdf_drop_obj(ctx, ref);
235 fz_catch(ctx)
236 fz_warn(ctx, "ignoring object %d", num);
237}
238
239int pdfextract_main(int argc, char **argv)
240{
241 char *infile;
242 char *password = "";
243 int c, o;
244
245 while ((c = fz_getopt(argc, argv, "p:rN")) != -1)
246 {
247 switch (c)
248 {
249 case 'p': password = fz_optarg; break;
250 case 'r': dorgb++; break;
251 case 'N': doicc^=1; break;
252 default: usage(); break;
253 }
254 }
255
256 if (fz_optind == argc)
257 usage();
258
259 infile = argv[fz_optind++];
260
261 ctx = fz_new_context(NULL, NULL, FZ_STORE_UNLIMITED);
262 if (!ctx)
263 {
264 fprintf(stderr, "cannot initialise context\n");
265 exit(1);
266 }
267
268 if (doicc)
269 fz_enable_icc(ctx);
270 else
271 fz_disable_icc(ctx);
272
273 doc = pdf_open_document(ctx, infile);
274 if (pdf_needs_password(ctx, doc))
275 if (!pdf_authenticate_password(ctx, doc, password))
276 fz_throw(ctx, FZ_ERROR_GENERIC, "cannot authenticate password: %s", infile);
277
278 if (fz_optind == argc)
279 {
280 int len = pdf_count_objects(ctx, doc);
281 for (o = 1; o < len; o++)
282 extractobject(o);
283 }
284 else
285 {
286 while (fz_optind < argc)
287 {
288 extractobject(atoi(argv[fz_optind]));
289 fz_optind++;
290 }
291 }
292
293 pdf_drop_document(ctx, doc);
294 fz_flush_warnings(ctx);
295 fz_drop_context(ctx);
296 return 0;
297}
298