1#include "mupdf/fitz.h"
2#include "mupdf/pdf.h"
3
4#include <string.h>
5
6fz_rect
7pdf_to_rect(fz_context *ctx, pdf_obj *array)
8{
9 if (!pdf_is_array(ctx, array))
10 return fz_empty_rect;
11 else
12 {
13 float a = pdf_array_get_real(ctx, array, 0);
14 float b = pdf_array_get_real(ctx, array, 1);
15 float c = pdf_array_get_real(ctx, array, 2);
16 float d = pdf_array_get_real(ctx, array, 3);
17 fz_rect r;
18 r.x0 = fz_min(a, c);
19 r.y0 = fz_min(b, d);
20 r.x1 = fz_max(a, c);
21 r.y1 = fz_max(b, d);
22 return r;
23 }
24}
25
26fz_quad
27pdf_to_quad(fz_context *ctx, pdf_obj *array, int offset)
28{
29 fz_quad q;
30 q.ul.x = pdf_array_get_real(ctx, array, offset+0);
31 q.ul.y = pdf_array_get_real(ctx, array, offset+1);
32 q.ur.x = pdf_array_get_real(ctx, array, offset+2);
33 q.ur.y = pdf_array_get_real(ctx, array, offset+3);
34 q.ll.x = pdf_array_get_real(ctx, array, offset+4);
35 q.ll.y = pdf_array_get_real(ctx, array, offset+5);
36 q.lr.x = pdf_array_get_real(ctx, array, offset+6);
37 q.lr.y = pdf_array_get_real(ctx, array, offset+7);
38 return q;
39}
40
41fz_matrix
42pdf_to_matrix(fz_context *ctx, pdf_obj *array)
43{
44 if (!pdf_is_array(ctx, array))
45 return fz_identity;
46 else
47 {
48 fz_matrix m;
49 m.a = pdf_array_get_real(ctx, array, 0);
50 m.b = pdf_array_get_real(ctx, array, 1);
51 m.c = pdf_array_get_real(ctx, array, 2);
52 m.d = pdf_array_get_real(ctx, array, 3);
53 m.e = pdf_array_get_real(ctx, array, 4);
54 m.f = pdf_array_get_real(ctx, array, 5);
55 return m;
56 }
57}
58
59static int
60rune_from_utf16be(int *out, const unsigned char *s, const unsigned char *end)
61{
62 if (s + 2 <= end)
63 {
64 int a = s[0] << 8 | s[1];
65 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
66 {
67 int b = s[2] << 8 | s[3];
68 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
69 return 4;
70 }
71 *out = a;
72 return 2;
73 }
74 *out = FZ_REPLACEMENT_CHARACTER;
75 return 1;
76}
77
78static int
79rune_from_utf16le(int *out, const unsigned char *s, const unsigned char *end)
80{
81 if (s + 2 <= end)
82 {
83 int a = s[1] << 8 | s[0];
84 if (a >= 0xD800 && a <= 0xDFFF && s + 4 <= end)
85 {
86 int b = s[3] << 8 | s[2];
87 *out = ((a - 0xD800) << 10) + (b - 0xDC00) + 0x10000;
88 return 4;
89 }
90 *out = a;
91 return 2;
92 }
93 *out = FZ_REPLACEMENT_CHARACTER;
94 return 1;
95}
96
/*
 * Detect a language escape sequence at byte offset 'i' of the
 * little-endian UTF-16 buffer 's' of 'n' bytes.
 *
 * A sequence starts with the code unit 0x001B (bytes 27, 0) and is closed
 * by another 0x001B either 4 or 6 bytes later. Returns the total length
 * of the sequence in bytes (6 or 8), or 0 if there is none at 'i'.
 */
static size_t
skip_language_code_utf16le(const unsigned char *s, size_t n, size_t i)
{
	const unsigned char *p;

	/* even the short form does not fit */
	if (i + 6 > n)
		return 0;

	p = s + i;

	/* must open with ESC */
	if (p[0] != 27 || p[1] != 0)
		return 0;

	if (p[4] == 27 && p[5] == 0)
		return 6;
	if (i + 8 <= n && p[6] == 27 && p[7] == 0)
		return 8;
	return 0;
}
107
/*
 * Detect a language escape sequence at byte offset 'i' of the
 * big-endian UTF-16 buffer 's' of 'n' bytes.
 *
 * A sequence starts with the code unit 0x001B (bytes 0, 27) and is closed
 * by another 0x001B either 4 or 6 bytes later. Returns the total length
 * of the sequence in bytes (6 or 8), or 0 if there is none at 'i'.
 */
static size_t
skip_language_code_utf16be(const unsigned char *s, size_t n, size_t i)
{
	const unsigned char *p;

	/* even the short form does not fit */
	if (i + 6 > n)
		return 0;

	p = s + i;

	/* must open with ESC */
	if (p[0] != 0 || p[1] != 27)
		return 0;

	if (p[4] == 0 && p[5] == 27)
		return 6;
	if (i + 8 <= n && p[6] == 0 && p[7] == 27)
		return 8;
	return 0;
}
118
/*
 * Detect a language escape sequence at byte offset 'i' of the UTF-8
 * buffer 's' of 'n' bytes.
 *
 * A sequence is ESC (27), a 2-byte language code, an optional 2-byte
 * country code, and a closing ESC. Returns the total length of the
 * sequence in bytes including both ESC bytes (4 or 6), or 0 if there is
 * none at 'i'.
 */
static size_t
skip_language_code_utf8(const unsigned char *s, size_t n, size_t i)
{
	size_t avail;

	if (i >= n || s[i] != 27)
		return 0;

	avail = n - i;

	/* The closing ESC must lie inside the buffer, so the whole sequence
	 * (not just the bytes before the closing ESC) has to fit. */
	if (avail >= 4 && s[i+3] == 27)
		return 4;
	if (avail >= 6 && s[i+5] == 27)
		return 6;
	return 0;
}
129
130/* Convert Unicode/PdfDocEncoding string into utf-8 */
131char *
132pdf_new_utf8_from_pdf_string(fz_context *ctx, const char *ssrcptr, size_t srclen)
133{
134 const unsigned char *srcptr = (const unsigned char*)ssrcptr;
135 char *dstptr, *dst;
136 size_t dstlen = 0;
137 int ucs;
138 size_t i, n;
139
140 /* UTF-16BE */
141 if (srclen >= 2 && srcptr[0] == 254 && srcptr[1] == 255)
142 {
143 i = 2;
144 while (i + 2 <= srclen)
145 {
146 n = skip_language_code_utf16be(srcptr, srclen, i);
147 if (n)
148 i += n;
149 else
150 {
151 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
152 dstlen += fz_runelen(ucs);
153 }
154 }
155
156 dstptr = dst = fz_malloc(ctx, dstlen + 1);
157
158 i = 2;
159 while (i + 2 <= srclen)
160 {
161 n = skip_language_code_utf16be(srcptr, srclen, i);
162 if (n)
163 i += n;
164 else
165 {
166 i += rune_from_utf16be(&ucs, srcptr + i, srcptr + srclen);
167 dstptr += fz_runetochar(dstptr, ucs);
168 }
169 }
170 }
171
172 /* UTF-16LE */
173 else if (srclen >= 2 && srcptr[0] == 255 && srcptr[1] == 254)
174 {
175 i = 2;
176 while (i + 2 <= srclen)
177 {
178 n = skip_language_code_utf16le(srcptr, srclen, i);
179 if (n)
180 i += n;
181 else
182 {
183 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
184 dstlen += fz_runelen(ucs);
185 }
186 }
187
188 dstptr = dst = fz_malloc(ctx, dstlen + 1);
189
190 i = 2;
191 while (i + 2 <= srclen)
192 {
193 n = skip_language_code_utf16le(srcptr, srclen, i);
194 if (n)
195 i += n;
196 else
197 {
198 i += rune_from_utf16le(&ucs, srcptr + i, srcptr + srclen);
199 dstptr += fz_runetochar(dstptr, ucs);
200 }
201 }
202 }
203
204 /* UTF-8 */
205 else if (srclen >= 3 && srcptr[0] == 239 && srcptr[1] == 187 && srcptr[2] == 191)
206 {
207 i = 3;
208 while (i < srclen)
209 {
210 n = skip_language_code_utf8(srcptr, srclen, i);
211 if (n)
212 i += n;
213 else
214 {
215 i += 1;
216 dstlen += 1;
217 }
218 }
219
220 dstptr = dst = fz_malloc(ctx, dstlen + 1);
221
222 i = 3;
223 while (i < srclen)
224 {
225 n = skip_language_code_utf8(srcptr, srclen, i);
226 if (n)
227 i += n;
228 else
229 *dstptr++ = srcptr[i++];
230 }
231 }
232
233 /* PDFDocEncoding */
234 else
235 {
236 for (i = 0; i < srclen; i++)
237 dstlen += fz_runelen(fz_unicode_from_pdf_doc_encoding[srcptr[i]]);
238
239 dstptr = dst = fz_malloc(ctx, dstlen + 1);
240
241 for (i = 0; i < srclen; i++)
242 {
243 ucs = fz_unicode_from_pdf_doc_encoding[srcptr[i]];
244 dstptr += fz_runetochar(dstptr, ucs);
245 }
246 }
247
248 *dstptr = 0;
249 return dst;
250}
251
252/* Convert text string object to UTF-8 */
253char *
254pdf_new_utf8_from_pdf_string_obj(fz_context *ctx, pdf_obj *src)
255{
256 const char *srcptr;
257 size_t srclen;
258 srcptr = pdf_to_string(ctx, src, &srclen);
259 return pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
260}
261
262/* Load text stream and convert to UTF-8 */
263char *
264pdf_new_utf8_from_pdf_stream_obj(fz_context *ctx, pdf_obj *src)
265{
266 fz_buffer *stmbuf;
267 char *srcptr;
268 size_t srclen;
269 char *dst = NULL;
270
271 stmbuf = pdf_load_stream(ctx, src);
272 srclen = fz_buffer_storage(ctx, stmbuf, (unsigned char **)&srcptr);
273 fz_try(ctx)
274 dst = pdf_new_utf8_from_pdf_string(ctx, srcptr, srclen);
275 fz_always(ctx)
276 fz_drop_buffer(ctx, stmbuf);
277 fz_catch(ctx)
278 fz_rethrow(ctx);
279 return dst;
280}
281
282/* Load text stream or text string and convert to UTF-8 */
283char *
284pdf_load_stream_or_string_as_utf8(fz_context *ctx, pdf_obj *src)
285{
286 if (pdf_is_stream(ctx, src))
287 return pdf_new_utf8_from_pdf_stream_obj(ctx, src);
288 return pdf_new_utf8_from_pdf_string_obj(ctx, src);
289}
290
291static pdf_obj *
292pdf_new_text_string_utf16be(fz_context *ctx, const char *s)
293{
294 int c, i = 0, n = fz_utflen(s);
295 unsigned char *p = fz_malloc(ctx, n * 2 + 2);
296 pdf_obj *obj;
297 p[i++] = 254;
298 p[i++] = 255;
299 while (*s)
300 {
301 s += fz_chartorune(&c, s);
302 p[i++] = (c>>8) & 0xff;
303 p[i++] = (c) & 0xff;
304 }
305 fz_try(ctx)
306 obj = pdf_new_string(ctx, (char*)p, i);
307 fz_always(ctx)
308 fz_free(ctx, p);
309 fz_catch(ctx)
310 fz_rethrow(ctx);
311 return obj;
312}
313
314/*
315 * Create a PDF 'text string' by encoding input string as either ASCII or UTF-16BE.
316 * In theory, we could also use PDFDocEncoding.
317 */
318pdf_obj *
319pdf_new_text_string(fz_context *ctx, const char *s)
320{
321 int i = 0;
322 while (s[i] != 0)
323 {
324 if (((unsigned char)s[i]) >= 128)
325 return pdf_new_text_string_utf16be(ctx, s);
326 ++i;
327 }
328 return pdf_new_string(ctx, s, i);
329}
330
331pdf_obj *
332pdf_parse_array(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
333{
334 pdf_obj *ary = NULL;
335 pdf_obj *obj = NULL;
336 int64_t a = 0, b = 0, n = 0;
337 pdf_token tok;
338 pdf_obj *op = NULL;
339
340 fz_var(obj);
341
342 ary = pdf_new_array(ctx, doc, 4);
343
344 fz_try(ctx)
345 {
346 while (1)
347 {
348 tok = pdf_lex(ctx, file, buf);
349
350 if (tok != PDF_TOK_INT && tok != PDF_TOK_R)
351 {
352 if (n > 0)
353 pdf_array_push_int(ctx, ary, a);
354 if (n > 1)
355 pdf_array_push_int(ctx, ary, b);
356 n = 0;
357 }
358
359 if (tok == PDF_TOK_INT && n == 2)
360 {
361 pdf_array_push_int(ctx, ary, a);
362 a = b;
363 n --;
364 }
365
366 switch (tok)
367 {
368 case PDF_TOK_EOF:
369 fz_throw(ctx, FZ_ERROR_SYNTAX, "array not closed before end of file");
370
371 case PDF_TOK_CLOSE_ARRAY:
372 op = ary;
373 goto end;
374
375 case PDF_TOK_INT:
376 if (n == 0)
377 a = buf->i;
378 if (n == 1)
379 b = buf->i;
380 n ++;
381 break;
382
383 case PDF_TOK_R:
384 if (n != 2)
385 fz_throw(ctx, FZ_ERROR_SYNTAX, "cannot parse indirect reference in array");
386 pdf_array_push_drop(ctx, ary, pdf_new_indirect(ctx, doc, a, b));
387 n = 0;
388 break;
389
390 case PDF_TOK_OPEN_ARRAY:
391 obj = pdf_parse_array(ctx, doc, file, buf);
392 pdf_array_push_drop(ctx, ary, obj);
393 break;
394
395 case PDF_TOK_OPEN_DICT:
396 obj = pdf_parse_dict(ctx, doc, file, buf);
397 pdf_array_push_drop(ctx, ary, obj);
398 break;
399
400 case PDF_TOK_NAME:
401 pdf_array_push_name(ctx, ary, buf->scratch);
402 break;
403 case PDF_TOK_REAL:
404 pdf_array_push_real(ctx, ary, buf->f);
405 break;
406 case PDF_TOK_STRING:
407 pdf_array_push_string(ctx, ary, buf->scratch, buf->len);
408 break;
409 case PDF_TOK_TRUE:
410 pdf_array_push_bool(ctx, ary, 1);
411 break;
412 case PDF_TOK_FALSE:
413 pdf_array_push_bool(ctx, ary, 0);
414 break;
415 case PDF_TOK_NULL:
416 pdf_array_push(ctx, ary, PDF_NULL);
417 break;
418
419 default:
420 pdf_array_push(ctx, ary, PDF_NULL);
421 break;
422 }
423 }
424end:
425 {}
426 }
427 fz_catch(ctx)
428 {
429 pdf_drop_obj(ctx, ary);
430 fz_rethrow(ctx);
431 }
432 return op;
433}
434
435pdf_obj *
436pdf_parse_dict(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
437{
438 pdf_obj *dict;
439 pdf_obj *key = NULL;
440 pdf_obj *val = NULL;
441 pdf_token tok;
442 int64_t a, b;
443
444 dict = pdf_new_dict(ctx, doc, 8);
445
446 fz_var(key);
447 fz_var(val);
448
449 fz_try(ctx)
450 {
451 while (1)
452 {
453 tok = pdf_lex(ctx, file, buf);
454 skip:
455 if (tok == PDF_TOK_CLOSE_DICT)
456 break;
457
458 /* for BI .. ID .. EI in content streams */
459 if (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID"))
460 break;
461
462 if (tok != PDF_TOK_NAME)
463 fz_throw(ctx, FZ_ERROR_SYNTAX, "invalid key in dict");
464
465 key = pdf_new_name(ctx, buf->scratch);
466
467 tok = pdf_lex(ctx, file, buf);
468
469 switch (tok)
470 {
471 case PDF_TOK_OPEN_ARRAY:
472 val = pdf_parse_array(ctx, doc, file, buf);
473 break;
474
475 case PDF_TOK_OPEN_DICT:
476 val = pdf_parse_dict(ctx, doc, file, buf);
477 break;
478
479 case PDF_TOK_NAME: val = pdf_new_name(ctx, buf->scratch); break;
480 case PDF_TOK_REAL: val = pdf_new_real(ctx, buf->f); break;
481 case PDF_TOK_STRING: val = pdf_new_string(ctx, buf->scratch, buf->len); break;
482 case PDF_TOK_TRUE: val = PDF_TRUE; break;
483 case PDF_TOK_FALSE: val = PDF_FALSE; break;
484 case PDF_TOK_NULL: val = PDF_NULL; break;
485
486 case PDF_TOK_INT:
487 /* 64-bit to allow for numbers > INT_MAX and overflow */
488 a = buf->i;
489 tok = pdf_lex(ctx, file, buf);
490 if (tok == PDF_TOK_CLOSE_DICT || tok == PDF_TOK_NAME ||
491 (tok == PDF_TOK_KEYWORD && !strcmp(buf->scratch, "ID")))
492 {
493 val = pdf_new_int(ctx, a);
494 pdf_dict_put(ctx, dict, key, val);
495 pdf_drop_obj(ctx, val);
496 val = NULL;
497 pdf_drop_obj(ctx, key);
498 key = NULL;
499 goto skip;
500 }
501 if (tok == PDF_TOK_INT)
502 {
503 b = buf->i;
504 tok = pdf_lex(ctx, file, buf);
505 if (tok == PDF_TOK_R)
506 {
507 val = pdf_new_indirect(ctx, doc, a, b);
508 break;
509 }
510 }
511 fz_warn(ctx, "invalid indirect reference in dict");
512 val = PDF_NULL;
513 break;
514
515 default:
516 val = PDF_NULL;
517 break;
518 }
519
520 pdf_dict_put(ctx, dict, key, val);
521 pdf_drop_obj(ctx, val);
522 val = NULL;
523 pdf_drop_obj(ctx, key);
524 key = NULL;
525 }
526 }
527 fz_catch(ctx)
528 {
529 pdf_drop_obj(ctx, dict);
530 pdf_drop_obj(ctx, key);
531 pdf_drop_obj(ctx, val);
532 fz_rethrow(ctx);
533 }
534 return dict;
535}
536
537pdf_obj *
538pdf_parse_stm_obj(fz_context *ctx, pdf_document *doc, fz_stream *file, pdf_lexbuf *buf)
539{
540 pdf_token tok;
541
542 tok = pdf_lex(ctx, file, buf);
543
544 switch (tok)
545 {
546 case PDF_TOK_OPEN_ARRAY:
547 return pdf_parse_array(ctx, doc, file, buf);
548 case PDF_TOK_OPEN_DICT:
549 return pdf_parse_dict(ctx, doc, file, buf);
550 case PDF_TOK_NAME: return pdf_new_name(ctx, buf->scratch);
551 case PDF_TOK_REAL: return pdf_new_real(ctx, buf->f);
552 case PDF_TOK_STRING: return pdf_new_string(ctx, buf->scratch, buf->len);
553 case PDF_TOK_TRUE: return PDF_TRUE;
554 case PDF_TOK_FALSE: return PDF_FALSE;
555 case PDF_TOK_NULL: return PDF_NULL;
556 case PDF_TOK_INT: return pdf_new_int(ctx, buf->i);
557 default: fz_throw(ctx, FZ_ERROR_SYNTAX, "unknown token in object stream");
558 }
559}
560
561pdf_obj *
562pdf_parse_ind_obj(fz_context *ctx, pdf_document *doc,
563 fz_stream *file, pdf_lexbuf *buf,
564 int *onum, int *ogen, int64_t *ostmofs, int *try_repair)
565{
566 pdf_obj *obj = NULL;
567 int num = 0, gen = 0;
568 int64_t stm_ofs;
569 pdf_token tok;
570 int64_t a, b;
571 int read_next_token = 1;
572
573 fz_var(obj);
574
575 tok = pdf_lex(ctx, file, buf);
576 if (tok != PDF_TOK_INT)
577 {
578 if (try_repair)
579 *try_repair = 1;
580 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected object number");
581 }
582 num = buf->i;
583 if (num < 0 || num > PDF_MAX_OBJECT_NUMBER)
584 fz_throw(ctx, FZ_ERROR_SYNTAX, "object number out of range");
585
586 tok = pdf_lex(ctx, file, buf);
587 if (tok != PDF_TOK_INT)
588 {
589 if (try_repair)
590 *try_repair = 1;
591 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected generation number (%d ? obj)", num);
592 }
593 gen = buf->i;
594
595 tok = pdf_lex(ctx, file, buf);
596 if (tok != PDF_TOK_OBJ)
597 {
598 if (try_repair)
599 *try_repair = 1;
600 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'obj' keyword (%d %d ?)", num, gen);
601 }
602
603 tok = pdf_lex(ctx, file, buf);
604
605 switch (tok)
606 {
607 case PDF_TOK_OPEN_ARRAY:
608 obj = pdf_parse_array(ctx, doc, file, buf);
609 break;
610
611 case PDF_TOK_OPEN_DICT:
612 obj = pdf_parse_dict(ctx, doc, file, buf);
613 break;
614
615 case PDF_TOK_NAME: obj = pdf_new_name(ctx, buf->scratch); break;
616 case PDF_TOK_REAL: obj = pdf_new_real(ctx, buf->f); break;
617 case PDF_TOK_STRING: obj = pdf_new_string(ctx, buf->scratch, buf->len); break;
618 case PDF_TOK_TRUE: obj = PDF_TRUE; break;
619 case PDF_TOK_FALSE: obj = PDF_FALSE; break;
620 case PDF_TOK_NULL: obj = PDF_NULL; break;
621
622 case PDF_TOK_INT:
623 a = buf->i;
624 tok = pdf_lex(ctx, file, buf);
625
626 if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ)
627 {
628 obj = pdf_new_int(ctx, a);
629 read_next_token = 0;
630 break;
631 }
632 else if (tok == PDF_TOK_INT)
633 {
634 b = buf->i;
635 tok = pdf_lex(ctx, file, buf);
636 if (tok == PDF_TOK_R)
637 {
638 obj = pdf_new_indirect(ctx, doc, a, b);
639 break;
640 }
641 }
642 fz_throw(ctx, FZ_ERROR_SYNTAX, "expected 'R' keyword (%d %d R)", num, gen);
643
644 case PDF_TOK_ENDOBJ:
645 obj = PDF_NULL;
646 read_next_token = 0;
647 break;
648
649 default:
650 fz_throw(ctx, FZ_ERROR_SYNTAX, "syntax error in object (%d %d R)", num, gen);
651 }
652
653 fz_try(ctx)
654 {
655 if (read_next_token)
656 tok = pdf_lex(ctx, file, buf);
657
658 if (tok == PDF_TOK_STREAM)
659 {
660 int c = fz_read_byte(ctx, file);
661 while (c == ' ')
662 c = fz_read_byte(ctx, file);
663 if (c == '\r')
664 {
665 c = fz_peek_byte(ctx, file);
666 if (c != '\n')
667 fz_warn(ctx, "line feed missing after stream begin marker (%d %d R)", num, gen);
668 else
669 fz_read_byte(ctx, file);
670 }
671 stm_ofs = fz_tell(ctx, file);
672 }
673 else if (tok == PDF_TOK_ENDOBJ)
674 {
675 stm_ofs = 0;
676 }
677 else
678 {
679 fz_warn(ctx, "expected 'endobj' or 'stream' keyword (%d %d R)", num, gen);
680 stm_ofs = 0;
681 }
682 }
683 fz_catch(ctx)
684 {
685 pdf_drop_obj(ctx, obj);
686 fz_rethrow(ctx);
687 }
688
689 if (onum) *onum = num;
690 if (ogen) *ogen = gen;
691 if (ostmofs) *ostmofs = stm_ofs;
692
693 return obj;
694}
695