1#include "fitz-imp.h"
2
3#define SUBSCRIPT_OFFSET 0.2f
4#define SUPERSCRIPT_OFFSET -0.2f
5
6#include <ft2build.h>
7#include FT_FREETYPE_H
8
9/* HTML output (visual formatting with preserved layout) */
10
11static int
12detect_super_script(fz_stext_line *line, fz_stext_char *ch)
13{
14 if (line->wmode == 0 && line->dir.x == 1 && line->dir.y == 0)
15 return ch->origin.y < line->first_char->origin.y - ch->size * 0.1f;
16 return 0;
17}
18
19static const char *
20font_full_name(fz_context *ctx, fz_font *font)
21{
22 const char *name = fz_font_name(ctx, font);
23 const char *s = strchr(name, '+');
24 return s ? s + 1 : name;
25}
26
27static void
28font_family_name(fz_context *ctx, fz_font *font, char *buf, int size, int is_mono, int is_serif)
29{
30 const char *name = font_full_name(ctx, font);
31 char *s;
32 fz_strlcpy(buf, name, size);
33 s = strrchr(buf, '-');
34 if (s)
35 *s = 0;
36 if (is_mono)
37 fz_strlcat(buf, ",monospace", size);
38 else
39 fz_strlcat(buf, is_serif ? ",serif" : ",sans-serif", size);
40}
41
42static void
43fz_print_style_begin_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup, int color)
44{
45 char family[80];
46
47 int is_bold = fz_font_is_bold(ctx, font);
48 int is_italic = fz_font_is_italic(ctx, font);
49 int is_serif = fz_font_is_serif(ctx, font);
50 int is_mono = fz_font_is_monospaced(ctx, font);
51
52 font_family_name(ctx, font, family, sizeof family, is_mono, is_serif);
53
54 if (sup) fz_write_string(ctx, out, "<sup>");
55 if (is_mono) fz_write_string(ctx, out, "<tt>");
56 if (is_bold) fz_write_string(ctx, out, "<b>");
57 if (is_italic) fz_write_string(ctx, out, "<i>");
58 fz_write_printf(ctx, out, "<span style=\"font-family:%s;font-size:%gpt", family, size);
59 if (color != 0)
60 fz_write_printf(ctx, out, ";color:#%06x", color);
61 fz_write_printf(ctx, out, "\">");
62}
63
64static void
65fz_print_style_end_html(fz_context *ctx, fz_output *out, fz_font *font, float size, int sup)
66{
67 int is_mono = fz_font_is_monospaced(ctx, font);
68 int is_bold = fz_font_is_bold(ctx,font);
69 int is_italic = fz_font_is_italic(ctx, font);
70
71 fz_write_string(ctx, out, "</span>");
72 if (is_italic) fz_write_string(ctx, out, "</i>");
73 if (is_bold) fz_write_string(ctx, out, "</b>");
74 if (is_mono) fz_write_string(ctx, out, "</tt>");
75 if (sup) fz_write_string(ctx, out, "</sup>");
76}
77
78static void
79fz_print_stext_image_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
80{
81 int x = block->bbox.x0;
82 int y = block->bbox.y0;
83 int w = block->bbox.x1 - block->bbox.x0;
84 int h = block->bbox.y1 - block->bbox.y0;
85
86 fz_write_printf(ctx, out, "<img style=\"position:absolute;top:%dpt;left:%dpt;width:%dpt;height:%dpt\" src=\"", y, x, w, h);
87 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
88 fz_write_string(ctx, out, "\">\n");
89}
90
91void
92fz_print_stext_block_as_html(fz_context *ctx, fz_output *out, fz_stext_block *block)
93{
94 fz_stext_line *line;
95 fz_stext_char *ch;
96 int x, y;
97
98 fz_font *font = NULL;
99 float size = 0;
100 int sup = 0;
101 int color = 0;
102
103 for (line = block->u.t.first_line; line; line = line->next)
104 {
105 x = line->bbox.x0;
106 y = line->bbox.y0;
107
108 fz_write_printf(ctx, out, "<p style=\"position:absolute;white-space:pre;margin:0;padding:0;top:%dpt;left:%dpt\">", y, x);
109 font = NULL;
110
111 for (ch = line->first_char; ch; ch = ch->next)
112 {
113 int ch_sup = detect_super_script(line, ch);
114 if (ch->font != font || ch->size != size || ch_sup != sup || ch->color != color)
115 {
116 if (font)
117 fz_print_style_end_html(ctx, out, font, size, sup);
118 font = ch->font;
119 size = ch->size;
120 color = ch->color;
121 sup = ch_sup;
122 fz_print_style_begin_html(ctx, out, font, size, sup, color);
123 }
124
125 switch (ch->c)
126 {
127 default:
128 if (ch->c >= 32 && ch->c <= 127)
129 fz_write_byte(ctx, out, ch->c);
130 else
131 fz_write_printf(ctx, out, "&#x%x;", ch->c);
132 break;
133 case '<': fz_write_string(ctx, out, "&lt;"); break;
134 case '>': fz_write_string(ctx, out, "&gt;"); break;
135 case '&': fz_write_string(ctx, out, "&amp;"); break;
136 case '"': fz_write_string(ctx, out, "&quot;"); break;
137 case '\'': fz_write_string(ctx, out, "&apos;"); break;
138 }
139 }
140
141 if (font)
142 fz_print_style_end_html(ctx, out, font, size, sup);
143
144 fz_write_string(ctx, out, "</p>\n");
145 }
146}
147
148/*
149 Output a page to a file in HTML (visual) format.
150*/
151void
152fz_print_stext_page_as_html(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
153{
154 fz_stext_block *block;
155
156 int w = page->mediabox.x1 - page->mediabox.x0;
157 int h = page->mediabox.y1 - page->mediabox.y0;
158
159 fz_write_printf(ctx, out, "<div id=\"page%d\" style=\"position:relative;width:%dpt;height:%dpt;background-color:white\">\n", id, w, h);
160
161 for (block = page->first_block; block; block = block->next)
162 {
163 if (block->type == FZ_STEXT_BLOCK_IMAGE)
164 fz_print_stext_image_as_html(ctx, out, block);
165 else if (block->type == FZ_STEXT_BLOCK_TEXT)
166 fz_print_stext_block_as_html(ctx, out, block);
167 }
168
169 fz_write_string(ctx, out, "</div>\n");
170}
171
172void
173fz_print_stext_header_as_html(fz_context *ctx, fz_output *out)
174{
175 fz_write_string(ctx, out, "<!DOCTYPE html>\n");
176 fz_write_string(ctx, out, "<html>\n");
177 fz_write_string(ctx, out, "<head>\n");
178 fz_write_string(ctx, out, "<style>\n");
179 fz_write_string(ctx, out, "body{background-color:gray}\n");
180 fz_write_string(ctx, out, "div{margin:1em auto}\n");
181 fz_write_string(ctx, out, "</style>\n");
182 fz_write_string(ctx, out, "</head>\n");
183 fz_write_string(ctx, out, "<body>\n");
184}
185
186void
187fz_print_stext_trailer_as_html(fz_context *ctx, fz_output *out)
188{
189 fz_write_string(ctx, out, "</body>\n");
190 fz_write_string(ctx, out, "</html>\n");
191}
192
193/* XHTML output (semantic, little layout, suitable for reflow) */
194
195static void
196fz_print_stext_image_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
197{
198 int w = block->bbox.x1 - block->bbox.x0;
199 int h = block->bbox.y1 - block->bbox.y0;
200
201 fz_write_printf(ctx, out, "<p><img width=\"%d\" height=\"%d\" src=\"", w, h);
202 fz_write_image_as_data_uri(ctx, out, block->u.i.image);
203 fz_write_string(ctx, out, "\"/></p>\n");
204}
205
206static void
207fz_print_style_begin_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
208{
209 int is_mono = fz_font_is_monospaced(ctx, font);
210 int is_bold = fz_font_is_bold(ctx, font);
211 int is_italic = fz_font_is_italic(ctx, font);
212
213 if (sup)
214 fz_write_string(ctx, out, "<sup>");
215 if (is_mono)
216 fz_write_string(ctx, out, "<tt>");
217 if (is_bold)
218 fz_write_string(ctx, out, "<b>");
219 if (is_italic)
220 fz_write_string(ctx, out, "<i>");
221}
222
223static void
224fz_print_style_end_xhtml(fz_context *ctx, fz_output *out, fz_font *font, int sup)
225{
226 int is_mono = fz_font_is_monospaced(ctx, font);
227 int is_bold = fz_font_is_bold(ctx, font);
228 int is_italic = fz_font_is_italic(ctx, font);
229
230 if (is_italic)
231 fz_write_string(ctx, out, "</i>");
232 if (is_bold)
233 fz_write_string(ctx, out, "</b>");
234 if (is_mono)
235 fz_write_string(ctx, out, "</tt>");
236 if (sup)
237 fz_write_string(ctx, out, "</sup>");
238}
239
240static float avg_font_size_of_line(fz_stext_char *ch)
241{
242 float size = 0;
243 int n = 0;
244 if (!ch)
245 return 0;
246 while (ch)
247 {
248 size += ch->size;
249 ++n;
250 ch = ch->next;
251 }
252 return size / n;
253}
254
/*
	Map a font size to the name of the XHTML element used for text of
	that size: "h1" from 20 upwards, "h2" from 15, "h3" from 12, and
	"p" for anything else.

	Each name is returned as the same string literal on every call,
	which lets callers compare results by pointer.
*/
static const char *tag_from_font_size(float size)
{
	const char *tag = "p";

	/* Later (larger) thresholds override earlier ones. */
	if (size >= 12)
		tag = "h3";
	if (size >= 15)
		tag = "h2";
	if (size >= 20)
		tag = "h1";

	return tag;
}
262
263static void fz_print_stext_block_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_block *block)
264{
265 fz_stext_line *line;
266 fz_stext_char *ch;
267
268 fz_font *font = NULL;
269 int sup = 0;
270 int sp = 1;
271 const char *tag = NULL;
272 const char *new_tag;
273
274 for (line = block->u.t.first_line; line; line = line->next)
275 {
276 new_tag = tag_from_font_size(avg_font_size_of_line(line->first_char));
277 if (tag != new_tag)
278 {
279 if (tag)
280 {
281 if (font)
282 fz_print_style_end_xhtml(ctx, out, font, sup);
283 fz_write_printf(ctx, out, "</%s>", tag);
284 }
285 tag = new_tag;
286 fz_write_printf(ctx, out, "<%s>", tag);
287 if (font)
288 fz_print_style_begin_xhtml(ctx, out, font, sup);
289 }
290
291 if (!sp)
292 fz_write_byte(ctx, out, ' ');
293
294 for (ch = line->first_char; ch; ch = ch->next)
295 {
296 int ch_sup = detect_super_script(line, ch);
297 if (ch->font != font || ch_sup != sup)
298 {
299 if (font)
300 fz_print_style_end_xhtml(ctx, out, font, sup);
301 font = ch->font;
302 sup = ch_sup;
303 fz_print_style_begin_xhtml(ctx, out, font, sup);
304 }
305
306 sp = (ch->c == ' ');
307 switch (ch->c)
308 {
309 default:
310 if (ch->c >= 32 && ch->c <= 127)
311 fz_write_byte(ctx, out, ch->c);
312 else
313 fz_write_printf(ctx, out, "&#x%x;", ch->c);
314 break;
315 case '<': fz_write_string(ctx, out, "&lt;"); break;
316 case '>': fz_write_string(ctx, out, "&gt;"); break;
317 case '&': fz_write_string(ctx, out, "&amp;"); break;
318 case '"': fz_write_string(ctx, out, "&quot;"); break;
319 case '\'': fz_write_string(ctx, out, "&apos;"); break;
320 }
321 }
322 }
323
324 if (font)
325 fz_print_style_end_xhtml(ctx, out, font, sup);
326 fz_write_printf(ctx, out, "</%s>\n", tag);
327}
328
329/*
330 Output a page to a file in XHTML (semantic) format.
331*/
332void
333fz_print_stext_page_as_xhtml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
334{
335 fz_stext_block *block;
336
337 fz_write_printf(ctx, out, "<div id=\"page%d\">\n", id);
338
339 for (block = page->first_block; block; block = block->next)
340 {
341 if (block->type == FZ_STEXT_BLOCK_IMAGE)
342 fz_print_stext_image_as_xhtml(ctx, out, block);
343 else if (block->type == FZ_STEXT_BLOCK_TEXT)
344 fz_print_stext_block_as_xhtml(ctx, out, block);
345 }
346
347 fz_write_string(ctx, out, "</div>\n");
348}
349
350void
351fz_print_stext_header_as_xhtml(fz_context *ctx, fz_output *out)
352{
353 fz_write_string(ctx, out, "<?xml version=\"1.0\"?>\n");
354 fz_write_string(ctx, out, "<!DOCTYPE html");
355 fz_write_string(ctx, out, " PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\"");
356 fz_write_string(ctx, out, " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n");
357 fz_write_string(ctx, out, "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
358 fz_write_string(ctx, out, "<head>\n");
359 fz_write_string(ctx, out, "<style>\n");
360 fz_write_string(ctx, out, "p{white-space:pre-wrap}\n");
361 fz_write_string(ctx, out, "</style>\n");
362 fz_write_string(ctx, out, "</head>\n");
363 fz_write_string(ctx, out, "<body>\n");
364}
365
366void
367fz_print_stext_trailer_as_xhtml(fz_context *ctx, fz_output *out)
368{
369 fz_write_string(ctx, out, "</body>\n");
370 fz_write_string(ctx, out, "</html>\n");
371}
372
373/* Detailed XML dump of the entire structured text data */
374
375/*
376 Output a page to a file in XML format.
377*/
378void
379fz_print_stext_page_as_xml(fz_context *ctx, fz_output *out, fz_stext_page *page, int id)
380{
381 fz_stext_block *block;
382 fz_stext_line *line;
383 fz_stext_char *ch;
384
385 fz_write_printf(ctx, out, "<page id=\"page%d\" width=\"%g\" height=\"%g\">\n", id,
386 page->mediabox.x1 - page->mediabox.x0,
387 page->mediabox.y1 - page->mediabox.y0);
388
389 for (block = page->first_block; block; block = block->next)
390 {
391 switch (block->type)
392 {
393 case FZ_STEXT_BLOCK_TEXT:
394 fz_write_printf(ctx, out, "<block bbox=\"%g %g %g %g\">\n",
395 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
396 for (line = block->u.t.first_line; line; line = line->next)
397 {
398 fz_font *font = NULL;
399 float size = 0;
400 const char *name = NULL;
401
402 fz_write_printf(ctx, out, "<line bbox=\"%g %g %g %g\" wmode=\"%d\" dir=\"%g %g\">\n",
403 line->bbox.x0, line->bbox.y0, line->bbox.x1, line->bbox.y1,
404 line->wmode,
405 line->dir.x, line->dir.y);
406
407 for (ch = line->first_char; ch; ch = ch->next)
408 {
409 if (ch->font != font || ch->size != size)
410 {
411 if (font)
412 fz_write_string(ctx, out, "</font>\n");
413 font = ch->font;
414 size = ch->size;
415 name = font_full_name(ctx, font);
416 fz_write_printf(ctx, out, "<font name=\"%s\" size=\"%g\">\n", name, size);
417 }
418 fz_write_printf(ctx, out, "<char quad=\"%g %g %g %g %g %g %g %g\" x=\"%g\" y=\"%g\" color=\"#%06x\" c=\"",
419 ch->quad.ul.x, ch->quad.ul.y,
420 ch->quad.ur.x, ch->quad.ur.y,
421 ch->quad.ll.x, ch->quad.ll.y,
422 ch->quad.lr.x, ch->quad.lr.y,
423 ch->origin.x, ch->origin.y,
424 ch->color);
425 switch (ch->c)
426 {
427 case '<': fz_write_string(ctx, out, "&lt;"); break;
428 case '>': fz_write_string(ctx, out, "&gt;"); break;
429 case '&': fz_write_string(ctx, out, "&amp;"); break;
430 case '"': fz_write_string(ctx, out, "&quot;"); break;
431 case '\'': fz_write_string(ctx, out, "&apos;"); break;
432 default:
433 if (ch->c >= 32 && ch->c <= 127)
434 fz_write_printf(ctx, out, "%c", ch->c);
435 else
436 fz_write_printf(ctx, out, "&#x%x;", ch->c);
437 break;
438 }
439 fz_write_string(ctx, out, "\"/>\n");
440 }
441
442 if (font)
443 fz_write_string(ctx, out, "</font>\n");
444
445 fz_write_string(ctx, out, "</line>\n");
446 }
447 fz_write_string(ctx, out, "</block>\n");
448 break;
449
450 case FZ_STEXT_BLOCK_IMAGE:
451 fz_write_printf(ctx, out, "<image bbox=\"%g %g %g %g\" />\n",
452 block->bbox.x0, block->bbox.y0, block->bbox.x1, block->bbox.y1);
453 break;
454 }
455 }
456 fz_write_string(ctx, out, "</page>\n");
457}
458
459/* Plain text */
460
461/*
462 Output a page to a file in UTF-8 format.
463*/
464void
465fz_print_stext_page_as_text(fz_context *ctx, fz_output *out, fz_stext_page *page)
466{
467 fz_stext_block *block;
468 fz_stext_line *line;
469 fz_stext_char *ch;
470 char utf[10];
471 int i, n;
472
473 for (block = page->first_block; block; block = block->next)
474 {
475 if (block->type == FZ_STEXT_BLOCK_TEXT)
476 {
477 for (line = block->u.t.first_line; line; line = line->next)
478 {
479 for (ch = line->first_char; ch; ch = ch->next)
480 {
481 n = fz_runetochar(utf, ch->c);
482 for (i = 0; i < n; i++)
483 fz_write_byte(ctx, out, utf[i]);
484 }
485 fz_write_string(ctx, out, "\n");
486 }
487 fz_write_string(ctx, out, "\n");
488 }
489 }
490}
491
492/* Text output writer */
493
/* Output formats supported by the text writer. */
enum {
	FZ_FORMAT_TEXT = 0,	/* plain text; also used for unrecognised format names */
	FZ_FORMAT_HTML = 1,	/* HTML with preserved layout */
	FZ_FORMAT_XHTML = 2,	/* semantic XHTML */
	FZ_FORMAT_STEXT = 3	/* XML dump of the structured text */
};
500
501typedef struct fz_text_writer_s fz_text_writer;
502
503struct fz_text_writer_s
504{
505 fz_document_writer super;
506 int format;
507 int number;
508 fz_stext_options opts;
509 fz_stext_page *page;
510 fz_output *out;
511};
512
513static fz_device *
514text_begin_page(fz_context *ctx, fz_document_writer *wri_, fz_rect mediabox)
515{
516 fz_text_writer *wri = (fz_text_writer*)wri_;
517
518 if (wri->page)
519 {
520 fz_drop_stext_page(ctx, wri->page);
521 wri->page = NULL;
522 }
523
524 wri->number++;
525
526 wri->page = fz_new_stext_page(ctx, mediabox);
527 return fz_new_stext_device(ctx, wri->page, &wri->opts);
528}
529
530static void
531text_end_page(fz_context *ctx, fz_document_writer *wri_, fz_device *dev)
532{
533 fz_text_writer *wri = (fz_text_writer*)wri_;
534
535 fz_try(ctx)
536 {
537 fz_close_device(ctx, dev);
538 switch (wri->format)
539 {
540 default:
541 case FZ_FORMAT_TEXT:
542 fz_print_stext_page_as_text(ctx, wri->out, wri->page);
543 break;
544 case FZ_FORMAT_HTML:
545 fz_print_stext_page_as_html(ctx, wri->out, wri->page, wri->number);
546 break;
547 case FZ_FORMAT_XHTML:
548 fz_print_stext_page_as_xhtml(ctx, wri->out, wri->page, wri->number);
549 break;
550 case FZ_FORMAT_STEXT:
551 fz_print_stext_page_as_xml(ctx, wri->out, wri->page, wri->number);
552 break;
553 }
554 }
555 fz_always(ctx)
556 {
557 fz_drop_device(ctx, dev);
558 fz_drop_stext_page(ctx, wri->page);
559 wri->page = NULL;
560 }
561 fz_catch(ctx)
562 fz_rethrow(ctx);
563}
564
565static void
566text_close_writer(fz_context *ctx, fz_document_writer *wri_)
567{
568 fz_text_writer *wri = (fz_text_writer*)wri_;
569 switch (wri->format)
570 {
571 case FZ_FORMAT_HTML:
572 fz_print_stext_trailer_as_html(ctx, wri->out);
573 break;
574 case FZ_FORMAT_XHTML:
575 fz_print_stext_trailer_as_xhtml(ctx, wri->out);
576 break;
577 case FZ_FORMAT_STEXT:
578 fz_write_string(ctx, wri->out, "</document>\n");
579 break;
580 }
581 fz_close_output(ctx, wri->out);
582}
583
584static void
585text_drop_writer(fz_context *ctx, fz_document_writer *wri_)
586{
587 fz_text_writer *wri = (fz_text_writer*)wri_;
588 fz_drop_stext_page(ctx, wri->page);
589 fz_drop_output(ctx, wri->out);
590}
591
592fz_document_writer *
593fz_new_text_writer(fz_context *ctx, const char *format, const char *path, const char *args)
594{
595 fz_text_writer *wri;
596
597 wri = fz_new_derived_document_writer(ctx, fz_text_writer, text_begin_page, text_end_page, text_close_writer, text_drop_writer);
598 fz_try(ctx)
599 {
600 fz_parse_stext_options(ctx, &wri->opts, args);
601
602 wri->format = FZ_FORMAT_TEXT;
603 if (!strcmp(format, "text"))
604 wri->format = FZ_FORMAT_TEXT;
605 else if (!strcmp(format, "html"))
606 wri->format = FZ_FORMAT_HTML;
607 else if (!strcmp(format, "xhtml"))
608 wri->format = FZ_FORMAT_XHTML;
609 else if (!strcmp(format, "stext"))
610 wri->format = FZ_FORMAT_STEXT;
611
612 wri->out = fz_new_output_with_path(ctx, path ? path : "out.txt", 0);
613
614 switch (wri->format)
615 {
616 case FZ_FORMAT_HTML:
617 fz_print_stext_header_as_html(ctx, wri->out);
618 break;
619 case FZ_FORMAT_XHTML:
620 fz_print_stext_header_as_xhtml(ctx, wri->out);
621 break;
622 case FZ_FORMAT_STEXT:
623 fz_write_string(ctx, wri->out, "<?xml version=\"1.0\"?>\n");
624 fz_write_string(ctx, wri->out, "<document>\n");
625 break;
626 }
627 }
628 fz_catch(ctx)
629 {
630 fz_drop_output(ctx, wri->out);
631 fz_free(ctx, wri);
632 fz_rethrow(ctx);
633 }
634
635 return (fz_document_writer*)wri;
636}
637