1 | #include "mupdf/fitz.h" |
2 | #include "mupdf/ucdn.h" |
3 | |
4 | #include <math.h> |
5 | #include <float.h> |
6 | #include <string.h> |
7 | |
8 | /* Simple layout structure */ |
9 | |
10 | fz_layout_block *fz_new_layout(fz_context *ctx) |
11 | { |
12 | fz_pool *pool = fz_new_pool(ctx); |
13 | fz_layout_block *block; |
14 | fz_try(ctx) |
15 | { |
16 | block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block)); |
17 | block->pool = pool; |
18 | block->head = NULL; |
19 | block->tailp = &block->head; |
20 | } |
21 | fz_catch(ctx) |
22 | { |
23 | fz_drop_pool(ctx, pool); |
24 | fz_rethrow(ctx); |
25 | } |
26 | return block; |
27 | } |
28 | |
29 | void fz_drop_layout(fz_context *ctx, fz_layout_block *block) |
30 | { |
31 | if (block) |
32 | fz_drop_pool(ctx, block->pool); |
33 | } |
34 | |
35 | void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p) |
36 | { |
37 | fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line)); |
38 | line->x = x; |
39 | line->y = y; |
40 | line->h = h; |
41 | line->p = p; |
42 | line->text = NULL; |
43 | line->next = NULL; |
44 | *block->tailp = line; |
45 | block->tailp = &line->next; |
46 | block->text_tailp = &line->text; |
47 | } |
48 | |
49 | void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p) |
50 | { |
51 | fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char)); |
52 | ch->x = x; |
53 | ch->w = w; |
54 | ch->p = p; |
55 | ch->next = NULL; |
56 | *block->text_tailp = ch; |
57 | block->text_tailp = &ch->next; |
58 | } |
59 | |
60 | /* Extract text into blocks and lines. */ |
61 | |
62 | #define PARAGRAPH_DIST 1.5f |
63 | #define SPACE_DIST 0.15f |
64 | #define SPACE_MAX_DIST 0.8f |
65 | |
typedef struct fz_stext_device_s fz_stext_device;

/*
	State carried by the structured-text device between drawing calls.
*/
struct fz_stext_device_s
{
	fz_device super; /* base device; first member, since the handlers cast fz_device* to fz_stext_device* */
	fz_stext_page *page; /* page that receives the extracted blocks */
	fz_point pen, start; /* pen: end point of the last character added; start: start point of the current line */
	fz_matrix trm; /* transform of the last character added */
	int new_obj; /* set to 1 when a text object begins, cleared once a character has been added */
	int curdir; /* current direction from direction_from_bidi_class: 1, -1, or 0 for unsupported classes */
	int lastchar; /* unicode value of the last character added */
	int flags; /* FZ_STEXT_* option flags copied from the options */
	int color; /* colour applied to added characters, packed by hexrgb_from_color (0 for clip/ignored text) */
	const fz_text *lasttext; /* last text object processed (kept reference); the same object is not processed twice in a row */
};
81 | |
/*
	Help text listing the option keys recognised by fz_parse_stext_options.
*/
const char *fz_stext_options_usage =
	"Text output options:\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\n";
89 | |
90 | /* |
91 | Create an empty text page. |
92 | |
93 | The text page is filled out by the text device to contain the blocks |
94 | and lines of text on the page. |
95 | |
96 | mediabox: optional mediabox information. |
97 | */ |
98 | fz_stext_page * |
99 | fz_new_stext_page(fz_context *ctx, fz_rect mediabox) |
100 | { |
101 | fz_pool *pool = fz_new_pool(ctx); |
102 | fz_stext_page *page = NULL; |
103 | fz_try(ctx) |
104 | { |
105 | page = fz_pool_alloc(ctx, pool, sizeof(*page)); |
106 | page->pool = pool; |
107 | page->mediabox = mediabox; |
108 | page->first_block = NULL; |
109 | page->last_block = NULL; |
110 | } |
111 | fz_catch(ctx) |
112 | { |
113 | fz_drop_pool(ctx, pool); |
114 | fz_rethrow(ctx); |
115 | } |
116 | return page; |
117 | } |
118 | |
119 | void |
120 | fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) |
121 | { |
122 | if (page) |
123 | { |
124 | fz_stext_block *block; |
125 | for (block = page->first_block; block; block = block->next) |
126 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
127 | fz_drop_image(ctx, block->u.i.image); |
128 | fz_drop_pool(ctx, page->pool); |
129 | } |
130 | } |
131 | |
132 | static fz_stext_block * |
133 | add_block_to_page(fz_context *ctx, fz_stext_page *page) |
134 | { |
135 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
136 | block->prev = page->last_block; |
137 | if (!page->first_block) |
138 | page->first_block = page->last_block = block; |
139 | else |
140 | { |
141 | page->last_block->next = block; |
142 | page->last_block = block; |
143 | } |
144 | return block; |
145 | } |
146 | |
147 | static fz_stext_block * |
148 | add_text_block_to_page(fz_context *ctx, fz_stext_page *page) |
149 | { |
150 | fz_stext_block *block = add_block_to_page(ctx, page); |
151 | block->type = FZ_STEXT_BLOCK_TEXT; |
152 | return block; |
153 | } |
154 | |
155 | static fz_stext_block * |
156 | add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image) |
157 | { |
158 | fz_stext_block *block = add_block_to_page(ctx, page); |
159 | block->type = FZ_STEXT_BLOCK_IMAGE; |
160 | block->u.i.transform = ctm; |
161 | block->u.i.image = fz_keep_image(ctx, image); |
162 | block->bbox = fz_transform_rect(fz_unit_rect, ctm); |
163 | return block; |
164 | } |
165 | |
166 | static fz_stext_line * |
167 | add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode) |
168 | { |
169 | fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); |
170 | line->prev = block->u.t.last_line; |
171 | if (!block->u.t.first_line) |
172 | block->u.t.first_line = block->u.t.last_line = line; |
173 | else |
174 | { |
175 | block->u.t.last_line->next = line; |
176 | block->u.t.last_line = line; |
177 | } |
178 | |
179 | line->dir = *dir; |
180 | line->wmode = wmode; |
181 | |
182 | return line; |
183 | } |
184 | |
185 | static fz_stext_char * |
186 | add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int color) |
187 | { |
188 | fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char); |
189 | fz_point a, d; |
190 | |
191 | if (!line->first_char) |
192 | line->first_char = line->last_char = ch; |
193 | else |
194 | { |
195 | line->last_char->next = ch; |
196 | line->last_char = ch; |
197 | } |
198 | |
199 | ch->c = c; |
200 | ch->color = color; |
201 | ch->origin = *p; |
202 | ch->size = size; |
203 | ch->font = font; /* TODO: keep and drop */ |
204 | |
205 | if (line->wmode == 0) |
206 | { |
207 | a.x = 0; |
208 | d.x = 0; |
209 | a.y = fz_font_ascender(ctx, font); |
210 | d.y = fz_font_descender(ctx, font); |
211 | } |
212 | else |
213 | { |
214 | fz_rect bbox = fz_font_bbox(ctx, font); |
215 | a.x = bbox.x1; |
216 | d.x = bbox.x0; |
217 | a.y = 0; |
218 | d.y = 0; |
219 | } |
220 | a = fz_transform_vector(a, trm); |
221 | d = fz_transform_vector(d, trm); |
222 | |
223 | ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y); |
224 | ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y); |
225 | ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y); |
226 | ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y); |
227 | |
228 | return ch; |
229 | } |
230 | |
231 | static int |
232 | direction_from_bidi_class(int bidiclass, int curdir) |
233 | { |
234 | switch (bidiclass) |
235 | { |
236 | /* strong */ |
237 | case UCDN_BIDI_CLASS_L: return 1; |
238 | case UCDN_BIDI_CLASS_R: return -1; |
239 | case UCDN_BIDI_CLASS_AL: return -1; |
240 | |
241 | /* weak */ |
242 | case UCDN_BIDI_CLASS_EN: |
243 | case UCDN_BIDI_CLASS_ES: |
244 | case UCDN_BIDI_CLASS_ET: |
245 | case UCDN_BIDI_CLASS_AN: |
246 | case UCDN_BIDI_CLASS_CS: |
247 | case UCDN_BIDI_CLASS_NSM: |
248 | case UCDN_BIDI_CLASS_BN: |
249 | return curdir; |
250 | |
251 | /* neutral */ |
252 | case UCDN_BIDI_CLASS_B: |
253 | case UCDN_BIDI_CLASS_S: |
254 | case UCDN_BIDI_CLASS_WS: |
255 | case UCDN_BIDI_CLASS_ON: |
256 | return curdir; |
257 | |
258 | /* embedding, override, pop ... we don't support them */ |
259 | default: |
260 | return 0; |
261 | } |
262 | } |
263 | |
264 | static float |
265 | vec_dot(const fz_point *a, const fz_point *b) |
266 | { |
267 | return a->x * b->x + a->y * b->y; |
268 | } |
269 | |
/*
	Add one character to the page, deciding from its position whether it
	continues the current line, starts a new line, or starts a new block.

	c: unicode value to record.
	glyph: glyph id; a negative value marks a character without a glyph
		of its own (it is attached at the current pen position).
	trm: transform for the glyph; trm.e/trm.f give its origin.
	adv: advance, multiplied by the transformed direction vector to
		locate the far end of the glyph.
	wmode: 0 for horizontal writing, otherwise vertical.

	Updates dev->curdir, dev->lastchar, dev->pen, dev->start,
	dev->new_obj and dev->trm as a side effect.
*/
static void
fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode)
{
	fz_stext_page *page = dev->page;
	fz_stext_block *cur_block;
	fz_stext_line *cur_line;

	int new_para = 0;
	int new_line = 1;
	int add_space = 0;
	fz_point dir, ndir, p, q;
	float size;
	fz_point delta;
	float spacing = 0;
	float base_offset = 0;
	int rtl = 0; /* assigned in the RTL branch below but not read anywhere in this function */

	dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir);

	/* dir = direction vector for motion. ndir = normalised(dir) */
	if (wmode == 0)
	{
		dir.x = 1;
		dir.y = 0;
	}
	else
	{
		dir.x = 0;
		dir.y = -1;
	}
	dir = fz_transform_vector(dir, trm);
	ndir = fz_normalize_vector(dir);

	size = fz_matrix_expansion(trm);

	/* We need to identify where glyphs 'start' (p) and 'stop' (q).
	 *
	 * For both horizontal and vertical motion, trm.{e,f} gives the
	 * origin (usually the bottom left) of the glyph.
	 *
	 * In horizontal mode:
	 *   + p is bottom left.
	 *   + q is the bottom right
	 * In vertical mode:
	 *   + p is top left (where it advanced from)
	 *   + q is bottom left
	 */
	if (wmode == 0)
	{
		p.x = trm.e;
		p.y = trm.f;
		q.x = trm.e + adv * dir.x;
		q.y = trm.f + adv * dir.y;
	}
	else
	{
		p.x = trm.e - adv * dir.x;
		p.y = trm.f - adv * dir.y;
		q.x = trm.e;
		q.y = trm.f;
	}

	/* Find current position to enter new text: the last line of the
	 * page's last block, provided that block is a text block. */
	cur_block = page->last_block;
	if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT)
		cur_block = NULL;
	cur_line = cur_block ? cur_block->u.t.last_line : NULL;

	if (cur_line && glyph < 0)
	{
		/* Don't advance pen or break lines for no-glyph characters in a cluster */
		add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, dev->color);
		dev->lastchar = c;
		return;
	}

	if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f)
	{
		/* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all),
		 * then we can't append to the current block/line. */
		new_para = 1;
		new_line = 1;
	}
	else
	{
		/* Detect fake bold where text is printed twice in the same place:
		 * same end point as the pen and same character as last time. */
		delta.x = fabsf(q.x - dev->pen.x);
		delta.y = fabsf(q.y - dev->pen.y);
		if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar)
			return;

		/* Calculate how far we've moved since the last character. */
		delta.x = p.x - dev->pen.x;
		delta.y = p.y - dev->pen.y;

		/* The transform has not changed, so we know we're in the same
		 * direction. Calculate 2 distances; how far off the previous
		 * baseline we are, together with how far along the baseline
		 * we are from the expected position. */
		spacing = ndir.x * delta.x + ndir.y * delta.y;
		base_offset = -ndir.y * delta.x + ndir.x * delta.y;

		/* Only a small amount off the baseline - we'll take this */
		if (fabsf(base_offset) < size * 0.8f)
		{
			/* LTR or neutral character */
			if (dev->curdir >= 0)
			{
				if (fabsf(spacing) < size * SPACE_DIST)
				{
					/* Motion is in line and small enough to ignore. */
					new_line = 0;
				}
				else if (fabsf(spacing) > size * SPACE_MAX_DIST)
				{
					/* Motion is in line and large enough to warrant splitting to a new line */
					new_line = 1;
				}
				else if (spacing < 0)
				{
					/* Motion is backward in line! Ignore this odd spacing. */
					new_line = 0;
				}
				else
				{
					/* Motion is forward in line and large enough to warrant us adding a space.
					 * The space is only added for horizontal text, and not after an existing space. */
					if (dev->lastchar != ' ' && wmode == 0)
						add_space = 1;
					new_line = 0;
				}
			}

			/* RTL character -- disable space character and column detection heuristics */
			else
			{
				new_line = 0;
				if (spacing > size * SPACE_DIST || spacing < 0)
					rtl = 0; /* backward (or big jump to 'right' side) means logical order */
				else
					rtl = 1; /* visual order, we need to reverse in a post process pass */
			}
		}

		/* Enough for a new line, but not enough for a new paragraph */
		else if (fabsf(base_offset) <= size * PARAGRAPH_DIST)
		{
			/* Check indent to spot text-indent style paragraphs: only at the
			 * first character of a text object, compare against where the
			 * current line started. */
			if (wmode == 0 && cur_line && dev->new_obj)
				if (fabsf(p.x - dev->start.x) > size * 0.5f)
					new_para = 1;
			new_line = 1;
		}

		/* Way off the baseline - open a new paragraph */
		else
		{
			new_para = 1;
			new_line = 1;
		}
	}

	/* Start a new block when a paragraph break was detected or no text block is open */
	if (new_para || !cur_block)
	{
		cur_block = add_text_block_to_page(ctx, page);
		cur_line = cur_block->u.t.last_line;
	}

	/* Start a new line, remembering where it began for the indent check above */
	if (new_line || !cur_line)
	{
		cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode);
		dev->start = p;
	}

	/* Add synthetic space spanning the gap from the old pen position to p */
	if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES))
		add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, dev->color);

	add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, dev->color);
	dev->lastchar = c;
	dev->pen = q;

	dev->new_obj = 0;
	dev->trm = trm;
}
459 | |
460 | static void |
461 | fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode) |
462 | { |
463 | /* ignore when one unicode character maps to multiple glyphs */ |
464 | if (c == -1) |
465 | return; |
466 | |
467 | if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) |
468 | { |
469 | switch (c) |
470 | { |
471 | case 0xFB00: /* ff */ |
472 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
473 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
474 | return; |
475 | case 0xFB01: /* fi */ |
476 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
477 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); |
478 | return; |
479 | case 0xFB02: /* fl */ |
480 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
481 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); |
482 | return; |
483 | case 0xFB03: /* ffi */ |
484 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
485 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
486 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); |
487 | return; |
488 | case 0xFB04: /* ffl */ |
489 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
490 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
491 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); |
492 | return; |
493 | case 0xFB05: /* long st */ |
494 | case 0xFB06: /* st */ |
495 | fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode); |
496 | fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode); |
497 | return; |
498 | } |
499 | } |
500 | |
501 | if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) |
502 | { |
503 | switch (c) |
504 | { |
505 | case 0x0009: /* tab */ |
506 | case 0x0020: /* space */ |
507 | case 0x00A0: /* no-break space */ |
508 | case 0x1680: /* ogham space mark */ |
509 | case 0x180E: /* mongolian vowel separator */ |
510 | case 0x2000: /* en quad */ |
511 | case 0x2001: /* em quad */ |
512 | case 0x2002: /* en space */ |
513 | case 0x2003: /* em space */ |
514 | case 0x2004: /* three-per-em space */ |
515 | case 0x2005: /* four-per-em space */ |
516 | case 0x2006: /* six-per-em space */ |
517 | case 0x2007: /* figure space */ |
518 | case 0x2008: /* punctuation space */ |
519 | case 0x2009: /* thin space */ |
520 | case 0x200A: /* hair space */ |
521 | case 0x202F: /* narrow no-break space */ |
522 | case 0x205F: /* medium mathematical space */ |
523 | case 0x3000: /* ideographic space */ |
524 | c = ' '; |
525 | } |
526 | } |
527 | |
528 | fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode); |
529 | } |
530 | |
531 | static void |
532 | (fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm) |
533 | { |
534 | fz_font *font = span->font; |
535 | fz_matrix tm = span->trm; |
536 | fz_matrix trm; |
537 | float adv; |
538 | int i; |
539 | |
540 | if (span->len == 0) |
541 | return; |
542 | |
543 | tm.e = 0; |
544 | tm.f = 0; |
545 | trm = fz_concat(tm, ctm); |
546 | |
547 | for (i = 0; i < span->len; i++) |
548 | { |
549 | /* Calculate new pen location and delta */ |
550 | tm.e = span->items[i].x; |
551 | tm.f = span->items[i].y; |
552 | trm = fz_concat(tm, ctm); |
553 | |
554 | /* Calculate bounding box and new pen position based on font metrics */ |
555 | if (span->items[i].gid >= 0) |
556 | adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode); |
557 | else |
558 | adv = 0; |
559 | |
560 | fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, trm, adv, span->wmode); |
561 | } |
562 | } |
563 | |
564 | static int hexrgb_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color) |
565 | { |
566 | float rgb[3]; |
567 | fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params); |
568 | return |
569 | (fz_clampi(rgb[0] * 255, 0, 255) << 16) | |
570 | (fz_clampi(rgb[1] * 255, 0, 255) << 8) | |
571 | (fz_clampi(rgb[2] * 255, 0, 255)); |
572 | } |
573 | |
574 | static void |
575 | fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, |
576 | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
577 | { |
578 | fz_stext_device *tdev = (fz_stext_device*)dev; |
579 | fz_text_span *span; |
580 | if (text == tdev->lasttext) |
581 | return; |
582 | tdev->color = hexrgb_from_color(ctx, colorspace, color); |
583 | tdev->new_obj = 1; |
584 | for (span = text->head; span; span = span->next) |
585 | fz_stext_extract(ctx, tdev, span, ctm); |
586 | fz_drop_text(ctx, tdev->lasttext); |
587 | tdev->lasttext = fz_keep_text(ctx, text); |
588 | } |
589 | |
590 | static void |
591 | fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, |
592 | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
593 | { |
594 | fz_stext_device *tdev = (fz_stext_device*)dev; |
595 | fz_text_span *span; |
596 | if (text == tdev->lasttext) |
597 | return; |
598 | tdev->color = hexrgb_from_color(ctx, colorspace, color); |
599 | tdev->new_obj = 1; |
600 | for (span = text->head; span; span = span->next) |
601 | fz_stext_extract(ctx, tdev, span, ctm); |
602 | fz_drop_text(ctx, tdev->lasttext); |
603 | tdev->lasttext = fz_keep_text(ctx, text); |
604 | } |
605 | |
606 | static void |
607 | fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor) |
608 | { |
609 | fz_stext_device *tdev = (fz_stext_device*)dev; |
610 | fz_text_span *span; |
611 | if (text == tdev->lasttext) |
612 | return; |
613 | tdev->color = 0; |
614 | tdev->new_obj = 1; |
615 | for (span = text->head; span; span = span->next) |
616 | fz_stext_extract(ctx, tdev, span, ctm); |
617 | fz_drop_text(ctx, tdev->lasttext); |
618 | tdev->lasttext = fz_keep_text(ctx, text); |
619 | } |
620 | |
621 | static void |
622 | fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) |
623 | { |
624 | fz_stext_device *tdev = (fz_stext_device*)dev; |
625 | fz_text_span *span; |
626 | if (text == tdev->lasttext) |
627 | return; |
628 | tdev->color = 0; |
629 | tdev->new_obj = 1; |
630 | for (span = text->head; span; span = span->next) |
631 | fz_stext_extract(ctx, tdev, span, ctm); |
632 | fz_drop_text(ctx, tdev->lasttext); |
633 | tdev->lasttext = fz_keep_text(ctx, text); |
634 | } |
635 | |
636 | static void |
637 | fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm) |
638 | { |
639 | fz_stext_device *tdev = (fz_stext_device*)dev; |
640 | fz_text_span *span; |
641 | if (text == tdev->lasttext) |
642 | return; |
643 | tdev->color = 0; |
644 | tdev->new_obj = 1; |
645 | for (span = text->head; span; span = span->next) |
646 | fz_stext_extract(ctx, tdev, span, ctm); |
647 | fz_drop_text(ctx, tdev->lasttext); |
648 | tdev->lasttext = fz_keep_text(ctx, text); |
649 | } |
650 | |
651 | /* Images and shadings */ |
652 | |
653 | static void |
654 | fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) |
655 | { |
656 | fz_stext_device *tdev = (fz_stext_device*)dev; |
657 | |
658 | /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ |
659 | if (alpha < 0.5f) |
660 | return; |
661 | |
662 | add_image_block_to_page(ctx, tdev->page, ctm, img); |
663 | } |
664 | |
665 | static void |
666 | fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, |
667 | fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params) |
668 | { |
669 | fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); |
670 | } |
671 | |
672 | static fz_image * |
673 | fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor) |
674 | { |
675 | fz_matrix ctm = *in_out_ctm; |
676 | fz_pixmap *pix; |
677 | fz_image *img = NULL; |
678 | fz_rect bounds; |
679 | fz_irect bbox; |
680 | |
681 | bounds = fz_bound_shade(ctx, shade, ctm); |
682 | bounds = fz_intersect_rect(bounds, scissor); |
683 | bbox = fz_irect_from_rect(bounds); |
684 | |
685 | pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background); |
686 | fz_try(ctx) |
687 | { |
688 | if (shade->use_background) |
689 | fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params); |
690 | else |
691 | fz_clear_pixmap(ctx, pix); |
692 | fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL); |
693 | img = fz_new_image_from_pixmap(ctx, pix, NULL); |
694 | } |
695 | fz_always(ctx) |
696 | fz_drop_pixmap(ctx, pix); |
697 | fz_catch(ctx) |
698 | fz_rethrow(ctx); |
699 | |
700 | in_out_ctm->a = pix->w; |
701 | in_out_ctm->b = 0; |
702 | in_out_ctm->c = 0; |
703 | in_out_ctm->d = pix->h; |
704 | in_out_ctm->e = pix->x; |
705 | in_out_ctm->f = pix->y; |
706 | return img; |
707 | } |
708 | |
709 | static void |
710 | fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params) |
711 | { |
712 | fz_matrix local_ctm = ctm; |
713 | fz_rect scissor = fz_device_current_scissor(ctx, dev); |
714 | fz_image *image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor); |
715 | fz_try(ctx) |
716 | fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params); |
717 | fz_always(ctx) |
718 | fz_drop_image(ctx, image); |
719 | fz_catch(ctx) |
720 | fz_rethrow(ctx); |
721 | } |
722 | |
723 | static void |
724 | fz_stext_close_device(fz_context *ctx, fz_device *dev) |
725 | { |
726 | fz_stext_device *tdev = (fz_stext_device*)dev; |
727 | fz_stext_page *page = tdev->page; |
728 | fz_stext_block *block; |
729 | fz_stext_line *line; |
730 | fz_stext_char *ch; |
731 | |
732 | for (block = page->first_block; block; block = block->next) |
733 | { |
734 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
735 | continue; |
736 | for (line = block->u.t.first_line; line; line = line->next) |
737 | { |
738 | for (ch = line->first_char; ch; ch = ch->next) |
739 | { |
740 | fz_rect ch_box = fz_rect_from_quad(ch->quad); |
741 | if (ch == line->first_char) |
742 | line->bbox = ch_box; |
743 | else |
744 | line->bbox = fz_union_rect(line->bbox, ch_box); |
745 | } |
746 | block->bbox = fz_union_rect(block->bbox, line->bbox); |
747 | } |
748 | } |
749 | |
750 | /* TODO: smart sorting of blocks and lines in reading order */ |
751 | /* TODO: unicode NFC normalization */ |
752 | } |
753 | |
754 | static void |
755 | fz_stext_drop_device(fz_context *ctx, fz_device *dev) |
756 | { |
757 | fz_stext_device *tdev = (fz_stext_device*)dev; |
758 | fz_drop_text(ctx, tdev->lasttext); |
759 | } |
760 | |
761 | /* |
762 | Parse stext device options from a comma separated key-value string. |
763 | */ |
764 | fz_stext_options * |
765 | fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string) |
766 | { |
767 | const char *val; |
768 | |
769 | memset(opts, 0, sizeof *opts); |
770 | |
771 | if (fz_has_option(ctx, string, "preserve-ligatures" , &val) && fz_option_eq(val, "yes" )) |
772 | opts->flags |= FZ_STEXT_PRESERVE_LIGATURES; |
773 | if (fz_has_option(ctx, string, "preserve-whitespace" , &val) && fz_option_eq(val, "yes" )) |
774 | opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE; |
775 | if (fz_has_option(ctx, string, "preserve-images" , &val) && fz_option_eq(val, "yes" )) |
776 | opts->flags |= FZ_STEXT_PRESERVE_IMAGES; |
777 | if (fz_has_option(ctx, string, "inhibit-spaces" , &val) && fz_option_eq(val, "yes" )) |
778 | opts->flags |= FZ_STEXT_INHIBIT_SPACES; |
779 | |
780 | return opts; |
781 | } |
782 | |
783 | /* |
784 | Create a device to extract the text on a page. |
785 | |
786 | Gather the text on a page into blocks and lines. |
787 | |
788 | The reading order is taken from the order the text is drawn in the |
789 | source file, so may not be accurate. |
790 | |
791 | page: The text page to which content should be added. This will |
792 | usually be a newly created (empty) text page, but it can be one |
793 | containing data already (for example when merging multiple pages, |
794 | or watermarking). |
795 | |
796 | options: Options to configure the stext device. |
797 | */ |
798 | fz_device * |
799 | fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) |
800 | { |
801 | fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); |
802 | |
803 | dev->super.close_device = fz_stext_close_device; |
804 | dev->super.drop_device = fz_stext_drop_device; |
805 | |
806 | dev->super.fill_text = fz_stext_fill_text; |
807 | dev->super.stroke_text = fz_stext_stroke_text; |
808 | dev->super.clip_text = fz_stext_clip_text; |
809 | dev->super.clip_stroke_text = fz_stext_clip_stroke_text; |
810 | dev->super.ignore_text = fz_stext_ignore_text; |
811 | |
812 | if (opts && (opts->flags & FZ_STEXT_PRESERVE_IMAGES)) |
813 | { |
814 | dev->super.fill_shade = fz_stext_fill_shade; |
815 | dev->super.fill_image = fz_stext_fill_image; |
816 | dev->super.fill_image_mask = fz_stext_fill_image_mask; |
817 | } |
818 | |
819 | if (opts) |
820 | dev->flags = opts->flags; |
821 | dev->page = page; |
822 | dev->pen.x = 0; |
823 | dev->pen.y = 0; |
824 | dev->trm = fz_identity; |
825 | dev->lastchar = ' '; |
826 | dev->curdir = 1; |
827 | dev->lasttext = NULL; |
828 | |
829 | return (fz_device*)dev; |
830 | } |
831 | |