| 1 | #include "mupdf/fitz.h" |
| 2 | #include "mupdf/ucdn.h" |
| 3 | |
| 4 | #include <math.h> |
| 5 | #include <float.h> |
| 6 | #include <string.h> |
| 7 | |
| 8 | /* Simple layout structure */ |
| 9 | |
| 10 | fz_layout_block *fz_new_layout(fz_context *ctx) |
| 11 | { |
| 12 | fz_pool *pool = fz_new_pool(ctx); |
| 13 | fz_layout_block *block; |
| 14 | fz_try(ctx) |
| 15 | { |
| 16 | block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block)); |
| 17 | block->pool = pool; |
| 18 | block->head = NULL; |
| 19 | block->tailp = &block->head; |
| 20 | } |
| 21 | fz_catch(ctx) |
| 22 | { |
| 23 | fz_drop_pool(ctx, pool); |
| 24 | fz_rethrow(ctx); |
| 25 | } |
| 26 | return block; |
| 27 | } |
| 28 | |
| 29 | void fz_drop_layout(fz_context *ctx, fz_layout_block *block) |
| 30 | { |
| 31 | if (block) |
| 32 | fz_drop_pool(ctx, block->pool); |
| 33 | } |
| 34 | |
| 35 | void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p) |
| 36 | { |
| 37 | fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line)); |
| 38 | line->x = x; |
| 39 | line->y = y; |
| 40 | line->h = h; |
| 41 | line->p = p; |
| 42 | line->text = NULL; |
| 43 | line->next = NULL; |
| 44 | *block->tailp = line; |
| 45 | block->tailp = &line->next; |
| 46 | block->text_tailp = &line->text; |
| 47 | } |
| 48 | |
| 49 | void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p) |
| 50 | { |
| 51 | fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char)); |
| 52 | ch->x = x; |
| 53 | ch->w = w; |
| 54 | ch->p = p; |
| 55 | ch->next = NULL; |
| 56 | *block->text_tailp = ch; |
| 57 | block->text_tailp = &ch->next; |
| 58 | } |
| 59 | |
| 60 | /* Extract text into blocks and lines. */ |
| 61 | |
/* All three thresholds are multiples of 'size', the expansion of the glyph's
 * text matrix, as used by fz_add_stext_char_imp. */

/* Offset from the previous baseline above which a character opens a new block. */
#define PARAGRAPH_DIST 1.5f
/* Movement along the baseline below which the gap is ignored. */
#define SPACE_DIST 0.15f
/* Movement along the baseline above which a new line is started; forward
 * gaps between SPACE_DIST and this value may get a synthetic space. */
#define SPACE_MAX_DIST 0.8f
| 65 | |
| 66 | typedef struct fz_stext_device_s fz_stext_device; |
| 67 | |
| 68 | struct fz_stext_device_s |
| 69 | { |
| 70 | fz_device super; |
| 71 | fz_stext_page *page; |
| 72 | fz_point pen, start; |
| 73 | fz_matrix trm; |
| 74 | int new_obj; |
| 75 | int curdir; |
| 76 | int lastchar; |
| 77 | int flags; |
| 78 | int color; |
| 79 | const fz_text *lasttext; |
| 80 | }; |
| 81 | |
/* Help text listing the options understood by fz_parse_stext_options. */
const char *fz_stext_options_usage =
	"Text output options:\n"
	"\tinhibit-spaces: "
		"don't add spaces between gaps in the text\n"
	"\tpreserve-images: "
		"keep images in output\n"
	"\tpreserve-ligatures: "
		"do not expand ligatures into constituent characters\n"
	"\tpreserve-whitespace: "
		"do not convert all whitespace into space characters\n"
	"\n";
| 89 | |
| 90 | /* |
| 91 | Create an empty text page. |
| 92 | |
| 93 | The text page is filled out by the text device to contain the blocks |
| 94 | and lines of text on the page. |
| 95 | |
| 96 | mediabox: optional mediabox information. |
| 97 | */ |
| 98 | fz_stext_page * |
| 99 | fz_new_stext_page(fz_context *ctx, fz_rect mediabox) |
| 100 | { |
| 101 | fz_pool *pool = fz_new_pool(ctx); |
| 102 | fz_stext_page *page = NULL; |
| 103 | fz_try(ctx) |
| 104 | { |
| 105 | page = fz_pool_alloc(ctx, pool, sizeof(*page)); |
| 106 | page->pool = pool; |
| 107 | page->mediabox = mediabox; |
| 108 | page->first_block = NULL; |
| 109 | page->last_block = NULL; |
| 110 | } |
| 111 | fz_catch(ctx) |
| 112 | { |
| 113 | fz_drop_pool(ctx, pool); |
| 114 | fz_rethrow(ctx); |
| 115 | } |
| 116 | return page; |
| 117 | } |
| 118 | |
| 119 | void |
| 120 | fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) |
| 121 | { |
| 122 | if (page) |
| 123 | { |
| 124 | fz_stext_block *block; |
| 125 | for (block = page->first_block; block; block = block->next) |
| 126 | if (block->type == FZ_STEXT_BLOCK_IMAGE) |
| 127 | fz_drop_image(ctx, block->u.i.image); |
| 128 | fz_drop_pool(ctx, page->pool); |
| 129 | } |
| 130 | } |
| 131 | |
| 132 | static fz_stext_block * |
| 133 | add_block_to_page(fz_context *ctx, fz_stext_page *page) |
| 134 | { |
| 135 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); |
| 136 | block->prev = page->last_block; |
| 137 | if (!page->first_block) |
| 138 | page->first_block = page->last_block = block; |
| 139 | else |
| 140 | { |
| 141 | page->last_block->next = block; |
| 142 | page->last_block = block; |
| 143 | } |
| 144 | return block; |
| 145 | } |
| 146 | |
| 147 | static fz_stext_block * |
| 148 | add_text_block_to_page(fz_context *ctx, fz_stext_page *page) |
| 149 | { |
| 150 | fz_stext_block *block = add_block_to_page(ctx, page); |
| 151 | block->type = FZ_STEXT_BLOCK_TEXT; |
| 152 | return block; |
| 153 | } |
| 154 | |
| 155 | static fz_stext_block * |
| 156 | add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image) |
| 157 | { |
| 158 | fz_stext_block *block = add_block_to_page(ctx, page); |
| 159 | block->type = FZ_STEXT_BLOCK_IMAGE; |
| 160 | block->u.i.transform = ctm; |
| 161 | block->u.i.image = fz_keep_image(ctx, image); |
| 162 | block->bbox = fz_transform_rect(fz_unit_rect, ctm); |
| 163 | return block; |
| 164 | } |
| 165 | |
| 166 | static fz_stext_line * |
| 167 | add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode) |
| 168 | { |
| 169 | fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); |
| 170 | line->prev = block->u.t.last_line; |
| 171 | if (!block->u.t.first_line) |
| 172 | block->u.t.first_line = block->u.t.last_line = line; |
| 173 | else |
| 174 | { |
| 175 | block->u.t.last_line->next = line; |
| 176 | block->u.t.last_line = line; |
| 177 | } |
| 178 | |
| 179 | line->dir = *dir; |
| 180 | line->wmode = wmode; |
| 181 | |
| 182 | return line; |
| 183 | } |
| 184 | |
| 185 | static fz_stext_char * |
| 186 | add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int color) |
| 187 | { |
| 188 | fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char); |
| 189 | fz_point a, d; |
| 190 | |
| 191 | if (!line->first_char) |
| 192 | line->first_char = line->last_char = ch; |
| 193 | else |
| 194 | { |
| 195 | line->last_char->next = ch; |
| 196 | line->last_char = ch; |
| 197 | } |
| 198 | |
| 199 | ch->c = c; |
| 200 | ch->color = color; |
| 201 | ch->origin = *p; |
| 202 | ch->size = size; |
| 203 | ch->font = font; /* TODO: keep and drop */ |
| 204 | |
| 205 | if (line->wmode == 0) |
| 206 | { |
| 207 | a.x = 0; |
| 208 | d.x = 0; |
| 209 | a.y = fz_font_ascender(ctx, font); |
| 210 | d.y = fz_font_descender(ctx, font); |
| 211 | } |
| 212 | else |
| 213 | { |
| 214 | fz_rect bbox = fz_font_bbox(ctx, font); |
| 215 | a.x = bbox.x1; |
| 216 | d.x = bbox.x0; |
| 217 | a.y = 0; |
| 218 | d.y = 0; |
| 219 | } |
| 220 | a = fz_transform_vector(a, trm); |
| 221 | d = fz_transform_vector(d, trm); |
| 222 | |
| 223 | ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y); |
| 224 | ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y); |
| 225 | ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y); |
| 226 | ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y); |
| 227 | |
| 228 | return ch; |
| 229 | } |
| 230 | |
| 231 | static int |
| 232 | direction_from_bidi_class(int bidiclass, int curdir) |
| 233 | { |
| 234 | switch (bidiclass) |
| 235 | { |
| 236 | /* strong */ |
| 237 | case UCDN_BIDI_CLASS_L: return 1; |
| 238 | case UCDN_BIDI_CLASS_R: return -1; |
| 239 | case UCDN_BIDI_CLASS_AL: return -1; |
| 240 | |
| 241 | /* weak */ |
| 242 | case UCDN_BIDI_CLASS_EN: |
| 243 | case UCDN_BIDI_CLASS_ES: |
| 244 | case UCDN_BIDI_CLASS_ET: |
| 245 | case UCDN_BIDI_CLASS_AN: |
| 246 | case UCDN_BIDI_CLASS_CS: |
| 247 | case UCDN_BIDI_CLASS_NSM: |
| 248 | case UCDN_BIDI_CLASS_BN: |
| 249 | return curdir; |
| 250 | |
| 251 | /* neutral */ |
| 252 | case UCDN_BIDI_CLASS_B: |
| 253 | case UCDN_BIDI_CLASS_S: |
| 254 | case UCDN_BIDI_CLASS_WS: |
| 255 | case UCDN_BIDI_CLASS_ON: |
| 256 | return curdir; |
| 257 | |
| 258 | /* embedding, override, pop ... we don't support them */ |
| 259 | default: |
| 260 | return 0; |
| 261 | } |
| 262 | } |
| 263 | |
| 264 | static float |
| 265 | vec_dot(const fz_point *a, const fz_point *b) |
| 266 | { |
| 267 | return a->x * b->x + a->y * b->y; |
| 268 | } |
| 269 | |
| 270 | static void |
| 271 | fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode) |
| 272 | { |
| 273 | fz_stext_page *page = dev->page; |
| 274 | fz_stext_block *cur_block; |
| 275 | fz_stext_line *cur_line; |
| 276 | |
| 277 | int new_para = 0; |
| 278 | int new_line = 1; |
| 279 | int add_space = 0; |
| 280 | fz_point dir, ndir, p, q; |
| 281 | float size; |
| 282 | fz_point delta; |
| 283 | float spacing = 0; |
| 284 | float base_offset = 0; |
| 285 | int rtl = 0; |
| 286 | |
| 287 | dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir); |
| 288 | |
| 289 | /* dir = direction vector for motion. ndir = normalised(dir) */ |
| 290 | if (wmode == 0) |
| 291 | { |
| 292 | dir.x = 1; |
| 293 | dir.y = 0; |
| 294 | } |
| 295 | else |
| 296 | { |
| 297 | dir.x = 0; |
| 298 | dir.y = -1; |
| 299 | } |
| 300 | dir = fz_transform_vector(dir, trm); |
| 301 | ndir = fz_normalize_vector(dir); |
| 302 | |
| 303 | size = fz_matrix_expansion(trm); |
| 304 | |
| 305 | /* We need to identify where glyphs 'start' (p) and 'stop' (q). |
| 306 | * Each glyph holds its 'start' position, and the next glyph in the |
| 307 | * span (or span->max if there is no next glyph) holds its 'end' |
| 308 | * position. |
| 309 | * |
| 310 | * For both horizontal and vertical motion, trm->{e,f} gives the |
| 311 | * origin (usually the bottom left) of the glyph. |
| 312 | * |
| 313 | * In horizontal mode: |
| 314 | * + p is bottom left. |
| 315 | * + q is the bottom right |
| 316 | * In vertical mode: |
| 317 | * + p is top left (where it advanced from) |
| 318 | * + q is bottom left |
| 319 | */ |
| 320 | if (wmode == 0) |
| 321 | { |
| 322 | p.x = trm.e; |
| 323 | p.y = trm.f; |
| 324 | q.x = trm.e + adv * dir.x; |
| 325 | q.y = trm.f + adv * dir.y; |
| 326 | } |
| 327 | else |
| 328 | { |
| 329 | p.x = trm.e - adv * dir.x; |
| 330 | p.y = trm.f - adv * dir.y; |
| 331 | q.x = trm.e; |
| 332 | q.y = trm.f; |
| 333 | } |
| 334 | |
| 335 | /* Find current position to enter new text. */ |
| 336 | cur_block = page->last_block; |
| 337 | if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT) |
| 338 | cur_block = NULL; |
| 339 | cur_line = cur_block ? cur_block->u.t.last_line : NULL; |
| 340 | |
| 341 | if (cur_line && glyph < 0) |
| 342 | { |
| 343 | /* Don't advance pen or break lines for no-glyph characters in a cluster */ |
| 344 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, dev->color); |
| 345 | dev->lastchar = c; |
| 346 | return; |
| 347 | } |
| 348 | |
| 349 | if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f) |
| 350 | { |
| 351 | /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all), |
| 352 | * then we can't append to the current block/line. */ |
| 353 | new_para = 1; |
| 354 | new_line = 1; |
| 355 | } |
| 356 | else |
| 357 | { |
| 358 | /* Detect fake bold where text is printed twice in the same place. */ |
| 359 | delta.x = fabsf(q.x - dev->pen.x); |
| 360 | delta.y = fabsf(q.y - dev->pen.y); |
| 361 | if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar) |
| 362 | return; |
| 363 | |
| 364 | /* Calculate how far we've moved since the last character. */ |
| 365 | delta.x = p.x - dev->pen.x; |
| 366 | delta.y = p.y - dev->pen.y; |
| 367 | |
| 368 | /* The transform has not changed, so we know we're in the same |
| 369 | * direction. Calculate 2 distances; how far off the previous |
| 370 | * baseline we are, together with how far along the baseline |
| 371 | * we are from the expected position. */ |
| 372 | spacing = ndir.x * delta.x + ndir.y * delta.y; |
| 373 | base_offset = -ndir.y * delta.x + ndir.x * delta.y; |
| 374 | |
| 375 | /* Only a small amount off the baseline - we'll take this */ |
| 376 | if (fabsf(base_offset) < size * 0.8f) |
| 377 | { |
| 378 | /* LTR or neutral character */ |
| 379 | if (dev->curdir >= 0) |
| 380 | { |
| 381 | if (fabsf(spacing) < size * SPACE_DIST) |
| 382 | { |
| 383 | /* Motion is in line and small enough to ignore. */ |
| 384 | new_line = 0; |
| 385 | } |
| 386 | else if (fabsf(spacing) > size * SPACE_MAX_DIST) |
| 387 | { |
| 388 | /* Motion is in line and large enough to warrant splitting to a new line */ |
| 389 | new_line = 1; |
| 390 | } |
| 391 | else if (spacing < 0) |
| 392 | { |
| 393 | /* Motion is backward in line! Ignore this odd spacing. */ |
| 394 | new_line = 0; |
| 395 | } |
| 396 | else |
| 397 | { |
| 398 | /* Motion is forward in line and large enough to warrant us adding a space. */ |
| 399 | if (dev->lastchar != ' ' && wmode == 0) |
| 400 | add_space = 1; |
| 401 | new_line = 0; |
| 402 | } |
| 403 | } |
| 404 | |
| 405 | /* RTL character -- disable space character and column detection heuristics */ |
| 406 | else |
| 407 | { |
| 408 | new_line = 0; |
| 409 | if (spacing > size * SPACE_DIST || spacing < 0) |
| 410 | rtl = 0; /* backward (or big jump to 'right' side) means logical order */ |
| 411 | else |
| 412 | rtl = 1; /* visual order, we need to reverse in a post process pass */ |
| 413 | } |
| 414 | } |
| 415 | |
| 416 | /* Enough for a new line, but not enough for a new paragraph */ |
| 417 | else if (fabsf(base_offset) <= size * PARAGRAPH_DIST) |
| 418 | { |
| 419 | /* Check indent to spot text-indent style paragraphs */ |
| 420 | if (wmode == 0 && cur_line && dev->new_obj) |
| 421 | if (fabsf(p.x - dev->start.x) > size * 0.5f) |
| 422 | new_para = 1; |
| 423 | new_line = 1; |
| 424 | } |
| 425 | |
| 426 | /* Way off the baseline - open a new paragraph */ |
| 427 | else |
| 428 | { |
| 429 | new_para = 1; |
| 430 | new_line = 1; |
| 431 | } |
| 432 | } |
| 433 | |
| 434 | /* Start a new block (but only at the beginning of a text object) */ |
| 435 | if (new_para || !cur_block) |
| 436 | { |
| 437 | cur_block = add_text_block_to_page(ctx, page); |
| 438 | cur_line = cur_block->u.t.last_line; |
| 439 | } |
| 440 | |
| 441 | /* Start a new line */ |
| 442 | if (new_line || !cur_line) |
| 443 | { |
| 444 | cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode); |
| 445 | dev->start = p; |
| 446 | } |
| 447 | |
| 448 | /* Add synthetic space */ |
| 449 | if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES)) |
| 450 | add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, dev->color); |
| 451 | |
| 452 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, dev->color); |
| 453 | dev->lastchar = c; |
| 454 | dev->pen = q; |
| 455 | |
| 456 | dev->new_obj = 0; |
| 457 | dev->trm = trm; |
| 458 | } |
| 459 | |
| 460 | static void |
| 461 | fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode) |
| 462 | { |
| 463 | /* ignore when one unicode character maps to multiple glyphs */ |
| 464 | if (c == -1) |
| 465 | return; |
| 466 | |
| 467 | if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) |
| 468 | { |
| 469 | switch (c) |
| 470 | { |
| 471 | case 0xFB00: /* ff */ |
| 472 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
| 473 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
| 474 | return; |
| 475 | case 0xFB01: /* fi */ |
| 476 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
| 477 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); |
| 478 | return; |
| 479 | case 0xFB02: /* fl */ |
| 480 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
| 481 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); |
| 482 | return; |
| 483 | case 0xFB03: /* ffi */ |
| 484 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
| 485 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
| 486 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); |
| 487 | return; |
| 488 | case 0xFB04: /* ffl */ |
| 489 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); |
| 490 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); |
| 491 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); |
| 492 | return; |
| 493 | case 0xFB05: /* long st */ |
| 494 | case 0xFB06: /* st */ |
| 495 | fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode); |
| 496 | fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode); |
| 497 | return; |
| 498 | } |
| 499 | } |
| 500 | |
| 501 | if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) |
| 502 | { |
| 503 | switch (c) |
| 504 | { |
| 505 | case 0x0009: /* tab */ |
| 506 | case 0x0020: /* space */ |
| 507 | case 0x00A0: /* no-break space */ |
| 508 | case 0x1680: /* ogham space mark */ |
| 509 | case 0x180E: /* mongolian vowel separator */ |
| 510 | case 0x2000: /* en quad */ |
| 511 | case 0x2001: /* em quad */ |
| 512 | case 0x2002: /* en space */ |
| 513 | case 0x2003: /* em space */ |
| 514 | case 0x2004: /* three-per-em space */ |
| 515 | case 0x2005: /* four-per-em space */ |
| 516 | case 0x2006: /* six-per-em space */ |
| 517 | case 0x2007: /* figure space */ |
| 518 | case 0x2008: /* punctuation space */ |
| 519 | case 0x2009: /* thin space */ |
| 520 | case 0x200A: /* hair space */ |
| 521 | case 0x202F: /* narrow no-break space */ |
| 522 | case 0x205F: /* medium mathematical space */ |
| 523 | case 0x3000: /* ideographic space */ |
| 524 | c = ' '; |
| 525 | } |
| 526 | } |
| 527 | |
| 528 | fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode); |
| 529 | } |
| 530 | |
| 531 | static void |
| 532 | (fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm) |
| 533 | { |
| 534 | fz_font *font = span->font; |
| 535 | fz_matrix tm = span->trm; |
| 536 | fz_matrix trm; |
| 537 | float adv; |
| 538 | int i; |
| 539 | |
| 540 | if (span->len == 0) |
| 541 | return; |
| 542 | |
| 543 | tm.e = 0; |
| 544 | tm.f = 0; |
| 545 | trm = fz_concat(tm, ctm); |
| 546 | |
| 547 | for (i = 0; i < span->len; i++) |
| 548 | { |
| 549 | /* Calculate new pen location and delta */ |
| 550 | tm.e = span->items[i].x; |
| 551 | tm.f = span->items[i].y; |
| 552 | trm = fz_concat(tm, ctm); |
| 553 | |
| 554 | /* Calculate bounding box and new pen position based on font metrics */ |
| 555 | if (span->items[i].gid >= 0) |
| 556 | adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode); |
| 557 | else |
| 558 | adv = 0; |
| 559 | |
| 560 | fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, trm, adv, span->wmode); |
| 561 | } |
| 562 | } |
| 563 | |
| 564 | static int hexrgb_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color) |
| 565 | { |
| 566 | float rgb[3]; |
| 567 | fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params); |
| 568 | return |
| 569 | (fz_clampi(rgb[0] * 255, 0, 255) << 16) | |
| 570 | (fz_clampi(rgb[1] * 255, 0, 255) << 8) | |
| 571 | (fz_clampi(rgb[2] * 255, 0, 255)); |
| 572 | } |
| 573 | |
| 574 | static void |
| 575 | fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, |
| 576 | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
| 577 | { |
| 578 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 579 | fz_text_span *span; |
| 580 | if (text == tdev->lasttext) |
| 581 | return; |
| 582 | tdev->color = hexrgb_from_color(ctx, colorspace, color); |
| 583 | tdev->new_obj = 1; |
| 584 | for (span = text->head; span; span = span->next) |
| 585 | fz_stext_extract(ctx, tdev, span, ctm); |
| 586 | fz_drop_text(ctx, tdev->lasttext); |
| 587 | tdev->lasttext = fz_keep_text(ctx, text); |
| 588 | } |
| 589 | |
| 590 | static void |
| 591 | fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, |
| 592 | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) |
| 593 | { |
| 594 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 595 | fz_text_span *span; |
| 596 | if (text == tdev->lasttext) |
| 597 | return; |
| 598 | tdev->color = hexrgb_from_color(ctx, colorspace, color); |
| 599 | tdev->new_obj = 1; |
| 600 | for (span = text->head; span; span = span->next) |
| 601 | fz_stext_extract(ctx, tdev, span, ctm); |
| 602 | fz_drop_text(ctx, tdev->lasttext); |
| 603 | tdev->lasttext = fz_keep_text(ctx, text); |
| 604 | } |
| 605 | |
| 606 | static void |
| 607 | fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor) |
| 608 | { |
| 609 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 610 | fz_text_span *span; |
| 611 | if (text == tdev->lasttext) |
| 612 | return; |
| 613 | tdev->color = 0; |
| 614 | tdev->new_obj = 1; |
| 615 | for (span = text->head; span; span = span->next) |
| 616 | fz_stext_extract(ctx, tdev, span, ctm); |
| 617 | fz_drop_text(ctx, tdev->lasttext); |
| 618 | tdev->lasttext = fz_keep_text(ctx, text); |
| 619 | } |
| 620 | |
| 621 | static void |
| 622 | fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) |
| 623 | { |
| 624 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 625 | fz_text_span *span; |
| 626 | if (text == tdev->lasttext) |
| 627 | return; |
| 628 | tdev->color = 0; |
| 629 | tdev->new_obj = 1; |
| 630 | for (span = text->head; span; span = span->next) |
| 631 | fz_stext_extract(ctx, tdev, span, ctm); |
| 632 | fz_drop_text(ctx, tdev->lasttext); |
| 633 | tdev->lasttext = fz_keep_text(ctx, text); |
| 634 | } |
| 635 | |
| 636 | static void |
| 637 | fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm) |
| 638 | { |
| 639 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 640 | fz_text_span *span; |
| 641 | if (text == tdev->lasttext) |
| 642 | return; |
| 643 | tdev->color = 0; |
| 644 | tdev->new_obj = 1; |
| 645 | for (span = text->head; span; span = span->next) |
| 646 | fz_stext_extract(ctx, tdev, span, ctm); |
| 647 | fz_drop_text(ctx, tdev->lasttext); |
| 648 | tdev->lasttext = fz_keep_text(ctx, text); |
| 649 | } |
| 650 | |
| 651 | /* Images and shadings */ |
| 652 | |
| 653 | static void |
| 654 | fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) |
| 655 | { |
| 656 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 657 | |
| 658 | /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ |
| 659 | if (alpha < 0.5f) |
| 660 | return; |
| 661 | |
| 662 | add_image_block_to_page(ctx, tdev->page, ctm, img); |
| 663 | } |
| 664 | |
| 665 | static void |
| 666 | fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, |
| 667 | fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params) |
| 668 | { |
| 669 | fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); |
| 670 | } |
| 671 | |
| 672 | static fz_image * |
| 673 | fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor) |
| 674 | { |
| 675 | fz_matrix ctm = *in_out_ctm; |
| 676 | fz_pixmap *pix; |
| 677 | fz_image *img = NULL; |
| 678 | fz_rect bounds; |
| 679 | fz_irect bbox; |
| 680 | |
| 681 | bounds = fz_bound_shade(ctx, shade, ctm); |
| 682 | bounds = fz_intersect_rect(bounds, scissor); |
| 683 | bbox = fz_irect_from_rect(bounds); |
| 684 | |
| 685 | pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background); |
| 686 | fz_try(ctx) |
| 687 | { |
| 688 | if (shade->use_background) |
| 689 | fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params); |
| 690 | else |
| 691 | fz_clear_pixmap(ctx, pix); |
| 692 | fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL); |
| 693 | img = fz_new_image_from_pixmap(ctx, pix, NULL); |
| 694 | } |
| 695 | fz_always(ctx) |
| 696 | fz_drop_pixmap(ctx, pix); |
| 697 | fz_catch(ctx) |
| 698 | fz_rethrow(ctx); |
| 699 | |
| 700 | in_out_ctm->a = pix->w; |
| 701 | in_out_ctm->b = 0; |
| 702 | in_out_ctm->c = 0; |
| 703 | in_out_ctm->d = pix->h; |
| 704 | in_out_ctm->e = pix->x; |
| 705 | in_out_ctm->f = pix->y; |
| 706 | return img; |
| 707 | } |
| 708 | |
| 709 | static void |
| 710 | fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params) |
| 711 | { |
| 712 | fz_matrix local_ctm = ctm; |
| 713 | fz_rect scissor = fz_device_current_scissor(ctx, dev); |
| 714 | fz_image *image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor); |
| 715 | fz_try(ctx) |
| 716 | fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params); |
| 717 | fz_always(ctx) |
| 718 | fz_drop_image(ctx, image); |
| 719 | fz_catch(ctx) |
| 720 | fz_rethrow(ctx); |
| 721 | } |
| 722 | |
| 723 | static void |
| 724 | fz_stext_close_device(fz_context *ctx, fz_device *dev) |
| 725 | { |
| 726 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 727 | fz_stext_page *page = tdev->page; |
| 728 | fz_stext_block *block; |
| 729 | fz_stext_line *line; |
| 730 | fz_stext_char *ch; |
| 731 | |
| 732 | for (block = page->first_block; block; block = block->next) |
| 733 | { |
| 734 | if (block->type != FZ_STEXT_BLOCK_TEXT) |
| 735 | continue; |
| 736 | for (line = block->u.t.first_line; line; line = line->next) |
| 737 | { |
| 738 | for (ch = line->first_char; ch; ch = ch->next) |
| 739 | { |
| 740 | fz_rect ch_box = fz_rect_from_quad(ch->quad); |
| 741 | if (ch == line->first_char) |
| 742 | line->bbox = ch_box; |
| 743 | else |
| 744 | line->bbox = fz_union_rect(line->bbox, ch_box); |
| 745 | } |
| 746 | block->bbox = fz_union_rect(block->bbox, line->bbox); |
| 747 | } |
| 748 | } |
| 749 | |
| 750 | /* TODO: smart sorting of blocks and lines in reading order */ |
| 751 | /* TODO: unicode NFC normalization */ |
| 752 | } |
| 753 | |
| 754 | static void |
| 755 | fz_stext_drop_device(fz_context *ctx, fz_device *dev) |
| 756 | { |
| 757 | fz_stext_device *tdev = (fz_stext_device*)dev; |
| 758 | fz_drop_text(ctx, tdev->lasttext); |
| 759 | } |
| 760 | |
| 761 | /* |
| 762 | Parse stext device options from a comma separated key-value string. |
| 763 | */ |
| 764 | fz_stext_options * |
| 765 | fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string) |
| 766 | { |
| 767 | const char *val; |
| 768 | |
| 769 | memset(opts, 0, sizeof *opts); |
| 770 | |
| 771 | if (fz_has_option(ctx, string, "preserve-ligatures" , &val) && fz_option_eq(val, "yes" )) |
| 772 | opts->flags |= FZ_STEXT_PRESERVE_LIGATURES; |
| 773 | if (fz_has_option(ctx, string, "preserve-whitespace" , &val) && fz_option_eq(val, "yes" )) |
| 774 | opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE; |
| 775 | if (fz_has_option(ctx, string, "preserve-images" , &val) && fz_option_eq(val, "yes" )) |
| 776 | opts->flags |= FZ_STEXT_PRESERVE_IMAGES; |
| 777 | if (fz_has_option(ctx, string, "inhibit-spaces" , &val) && fz_option_eq(val, "yes" )) |
| 778 | opts->flags |= FZ_STEXT_INHIBIT_SPACES; |
| 779 | |
| 780 | return opts; |
| 781 | } |
| 782 | |
| 783 | /* |
| 784 | Create a device to extract the text on a page. |
| 785 | |
| 786 | Gather the text on a page into blocks and lines. |
| 787 | |
| 788 | The reading order is taken from the order the text is drawn in the |
| 789 | source file, so may not be accurate. |
| 790 | |
| 791 | page: The text page to which content should be added. This will |
| 792 | usually be a newly created (empty) text page, but it can be one |
| 793 | containing data already (for example when merging multiple pages, |
| 794 | or watermarking). |
| 795 | |
| 796 | options: Options to configure the stext device. |
| 797 | */ |
| 798 | fz_device * |
| 799 | fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) |
| 800 | { |
| 801 | fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); |
| 802 | |
| 803 | dev->super.close_device = fz_stext_close_device; |
| 804 | dev->super.drop_device = fz_stext_drop_device; |
| 805 | |
| 806 | dev->super.fill_text = fz_stext_fill_text; |
| 807 | dev->super.stroke_text = fz_stext_stroke_text; |
| 808 | dev->super.clip_text = fz_stext_clip_text; |
| 809 | dev->super.clip_stroke_text = fz_stext_clip_stroke_text; |
| 810 | dev->super.ignore_text = fz_stext_ignore_text; |
| 811 | |
| 812 | if (opts && (opts->flags & FZ_STEXT_PRESERVE_IMAGES)) |
| 813 | { |
| 814 | dev->super.fill_shade = fz_stext_fill_shade; |
| 815 | dev->super.fill_image = fz_stext_fill_image; |
| 816 | dev->super.fill_image_mask = fz_stext_fill_image_mask; |
| 817 | } |
| 818 | |
| 819 | if (opts) |
| 820 | dev->flags = opts->flags; |
| 821 | dev->page = page; |
| 822 | dev->pen.x = 0; |
| 823 | dev->pen.y = 0; |
| 824 | dev->trm = fz_identity; |
| 825 | dev->lastchar = ' '; |
| 826 | dev->curdir = 1; |
| 827 | dev->lasttext = NULL; |
| 828 | |
| 829 | return (fz_device*)dev; |
| 830 | } |
| 831 | |