| 1 | #include "mupdf/fitz.h" | 
|---|
| 2 | #include "mupdf/ucdn.h" | 
|---|
| 3 |  | 
|---|
| 4 | #include <math.h> | 
|---|
| 5 | #include <float.h> | 
|---|
| 6 | #include <string.h> | 
|---|
| 7 |  | 
|---|
| 8 | /* Simple layout structure */ | 
|---|
| 9 |  | 
|---|
| 10 | fz_layout_block *fz_new_layout(fz_context *ctx) | 
|---|
| 11 | { | 
|---|
| 12 | fz_pool *pool = fz_new_pool(ctx); | 
|---|
| 13 | fz_layout_block *block; | 
|---|
| 14 | fz_try(ctx) | 
|---|
| 15 | { | 
|---|
| 16 | block = fz_pool_alloc(ctx, pool, sizeof (fz_layout_block)); | 
|---|
| 17 | block->pool = pool; | 
|---|
| 18 | block->head = NULL; | 
|---|
| 19 | block->tailp = &block->head; | 
|---|
| 20 | } | 
|---|
| 21 | fz_catch(ctx) | 
|---|
| 22 | { | 
|---|
| 23 | fz_drop_pool(ctx, pool); | 
|---|
| 24 | fz_rethrow(ctx); | 
|---|
| 25 | } | 
|---|
| 26 | return block; | 
|---|
| 27 | } | 
|---|
| 28 |  | 
|---|
| 29 | void fz_drop_layout(fz_context *ctx, fz_layout_block *block) | 
|---|
| 30 | { | 
|---|
| 31 | if (block) | 
|---|
| 32 | fz_drop_pool(ctx, block->pool); | 
|---|
| 33 | } | 
|---|
| 34 |  | 
|---|
| 35 | void fz_add_layout_line(fz_context *ctx, fz_layout_block *block, float x, float y, float h, const char *p) | 
|---|
| 36 | { | 
|---|
| 37 | fz_layout_line *line = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_line)); | 
|---|
| 38 | line->x = x; | 
|---|
| 39 | line->y = y; | 
|---|
| 40 | line->h = h; | 
|---|
| 41 | line->p = p; | 
|---|
| 42 | line->text = NULL; | 
|---|
| 43 | line->next = NULL; | 
|---|
| 44 | *block->tailp = line; | 
|---|
| 45 | block->tailp = &line->next; | 
|---|
| 46 | block->text_tailp = &line->text; | 
|---|
| 47 | } | 
|---|
| 48 |  | 
|---|
| 49 | void fz_add_layout_char(fz_context *ctx, fz_layout_block *block, float x, float w, const char *p) | 
|---|
| 50 | { | 
|---|
| 51 | fz_layout_char *ch = fz_pool_alloc(ctx, block->pool, sizeof (fz_layout_char)); | 
|---|
| 52 | ch->x = x; | 
|---|
| 53 | ch->w = w; | 
|---|
| 54 | ch->p = p; | 
|---|
| 55 | ch->next = NULL; | 
|---|
| 56 | *block->text_tailp = ch; | 
|---|
| 57 | block->text_tailp = &ch->next; | 
|---|
| 58 | } | 
|---|
| 59 |  | 
|---|
| 60 | /* Extract text into blocks and lines. */ | 
|---|
| 61 |  | 
|---|
/*
	Thresholds used by fz_add_stext_char_imp. Each is multiplied by
	the current text size (the expansion of the glyph matrix) before
	being compared against a distance.
*/

/* Baseline offsets larger than this start a new block rather than just a new line. */
#define PARAGRAPH_DIST 1.5f

/* In-line gaps smaller than this are ignored. */
#define SPACE_DIST 0.15f

/* In-line gaps larger than this split the text onto a new line;
 * forward gaps between SPACE_DIST and this may get a synthetic space. */
#define SPACE_MAX_DIST 0.8f
|---|
| 65 |  | 
|---|
| 66 | typedef struct fz_stext_device_s fz_stext_device; | 
|---|
| 67 |  | 
|---|
| 68 | struct fz_stext_device_s | 
|---|
| 69 | { | 
|---|
| 70 | fz_device super; | 
|---|
| 71 | fz_stext_page *page; | 
|---|
| 72 | fz_point pen, start; | 
|---|
| 73 | fz_matrix trm; | 
|---|
| 74 | int new_obj; | 
|---|
| 75 | int curdir; | 
|---|
| 76 | int lastchar; | 
|---|
| 77 | int flags; | 
|---|
| 78 | int color; | 
|---|
| 79 | const fz_text *lasttext; | 
|---|
| 80 | }; | 
|---|
| 81 |  | 
|---|
/* Help text listing the option names understood by fz_parse_stext_options. */
const char *fz_stext_options_usage =
	"Text output options:\n"
	"\tinhibit-spaces: don't add spaces between gaps in the text\n"
	"\tpreserve-images: keep images in output\n"
	"\tpreserve-ligatures: do not expand ligatures into constituent characters\n"
	"\tpreserve-whitespace: do not convert all whitespace into space characters\n"
	"\n";
|---|
| 89 |  | 
|---|
| 90 | /* | 
|---|
| 91 | Create an empty text page. | 
|---|
| 92 |  | 
|---|
| 93 | The text page is filled out by the text device to contain the blocks | 
|---|
| 94 | and lines of text on the page. | 
|---|
| 95 |  | 
|---|
| 96 | mediabox: optional mediabox information. | 
|---|
| 97 | */ | 
|---|
| 98 | fz_stext_page * | 
|---|
| 99 | fz_new_stext_page(fz_context *ctx, fz_rect mediabox) | 
|---|
| 100 | { | 
|---|
| 101 | fz_pool *pool = fz_new_pool(ctx); | 
|---|
| 102 | fz_stext_page *page = NULL; | 
|---|
| 103 | fz_try(ctx) | 
|---|
| 104 | { | 
|---|
| 105 | page = fz_pool_alloc(ctx, pool, sizeof(*page)); | 
|---|
| 106 | page->pool = pool; | 
|---|
| 107 | page->mediabox = mediabox; | 
|---|
| 108 | page->first_block = NULL; | 
|---|
| 109 | page->last_block = NULL; | 
|---|
| 110 | } | 
|---|
| 111 | fz_catch(ctx) | 
|---|
| 112 | { | 
|---|
| 113 | fz_drop_pool(ctx, pool); | 
|---|
| 114 | fz_rethrow(ctx); | 
|---|
| 115 | } | 
|---|
| 116 | return page; | 
|---|
| 117 | } | 
|---|
| 118 |  | 
|---|
| 119 | void | 
|---|
| 120 | fz_drop_stext_page(fz_context *ctx, fz_stext_page *page) | 
|---|
| 121 | { | 
|---|
| 122 | if (page) | 
|---|
| 123 | { | 
|---|
| 124 | fz_stext_block *block; | 
|---|
| 125 | for (block = page->first_block; block; block = block->next) | 
|---|
| 126 | if (block->type == FZ_STEXT_BLOCK_IMAGE) | 
|---|
| 127 | fz_drop_image(ctx, block->u.i.image); | 
|---|
| 128 | fz_drop_pool(ctx, page->pool); | 
|---|
| 129 | } | 
|---|
| 130 | } | 
|---|
| 131 |  | 
|---|
| 132 | static fz_stext_block * | 
|---|
| 133 | add_block_to_page(fz_context *ctx, fz_stext_page *page) | 
|---|
| 134 | { | 
|---|
| 135 | fz_stext_block *block = fz_pool_alloc(ctx, page->pool, sizeof *page->first_block); | 
|---|
| 136 | block->prev = page->last_block; | 
|---|
| 137 | if (!page->first_block) | 
|---|
| 138 | page->first_block = page->last_block = block; | 
|---|
| 139 | else | 
|---|
| 140 | { | 
|---|
| 141 | page->last_block->next = block; | 
|---|
| 142 | page->last_block = block; | 
|---|
| 143 | } | 
|---|
| 144 | return block; | 
|---|
| 145 | } | 
|---|
| 146 |  | 
|---|
| 147 | static fz_stext_block * | 
|---|
| 148 | add_text_block_to_page(fz_context *ctx, fz_stext_page *page) | 
|---|
| 149 | { | 
|---|
| 150 | fz_stext_block *block = add_block_to_page(ctx, page); | 
|---|
| 151 | block->type = FZ_STEXT_BLOCK_TEXT; | 
|---|
| 152 | return block; | 
|---|
| 153 | } | 
|---|
| 154 |  | 
|---|
| 155 | static fz_stext_block * | 
|---|
| 156 | add_image_block_to_page(fz_context *ctx, fz_stext_page *page, fz_matrix ctm, fz_image *image) | 
|---|
| 157 | { | 
|---|
| 158 | fz_stext_block *block = add_block_to_page(ctx, page); | 
|---|
| 159 | block->type = FZ_STEXT_BLOCK_IMAGE; | 
|---|
| 160 | block->u.i.transform = ctm; | 
|---|
| 161 | block->u.i.image = fz_keep_image(ctx, image); | 
|---|
| 162 | block->bbox = fz_transform_rect(fz_unit_rect, ctm); | 
|---|
| 163 | return block; | 
|---|
| 164 | } | 
|---|
| 165 |  | 
|---|
| 166 | static fz_stext_line * | 
|---|
| 167 | add_line_to_block(fz_context *ctx, fz_stext_page *page, fz_stext_block *block, const fz_point *dir, int wmode) | 
|---|
| 168 | { | 
|---|
| 169 | fz_stext_line *line = fz_pool_alloc(ctx, page->pool, sizeof *block->u.t.first_line); | 
|---|
| 170 | line->prev = block->u.t.last_line; | 
|---|
| 171 | if (!block->u.t.first_line) | 
|---|
| 172 | block->u.t.first_line = block->u.t.last_line = line; | 
|---|
| 173 | else | 
|---|
| 174 | { | 
|---|
| 175 | block->u.t.last_line->next = line; | 
|---|
| 176 | block->u.t.last_line = line; | 
|---|
| 177 | } | 
|---|
| 178 |  | 
|---|
| 179 | line->dir = *dir; | 
|---|
| 180 | line->wmode = wmode; | 
|---|
| 181 |  | 
|---|
| 182 | return line; | 
|---|
| 183 | } | 
|---|
| 184 |  | 
|---|
| 185 | static fz_stext_char * | 
|---|
| 186 | add_char_to_line(fz_context *ctx, fz_stext_page *page, fz_stext_line *line, fz_matrix trm, fz_font *font, float size, int c, fz_point *p, fz_point *q, int color) | 
|---|
| 187 | { | 
|---|
| 188 | fz_stext_char *ch = fz_pool_alloc(ctx, page->pool, sizeof *line->first_char); | 
|---|
| 189 | fz_point a, d; | 
|---|
| 190 |  | 
|---|
| 191 | if (!line->first_char) | 
|---|
| 192 | line->first_char = line->last_char = ch; | 
|---|
| 193 | else | 
|---|
| 194 | { | 
|---|
| 195 | line->last_char->next = ch; | 
|---|
| 196 | line->last_char = ch; | 
|---|
| 197 | } | 
|---|
| 198 |  | 
|---|
| 199 | ch->c = c; | 
|---|
| 200 | ch->color = color; | 
|---|
| 201 | ch->origin = *p; | 
|---|
| 202 | ch->size = size; | 
|---|
| 203 | ch->font = font; /* TODO: keep and drop */ | 
|---|
| 204 |  | 
|---|
| 205 | if (line->wmode == 0) | 
|---|
| 206 | { | 
|---|
| 207 | a.x = 0; | 
|---|
| 208 | d.x = 0; | 
|---|
| 209 | a.y = fz_font_ascender(ctx, font); | 
|---|
| 210 | d.y = fz_font_descender(ctx, font); | 
|---|
| 211 | } | 
|---|
| 212 | else | 
|---|
| 213 | { | 
|---|
| 214 | fz_rect bbox = fz_font_bbox(ctx, font); | 
|---|
| 215 | a.x = bbox.x1; | 
|---|
| 216 | d.x = bbox.x0; | 
|---|
| 217 | a.y = 0; | 
|---|
| 218 | d.y = 0; | 
|---|
| 219 | } | 
|---|
| 220 | a = fz_transform_vector(a, trm); | 
|---|
| 221 | d = fz_transform_vector(d, trm); | 
|---|
| 222 |  | 
|---|
| 223 | ch->quad.ll = fz_make_point(p->x + d.x, p->y + d.y); | 
|---|
| 224 | ch->quad.ul = fz_make_point(p->x + a.x, p->y + a.y); | 
|---|
| 225 | ch->quad.lr = fz_make_point(q->x + d.x, q->y + d.y); | 
|---|
| 226 | ch->quad.ur = fz_make_point(q->x + a.x, q->y + a.y); | 
|---|
| 227 |  | 
|---|
| 228 | return ch; | 
|---|
| 229 | } | 
|---|
| 230 |  | 
|---|
| 231 | static int | 
|---|
| 232 | direction_from_bidi_class(int bidiclass, int curdir) | 
|---|
| 233 | { | 
|---|
| 234 | switch (bidiclass) | 
|---|
| 235 | { | 
|---|
| 236 | /* strong */ | 
|---|
| 237 | case UCDN_BIDI_CLASS_L: return 1; | 
|---|
| 238 | case UCDN_BIDI_CLASS_R: return -1; | 
|---|
| 239 | case UCDN_BIDI_CLASS_AL: return -1; | 
|---|
| 240 |  | 
|---|
| 241 | /* weak */ | 
|---|
| 242 | case UCDN_BIDI_CLASS_EN: | 
|---|
| 243 | case UCDN_BIDI_CLASS_ES: | 
|---|
| 244 | case UCDN_BIDI_CLASS_ET: | 
|---|
| 245 | case UCDN_BIDI_CLASS_AN: | 
|---|
| 246 | case UCDN_BIDI_CLASS_CS: | 
|---|
| 247 | case UCDN_BIDI_CLASS_NSM: | 
|---|
| 248 | case UCDN_BIDI_CLASS_BN: | 
|---|
| 249 | return curdir; | 
|---|
| 250 |  | 
|---|
| 251 | /* neutral */ | 
|---|
| 252 | case UCDN_BIDI_CLASS_B: | 
|---|
| 253 | case UCDN_BIDI_CLASS_S: | 
|---|
| 254 | case UCDN_BIDI_CLASS_WS: | 
|---|
| 255 | case UCDN_BIDI_CLASS_ON: | 
|---|
| 256 | return curdir; | 
|---|
| 257 |  | 
|---|
| 258 | /* embedding, override, pop ... we don't support them */ | 
|---|
| 259 | default: | 
|---|
| 260 | return 0; | 
|---|
| 261 | } | 
|---|
| 262 | } | 
|---|
| 263 |  | 
|---|
| 264 | static float | 
|---|
| 265 | vec_dot(const fz_point *a, const fz_point *b) | 
|---|
| 266 | { | 
|---|
| 267 | return a->x * b->x + a->y * b->y; | 
|---|
| 268 | } | 
|---|
| 269 |  | 
|---|
| 270 | static void | 
|---|
| 271 | fz_add_stext_char_imp(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode) | 
|---|
| 272 | { | 
|---|
| 273 | fz_stext_page *page = dev->page; | 
|---|
| 274 | fz_stext_block *cur_block; | 
|---|
| 275 | fz_stext_line *cur_line; | 
|---|
| 276 |  | 
|---|
| 277 | int new_para = 0; | 
|---|
| 278 | int new_line = 1; | 
|---|
| 279 | int add_space = 0; | 
|---|
| 280 | fz_point dir, ndir, p, q; | 
|---|
| 281 | float size; | 
|---|
| 282 | fz_point delta; | 
|---|
| 283 | float spacing = 0; | 
|---|
| 284 | float base_offset = 0; | 
|---|
| 285 | int rtl = 0; | 
|---|
| 286 |  | 
|---|
| 287 | dev->curdir = direction_from_bidi_class(ucdn_get_bidi_class(c), dev->curdir); | 
|---|
| 288 |  | 
|---|
| 289 | /* dir = direction vector for motion. ndir = normalised(dir) */ | 
|---|
| 290 | if (wmode == 0) | 
|---|
| 291 | { | 
|---|
| 292 | dir.x = 1; | 
|---|
| 293 | dir.y = 0; | 
|---|
| 294 | } | 
|---|
| 295 | else | 
|---|
| 296 | { | 
|---|
| 297 | dir.x = 0; | 
|---|
| 298 | dir.y = -1; | 
|---|
| 299 | } | 
|---|
| 300 | dir = fz_transform_vector(dir, trm); | 
|---|
| 301 | ndir = fz_normalize_vector(dir); | 
|---|
| 302 |  | 
|---|
| 303 | size = fz_matrix_expansion(trm); | 
|---|
| 304 |  | 
|---|
| 305 | /* We need to identify where glyphs 'start' (p) and 'stop' (q). | 
|---|
| 306 | * Each glyph holds its 'start' position, and the next glyph in the | 
|---|
| 307 | * span (or span->max if there is no next glyph) holds its 'end' | 
|---|
| 308 | * position. | 
|---|
| 309 | * | 
|---|
| 310 | * For both horizontal and vertical motion, trm->{e,f} gives the | 
|---|
| 311 | * origin (usually the bottom left) of the glyph. | 
|---|
| 312 | * | 
|---|
| 313 | * In horizontal mode: | 
|---|
| 314 | *   + p is bottom left. | 
|---|
| 315 | *   + q is the bottom right | 
|---|
| 316 | * In vertical mode: | 
|---|
| 317 | *   + p is top left (where it advanced from) | 
|---|
| 318 | *   + q is bottom left | 
|---|
| 319 | */ | 
|---|
| 320 | if (wmode == 0) | 
|---|
| 321 | { | 
|---|
| 322 | p.x = trm.e; | 
|---|
| 323 | p.y = trm.f; | 
|---|
| 324 | q.x = trm.e + adv * dir.x; | 
|---|
| 325 | q.y = trm.f + adv * dir.y; | 
|---|
| 326 | } | 
|---|
| 327 | else | 
|---|
| 328 | { | 
|---|
| 329 | p.x = trm.e - adv * dir.x; | 
|---|
| 330 | p.y = trm.f - adv * dir.y; | 
|---|
| 331 | q.x = trm.e; | 
|---|
| 332 | q.y = trm.f; | 
|---|
| 333 | } | 
|---|
| 334 |  | 
|---|
| 335 | /* Find current position to enter new text. */ | 
|---|
| 336 | cur_block = page->last_block; | 
|---|
| 337 | if (cur_block && cur_block->type != FZ_STEXT_BLOCK_TEXT) | 
|---|
| 338 | cur_block = NULL; | 
|---|
| 339 | cur_line = cur_block ? cur_block->u.t.last_line : NULL; | 
|---|
| 340 |  | 
|---|
| 341 | if (cur_line && glyph < 0) | 
|---|
| 342 | { | 
|---|
| 343 | /* Don't advance pen or break lines for no-glyph characters in a cluster */ | 
|---|
| 344 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, &dev->pen, &dev->pen, dev->color); | 
|---|
| 345 | dev->lastchar = c; | 
|---|
| 346 | return; | 
|---|
| 347 | } | 
|---|
| 348 |  | 
|---|
| 349 | if (cur_line == NULL || cur_line->wmode != wmode || vec_dot(&ndir, &cur_line->dir) < 0.999f) | 
|---|
| 350 | { | 
|---|
| 351 | /* If the matrix has changed rotation, or the wmode is different (or if we don't have a line at all), | 
|---|
| 352 | * then we can't append to the current block/line. */ | 
|---|
| 353 | new_para = 1; | 
|---|
| 354 | new_line = 1; | 
|---|
| 355 | } | 
|---|
| 356 | else | 
|---|
| 357 | { | 
|---|
| 358 | /* Detect fake bold where text is printed twice in the same place. */ | 
|---|
| 359 | delta.x = fabsf(q.x - dev->pen.x); | 
|---|
| 360 | delta.y = fabsf(q.y - dev->pen.y); | 
|---|
| 361 | if (delta.x < FLT_EPSILON && delta.y < FLT_EPSILON && c == dev->lastchar) | 
|---|
| 362 | return; | 
|---|
| 363 |  | 
|---|
| 364 | /* Calculate how far we've moved since the last character. */ | 
|---|
| 365 | delta.x = p.x - dev->pen.x; | 
|---|
| 366 | delta.y = p.y - dev->pen.y; | 
|---|
| 367 |  | 
|---|
| 368 | /* The transform has not changed, so we know we're in the same | 
|---|
| 369 | * direction. Calculate 2 distances; how far off the previous | 
|---|
| 370 | * baseline we are, together with how far along the baseline | 
|---|
| 371 | * we are from the expected position. */ | 
|---|
| 372 | spacing = ndir.x * delta.x + ndir.y * delta.y; | 
|---|
| 373 | base_offset = -ndir.y * delta.x + ndir.x * delta.y; | 
|---|
| 374 |  | 
|---|
| 375 | /* Only a small amount off the baseline - we'll take this */ | 
|---|
| 376 | if (fabsf(base_offset) < size * 0.8f) | 
|---|
| 377 | { | 
|---|
| 378 | /* LTR or neutral character */ | 
|---|
| 379 | if (dev->curdir >= 0) | 
|---|
| 380 | { | 
|---|
| 381 | if (fabsf(spacing) < size * SPACE_DIST) | 
|---|
| 382 | { | 
|---|
| 383 | /* Motion is in line and small enough to ignore. */ | 
|---|
| 384 | new_line = 0; | 
|---|
| 385 | } | 
|---|
| 386 | else if (fabsf(spacing) > size * SPACE_MAX_DIST) | 
|---|
| 387 | { | 
|---|
| 388 | /* Motion is in line and large enough to warrant splitting to a new line */ | 
|---|
| 389 | new_line = 1; | 
|---|
| 390 | } | 
|---|
| 391 | else if (spacing < 0) | 
|---|
| 392 | { | 
|---|
| 393 | /* Motion is backward in line! Ignore this odd spacing. */ | 
|---|
| 394 | new_line = 0; | 
|---|
| 395 | } | 
|---|
| 396 | else | 
|---|
| 397 | { | 
|---|
| 398 | /* Motion is forward in line and large enough to warrant us adding a space. */ | 
|---|
| 399 | if (dev->lastchar != ' ' && wmode == 0) | 
|---|
| 400 | add_space = 1; | 
|---|
| 401 | new_line = 0; | 
|---|
| 402 | } | 
|---|
| 403 | } | 
|---|
| 404 |  | 
|---|
| 405 | /* RTL character -- disable space character and column detection heuristics */ | 
|---|
| 406 | else | 
|---|
| 407 | { | 
|---|
| 408 | new_line = 0; | 
|---|
| 409 | if (spacing > size * SPACE_DIST || spacing < 0) | 
|---|
| 410 | rtl = 0; /* backward (or big jump to 'right' side) means logical order */ | 
|---|
| 411 | else | 
|---|
| 412 | rtl = 1; /* visual order, we need to reverse in a post process pass */ | 
|---|
| 413 | } | 
|---|
| 414 | } | 
|---|
| 415 |  | 
|---|
| 416 | /* Enough for a new line, but not enough for a new paragraph */ | 
|---|
| 417 | else if (fabsf(base_offset) <= size * PARAGRAPH_DIST) | 
|---|
| 418 | { | 
|---|
| 419 | /* Check indent to spot text-indent style paragraphs */ | 
|---|
| 420 | if (wmode == 0 && cur_line && dev->new_obj) | 
|---|
| 421 | if (fabsf(p.x - dev->start.x) > size * 0.5f) | 
|---|
| 422 | new_para = 1; | 
|---|
| 423 | new_line = 1; | 
|---|
| 424 | } | 
|---|
| 425 |  | 
|---|
| 426 | /* Way off the baseline - open a new paragraph */ | 
|---|
| 427 | else | 
|---|
| 428 | { | 
|---|
| 429 | new_para = 1; | 
|---|
| 430 | new_line = 1; | 
|---|
| 431 | } | 
|---|
| 432 | } | 
|---|
| 433 |  | 
|---|
| 434 | /* Start a new block (but only at the beginning of a text object) */ | 
|---|
| 435 | if (new_para || !cur_block) | 
|---|
| 436 | { | 
|---|
| 437 | cur_block = add_text_block_to_page(ctx, page); | 
|---|
| 438 | cur_line = cur_block->u.t.last_line; | 
|---|
| 439 | } | 
|---|
| 440 |  | 
|---|
| 441 | /* Start a new line */ | 
|---|
| 442 | if (new_line || !cur_line) | 
|---|
| 443 | { | 
|---|
| 444 | cur_line = add_line_to_block(ctx, page, cur_block, &ndir, wmode); | 
|---|
| 445 | dev->start = p; | 
|---|
| 446 | } | 
|---|
| 447 |  | 
|---|
| 448 | /* Add synthetic space */ | 
|---|
| 449 | if (add_space && !(dev->flags & FZ_STEXT_INHIBIT_SPACES)) | 
|---|
| 450 | add_char_to_line(ctx, page, cur_line, trm, font, size, ' ', &dev->pen, &p, dev->color); | 
|---|
| 451 |  | 
|---|
| 452 | add_char_to_line(ctx, page, cur_line, trm, font, size, c, &p, &q, dev->color); | 
|---|
| 453 | dev->lastchar = c; | 
|---|
| 454 | dev->pen = q; | 
|---|
| 455 |  | 
|---|
| 456 | dev->new_obj = 0; | 
|---|
| 457 | dev->trm = trm; | 
|---|
| 458 | } | 
|---|
| 459 |  | 
|---|
| 460 | static void | 
|---|
| 461 | fz_add_stext_char(fz_context *ctx, fz_stext_device *dev, fz_font *font, int c, int glyph, fz_matrix trm, float adv, int wmode) | 
|---|
| 462 | { | 
|---|
| 463 | /* ignore when one unicode character maps to multiple glyphs */ | 
|---|
| 464 | if (c == -1) | 
|---|
| 465 | return; | 
|---|
| 466 |  | 
|---|
| 467 | if (!(dev->flags & FZ_STEXT_PRESERVE_LIGATURES)) | 
|---|
| 468 | { | 
|---|
| 469 | switch (c) | 
|---|
| 470 | { | 
|---|
| 471 | case 0xFB00: /* ff */ | 
|---|
| 472 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); | 
|---|
| 473 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); | 
|---|
| 474 | return; | 
|---|
| 475 | case 0xFB01: /* fi */ | 
|---|
| 476 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); | 
|---|
| 477 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); | 
|---|
| 478 | return; | 
|---|
| 479 | case 0xFB02: /* fl */ | 
|---|
| 480 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); | 
|---|
| 481 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); | 
|---|
| 482 | return; | 
|---|
| 483 | case 0xFB03: /* ffi */ | 
|---|
| 484 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); | 
|---|
| 485 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); | 
|---|
| 486 | fz_add_stext_char_imp(ctx, dev, font, 'i', -1, trm, 0, wmode); | 
|---|
| 487 | return; | 
|---|
| 488 | case 0xFB04: /* ffl */ | 
|---|
| 489 | fz_add_stext_char_imp(ctx, dev, font, 'f', glyph, trm, adv, wmode); | 
|---|
| 490 | fz_add_stext_char_imp(ctx, dev, font, 'f', -1, trm, 0, wmode); | 
|---|
| 491 | fz_add_stext_char_imp(ctx, dev, font, 'l', -1, trm, 0, wmode); | 
|---|
| 492 | return; | 
|---|
| 493 | case 0xFB05: /* long st */ | 
|---|
| 494 | case 0xFB06: /* st */ | 
|---|
| 495 | fz_add_stext_char_imp(ctx, dev, font, 's', glyph, trm, adv, wmode); | 
|---|
| 496 | fz_add_stext_char_imp(ctx, dev, font, 't', -1, trm, 0, wmode); | 
|---|
| 497 | return; | 
|---|
| 498 | } | 
|---|
| 499 | } | 
|---|
| 500 |  | 
|---|
| 501 | if (!(dev->flags & FZ_STEXT_PRESERVE_WHITESPACE)) | 
|---|
| 502 | { | 
|---|
| 503 | switch (c) | 
|---|
| 504 | { | 
|---|
| 505 | case 0x0009: /* tab */ | 
|---|
| 506 | case 0x0020: /* space */ | 
|---|
| 507 | case 0x00A0: /* no-break space */ | 
|---|
| 508 | case 0x1680: /* ogham space mark */ | 
|---|
| 509 | case 0x180E: /* mongolian vowel separator */ | 
|---|
| 510 | case 0x2000: /* en quad */ | 
|---|
| 511 | case 0x2001: /* em quad */ | 
|---|
| 512 | case 0x2002: /* en space */ | 
|---|
| 513 | case 0x2003: /* em space */ | 
|---|
| 514 | case 0x2004: /* three-per-em space */ | 
|---|
| 515 | case 0x2005: /* four-per-em space */ | 
|---|
| 516 | case 0x2006: /* six-per-em space */ | 
|---|
| 517 | case 0x2007: /* figure space */ | 
|---|
| 518 | case 0x2008: /* punctuation space */ | 
|---|
| 519 | case 0x2009: /* thin space */ | 
|---|
| 520 | case 0x200A: /* hair space */ | 
|---|
| 521 | case 0x202F: /* narrow no-break space */ | 
|---|
| 522 | case 0x205F: /* medium mathematical space */ | 
|---|
| 523 | case 0x3000: /* ideographic space */ | 
|---|
| 524 | c = ' '; | 
|---|
| 525 | } | 
|---|
| 526 | } | 
|---|
| 527 |  | 
|---|
| 528 | fz_add_stext_char_imp(ctx, dev, font, c, glyph, trm, adv, wmode); | 
|---|
| 529 | } | 
|---|
| 530 |  | 
|---|
| 531 | static void | 
|---|
| 532 | (fz_context *ctx, fz_stext_device *dev, fz_text_span *span, fz_matrix ctm) | 
|---|
| 533 | { | 
|---|
| 534 | fz_font *font = span->font; | 
|---|
| 535 | fz_matrix tm = span->trm; | 
|---|
| 536 | fz_matrix trm; | 
|---|
| 537 | float adv; | 
|---|
| 538 | int i; | 
|---|
| 539 |  | 
|---|
| 540 | if (span->len == 0) | 
|---|
| 541 | return; | 
|---|
| 542 |  | 
|---|
| 543 | tm.e = 0; | 
|---|
| 544 | tm.f = 0; | 
|---|
| 545 | trm = fz_concat(tm, ctm); | 
|---|
| 546 |  | 
|---|
| 547 | for (i = 0; i < span->len; i++) | 
|---|
| 548 | { | 
|---|
| 549 | /* Calculate new pen location and delta */ | 
|---|
| 550 | tm.e = span->items[i].x; | 
|---|
| 551 | tm.f = span->items[i].y; | 
|---|
| 552 | trm = fz_concat(tm, ctm); | 
|---|
| 553 |  | 
|---|
| 554 | /* Calculate bounding box and new pen position based on font metrics */ | 
|---|
| 555 | if (span->items[i].gid >= 0) | 
|---|
| 556 | adv = fz_advance_glyph(ctx, font, span->items[i].gid, span->wmode); | 
|---|
| 557 | else | 
|---|
| 558 | adv = 0; | 
|---|
| 559 |  | 
|---|
| 560 | fz_add_stext_char(ctx, dev, font, span->items[i].ucs, span->items[i].gid, trm, adv, span->wmode); | 
|---|
| 561 | } | 
|---|
| 562 | } | 
|---|
| 563 |  | 
|---|
| 564 | static int hexrgb_from_color(fz_context *ctx, fz_colorspace *colorspace, const float *color) | 
|---|
| 565 | { | 
|---|
| 566 | float rgb[3]; | 
|---|
| 567 | fz_convert_color(ctx, colorspace, color, fz_device_rgb(ctx), rgb, NULL, fz_default_color_params); | 
|---|
| 568 | return | 
|---|
| 569 | (fz_clampi(rgb[0] * 255, 0, 255) << 16) | | 
|---|
| 570 | (fz_clampi(rgb[1] * 255, 0, 255) << 8) | | 
|---|
| 571 | (fz_clampi(rgb[2] * 255, 0, 255)); | 
|---|
| 572 | } | 
|---|
| 573 |  | 
|---|
| 574 | static void | 
|---|
| 575 | fz_stext_fill_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, | 
|---|
| 576 | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) | 
|---|
| 577 | { | 
|---|
| 578 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 579 | fz_text_span *span; | 
|---|
| 580 | if (text == tdev->lasttext) | 
|---|
| 581 | return; | 
|---|
| 582 | tdev->color = hexrgb_from_color(ctx, colorspace, color); | 
|---|
| 583 | tdev->new_obj = 1; | 
|---|
| 584 | for (span = text->head; span; span = span->next) | 
|---|
| 585 | fz_stext_extract(ctx, tdev, span, ctm); | 
|---|
| 586 | fz_drop_text(ctx, tdev->lasttext); | 
|---|
| 587 | tdev->lasttext = fz_keep_text(ctx, text); | 
|---|
| 588 | } | 
|---|
| 589 |  | 
|---|
| 590 | static void | 
|---|
| 591 | fz_stext_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, | 
|---|
| 592 | fz_colorspace *colorspace, const float *color, float alpha, fz_color_params color_params) | 
|---|
| 593 | { | 
|---|
| 594 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 595 | fz_text_span *span; | 
|---|
| 596 | if (text == tdev->lasttext) | 
|---|
| 597 | return; | 
|---|
| 598 | tdev->color = hexrgb_from_color(ctx, colorspace, color); | 
|---|
| 599 | tdev->new_obj = 1; | 
|---|
| 600 | for (span = text->head; span; span = span->next) | 
|---|
| 601 | fz_stext_extract(ctx, tdev, span, ctm); | 
|---|
| 602 | fz_drop_text(ctx, tdev->lasttext); | 
|---|
| 603 | tdev->lasttext = fz_keep_text(ctx, text); | 
|---|
| 604 | } | 
|---|
| 605 |  | 
|---|
| 606 | static void | 
|---|
| 607 | fz_stext_clip_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm, fz_rect scissor) | 
|---|
| 608 | { | 
|---|
| 609 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 610 | fz_text_span *span; | 
|---|
| 611 | if (text == tdev->lasttext) | 
|---|
| 612 | return; | 
|---|
| 613 | tdev->color = 0; | 
|---|
| 614 | tdev->new_obj = 1; | 
|---|
| 615 | for (span = text->head; span; span = span->next) | 
|---|
| 616 | fz_stext_extract(ctx, tdev, span, ctm); | 
|---|
| 617 | fz_drop_text(ctx, tdev->lasttext); | 
|---|
| 618 | tdev->lasttext = fz_keep_text(ctx, text); | 
|---|
| 619 | } | 
|---|
| 620 |  | 
|---|
| 621 | static void | 
|---|
| 622 | fz_stext_clip_stroke_text(fz_context *ctx, fz_device *dev, const fz_text *text, const fz_stroke_state *stroke, fz_matrix ctm, fz_rect scissor) | 
|---|
| 623 | { | 
|---|
| 624 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 625 | fz_text_span *span; | 
|---|
| 626 | if (text == tdev->lasttext) | 
|---|
| 627 | return; | 
|---|
| 628 | tdev->color = 0; | 
|---|
| 629 | tdev->new_obj = 1; | 
|---|
| 630 | for (span = text->head; span; span = span->next) | 
|---|
| 631 | fz_stext_extract(ctx, tdev, span, ctm); | 
|---|
| 632 | fz_drop_text(ctx, tdev->lasttext); | 
|---|
| 633 | tdev->lasttext = fz_keep_text(ctx, text); | 
|---|
| 634 | } | 
|---|
| 635 |  | 
|---|
| 636 | static void | 
|---|
| 637 | fz_stext_ignore_text(fz_context *ctx, fz_device *dev, const fz_text *text, fz_matrix ctm) | 
|---|
| 638 | { | 
|---|
| 639 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 640 | fz_text_span *span; | 
|---|
| 641 | if (text == tdev->lasttext) | 
|---|
| 642 | return; | 
|---|
| 643 | tdev->color = 0; | 
|---|
| 644 | tdev->new_obj = 1; | 
|---|
| 645 | for (span = text->head; span; span = span->next) | 
|---|
| 646 | fz_stext_extract(ctx, tdev, span, ctm); | 
|---|
| 647 | fz_drop_text(ctx, tdev->lasttext); | 
|---|
| 648 | tdev->lasttext = fz_keep_text(ctx, text); | 
|---|
| 649 | } | 
|---|
| 650 |  | 
|---|
| 651 | /* Images and shadings */ | 
|---|
| 652 |  | 
|---|
| 653 | static void | 
|---|
| 654 | fz_stext_fill_image(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, float alpha, fz_color_params color_params) | 
|---|
| 655 | { | 
|---|
| 656 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 657 |  | 
|---|
| 658 | /* If the alpha is less than 50% then it's probably a watermark or effect or something. Skip it. */ | 
|---|
| 659 | if (alpha < 0.5f) | 
|---|
| 660 | return; | 
|---|
| 661 |  | 
|---|
| 662 | add_image_block_to_page(ctx, tdev->page, ctm, img); | 
|---|
| 663 | } | 
|---|
| 664 |  | 
|---|
| 665 | static void | 
|---|
| 666 | fz_stext_fill_image_mask(fz_context *ctx, fz_device *dev, fz_image *img, fz_matrix ctm, | 
|---|
| 667 | fz_colorspace *cspace, const float *color, float alpha, fz_color_params color_params) | 
|---|
| 668 | { | 
|---|
| 669 | fz_stext_fill_image(ctx, dev, img, ctm, alpha, color_params); | 
|---|
| 670 | } | 
|---|
| 671 |  | 
|---|
| 672 | static fz_image * | 
|---|
| 673 | fz_new_image_from_shade(fz_context *ctx, fz_shade *shade, fz_matrix *in_out_ctm, fz_color_params color_params, fz_rect scissor) | 
|---|
| 674 | { | 
|---|
| 675 | fz_matrix ctm = *in_out_ctm; | 
|---|
| 676 | fz_pixmap *pix; | 
|---|
| 677 | fz_image *img = NULL; | 
|---|
| 678 | fz_rect bounds; | 
|---|
| 679 | fz_irect bbox; | 
|---|
| 680 |  | 
|---|
| 681 | bounds = fz_bound_shade(ctx, shade, ctm); | 
|---|
| 682 | bounds = fz_intersect_rect(bounds, scissor); | 
|---|
| 683 | bbox = fz_irect_from_rect(bounds); | 
|---|
| 684 |  | 
|---|
| 685 | pix = fz_new_pixmap_with_bbox(ctx, fz_device_rgb(ctx), bbox, NULL, !shade->use_background); | 
|---|
| 686 | fz_try(ctx) | 
|---|
| 687 | { | 
|---|
| 688 | if (shade->use_background) | 
|---|
| 689 | fz_fill_pixmap_with_color(ctx, pix, shade->colorspace, shade->background, color_params); | 
|---|
| 690 | else | 
|---|
| 691 | fz_clear_pixmap(ctx, pix); | 
|---|
| 692 | fz_paint_shade(ctx, shade, NULL, ctm, pix, color_params, bbox, NULL); | 
|---|
| 693 | img = fz_new_image_from_pixmap(ctx, pix, NULL); | 
|---|
| 694 | } | 
|---|
| 695 | fz_always(ctx) | 
|---|
| 696 | fz_drop_pixmap(ctx, pix); | 
|---|
| 697 | fz_catch(ctx) | 
|---|
| 698 | fz_rethrow(ctx); | 
|---|
| 699 |  | 
|---|
| 700 | in_out_ctm->a = pix->w; | 
|---|
| 701 | in_out_ctm->b = 0; | 
|---|
| 702 | in_out_ctm->c = 0; | 
|---|
| 703 | in_out_ctm->d = pix->h; | 
|---|
| 704 | in_out_ctm->e = pix->x; | 
|---|
| 705 | in_out_ctm->f = pix->y; | 
|---|
| 706 | return img; | 
|---|
| 707 | } | 
|---|
| 708 |  | 
|---|
| 709 | static void | 
|---|
| 710 | fz_stext_fill_shade(fz_context *ctx, fz_device *dev, fz_shade *shade, fz_matrix ctm, float alpha, fz_color_params color_params) | 
|---|
| 711 | { | 
|---|
| 712 | fz_matrix local_ctm = ctm; | 
|---|
| 713 | fz_rect scissor = fz_device_current_scissor(ctx, dev); | 
|---|
| 714 | fz_image *image = fz_new_image_from_shade(ctx, shade, &local_ctm, color_params, scissor); | 
|---|
| 715 | fz_try(ctx) | 
|---|
| 716 | fz_stext_fill_image(ctx, dev, image, local_ctm, alpha, color_params); | 
|---|
| 717 | fz_always(ctx) | 
|---|
| 718 | fz_drop_image(ctx, image); | 
|---|
| 719 | fz_catch(ctx) | 
|---|
| 720 | fz_rethrow(ctx); | 
|---|
| 721 | } | 
|---|
| 722 |  | 
|---|
| 723 | static void | 
|---|
| 724 | fz_stext_close_device(fz_context *ctx, fz_device *dev) | 
|---|
| 725 | { | 
|---|
| 726 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 727 | fz_stext_page *page = tdev->page; | 
|---|
| 728 | fz_stext_block *block; | 
|---|
| 729 | fz_stext_line *line; | 
|---|
| 730 | fz_stext_char *ch; | 
|---|
| 731 |  | 
|---|
| 732 | for (block = page->first_block; block; block = block->next) | 
|---|
| 733 | { | 
|---|
| 734 | if (block->type != FZ_STEXT_BLOCK_TEXT) | 
|---|
| 735 | continue; | 
|---|
| 736 | for (line = block->u.t.first_line; line; line = line->next) | 
|---|
| 737 | { | 
|---|
| 738 | for (ch = line->first_char; ch; ch = ch->next) | 
|---|
| 739 | { | 
|---|
| 740 | fz_rect ch_box = fz_rect_from_quad(ch->quad); | 
|---|
| 741 | if (ch == line->first_char) | 
|---|
| 742 | line->bbox = ch_box; | 
|---|
| 743 | else | 
|---|
| 744 | line->bbox = fz_union_rect(line->bbox, ch_box); | 
|---|
| 745 | } | 
|---|
| 746 | block->bbox = fz_union_rect(block->bbox, line->bbox); | 
|---|
| 747 | } | 
|---|
| 748 | } | 
|---|
| 749 |  | 
|---|
| 750 | /* TODO: smart sorting of blocks and lines in reading order */ | 
|---|
| 751 | /* TODO: unicode NFC normalization */ | 
|---|
| 752 | } | 
|---|
| 753 |  | 
|---|
| 754 | static void | 
|---|
| 755 | fz_stext_drop_device(fz_context *ctx, fz_device *dev) | 
|---|
| 756 | { | 
|---|
| 757 | fz_stext_device *tdev = (fz_stext_device*)dev; | 
|---|
| 758 | fz_drop_text(ctx, tdev->lasttext); | 
|---|
| 759 | } | 
|---|
| 760 |  | 
|---|
| 761 | /* | 
|---|
| 762 | Parse stext device options from a comma separated key-value string. | 
|---|
| 763 | */ | 
|---|
| 764 | fz_stext_options * | 
|---|
| 765 | fz_parse_stext_options(fz_context *ctx, fz_stext_options *opts, const char *string) | 
|---|
| 766 | { | 
|---|
| 767 | const char *val; | 
|---|
| 768 |  | 
|---|
| 769 | memset(opts, 0, sizeof *opts); | 
|---|
| 770 |  | 
|---|
| 771 | if (fz_has_option(ctx, string, "preserve-ligatures", &val) && fz_option_eq(val, "yes")) | 
|---|
| 772 | opts->flags |= FZ_STEXT_PRESERVE_LIGATURES; | 
|---|
| 773 | if (fz_has_option(ctx, string, "preserve-whitespace", &val) && fz_option_eq(val, "yes")) | 
|---|
| 774 | opts->flags |= FZ_STEXT_PRESERVE_WHITESPACE; | 
|---|
| 775 | if (fz_has_option(ctx, string, "preserve-images", &val) && fz_option_eq(val, "yes")) | 
|---|
| 776 | opts->flags |= FZ_STEXT_PRESERVE_IMAGES; | 
|---|
| 777 | if (fz_has_option(ctx, string, "inhibit-spaces", &val) && fz_option_eq(val, "yes")) | 
|---|
| 778 | opts->flags |= FZ_STEXT_INHIBIT_SPACES; | 
|---|
| 779 |  | 
|---|
| 780 | return opts; | 
|---|
| 781 | } | 
|---|
| 782 |  | 
|---|
| 783 | /* | 
|---|
| 784 | Create a device to extract the text on a page. | 
|---|
| 785 |  | 
|---|
| 786 | Gather the text on a page into blocks and lines. | 
|---|
| 787 |  | 
|---|
| 788 | The reading order is taken from the order the text is drawn in the | 
|---|
| 789 | source file, so may not be accurate. | 
|---|
| 790 |  | 
|---|
| 791 | page: The text page to which content should be added. This will | 
|---|
| 792 | usually be a newly created (empty) text page, but it can be one | 
|---|
| 793 | containing data already (for example when merging multiple pages, | 
|---|
| 794 | or watermarking). | 
|---|
| 795 |  | 
|---|
| 796 | options: Options to configure the stext device. | 
|---|
| 797 | */ | 
|---|
| 798 | fz_device * | 
|---|
| 799 | fz_new_stext_device(fz_context *ctx, fz_stext_page *page, const fz_stext_options *opts) | 
|---|
| 800 | { | 
|---|
| 801 | fz_stext_device *dev = fz_new_derived_device(ctx, fz_stext_device); | 
|---|
| 802 |  | 
|---|
| 803 | dev->super.close_device = fz_stext_close_device; | 
|---|
| 804 | dev->super.drop_device = fz_stext_drop_device; | 
|---|
| 805 |  | 
|---|
| 806 | dev->super.fill_text = fz_stext_fill_text; | 
|---|
| 807 | dev->super.stroke_text = fz_stext_stroke_text; | 
|---|
| 808 | dev->super.clip_text = fz_stext_clip_text; | 
|---|
| 809 | dev->super.clip_stroke_text = fz_stext_clip_stroke_text; | 
|---|
| 810 | dev->super.ignore_text = fz_stext_ignore_text; | 
|---|
| 811 |  | 
|---|
| 812 | if (opts && (opts->flags & FZ_STEXT_PRESERVE_IMAGES)) | 
|---|
| 813 | { | 
|---|
| 814 | dev->super.fill_shade = fz_stext_fill_shade; | 
|---|
| 815 | dev->super.fill_image = fz_stext_fill_image; | 
|---|
| 816 | dev->super.fill_image_mask = fz_stext_fill_image_mask; | 
|---|
| 817 | } | 
|---|
| 818 |  | 
|---|
| 819 | if (opts) | 
|---|
| 820 | dev->flags = opts->flags; | 
|---|
| 821 | dev->page = page; | 
|---|
| 822 | dev->pen.x = 0; | 
|---|
| 823 | dev->pen.y = 0; | 
|---|
| 824 | dev->trm = fz_identity; | 
|---|
| 825 | dev->lastchar = ' '; | 
|---|
| 826 | dev->curdir = 1; | 
|---|
| 827 | dev->lasttext = NULL; | 
|---|
| 828 |  | 
|---|
| 829 | return (fz_device*)dev; | 
|---|
| 830 | } | 
|---|
| 831 |  | 
|---|