| 1 | #include "mupdf/fitz.h" |
| 2 | #include "mupdf/ucdn.h" |
| 3 | #include "html-imp.h" |
| 4 | |
| 5 | #include <string.h> |
| 6 | #include <stdio.h> |
| 7 | |
/* Four small index constants (0..3). NOTE(review): presumably the top/right/bottom/left
 * sides of a box edge array; no use is visible in this part of the file -- confirm. */
enum { T, R, B, L };

/* Direction used for an element whose "dir" attribute has an unrecognised value. */
#define DEFAULT_DIR FZ_BIDI_LTR
| 11 | |
/*
 * Built-in CSS rules for HTML elements, stored as one concatenated string.
 * NOTE(review): presumably parsed ahead of any document stylesheet; the code
 * that consumes this string is not in this part of the file.
 */
static const char *html_default_css =
"@page{margin:3em 2em}"
"a{color:#06C;text-decoration:underline}"
"address{display:block;font-style:italic}"
"b{font-weight:bold}"
"bdo{direction:rtl;unicode-bidi:bidi-override}"
"blockquote{display:block;margin:1em 40px}"
"body{display:block;margin:1em}"
"cite{font-style:italic}"
"code{font-family:monospace}"
"dd{display:block;margin:0 0 0 40px}"
"del{text-decoration:line-through}"
"div{display:block}"
"dl{display:block;margin:1em 0}"
"dt{display:block}"
"em{font-style:italic}"
"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
"head{display:none}"
"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
"html{display:block}"
"i{font-style:italic}"
"ins{text-decoration:underline}"
"kbd{font-family:monospace}"
"li{display:list-item}"
"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
"p{display:block;margin:1em 0}"
"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
"samp{font-family:monospace}"
"script{display:none}"
"small{font-size:0.83em}"
"strong{font-weight:bold}"
"style{display:none}"
"sub{font-size:0.83em;vertical-align:sub}"
"sup{font-size:0.83em;vertical-align:super}"
"table{display:table}"
"tbody{display:table-row-group}"
"td{display:table-cell;padding:1px}"
"tfoot{display:table-footer-group}"
"th{display:table-cell;font-weight:bold;padding:1px;text-align:center}"
"thead{display:table-header-group}"
"tr{display:table-row}"
"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ul ul{list-style-type:circle}"
"ul ul ul{list-style-type:square}"
"var{font-style:italic}"
"svg{display:none}"
;
| 65 | |
/*
 * Built-in CSS rules for FictionBook (FB2) elements, stored as one
 * concatenated string.
 * NOTE(review): presumably the FB2 counterpart of html_default_css; the code
 * that consumes this string is not in this part of the file.
 */
static const char *fb2_default_css =
"@page{margin:3em 2em}"
"FictionBook{display:block;margin:1em}"
"stylesheet,binary{display:none}"
"description>*{display:none}"
"description>title-info{display:block}"
"description>title-info>*{display:none}"
"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
"image{display:block}"
"p>image{display:inline}"
"table{display:table}"
"tr{display:table-row}"
"th,td{display:table-cell}"
"a{color:#06C;text-decoration:underline}"
"a[type=note]{font-size:small;vertical-align:super}"
"code{white-space:pre;font-family:monospace}"
"emphasis{font-style:italic}"
"strikethrough{text-decoration:line-through}"
"strong{font-weight:bold}"
"sub{font-size:small;vertical-align:sub}"
"sup{font-size:small;vertical-align:super}"
"image{margin:1em 0;text-align:center}"
"cite,poem{margin:1em 2em}"
"subtitle,epigraph,stanza{margin:1em 0}"
"title>p{text-align:center;font-size:x-large}"
"subtitle{text-align:center;font-size:large}"
"p{margin-top:1em;text-align:justify}"
"empty-line{padding-top:1em}"
"p+p{margin-top:0;text-indent:1.5em}"
"empty-line+p{margin-top:0}"
"section>title{page-break-before:always}"
;
| 99 | |
/* State threaded through the box and flow generation functions in this file. */
struct genstate
{
	fz_pool *pool; /* pool from which boxes, flow nodes and their strings are allocated */
	fz_html_font_set *set; /* handed to fz_apply_css_style when a box is styled */
	fz_archive *zip; /* archive from which image files are read */
	fz_tree *images; /* images looked up by id for FB2 image references ("#id") */
	int is_fb2; /* non-zero selects the FB2-specific handling of images, headings and links */
	const char *base_uri; /* prefix joined with relative image paths */
	fz_css *css; /* rules matched against each element */
	int at_bol; /* non-zero right after a line break or a new flow; flush_space then drops pending white space */
	int emit_white; /* non-zero when collapsed white space is pending; consumed by flush_space */
	int last_brk_cls; /* line break class of the previous character; row index into pairbrk */
};
| 113 | |
/* Return 1 if c is a space, tab, carriage return or line feed; 0 otherwise. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
| 118 | |
/*
 * Return 1 if every character of the NUL-terminated string s satisfies
 * iswhite (an empty string therefore yields 1), otherwise 0.
 */
static int is_all_white(const char *s)
{
	const char *p;

	for (p = s; *p != 0; ++p)
	{
		if (!iswhite(*p))
			return 0;
	}
	return 1;
}
| 129 | |
| 130 | /* TODO: pool allocator for flow nodes */ |
| 131 | /* TODO: store text by pointing to a giant buffer */ |
| 132 | |
| 133 | static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow) |
| 134 | { |
| 135 | while (flow) |
| 136 | { |
| 137 | fz_html_flow *next = flow->next; |
| 138 | if (flow->type == FLOW_IMAGE) |
| 139 | fz_drop_image(ctx, flow->content.image); |
| 140 | flow = next; |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type) |
| 145 | { |
| 146 | fz_html_flow *flow = fz_pool_alloc(ctx, pool, sizeof *flow); |
| 147 | flow->type = type; |
| 148 | flow->expand = 0; |
| 149 | flow->bidi_level = 0; |
| 150 | flow->markup_lang = 0; |
| 151 | flow->breaks_line = 0; |
| 152 | flow->box = inline_box; |
| 153 | *top->flow_tail = flow; |
| 154 | top->flow_tail = &flow->next; |
| 155 | return flow; |
| 156 | } |
| 157 | |
| 158 | static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
| 159 | { |
| 160 | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE); |
| 161 | flow->expand = 1; |
| 162 | } |
| 163 | |
| 164 | static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
| 165 | { |
| 166 | (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK); |
| 167 | } |
| 168 | |
| 169 | static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
| 170 | { |
| 171 | (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK); |
| 172 | } |
| 173 | |
| 174 | static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
| 175 | { |
| 176 | (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN); |
| 177 | } |
| 178 | |
| 179 | static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang) |
| 180 | { |
| 181 | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD); |
| 182 | flow->content.text = fz_pool_alloc(ctx, pool, b - a + 1); |
| 183 | memcpy(flow->content.text, a, b - a); |
| 184 | flow->content.text[b - a] = 0; |
| 185 | flow->markup_lang = lang; |
| 186 | } |
| 187 | |
| 188 | static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img) |
| 189 | { |
| 190 | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE); |
| 191 | flow->content.image = fz_keep_image(ctx, img); |
| 192 | } |
| 193 | |
| 194 | static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
| 195 | { |
| 196 | (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR); |
| 197 | } |
| 198 | |
| 199 | static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) |
| 200 | { |
| 201 | fz_html_flow *new_flow; |
| 202 | char *text; |
| 203 | size_t len; |
| 204 | |
| 205 | if (offset == 0) |
| 206 | return flow; |
| 207 | new_flow = fz_pool_alloc(ctx, pool, sizeof *flow); |
| 208 | *new_flow = *flow; |
| 209 | new_flow->next = flow->next; |
| 210 | flow->next = new_flow; |
| 211 | |
| 212 | text = flow->content.text; |
| 213 | while (*text && offset) |
| 214 | { |
| 215 | int rune; |
| 216 | text += fz_chartorune(&rune, text); |
| 217 | offset--; |
| 218 | } |
| 219 | len = strlen(text); |
| 220 | new_flow->content.text = fz_pool_alloc(ctx, pool, len+1); |
| 221 | strcpy(new_flow->content.text, text); |
| 222 | *text = 0; |
| 223 | return new_flow; |
| 224 | } |
| 225 | |
| 226 | static void flush_space(fz_context *ctx, fz_html_box *flow, fz_html_box *inline_box, int lang, struct genstate *g) |
| 227 | { |
| 228 | static const char *space = " " ; |
| 229 | int bsp = inline_box->style.white_space & WS_ALLOW_BREAK_SPACE; |
| 230 | fz_pool *pool = g->pool; |
| 231 | if (g->emit_white) |
| 232 | { |
| 233 | if (!g->at_bol) |
| 234 | { |
| 235 | if (bsp) |
| 236 | add_flow_space(ctx, pool, flow, inline_box); |
| 237 | else |
| 238 | add_flow_word(ctx, pool, flow, inline_box, space, space+1, lang); |
| 239 | } |
| 240 | g->emit_white = 0; |
| 241 | } |
| 242 | } |
| 243 | |
/*
 * Pair-wise lookup table for UAX#14 line breaks.
 *
 * As used by generate_text: the row is the line break class of the previous
 * character, the column is the class of the current character. An entry of
 * '_' makes generate_text insert a soft break between the two; every other
 * entry is treated there as "no break". The two comment lines below label
 * the columns; read each two-letter class name vertically.
 */
static const char *pairbrk[29] =
{
/*	-OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */
/*	-PLPULSXYSROULLDNYAB2WMJ23LVTI- */
	"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^", /* OP open punctuation */
	"_^^%%^^^^%%_____%%__^^^______", /* CL close punctuation */
	"_^^%%^^^^%%%%%__%%__^^^______", /* CP close parenthesis */
	"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* QU quotation */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* GL non-breaking glue */
	"_^^%%%^^^_______%%__^^^______", /* NS nonstarters */
	"_^^%%%^^^______%%%__^^^______", /* EX exclamation/interrogation */
	"_^^%%%^^^__%_%__%%__^^^______", /* SY symbols allowing break after */
	"_^^%%%^^^__%%%__%%__^^^______", /* IS infix numeric separator */
	"%^^%%%^^^__%%%%_%%__^^^%%%%%_", /* PR prefix numeric */
	"%^^%%%^^^__%%%__%%__^^^______", /* PO postfix numeric */
	"%^^%%%^^^%%%%%_%%%__^^^______", /* NU numeric */
	"%^^%%%^^^__%%%_%%%__^^^______", /* AL ordinary alphabetic and symbol characters */
	"%^^%%%^^^__%%%_%%%__^^^______", /* HL hebrew letter */
	"_^^%%%^^^_%____%%%__^^^______", /* ID ideographic */
	"_^^%%%^^^______%%%__^^^______", /* IN inseparable characters */
	"_^^%_%^^^__%____%%__^^^______", /* HY hyphens */
	"_^^%_%^^^_______%%__^^^______", /* BA break after */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* BB break before */
	"_^^%%%^^^_______%%_^^^^______", /* B2 break opportunity before and after */
	"____________________^________", /* ZW zero width space */
	"%^^%%%^^^__%%%_%%%__^^^______", /* CM combining mark */
	"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%", /* WJ word joiner */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* H2 hangul leading/vowel syllable */
	"_^^%%%^^^_%____%%%__^^^____%_", /* H3 hangul leading/vowel/trailing syllable */
	"_^^%%%^^^_%____%%%__^^^%%%%__", /* JL hangul leading jamo */
	"_^^%%%^^^_%____%%%__^^^___%%_", /* JV hangul vowel jamo */
	"_^^%%%^^^_%____%%%__^^^____%_", /* JT hangul trailing jamo */
	"_^^%%%^^^_______%%__^^^_____%", /* RI regional indicator */
};
| 279 | |
| 280 | static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g) |
| 281 | { |
| 282 | fz_html_box *flow; |
| 283 | fz_pool *pool = g->pool; |
| 284 | int collapse = box->style.white_space & WS_COLLAPSE; |
| 285 | int bsp = box->style.white_space & WS_ALLOW_BREAK_SPACE; |
| 286 | int bnl = box->style.white_space & WS_FORCE_BREAK_NEWLINE; |
| 287 | |
| 288 | static const char *space = " " ; |
| 289 | |
| 290 | flow = box; |
| 291 | while (flow->type != BOX_FLOW) |
| 292 | flow = flow->up; |
| 293 | |
| 294 | while (*text) |
| 295 | { |
| 296 | if (bnl && (*text == '\n' || *text == '\r')) |
| 297 | { |
| 298 | if (text[0] == '\r' && text[1] == '\n') |
| 299 | text += 2; |
| 300 | else |
| 301 | text += 1; |
| 302 | add_flow_break(ctx, pool, flow, box); |
| 303 | g->at_bol = 1; |
| 304 | } |
| 305 | else if (iswhite(*text)) |
| 306 | { |
| 307 | if (collapse) |
| 308 | { |
| 309 | if (bnl) |
| 310 | while (*text == ' ' || *text == '\t') |
| 311 | ++text; |
| 312 | else |
| 313 | while (iswhite(*text)) |
| 314 | ++text; |
| 315 | g->emit_white = 1; |
| 316 | } |
| 317 | else |
| 318 | { |
| 319 | // TODO: tabs |
| 320 | if (bsp) |
| 321 | add_flow_space(ctx, pool, flow, box); |
| 322 | else |
| 323 | add_flow_word(ctx, pool, flow, box, space, space+1, lang); |
| 324 | ++text; |
| 325 | } |
| 326 | g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */ |
| 327 | } |
| 328 | else |
| 329 | { |
| 330 | const char *prev, *mark = text; |
| 331 | int c; |
| 332 | |
| 333 | flush_space(ctx, flow, box, lang, g); |
| 334 | |
| 335 | if (g->at_bol) |
| 336 | g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; |
| 337 | |
| 338 | while (*text && !iswhite(*text)) |
| 339 | { |
| 340 | prev = text; |
| 341 | text += fz_chartorune(&c, text); |
| 342 | if (c == 0xAD) /* soft hyphen */ |
| 343 | { |
| 344 | if (mark != prev) |
| 345 | add_flow_word(ctx, pool, flow, box, mark, prev, lang); |
| 346 | add_flow_shyphen(ctx, pool, flow, box); |
| 347 | mark = text; |
| 348 | g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */ |
| 349 | } |
| 350 | else if (bsp) /* allow soft breaks */ |
| 351 | { |
| 352 | int this_brk_cls = ucdn_get_resolved_linebreak_class(c); |
| 353 | if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI) |
| 354 | { |
| 355 | int brk = pairbrk[g->last_brk_cls][this_brk_cls]; |
| 356 | |
| 357 | /* we handle spaces elsewhere, so ignore these classes */ |
| 358 | if (brk == '@') brk = '^'; |
| 359 | if (brk == '#') brk = '^'; |
| 360 | if (brk == '%') brk = '^'; |
| 361 | |
| 362 | if (brk == '_') |
| 363 | { |
| 364 | if (mark != prev) |
| 365 | add_flow_word(ctx, pool, flow, box, mark, prev, lang); |
| 366 | add_flow_sbreak(ctx, pool, flow, box); |
| 367 | mark = prev; |
| 368 | } |
| 369 | |
| 370 | g->last_brk_cls = this_brk_cls; |
| 371 | } |
| 372 | } |
| 373 | } |
| 374 | if (mark != text) |
| 375 | add_flow_word(ctx, pool, flow, box, mark, text, lang); |
| 376 | |
| 377 | g->at_bol = 0; |
| 378 | } |
| 379 | } |
| 380 | } |
| 381 | |
| 382 | static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src) |
| 383 | { |
| 384 | char path[2048]; |
| 385 | fz_image *img = NULL; |
| 386 | fz_buffer *buf = NULL; |
| 387 | |
| 388 | fz_var(img); |
| 389 | fz_var(buf); |
| 390 | |
| 391 | fz_try(ctx) |
| 392 | { |
| 393 | if (!strncmp(src, "data:image/jpeg;base64," , 23)) |
| 394 | buf = fz_new_buffer_from_base64(ctx, src+23, 0); |
| 395 | else if (!strncmp(src, "data:image/png;base64," , 22)) |
| 396 | buf = fz_new_buffer_from_base64(ctx, src+22, 0); |
| 397 | else |
| 398 | { |
| 399 | fz_strlcpy(path, base_uri, sizeof path); |
| 400 | fz_strlcat(path, "/" , sizeof path); |
| 401 | fz_strlcat(path, src, sizeof path); |
| 402 | fz_urldecode(path); |
| 403 | fz_cleanname(path); |
| 404 | buf = fz_read_archive_entry(ctx, zip, path); |
| 405 | } |
| 406 | #if FZ_ENABLE_SVG |
| 407 | if (strstr(src, ".svg" )) |
| 408 | img = fz_new_image_from_svg(ctx, buf, base_uri, zip); |
| 409 | else |
| 410 | #endif |
| 411 | img = fz_new_image_from_buffer(ctx, buf); |
| 412 | } |
| 413 | fz_always(ctx) |
| 414 | fz_drop_buffer(ctx, buf); |
| 415 | fz_catch(ctx) |
| 416 | fz_warn(ctx, "html: cannot load image src='%s'" , src); |
| 417 | |
| 418 | return img; |
| 419 | } |
| 420 | |
| 421 | static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_xml *xml) |
| 422 | { |
| 423 | fz_image *img = NULL; |
| 424 | fz_try(ctx) |
| 425 | img = fz_new_image_from_svg_xml(ctx, xml, base_uri, zip); |
| 426 | fz_catch(ctx) |
| 427 | fz_warn(ctx, "html: cannot load embedded svg document" ); |
| 428 | return img; |
| 429 | } |
| 430 | |
| 431 | static void generate_anchor(fz_context *ctx, fz_html_box *box, struct genstate *g) |
| 432 | { |
| 433 | fz_pool *pool = g->pool; |
| 434 | fz_html_box *flow = box; |
| 435 | while (flow->type != BOX_FLOW) |
| 436 | flow = flow->up; |
| 437 | add_flow_anchor(ctx, pool, flow, box); |
| 438 | } |
| 439 | |
| 440 | static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g) |
| 441 | { |
| 442 | fz_html_box *flow = box; |
| 443 | fz_pool *pool = g->pool; |
| 444 | while (flow->type != BOX_FLOW) |
| 445 | flow = flow->up; |
| 446 | |
| 447 | flush_space(ctx, flow, box, 0, g); |
| 448 | |
| 449 | if (!img) |
| 450 | { |
| 451 | const char *alt = "[image]" ; |
| 452 | add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0); |
| 453 | } |
| 454 | else |
| 455 | { |
| 456 | fz_try(ctx) |
| 457 | { |
| 458 | add_flow_sbreak(ctx, pool, flow, box); |
| 459 | add_flow_image(ctx, pool, flow, box, img); |
| 460 | add_flow_sbreak(ctx, pool, flow, box); |
| 461 | } |
| 462 | fz_always(ctx) |
| 463 | { |
| 464 | fz_drop_image(ctx, img); |
| 465 | } |
| 466 | fz_catch(ctx) |
| 467 | fz_rethrow(ctx); |
| 468 | } |
| 469 | |
| 470 | g->at_bol = 0; |
| 471 | } |
| 472 | |
| 473 | static void init_box(fz_context *ctx, fz_html_box *box, fz_bidi_direction markup_dir) |
| 474 | { |
| 475 | box->type = BOX_BLOCK; |
| 476 | box->x = box->y = 0; |
| 477 | box->w = box->b = 0; |
| 478 | |
| 479 | box->up = NULL; |
| 480 | box->last = NULL; |
| 481 | box->down = NULL; |
| 482 | box->next = NULL; |
| 483 | |
| 484 | box->flow_head = NULL; |
| 485 | box->flow_tail = &box->flow_head; |
| 486 | box->markup_dir = markup_dir; |
| 487 | |
| 488 | fz_default_css_style(ctx, &box->style); |
| 489 | } |
| 490 | |
| 491 | static void fz_drop_html_box(fz_context *ctx, fz_html_box *box) |
| 492 | { |
| 493 | while (box) |
| 494 | { |
| 495 | fz_html_box *next = box->next; |
| 496 | fz_drop_html_flow(ctx, box->flow_head); |
| 497 | fz_drop_html_box(ctx, box->down); |
| 498 | box = next; |
| 499 | } |
| 500 | } |
| 501 | |
| 502 | void fz_drop_html(fz_context *ctx, fz_html *html) |
| 503 | { |
| 504 | if (html) |
| 505 | { |
| 506 | fz_drop_html_box(ctx, html->root); |
| 507 | fz_drop_pool(ctx, html->pool); |
| 508 | } |
| 509 | } |
| 510 | |
| 511 | static fz_html_box *new_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir) |
| 512 | { |
| 513 | fz_html_box *box = fz_pool_alloc(ctx, pool, sizeof *box); |
| 514 | init_box(ctx, box, markup_dir); |
| 515 | return box; |
| 516 | } |
| 517 | |
| 518 | static void insert_box(fz_context *ctx, fz_html_box *box, int type, fz_html_box *top) |
| 519 | { |
| 520 | box->type = type; |
| 521 | |
| 522 | box->up = top; |
| 523 | |
| 524 | if (top) |
| 525 | { |
| 526 | if (!top->last) |
| 527 | { |
| 528 | top->down = top->last = box; |
| 529 | } |
| 530 | else |
| 531 | { |
| 532 | top->last->next = box; |
| 533 | top->last = box; |
| 534 | } |
| 535 | } |
| 536 | } |
| 537 | |
| 538 | static fz_html_box *insert_block_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
| 539 | { |
| 540 | if (top->type == BOX_BLOCK) |
| 541 | { |
| 542 | insert_box(ctx, box, BOX_BLOCK, top); |
| 543 | } |
| 544 | else if (top->type == BOX_FLOW) |
| 545 | { |
| 546 | while (top->type != BOX_BLOCK) |
| 547 | top = top->up; |
| 548 | insert_box(ctx, box, BOX_BLOCK, top); |
| 549 | } |
| 550 | else if (top->type == BOX_INLINE) |
| 551 | { |
| 552 | while (top->type != BOX_BLOCK) |
| 553 | top = top->up; |
| 554 | insert_box(ctx, box, BOX_BLOCK, top); |
| 555 | } |
| 556 | return top; |
| 557 | } |
| 558 | |
| 559 | static fz_html_box *insert_table_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
| 560 | { |
| 561 | top = insert_block_box(ctx, box, top); |
| 562 | box->type = BOX_TABLE; |
| 563 | return top; |
| 564 | } |
| 565 | |
| 566 | static fz_html_box *insert_table_row_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
| 567 | { |
| 568 | fz_html_box *table = top; |
| 569 | while (table && table->type != BOX_TABLE) |
| 570 | table = table->up; |
| 571 | if (table) |
| 572 | { |
| 573 | insert_box(ctx, box, BOX_TABLE_ROW, table); |
| 574 | return table; |
| 575 | } |
| 576 | fz_warn(ctx, "table-row not inside table element" ); |
| 577 | insert_block_box(ctx, box, top); |
| 578 | return top; |
| 579 | } |
| 580 | |
| 581 | static fz_html_box *insert_table_cell_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
| 582 | { |
| 583 | fz_html_box *tr = top; |
| 584 | while (tr && tr->type != BOX_TABLE_ROW) |
| 585 | tr = tr->up; |
| 586 | if (tr) |
| 587 | { |
| 588 | insert_box(ctx, box, BOX_TABLE_CELL, tr); |
| 589 | return tr; |
| 590 | } |
| 591 | fz_warn(ctx, "table-cell not inside table-row element" ); |
| 592 | insert_block_box(ctx, box, top); |
| 593 | return top; |
| 594 | } |
| 595 | |
| 596 | static fz_html_box *insert_break_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
| 597 | { |
| 598 | if (top->type == BOX_BLOCK) |
| 599 | { |
| 600 | insert_box(ctx, box, BOX_BREAK, top); |
| 601 | } |
| 602 | else if (top->type == BOX_FLOW) |
| 603 | { |
| 604 | while (top->type != BOX_BLOCK) |
| 605 | top = top->up; |
| 606 | insert_box(ctx, box, BOX_BREAK, top); |
| 607 | } |
| 608 | else if (top->type == BOX_INLINE) |
| 609 | { |
| 610 | while (top->type != BOX_BLOCK) |
| 611 | top = top->up; |
| 612 | insert_box(ctx, box, BOX_BREAK, top); |
| 613 | } |
| 614 | return top; |
| 615 | } |
| 616 | |
| 617 | static void insert_inline_box(fz_context *ctx, fz_html_box *box, fz_html_box *top, int markup_dir, struct genstate *g) |
| 618 | { |
| 619 | if (top->type == BOX_FLOW || top->type == BOX_INLINE) |
| 620 | { |
| 621 | insert_box(ctx, box, BOX_INLINE, top); |
| 622 | } |
| 623 | else |
| 624 | { |
| 625 | while (top->type != BOX_BLOCK && top->type != BOX_TABLE_CELL) |
| 626 | top = top->up; |
| 627 | |
| 628 | if (top->last && top->last->type == BOX_FLOW) |
| 629 | { |
| 630 | insert_box(ctx, box, BOX_INLINE, top->last); |
| 631 | } |
| 632 | else |
| 633 | { |
| 634 | fz_html_box *flow = new_box(ctx, g->pool, markup_dir); |
| 635 | flow->is_first_flow = !top->last; |
| 636 | insert_box(ctx, flow, BOX_FLOW, top); |
| 637 | insert_box(ctx, box, BOX_INLINE, flow); |
| 638 | g->at_bol = 1; |
| 639 | } |
| 640 | } |
| 641 | } |
| 642 | |
| 643 | static fz_html_box * |
| 644 | generate_boxes(fz_context *ctx, |
| 645 | fz_xml *node, |
| 646 | fz_html_box *top, |
| 647 | fz_css_match *up_match, |
| 648 | int list_counter, |
| 649 | int section_depth, |
| 650 | int markup_dir, |
| 651 | int markup_lang, |
| 652 | struct genstate *g) |
| 653 | { |
| 654 | fz_css_match match; |
| 655 | fz_html_box *box, *last_top; |
| 656 | const char *tag; |
| 657 | int display; |
| 658 | |
| 659 | while (node) |
| 660 | { |
| 661 | match.up = up_match; |
| 662 | match.count = 0; |
| 663 | |
| 664 | tag = fz_xml_tag(node); |
| 665 | if (tag) |
| 666 | { |
| 667 | fz_match_css(ctx, &match, g->css, node); |
| 668 | |
| 669 | display = fz_get_css_match_display(&match); |
| 670 | |
| 671 | if (tag[0]=='b' && tag[1]=='r' && tag[2]==0) |
| 672 | { |
| 673 | if (top->type == BOX_INLINE) |
| 674 | { |
| 675 | fz_html_box *flow = top; |
| 676 | while (flow->type != BOX_FLOW) |
| 677 | flow = flow->up; |
| 678 | add_flow_break(ctx, g->pool, flow, top); |
| 679 | } |
| 680 | else |
| 681 | { |
| 682 | box = new_box(ctx, g->pool, markup_dir); |
| 683 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
| 684 | top = insert_break_box(ctx, box, top); |
| 685 | } |
| 686 | g->at_bol = 1; |
| 687 | } |
| 688 | |
| 689 | else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0) |
| 690 | { |
| 691 | const char *src = fz_xml_att(node, "src" ); |
| 692 | if (src) |
| 693 | { |
| 694 | int w, h; |
| 695 | const char *w_att = fz_xml_att(node, "width" ); |
| 696 | const char *h_att = fz_xml_att(node, "height" ); |
| 697 | box = new_box(ctx, g->pool, markup_dir); |
| 698 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
| 699 | if (w_att && (w = fz_atoi(w_att)) > 0) |
| 700 | { |
| 701 | box->style.width.value = w; |
| 702 | box->style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH; |
| 703 | } |
| 704 | if (h_att && (h = fz_atoi(h_att)) > 0) |
| 705 | { |
| 706 | box->style.height.value = h; |
| 707 | box->style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH; |
| 708 | } |
| 709 | insert_inline_box(ctx, box, top, markup_dir, g); |
| 710 | generate_image(ctx, box, load_html_image(ctx, g->zip, g->base_uri, src), g); |
| 711 | } |
| 712 | } |
| 713 | |
| 714 | else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0) |
| 715 | { |
| 716 | box = new_box(ctx, g->pool, markup_dir); |
| 717 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
| 718 | insert_inline_box(ctx, box, top, markup_dir, g); |
| 719 | generate_image(ctx, box, load_svg_image(ctx, g->zip, g->base_uri, node), g); |
| 720 | } |
| 721 | |
| 722 | else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0) |
| 723 | { |
| 724 | const char *src = fz_xml_att(node, "l:href" ); |
| 725 | if (!src) |
| 726 | src = fz_xml_att(node, "xlink:href" ); |
| 727 | if (src && src[0] == '#') |
| 728 | { |
| 729 | fz_image *img = fz_tree_lookup(ctx, g->images, src+1); |
| 730 | if (display == DIS_BLOCK) |
| 731 | { |
| 732 | fz_html_box *imgbox; |
| 733 | box = new_box(ctx, g->pool, markup_dir); |
| 734 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
| 735 | top = insert_block_box(ctx, box, top); |
| 736 | imgbox = new_box(ctx, g->pool, markup_dir); |
| 737 | fz_apply_css_style(ctx, g->set, &imgbox->style, &match); |
| 738 | insert_inline_box(ctx, imgbox, box, markup_dir, g); |
| 739 | generate_image(ctx, imgbox, fz_keep_image(ctx, img), g); |
| 740 | } |
| 741 | else if (display == DIS_INLINE) |
| 742 | { |
| 743 | box = new_box(ctx, g->pool, markup_dir); |
| 744 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
| 745 | insert_inline_box(ctx, box, top, markup_dir, g); |
| 746 | generate_image(ctx, box, fz_keep_image(ctx, img), g); |
| 747 | } |
| 748 | } |
| 749 | } |
| 750 | |
| 751 | else if (display != DIS_NONE) |
| 752 | { |
| 753 | const char *dir, *lang, *id, *href; |
| 754 | int child_dir = markup_dir; |
| 755 | int child_lang = markup_lang; |
| 756 | |
| 757 | dir = fz_xml_att(node, "dir" ); |
| 758 | if (dir) |
| 759 | { |
| 760 | if (!strcmp(dir, "auto" )) |
| 761 | child_dir = FZ_BIDI_NEUTRAL; |
| 762 | else if (!strcmp(dir, "rtl" )) |
| 763 | child_dir = FZ_BIDI_RTL; |
| 764 | else if (!strcmp(dir, "ltr" )) |
| 765 | child_dir = FZ_BIDI_LTR; |
| 766 | else |
| 767 | child_dir = DEFAULT_DIR; |
| 768 | } |
| 769 | |
| 770 | lang = fz_xml_att(node, "lang" ); |
| 771 | if (lang) |
| 772 | child_lang = fz_text_language_from_string(lang); |
| 773 | |
| 774 | box = new_box(ctx, g->pool, child_dir); |
| 775 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
| 776 | |
| 777 | id = fz_xml_att(node, "id" ); |
| 778 | if (id) |
| 779 | box->id = fz_pool_strdup(ctx, g->pool, id); |
| 780 | |
| 781 | if (display == DIS_BLOCK || display == DIS_INLINE_BLOCK) |
| 782 | { |
| 783 | top = insert_block_box(ctx, box, top); |
| 784 | if (g->is_fb2) |
| 785 | { |
| 786 | if (!strcmp(tag, "title" ) || !strcmp(tag, "subtitle" )) |
| 787 | box->heading = fz_mini(section_depth, 6); |
| 788 | } |
| 789 | else |
| 790 | { |
| 791 | if (tag[0]=='h' && tag[1]>='1' && tag[1]<='6' && tag[2]==0) |
| 792 | box->heading = tag[1] - '0'; |
| 793 | } |
| 794 | } |
| 795 | else if (display == DIS_LIST_ITEM) |
| 796 | { |
| 797 | top = insert_block_box(ctx, box, top); |
| 798 | box->list_item = ++list_counter; |
| 799 | } |
| 800 | else if (display == DIS_INLINE) |
| 801 | { |
| 802 | insert_inline_box(ctx, box, top, child_dir, g); |
| 803 | if (id) |
| 804 | generate_anchor(ctx, box, g); |
| 805 | if (tag[0]=='a' && tag[1]==0) |
| 806 | { |
| 807 | if (g->is_fb2) |
| 808 | { |
| 809 | href = fz_xml_att(node, "l:href" ); |
| 810 | if (!href) |
| 811 | href = fz_xml_att(node, "xlink:href" ); |
| 812 | } |
| 813 | else |
| 814 | href = fz_xml_att(node, g->is_fb2 ? "l:href" : "href" ); |
| 815 | if (href) |
| 816 | box->href = fz_pool_strdup(ctx, g->pool, href); |
| 817 | } |
| 818 | } |
| 819 | else if (display == DIS_TABLE) |
| 820 | { |
| 821 | top = insert_table_box(ctx, box, top); |
| 822 | } |
| 823 | else if (display == DIS_TABLE_ROW) |
| 824 | { |
| 825 | top = insert_table_row_box(ctx, box, top); |
| 826 | } |
| 827 | else if (display == DIS_TABLE_CELL) |
| 828 | { |
| 829 | top = insert_table_cell_box(ctx, box, top); |
| 830 | } |
| 831 | else |
| 832 | { |
| 833 | fz_warn(ctx, "unknown box display type" ); |
| 834 | insert_box(ctx, box, BOX_BLOCK, top); |
| 835 | } |
| 836 | |
| 837 | if (fz_xml_down(node)) |
| 838 | { |
| 839 | int child_counter = list_counter; |
| 840 | int child_section = section_depth; |
| 841 | if (!strcmp(tag, "ul" ) || !strcmp(tag, "ol" )) |
| 842 | child_counter = 0; |
| 843 | if (!strcmp(tag, "section" )) |
| 844 | ++child_section; |
| 845 | last_top = generate_boxes(ctx, |
| 846 | fz_xml_down(node), |
| 847 | box, |
| 848 | &match, |
| 849 | child_counter, |
| 850 | child_section, |
| 851 | child_dir, |
| 852 | child_lang, |
| 853 | g); |
| 854 | if (last_top != box) |
| 855 | top = last_top; |
| 856 | } |
| 857 | } |
| 858 | } |
| 859 | else |
| 860 | { |
| 861 | const char *text = fz_xml_text(node); |
| 862 | int collapse = top->style.white_space & WS_COLLAPSE; |
| 863 | if (collapse && is_all_white(text)) |
| 864 | { |
| 865 | g->emit_white = 1; |
| 866 | } |
| 867 | else |
| 868 | { |
| 869 | if (top->type != BOX_INLINE) |
| 870 | { |
| 871 | /* Create anonymous inline box, with the same style as the top block box. */ |
| 872 | box = new_box(ctx, g->pool, markup_dir); |
| 873 | insert_inline_box(ctx, box, top, markup_dir, g); |
| 874 | box->style = top->style; |
| 875 | /* Make sure not to recursively multiply font sizes. */ |
| 876 | box->style.font_size.value = 1; |
| 877 | box->style.font_size.unit = N_SCALE; |
| 878 | generate_text(ctx, box, text, markup_lang, g); |
| 879 | } |
| 880 | else |
| 881 | { |
| 882 | generate_text(ctx, top, text, markup_lang, g); |
| 883 | } |
| 884 | } |
| 885 | } |
| 886 | |
| 887 | node = fz_xml_next(node); |
| 888 | } |
| 889 | |
| 890 | return top; |
| 891 | } |
| 892 | |
| 893 | static char *concat_text(fz_context *ctx, fz_xml *root) |
| 894 | { |
| 895 | fz_xml *node; |
| 896 | size_t i = 0, n = 1; |
| 897 | char *s; |
| 898 | for (node = fz_xml_down(root); node; node = fz_xml_next(node)) |
| 899 | { |
| 900 | const char *text = fz_xml_text(node); |
| 901 | n += text ? strlen(text) : 0; |
| 902 | } |
| 903 | s = fz_malloc(ctx, n); |
| 904 | for (node = fz_xml_down(root); node; node = fz_xml_next(node)) |
| 905 | { |
| 906 | const char *text = fz_xml_text(node); |
| 907 | if (text) |
| 908 | { |
| 909 | n = strlen(text); |
| 910 | memcpy(s+i, text, n); |
| 911 | i += n; |
| 912 | } |
| 913 | } |
| 914 | s[i] = 0; |
| 915 | return s; |
| 916 | } |
| 917 | |
| 918 | static void |
| 919 | html_load_css(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) |
| 920 | { |
| 921 | fz_xml *html, *head, *node; |
| 922 | fz_buffer *buf; |
| 923 | char path[2048]; |
| 924 | |
| 925 | fz_var(buf); |
| 926 | |
| 927 | html = fz_xml_find(root, "html" ); |
| 928 | head = fz_xml_find_down(html, "head" ); |
| 929 | for (node = fz_xml_down(head); node; node = fz_xml_next(node)) |
| 930 | { |
| 931 | if (fz_xml_is_tag(node, "link" )) |
| 932 | { |
| 933 | char *rel = fz_xml_att(node, "rel" ); |
| 934 | if (rel && !fz_strcasecmp(rel, "stylesheet" )) |
| 935 | { |
| 936 | char *type = fz_xml_att(node, "type" ); |
| 937 | if ((type && !strcmp(type, "text/css" )) || !type) |
| 938 | { |
| 939 | char *href = fz_xml_att(node, "href" ); |
| 940 | if (href) |
| 941 | { |
| 942 | fz_strlcpy(path, base_uri, sizeof path); |
| 943 | fz_strlcat(path, "/" , sizeof path); |
| 944 | fz_strlcat(path, href, sizeof path); |
| 945 | fz_urldecode(path); |
| 946 | fz_cleanname(path); |
| 947 | |
| 948 | buf = NULL; |
| 949 | fz_try(ctx) |
| 950 | { |
| 951 | buf = fz_read_archive_entry(ctx, zip, path); |
| 952 | fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path); |
| 953 | } |
| 954 | fz_always(ctx) |
| 955 | fz_drop_buffer(ctx, buf); |
| 956 | fz_catch(ctx) |
| 957 | fz_warn(ctx, "ignoring stylesheet %s" , path); |
| 958 | } |
| 959 | } |
| 960 | } |
| 961 | } |
| 962 | else if (fz_xml_is_tag(node, "style" )) |
| 963 | { |
| 964 | char *s = concat_text(ctx, node); |
| 965 | fz_try(ctx) |
| 966 | fz_parse_css(ctx, css, s, "<style>" ); |
| 967 | fz_catch(ctx) |
| 968 | fz_warn(ctx, "ignoring inline stylesheet" ); |
| 969 | fz_free(ctx, s); |
| 970 | } |
| 971 | } |
| 972 | } |
| 973 | |
| 974 | static void |
| 975 | fb2_load_css(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) |
| 976 | { |
| 977 | fz_xml *fictionbook, *stylesheet; |
| 978 | |
| 979 | fictionbook = fz_xml_find(root, "FictionBook" ); |
| 980 | stylesheet = fz_xml_find_down(fictionbook, "stylesheet" ); |
| 981 | if (stylesheet) |
| 982 | { |
| 983 | char *s = concat_text(ctx, stylesheet); |
| 984 | fz_try(ctx) |
| 985 | fz_parse_css(ctx, css, s, "<stylesheet>" ); |
| 986 | fz_catch(ctx) |
| 987 | fz_warn(ctx, "ignoring inline stylesheet" ); |
| 988 | fz_free(ctx, s); |
| 989 | } |
| 990 | } |
| 991 | |
| 992 | static fz_tree * |
| 993 | load_fb2_images(fz_context *ctx, fz_xml *root) |
| 994 | { |
| 995 | fz_xml *fictionbook, *binary; |
| 996 | fz_tree *images = NULL; |
| 997 | |
| 998 | fictionbook = fz_xml_find(root, "FictionBook" ); |
| 999 | for (binary = fz_xml_find_down(fictionbook, "binary" ); binary; binary = fz_xml_find_next(binary, "binary" )) |
| 1000 | { |
| 1001 | const char *id = fz_xml_att(binary, "id" ); |
| 1002 | char *b64 = NULL; |
| 1003 | fz_buffer *buf = NULL; |
| 1004 | fz_image *img = NULL; |
| 1005 | |
| 1006 | fz_var(b64); |
| 1007 | fz_var(buf); |
| 1008 | |
| 1009 | fz_try(ctx) |
| 1010 | { |
| 1011 | b64 = concat_text(ctx, binary); |
| 1012 | buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64)); |
| 1013 | img = fz_new_image_from_buffer(ctx, buf); |
| 1014 | } |
| 1015 | fz_always(ctx) |
| 1016 | { |
| 1017 | fz_drop_buffer(ctx, buf); |
| 1018 | fz_free(ctx, b64); |
| 1019 | } |
| 1020 | fz_catch(ctx) |
| 1021 | fz_rethrow(ctx); |
| 1022 | |
| 1023 | images = fz_tree_insert(ctx, images, id, img); |
| 1024 | } |
| 1025 | |
| 1026 | return images; |
| 1027 | } |
| 1028 | |
/*
	Growable scratch buffer of code points, reused across flows while
	detecting text direction.
*/
typedef struct
{
	/* Storage for the code points; NULL until the first allocation. */
	uint32_t *data;
	/* Number of elements 'data' has room for. */
	size_t cap;
	/* Number of elements currently in use. */
	size_t len;
} uni_buf;
| 1035 | |
| 1036 | typedef struct |
| 1037 | { |
| 1038 | fz_context *ctx; |
| 1039 | fz_pool *pool; |
| 1040 | fz_html_flow *flow; |
| 1041 | uni_buf *buffer; |
| 1042 | } bidi_data; |
| 1043 | |
| 1044 | static void fragment_cb(const uint32_t *fragment, |
| 1045 | size_t fragment_len, |
| 1046 | int bidi_level, |
| 1047 | int script, |
| 1048 | void *arg) |
| 1049 | { |
| 1050 | bidi_data *data = (bidi_data *)arg; |
| 1051 | size_t fragment_offset = fragment - data->buffer->data; |
| 1052 | |
| 1053 | /* We are guaranteed that fragmentOffset will be at the beginning |
| 1054 | * of flow. */ |
| 1055 | while (fragment_len > 0) |
| 1056 | { |
| 1057 | size_t len; |
| 1058 | |
| 1059 | if (data->flow->type == FLOW_SPACE) |
| 1060 | { |
| 1061 | len = 1; |
| 1062 | } |
| 1063 | else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK || |
| 1064 | data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR) |
| 1065 | { |
| 1066 | len = 0; |
| 1067 | } |
| 1068 | else |
| 1069 | { |
| 1070 | /* Must be text */ |
| 1071 | len = fz_utflen(data->flow->content.text); |
| 1072 | if (len > fragment_len) |
| 1073 | { |
| 1074 | /* We need to split this flow box */ |
| 1075 | (void)split_flow(data->ctx, data->pool, data->flow, fragment_len); |
| 1076 | len = fz_utflen(data->flow->content.text); |
| 1077 | } |
| 1078 | } |
| 1079 | |
| 1080 | /* This flow box is entirely contained within this fragment. */ |
| 1081 | data->flow->bidi_level = bidi_level; |
| 1082 | data->flow->script = script; |
| 1083 | data->flow = data->flow->next; |
| 1084 | fragment_offset += len; |
| 1085 | fragment_len -= len; |
| 1086 | } |
| 1087 | } |
| 1088 | |
| 1089 | static fz_bidi_direction |
| 1090 | detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow) |
| 1091 | { |
| 1092 | fz_html_flow *end = flow; |
| 1093 | bidi_data data; |
| 1094 | |
| 1095 | while (end) |
| 1096 | { |
| 1097 | int level = end->bidi_level; |
| 1098 | |
| 1099 | /* Gather the text from the flow up into a single buffer (at |
| 1100 | * least, as much of it as has the same direction markup). */ |
| 1101 | buffer->len = 0; |
| 1102 | while (end && (level & 1) == (end->bidi_level & 1)) |
| 1103 | { |
| 1104 | size_t len = 0; |
| 1105 | const char *text = "" ; |
| 1106 | int broken = 0; |
| 1107 | |
| 1108 | switch (end->type) |
| 1109 | { |
| 1110 | case FLOW_WORD: |
| 1111 | len = fz_utflen(end->content.text); |
| 1112 | text = end->content.text; |
| 1113 | break; |
| 1114 | case FLOW_SPACE: |
| 1115 | len = 1; |
| 1116 | text = " " ; |
| 1117 | break; |
| 1118 | case FLOW_SHYPHEN: |
| 1119 | case FLOW_SBREAK: |
| 1120 | break; |
| 1121 | case FLOW_BREAK: |
| 1122 | case FLOW_IMAGE: |
| 1123 | broken = 1; |
| 1124 | break; |
| 1125 | } |
| 1126 | |
| 1127 | end = end->next; |
| 1128 | |
| 1129 | if (broken) |
| 1130 | break; |
| 1131 | |
| 1132 | /* Make sure the buffer is large enough */ |
| 1133 | if (buffer->len + len > buffer->cap) |
| 1134 | { |
| 1135 | size_t newcap = buffer->cap; |
| 1136 | if (newcap < 128) |
| 1137 | newcap = 128; /* Sensible small default */ |
| 1138 | |
| 1139 | while (newcap < buffer->len + len) |
| 1140 | newcap = (newcap * 3) / 2; |
| 1141 | |
| 1142 | buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t); |
| 1143 | buffer->cap = newcap; |
| 1144 | } |
| 1145 | |
| 1146 | /* Expand the utf8 text into Unicode and store it in the buffer */ |
| 1147 | while (*text) |
| 1148 | { |
| 1149 | int rune; |
| 1150 | text += fz_chartorune(&rune, text); |
| 1151 | buffer->data[buffer->len++] = rune; |
| 1152 | } |
| 1153 | } |
| 1154 | |
| 1155 | /* Detect directionality for the buffer */ |
| 1156 | data.ctx = ctx; |
| 1157 | data.pool = pool; |
| 1158 | data.flow = flow; |
| 1159 | data.buffer = buffer; |
| 1160 | fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */); |
| 1161 | } |
| 1162 | return bidi_dir; |
| 1163 | } |
| 1164 | |
| 1165 | static void |
| 1166 | detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box) |
| 1167 | { |
| 1168 | while (box) |
| 1169 | { |
| 1170 | if (box->flow_head) |
| 1171 | box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->flow_head); |
| 1172 | detect_box_directionality(ctx, pool, buffer, box->down); |
| 1173 | box = box->next; |
| 1174 | } |
| 1175 | } |
| 1176 | |
| 1177 | static void |
| 1178 | detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box) |
| 1179 | { |
| 1180 | uni_buf buffer = { NULL }; |
| 1181 | |
| 1182 | fz_try(ctx) |
| 1183 | detect_box_directionality(ctx, pool, &buffer, box); |
| 1184 | fz_always(ctx) |
| 1185 | fz_free(ctx, buffer.data); |
| 1186 | fz_catch(ctx) |
| 1187 | fz_rethrow(ctx); |
| 1188 | } |
| 1189 | |
| 1190 | fz_html * |
| 1191 | fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css) |
| 1192 | { |
| 1193 | fz_xml_doc *xml; |
| 1194 | fz_xml *root, *node; |
| 1195 | fz_html *html = NULL; |
| 1196 | char *title; |
| 1197 | |
| 1198 | fz_css_match match; |
| 1199 | struct genstate g; |
| 1200 | |
| 1201 | g.pool = NULL; |
| 1202 | g.set = set; |
| 1203 | g.zip = zip; |
| 1204 | g.images = NULL; |
| 1205 | g.base_uri = base_uri; |
| 1206 | g.css = NULL; |
| 1207 | g.at_bol = 0; |
| 1208 | g.emit_white = 0; |
| 1209 | g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP; |
| 1210 | |
| 1211 | xml = fz_parse_xml(ctx, buf, 1); |
| 1212 | root = fz_xml_root(xml); |
| 1213 | |
| 1214 | fz_try(ctx) |
| 1215 | g.css = fz_new_css(ctx); |
| 1216 | fz_catch(ctx) |
| 1217 | { |
| 1218 | fz_drop_xml(ctx, xml); |
| 1219 | fz_rethrow(ctx); |
| 1220 | } |
| 1221 | |
| 1222 | #ifndef NDEBUG |
| 1223 | if (fz_atoi(getenv("FZ_DEBUG_XML" ))) |
| 1224 | fz_debug_xml(root, 0); |
| 1225 | #endif |
| 1226 | |
| 1227 | fz_try(ctx) |
| 1228 | { |
| 1229 | if (fz_xml_find(root, "FictionBook" )) |
| 1230 | { |
| 1231 | g.is_fb2 = 1; |
| 1232 | fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>" ); |
| 1233 | if (fz_use_document_css(ctx)) |
| 1234 | fb2_load_css(ctx, g.zip, g.base_uri, g.css, root); |
| 1235 | g.images = load_fb2_images(ctx, root); |
| 1236 | } |
| 1237 | else |
| 1238 | { |
| 1239 | g.is_fb2 = 0; |
| 1240 | fz_parse_css(ctx, g.css, html_default_css, "<default:html>" ); |
| 1241 | if (fz_use_document_css(ctx)) |
| 1242 | html_load_css(ctx, g.zip, g.base_uri, g.css, root); |
| 1243 | g.images = NULL; |
| 1244 | } |
| 1245 | |
| 1246 | if (user_css) |
| 1247 | fz_parse_css(ctx, g.css, user_css, "<user>" ); |
| 1248 | |
| 1249 | fz_add_css_font_faces(ctx, g.set, g.zip, g.base_uri, g.css); /* load @font-face fonts into font set */ |
| 1250 | } |
| 1251 | fz_catch(ctx) |
| 1252 | { |
| 1253 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
| 1254 | fz_warn(ctx, "ignoring styles due to errors: %s" , fz_caught_message(ctx)); |
| 1255 | } |
| 1256 | |
| 1257 | #ifndef NDEBUG |
| 1258 | if (fz_atoi(getenv("FZ_DEBUG_CSS" ))) |
| 1259 | fz_debug_css(ctx, g.css); |
| 1260 | #endif |
| 1261 | |
| 1262 | fz_try(ctx) |
| 1263 | { |
| 1264 | g.pool = fz_new_pool(ctx); |
| 1265 | html = fz_pool_alloc(ctx, g.pool, sizeof *html); |
| 1266 | html->pool = g.pool; |
| 1267 | html->root = new_box(ctx, g.pool, DEFAULT_DIR); |
| 1268 | |
| 1269 | match.up = NULL; |
| 1270 | match.count = 0; |
| 1271 | fz_match_css_at_page(ctx, &match, g.css); |
| 1272 | fz_apply_css_style(ctx, g.set, &html->root->style, &match); |
| 1273 | // TODO: transfer page margins out of this hacky box |
| 1274 | |
| 1275 | generate_boxes(ctx, root, html->root, &match, 0, 0, DEFAULT_DIR, FZ_LANG_UNSET, &g); |
| 1276 | |
| 1277 | detect_directionality(ctx, g.pool, html->root); |
| 1278 | |
| 1279 | if (g.is_fb2) |
| 1280 | { |
| 1281 | node = fz_xml_find(root, "FictionBook" ); |
| 1282 | node = fz_xml_find_down(node, "description" ); |
| 1283 | node = fz_xml_find_down(node, "title-info" ); |
| 1284 | node = fz_xml_find_down(node, "book-title" ); |
| 1285 | title = fz_xml_text(fz_xml_down(node)); |
| 1286 | if (title) |
| 1287 | html->title = fz_pool_strdup(ctx, g.pool, title); |
| 1288 | } |
| 1289 | else |
| 1290 | { |
| 1291 | node = fz_xml_find(root, "html" ); |
| 1292 | node = fz_xml_find_down(node, "head" ); |
| 1293 | node = fz_xml_find_down(node, "title" ); |
| 1294 | title = fz_xml_text(fz_xml_down(node)); |
| 1295 | if (title) |
| 1296 | html->title = fz_pool_strdup(ctx, g.pool, title); |
| 1297 | } |
| 1298 | } |
| 1299 | fz_always(ctx) |
| 1300 | { |
| 1301 | fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); |
| 1302 | fz_drop_css(ctx, g.css); |
| 1303 | fz_drop_xml(ctx, xml); |
| 1304 | } |
| 1305 | fz_catch(ctx) |
| 1306 | { |
| 1307 | fz_drop_html(ctx, html); |
| 1308 | fz_rethrow(ctx); |
| 1309 | } |
| 1310 | |
| 1311 | return html; |
| 1312 | } |
| 1313 | |
/* Write 'level' tab characters to stdout; nothing for level <= 0. */
static void indent(int level)
{
	int i;

	for (i = 0; i < level; i++)
		putchar('\t');
}
| 1319 | |
| 1320 | static void |
| 1321 | fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level) |
| 1322 | { |
| 1323 | fz_html_box *sbox = NULL; |
| 1324 | while (flow) |
| 1325 | { |
| 1326 | if (flow->box != sbox) { |
| 1327 | if (sbox) { |
| 1328 | indent(level); |
| 1329 | printf("}\n" ); |
| 1330 | } |
| 1331 | sbox = flow->box; |
| 1332 | indent(level); |
| 1333 | printf("span em=%g font='%s'" , sbox->em, fz_font_name(ctx, sbox->style.font)); |
| 1334 | if (fz_font_is_serif(ctx, sbox->style.font)) |
| 1335 | printf(" serif" ); |
| 1336 | else |
| 1337 | printf(" sans" ); |
| 1338 | if (fz_font_is_monospaced(ctx, sbox->style.font)) |
| 1339 | printf(" monospaced" ); |
| 1340 | if (fz_font_is_bold(ctx, sbox->style.font)) |
| 1341 | printf(" bold" ); |
| 1342 | if (fz_font_is_italic(ctx, sbox->style.font)) |
| 1343 | printf(" italic" ); |
| 1344 | if (sbox->style.small_caps) |
| 1345 | printf(" small-caps" ); |
| 1346 | printf("\n" ); |
| 1347 | indent(level); |
| 1348 | printf("{\n" ); |
| 1349 | } |
| 1350 | |
| 1351 | indent(level+1); |
| 1352 | switch (flow->type) { |
| 1353 | case FLOW_WORD: printf("word " ); break; |
| 1354 | case FLOW_SPACE: printf("space" ); break; |
| 1355 | case FLOW_SBREAK: printf("sbrk " ); break; |
| 1356 | case FLOW_SHYPHEN: printf("shy " ); break; |
| 1357 | case FLOW_BREAK: printf("break" ); break; |
| 1358 | case FLOW_IMAGE: printf("image" ); break; |
| 1359 | case FLOW_ANCHOR: printf("anchor" ); break; |
| 1360 | } |
| 1361 | printf(" y=%g x=%g w=%g" , flow->y, flow->x, flow->w); |
| 1362 | if (flow->type == FLOW_IMAGE) |
| 1363 | printf(" h=%g" , flow->h); |
| 1364 | if (flow->type == FLOW_WORD) |
| 1365 | printf(" text='%s'" , flow->content.text); |
| 1366 | printf("\n" ); |
| 1367 | if (flow->breaks_line) { |
| 1368 | indent(level+1); |
| 1369 | printf("*\n" ); |
| 1370 | } |
| 1371 | |
| 1372 | flow = flow->next; |
| 1373 | } |
| 1374 | indent(level); |
| 1375 | printf("}\n" ); |
| 1376 | } |
| 1377 | |
| 1378 | static void |
| 1379 | fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level) |
| 1380 | { |
| 1381 | while (box) |
| 1382 | { |
| 1383 | indent(level); |
| 1384 | switch (box->type) { |
| 1385 | case BOX_BLOCK: printf("block" ); break; |
| 1386 | case BOX_BREAK: printf("break" ); break; |
| 1387 | case BOX_FLOW: printf("flow" ); break; |
| 1388 | case BOX_INLINE: printf("inline" ); break; |
| 1389 | case BOX_TABLE: printf("table" ); break; |
| 1390 | case BOX_TABLE_ROW: printf("table-row" ); break; |
| 1391 | case BOX_TABLE_CELL: printf("table-cell" ); break; |
| 1392 | } |
| 1393 | |
| 1394 | printf(" em=%g x=%g y=%g w=%g b=%g\n" , box->em, box->x, box->y, box->w, box->b); |
| 1395 | |
| 1396 | indent(level); |
| 1397 | printf("{\n" ); |
| 1398 | if (box->type == BOX_BLOCK) { |
| 1399 | indent(level+1); |
| 1400 | printf("margin=%g %g %g %g\n" , box->margin[0], box->margin[1], box->margin[2], box->margin[3]); |
| 1401 | } |
| 1402 | if (box->is_first_flow) { |
| 1403 | indent(level+1); |
| 1404 | printf("is-first-flow\n" ); |
| 1405 | } |
| 1406 | if (box->list_item) { |
| 1407 | indent(level+1); |
| 1408 | printf("list=%d\n" , box->list_item); |
| 1409 | } |
| 1410 | if (box->id) { |
| 1411 | indent(level+1); |
| 1412 | printf("id=%s\n" , box->id); |
| 1413 | } |
| 1414 | if (box->href) { |
| 1415 | indent(level+1); |
| 1416 | printf("href=%s\n" , box->href); |
| 1417 | } |
| 1418 | |
| 1419 | if (box->down) |
| 1420 | fz_debug_html_box(ctx, box->down, level + 1); |
| 1421 | if (box->flow_head) |
| 1422 | fz_debug_html_flow(ctx, box->flow_head, level + 1); |
| 1423 | |
| 1424 | indent(level); |
| 1425 | printf("}\n" ); |
| 1426 | |
| 1427 | box = box->next; |
| 1428 | } |
| 1429 | } |
| 1430 | |
| 1431 | void |
| 1432 | fz_debug_html(fz_context *ctx, fz_html_box *box) |
| 1433 | { |
| 1434 | fz_debug_html_box(ctx, box, 0); |
| 1435 | } |
| 1436 | |