1 | #include "mupdf/fitz.h" |
2 | #include "mupdf/ucdn.h" |
3 | #include "html-imp.h" |
4 | |
5 | #include <string.h> |
6 | #include <stdio.h> |
7 | |
/* Side indices; presumably top, right, bottom, left. They are not referenced
 * in the visible part of this file -- confirm against their users. */
enum { T, R, B, L };

/* Direction used when a "dir" attribute holds a value other than
 * "auto", "rtl" or "ltr" (see generate_boxes). */
#define DEFAULT_DIR FZ_BIDI_LTR
11 | |
/*
 * Built-in style sheet for HTML content, written as one concatenated CSS
 * string: a page margin rule followed by per-element defaults.
 * NOTE(review): the code that parses this string is outside the visible part
 * of the file.
 */
static const char *html_default_css =
"@page{margin:3em 2em}"
"a{color:#06C;text-decoration:underline}"
"address{display:block;font-style:italic}"
"b{font-weight:bold}"
"bdo{direction:rtl;unicode-bidi:bidi-override}"
"blockquote{display:block;margin:1em 40px}"
"body{display:block;margin:1em}"
"cite{font-style:italic}"
"code{font-family:monospace}"
"dd{display:block;margin:0 0 0 40px}"
"del{text-decoration:line-through}"
"div{display:block}"
"dl{display:block;margin:1em 0}"
"dt{display:block}"
"em{font-style:italic}"
"h1{display:block;font-size:2em;font-weight:bold;margin:0.67em 0;page-break-after:avoid}"
"h2{display:block;font-size:1.5em;font-weight:bold;margin:0.83em 0;page-break-after:avoid}"
"h3{display:block;font-size:1.17em;font-weight:bold;margin:1em 0;page-break-after:avoid}"
"h4{display:block;font-size:1em;font-weight:bold;margin:1.33em 0;page-break-after:avoid}"
"h5{display:block;font-size:0.83em;font-weight:bold;margin:1.67em 0;page-break-after:avoid}"
"h6{display:block;font-size:0.67em;font-weight:bold;margin:2.33em 0;page-break-after:avoid}"
"head{display:none}"
"hr{border-style:solid;border-width:1px;display:block;margin-bottom:0.5em;margin-top:0.5em;text-align:center}"
"html{display:block}"
"i{font-style:italic}"
"ins{text-decoration:underline}"
"kbd{font-family:monospace}"
"li{display:list-item}"
"menu{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ol{display:block;list-style-type:decimal;margin:1em 0;padding:0 0 0 30pt}"
"p{display:block;margin:1em 0}"
"pre{display:block;font-family:monospace;margin:1em 0;white-space:pre}"
"samp{font-family:monospace}"
"script{display:none}"
"small{font-size:0.83em}"
"strong{font-weight:bold}"
"style{display:none}"
"sub{font-size:0.83em;vertical-align:sub}"
"sup{font-size:0.83em;vertical-align:super}"
"table{display:table}"
"tbody{display:table-row-group}"
"td{display:table-cell;padding:1px}"
"tfoot{display:table-footer-group}"
"th{display:table-cell;font-weight:bold;padding:1px;text-align:center}"
"thead{display:table-header-group}"
"tr{display:table-row}"
"ul{display:block;list-style-type:disc;margin:1em 0;padding:0 0 0 30pt}"
"ul ul{list-style-type:circle}"
"ul ul ul{list-style-type:square}"
"var{font-style:italic}"
"svg{display:none}"
;
65 | |
/*
 * Built-in style sheet for FictionBook (FB2) content, written as one
 * concatenated CSS string. It hides the stylesheet/binary elements and most
 * of the description, and gives the FB2 text elements block/inline defaults.
 * NOTE(review): the code that parses this string is outside the visible part
 * of the file.
 */
static const char *fb2_default_css =
"@page{margin:3em 2em}"
"FictionBook{display:block;margin:1em}"
"stylesheet,binary{display:none}"
"description>*{display:none}"
"description>title-info{display:block}"
"description>title-info>*{display:none}"
"description>title-info>coverpage{display:block;page-break-before:always;page-break-after:always}"
"body,section,title,subtitle,p,cite,epigraph,text-author,date,poem,stanza,v,empty-line{display:block}"
"image{display:block}"
"p>image{display:inline}"
"table{display:table}"
"tr{display:table-row}"
"th,td{display:table-cell}"
"a{color:#06C;text-decoration:underline}"
"a[type=note]{font-size:small;vertical-align:super}"
"code{white-space:pre;font-family:monospace}"
"emphasis{font-style:italic}"
"strikethrough{text-decoration:line-through}"
"strong{font-weight:bold}"
"sub{font-size:small;vertical-align:sub}"
"sup{font-size:small;vertical-align:super}"
"image{margin:1em 0;text-align:center}"
"cite,poem{margin:1em 2em}"
"subtitle,epigraph,stanza{margin:1em 0}"
"title>p{text-align:center;font-size:x-large}"
"subtitle{text-align:center;font-size:large}"
"p{margin-top:1em;text-align:justify}"
"empty-line{padding-top:1em}"
"p+p{margin-top:0;text-indent:1.5em}"
"empty-line+p{margin-top:0}"
"section>title{page-break-before:always}"
;
99 | |
/* State shared by generate_boxes and the helpers it calls. */
struct genstate
{
	fz_pool *pool; /* allocator for boxes, flow nodes and copied strings */
	fz_html_font_set *set; /* passed to fz_apply_css_style when styling a box */
	fz_archive *zip; /* archive that <img> sources are read from */
	fz_tree *images; /* FB2 images, looked up by the name after '#' in an href */
	int is_fb2; /* non-zero selects the FictionBook tag/attribute handling */
	const char *base_uri; /* prefix joined with relative image paths */
	fz_css *css; /* rules matched against every element */
	int at_bol; /* set after a line break or a new flow box, cleared after a word or image */
	int emit_white; /* set when collapsed whitespace is pending; consumed by flush_space */
	int last_brk_cls; /* line break class of the previous character (row index into pairbrk) */
};
113 | |
/* Return 1 when c is a space, tab, carriage return or line feed, else 0. */
static int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\r':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
118 | |
/* Return 1 when every character of the NUL-terminated string s satisfies
 * iswhite (an empty string therefore yields 1), otherwise 0. */
static int is_all_white(const char *s)
{
	const char *p;

	for (p = s; *p != 0; ++p)
	{
		if (!iswhite(*p))
			return 0;
	}
	return 1;
}
129 | |
130 | /* TODO: pool allocator for flow nodes */ |
131 | /* TODO: store text by pointing to a giant buffer */ |
132 | |
133 | static void fz_drop_html_flow(fz_context *ctx, fz_html_flow *flow) |
134 | { |
135 | while (flow) |
136 | { |
137 | fz_html_flow *next = flow->next; |
138 | if (flow->type == FLOW_IMAGE) |
139 | fz_drop_image(ctx, flow->content.image); |
140 | flow = next; |
141 | } |
142 | } |
143 | |
144 | static fz_html_flow *add_flow(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, int type) |
145 | { |
146 | fz_html_flow *flow = fz_pool_alloc(ctx, pool, sizeof *flow); |
147 | flow->type = type; |
148 | flow->expand = 0; |
149 | flow->bidi_level = 0; |
150 | flow->markup_lang = 0; |
151 | flow->breaks_line = 0; |
152 | flow->box = inline_box; |
153 | *top->flow_tail = flow; |
154 | top->flow_tail = &flow->next; |
155 | return flow; |
156 | } |
157 | |
158 | static void add_flow_space(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
159 | { |
160 | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_SPACE); |
161 | flow->expand = 1; |
162 | } |
163 | |
164 | static void add_flow_break(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
165 | { |
166 | (void)add_flow(ctx, pool, top, inline_box, FLOW_BREAK); |
167 | } |
168 | |
169 | static void add_flow_sbreak(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
170 | { |
171 | (void)add_flow(ctx, pool, top, inline_box, FLOW_SBREAK); |
172 | } |
173 | |
174 | static void add_flow_shyphen(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
175 | { |
176 | (void)add_flow(ctx, pool, top, inline_box, FLOW_SHYPHEN); |
177 | } |
178 | |
179 | static void add_flow_word(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, const char *a, const char *b, int lang) |
180 | { |
181 | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_WORD); |
182 | flow->content.text = fz_pool_alloc(ctx, pool, b - a + 1); |
183 | memcpy(flow->content.text, a, b - a); |
184 | flow->content.text[b - a] = 0; |
185 | flow->markup_lang = lang; |
186 | } |
187 | |
188 | static void add_flow_image(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box, fz_image *img) |
189 | { |
190 | fz_html_flow *flow = add_flow(ctx, pool, top, inline_box, FLOW_IMAGE); |
191 | flow->content.image = fz_keep_image(ctx, img); |
192 | } |
193 | |
194 | static void add_flow_anchor(fz_context *ctx, fz_pool *pool, fz_html_box *top, fz_html_box *inline_box) |
195 | { |
196 | (void)add_flow(ctx, pool, top, inline_box, FLOW_ANCHOR); |
197 | } |
198 | |
199 | static fz_html_flow *split_flow(fz_context *ctx, fz_pool *pool, fz_html_flow *flow, size_t offset) |
200 | { |
201 | fz_html_flow *new_flow; |
202 | char *text; |
203 | size_t len; |
204 | |
205 | if (offset == 0) |
206 | return flow; |
207 | new_flow = fz_pool_alloc(ctx, pool, sizeof *flow); |
208 | *new_flow = *flow; |
209 | new_flow->next = flow->next; |
210 | flow->next = new_flow; |
211 | |
212 | text = flow->content.text; |
213 | while (*text && offset) |
214 | { |
215 | int rune; |
216 | text += fz_chartorune(&rune, text); |
217 | offset--; |
218 | } |
219 | len = strlen(text); |
220 | new_flow->content.text = fz_pool_alloc(ctx, pool, len+1); |
221 | strcpy(new_flow->content.text, text); |
222 | *text = 0; |
223 | return new_flow; |
224 | } |
225 | |
226 | static void flush_space(fz_context *ctx, fz_html_box *flow, fz_html_box *inline_box, int lang, struct genstate *g) |
227 | { |
228 | static const char *space = " " ; |
229 | int bsp = inline_box->style.white_space & WS_ALLOW_BREAK_SPACE; |
230 | fz_pool *pool = g->pool; |
231 | if (g->emit_white) |
232 | { |
233 | if (!g->at_bol) |
234 | { |
235 | if (bsp) |
236 | add_flow_space(ctx, pool, flow, inline_box); |
237 | else |
238 | add_flow_word(ctx, pool, flow, inline_box, space, space+1, lang); |
239 | } |
240 | g->emit_white = 0; |
241 | } |
242 | } |
243 | |
/* pair-wise lookup table for UAX#14 linebreaks */
/*
 * Indexed as pairbrk[previous class][current class] by generate_text.
 * A '_' entry makes generate_text insert a soft break between the two
 * characters; generate_text folds '%' (and '@', '#') into '^', and a '^'
 * entry inserts no break. The two header lines below spell the column class
 * names vertically, in the same order as the rows.
 */
static const char *pairbrk[29] =
{
/* -OCCQGNESIPPNAHIIHBBBZCWHHJJJR- */
/* -PLPULSXYSROULLDNYAB2WMJ23LVTI- */
"^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" , /* OP open punctuation */
"_^^%%^^^^%%_____%%__^^^______" , /* CL close punctuation */
"_^^%%^^^^%%%%%__%%__^^^______" , /* CP close parenthesis */
"^^^%%%^^^%%%%%%%%%%%^^^%%%%%%" , /* QU quotation */
"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%" , /* GL non-breaking glue */
"_^^%%%^^^_______%%__^^^______" , /* NS nonstarters */
"_^^%%%^^^______%%%__^^^______" , /* EX exclamation/interrogation */
"_^^%%%^^^__%_%__%%__^^^______" , /* SY symbols allowing break after */
"_^^%%%^^^__%%%__%%__^^^______" , /* IS infix numeric separator */
"%^^%%%^^^__%%%%_%%__^^^%%%%%_" , /* PR prefix numeric */
"%^^%%%^^^__%%%__%%__^^^______" , /* PO postfix numeric */
"%^^%%%^^^%%%%%_%%%__^^^______" , /* NU numeric */
"%^^%%%^^^__%%%_%%%__^^^______" , /* AL ordinary alphabetic and symbol characters */
"%^^%%%^^^__%%%_%%%__^^^______" , /* HL hebrew letter */
"_^^%%%^^^_%____%%%__^^^______" , /* ID ideographic */
"_^^%%%^^^______%%%__^^^______" , /* IN inseparable characters */
"_^^%_%^^^__%____%%__^^^______" , /* HY hyphens */
"_^^%_%^^^_______%%__^^^______" , /* BA break after */
"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%" , /* BB break before */
"_^^%%%^^^_______%%_^^^^______" , /* B2 break opportunity before and after */
"____________________^________" , /* ZW zero width space */
"%^^%%%^^^__%%%_%%%__^^^______" , /* CM combining mark */
"%^^%%%^^^%%%%%%%%%%%^^^%%%%%%" , /* WJ word joiner */
"_^^%%%^^^_%____%%%__^^^___%%_" , /* H2 hangul leading/vowel syllable */
"_^^%%%^^^_%____%%%__^^^____%_" , /* H3 hangul leading/vowel/trailing syllable */
"_^^%%%^^^_%____%%%__^^^%%%%__" , /* JL hangul leading jamo */
"_^^%%%^^^_%____%%%__^^^___%%_" , /* JV hangul vowel jamo */
"_^^%%%^^^_%____%%%__^^^____%_" , /* JT hangul trailing jamo */
"_^^%%%^^^_______%%__^^^_____%" , /* RI regional indicator */
};
279 | |
/*
 * Convert one run of text into flow nodes appended to the BOX_FLOW ancestor
 * of box.
 *
 * box: inline box the nodes belong to; its white_space style selects
 *      whitespace collapsing, breaking at spaces and forced newline breaks.
 * text: NUL-terminated text, decoded one rune at a time with fz_chartorune.
 * lang: language value stored on every word node.
 * g: generation state; at_bol, emit_white and last_brk_cls are read and
 *    updated so that state carries over between consecutive text runs.
 */
static void generate_text(fz_context *ctx, fz_html_box *box, const char *text, int lang, struct genstate *g)
{
	fz_html_box *flow;
	fz_pool *pool = g->pool;
	int collapse = box->style.white_space & WS_COLLAPSE;
	int bsp = box->style.white_space & WS_ALLOW_BREAK_SPACE;
	int bnl = box->style.white_space & WS_FORCE_BREAK_NEWLINE;

	static const char *space = " " ;

	/* Find the enclosing flow box; all nodes are appended to its list. */
	flow = box;
	while (flow->type != BOX_FLOW)
		flow = flow->up;

	while (*text)
	{
		/* Forced line break: "\r\n" counts as a single newline. */
		if (bnl && (*text == '\n' || *text == '\r'))
		{
			if (text[0] == '\r' && text[1] == '\n')
				text += 2;
			else
				text += 1;
			add_flow_break(ctx, pool, flow, box);
			g->at_bol = 1;
		}
		else if (iswhite(*text))
		{
			if (collapse)
			{
				/* Swallow the whole run and only remember that whitespace
				 * is pending. With forced newline breaks, newlines are left
				 * in place for the branch above. */
				if (bnl)
					while (*text == ' ' || *text == '\t')
						++text;
				else
					while (iswhite(*text))
						++text;
				g->emit_white = 1;
			}
			else
			{
				/* Preserved whitespace: one node per character. */
				// TODO: tabs
				if (bsp)
					add_flow_space(ctx, pool, flow, box);
				else
					add_flow_word(ctx, pool, flow, box, space, space+1, lang);
				++text;
			}
			g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a space */
		}
		else
		{
			/* mark is the start of the word not yet emitted; prev is the
			 * start of the rune currently being examined. */
			const char *prev, *mark = text;
			int c;

			flush_space(ctx, flow, box, lang, g);

			if (g->at_bol)
				g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ;

			while (*text && !iswhite(*text))
			{
				prev = text;
				text += fz_chartorune(&c, text);
				if (c == 0xAD) /* soft hyphen */
				{
					/* Emit the text before the hyphen, then the hyphen node;
					 * the hyphen character itself is not copied into a word. */
					if (mark != prev)
						add_flow_word(ctx, pool, flow, box, mark, prev, lang);
					add_flow_shyphen(ctx, pool, flow, box);
					mark = text;
					g->last_brk_cls = UCDN_LINEBREAK_CLASS_WJ; /* don't add sbreaks after a soft hyphen */
				}
				else if (bsp) /* allow soft breaks */
				{
					int this_brk_cls = ucdn_get_resolved_linebreak_class(c);
					/* Classes at or above RI are skipped entirely and leave
					 * last_brk_cls unchanged. */
					if (this_brk_cls < UCDN_LINEBREAK_CLASS_RI)
					{
						int brk = pairbrk[g->last_brk_cls][this_brk_cls];

						/* we handle spaces elsewhere, so ignore these classes */
						if (brk == '@') brk = '^';
						if (brk == '#') brk = '^';
						if (brk == '%') brk = '^';

						if (brk == '_')
						{
							/* Break allowed before this rune: close the
							 * current word and start a new one at prev. */
							if (mark != prev)
								add_flow_word(ctx, pool, flow, box, mark, prev, lang);
							add_flow_sbreak(ctx, pool, flow, box);
							mark = prev;
						}

						g->last_brk_cls = this_brk_cls;
					}
				}
			}
			/* Emit whatever is left of the word. */
			if (mark != text)
				add_flow_word(ctx, pool, flow, box, mark, text, lang);

			g->at_bol = 0;
		}
	}
}
381 | |
382 | static fz_image *load_html_image(fz_context *ctx, fz_archive *zip, const char *base_uri, const char *src) |
383 | { |
384 | char path[2048]; |
385 | fz_image *img = NULL; |
386 | fz_buffer *buf = NULL; |
387 | |
388 | fz_var(img); |
389 | fz_var(buf); |
390 | |
391 | fz_try(ctx) |
392 | { |
393 | if (!strncmp(src, "data:image/jpeg;base64," , 23)) |
394 | buf = fz_new_buffer_from_base64(ctx, src+23, 0); |
395 | else if (!strncmp(src, "data:image/png;base64," , 22)) |
396 | buf = fz_new_buffer_from_base64(ctx, src+22, 0); |
397 | else |
398 | { |
399 | fz_strlcpy(path, base_uri, sizeof path); |
400 | fz_strlcat(path, "/" , sizeof path); |
401 | fz_strlcat(path, src, sizeof path); |
402 | fz_urldecode(path); |
403 | fz_cleanname(path); |
404 | buf = fz_read_archive_entry(ctx, zip, path); |
405 | } |
406 | #if FZ_ENABLE_SVG |
407 | if (strstr(src, ".svg" )) |
408 | img = fz_new_image_from_svg(ctx, buf, base_uri, zip); |
409 | else |
410 | #endif |
411 | img = fz_new_image_from_buffer(ctx, buf); |
412 | } |
413 | fz_always(ctx) |
414 | fz_drop_buffer(ctx, buf); |
415 | fz_catch(ctx) |
416 | fz_warn(ctx, "html: cannot load image src='%s'" , src); |
417 | |
418 | return img; |
419 | } |
420 | |
421 | static fz_image *load_svg_image(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_xml *xml) |
422 | { |
423 | fz_image *img = NULL; |
424 | fz_try(ctx) |
425 | img = fz_new_image_from_svg_xml(ctx, xml, base_uri, zip); |
426 | fz_catch(ctx) |
427 | fz_warn(ctx, "html: cannot load embedded svg document" ); |
428 | return img; |
429 | } |
430 | |
431 | static void generate_anchor(fz_context *ctx, fz_html_box *box, struct genstate *g) |
432 | { |
433 | fz_pool *pool = g->pool; |
434 | fz_html_box *flow = box; |
435 | while (flow->type != BOX_FLOW) |
436 | flow = flow->up; |
437 | add_flow_anchor(ctx, pool, flow, box); |
438 | } |
439 | |
440 | static void generate_image(fz_context *ctx, fz_html_box *box, fz_image *img, struct genstate *g) |
441 | { |
442 | fz_html_box *flow = box; |
443 | fz_pool *pool = g->pool; |
444 | while (flow->type != BOX_FLOW) |
445 | flow = flow->up; |
446 | |
447 | flush_space(ctx, flow, box, 0, g); |
448 | |
449 | if (!img) |
450 | { |
451 | const char *alt = "[image]" ; |
452 | add_flow_word(ctx, pool, flow, box, alt, alt + 7, 0); |
453 | } |
454 | else |
455 | { |
456 | fz_try(ctx) |
457 | { |
458 | add_flow_sbreak(ctx, pool, flow, box); |
459 | add_flow_image(ctx, pool, flow, box, img); |
460 | add_flow_sbreak(ctx, pool, flow, box); |
461 | } |
462 | fz_always(ctx) |
463 | { |
464 | fz_drop_image(ctx, img); |
465 | } |
466 | fz_catch(ctx) |
467 | fz_rethrow(ctx); |
468 | } |
469 | |
470 | g->at_bol = 0; |
471 | } |
472 | |
473 | static void init_box(fz_context *ctx, fz_html_box *box, fz_bidi_direction markup_dir) |
474 | { |
475 | box->type = BOX_BLOCK; |
476 | box->x = box->y = 0; |
477 | box->w = box->b = 0; |
478 | |
479 | box->up = NULL; |
480 | box->last = NULL; |
481 | box->down = NULL; |
482 | box->next = NULL; |
483 | |
484 | box->flow_head = NULL; |
485 | box->flow_tail = &box->flow_head; |
486 | box->markup_dir = markup_dir; |
487 | |
488 | fz_default_css_style(ctx, &box->style); |
489 | } |
490 | |
491 | static void fz_drop_html_box(fz_context *ctx, fz_html_box *box) |
492 | { |
493 | while (box) |
494 | { |
495 | fz_html_box *next = box->next; |
496 | fz_drop_html_flow(ctx, box->flow_head); |
497 | fz_drop_html_box(ctx, box->down); |
498 | box = next; |
499 | } |
500 | } |
501 | |
502 | void fz_drop_html(fz_context *ctx, fz_html *html) |
503 | { |
504 | if (html) |
505 | { |
506 | fz_drop_html_box(ctx, html->root); |
507 | fz_drop_pool(ctx, html->pool); |
508 | } |
509 | } |
510 | |
511 | static fz_html_box *new_box(fz_context *ctx, fz_pool *pool, fz_bidi_direction markup_dir) |
512 | { |
513 | fz_html_box *box = fz_pool_alloc(ctx, pool, sizeof *box); |
514 | init_box(ctx, box, markup_dir); |
515 | return box; |
516 | } |
517 | |
518 | static void insert_box(fz_context *ctx, fz_html_box *box, int type, fz_html_box *top) |
519 | { |
520 | box->type = type; |
521 | |
522 | box->up = top; |
523 | |
524 | if (top) |
525 | { |
526 | if (!top->last) |
527 | { |
528 | top->down = top->last = box; |
529 | } |
530 | else |
531 | { |
532 | top->last->next = box; |
533 | top->last = box; |
534 | } |
535 | } |
536 | } |
537 | |
538 | static fz_html_box *insert_block_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
539 | { |
540 | if (top->type == BOX_BLOCK) |
541 | { |
542 | insert_box(ctx, box, BOX_BLOCK, top); |
543 | } |
544 | else if (top->type == BOX_FLOW) |
545 | { |
546 | while (top->type != BOX_BLOCK) |
547 | top = top->up; |
548 | insert_box(ctx, box, BOX_BLOCK, top); |
549 | } |
550 | else if (top->type == BOX_INLINE) |
551 | { |
552 | while (top->type != BOX_BLOCK) |
553 | top = top->up; |
554 | insert_box(ctx, box, BOX_BLOCK, top); |
555 | } |
556 | return top; |
557 | } |
558 | |
559 | static fz_html_box *insert_table_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
560 | { |
561 | top = insert_block_box(ctx, box, top); |
562 | box->type = BOX_TABLE; |
563 | return top; |
564 | } |
565 | |
566 | static fz_html_box *insert_table_row_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
567 | { |
568 | fz_html_box *table = top; |
569 | while (table && table->type != BOX_TABLE) |
570 | table = table->up; |
571 | if (table) |
572 | { |
573 | insert_box(ctx, box, BOX_TABLE_ROW, table); |
574 | return table; |
575 | } |
576 | fz_warn(ctx, "table-row not inside table element" ); |
577 | insert_block_box(ctx, box, top); |
578 | return top; |
579 | } |
580 | |
581 | static fz_html_box *insert_table_cell_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
582 | { |
583 | fz_html_box *tr = top; |
584 | while (tr && tr->type != BOX_TABLE_ROW) |
585 | tr = tr->up; |
586 | if (tr) |
587 | { |
588 | insert_box(ctx, box, BOX_TABLE_CELL, tr); |
589 | return tr; |
590 | } |
591 | fz_warn(ctx, "table-cell not inside table-row element" ); |
592 | insert_block_box(ctx, box, top); |
593 | return top; |
594 | } |
595 | |
596 | static fz_html_box *insert_break_box(fz_context *ctx, fz_html_box *box, fz_html_box *top) |
597 | { |
598 | if (top->type == BOX_BLOCK) |
599 | { |
600 | insert_box(ctx, box, BOX_BREAK, top); |
601 | } |
602 | else if (top->type == BOX_FLOW) |
603 | { |
604 | while (top->type != BOX_BLOCK) |
605 | top = top->up; |
606 | insert_box(ctx, box, BOX_BREAK, top); |
607 | } |
608 | else if (top->type == BOX_INLINE) |
609 | { |
610 | while (top->type != BOX_BLOCK) |
611 | top = top->up; |
612 | insert_box(ctx, box, BOX_BREAK, top); |
613 | } |
614 | return top; |
615 | } |
616 | |
617 | static void insert_inline_box(fz_context *ctx, fz_html_box *box, fz_html_box *top, int markup_dir, struct genstate *g) |
618 | { |
619 | if (top->type == BOX_FLOW || top->type == BOX_INLINE) |
620 | { |
621 | insert_box(ctx, box, BOX_INLINE, top); |
622 | } |
623 | else |
624 | { |
625 | while (top->type != BOX_BLOCK && top->type != BOX_TABLE_CELL) |
626 | top = top->up; |
627 | |
628 | if (top->last && top->last->type == BOX_FLOW) |
629 | { |
630 | insert_box(ctx, box, BOX_INLINE, top->last); |
631 | } |
632 | else |
633 | { |
634 | fz_html_box *flow = new_box(ctx, g->pool, markup_dir); |
635 | flow->is_first_flow = !top->last; |
636 | insert_box(ctx, flow, BOX_FLOW, top); |
637 | insert_box(ctx, box, BOX_INLINE, flow); |
638 | g->at_bol = 1; |
639 | } |
640 | } |
641 | } |
642 | |
643 | static fz_html_box * |
644 | generate_boxes(fz_context *ctx, |
645 | fz_xml *node, |
646 | fz_html_box *top, |
647 | fz_css_match *up_match, |
648 | int list_counter, |
649 | int section_depth, |
650 | int markup_dir, |
651 | int markup_lang, |
652 | struct genstate *g) |
653 | { |
654 | fz_css_match match; |
655 | fz_html_box *box, *last_top; |
656 | const char *tag; |
657 | int display; |
658 | |
659 | while (node) |
660 | { |
661 | match.up = up_match; |
662 | match.count = 0; |
663 | |
664 | tag = fz_xml_tag(node); |
665 | if (tag) |
666 | { |
667 | fz_match_css(ctx, &match, g->css, node); |
668 | |
669 | display = fz_get_css_match_display(&match); |
670 | |
671 | if (tag[0]=='b' && tag[1]=='r' && tag[2]==0) |
672 | { |
673 | if (top->type == BOX_INLINE) |
674 | { |
675 | fz_html_box *flow = top; |
676 | while (flow->type != BOX_FLOW) |
677 | flow = flow->up; |
678 | add_flow_break(ctx, g->pool, flow, top); |
679 | } |
680 | else |
681 | { |
682 | box = new_box(ctx, g->pool, markup_dir); |
683 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
684 | top = insert_break_box(ctx, box, top); |
685 | } |
686 | g->at_bol = 1; |
687 | } |
688 | |
689 | else if (tag[0]=='i' && tag[1]=='m' && tag[2]=='g' && tag[3]==0) |
690 | { |
691 | const char *src = fz_xml_att(node, "src" ); |
692 | if (src) |
693 | { |
694 | int w, h; |
695 | const char *w_att = fz_xml_att(node, "width" ); |
696 | const char *h_att = fz_xml_att(node, "height" ); |
697 | box = new_box(ctx, g->pool, markup_dir); |
698 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
699 | if (w_att && (w = fz_atoi(w_att)) > 0) |
700 | { |
701 | box->style.width.value = w; |
702 | box->style.width.unit = strchr(w_att, '%') ? N_PERCENT : N_LENGTH; |
703 | } |
704 | if (h_att && (h = fz_atoi(h_att)) > 0) |
705 | { |
706 | box->style.height.value = h; |
707 | box->style.height.unit = strchr(h_att, '%') ? N_PERCENT : N_LENGTH; |
708 | } |
709 | insert_inline_box(ctx, box, top, markup_dir, g); |
710 | generate_image(ctx, box, load_html_image(ctx, g->zip, g->base_uri, src), g); |
711 | } |
712 | } |
713 | |
714 | else if (tag[0]=='s' && tag[1]=='v' && tag[2]=='g' && tag[3]==0) |
715 | { |
716 | box = new_box(ctx, g->pool, markup_dir); |
717 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
718 | insert_inline_box(ctx, box, top, markup_dir, g); |
719 | generate_image(ctx, box, load_svg_image(ctx, g->zip, g->base_uri, node), g); |
720 | } |
721 | |
722 | else if (g->is_fb2 && tag[0]=='i' && tag[1]=='m' && tag[2]=='a' && tag[3]=='g' && tag[4]=='e' && tag[5]==0) |
723 | { |
724 | const char *src = fz_xml_att(node, "l:href" ); |
725 | if (!src) |
726 | src = fz_xml_att(node, "xlink:href" ); |
727 | if (src && src[0] == '#') |
728 | { |
729 | fz_image *img = fz_tree_lookup(ctx, g->images, src+1); |
730 | if (display == DIS_BLOCK) |
731 | { |
732 | fz_html_box *imgbox; |
733 | box = new_box(ctx, g->pool, markup_dir); |
734 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
735 | top = insert_block_box(ctx, box, top); |
736 | imgbox = new_box(ctx, g->pool, markup_dir); |
737 | fz_apply_css_style(ctx, g->set, &imgbox->style, &match); |
738 | insert_inline_box(ctx, imgbox, box, markup_dir, g); |
739 | generate_image(ctx, imgbox, fz_keep_image(ctx, img), g); |
740 | } |
741 | else if (display == DIS_INLINE) |
742 | { |
743 | box = new_box(ctx, g->pool, markup_dir); |
744 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
745 | insert_inline_box(ctx, box, top, markup_dir, g); |
746 | generate_image(ctx, box, fz_keep_image(ctx, img), g); |
747 | } |
748 | } |
749 | } |
750 | |
751 | else if (display != DIS_NONE) |
752 | { |
753 | const char *dir, *lang, *id, *href; |
754 | int child_dir = markup_dir; |
755 | int child_lang = markup_lang; |
756 | |
757 | dir = fz_xml_att(node, "dir" ); |
758 | if (dir) |
759 | { |
760 | if (!strcmp(dir, "auto" )) |
761 | child_dir = FZ_BIDI_NEUTRAL; |
762 | else if (!strcmp(dir, "rtl" )) |
763 | child_dir = FZ_BIDI_RTL; |
764 | else if (!strcmp(dir, "ltr" )) |
765 | child_dir = FZ_BIDI_LTR; |
766 | else |
767 | child_dir = DEFAULT_DIR; |
768 | } |
769 | |
770 | lang = fz_xml_att(node, "lang" ); |
771 | if (lang) |
772 | child_lang = fz_text_language_from_string(lang); |
773 | |
774 | box = new_box(ctx, g->pool, child_dir); |
775 | fz_apply_css_style(ctx, g->set, &box->style, &match); |
776 | |
777 | id = fz_xml_att(node, "id" ); |
778 | if (id) |
779 | box->id = fz_pool_strdup(ctx, g->pool, id); |
780 | |
781 | if (display == DIS_BLOCK || display == DIS_INLINE_BLOCK) |
782 | { |
783 | top = insert_block_box(ctx, box, top); |
784 | if (g->is_fb2) |
785 | { |
786 | if (!strcmp(tag, "title" ) || !strcmp(tag, "subtitle" )) |
787 | box->heading = fz_mini(section_depth, 6); |
788 | } |
789 | else |
790 | { |
791 | if (tag[0]=='h' && tag[1]>='1' && tag[1]<='6' && tag[2]==0) |
792 | box->heading = tag[1] - '0'; |
793 | } |
794 | } |
795 | else if (display == DIS_LIST_ITEM) |
796 | { |
797 | top = insert_block_box(ctx, box, top); |
798 | box->list_item = ++list_counter; |
799 | } |
800 | else if (display == DIS_INLINE) |
801 | { |
802 | insert_inline_box(ctx, box, top, child_dir, g); |
803 | if (id) |
804 | generate_anchor(ctx, box, g); |
805 | if (tag[0]=='a' && tag[1]==0) |
806 | { |
807 | if (g->is_fb2) |
808 | { |
809 | href = fz_xml_att(node, "l:href" ); |
810 | if (!href) |
811 | href = fz_xml_att(node, "xlink:href" ); |
812 | } |
813 | else |
814 | href = fz_xml_att(node, g->is_fb2 ? "l:href" : "href" ); |
815 | if (href) |
816 | box->href = fz_pool_strdup(ctx, g->pool, href); |
817 | } |
818 | } |
819 | else if (display == DIS_TABLE) |
820 | { |
821 | top = insert_table_box(ctx, box, top); |
822 | } |
823 | else if (display == DIS_TABLE_ROW) |
824 | { |
825 | top = insert_table_row_box(ctx, box, top); |
826 | } |
827 | else if (display == DIS_TABLE_CELL) |
828 | { |
829 | top = insert_table_cell_box(ctx, box, top); |
830 | } |
831 | else |
832 | { |
833 | fz_warn(ctx, "unknown box display type" ); |
834 | insert_box(ctx, box, BOX_BLOCK, top); |
835 | } |
836 | |
837 | if (fz_xml_down(node)) |
838 | { |
839 | int child_counter = list_counter; |
840 | int child_section = section_depth; |
841 | if (!strcmp(tag, "ul" ) || !strcmp(tag, "ol" )) |
842 | child_counter = 0; |
843 | if (!strcmp(tag, "section" )) |
844 | ++child_section; |
845 | last_top = generate_boxes(ctx, |
846 | fz_xml_down(node), |
847 | box, |
848 | &match, |
849 | child_counter, |
850 | child_section, |
851 | child_dir, |
852 | child_lang, |
853 | g); |
854 | if (last_top != box) |
855 | top = last_top; |
856 | } |
857 | } |
858 | } |
859 | else |
860 | { |
861 | const char *text = fz_xml_text(node); |
862 | int collapse = top->style.white_space & WS_COLLAPSE; |
863 | if (collapse && is_all_white(text)) |
864 | { |
865 | g->emit_white = 1; |
866 | } |
867 | else |
868 | { |
869 | if (top->type != BOX_INLINE) |
870 | { |
871 | /* Create anonymous inline box, with the same style as the top block box. */ |
872 | box = new_box(ctx, g->pool, markup_dir); |
873 | insert_inline_box(ctx, box, top, markup_dir, g); |
874 | box->style = top->style; |
875 | /* Make sure not to recursively multiply font sizes. */ |
876 | box->style.font_size.value = 1; |
877 | box->style.font_size.unit = N_SCALE; |
878 | generate_text(ctx, box, text, markup_lang, g); |
879 | } |
880 | else |
881 | { |
882 | generate_text(ctx, top, text, markup_lang, g); |
883 | } |
884 | } |
885 | } |
886 | |
887 | node = fz_xml_next(node); |
888 | } |
889 | |
890 | return top; |
891 | } |
892 | |
893 | static char *concat_text(fz_context *ctx, fz_xml *root) |
894 | { |
895 | fz_xml *node; |
896 | size_t i = 0, n = 1; |
897 | char *s; |
898 | for (node = fz_xml_down(root); node; node = fz_xml_next(node)) |
899 | { |
900 | const char *text = fz_xml_text(node); |
901 | n += text ? strlen(text) : 0; |
902 | } |
903 | s = fz_malloc(ctx, n); |
904 | for (node = fz_xml_down(root); node; node = fz_xml_next(node)) |
905 | { |
906 | const char *text = fz_xml_text(node); |
907 | if (text) |
908 | { |
909 | n = strlen(text); |
910 | memcpy(s+i, text, n); |
911 | i += n; |
912 | } |
913 | } |
914 | s[i] = 0; |
915 | return s; |
916 | } |
917 | |
918 | static void |
919 | html_load_css(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) |
920 | { |
921 | fz_xml *html, *head, *node; |
922 | fz_buffer *buf; |
923 | char path[2048]; |
924 | |
925 | fz_var(buf); |
926 | |
927 | html = fz_xml_find(root, "html" ); |
928 | head = fz_xml_find_down(html, "head" ); |
929 | for (node = fz_xml_down(head); node; node = fz_xml_next(node)) |
930 | { |
931 | if (fz_xml_is_tag(node, "link" )) |
932 | { |
933 | char *rel = fz_xml_att(node, "rel" ); |
934 | if (rel && !fz_strcasecmp(rel, "stylesheet" )) |
935 | { |
936 | char *type = fz_xml_att(node, "type" ); |
937 | if ((type && !strcmp(type, "text/css" )) || !type) |
938 | { |
939 | char *href = fz_xml_att(node, "href" ); |
940 | if (href) |
941 | { |
942 | fz_strlcpy(path, base_uri, sizeof path); |
943 | fz_strlcat(path, "/" , sizeof path); |
944 | fz_strlcat(path, href, sizeof path); |
945 | fz_urldecode(path); |
946 | fz_cleanname(path); |
947 | |
948 | buf = NULL; |
949 | fz_try(ctx) |
950 | { |
951 | buf = fz_read_archive_entry(ctx, zip, path); |
952 | fz_parse_css(ctx, css, fz_string_from_buffer(ctx, buf), path); |
953 | } |
954 | fz_always(ctx) |
955 | fz_drop_buffer(ctx, buf); |
956 | fz_catch(ctx) |
957 | fz_warn(ctx, "ignoring stylesheet %s" , path); |
958 | } |
959 | } |
960 | } |
961 | } |
962 | else if (fz_xml_is_tag(node, "style" )) |
963 | { |
964 | char *s = concat_text(ctx, node); |
965 | fz_try(ctx) |
966 | fz_parse_css(ctx, css, s, "<style>" ); |
967 | fz_catch(ctx) |
968 | fz_warn(ctx, "ignoring inline stylesheet" ); |
969 | fz_free(ctx, s); |
970 | } |
971 | } |
972 | } |
973 | |
974 | static void |
975 | fb2_load_css(fz_context *ctx, fz_archive *zip, const char *base_uri, fz_css *css, fz_xml *root) |
976 | { |
977 | fz_xml *fictionbook, *stylesheet; |
978 | |
979 | fictionbook = fz_xml_find(root, "FictionBook" ); |
980 | stylesheet = fz_xml_find_down(fictionbook, "stylesheet" ); |
981 | if (stylesheet) |
982 | { |
983 | char *s = concat_text(ctx, stylesheet); |
984 | fz_try(ctx) |
985 | fz_parse_css(ctx, css, s, "<stylesheet>" ); |
986 | fz_catch(ctx) |
987 | fz_warn(ctx, "ignoring inline stylesheet" ); |
988 | fz_free(ctx, s); |
989 | } |
990 | } |
991 | |
992 | static fz_tree * |
993 | load_fb2_images(fz_context *ctx, fz_xml *root) |
994 | { |
995 | fz_xml *fictionbook, *binary; |
996 | fz_tree *images = NULL; |
997 | |
998 | fictionbook = fz_xml_find(root, "FictionBook" ); |
999 | for (binary = fz_xml_find_down(fictionbook, "binary" ); binary; binary = fz_xml_find_next(binary, "binary" )) |
1000 | { |
1001 | const char *id = fz_xml_att(binary, "id" ); |
1002 | char *b64 = NULL; |
1003 | fz_buffer *buf = NULL; |
1004 | fz_image *img = NULL; |
1005 | |
1006 | fz_var(b64); |
1007 | fz_var(buf); |
1008 | |
1009 | fz_try(ctx) |
1010 | { |
1011 | b64 = concat_text(ctx, binary); |
1012 | buf = fz_new_buffer_from_base64(ctx, b64, strlen(b64)); |
1013 | img = fz_new_image_from_buffer(ctx, buf); |
1014 | } |
1015 | fz_always(ctx) |
1016 | { |
1017 | fz_drop_buffer(ctx, buf); |
1018 | fz_free(ctx, b64); |
1019 | } |
1020 | fz_catch(ctx) |
1021 | fz_rethrow(ctx); |
1022 | |
1023 | images = fz_tree_insert(ctx, images, id, img); |
1024 | } |
1025 | |
1026 | return images; |
1027 | } |
1028 | |
/*
 * Growable scratch buffer of Unicode code points, reused while detecting
 * the directionality of flow text.
 */
typedef struct
{
	uint32_t *data;	/* code point storage; NULL until first grown */
	size_t cap;	/* allocated size of data, in elements */
	size_t len;	/* number of elements currently in use */
} uni_buf;
1035 | |
/*
 * State handed to fragment_cb through the opaque callback argument of
 * fz_bidi_fragment_text.
 */
typedef struct
{
	fz_context *ctx;	/* context passed on to split_flow */
	fz_pool *pool;		/* pool passed on to split_flow */
	fz_html_flow *flow;	/* next flow node to be labelled; advanced by fragment_cb */
	uni_buf *buffer;	/* buffer holding the text being fragmented */
} bidi_data;
1043 | |
1044 | static void fragment_cb(const uint32_t *fragment, |
1045 | size_t fragment_len, |
1046 | int bidi_level, |
1047 | int script, |
1048 | void *arg) |
1049 | { |
1050 | bidi_data *data = (bidi_data *)arg; |
1051 | size_t fragment_offset = fragment - data->buffer->data; |
1052 | |
1053 | /* We are guaranteed that fragmentOffset will be at the beginning |
1054 | * of flow. */ |
1055 | while (fragment_len > 0) |
1056 | { |
1057 | size_t len; |
1058 | |
1059 | if (data->flow->type == FLOW_SPACE) |
1060 | { |
1061 | len = 1; |
1062 | } |
1063 | else if (data->flow->type == FLOW_BREAK || data->flow->type == FLOW_SBREAK || |
1064 | data->flow->type == FLOW_SHYPHEN || data->flow->type == FLOW_ANCHOR) |
1065 | { |
1066 | len = 0; |
1067 | } |
1068 | else |
1069 | { |
1070 | /* Must be text */ |
1071 | len = fz_utflen(data->flow->content.text); |
1072 | if (len > fragment_len) |
1073 | { |
1074 | /* We need to split this flow box */ |
1075 | (void)split_flow(data->ctx, data->pool, data->flow, fragment_len); |
1076 | len = fz_utflen(data->flow->content.text); |
1077 | } |
1078 | } |
1079 | |
1080 | /* This flow box is entirely contained within this fragment. */ |
1081 | data->flow->bidi_level = bidi_level; |
1082 | data->flow->script = script; |
1083 | data->flow = data->flow->next; |
1084 | fragment_offset += len; |
1085 | fragment_len -= len; |
1086 | } |
1087 | } |
1088 | |
1089 | static fz_bidi_direction |
1090 | detect_flow_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_bidi_direction bidi_dir, fz_html_flow *flow) |
1091 | { |
1092 | fz_html_flow *end = flow; |
1093 | bidi_data data; |
1094 | |
1095 | while (end) |
1096 | { |
1097 | int level = end->bidi_level; |
1098 | |
1099 | /* Gather the text from the flow up into a single buffer (at |
1100 | * least, as much of it as has the same direction markup). */ |
1101 | buffer->len = 0; |
1102 | while (end && (level & 1) == (end->bidi_level & 1)) |
1103 | { |
1104 | size_t len = 0; |
1105 | const char *text = "" ; |
1106 | int broken = 0; |
1107 | |
1108 | switch (end->type) |
1109 | { |
1110 | case FLOW_WORD: |
1111 | len = fz_utflen(end->content.text); |
1112 | text = end->content.text; |
1113 | break; |
1114 | case FLOW_SPACE: |
1115 | len = 1; |
1116 | text = " " ; |
1117 | break; |
1118 | case FLOW_SHYPHEN: |
1119 | case FLOW_SBREAK: |
1120 | break; |
1121 | case FLOW_BREAK: |
1122 | case FLOW_IMAGE: |
1123 | broken = 1; |
1124 | break; |
1125 | } |
1126 | |
1127 | end = end->next; |
1128 | |
1129 | if (broken) |
1130 | break; |
1131 | |
1132 | /* Make sure the buffer is large enough */ |
1133 | if (buffer->len + len > buffer->cap) |
1134 | { |
1135 | size_t newcap = buffer->cap; |
1136 | if (newcap < 128) |
1137 | newcap = 128; /* Sensible small default */ |
1138 | |
1139 | while (newcap < buffer->len + len) |
1140 | newcap = (newcap * 3) / 2; |
1141 | |
1142 | buffer->data = fz_realloc_array(ctx, buffer->data, newcap, uint32_t); |
1143 | buffer->cap = newcap; |
1144 | } |
1145 | |
1146 | /* Expand the utf8 text into Unicode and store it in the buffer */ |
1147 | while (*text) |
1148 | { |
1149 | int rune; |
1150 | text += fz_chartorune(&rune, text); |
1151 | buffer->data[buffer->len++] = rune; |
1152 | } |
1153 | } |
1154 | |
1155 | /* Detect directionality for the buffer */ |
1156 | data.ctx = ctx; |
1157 | data.pool = pool; |
1158 | data.flow = flow; |
1159 | data.buffer = buffer; |
1160 | fz_bidi_fragment_text(ctx, buffer->data, buffer->len, &bidi_dir, fragment_cb, &data, 0 /* Flags */); |
1161 | } |
1162 | return bidi_dir; |
1163 | } |
1164 | |
1165 | static void |
1166 | detect_box_directionality(fz_context *ctx, fz_pool *pool, uni_buf *buffer, fz_html_box *box) |
1167 | { |
1168 | while (box) |
1169 | { |
1170 | if (box->flow_head) |
1171 | box->markup_dir = detect_flow_directionality(ctx, pool, buffer, box->markup_dir, box->flow_head); |
1172 | detect_box_directionality(ctx, pool, buffer, box->down); |
1173 | box = box->next; |
1174 | } |
1175 | } |
1176 | |
1177 | static void |
1178 | detect_directionality(fz_context *ctx, fz_pool *pool, fz_html_box *box) |
1179 | { |
1180 | uni_buf buffer = { NULL }; |
1181 | |
1182 | fz_try(ctx) |
1183 | detect_box_directionality(ctx, pool, &buffer, box); |
1184 | fz_always(ctx) |
1185 | fz_free(ctx, buffer.data); |
1186 | fz_catch(ctx) |
1187 | fz_rethrow(ctx); |
1188 | } |
1189 | |
1190 | fz_html * |
1191 | fz_parse_html(fz_context *ctx, fz_html_font_set *set, fz_archive *zip, const char *base_uri, fz_buffer *buf, const char *user_css) |
1192 | { |
1193 | fz_xml_doc *xml; |
1194 | fz_xml *root, *node; |
1195 | fz_html *html = NULL; |
1196 | char *title; |
1197 | |
1198 | fz_css_match match; |
1199 | struct genstate g; |
1200 | |
1201 | g.pool = NULL; |
1202 | g.set = set; |
1203 | g.zip = zip; |
1204 | g.images = NULL; |
1205 | g.base_uri = base_uri; |
1206 | g.css = NULL; |
1207 | g.at_bol = 0; |
1208 | g.emit_white = 0; |
1209 | g.last_brk_cls = UCDN_LINEBREAK_CLASS_OP; |
1210 | |
1211 | xml = fz_parse_xml(ctx, buf, 1); |
1212 | root = fz_xml_root(xml); |
1213 | |
1214 | fz_try(ctx) |
1215 | g.css = fz_new_css(ctx); |
1216 | fz_catch(ctx) |
1217 | { |
1218 | fz_drop_xml(ctx, xml); |
1219 | fz_rethrow(ctx); |
1220 | } |
1221 | |
1222 | #ifndef NDEBUG |
1223 | if (fz_atoi(getenv("FZ_DEBUG_XML" ))) |
1224 | fz_debug_xml(root, 0); |
1225 | #endif |
1226 | |
1227 | fz_try(ctx) |
1228 | { |
1229 | if (fz_xml_find(root, "FictionBook" )) |
1230 | { |
1231 | g.is_fb2 = 1; |
1232 | fz_parse_css(ctx, g.css, fb2_default_css, "<default:fb2>" ); |
1233 | if (fz_use_document_css(ctx)) |
1234 | fb2_load_css(ctx, g.zip, g.base_uri, g.css, root); |
1235 | g.images = load_fb2_images(ctx, root); |
1236 | } |
1237 | else |
1238 | { |
1239 | g.is_fb2 = 0; |
1240 | fz_parse_css(ctx, g.css, html_default_css, "<default:html>" ); |
1241 | if (fz_use_document_css(ctx)) |
1242 | html_load_css(ctx, g.zip, g.base_uri, g.css, root); |
1243 | g.images = NULL; |
1244 | } |
1245 | |
1246 | if (user_css) |
1247 | fz_parse_css(ctx, g.css, user_css, "<user>" ); |
1248 | |
1249 | fz_add_css_font_faces(ctx, g.set, g.zip, g.base_uri, g.css); /* load @font-face fonts into font set */ |
1250 | } |
1251 | fz_catch(ctx) |
1252 | { |
1253 | fz_rethrow_if(ctx, FZ_ERROR_TRYLATER); |
1254 | fz_warn(ctx, "ignoring styles due to errors: %s" , fz_caught_message(ctx)); |
1255 | } |
1256 | |
1257 | #ifndef NDEBUG |
1258 | if (fz_atoi(getenv("FZ_DEBUG_CSS" ))) |
1259 | fz_debug_css(ctx, g.css); |
1260 | #endif |
1261 | |
1262 | fz_try(ctx) |
1263 | { |
1264 | g.pool = fz_new_pool(ctx); |
1265 | html = fz_pool_alloc(ctx, g.pool, sizeof *html); |
1266 | html->pool = g.pool; |
1267 | html->root = new_box(ctx, g.pool, DEFAULT_DIR); |
1268 | |
1269 | match.up = NULL; |
1270 | match.count = 0; |
1271 | fz_match_css_at_page(ctx, &match, g.css); |
1272 | fz_apply_css_style(ctx, g.set, &html->root->style, &match); |
1273 | // TODO: transfer page margins out of this hacky box |
1274 | |
1275 | generate_boxes(ctx, root, html->root, &match, 0, 0, DEFAULT_DIR, FZ_LANG_UNSET, &g); |
1276 | |
1277 | detect_directionality(ctx, g.pool, html->root); |
1278 | |
1279 | if (g.is_fb2) |
1280 | { |
1281 | node = fz_xml_find(root, "FictionBook" ); |
1282 | node = fz_xml_find_down(node, "description" ); |
1283 | node = fz_xml_find_down(node, "title-info" ); |
1284 | node = fz_xml_find_down(node, "book-title" ); |
1285 | title = fz_xml_text(fz_xml_down(node)); |
1286 | if (title) |
1287 | html->title = fz_pool_strdup(ctx, g.pool, title); |
1288 | } |
1289 | else |
1290 | { |
1291 | node = fz_xml_find(root, "html" ); |
1292 | node = fz_xml_find_down(node, "head" ); |
1293 | node = fz_xml_find_down(node, "title" ); |
1294 | title = fz_xml_text(fz_xml_down(node)); |
1295 | if (title) |
1296 | html->title = fz_pool_strdup(ctx, g.pool, title); |
1297 | } |
1298 | } |
1299 | fz_always(ctx) |
1300 | { |
1301 | fz_drop_tree(ctx, g.images, (void(*)(fz_context*,void*))fz_drop_image); |
1302 | fz_drop_css(ctx, g.css); |
1303 | fz_drop_xml(ctx, xml); |
1304 | } |
1305 | fz_catch(ctx) |
1306 | { |
1307 | fz_drop_html(ctx, html); |
1308 | fz_rethrow(ctx); |
1309 | } |
1310 | |
1311 | return html; |
1312 | } |
1313 | |
/* Write level tab characters to standard output; nothing for level <= 0. */
static void indent(int level)
{
	int i;

	for (i = 0; i < level; i++)
		putchar('\t');
}
1319 | |
1320 | static void |
1321 | fz_debug_html_flow(fz_context *ctx, fz_html_flow *flow, int level) |
1322 | { |
1323 | fz_html_box *sbox = NULL; |
1324 | while (flow) |
1325 | { |
1326 | if (flow->box != sbox) { |
1327 | if (sbox) { |
1328 | indent(level); |
1329 | printf("}\n" ); |
1330 | } |
1331 | sbox = flow->box; |
1332 | indent(level); |
1333 | printf("span em=%g font='%s'" , sbox->em, fz_font_name(ctx, sbox->style.font)); |
1334 | if (fz_font_is_serif(ctx, sbox->style.font)) |
1335 | printf(" serif" ); |
1336 | else |
1337 | printf(" sans" ); |
1338 | if (fz_font_is_monospaced(ctx, sbox->style.font)) |
1339 | printf(" monospaced" ); |
1340 | if (fz_font_is_bold(ctx, sbox->style.font)) |
1341 | printf(" bold" ); |
1342 | if (fz_font_is_italic(ctx, sbox->style.font)) |
1343 | printf(" italic" ); |
1344 | if (sbox->style.small_caps) |
1345 | printf(" small-caps" ); |
1346 | printf("\n" ); |
1347 | indent(level); |
1348 | printf("{\n" ); |
1349 | } |
1350 | |
1351 | indent(level+1); |
1352 | switch (flow->type) { |
1353 | case FLOW_WORD: printf("word " ); break; |
1354 | case FLOW_SPACE: printf("space" ); break; |
1355 | case FLOW_SBREAK: printf("sbrk " ); break; |
1356 | case FLOW_SHYPHEN: printf("shy " ); break; |
1357 | case FLOW_BREAK: printf("break" ); break; |
1358 | case FLOW_IMAGE: printf("image" ); break; |
1359 | case FLOW_ANCHOR: printf("anchor" ); break; |
1360 | } |
1361 | printf(" y=%g x=%g w=%g" , flow->y, flow->x, flow->w); |
1362 | if (flow->type == FLOW_IMAGE) |
1363 | printf(" h=%g" , flow->h); |
1364 | if (flow->type == FLOW_WORD) |
1365 | printf(" text='%s'" , flow->content.text); |
1366 | printf("\n" ); |
1367 | if (flow->breaks_line) { |
1368 | indent(level+1); |
1369 | printf("*\n" ); |
1370 | } |
1371 | |
1372 | flow = flow->next; |
1373 | } |
1374 | indent(level); |
1375 | printf("}\n" ); |
1376 | } |
1377 | |
1378 | static void |
1379 | fz_debug_html_box(fz_context *ctx, fz_html_box *box, int level) |
1380 | { |
1381 | while (box) |
1382 | { |
1383 | indent(level); |
1384 | switch (box->type) { |
1385 | case BOX_BLOCK: printf("block" ); break; |
1386 | case BOX_BREAK: printf("break" ); break; |
1387 | case BOX_FLOW: printf("flow" ); break; |
1388 | case BOX_INLINE: printf("inline" ); break; |
1389 | case BOX_TABLE: printf("table" ); break; |
1390 | case BOX_TABLE_ROW: printf("table-row" ); break; |
1391 | case BOX_TABLE_CELL: printf("table-cell" ); break; |
1392 | } |
1393 | |
1394 | printf(" em=%g x=%g y=%g w=%g b=%g\n" , box->em, box->x, box->y, box->w, box->b); |
1395 | |
1396 | indent(level); |
1397 | printf("{\n" ); |
1398 | if (box->type == BOX_BLOCK) { |
1399 | indent(level+1); |
1400 | printf("margin=%g %g %g %g\n" , box->margin[0], box->margin[1], box->margin[2], box->margin[3]); |
1401 | } |
1402 | if (box->is_first_flow) { |
1403 | indent(level+1); |
1404 | printf("is-first-flow\n" ); |
1405 | } |
1406 | if (box->list_item) { |
1407 | indent(level+1); |
1408 | printf("list=%d\n" , box->list_item); |
1409 | } |
1410 | if (box->id) { |
1411 | indent(level+1); |
1412 | printf("id=%s\n" , box->id); |
1413 | } |
1414 | if (box->href) { |
1415 | indent(level+1); |
1416 | printf("href=%s\n" , box->href); |
1417 | } |
1418 | |
1419 | if (box->down) |
1420 | fz_debug_html_box(ctx, box->down, level + 1); |
1421 | if (box->flow_head) |
1422 | fz_debug_html_flow(ctx, box->flow_head, level + 1); |
1423 | |
1424 | indent(level); |
1425 | printf("}\n" ); |
1426 | |
1427 | box = box->next; |
1428 | } |
1429 | } |
1430 | |
/*
 * Dump a box tree to standard output for debugging, starting at box with
 * no indentation. Forwards to fz_debug_html_box at level 0.
 */
void
fz_debug_html(fz_context *ctx, fz_html_box *box)
{
	fz_debug_html_box(ctx, box, 0);
}
1436 | |