1 | #include "mupdf/fitz.h" |
2 | #include "html-imp.h" |
3 | |
4 | #include <string.h> |
5 | |
/*
 * Side indices used to subscript html->page_margin in this file
 * (presumably top, right, bottom, left; only T and L are used here).
 */
enum
{
	T = 0,
	R = 1,
	B = 2,
	L = 3
};
7 | |
/*
 * Decide whether a link target points inside the document (a relative
 * path or a "#fragment") rather than at an external resource.
 *
 * A URI is external when an optional scheme is followed by "://".
 * Scheme characters follow RFC 3986: a letter of either case, then any
 * mix of letters, digits, '+', '-' and '.'. This makes "HTTP://..." and
 * "svn+ssh://..." external, in addition to the all-lowercase schemes
 * that were already recognised.
 *
 * Targets without "://" (for example "mailto:...") are still reported
 * as internal.
 *
 * Returns 0 for an external URI and 1 for everything else.
 */
static int is_internal_uri(const char *uri)
{
	const char *s = uri;

	/* Character ranges are tested by hand to stay locale independent. */
	if ((*s >= 'a' && *s <= 'z') || (*s >= 'A' && *s <= 'Z'))
	{
		++s;
		while ((*s >= 'a' && *s <= 'z') ||
			(*s >= 'A' && *s <= 'Z') ||
			(*s >= '0' && *s <= '9') ||
			*s == '+' || *s == '-' || *s == '.')
			++s;
	}

	if (s[0] == ':' && s[1] == '/' && s[2] == '/')
		return 0;
	return 1;
}
16 | |
17 | static const char *box_href(fz_html_box *box) |
18 | { |
19 | while (box) |
20 | { |
21 | const char *href = box->href; |
22 | if (href) |
23 | return href; |
24 | box = box->up; |
25 | } |
26 | return NULL; |
27 | } |
28 | |
29 | static int has_same_href(fz_html_box *box, const char *old_href) |
30 | { |
31 | while (box) |
32 | { |
33 | const char *href = box->href; |
34 | if (href) |
35 | return !strcmp(old_href, href); |
36 | box = box->up; |
37 | } |
38 | return 0; |
39 | } |
40 | |
41 | static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file) |
42 | { |
43 | fz_link *link; |
44 | fz_html_flow *next; |
45 | char path[2048]; |
46 | fz_rect bbox; |
47 | const char *dest; |
48 | const char *href; |
49 | float end; |
50 | |
51 | while (flow) |
52 | { |
53 | href = box_href(flow->box); |
54 | next = flow->next; |
55 | if (href && (int)(flow->y / page_h) == page) |
56 | { |
57 | /* Coalesce contiguous flow boxes into one link node */ |
58 | end = flow->x + flow->w; |
59 | while (next && |
60 | next->y == flow->y && |
61 | next->h == flow->h && |
62 | has_same_href(next->box, href)) |
63 | { |
64 | end = next->x + next->w; |
65 | next = next->next; |
66 | } |
67 | |
68 | bbox.x0 = flow->x; |
69 | bbox.y0 = flow->y - page * page_h; |
70 | bbox.x1 = end; |
71 | bbox.y1 = bbox.y0 + flow->h; |
72 | if (flow->type != FLOW_IMAGE) |
73 | { |
74 | /* flow->y is the baseline, adjust bbox appropriately */ |
75 | bbox.y0 -= 0.8f * flow->h; |
76 | bbox.y1 -= 0.8f * flow->h; |
77 | } |
78 | |
79 | if (is_internal_uri(href)) |
80 | { |
81 | if (href[0] == '#') |
82 | { |
83 | fz_strlcpy(path, file, sizeof path); |
84 | fz_strlcat(path, href, sizeof path); |
85 | } |
86 | else |
87 | { |
88 | fz_strlcpy(path, dir, sizeof path); |
89 | fz_strlcat(path, "/" , sizeof path); |
90 | fz_strlcat(path, href, sizeof path); |
91 | } |
92 | fz_urldecode(path); |
93 | fz_cleanname(path); |
94 | |
95 | dest = path; |
96 | } |
97 | else |
98 | { |
99 | dest = href; |
100 | } |
101 | |
102 | link = fz_new_link(ctx, bbox, NULL, dest); |
103 | link->next = head; |
104 | head = link; |
105 | } |
106 | flow = next; |
107 | } |
108 | return head; |
109 | } |
110 | |
111 | static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file) |
112 | { |
113 | while (box) |
114 | { |
115 | if (box->flow_head) |
116 | head = load_link_flow(ctx, box->flow_head, head, page, page_h, dir, file); |
117 | if (box->down) |
118 | head = load_link_box(ctx, box->down, head, page, page_h, dir, file); |
119 | box = box->next; |
120 | } |
121 | return head; |
122 | } |
123 | |
124 | fz_link * |
125 | fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file, void *doc) |
126 | { |
127 | fz_link *link, *head; |
128 | char dir[2048]; |
129 | fz_dirname(dir, file, sizeof dir); |
130 | |
131 | head = load_link_box(ctx, html->root, NULL, page, html->page_h, dir, file); |
132 | |
133 | for (link = head; link; link = link->next) |
134 | { |
135 | /* Adjust for page margins */ |
136 | link->rect.x0 += html->page_margin[L]; |
137 | link->rect.x1 += html->page_margin[L]; |
138 | link->rect.y0 += html->page_margin[T]; |
139 | link->rect.y1 += html->page_margin[T]; |
140 | |
141 | /* Set document pointer */ |
142 | link->doc = doc; |
143 | } |
144 | |
145 | return head; |
146 | } |
147 | |
148 | static fz_html_flow * |
149 | find_first_content(fz_html_box *box) |
150 | { |
151 | while (box) |
152 | { |
153 | if (box->type == BOX_FLOW) |
154 | return box->flow_head; |
155 | box = box->down; |
156 | } |
157 | return NULL; |
158 | } |
159 | |
160 | static float |
161 | find_flow_target(fz_html_flow *flow, const char *id) |
162 | { |
163 | while (flow) |
164 | { |
165 | if (flow->box->id && !strcmp(id, flow->box->id)) |
166 | return flow->y; |
167 | flow = flow->next; |
168 | } |
169 | return -1; |
170 | } |
171 | |
172 | static float |
173 | find_box_target(fz_html_box *box, const char *id) |
174 | { |
175 | float y; |
176 | while (box) |
177 | { |
178 | if (box->id && !strcmp(id, box->id)) |
179 | { |
180 | fz_html_flow *flow = find_first_content(box); |
181 | if (flow) |
182 | return flow->y; |
183 | return box->y; |
184 | } |
185 | if (box->type == BOX_FLOW) |
186 | { |
187 | y = find_flow_target(box->flow_head, id); |
188 | if (y >= 0) |
189 | return y; |
190 | } |
191 | else |
192 | { |
193 | y = find_box_target(box->down, id); |
194 | if (y >= 0) |
195 | return y; |
196 | } |
197 | box = box->next; |
198 | } |
199 | return -1; |
200 | } |
201 | |
202 | float |
203 | fz_find_html_target(fz_context *ctx, fz_html *html, const char *id) |
204 | { |
205 | return find_box_target(html->root, id); |
206 | } |
207 | |
208 | static fz_html_flow * |
209 | make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y) |
210 | { |
211 | while (flow) |
212 | { |
213 | if (flow->y >= y) |
214 | return flow; |
215 | flow = flow->next; |
216 | } |
217 | return NULL; |
218 | } |
219 | |
220 | static fz_html_flow * |
221 | make_box_bookmark(fz_context *ctx, fz_html_box *box, float y) |
222 | { |
223 | fz_html_flow *mark; |
224 | while (box) |
225 | { |
226 | if (box->type == BOX_FLOW) |
227 | { |
228 | if (box->y >= y) |
229 | { |
230 | mark = make_flow_bookmark(ctx, box->flow_head, y); |
231 | if (mark) |
232 | return mark; |
233 | } |
234 | } |
235 | else |
236 | { |
237 | mark = make_box_bookmark(ctx, box->down, y); |
238 | if (mark) |
239 | return mark; |
240 | } |
241 | box = box->next; |
242 | } |
243 | return NULL; |
244 | } |
245 | |
246 | fz_bookmark |
247 | fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page) |
248 | { |
249 | return (fz_bookmark)make_box_bookmark(ctx, html->root, page * html->page_h); |
250 | } |
251 | |
252 | static int |
253 | lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark) |
254 | { |
255 | while (flow) |
256 | { |
257 | if (flow == mark) |
258 | return 1; |
259 | flow = flow->next; |
260 | } |
261 | return 0; |
262 | } |
263 | |
264 | static int |
265 | lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark) |
266 | { |
267 | while (box) |
268 | { |
269 | if (box->type == BOX_FLOW) |
270 | { |
271 | if (lookup_flow_bookmark(ctx, box->flow_head, mark)) |
272 | return 1; |
273 | } |
274 | else |
275 | { |
276 | if (lookup_box_bookmark(ctx, box->down, mark)) |
277 | return 1; |
278 | } |
279 | box = box->next; |
280 | } |
281 | return 0; |
282 | } |
283 | |
284 | int |
285 | fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark) |
286 | { |
287 | fz_html_flow *flow = (fz_html_flow*)mark; |
288 | if (flow && lookup_box_bookmark(ctx, html->root, flow)) |
289 | return (int)(flow->y / html->page_h); |
290 | return -1; |
291 | } |
292 | |
/*
 * Working state used while building an outline from the headings of an
 * HTML box tree. The arrays are indexed by nesting depth (0..5).
 */
struct outline_parser
{
	/* Document being scanned; its pool is used for generated heading ids. */
	fz_html *html;
	/* Scratch buffer reused to assemble each heading's title text (created lazily). */
	fz_buffer *cat;
	/* First node of the top-level outline list. */
	fz_outline *head;
	/* Per depth: where the next node at that depth gets linked in. */
	fz_outline **tail[6];
	/* Per depth: address of the `down` pointer of the most recent node at that depth. */
	fz_outline **down[6];
	/* Per depth: heading value of the most recent node at that depth. */
	int level[6];
	/* Depth at which the most recent node was inserted. */
	int current;
	/* Counter used to generate ids for headings that have none. */
	int id;
};
304 | |
305 | static void |
306 | cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow) |
307 | { |
308 | while (flow) |
309 | { |
310 | switch (flow->type) |
311 | { |
312 | case FLOW_WORD: |
313 | fz_append_string(ctx, cat, flow->content.text); |
314 | break; |
315 | case FLOW_SPACE: |
316 | case FLOW_BREAK: |
317 | fz_append_byte(ctx, cat, ' '); |
318 | break; |
319 | default: |
320 | break; |
321 | } |
322 | flow = flow->next; |
323 | } |
324 | } |
325 | |
326 | static void |
327 | cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box) |
328 | { |
329 | while (box) |
330 | { |
331 | cat_html_flow(ctx, cat, box->flow_head); |
332 | cat_html_box(ctx, cat, box->down); |
333 | box = box->next; |
334 | } |
335 | } |
336 | |
337 | static const char * |
338 | cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box) |
339 | { |
340 | if (!x->cat) |
341 | x->cat = fz_new_buffer(ctx, 1024); |
342 | else |
343 | fz_clear_buffer(ctx, x->cat); |
344 | |
345 | cat_html_flow(ctx, x->cat, box->flow_head); |
346 | cat_html_box(ctx, x->cat, box->down); |
347 | |
348 | return fz_string_from_buffer(ctx, x->cat); |
349 | } |
350 | |
351 | static void |
352 | add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) |
353 | { |
354 | fz_outline *node; |
355 | char buf[100]; |
356 | |
357 | node = fz_new_outline(ctx); |
358 | fz_try(ctx) |
359 | { |
360 | node->title = fz_strdup(ctx, cat_html_text(ctx, x, box)); |
361 | if (!box->id) |
362 | { |
363 | fz_snprintf(buf, sizeof buf, "'%d" , x->id++); |
364 | box->id = fz_pool_strdup(ctx, x->html->pool, buf); |
365 | } |
366 | node->uri = fz_asprintf(ctx, "#%s" , box->id); |
367 | node->is_open = 1; |
368 | } |
369 | fz_catch(ctx) |
370 | { |
371 | fz_free(ctx, node); |
372 | fz_rethrow(ctx); |
373 | } |
374 | |
375 | if (x->level[x->current] < box->heading && x->current < 5) |
376 | { |
377 | x->tail[x->current+1] = x->down[x->current]; |
378 | x->current += 1; |
379 | } |
380 | else |
381 | { |
382 | while (x->current > 0 && x->level[x->current] > box->heading) |
383 | { |
384 | x->current -= 1; |
385 | } |
386 | } |
387 | x->level[x->current] = box->heading; |
388 | |
389 | *(x->tail[x->current]) = node; |
390 | x->tail[x->current] = &node->next; |
391 | x->down[x->current] = &node->down; |
392 | } |
393 | |
394 | static void |
395 | load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box) |
396 | { |
397 | while (box) |
398 | { |
399 | if (box->heading) |
400 | add_html_outline(ctx, x, box); |
401 | if (box->down) |
402 | load_html_outline(ctx, x, box->down); |
403 | box = box->next; |
404 | } |
405 | } |
406 | |
407 | fz_outline * |
408 | fz_load_html_outline(fz_context *ctx, fz_html *html) |
409 | { |
410 | struct outline_parser state; |
411 | state.html = html; |
412 | state.cat = NULL; |
413 | state.head = NULL; |
414 | state.tail[0] = &state.head; |
415 | state.down[0] = NULL; |
416 | state.level[0] = 99; |
417 | state.current = 0; |
418 | state.id = 1; |
419 | fz_try(ctx) |
420 | load_html_outline(ctx, &state, html->root); |
421 | fz_always(ctx) |
422 | fz_drop_buffer(ctx, state.cat); |
423 | fz_catch(ctx) |
424 | { |
425 | fz_drop_outline(ctx, state.head); |
426 | state.head = NULL; |
427 | } |
428 | return state.head; |
429 | } |
430 | |