1#include "mupdf/fitz.h"
2#include "html-imp.h"
3
4#include <string.h>
5
/*
 * Indices into a four-element box-edge array. In this file they select
 * entries of html->page_margin; the order is presumably top, right,
 * bottom, left.
 */
enum
{
	T = 0,
	R = 1,
	B = 2,
	L = 3
};
7
/*
 * Classify a link target as internal (to be resolved against the current
 * document) or external (to be passed through unchanged).
 *
 * Returns 0 (external) when uri starts with "<scheme>://", where <scheme>
 * follows the RFC 3986 grammar: a letter followed by any number of
 * letters, digits, '+', '-' or '.'. Scheme letters may be upper or lower
 * case, so "HTTP://host" is recognised just like "http://host".
 * Returns 1 (internal) for everything else, including the empty string,
 * fragment-only references such as "#id", and strings that begin with
 * "://" without any scheme name in front.
 *
 * uri must be a non-NULL, NUL-terminated string.
 */
static int is_internal_uri(const char *uri)
{
	/* A scheme name has to begin with a letter. */
	if (!((*uri >= 'a' && *uri <= 'z') || (*uri >= 'A' && *uri <= 'Z')))
		return 1;
	++uri;

	/* Skip over the remaining scheme characters. */
	while ((*uri >= 'a' && *uri <= 'z') ||
		(*uri >= 'A' && *uri <= 'Z') ||
		(*uri >= '0' && *uri <= '9') ||
		*uri == '+' || *uri == '-' || *uri == '.')
		++uri;

	/* The && chain stops at the first mismatch, so it never reads past the terminator. */
	if (uri[0] == ':' && uri[1] == '/' && uri[2] == '/')
		return 0;
	return 1;
}
16
17static const char *box_href(fz_html_box *box)
18{
19 while (box)
20 {
21 const char *href = box->href;
22 if (href)
23 return href;
24 box = box->up;
25 }
26 return NULL;
27}
28
29static int has_same_href(fz_html_box *box, const char *old_href)
30{
31 while (box)
32 {
33 const char *href = box->href;
34 if (href)
35 return !strcmp(old_href, href);
36 box = box->up;
37 }
38 return 0;
39}
40
41static fz_link *load_link_flow(fz_context *ctx, fz_html_flow *flow, fz_link *head, int page, float page_h, const char *dir, const char *file)
42{
43 fz_link *link;
44 fz_html_flow *next;
45 char path[2048];
46 fz_rect bbox;
47 const char *dest;
48 const char *href;
49 float end;
50
51 while (flow)
52 {
53 href = box_href(flow->box);
54 next = flow->next;
55 if (href && (int)(flow->y / page_h) == page)
56 {
57 /* Coalesce contiguous flow boxes into one link node */
58 end = flow->x + flow->w;
59 while (next &&
60 next->y == flow->y &&
61 next->h == flow->h &&
62 has_same_href(next->box, href))
63 {
64 end = next->x + next->w;
65 next = next->next;
66 }
67
68 bbox.x0 = flow->x;
69 bbox.y0 = flow->y - page * page_h;
70 bbox.x1 = end;
71 bbox.y1 = bbox.y0 + flow->h;
72 if (flow->type != FLOW_IMAGE)
73 {
74 /* flow->y is the baseline, adjust bbox appropriately */
75 bbox.y0 -= 0.8f * flow->h;
76 bbox.y1 -= 0.8f * flow->h;
77 }
78
79 if (is_internal_uri(href))
80 {
81 if (href[0] == '#')
82 {
83 fz_strlcpy(path, file, sizeof path);
84 fz_strlcat(path, href, sizeof path);
85 }
86 else
87 {
88 fz_strlcpy(path, dir, sizeof path);
89 fz_strlcat(path, "/", sizeof path);
90 fz_strlcat(path, href, sizeof path);
91 }
92 fz_urldecode(path);
93 fz_cleanname(path);
94
95 dest = path;
96 }
97 else
98 {
99 dest = href;
100 }
101
102 link = fz_new_link(ctx, bbox, NULL, dest);
103 link->next = head;
104 head = link;
105 }
106 flow = next;
107 }
108 return head;
109}
110
111static fz_link *load_link_box(fz_context *ctx, fz_html_box *box, fz_link *head, int page, float page_h, const char *dir, const char *file)
112{
113 while (box)
114 {
115 if (box->flow_head)
116 head = load_link_flow(ctx, box->flow_head, head, page, page_h, dir, file);
117 if (box->down)
118 head = load_link_box(ctx, box->down, head, page, page_h, dir, file);
119 box = box->next;
120 }
121 return head;
122}
123
124fz_link *
125fz_load_html_links(fz_context *ctx, fz_html *html, int page, const char *file, void *doc)
126{
127 fz_link *link, *head;
128 char dir[2048];
129 fz_dirname(dir, file, sizeof dir);
130
131 head = load_link_box(ctx, html->root, NULL, page, html->page_h, dir, file);
132
133 for (link = head; link; link = link->next)
134 {
135 /* Adjust for page margins */
136 link->rect.x0 += html->page_margin[L];
137 link->rect.x1 += html->page_margin[L];
138 link->rect.y0 += html->page_margin[T];
139 link->rect.y1 += html->page_margin[T];
140
141 /* Set document pointer */
142 link->doc = doc;
143 }
144
145 return head;
146}
147
148static fz_html_flow *
149find_first_content(fz_html_box *box)
150{
151 while (box)
152 {
153 if (box->type == BOX_FLOW)
154 return box->flow_head;
155 box = box->down;
156 }
157 return NULL;
158}
159
160static float
161find_flow_target(fz_html_flow *flow, const char *id)
162{
163 while (flow)
164 {
165 if (flow->box->id && !strcmp(id, flow->box->id))
166 return flow->y;
167 flow = flow->next;
168 }
169 return -1;
170}
171
172static float
173find_box_target(fz_html_box *box, const char *id)
174{
175 float y;
176 while (box)
177 {
178 if (box->id && !strcmp(id, box->id))
179 {
180 fz_html_flow *flow = find_first_content(box);
181 if (flow)
182 return flow->y;
183 return box->y;
184 }
185 if (box->type == BOX_FLOW)
186 {
187 y = find_flow_target(box->flow_head, id);
188 if (y >= 0)
189 return y;
190 }
191 else
192 {
193 y = find_box_target(box->down, id);
194 if (y >= 0)
195 return y;
196 }
197 box = box->next;
198 }
199 return -1;
200}
201
202float
203fz_find_html_target(fz_context *ctx, fz_html *html, const char *id)
204{
205 return find_box_target(html->root, id);
206}
207
208static fz_html_flow *
209make_flow_bookmark(fz_context *ctx, fz_html_flow *flow, float y)
210{
211 while (flow)
212 {
213 if (flow->y >= y)
214 return flow;
215 flow = flow->next;
216 }
217 return NULL;
218}
219
220static fz_html_flow *
221make_box_bookmark(fz_context *ctx, fz_html_box *box, float y)
222{
223 fz_html_flow *mark;
224 while (box)
225 {
226 if (box->type == BOX_FLOW)
227 {
228 if (box->y >= y)
229 {
230 mark = make_flow_bookmark(ctx, box->flow_head, y);
231 if (mark)
232 return mark;
233 }
234 }
235 else
236 {
237 mark = make_box_bookmark(ctx, box->down, y);
238 if (mark)
239 return mark;
240 }
241 box = box->next;
242 }
243 return NULL;
244}
245
246fz_bookmark
247fz_make_html_bookmark(fz_context *ctx, fz_html *html, int page)
248{
249 return (fz_bookmark)make_box_bookmark(ctx, html->root, page * html->page_h);
250}
251
252static int
253lookup_flow_bookmark(fz_context *ctx, fz_html_flow *flow, fz_html_flow *mark)
254{
255 while (flow)
256 {
257 if (flow == mark)
258 return 1;
259 flow = flow->next;
260 }
261 return 0;
262}
263
264static int
265lookup_box_bookmark(fz_context *ctx, fz_html_box *box, fz_html_flow *mark)
266{
267 while (box)
268 {
269 if (box->type == BOX_FLOW)
270 {
271 if (lookup_flow_bookmark(ctx, box->flow_head, mark))
272 return 1;
273 }
274 else
275 {
276 if (lookup_box_bookmark(ctx, box->down, mark))
277 return 1;
278 }
279 box = box->next;
280 }
281 return 0;
282}
283
284int
285fz_lookup_html_bookmark(fz_context *ctx, fz_html *html, fz_bookmark mark)
286{
287 fz_html_flow *flow = (fz_html_flow*)mark;
288 if (flow && lookup_box_bookmark(ctx, html->root, flow))
289 return (int)(flow->y / html->page_h);
290 return -1;
291}
292
293struct outline_parser
294{
295 fz_html *html;
296 fz_buffer *cat;
297 fz_outline *head;
298 fz_outline **tail[6];
299 fz_outline **down[6];
300 int level[6];
301 int current;
302 int id;
303};
304
305static void
306cat_html_flow(fz_context *ctx, fz_buffer *cat, fz_html_flow *flow)
307{
308 while (flow)
309 {
310 switch (flow->type)
311 {
312 case FLOW_WORD:
313 fz_append_string(ctx, cat, flow->content.text);
314 break;
315 case FLOW_SPACE:
316 case FLOW_BREAK:
317 fz_append_byte(ctx, cat, ' ');
318 break;
319 default:
320 break;
321 }
322 flow = flow->next;
323 }
324}
325
326static void
327cat_html_box(fz_context *ctx, fz_buffer *cat, fz_html_box *box)
328{
329 while (box)
330 {
331 cat_html_flow(ctx, cat, box->flow_head);
332 cat_html_box(ctx, cat, box->down);
333 box = box->next;
334 }
335}
336
337static const char *
338cat_html_text(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
339{
340 if (!x->cat)
341 x->cat = fz_new_buffer(ctx, 1024);
342 else
343 fz_clear_buffer(ctx, x->cat);
344
345 cat_html_flow(ctx, x->cat, box->flow_head);
346 cat_html_box(ctx, x->cat, box->down);
347
348 return fz_string_from_buffer(ctx, x->cat);
349}
350
351static void
352add_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
353{
354 fz_outline *node;
355 char buf[100];
356
357 node = fz_new_outline(ctx);
358 fz_try(ctx)
359 {
360 node->title = fz_strdup(ctx, cat_html_text(ctx, x, box));
361 if (!box->id)
362 {
363 fz_snprintf(buf, sizeof buf, "'%d", x->id++);
364 box->id = fz_pool_strdup(ctx, x->html->pool, buf);
365 }
366 node->uri = fz_asprintf(ctx, "#%s", box->id);
367 node->is_open = 1;
368 }
369 fz_catch(ctx)
370 {
371 fz_free(ctx, node);
372 fz_rethrow(ctx);
373 }
374
375 if (x->level[x->current] < box->heading && x->current < 5)
376 {
377 x->tail[x->current+1] = x->down[x->current];
378 x->current += 1;
379 }
380 else
381 {
382 while (x->current > 0 && x->level[x->current] > box->heading)
383 {
384 x->current -= 1;
385 }
386 }
387 x->level[x->current] = box->heading;
388
389 *(x->tail[x->current]) = node;
390 x->tail[x->current] = &node->next;
391 x->down[x->current] = &node->down;
392}
393
394static void
395load_html_outline(fz_context *ctx, struct outline_parser *x, fz_html_box *box)
396{
397 while (box)
398 {
399 if (box->heading)
400 add_html_outline(ctx, x, box);
401 if (box->down)
402 load_html_outline(ctx, x, box->down);
403 box = box->next;
404 }
405}
406
407fz_outline *
408fz_load_html_outline(fz_context *ctx, fz_html *html)
409{
410 struct outline_parser state;
411 state.html = html;
412 state.cat = NULL;
413 state.head = NULL;
414 state.tail[0] = &state.head;
415 state.down[0] = NULL;
416 state.level[0] = 99;
417 state.current = 0;
418 state.id = 1;
419 fz_try(ctx)
420 load_html_outline(ctx, &state, html->root);
421 fz_always(ctx)
422 fz_drop_buffer(ctx, state.cat);
423 fz_catch(ctx)
424 {
425 fz_drop_outline(ctx, state.head);
426 state.head = NULL;
427 }
428 return state.head;
429}
430