1#include "mupdf/fitz.h"
2
3#include <string.h>
4#include <stdlib.h>
5#include <stdio.h>
6
/*
	Table of named character entities and the code points they expand to.
	xml_parse_entity searches it linearly for an exact name match, so the
	order of the entries does not affect the result.
*/
static const struct
{
	const char *name;
	int c;
}
html_entities[] =
{
	/* Code points 160-255 */
	{ "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 },
	{ "curren", 164 }, { "yen", 165 }, { "brvbar", 166 }, { "sect", 167 },
	{ "uml", 168 }, { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 },
	{ "not", 172 }, { "shy", 173 }, { "reg", 174 }, { "macr", 175 },
	{ "deg", 176 }, { "plusmn", 177 }, { "sup2", 178 }, { "sup3", 179 },
	{ "acute", 180 }, { "micro", 181 }, { "para", 182 }, { "middot", 183 },
	{ "cedil", 184 }, { "sup1", 185 }, { "ordm", 186 }, { "raquo", 187 },
	{ "frac14", 188 }, { "frac12", 189 }, { "frac34", 190 }, { "iquest", 191 },
	{ "Agrave", 192 }, { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 },
	{ "Auml", 196 }, { "Aring", 197 }, { "AElig", 198 }, { "Ccedil", 199 },
	{ "Egrave", 200 }, { "Eacute", 201 }, { "Ecirc", 202 }, { "Euml", 203 },
	{ "Igrave", 204 }, { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 },
	{ "ETH", 208 }, { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 },
	{ "Ocirc", 212 }, { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 },
	{ "Oslash", 216 }, { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 },
	{ "Uuml", 220 }, { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 },
	{ "agrave", 224 }, { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 },
	{ "auml", 228 }, { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 },
	{ "egrave", 232 }, { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 },
	{ "igrave", 236 }, { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 },
	{ "eth", 240 }, { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 },
	{ "ocirc", 244 }, { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 },
	{ "oslash", 248 }, { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 },
	{ "uuml", 252 }, { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },

	/* Markup-significant characters */
	{ "lt", 60 }, { "gt", 62 }, { "amp", 38 }, { "apos", 39 }, { "quot", 34 },

	/* Additional Latin letters and accent marks */
	{ "OElig", 338 }, { "oelig", 339 }, { "Scaron", 352 }, { "scaron", 353 },
	{ "Yuml", 376 }, { "circ", 710 }, { "tilde", 732 },

	/* Spaces, joiners, dashes, quotation marks and related punctuation */
	{ "ensp", 8194 }, { "emsp", 8195 }, { "thinsp", 8201 },
	{ "zwnj", 8204 }, { "zwj", 8205 }, { "lrm", 8206 }, { "rlm", 8207 },
	{ "ndash", 8211 }, { "mdash", 8212 },
	{ "lsquo", 8216 }, { "rsquo", 8217 }, { "sbquo", 8218 },
	{ "ldquo", 8220 }, { "rdquo", 8221 }, { "bdquo", 8222 },
	{ "dagger", 8224 }, { "Dagger", 8225 }, { "permil", 8240 },
	{ "lsaquo", 8249 }, { "rsaquo", 8250 }, { "euro", 8364 },

	{ "fnof", 402 },

	/* Greek capital letters */
	{ "Alpha", 913 }, { "Beta", 914 }, { "Gamma", 915 }, { "Delta", 916 },
	{ "Epsilon", 917 }, { "Zeta", 918 }, { "Eta", 919 }, { "Theta", 920 },
	{ "Iota", 921 }, { "Kappa", 922 }, { "Lambda", 923 }, { "Mu", 924 },
	{ "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 }, { "Pi", 928 },
	{ "Rho", 929 }, { "Sigma", 931 }, { "Tau", 932 }, { "Upsilon", 933 },
	{ "Phi", 934 }, { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 },

	/* Greek small letters and symbol variants */
	{ "alpha", 945 }, { "beta", 946 }, { "gamma", 947 }, { "delta", 948 },
	{ "epsilon", 949 }, { "zeta", 950 }, { "eta", 951 }, { "theta", 952 },
	{ "iota", 953 }, { "kappa", 954 }, { "lambda", 955 }, { "mu", 956 },
	{ "nu", 957 }, { "xi", 958 }, { "omicron", 959 }, { "pi", 960 },
	{ "rho", 961 }, { "sigmaf", 962 }, { "sigma", 963 }, { "tau", 964 },
	{ "upsilon", 965 }, { "phi", 966 }, { "chi", 967 }, { "psi", 968 },
	{ "omega", 969 }, { "thetasym", 977 }, { "upsih", 978 }, { "piv", 982 },

	/* Bullets, primes and the fraction slash */
	{ "bull", 8226 }, { "hellip", 8230 }, { "prime", 8242 },
	{ "Prime", 8243 }, { "oline", 8254 }, { "frasl", 8260 },

	/* Letter-like symbols */
	{ "weierp", 8472 }, { "image", 8465 }, { "real", 8476 },
	{ "trade", 8482 }, { "alefsym", 8501 },

	/* Arrows */
	{ "larr", 8592 }, { "uarr", 8593 }, { "rarr", 8594 }, { "darr", 8595 },
	{ "harr", 8596 }, { "crarr", 8629 },
	{ "lArr", 8656 }, { "uArr", 8657 }, { "rArr", 8658 }, { "dArr", 8659 },
	{ "hArr", 8660 },

	/* Mathematical operators */
	{ "forall", 8704 }, { "part", 8706 }, { "exist", 8707 }, { "empty", 8709 },
	{ "nabla", 8711 }, { "isin", 8712 }, { "notin", 8713 }, { "ni", 8715 },
	{ "prod", 8719 }, { "sum", 8721 }, { "minus", 8722 }, { "lowast", 8727 },
	{ "radic", 8730 }, { "prop", 8733 }, { "infin", 8734 }, { "ang", 8736 },
	{ "and", 8743 }, { "or", 8744 }, { "cap", 8745 }, { "cup", 8746 },
	{ "int", 8747 }, { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 },
	{ "asymp", 8776 }, { "ne", 8800 }, { "equiv", 8801 }, { "le", 8804 },
	{ "ge", 8805 }, { "sub", 8834 }, { "sup", 8835 }, { "nsub", 8836 },
	{ "sube", 8838 }, { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 },
	{ "perp", 8869 }, { "sdot", 8901 },

	/* Brackets */
	{ "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 },
	{ "rfloor", 8971 }, { "lang", 9001 }, { "rang", 9002 },

	/* Lozenge and card suits */
	{ "loz", 9674 }, { "spades", 9824 }, { "clubs", 9827 },
	{ "hearts", 9829 }, { "diams", 9830 },
};
69
/*
	Parser state shared by the xml_emit_* callbacks while a document
	is being parsed.
*/
struct parser
{
	/* pool that nodes, attributes and strings are allocated from */
	fz_pool *pool;
	/* element currently being populated; new nodes become its children */
	fz_xml *head;
	/* non-zero: keep all-whitespace text nodes instead of skipping them */
	int preserve_white;
	/* incremented for every opened node, decremented for every close */
	int depth;
};
77
/*
	One attribute of an element, kept in a singly linked list.
*/
struct attribute
{
	/* attribute name, truncated to fit and NUL-terminated */
	char name[40];
	/* NUL-terminated value with entity references decoded */
	char *value;
	/* next attribute of the same element, or NULL */
	struct attribute *next;
};
84
/*
	A parsed document: the root node plus the pool that owns
	the storage of the whole tree.
*/
struct fz_xml_doc_s
{
	/* pool the document and all of its nodes were allocated from */
	fz_pool *pool;
	/* first top-level node, or NULL if the document has none */
	fz_xml *root;
};
90
91struct fz_xml_s
92{
93 char name[40];
94 char *text;
95 struct attribute *atts;
96 fz_xml *up, *down, *tail, *prev, *next;
97};
98
/*
	Print two spaces to stdout for each of the n indentation levels.
	A zero or negative n prints nothing.
*/
static void xml_indent(int n)
{
	/* 'n > 0' rather than 'n--' so a negative level cannot loop on. */
	for (; n > 0; --n)
	{
		putchar(' ');
		putchar(' ');
	}
}
106
107/*
108 Pretty-print an XML tree to stdout.
109*/
110void fz_debug_xml(fz_xml *item, int level)
111{
112 if (item->text)
113 {
114 char *s = item->text;
115 int c;
116 xml_indent(level);
117 putchar('"');
118 while ((c = *s++)) {
119 switch (c) {
120 default:
121 if (c < 32 || c > 127) {
122 putchar('\\');
123 putchar('x');
124 putchar("0123456789ABCDEF"[(c>>4) & 15]);
125 putchar("0123456789ABCDEF"[(c) & 15]);
126 } else {
127 putchar(c);
128 }
129 break;
130 case '\\': putchar('\\'); putchar('\\'); break;
131 case '\b': putchar('\\'); putchar('b'); break;
132 case '\f': putchar('\\'); putchar('f'); break;
133 case '\n': putchar('\\'); putchar('n'); break;
134 case '\r': putchar('\\'); putchar('r'); break;
135 case '\t': putchar('\\'); putchar('t'); break;
136 }
137 }
138 putchar('\n');
139 }
140 else
141 {
142 fz_xml *child;
143 struct attribute *att;
144
145 xml_indent(level);
146 printf("(%s\n", item->name);
147 for (att = item->atts; att; att = att->next)
148 {
149 xml_indent(level);
150 printf("=%s %s\n", att->name, att->value);
151 }
152 for (child = item->down; child; child = child->next)
153 fz_debug_xml(child, level + 1);
154 xml_indent(level);
155 printf(")%s\n", item->name);
156 }
157}
158
159/*
160 Return previous sibling of XML node.
161*/
162fz_xml *fz_xml_prev(fz_xml *item)
163{
164 return item ? item->prev : NULL;
165}
166
167/*
168 Return next sibling of XML node.
169*/
170fz_xml *fz_xml_next(fz_xml *item)
171{
172 return item ? item->next : NULL;
173}
174
175/*
176 Return parent of XML node.
177*/
178fz_xml *fz_xml_up(fz_xml *item)
179{
180 return item ? item->up : NULL;
181}
182
183/*
184 Return first child of XML node.
185*/
186fz_xml *fz_xml_down(fz_xml *item)
187{
188 return item ? item->down : NULL;
189}
190
191/*
192 Return the text content of an XML node.
193 Return NULL if the node is a tag.
194*/
195char *fz_xml_text(fz_xml *item)
196{
197 return item ? item->text : NULL;
198}
199
200/*
201 Return tag of XML node. Return NULL for text nodes.
202*/
203char *fz_xml_tag(fz_xml *item)
204{
205 return item && item->name[0] ? item->name : NULL;
206}
207
208/*
209 Return true if the tag name matches.
210*/
211int fz_xml_is_tag(fz_xml *item, const char *name)
212{
213 if (!item)
214 return 0;
215 return !strcmp(item->name, name);
216}
217
218/*
219 Return the value of an attribute of an XML node.
220 NULL if the attribute doesn't exist.
221*/
222char *fz_xml_att(fz_xml *item, const char *name)
223{
224 struct attribute *att;
225 if (!item)
226 return NULL;
227 for (att = item->atts; att; att = att->next)
228 if (!strcmp(att->name, name))
229 return att->value;
230 return NULL;
231}
232
233fz_xml *fz_xml_find(fz_xml *item, const char *tag)
234{
235 while (item)
236 {
237 if (!strcmp(item->name, tag))
238 return item;
239 item = item->next;
240 }
241 return NULL;
242}
243
244fz_xml *fz_xml_find_next(fz_xml *item, const char *tag)
245{
246 if (item)
247 item = item->next;
248 return fz_xml_find(item, tag);
249}
250
251fz_xml *fz_xml_find_down(fz_xml *item, const char *tag)
252{
253 if (item)
254 item = item->down;
255 return fz_xml_find(item, tag);
256}
257
258fz_xml *fz_xml_root(fz_xml_doc *xml)
259{
260 return xml ? xml->root : NULL;
261}
262
263/*
264 Free the XML node and all its children and siblings.
265*/
266void fz_drop_xml(fz_context *ctx, fz_xml_doc *xml)
267{
268 if (xml)
269 fz_drop_pool(ctx, xml->pool);
270}
271
272/*
273 Detach a node from the tree, unlinking it from its parent,
274 and setting the document root to the node.
275*/
276void fz_detach_xml(fz_context *ctx, fz_xml_doc *xml, fz_xml *node)
277{
278 if (node->up)
279 node->up->down = NULL;
280 xml->root = node;
281}
282
283static size_t xml_parse_entity(int *c, char *a)
284{
285 char *b;
286 size_t i;
287
288 if (a[1] == '#') {
289 if (a[2] == 'x')
290 *c = strtol(a + 3, &b, 16);
291 else
292 *c = strtol(a + 2, &b, 10);
293 if (*b == ';')
294 return b - a + 1;
295 }
296 else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') {
297 *c = '<';
298 return 4;
299 }
300 else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') {
301 *c = '>';
302 return 4;
303 }
304 else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') {
305 *c = '&';
306 return 5;
307 }
308 else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') {
309 *c = '\'';
310 return 6;
311 }
312 else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') {
313 *c = '"';
314 return 6;
315 }
316
317 /* We should only be doing this for XHTML, but it shouldn't be a problem. */
318 for (i = 0; i < nelem(html_entities); ++i) {
319 size_t n = strlen(html_entities[i].name);
320 if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') {
321 *c = html_entities[i].c;
322 return n + 2;
323 }
324 }
325
326 *c = *a;
327 return 1;
328}
329
/*
	Return 1 if c may appear in a tag or attribute name as this
	parser defines it: ASCII letters and digits plus '.', '-', '_'
	and ':'. Returns 0 for everything else.
*/
static inline int isname(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;
	switch (c)
	{
	case '.':
	case '-':
	case '_':
	case ':':
		return 1;
	default:
		return 0;
	}
}
337
/*
	Return 1 if c is a space, tab, line feed or carriage return,
	otherwise 0.
*/
static inline int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return 1;
	default:
		return 0;
	}
}
342
343static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, char *a, char *b)
344{
345 fz_xml *head, *tail;
346 char *ns;
347
348 /* skip namespace prefix */
349 for (ns = a; ns < b; ++ns)
350 if (*ns == ':')
351 a = ns + 1;
352
353 head = fz_pool_alloc(ctx, parser->pool, sizeof *head);
354 if (b - a > sizeof(head->name) - 1)
355 b = a + sizeof(head->name) - 1;
356 memcpy(head->name, a, b - a);
357 head->name[b - a] = 0;
358
359 head->atts = NULL;
360 head->text = NULL;
361 head->up = parser->head;
362 head->down = NULL;
363 head->prev = NULL;
364 head->next = NULL;
365
366 if (!parser->head->down) {
367 parser->head->down = head;
368 parser->head->tail = head;
369 }
370 else {
371 tail = parser->head->tail;
372 tail->next = head;
373 head->prev = tail;
374 parser->head->tail = head;
375 }
376
377 parser->head = head;
378 parser->depth++;
379}
380
381static void xml_emit_att_name(fz_context *ctx, struct parser *parser, char *a, char *b)
382{
383 fz_xml *head = parser->head;
384 struct attribute *att;
385
386 att = fz_pool_alloc(ctx, parser->pool, sizeof *att);
387 if (b - a > sizeof(att->name) - 1)
388 b = a + sizeof(att->name) - 1;
389 memcpy(att->name, a, b - a);
390 att->name[b - a] = 0;
391 att->value = NULL;
392 att->next = head->atts;
393 head->atts = att;
394}
395
396static void xml_emit_att_value(fz_context *ctx, struct parser *parser, char *a, char *b)
397{
398 fz_xml *head = parser->head;
399 struct attribute *att = head->atts;
400 char *s;
401 int c;
402
403 /* entities are all longer than UTFmax so runetochar is safe */
404 s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1);
405 while (a < b) {
406 if (*a == '&') {
407 a += xml_parse_entity(&c, a);
408 s += fz_runetochar(s, c);
409 }
410 else {
411 *s++ = *a++;
412 }
413 }
414 *s = 0;
415}
416
417static void xml_emit_close_tag(fz_context *ctx, struct parser *parser)
418{
419 parser->depth--;
420 if (parser->head->up)
421 parser->head = parser->head->up;
422}
423
424static void xml_emit_text(fz_context *ctx, struct parser *parser, char *a, char *b)
425{
426 static char *empty = "";
427 fz_xml *head;
428 char *s;
429 int c;
430
431 /* Skip text outside the root tag */
432 if (parser->depth == 0)
433 return;
434
435 /* Skip all-whitespace text nodes */
436 if (!parser->preserve_white)
437 {
438 for (s = a; s < b; s++)
439 if (!iswhite(*s))
440 break;
441 if (s == b)
442 return;
443 }
444
445 xml_emit_open_tag(ctx, parser, empty, empty);
446 head = parser->head;
447
448 /* entities are all longer than UTFmax so runetochar is safe */
449 s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + 1);
450 while (a < b) {
451 if (*a == '&') {
452 a += xml_parse_entity(&c, a);
453 s += fz_runetochar(s, c);
454 }
455 else {
456 *s++ = *a++;
457 }
458 }
459 *s = 0;
460
461 xml_emit_close_tag(ctx, parser);
462}
463
464static void xml_emit_cdata(fz_context *ctx, struct parser *parser, char *a, char *b)
465{
466 static char *empty = "";
467 fz_xml *head;
468 char *s;
469
470 xml_emit_open_tag(ctx, parser, empty, empty);
471 head = parser->head;
472
473 s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + 1);
474 while (a < b)
475 *s++ = *a++;
476 *s = 0;
477
478 xml_emit_close_tag(ctx, parser);
479}
480
481static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, char *p)
482{
483 char *mark;
484 int quote;
485
486parse_text:
487 mark = p;
488 while (*p && *p != '<') ++p;
489 if (*p == '<') {
490 /* skip trailing newline before closing tag */
491 if (p[1] == '/' && p - 1 >= mark && p[-1] == '\n')
492 xml_emit_text(ctx, parser, mark, p - 1);
493 else if (mark < p)
494 xml_emit_text(ctx, parser, mark, p);
495 ++p;
496 goto parse_element;
497 } else if (mark < p)
498 xml_emit_text(ctx, parser, mark, p);
499 return NULL;
500
501parse_element:
502 if (*p == '/') { ++p; goto parse_closing_element; }
503 if (*p == '!') { ++p; goto parse_comment; }
504 if (*p == '?') { ++p; goto parse_processing_instruction; }
505 while (iswhite(*p)) ++p;
506 if (isname(*p))
507 goto parse_element_name;
508 return "syntax error in element";
509
510parse_comment:
511 if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E')
512 goto parse_declaration;
513 if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y')
514 goto parse_declaration;
515 if (*p == '[') goto parse_cdata;
516 if (*p++ != '-') return "syntax error in comment (<! not followed by --)";
517 if (*p++ != '-') return "syntax error in comment (<!- not followed by -)";
518 while (*p) {
519 if (p[0] == '-' && p[1] == '-' && p[2] == '>') {
520 p += 3;
521 goto parse_text;
522 }
523 ++p;
524 }
525 return "end of data in comment";
526
527parse_declaration:
528 while (*p) if (*p++ == '>') goto parse_text;
529 return "end of data in declaration";
530
531parse_cdata:
532 if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[')
533 return "syntax error in CDATA section";
534 p += 7;
535 mark = p;
536 while (*p) {
537 if (p[0] == ']' && p[1] == ']' && p[2] == '>') {
538 xml_emit_cdata(ctx, parser, mark, p);
539 p += 3;
540 goto parse_text;
541 }
542 ++p;
543 }
544 return "end of data in CDATA section";
545
546parse_processing_instruction:
547 while (*p) {
548 if (p[0] == '?' && p[1] == '>') {
549 p += 2;
550 goto parse_text;
551 }
552 ++p;
553 }
554 return "end of data in processing instruction";
555
556parse_closing_element:
557 while (iswhite(*p)) ++p;
558 while (isname(*p)) ++p;
559 while (iswhite(*p)) ++p;
560 if (*p != '>')
561 return "syntax error in closing element";
562 xml_emit_close_tag(ctx, parser);
563 ++p;
564 goto parse_text;
565
566parse_element_name:
567 mark = p;
568 while (isname(*p)) ++p;
569 xml_emit_open_tag(ctx, parser, mark, p);
570 if (*p == '>') {
571 ++p;
572 if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */
573 goto parse_text;
574 }
575 if (p[0] == '/' && p[1] == '>') {
576 xml_emit_close_tag(ctx, parser);
577 p += 2;
578 goto parse_text;
579 }
580 if (iswhite(*p))
581 goto parse_attributes;
582 return "syntax error after element name";
583
584parse_attributes:
585 while (iswhite(*p)) ++p;
586 if (isname(*p))
587 goto parse_attribute_name;
588 if (*p == '>') {
589 ++p;
590 if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */
591 goto parse_text;
592 }
593 if (p[0] == '/' && p[1] == '>') {
594 xml_emit_close_tag(ctx, parser);
595 p += 2;
596 goto parse_text;
597 }
598 return "syntax error in attributes";
599
600parse_attribute_name:
601 mark = p;
602 while (isname(*p)) ++p;
603 xml_emit_att_name(ctx, parser, mark, p);
604 while (iswhite(*p)) ++p;
605 if (*p == '=') { ++p; goto parse_attribute_value; }
606 return "syntax error after attribute name";
607
608parse_attribute_value:
609 while (iswhite(*p)) ++p;
610 quote = *p++;
611 if (quote != '"' && quote != '\'')
612 return "missing quote character";
613 mark = p;
614 while (*p && *p != quote) ++p;
615 if (*p == quote) {
616 xml_emit_att_value(ctx, parser, mark, p++);
617 goto parse_attributes;
618 }
619 return "end of data in attribute value";
620}
621
/*
	Return non-zero if a begins with b, as compared by
	fz_strncasecmp over the length of b.
*/
static int startswith(const char *a, const char *b)
{
	size_t prefix_len = strlen(b);

	return fz_strncasecmp(a, b, prefix_len) == 0;
}
626
627static const unsigned short *find_xml_encoding(char *s)
628{
629 const unsigned short *table = NULL;
630 char *end, *xml, *enc;
631
632 end = strchr(s, '>');
633 if (end)
634 {
635 *end = 0;
636 xml = strstr(s, "<?xml");
637 if (xml)
638 {
639 enc = strstr(xml, "encoding=");
640 if (enc)
641 {
642 enc += 10;
643 if (startswith(enc, "iso-8859-1") || startswith(enc, "latin1"))
644 table = fz_unicode_from_iso8859_1;
645 else if (startswith(enc, "iso-8859-7") || startswith(enc, "greek"))
646 table = fz_unicode_from_iso8859_7;
647 else if (startswith(enc, "koi8"))
648 table = fz_unicode_from_koi8u;
649 else if (startswith(enc, "windows-1250"))
650 table = fz_unicode_from_windows_1250;
651 else if (startswith(enc, "windows-1251"))
652 table = fz_unicode_from_windows_1251;
653 else if (startswith(enc, "windows-1252"))
654 table = fz_unicode_from_windows_1252;
655 }
656 }
657 *end = '>';
658 }
659
660 return table;
661}
662
663static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree)
664{
665 const unsigned short *table;
666 const unsigned char *e = s + n;
667 char *dst, *d;
668 int c;
669
670 if (s[0] == 0xFE && s[1] == 0xFF) {
671 s += 2;
672 dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
673 while (s + 1 < e) {
674 c = s[0] << 8 | s[1];
675 d += fz_runetochar(d, c);
676 s += 2;
677 }
678 *d = 0;
679 *dofree = 1;
680 return dst;
681 }
682
683 if (s[0] == 0xFF && s[1] == 0xFE) {
684 s += 2;
685 dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
686 while (s + 1 < e) {
687 c = s[0] | s[1] << 8;
688 d += fz_runetochar(d, c);
689 s += 2;
690 }
691 *d = 0;
692 *dofree = 1;
693 return dst;
694 }
695
696 table = find_xml_encoding((char*)s);
697 if (table) {
698 dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
699 while (*s) {
700 c = table[*s++];
701 d += fz_runetochar(d, c);
702 }
703 *d = 0;
704 *dofree = 1;
705 return dst;
706 }
707
708 *dofree = 0;
709
710 if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF)
711 return (char*)s+3;
712
713 return (char*)s;
714}
715
716/*
717 Parse the contents of buffer into a tree of xml nodes.
718
719 preserve_white: whether to keep or delete all-whitespace nodes.
720*/
721fz_xml_doc *
722fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white)
723{
724 struct parser parser;
725 fz_xml_doc *xml = NULL;
726 fz_xml root, *node;
727 char *p = NULL;
728 char *error;
729 int dofree;
730 unsigned char *s;
731 size_t n;
732
733 fz_var(p);
734
735 /* ensure we are zero-terminated */
736 fz_terminate_buffer(ctx, buf);
737 n = fz_buffer_storage(ctx, buf, &s);
738
739 memset(&root, 0, sizeof(root));
740 parser.pool = fz_new_pool(ctx);
741 parser.head = &root;
742 parser.preserve_white = preserve_white;
743 parser.depth = 0;
744
745 fz_try(ctx)
746 {
747 p = convert_to_utf8(ctx, s, n, &dofree);
748
749 error = xml_parse_document_imp(ctx, &parser, p);
750 if (error)
751 fz_throw(ctx, FZ_ERROR_GENERIC, "%s", error);
752
753 for (node = root.down; node; node = node->next)
754 node->up = NULL;
755
756 xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
757 xml->pool = parser.pool;
758 xml->root = root.down;
759 }
760 fz_always(ctx)
761 {
762 if (dofree)
763 fz_free(ctx, p);
764 }
765 fz_catch(ctx)
766 {
767 fz_drop_pool(ctx, parser.pool);
768 fz_rethrow(ctx);
769 }
770
771 return xml;
772}
773