xml.c source code [MuPDF/source/fitz/xml.c]

1	#include "mupdf/fitz.h"
2
3	#include <string.h>
4	#include <stdlib.h>
5	#include <stdio.h>
6
/*
	Table of named character entities and the Unicode code point each
	one stands for. Searched linearly by xml_parse_entity.
*/
static const struct { const char *name; int c; } html_entities[] = {
	{"nbsp",160}, {"iexcl",161}, {"cent",162}, {"pound",163},
	{"curren",164}, {"yen",165}, {"brvbar",166}, {"sect",167},
	{"uml",168}, {"copy",169}, {"ordf",170}, {"laquo",171},
	{"not",172}, {"shy",173}, {"reg",174}, {"macr",175}, {"deg",176},
	{"plusmn",177}, {"sup2",178}, {"sup3",179}, {"acute",180},
	{"micro",181}, {"para",182}, {"middot",183}, {"cedil",184},
	{"sup1",185}, {"ordm",186}, {"raquo",187}, {"frac14",188},
	{"frac12",189}, {"frac34",190}, {"iquest",191}, {"Agrave",192},
	{"Aacute",193}, {"Acirc",194}, {"Atilde",195}, {"Auml",196},
	{"Aring",197}, {"AElig",198}, {"Ccedil",199}, {"Egrave",200},
	{"Eacute",201}, {"Ecirc",202}, {"Euml",203}, {"Igrave",204},
	{"Iacute",205}, {"Icirc",206}, {"Iuml",207}, {"ETH",208},
	{"Ntilde",209}, {"Ograve",210}, {"Oacute",211}, {"Ocirc",212},
	{"Otilde",213}, {"Ouml",214}, {"times",215}, {"Oslash",216},
	{"Ugrave",217}, {"Uacute",218}, {"Ucirc",219}, {"Uuml",220},
	{"Yacute",221}, {"THORN",222}, {"szlig",223}, {"agrave",224},
	{"aacute",225}, {"acirc",226}, {"atilde",227}, {"auml",228},
	{"aring",229}, {"aelig",230}, {"ccedil",231}, {"egrave",232},
	{"eacute",233}, {"ecirc",234}, {"euml",235}, {"igrave",236},
	{"iacute",237}, {"icirc",238}, {"iuml",239}, {"eth",240},
	{"ntilde",241}, {"ograve",242}, {"oacute",243}, {"ocirc",244},
	{"otilde",245}, {"ouml",246}, {"divide",247}, {"oslash",248},
	{"ugrave",249}, {"uacute",250}, {"ucirc",251}, {"uuml",252},
	{"yacute",253}, {"thorn",254}, {"yuml",255}, {"lt",60}, {"gt",62},
	{"amp",38}, {"apos",39}, {"quot",34}, {"OElig",338}, {"oelig",339},
	{"Scaron",352}, {"scaron",353}, {"Yuml",376}, {"circ",710},
	{"tilde",732}, {"ensp",8194}, {"emsp",8195}, {"thinsp",8201},
	{"zwnj",8204}, {"zwj",8205}, {"lrm",8206}, {"rlm",8207},
	{"ndash",8211}, {"mdash",8212}, {"lsquo",8216}, {"rsquo",8217},
	{"sbquo",8218}, {"ldquo",8220}, {"rdquo",8221}, {"bdquo",8222},
	{"dagger",8224}, {"Dagger",8225}, {"permil",8240}, {"lsaquo",8249},
	{"rsaquo",8250}, {"euro",8364}, {"fnof",402}, {"Alpha",913},
	{"Beta",914}, {"Gamma",915}, {"Delta",916}, {"Epsilon",917},
	{"Zeta",918}, {"Eta",919}, {"Theta",920}, {"Iota",921}, {"Kappa",922},
	{"Lambda",923}, {"Mu",924}, {"Nu",925}, {"Xi",926}, {"Omicron",927},
	{"Pi",928}, {"Rho",929}, {"Sigma",931}, {"Tau",932}, {"Upsilon",933},
	{"Phi",934}, {"Chi",935}, {"Psi",936}, {"Omega",937}, {"alpha",945},
	{"beta",946}, {"gamma",947}, {"delta",948}, {"epsilon",949},
	{"zeta",950}, {"eta",951}, {"theta",952}, {"iota",953}, {"kappa",954},
	{"lambda",955}, {"mu",956}, {"nu",957}, {"xi",958}, {"omicron",959},
	{"pi",960}, {"rho",961}, {"sigmaf",962}, {"sigma",963}, {"tau",964},
	{"upsilon",965}, {"phi",966}, {"chi",967}, {"psi",968}, {"omega",969},
	{"thetasym",977}, {"upsih",978}, {"piv",982}, {"bull",8226},
	{"hellip",8230}, {"prime",8242}, {"Prime",8243}, {"oline",8254},
	{"frasl",8260}, {"weierp",8472}, {"image",8465}, {"real",8476},
	{"trade",8482}, {"alefsym",8501}, {"larr",8592}, {"uarr",8593},
	{"rarr",8594}, {"darr",8595}, {"harr",8596}, {"crarr",8629},
	{"lArr",8656}, {"uArr",8657}, {"rArr",8658}, {"dArr",8659},
	{"hArr",8660}, {"forall",8704}, {"part",8706}, {"exist",8707},
	{"empty",8709}, {"nabla",8711}, {"isin",8712}, {"notin",8713},
	{"ni",8715}, {"prod",8719}, {"sum",8721}, {"minus",8722},
	{"lowast",8727}, {"radic",8730}, {"prop",8733}, {"infin",8734},
	{"ang",8736}, {"and",8743}, {"or",8744}, {"cap",8745}, {"cup",8746},
	{"int",8747}, {"there4",8756}, {"sim",8764}, {"cong",8773},
	{"asymp",8776}, {"ne",8800}, {"equiv",8801}, {"le",8804}, {"ge",8805},
	{"sub",8834}, {"sup",8835}, {"nsub",8836}, {"sube",8838},
	{"supe",8839}, {"oplus",8853}, {"otimes",8855}, {"perp",8869},
	{"sdot",8901}, {"lceil",8968}, {"rceil",8969}, {"lfloor",8970},
	{"rfloor",8971}, {"lang",9001}, {"rang",9002}, {"loz",9674},
	{"spades",9824}, {"clubs",9827}, {"hearts",9829}, {"diams",9830},
};
69
70	struct parser
71	{
72	fz_pool *pool;
73	fz_xml *head;
74	int preserve_white;
75	int depth;
76	};
77
/*
	One attribute of an element, kept in a singly linked list hanging
	off fz_xml_s.atts.
*/
struct attribute
{
	char name[40];		/* NUL-terminated; longer names are truncated by xml_emit_att_name */
	char *value;		/* decoded value, or NULL until xml_emit_att_value has run */
	struct attribute *next;	/* next attribute of the same element, or NULL */
};
84
85	struct fz_xml_doc_s
86	{
87	fz_pool *pool;
88	fz_xml *root;
89	};
90
91	struct fz_xml_s
92	{
93	char name[`40`];
94	char *text;
95	struct attribute *atts;
96	fz_xml up, down, tail, prev, *next;
97	};
98
/*
	Write n levels of indentation (two spaces per level) to stdout.
*/
static void xml_indent(int n)
{
	while (n--) {
		putchar(' ');
		putchar(' ');
	}
}
106
107	/*
108	Pretty-print an XML tree to stdout.
109	*/
110	void fz_debug_xml(fz_xml item, int* level)
111	{
112	if (item->text)
113	{
114	char *s = item->text;
115	int c;
116	xml_indent(level);
117	putchar(`'"'`);
118	while ((c = *s++)) {
119	switch (c) {
120	default:
121	if (c < `32` \|\| c > `127`) {
122	putchar(`'\\'`);
123	putchar(`'x'`);
124	putchar("0123456789ABCDEF"[(c>>`4`) & `15`]);
125	putchar("0123456789ABCDEF"[(c) & `15`]);
126	} else {
127	putchar(c);
128	}
129	break;
130	case `'\\'`: putchar(`'\\'`); putchar(`'\\'`); break;
131	case `'\b'`: putchar(`'\\'`); putchar(`'b'`); break;
132	case `'\f'`: putchar(`'\\'`); putchar(`'f'`); break;
133	case `'\n'`: putchar(`'\\'`); putchar(`'n'`); break;
134	case `'\r'`: putchar(`'\\'`); putchar(`'r'`); break;
135	case `'\t'`: putchar(`'\\'`); putchar(`'t'`); break;
136	}
137	}
138	putchar(`'\n'`);
139	}
140	else
141	{
142	fz_xml *child;
143	struct attribute *att;
144
145	xml_indent(level);
146	printf("(%s\n", item->name);
147	for (att = item->atts; att; att = att->next)
148	{
149	xml_indent(level);
150	printf("=%s %s\n", att->name, att->value);
151	}
152	for (child = item->down; child; child = child->next)
153	fz_debug_xml(child, level + `1`);
154	xml_indent(level);
155	printf(")%s\n", item->name);
156	}
157	}
158
159	/*
160	Return previous sibling of XML node.
161	*/
162	fz_xml fz_xml_prev(fz_xml item)
163	{
164	return item ? item->prev : NULL;
165	}
166
167	/*
168	Return next sibling of XML node.
169	*/
170	fz_xml fz_xml_next(fz_xml item)
171	{
172	return item ? item->next : NULL;
173	}
174
175	/*
176	Return parent of XML node.
177	*/
178	fz_xml fz_xml_up(fz_xml item)
179	{
180	return item ? item->up : NULL;
181	}
182
183	/*
184	Return first child of XML node.
185	*/
186	fz_xml fz_xml_down(fz_xml item)
187	{
188	return item ? item->down : NULL;
189	}
190
191	/*
192	Return the text content of an XML node.
193	Return NULL if the node is a tag.
194	*/
195	char fz_xml_text(fz_xml item)
196	{
197	return item ? item->text : NULL;
198	}
199
200	/*
201	Return tag of XML node. Return NULL for text nodes.
202	*/
203	char fz_xml_tag(fz_xml item)
204	{
205	return item && item->name[`0`] ? item->name : NULL;
206	}
207
208	/*
209	Return true if the tag name matches.
210	*/
211	int fz_xml_is_tag(fz_xml item, const* char *name)
212	{
213	if (!item)
214	return `0`;
215	return !strcmp(item->name, name);
216	}
217
218	/*
219	Return the value of an attribute of an XML node.
220	NULL if the attribute doesn't exist.
221	*/
222	char fz_xml_att(fz_xml item, const char *name)
223	{
224	struct attribute *att;
225	if (!item)
226	return NULL;
227	for (att = item->atts; att; att = att->next)
228	if (!strcmp(att->name, name))
229	return att->value;
230	return NULL;
231	}
232
233	fz_xml fz_xml_find(fz_xml item, const char *tag)
234	{
235	while (item)
236	{
237	if (!strcmp(item->name, tag))
238	return item;
239	item = item->next;
240	}
241	return NULL;
242	}
243
244	fz_xml fz_xml_find_next(fz_xml item, const char *tag)
245	{
246	if (item)
247	item = item->next;
248	return fz_xml_find(item, tag);
249	}
250
251	fz_xml fz_xml_find_down(fz_xml item, const char *tag)
252	{
253	if (item)
254	item = item->down;
255	return fz_xml_find(item, tag);
256	}
257
258	fz_xml fz_xml_root(fz_xml_doc xml)
259	{
260	return xml ? xml->root : NULL;
261	}
262
263	/*
264	Free the XML node and all its children and siblings.
265	*/
266	void fz_drop_xml(fz_context ctx, fz_xml_doc xml)
267	{
268	if (xml)
269	fz_drop_pool(ctx, xml->pool);
270	}
271
272	/*
273	Detach a node from the tree, unlinking it from its parent,
274	and setting the document root to the node.
275	*/
276	void fz_detach_xml(fz_context ctx, fz_xml_doc xml, fz_xml *node)
277	{
278	if (node->up)
279	node->up->down = NULL;
280	xml->root = node;
281	}
282
283	static size_t xml_parse_entity(int c, char* *a)
284	{
285	char *b;
286	size_t i;
287
288	if (a[`1`] == `'#'`) {
289	if (a[`2`] == `'x'`)
290	*c = strtol(a + `3`, &b, `16`);
291	else
292	*c = strtol(a + `2`, &b, `10`);
293	if (*b == `';'`)
294	return b - a + `1`;
295	}
296	else if (a[`1`] == `'l'` && a[`2`] == `'t'` && a[`3`] == `';'`) {
297	*c = `'<'`;
298	return `4`;
299	}
300	else if (a[`1`] == `'g'` && a[`2`] == `'t'` && a[`3`] == `';'`) {
301	*c = `'>'`;
302	return `4`;
303	}
304	else if (a[`1`] == `'a'` && a[`2`] == `'m'` && a[`3`] == `'p'` && a[`4`] == `';'`) {
305	*c = `'&'`;
306	return `5`;
307	}
308	else if (a[`1`] == `'a'` && a[`2`] == `'p'` && a[`3`] == `'o'` && a[`4`] == `'s'` && a[`5`] == `';'`) {
309	*c = `'\''`;
310	return `6`;
311	}
312	else if (a[`1`] == `'q'` && a[`2`] == `'u'` && a[`3`] == `'o'` && a[`4`] == `'t'` && a[`5`] == `';'`) {
313	*c = `'"'`;
314	return `6`;
315	}
316
317	/ We should only be doing this for XHTML, but it shouldn't be a problem. /
318	for (i = `0`; i < nelem(html_entities); ++i) {
319	size_t n = strlen(html_entities[i].name);
320	if (!strncmp(a+`1`, html_entities[i].name, n) && a[n+`1`] == `';'`) {
321	*c = html_entities[i].c;
322	return n + `2`;
323	}
324	}
325
326	c = a;
327	return `1`;
328	}
329
/*
	Return non-zero if c may appear in an element or attribute name:
	ASCII letters, digits, and the characters '.', '-', '_' and ':'.
*/
static inline int isname(int c)
{
	return c == '.' || c == '-' || c == '_' || c == ':' ||
		(c >= '0' && c <= '9') ||
		(c >= 'A' && c <= 'Z') ||
		(c >= 'a' && c <= 'z');
}
337
/*
	Return non-zero if c is a space, carriage return, newline or tab.
*/
static inline int iswhite(int c)
{
	return c == ' ' || c == '\r' || c == '\n' || c == '\t';
}
342
343	static void xml_emit_open_tag(fz_context ctx, struct* parser parser, char* a, char* *b)
344	{
345	fz_xml head, tail;
346	char *ns;
347
348	/ skip namespace prefix /
349	for (ns = a; ns < b; ++ns)
350	if (*ns == `':'`)
351	a = ns + `1`;
352
353	head = fz_pool_alloc(ctx, parser->pool, sizeof *head);
354	if (b - a > sizeof(head->name) - `1`)
355	b = a + sizeof(head->name) - `1`;
356	memcpy(head->name, a, b - a);
357	head->name[b - a] = `0`;
358
359	head->atts = NULL;
360	head->text = NULL;
361	head->up = parser->head;
362	head->down = NULL;
363	head->prev = NULL;
364	head->next = NULL;
365
366	if (!parser->head->down) {
367	parser->head->down = head;
368	parser->head->tail = head;
369	}
370	else {
371	tail = parser->head->tail;
372	tail->next = head;
373	head->prev = tail;
374	parser->head->tail = head;
375	}
376
377	parser->head = head;
378	parser->depth++;
379	}
380
381	static void xml_emit_att_name(fz_context ctx, struct* parser parser, char* a, char* *b)
382	{
383	fz_xml *head = parser->head;
384	struct attribute *att;
385
386	att = fz_pool_alloc(ctx, parser->pool, sizeof *att);
387	if (b - a > sizeof(att->name) - `1`)
388	b = a + sizeof(att->name) - `1`;
389	memcpy(att->name, a, b - a);
390	att->name[b - a] = `0`;
391	att->value = NULL;
392	att->next = head->atts;
393	head->atts = att;
394	}
395
396	static void xml_emit_att_value(fz_context ctx, struct* parser parser, char* a, char* *b)
397	{
398	fz_xml *head = parser->head;
399	struct attribute *att = head->atts;
400	char *s;
401	int c;
402
403	/ entities are all longer than UTFmax so runetochar is safe /
404	s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + `1`);
405	while (a < b) {
406	if (*a == `'&'`) {
407	a += xml_parse_entity(&c, a);
408	s += fz_runetochar(s, c);
409	}
410	else {
411	s++ = a++;
412	}
413	}
414	*s = `0`;
415	}
416
417	static void xml_emit_close_tag(fz_context ctx, struct* parser *parser)
418	{
419	parser->depth--;
420	if (parser->head->up)
421	parser->head = parser->head->up;
422	}
423
424	static void xml_emit_text(fz_context ctx, struct* parser parser, char* a, char* *b)
425	{
426	static char *empty = "";
427	fz_xml *head;
428	char *s;
429	int c;
430
431	/ Skip text outside the root tag /
432	if (parser->depth == `0`)
433	return;
434
435	/ Skip all-whitespace text nodes /
436	if (!parser->preserve_white)
437	{
438	for (s = a; s < b; s++)
439	if (!iswhite(*s))
440	break;
441	if (s == b)
442	return;
443	}
444
445	xml_emit_open_tag(ctx, parser, empty, empty);
446	head = parser->head;
447
448	/ entities are all longer than UTFmax so runetochar is safe /
449	s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + `1`);
450	while (a < b) {
451	if (*a == `'&'`) {
452	a += xml_parse_entity(&c, a);
453	s += fz_runetochar(s, c);
454	}
455	else {
456	s++ = a++;
457	}
458	}
459	*s = `0`;
460
461	xml_emit_close_tag(ctx, parser);
462	}
463
464	static void xml_emit_cdata(fz_context ctx, struct* parser parser, char* a, char* *b)
465	{
466	static char *empty = "";
467	fz_xml *head;
468	char *s;
469
470	xml_emit_open_tag(ctx, parser, empty, empty);
471	head = parser->head;
472
473	s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + `1`);
474	while (a < b)
475	s++ = a++;
476	*s = `0`;
477
478	xml_emit_close_tag(ctx, parser);
479	}
480
481	static char xml_parse_document_imp(fz_context ctx, struct parser parser, char* *p)
482	{
483	char *mark;
484	int quote;
485
486	parse_text:
487	mark = p;
488	while (p && p != `'<'`) ++p;
489	if (*p == `'<'`) {
490	/ skip trailing newline before closing tag /
491	if (p[`1`] == `'/'` && p - `1` >= mark && p[-`1`] == `'\n'`)
492	xml_emit_text(ctx, parser, mark, p - `1`);
493	else if (mark < p)
494	xml_emit_text(ctx, parser, mark, p);
495	++p;
496	goto parse_element;
497	} else if (mark < p)
498	xml_emit_text(ctx, parser, mark, p);
499	return NULL;
500
501	parse_element:
502	if (p == `'/'`) { ++p; goto* parse_closing_element; }
503	if (p == `'!'`) { ++p; goto* parse_comment; }
504	if (p == `'?'`) { ++p; goto* parse_processing_instruction; }
505	while (iswhite(*p)) ++p;
506	if (isname(*p))
507	goto parse_element_name;
508	return "syntax error in element";
509
510	parse_comment:
511	if (p[`0`]==`'D'` && p[`1`]==`'O'` && p[`2`]==`'C'` && p[`3`]==`'T'` && p[`4`]==`'Y'` && p[`5`]==`'P'` && p[`6`]==`'E'`)
512	goto parse_declaration;
513	if (p[`0`]==`'E'` && p[`1`]==`'N'` && p[`2`]==`'T'` && p[`3`]==`'I'` && p[`4`]==`'T'` && p[`5`]==`'Y'`)
514	goto parse_declaration;
515	if (p == `'['`) goto* parse_cdata;
516	if (p++ != `'-'`) return* "syntax error in comment (<! not followed by --)";
517	if (p++ != `'-'`) return* "syntax error in comment (<!- not followed by -)";
518	while (*p) {
519	if (p[`0`] == `'-'` && p[`1`] == `'-'` && p[`2`] == `'>'`) {
520	p += `3`;
521	goto parse_text;
522	}
523	++p;
524	}
525	return "end of data in comment";
526
527	parse_declaration:
528	while (p) if (p++ == `'>'`) goto parse_text;
529	return "end of data in declaration";
530
531	parse_cdata:
532	if (p[`1`] != `'C'` \|\| p[`2`] != `'D'` \|\| p[`3`] != `'A'` \|\| p[`4`] != `'T'` \|\| p[`5`] != `'A'` \|\| p[`6`] != `'['`)
533	return "syntax error in CDATA section";
534	p += `7`;
535	mark = p;
536	while (*p) {
537	if (p[`0`] == `']'` && p[`1`] == `']'` && p[`2`] == `'>'`) {
538	xml_emit_cdata(ctx, parser, mark, p);
539	p += `3`;
540	goto parse_text;
541	}
542	++p;
543	}
544	return "end of data in CDATA section";
545
546	parse_processing_instruction:
547	while (*p) {
548	if (p[`0`] == `'?'` && p[`1`] == `'>'`) {
549	p += `2`;
550	goto parse_text;
551	}
552	++p;
553	}
554	return "end of data in processing instruction";
555
556	parse_closing_element:
557	while (iswhite(*p)) ++p;
558	while (isname(*p)) ++p;
559	while (iswhite(*p)) ++p;
560	if (*p != `'>'`)
561	return "syntax error in closing element";
562	xml_emit_close_tag(ctx, parser);
563	++p;
564	goto parse_text;
565
566	parse_element_name:
567	mark = p;
568	while (isname(*p)) ++p;
569	xml_emit_open_tag(ctx, parser, mark, p);
570	if (*p == `'>'`) {
571	++p;
572	if (p == `'\n'`) ++p; /* must skip linebreak immediately after an opening tag /
573	goto parse_text;
574	}
575	if (p[`0`] == `'/'` && p[`1`] == `'>'`) {
576	xml_emit_close_tag(ctx, parser);
577	p += `2`;
578	goto parse_text;
579	}
580	if (iswhite(*p))
581	goto parse_attributes;
582	return "syntax error after element name";
583
584	parse_attributes:
585	while (iswhite(*p)) ++p;
586	if (isname(*p))
587	goto parse_attribute_name;
588	if (*p == `'>'`) {
589	++p;
590	if (p == `'\n'`) ++p; /* must skip linebreak immediately after an opening tag /
591	goto parse_text;
592	}
593	if (p[`0`] == `'/'` && p[`1`] == `'>'`) {
594	xml_emit_close_tag(ctx, parser);
595	p += `2`;
596	goto parse_text;
597	}
598	return "syntax error in attributes";
599
600	parse_attribute_name:
601	mark = p;
602	while (isname(*p)) ++p;
603	xml_emit_att_name(ctx, parser, mark, p);
604	while (iswhite(*p)) ++p;
605	if (p == `'='`) { ++p; goto* parse_attribute_value; }
606	return "syntax error after attribute name";
607
608	parse_attribute_value:
609	while (iswhite(*p)) ++p;
610	quote = *p++;
611	if (quote != `'"'` && quote != `'\''`)
612	return "missing quote character";
613	mark = p;
614	while (p && p != quote) ++p;
615	if (*p == quote) {
616	xml_emit_att_value(ctx, parser, mark, p++);
617	goto parse_attributes;
618	}
619	return "end of data in attribute value";
620	}
621
/*
	Return non-zero if the first strlen(b) characters of a equal b
	under fz_strncasecmp.
*/
static int startswith(const char *a, const char *b)
{
	return !fz_strncasecmp(a, b, strlen(b));
}
626
627	static const unsigned short find_xml_encoding(char* *s)
628	{
629	const unsigned short *table = NULL;
630	char end, xml, *enc;
631
632	end = strchr(s, `'>'`);
633	if (end)
634	{
635	*end = `0`;
636	xml = strstr(s, "<?xml");
637	if (xml)
638	{
639	enc = strstr(xml, "encoding=");
640	if (enc)
641	{
642	enc += `10`;
643	if (startswith(enc, "iso-8859-1") \|\| startswith(enc, "latin1"))
644	table = fz_unicode_from_iso8859_1;
645	else if (startswith(enc, "iso-8859-7") \|\| startswith(enc, "greek"))
646	table = fz_unicode_from_iso8859_7;
647	else if (startswith(enc, "koi8"))
648	table = fz_unicode_from_koi8u;
649	else if (startswith(enc, "windows-1250"))
650	table = fz_unicode_from_windows_1250;
651	else if (startswith(enc, "windows-1251"))
652	table = fz_unicode_from_windows_1251;
653	else if (startswith(enc, "windows-1252"))
654	table = fz_unicode_from_windows_1252;
655	}
656	}
657	*end = `'>'`;
658	}
659
660	return table;
661	}
662
663	static char convert_to_utf8(fz_context ctx, unsigned char s, size_t n, int* *dofree)
664	{
665	const unsigned short *table;
666	const unsigned char *e = s + n;
667	char dst, d;
668	int c;
669
670	if (s[`0`] == `0xFE` && s[`1`] == `0xFF`) {
671	s += `2`;
672	dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
673	while (s + `1` < e) {
674	c = s[`0`] << `8` \| s[`1`];
675	d += fz_runetochar(d, c);
676	s += `2`;
677	}
678	*d = `0`;
679	*dofree = `1`;
680	return dst;
681	}
682
683	if (s[`0`] == `0xFF` && s[`1`] == `0xFE`) {
684	s += `2`;
685	dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
686	while (s + `1` < e) {
687	c = s[`0`] \| s[`1`] << `8`;
688	d += fz_runetochar(d, c);
689	s += `2`;
690	}
691	*d = `0`;
692	*dofree = `1`;
693	return dst;
694	}
695
696	table = find_xml_encoding((char*)s);
697	if (table) {
698	dst = d = fz_malloc(ctx, n * FZ_UTFMAX);
699	while (*s) {
700	c = table[*s++];
701	d += fz_runetochar(d, c);
702	}
703	*d = `0`;
704	*dofree = `1`;
705	return dst;
706	}
707
708	*dofree = `0`;
709
710	if (s[`0`] == `0xEF` && s[`1`] == `0xBB` && s[`2`] == `0xBF`)
711	return (char*)s+`3`;
712
713	return (char*)s;
714	}
715
716	/*
717	Parse the contents of buffer into a tree of xml nodes.
718
719	preserve_white: whether to keep or delete all-whitespace nodes.
720	*/
721	fz_xml_doc *
722	fz_parse_xml(fz_context ctx, fz_buffer buf, int preserve_white)
723	{
724	struct parser parser;
725	fz_xml_doc *xml = NULL;
726	fz_xml root, *node;
727	char *p = NULL;
728	char *error;
729	int dofree;
730	unsigned char *s;
731	size_t n;
732
733	fz_var(p);
734
735	/ ensure we are zero-terminated /
736	fz_terminate_buffer(ctx, buf);
737	n = fz_buffer_storage(ctx, buf, &s);
738
739	memset(&root, `0`, sizeof(root));
740	parser.pool = fz_new_pool(ctx);
741	parser.head = &root;
742	parser.preserve_white = preserve_white;
743	parser.depth = `0`;
744
745	fz_try(ctx)
746	{
747	p = convert_to_utf8(ctx, s, n, &dofree);
748
749	error = xml_parse_document_imp(ctx, &parser, p);
750	if (error)
751	fz_throw(ctx, FZ_ERROR_GENERIC, "%s", error);
752
753	for (node = root.down; node; node = node->next)
754	node->up = NULL;
755
756	xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml);
757	xml->pool = parser.pool;
758	xml->root = root.down;
759	}
760	fz_always(ctx)
761	{
762	if (dofree)
763	fz_free(ctx, p);
764	}
765	fz_catch(ctx)
766	{
767	fz_drop_pool(ctx, parser.pool);
768	fz_rethrow(ctx);
769	}
770
771	return xml;
772	}
773

Browse the source code of MuPDF/source/fitz/xml.c