1 | #include "mupdf/fitz.h" |
2 | |
3 | #include <string.h> |
4 | #include <stdlib.h> |
5 | #include <stdio.h> |
6 | |
/*
	Named character entities recognised in addition to the five
	predefined XML ones. Each entry maps an entity name (written
	without the leading '&' and trailing ';') to its code point.
	The table is searched linearly by xml_parse_entity.
*/
static const struct
{
	const char *name;
	int c;
}
html_entities[] =
{
	/* Code points 160-255 */
	{ "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 },
	{ "curren", 164 }, { "yen", 165 }, { "brvbar", 166 }, { "sect", 167 },
	{ "uml", 168 }, { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 },
	{ "not", 172 }, { "shy", 173 }, { "reg", 174 }, { "macr", 175 },
	{ "deg", 176 }, { "plusmn", 177 }, { "sup2", 178 }, { "sup3", 179 },
	{ "acute", 180 }, { "micro", 181 }, { "para", 182 }, { "middot", 183 },
	{ "cedil", 184 }, { "sup1", 185 }, { "ordm", 186 }, { "raquo", 187 },
	{ "frac14", 188 }, { "frac12", 189 }, { "frac34", 190 }, { "iquest", 191 },
	{ "Agrave", 192 }, { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 },
	{ "Auml", 196 }, { "Aring", 197 }, { "AElig", 198 }, { "Ccedil", 199 },
	{ "Egrave", 200 }, { "Eacute", 201 }, { "Ecirc", 202 }, { "Euml", 203 },
	{ "Igrave", 204 }, { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 },
	{ "ETH", 208 }, { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 },
	{ "Ocirc", 212 }, { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 },
	{ "Oslash", 216 }, { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 },
	{ "Uuml", 220 }, { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 },
	{ "agrave", 224 }, { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 },
	{ "auml", 228 }, { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 },
	{ "egrave", 232 }, { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 },
	{ "igrave", 236 }, { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 },
	{ "eth", 240 }, { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 },
	{ "ocirc", 244 }, { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 },
	{ "oslash", 248 }, { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 },
	{ "uuml", 252 }, { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },

	/* Markup characters */
	{ "lt", 60 }, { "gt", 62 }, { "amp", 38 }, { "apos", 39 }, { "quot", 34 },

	/* Extended Latin letters, modifiers, spaces and punctuation */
	{ "OElig", 338 }, { "oelig", 339 }, { "Scaron", 352 }, { "scaron", 353 },
	{ "Yuml", 376 }, { "circ", 710 }, { "tilde", 732 },
	{ "ensp", 8194 }, { "emsp", 8195 }, { "thinsp", 8201 },
	{ "zwnj", 8204 }, { "zwj", 8205 }, { "lrm", 8206 }, { "rlm", 8207 },
	{ "ndash", 8211 }, { "mdash", 8212 }, { "lsquo", 8216 }, { "rsquo", 8217 },
	{ "sbquo", 8218 }, { "ldquo", 8220 }, { "rdquo", 8221 }, { "bdquo", 8222 },
	{ "dagger", 8224 }, { "Dagger", 8225 }, { "permil", 8240 },
	{ "lsaquo", 8249 }, { "rsaquo", 8250 }, { "euro", 8364 },

	/* Greek letters (and fnof) */
	{ "fnof", 402 },
	{ "Alpha", 913 }, { "Beta", 914 }, { "Gamma", 915 }, { "Delta", 916 },
	{ "Epsilon", 917 }, { "Zeta", 918 }, { "Eta", 919 }, { "Theta", 920 },
	{ "Iota", 921 }, { "Kappa", 922 }, { "Lambda", 923 }, { "Mu", 924 },
	{ "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 }, { "Pi", 928 },
	{ "Rho", 929 }, { "Sigma", 931 }, { "Tau", 932 }, { "Upsilon", 933 },
	{ "Phi", 934 }, { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 },
	{ "alpha", 945 }, { "beta", 946 }, { "gamma", 947 }, { "delta", 948 },
	{ "epsilon", 949 }, { "zeta", 950 }, { "eta", 951 }, { "theta", 952 },
	{ "iota", 953 }, { "kappa", 954 }, { "lambda", 955 }, { "mu", 956 },
	{ "nu", 957 }, { "xi", 958 }, { "omicron", 959 }, { "pi", 960 },
	{ "rho", 961 }, { "sigmaf", 962 }, { "sigma", 963 }, { "tau", 964 },
	{ "upsilon", 965 }, { "phi", 966 }, { "chi", 967 }, { "psi", 968 },
	{ "omega", 969 }, { "thetasym", 977 }, { "upsih", 978 }, { "piv", 982 },

	/* Symbols, arrows and mathematical operators */
	{ "bull", 8226 }, { "hellip", 8230 }, { "prime", 8242 }, { "Prime", 8243 },
	{ "oline", 8254 }, { "frasl", 8260 }, { "weierp", 8472 }, { "image", 8465 },
	{ "real", 8476 }, { "trade", 8482 }, { "alefsym", 8501 },
	{ "larr", 8592 }, { "uarr", 8593 }, { "rarr", 8594 }, { "darr", 8595 },
	{ "harr", 8596 }, { "crarr", 8629 },
	{ "lArr", 8656 }, { "uArr", 8657 }, { "rArr", 8658 }, { "dArr", 8659 },
	{ "hArr", 8660 },
	{ "forall", 8704 }, { "part", 8706 }, { "exist", 8707 }, { "empty", 8709 },
	{ "nabla", 8711 }, { "isin", 8712 }, { "notin", 8713 }, { "ni", 8715 },
	{ "prod", 8719 }, { "sum", 8721 }, { "minus", 8722 }, { "lowast", 8727 },
	{ "radic", 8730 }, { "prop", 8733 }, { "infin", 8734 }, { "ang", 8736 },
	{ "and", 8743 }, { "or", 8744 }, { "cap", 8745 }, { "cup", 8746 },
	{ "int", 8747 }, { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 },
	{ "asymp", 8776 }, { "ne", 8800 }, { "equiv", 8801 }, { "le", 8804 },
	{ "ge", 8805 }, { "sub", 8834 }, { "sup", 8835 }, { "nsub", 8836 },
	{ "sube", 8838 }, { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 },
	{ "perp", 8869 }, { "sdot", 8901 },
	{ "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 }, { "rfloor", 8971 },
	{ "lang", 9001 }, { "rang", 9002 }, { "loz", 9674 },
	{ "spades", 9824 }, { "clubs", 9827 }, { "hearts", 9829 }, { "diams", 9830 },
};
69 | |
70 | struct parser |
71 | { |
72 | fz_pool *pool; |
73 | fz_xml *head; |
74 | int preserve_white; |
75 | int depth; |
76 | }; |
77 | |
/* One attribute of a tag; the attributes of a tag form a singly linked list. */
struct attribute
{
	/* Attribute name, zero-terminated; longer names are truncated. */
	char name[40];
	/* Attribute value with entities decoded; NULL until a value is stored. */
	char *value;
	/* Next attribute of the same tag, or NULL. */
	struct attribute *next;
};
84 | |
85 | struct fz_xml_doc_s |
86 | { |
87 | fz_pool *pool; |
88 | fz_xml *root; |
89 | }; |
90 | |
91 | struct fz_xml_s |
92 | { |
93 | char name[40]; |
94 | char *text; |
95 | struct attribute *atts; |
96 | fz_xml *up, *down, *tail, *prev, *next; |
97 | }; |
98 | |
/* Write n levels of indentation (two spaces per level) to stdout. */
static void xml_indent(int n)
{
	for (; n != 0; --n)
		fputs("  ", stdout);
}
106 | |
107 | /* |
108 | Pretty-print an XML tree to stdout. |
109 | */ |
110 | void fz_debug_xml(fz_xml *item, int level) |
111 | { |
112 | if (item->text) |
113 | { |
114 | char *s = item->text; |
115 | int c; |
116 | xml_indent(level); |
117 | putchar('"'); |
118 | while ((c = *s++)) { |
119 | switch (c) { |
120 | default: |
121 | if (c < 32 || c > 127) { |
122 | putchar('\\'); |
123 | putchar('x'); |
124 | putchar("0123456789ABCDEF" [(c>>4) & 15]); |
125 | putchar("0123456789ABCDEF" [(c) & 15]); |
126 | } else { |
127 | putchar(c); |
128 | } |
129 | break; |
130 | case '\\': putchar('\\'); putchar('\\'); break; |
131 | case '\b': putchar('\\'); putchar('b'); break; |
132 | case '\f': putchar('\\'); putchar('f'); break; |
133 | case '\n': putchar('\\'); putchar('n'); break; |
134 | case '\r': putchar('\\'); putchar('r'); break; |
135 | case '\t': putchar('\\'); putchar('t'); break; |
136 | } |
137 | } |
138 | putchar('\n'); |
139 | } |
140 | else |
141 | { |
142 | fz_xml *child; |
143 | struct attribute *att; |
144 | |
145 | xml_indent(level); |
146 | printf("(%s\n" , item->name); |
147 | for (att = item->atts; att; att = att->next) |
148 | { |
149 | xml_indent(level); |
150 | printf("=%s %s\n" , att->name, att->value); |
151 | } |
152 | for (child = item->down; child; child = child->next) |
153 | fz_debug_xml(child, level + 1); |
154 | xml_indent(level); |
155 | printf(")%s\n" , item->name); |
156 | } |
157 | } |
158 | |
159 | /* |
160 | Return previous sibling of XML node. |
161 | */ |
162 | fz_xml *fz_xml_prev(fz_xml *item) |
163 | { |
164 | return item ? item->prev : NULL; |
165 | } |
166 | |
167 | /* |
168 | Return next sibling of XML node. |
169 | */ |
170 | fz_xml *fz_xml_next(fz_xml *item) |
171 | { |
172 | return item ? item->next : NULL; |
173 | } |
174 | |
175 | /* |
176 | Return parent of XML node. |
177 | */ |
178 | fz_xml *fz_xml_up(fz_xml *item) |
179 | { |
180 | return item ? item->up : NULL; |
181 | } |
182 | |
183 | /* |
184 | Return first child of XML node. |
185 | */ |
186 | fz_xml *fz_xml_down(fz_xml *item) |
187 | { |
188 | return item ? item->down : NULL; |
189 | } |
190 | |
191 | /* |
192 | Return the text content of an XML node. |
193 | Return NULL if the node is a tag. |
194 | */ |
195 | char *fz_xml_text(fz_xml *item) |
196 | { |
197 | return item ? item->text : NULL; |
198 | } |
199 | |
200 | /* |
201 | Return tag of XML node. Return NULL for text nodes. |
202 | */ |
203 | char *fz_xml_tag(fz_xml *item) |
204 | { |
205 | return item && item->name[0] ? item->name : NULL; |
206 | } |
207 | |
208 | /* |
209 | Return true if the tag name matches. |
210 | */ |
211 | int fz_xml_is_tag(fz_xml *item, const char *name) |
212 | { |
213 | if (!item) |
214 | return 0; |
215 | return !strcmp(item->name, name); |
216 | } |
217 | |
218 | /* |
219 | Return the value of an attribute of an XML node. |
220 | NULL if the attribute doesn't exist. |
221 | */ |
222 | char *fz_xml_att(fz_xml *item, const char *name) |
223 | { |
224 | struct attribute *att; |
225 | if (!item) |
226 | return NULL; |
227 | for (att = item->atts; att; att = att->next) |
228 | if (!strcmp(att->name, name)) |
229 | return att->value; |
230 | return NULL; |
231 | } |
232 | |
233 | fz_xml *fz_xml_find(fz_xml *item, const char *tag) |
234 | { |
235 | while (item) |
236 | { |
237 | if (!strcmp(item->name, tag)) |
238 | return item; |
239 | item = item->next; |
240 | } |
241 | return NULL; |
242 | } |
243 | |
244 | fz_xml *fz_xml_find_next(fz_xml *item, const char *tag) |
245 | { |
246 | if (item) |
247 | item = item->next; |
248 | return fz_xml_find(item, tag); |
249 | } |
250 | |
251 | fz_xml *fz_xml_find_down(fz_xml *item, const char *tag) |
252 | { |
253 | if (item) |
254 | item = item->down; |
255 | return fz_xml_find(item, tag); |
256 | } |
257 | |
258 | fz_xml *fz_xml_root(fz_xml_doc *xml) |
259 | { |
260 | return xml ? xml->root : NULL; |
261 | } |
262 | |
263 | /* |
264 | Free the XML node and all its children and siblings. |
265 | */ |
266 | void fz_drop_xml(fz_context *ctx, fz_xml_doc *xml) |
267 | { |
268 | if (xml) |
269 | fz_drop_pool(ctx, xml->pool); |
270 | } |
271 | |
272 | /* |
273 | Detach a node from the tree, unlinking it from its parent, |
274 | and setting the document root to the node. |
275 | */ |
276 | void fz_detach_xml(fz_context *ctx, fz_xml_doc *xml, fz_xml *node) |
277 | { |
278 | if (node->up) |
279 | node->up->down = NULL; |
280 | xml->root = node; |
281 | } |
282 | |
283 | static size_t xml_parse_entity(int *c, char *a) |
284 | { |
285 | char *b; |
286 | size_t i; |
287 | |
288 | if (a[1] == '#') { |
289 | if (a[2] == 'x') |
290 | *c = strtol(a + 3, &b, 16); |
291 | else |
292 | *c = strtol(a + 2, &b, 10); |
293 | if (*b == ';') |
294 | return b - a + 1; |
295 | } |
296 | else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') { |
297 | *c = '<'; |
298 | return 4; |
299 | } |
300 | else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') { |
301 | *c = '>'; |
302 | return 4; |
303 | } |
304 | else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') { |
305 | *c = '&'; |
306 | return 5; |
307 | } |
308 | else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') { |
309 | *c = '\''; |
310 | return 6; |
311 | } |
312 | else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') { |
313 | *c = '"'; |
314 | return 6; |
315 | } |
316 | |
317 | /* We should only be doing this for XHTML, but it shouldn't be a problem. */ |
318 | for (i = 0; i < nelem(html_entities); ++i) { |
319 | size_t n = strlen(html_entities[i].name); |
320 | if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') { |
321 | *c = html_entities[i].c; |
322 | return n + 2; |
323 | } |
324 | } |
325 | |
326 | *c = *a; |
327 | return 1; |
328 | } |
329 | |
/*
	Return 1 if c may appear in a tag or attribute name as far as
	this parser is concerned: ASCII letters, digits, '.', '-', '_'
	and ':'. Return 0 otherwise.
*/
static inline int isname(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;

	switch (c)
	{
	case '.':
	case '-':
	case '_':
	case ':':
		return 1;
	default:
		return 0;
	}
}
337 | |
/*
	Return 1 if c is a space, tab, line feed or carriage return,
	and 0 otherwise.
*/
static inline int iswhite(int c)
{
	switch (c)
	{
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return 1;
	default:
		return 0;
	}
}
342 | |
343 | static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, char *a, char *b) |
344 | { |
345 | fz_xml *head, *tail; |
346 | char *ns; |
347 | |
348 | /* skip namespace prefix */ |
349 | for (ns = a; ns < b; ++ns) |
350 | if (*ns == ':') |
351 | a = ns + 1; |
352 | |
353 | head = fz_pool_alloc(ctx, parser->pool, sizeof *head); |
354 | if (b - a > sizeof(head->name) - 1) |
355 | b = a + sizeof(head->name) - 1; |
356 | memcpy(head->name, a, b - a); |
357 | head->name[b - a] = 0; |
358 | |
359 | head->atts = NULL; |
360 | head->text = NULL; |
361 | head->up = parser->head; |
362 | head->down = NULL; |
363 | head->prev = NULL; |
364 | head->next = NULL; |
365 | |
366 | if (!parser->head->down) { |
367 | parser->head->down = head; |
368 | parser->head->tail = head; |
369 | } |
370 | else { |
371 | tail = parser->head->tail; |
372 | tail->next = head; |
373 | head->prev = tail; |
374 | parser->head->tail = head; |
375 | } |
376 | |
377 | parser->head = head; |
378 | parser->depth++; |
379 | } |
380 | |
381 | static void xml_emit_att_name(fz_context *ctx, struct parser *parser, char *a, char *b) |
382 | { |
383 | fz_xml *head = parser->head; |
384 | struct attribute *att; |
385 | |
386 | att = fz_pool_alloc(ctx, parser->pool, sizeof *att); |
387 | if (b - a > sizeof(att->name) - 1) |
388 | b = a + sizeof(att->name) - 1; |
389 | memcpy(att->name, a, b - a); |
390 | att->name[b - a] = 0; |
391 | att->value = NULL; |
392 | att->next = head->atts; |
393 | head->atts = att; |
394 | } |
395 | |
396 | static void xml_emit_att_value(fz_context *ctx, struct parser *parser, char *a, char *b) |
397 | { |
398 | fz_xml *head = parser->head; |
399 | struct attribute *att = head->atts; |
400 | char *s; |
401 | int c; |
402 | |
403 | /* entities are all longer than UTFmax so runetochar is safe */ |
404 | s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1); |
405 | while (a < b) { |
406 | if (*a == '&') { |
407 | a += xml_parse_entity(&c, a); |
408 | s += fz_runetochar(s, c); |
409 | } |
410 | else { |
411 | *s++ = *a++; |
412 | } |
413 | } |
414 | *s = 0; |
415 | } |
416 | |
417 | static void xml_emit_close_tag(fz_context *ctx, struct parser *parser) |
418 | { |
419 | parser->depth--; |
420 | if (parser->head->up) |
421 | parser->head = parser->head->up; |
422 | } |
423 | |
424 | static void xml_emit_text(fz_context *ctx, struct parser *parser, char *a, char *b) |
425 | { |
426 | static char *empty = "" ; |
427 | fz_xml *head; |
428 | char *s; |
429 | int c; |
430 | |
431 | /* Skip text outside the root tag */ |
432 | if (parser->depth == 0) |
433 | return; |
434 | |
435 | /* Skip all-whitespace text nodes */ |
436 | if (!parser->preserve_white) |
437 | { |
438 | for (s = a; s < b; s++) |
439 | if (!iswhite(*s)) |
440 | break; |
441 | if (s == b) |
442 | return; |
443 | } |
444 | |
445 | xml_emit_open_tag(ctx, parser, empty, empty); |
446 | head = parser->head; |
447 | |
448 | /* entities are all longer than UTFmax so runetochar is safe */ |
449 | s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + 1); |
450 | while (a < b) { |
451 | if (*a == '&') { |
452 | a += xml_parse_entity(&c, a); |
453 | s += fz_runetochar(s, c); |
454 | } |
455 | else { |
456 | *s++ = *a++; |
457 | } |
458 | } |
459 | *s = 0; |
460 | |
461 | xml_emit_close_tag(ctx, parser); |
462 | } |
463 | |
464 | static void xml_emit_cdata(fz_context *ctx, struct parser *parser, char *a, char *b) |
465 | { |
466 | static char *empty = "" ; |
467 | fz_xml *head; |
468 | char *s; |
469 | |
470 | xml_emit_open_tag(ctx, parser, empty, empty); |
471 | head = parser->head; |
472 | |
473 | s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + 1); |
474 | while (a < b) |
475 | *s++ = *a++; |
476 | *s = 0; |
477 | |
478 | xml_emit_close_tag(ctx, parser); |
479 | } |
480 | |
481 | static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, char *p) |
482 | { |
483 | char *mark; |
484 | int quote; |
485 | |
486 | parse_text: |
487 | mark = p; |
488 | while (*p && *p != '<') ++p; |
489 | if (*p == '<') { |
490 | /* skip trailing newline before closing tag */ |
491 | if (p[1] == '/' && p - 1 >= mark && p[-1] == '\n') |
492 | xml_emit_text(ctx, parser, mark, p - 1); |
493 | else if (mark < p) |
494 | xml_emit_text(ctx, parser, mark, p); |
495 | ++p; |
496 | goto parse_element; |
497 | } else if (mark < p) |
498 | xml_emit_text(ctx, parser, mark, p); |
499 | return NULL; |
500 | |
501 | parse_element: |
502 | if (*p == '/') { ++p; goto parse_closing_element; } |
503 | if (*p == '!') { ++p; goto parse_comment; } |
504 | if (*p == '?') { ++p; goto parse_processing_instruction; } |
505 | while (iswhite(*p)) ++p; |
506 | if (isname(*p)) |
507 | goto parse_element_name; |
508 | return "syntax error in element" ; |
509 | |
510 | : |
511 | if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E') |
512 | goto parse_declaration; |
513 | if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y') |
514 | goto parse_declaration; |
515 | if (*p == '[') goto parse_cdata; |
516 | if (*p++ != '-') return "syntax error in comment (<! not followed by --)" ; |
517 | if (*p++ != '-') return "syntax error in comment (<!- not followed by -)" ; |
518 | while (*p) { |
519 | if (p[0] == '-' && p[1] == '-' && p[2] == '>') { |
520 | p += 3; |
521 | goto parse_text; |
522 | } |
523 | ++p; |
524 | } |
525 | return "end of data in comment" ; |
526 | |
527 | parse_declaration: |
528 | while (*p) if (*p++ == '>') goto parse_text; |
529 | return "end of data in declaration" ; |
530 | |
531 | parse_cdata: |
532 | if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[') |
533 | return "syntax error in CDATA section" ; |
534 | p += 7; |
535 | mark = p; |
536 | while (*p) { |
537 | if (p[0] == ']' && p[1] == ']' && p[2] == '>') { |
538 | xml_emit_cdata(ctx, parser, mark, p); |
539 | p += 3; |
540 | goto parse_text; |
541 | } |
542 | ++p; |
543 | } |
544 | return "end of data in CDATA section" ; |
545 | |
546 | parse_processing_instruction: |
547 | while (*p) { |
548 | if (p[0] == '?' && p[1] == '>') { |
549 | p += 2; |
550 | goto parse_text; |
551 | } |
552 | ++p; |
553 | } |
554 | return "end of data in processing instruction" ; |
555 | |
556 | parse_closing_element: |
557 | while (iswhite(*p)) ++p; |
558 | while (isname(*p)) ++p; |
559 | while (iswhite(*p)) ++p; |
560 | if (*p != '>') |
561 | return "syntax error in closing element" ; |
562 | xml_emit_close_tag(ctx, parser); |
563 | ++p; |
564 | goto parse_text; |
565 | |
566 | parse_element_name: |
567 | mark = p; |
568 | while (isname(*p)) ++p; |
569 | xml_emit_open_tag(ctx, parser, mark, p); |
570 | if (*p == '>') { |
571 | ++p; |
572 | if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */ |
573 | goto parse_text; |
574 | } |
575 | if (p[0] == '/' && p[1] == '>') { |
576 | xml_emit_close_tag(ctx, parser); |
577 | p += 2; |
578 | goto parse_text; |
579 | } |
580 | if (iswhite(*p)) |
581 | goto parse_attributes; |
582 | return "syntax error after element name" ; |
583 | |
584 | parse_attributes: |
585 | while (iswhite(*p)) ++p; |
586 | if (isname(*p)) |
587 | goto parse_attribute_name; |
588 | if (*p == '>') { |
589 | ++p; |
590 | if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */ |
591 | goto parse_text; |
592 | } |
593 | if (p[0] == '/' && p[1] == '>') { |
594 | xml_emit_close_tag(ctx, parser); |
595 | p += 2; |
596 | goto parse_text; |
597 | } |
598 | return "syntax error in attributes" ; |
599 | |
600 | parse_attribute_name: |
601 | mark = p; |
602 | while (isname(*p)) ++p; |
603 | xml_emit_att_name(ctx, parser, mark, p); |
604 | while (iswhite(*p)) ++p; |
605 | if (*p == '=') { ++p; goto parse_attribute_value; } |
606 | return "syntax error after attribute name" ; |
607 | |
608 | parse_attribute_value: |
609 | while (iswhite(*p)) ++p; |
610 | quote = *p++; |
611 | if (quote != '"' && quote != '\'') |
612 | return "missing quote character" ; |
613 | mark = p; |
614 | while (*p && *p != quote) ++p; |
615 | if (*p == quote) { |
616 | xml_emit_att_value(ctx, parser, mark, p++); |
617 | goto parse_attributes; |
618 | } |
619 | return "end of data in attribute value" ; |
620 | } |
621 | |
/*
	Return 1 if fz_strncasecmp reports the first strlen(b) bytes of a
	as equal to b, and 0 otherwise.
*/
static int startswith(const char *a, const char *b)
{
	size_t prefix_len = strlen(b);

	return fz_strncasecmp(a, b, prefix_len) == 0;
}
626 | |
627 | static const unsigned short *find_xml_encoding(char *s) |
628 | { |
629 | const unsigned short *table = NULL; |
630 | char *end, *xml, *enc; |
631 | |
632 | end = strchr(s, '>'); |
633 | if (end) |
634 | { |
635 | *end = 0; |
636 | xml = strstr(s, "<?xml" ); |
637 | if (xml) |
638 | { |
639 | enc = strstr(xml, "encoding=" ); |
640 | if (enc) |
641 | { |
642 | enc += 10; |
643 | if (startswith(enc, "iso-8859-1" ) || startswith(enc, "latin1" )) |
644 | table = fz_unicode_from_iso8859_1; |
645 | else if (startswith(enc, "iso-8859-7" ) || startswith(enc, "greek" )) |
646 | table = fz_unicode_from_iso8859_7; |
647 | else if (startswith(enc, "koi8" )) |
648 | table = fz_unicode_from_koi8u; |
649 | else if (startswith(enc, "windows-1250" )) |
650 | table = fz_unicode_from_windows_1250; |
651 | else if (startswith(enc, "windows-1251" )) |
652 | table = fz_unicode_from_windows_1251; |
653 | else if (startswith(enc, "windows-1252" )) |
654 | table = fz_unicode_from_windows_1252; |
655 | } |
656 | } |
657 | *end = '>'; |
658 | } |
659 | |
660 | return table; |
661 | } |
662 | |
663 | static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree) |
664 | { |
665 | const unsigned short *table; |
666 | const unsigned char *e = s + n; |
667 | char *dst, *d; |
668 | int c; |
669 | |
670 | if (s[0] == 0xFE && s[1] == 0xFF) { |
671 | s += 2; |
672 | dst = d = fz_malloc(ctx, n * FZ_UTFMAX); |
673 | while (s + 1 < e) { |
674 | c = s[0] << 8 | s[1]; |
675 | d += fz_runetochar(d, c); |
676 | s += 2; |
677 | } |
678 | *d = 0; |
679 | *dofree = 1; |
680 | return dst; |
681 | } |
682 | |
683 | if (s[0] == 0xFF && s[1] == 0xFE) { |
684 | s += 2; |
685 | dst = d = fz_malloc(ctx, n * FZ_UTFMAX); |
686 | while (s + 1 < e) { |
687 | c = s[0] | s[1] << 8; |
688 | d += fz_runetochar(d, c); |
689 | s += 2; |
690 | } |
691 | *d = 0; |
692 | *dofree = 1; |
693 | return dst; |
694 | } |
695 | |
696 | table = find_xml_encoding((char*)s); |
697 | if (table) { |
698 | dst = d = fz_malloc(ctx, n * FZ_UTFMAX); |
699 | while (*s) { |
700 | c = table[*s++]; |
701 | d += fz_runetochar(d, c); |
702 | } |
703 | *d = 0; |
704 | *dofree = 1; |
705 | return dst; |
706 | } |
707 | |
708 | *dofree = 0; |
709 | |
710 | if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF) |
711 | return (char*)s+3; |
712 | |
713 | return (char*)s; |
714 | } |
715 | |
716 | /* |
717 | Parse the contents of buffer into a tree of xml nodes. |
718 | |
719 | preserve_white: whether to keep or delete all-whitespace nodes. |
720 | */ |
721 | fz_xml_doc * |
722 | fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white) |
723 | { |
724 | struct parser parser; |
725 | fz_xml_doc *xml = NULL; |
726 | fz_xml root, *node; |
727 | char *p = NULL; |
728 | char *error; |
729 | int dofree; |
730 | unsigned char *s; |
731 | size_t n; |
732 | |
733 | fz_var(p); |
734 | |
735 | /* ensure we are zero-terminated */ |
736 | fz_terminate_buffer(ctx, buf); |
737 | n = fz_buffer_storage(ctx, buf, &s); |
738 | |
739 | memset(&root, 0, sizeof(root)); |
740 | parser.pool = fz_new_pool(ctx); |
741 | parser.head = &root; |
742 | parser.preserve_white = preserve_white; |
743 | parser.depth = 0; |
744 | |
745 | fz_try(ctx) |
746 | { |
747 | p = convert_to_utf8(ctx, s, n, &dofree); |
748 | |
749 | error = xml_parse_document_imp(ctx, &parser, p); |
750 | if (error) |
751 | fz_throw(ctx, FZ_ERROR_GENERIC, "%s" , error); |
752 | |
753 | for (node = root.down; node; node = node->next) |
754 | node->up = NULL; |
755 | |
756 | xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml); |
757 | xml->pool = parser.pool; |
758 | xml->root = root.down; |
759 | } |
760 | fz_always(ctx) |
761 | { |
762 | if (dofree) |
763 | fz_free(ctx, p); |
764 | } |
765 | fz_catch(ctx) |
766 | { |
767 | fz_drop_pool(ctx, parser.pool); |
768 | fz_rethrow(ctx); |
769 | } |
770 | |
771 | return xml; |
772 | } |
773 | |