| 1 | #include "mupdf/fitz.h" |
| 2 | |
| 3 | #include <string.h> |
| 4 | #include <stdlib.h> |
| 5 | #include <stdio.h> |
| 6 | |
/*
	Named character entities recognised in addition to numeric character
	references. Each entry maps an entity name (written without the leading
	'&' and trailing ';') to the code point it stands for.
*/
static const struct { const char *name; int c; } html_entities[] = {
	/* Codes 160..255, in ascending order. */
	{ "nbsp", 160 }, { "iexcl", 161 }, { "cent", 162 }, { "pound", 163 },
	{ "curren", 164 }, { "yen", 165 }, { "brvbar", 166 }, { "sect", 167 },
	{ "uml", 168 }, { "copy", 169 }, { "ordf", 170 }, { "laquo", 171 },
	{ "not", 172 }, { "shy", 173 }, { "reg", 174 }, { "macr", 175 },
	{ "deg", 176 }, { "plusmn", 177 }, { "sup2", 178 }, { "sup3", 179 },
	{ "acute", 180 }, { "micro", 181 }, { "para", 182 }, { "middot", 183 },
	{ "cedil", 184 }, { "sup1", 185 }, { "ordm", 186 }, { "raquo", 187 },
	{ "frac14", 188 }, { "frac12", 189 }, { "frac34", 190 }, { "iquest", 191 },
	{ "Agrave", 192 }, { "Aacute", 193 }, { "Acirc", 194 }, { "Atilde", 195 },
	{ "Auml", 196 }, { "Aring", 197 }, { "AElig", 198 }, { "Ccedil", 199 },
	{ "Egrave", 200 }, { "Eacute", 201 }, { "Ecirc", 202 }, { "Euml", 203 },
	{ "Igrave", 204 }, { "Iacute", 205 }, { "Icirc", 206 }, { "Iuml", 207 },
	{ "ETH", 208 }, { "Ntilde", 209 }, { "Ograve", 210 }, { "Oacute", 211 },
	{ "Ocirc", 212 }, { "Otilde", 213 }, { "Ouml", 214 }, { "times", 215 },
	{ "Oslash", 216 }, { "Ugrave", 217 }, { "Uacute", 218 }, { "Ucirc", 219 },
	{ "Uuml", 220 }, { "Yacute", 221 }, { "THORN", 222 }, { "szlig", 223 },
	{ "agrave", 224 }, { "aacute", 225 }, { "acirc", 226 }, { "atilde", 227 },
	{ "auml", 228 }, { "aring", 229 }, { "aelig", 230 }, { "ccedil", 231 },
	{ "egrave", 232 }, { "eacute", 233 }, { "ecirc", 234 }, { "euml", 235 },
	{ "igrave", 236 }, { "iacute", 237 }, { "icirc", 238 }, { "iuml", 239 },
	{ "eth", 240 }, { "ntilde", 241 }, { "ograve", 242 }, { "oacute", 243 },
	{ "ocirc", 244 }, { "otilde", 245 }, { "ouml", 246 }, { "divide", 247 },
	{ "oslash", 248 }, { "ugrave", 249 }, { "uacute", 250 }, { "ucirc", 251 },
	{ "uuml", 252 }, { "yacute", 253 }, { "thorn", 254 }, { "yuml", 255 },

	/* Markup-significant ASCII characters. */
	{ "lt", 60 }, { "gt", 62 }, { "amp", 38 }, { "apos", 39 }, { "quot", 34 },

	/* Additional Latin letters and modifier letters. */
	{ "OElig", 338 }, { "oelig", 339 }, { "Scaron", 352 }, { "scaron", 353 },
	{ "Yuml", 376 }, { "circ", 710 }, { "tilde", 732 },

	/* Spaces, joiners and directional marks. */
	{ "ensp", 8194 }, { "emsp", 8195 }, { "thinsp", 8201 }, { "zwnj", 8204 },
	{ "zwj", 8205 }, { "lrm", 8206 }, { "rlm", 8207 },

	/* Dashes, quotation marks and other punctuation. */
	{ "ndash", 8211 }, { "mdash", 8212 }, { "lsquo", 8216 }, { "rsquo", 8217 },
	{ "sbquo", 8218 }, { "ldquo", 8220 }, { "rdquo", 8221 }, { "bdquo", 8222 },
	{ "dagger", 8224 }, { "Dagger", 8225 }, { "permil", 8240 }, { "lsaquo", 8249 },
	{ "rsaquo", 8250 }, { "euro", 8364 },

	/* Greek letters (preceded by fnof). */
	{ "fnof", 402 },
	{ "Alpha", 913 }, { "Beta", 914 }, { "Gamma", 915 }, { "Delta", 916 },
	{ "Epsilon", 917 }, { "Zeta", 918 }, { "Eta", 919 }, { "Theta", 920 },
	{ "Iota", 921 }, { "Kappa", 922 }, { "Lambda", 923 }, { "Mu", 924 },
	{ "Nu", 925 }, { "Xi", 926 }, { "Omicron", 927 }, { "Pi", 928 },
	{ "Rho", 929 }, { "Sigma", 931 }, { "Tau", 932 }, { "Upsilon", 933 },
	{ "Phi", 934 }, { "Chi", 935 }, { "Psi", 936 }, { "Omega", 937 },
	{ "alpha", 945 }, { "beta", 946 }, { "gamma", 947 }, { "delta", 948 },
	{ "epsilon", 949 }, { "zeta", 950 }, { "eta", 951 }, { "theta", 952 },
	{ "iota", 953 }, { "kappa", 954 }, { "lambda", 955 }, { "mu", 956 },
	{ "nu", 957 }, { "xi", 958 }, { "omicron", 959 }, { "pi", 960 },
	{ "rho", 961 }, { "sigmaf", 962 }, { "sigma", 963 }, { "tau", 964 },
	{ "upsilon", 965 }, { "phi", 966 }, { "chi", 967 }, { "psi", 968 },
	{ "omega", 969 }, { "thetasym", 977 }, { "upsih", 978 }, { "piv", 982 },

	/* General symbols, letter-like symbols and arrows. */
	{ "bull", 8226 }, { "hellip", 8230 }, { "prime", 8242 }, { "Prime", 8243 },
	{ "oline", 8254 }, { "frasl", 8260 }, { "weierp", 8472 }, { "image", 8465 },
	{ "real", 8476 }, { "trade", 8482 }, { "alefsym", 8501 },
	{ "larr", 8592 }, { "uarr", 8593 }, { "rarr", 8594 }, { "darr", 8595 },
	{ "harr", 8596 }, { "crarr", 8629 },
	{ "lArr", 8656 }, { "uArr", 8657 }, { "rArr", 8658 }, { "dArr", 8659 },
	{ "hArr", 8660 },

	/* Mathematical operators. */
	{ "forall", 8704 }, { "part", 8706 }, { "exist", 8707 }, { "empty", 8709 },
	{ "nabla", 8711 }, { "isin", 8712 }, { "notin", 8713 }, { "ni", 8715 },
	{ "prod", 8719 }, { "sum", 8721 }, { "minus", 8722 }, { "lowast", 8727 },
	{ "radic", 8730 }, { "prop", 8733 }, { "infin", 8734 }, { "ang", 8736 },
	{ "and", 8743 }, { "or", 8744 }, { "cap", 8745 }, { "cup", 8746 },
	{ "int", 8747 }, { "there4", 8756 }, { "sim", 8764 }, { "cong", 8773 },
	{ "asymp", 8776 }, { "ne", 8800 }, { "equiv", 8801 }, { "le", 8804 },
	{ "ge", 8805 }, { "sub", 8834 }, { "sup", 8835 }, { "nsub", 8836 },
	{ "sube", 8838 }, { "supe", 8839 }, { "oplus", 8853 }, { "otimes", 8855 },
	{ "perp", 8869 }, { "sdot", 8901 },

	/* Brackets and miscellaneous shapes. */
	{ "lceil", 8968 }, { "rceil", 8969 }, { "lfloor", 8970 }, { "rfloor", 8971 },
	{ "lang", 9001 }, { "rang", 9002 }, { "loz", 9674 },
	{ "spades", 9824 }, { "clubs", 9827 }, { "hearts", 9829 }, { "diams", 9830 },
};
| 69 | |
| 70 | struct parser |
| 71 | { |
| 72 | fz_pool *pool; |
| 73 | fz_xml *head; |
| 74 | int preserve_white; |
| 75 | int depth; |
| 76 | }; |
| 77 | |
/* One attribute of an element, kept in a singly linked list. */
struct attribute
{
	/* Attribute name, truncated to fit and always NUL-terminated. */
	char name[40];
	/* Decoded attribute value, or NULL until a value has been emitted. */
	char *value;
	/* Next attribute of the same element, or NULL at the end of the list. */
	struct attribute *next;
};
| 84 | |
| 85 | struct fz_xml_doc_s |
| 86 | { |
| 87 | fz_pool *pool; |
| 88 | fz_xml *root; |
| 89 | }; |
| 90 | |
| 91 | struct fz_xml_s |
| 92 | { |
| 93 | char name[40]; |
| 94 | char *text; |
| 95 | struct attribute *atts; |
| 96 | fz_xml *up, *down, *tail, *prev, *next; |
| 97 | }; |
| 98 | |
/*
	Write n levels of indentation (two spaces per level) to stdout.
	A zero or negative n writes nothing.
*/
static void xml_indent(int n)
{
	/*
		Compare against zero explicitly: a plain 'while (n--)' keeps
		counting down for a negative n until the int overflows.
	*/
	while (n-- > 0)
		fputs("  ", stdout);
}
| 106 | |
| 107 | /* |
| 108 | Pretty-print an XML tree to stdout. |
| 109 | */ |
| 110 | void fz_debug_xml(fz_xml *item, int level) |
| 111 | { |
| 112 | if (item->text) |
| 113 | { |
| 114 | char *s = item->text; |
| 115 | int c; |
| 116 | xml_indent(level); |
| 117 | putchar('"'); |
| 118 | while ((c = *s++)) { |
| 119 | switch (c) { |
| 120 | default: |
| 121 | if (c < 32 || c > 127) { |
| 122 | putchar('\\'); |
| 123 | putchar('x'); |
| 124 | putchar("0123456789ABCDEF" [(c>>4) & 15]); |
| 125 | putchar("0123456789ABCDEF" [(c) & 15]); |
| 126 | } else { |
| 127 | putchar(c); |
| 128 | } |
| 129 | break; |
| 130 | case '\\': putchar('\\'); putchar('\\'); break; |
| 131 | case '\b': putchar('\\'); putchar('b'); break; |
| 132 | case '\f': putchar('\\'); putchar('f'); break; |
| 133 | case '\n': putchar('\\'); putchar('n'); break; |
| 134 | case '\r': putchar('\\'); putchar('r'); break; |
| 135 | case '\t': putchar('\\'); putchar('t'); break; |
| 136 | } |
| 137 | } |
| 138 | putchar('\n'); |
| 139 | } |
| 140 | else |
| 141 | { |
| 142 | fz_xml *child; |
| 143 | struct attribute *att; |
| 144 | |
| 145 | xml_indent(level); |
| 146 | printf("(%s\n" , item->name); |
| 147 | for (att = item->atts; att; att = att->next) |
| 148 | { |
| 149 | xml_indent(level); |
| 150 | printf("=%s %s\n" , att->name, att->value); |
| 151 | } |
| 152 | for (child = item->down; child; child = child->next) |
| 153 | fz_debug_xml(child, level + 1); |
| 154 | xml_indent(level); |
| 155 | printf(")%s\n" , item->name); |
| 156 | } |
| 157 | } |
| 158 | |
| 159 | /* |
| 160 | Return previous sibling of XML node. |
| 161 | */ |
| 162 | fz_xml *fz_xml_prev(fz_xml *item) |
| 163 | { |
| 164 | return item ? item->prev : NULL; |
| 165 | } |
| 166 | |
| 167 | /* |
| 168 | Return next sibling of XML node. |
| 169 | */ |
| 170 | fz_xml *fz_xml_next(fz_xml *item) |
| 171 | { |
| 172 | return item ? item->next : NULL; |
| 173 | } |
| 174 | |
| 175 | /* |
| 176 | Return parent of XML node. |
| 177 | */ |
| 178 | fz_xml *fz_xml_up(fz_xml *item) |
| 179 | { |
| 180 | return item ? item->up : NULL; |
| 181 | } |
| 182 | |
| 183 | /* |
| 184 | Return first child of XML node. |
| 185 | */ |
| 186 | fz_xml *fz_xml_down(fz_xml *item) |
| 187 | { |
| 188 | return item ? item->down : NULL; |
| 189 | } |
| 190 | |
| 191 | /* |
| 192 | Return the text content of an XML node. |
| 193 | Return NULL if the node is a tag. |
| 194 | */ |
| 195 | char *fz_xml_text(fz_xml *item) |
| 196 | { |
| 197 | return item ? item->text : NULL; |
| 198 | } |
| 199 | |
| 200 | /* |
| 201 | Return tag of XML node. Return NULL for text nodes. |
| 202 | */ |
| 203 | char *fz_xml_tag(fz_xml *item) |
| 204 | { |
| 205 | return item && item->name[0] ? item->name : NULL; |
| 206 | } |
| 207 | |
| 208 | /* |
| 209 | Return true if the tag name matches. |
| 210 | */ |
| 211 | int fz_xml_is_tag(fz_xml *item, const char *name) |
| 212 | { |
| 213 | if (!item) |
| 214 | return 0; |
| 215 | return !strcmp(item->name, name); |
| 216 | } |
| 217 | |
| 218 | /* |
| 219 | Return the value of an attribute of an XML node. |
| 220 | NULL if the attribute doesn't exist. |
| 221 | */ |
| 222 | char *fz_xml_att(fz_xml *item, const char *name) |
| 223 | { |
| 224 | struct attribute *att; |
| 225 | if (!item) |
| 226 | return NULL; |
| 227 | for (att = item->atts; att; att = att->next) |
| 228 | if (!strcmp(att->name, name)) |
| 229 | return att->value; |
| 230 | return NULL; |
| 231 | } |
| 232 | |
| 233 | fz_xml *fz_xml_find(fz_xml *item, const char *tag) |
| 234 | { |
| 235 | while (item) |
| 236 | { |
| 237 | if (!strcmp(item->name, tag)) |
| 238 | return item; |
| 239 | item = item->next; |
| 240 | } |
| 241 | return NULL; |
| 242 | } |
| 243 | |
| 244 | fz_xml *fz_xml_find_next(fz_xml *item, const char *tag) |
| 245 | { |
| 246 | if (item) |
| 247 | item = item->next; |
| 248 | return fz_xml_find(item, tag); |
| 249 | } |
| 250 | |
| 251 | fz_xml *fz_xml_find_down(fz_xml *item, const char *tag) |
| 252 | { |
| 253 | if (item) |
| 254 | item = item->down; |
| 255 | return fz_xml_find(item, tag); |
| 256 | } |
| 257 | |
| 258 | fz_xml *fz_xml_root(fz_xml_doc *xml) |
| 259 | { |
| 260 | return xml ? xml->root : NULL; |
| 261 | } |
| 262 | |
| 263 | /* |
| 264 | Free the XML node and all its children and siblings. |
| 265 | */ |
| 266 | void fz_drop_xml(fz_context *ctx, fz_xml_doc *xml) |
| 267 | { |
| 268 | if (xml) |
| 269 | fz_drop_pool(ctx, xml->pool); |
| 270 | } |
| 271 | |
| 272 | /* |
| 273 | Detach a node from the tree, unlinking it from its parent, |
| 274 | and setting the document root to the node. |
| 275 | */ |
| 276 | void fz_detach_xml(fz_context *ctx, fz_xml_doc *xml, fz_xml *node) |
| 277 | { |
| 278 | if (node->up) |
| 279 | node->up->down = NULL; |
| 280 | xml->root = node; |
| 281 | } |
| 282 | |
| 283 | static size_t xml_parse_entity(int *c, char *a) |
| 284 | { |
| 285 | char *b; |
| 286 | size_t i; |
| 287 | |
| 288 | if (a[1] == '#') { |
| 289 | if (a[2] == 'x') |
| 290 | *c = strtol(a + 3, &b, 16); |
| 291 | else |
| 292 | *c = strtol(a + 2, &b, 10); |
| 293 | if (*b == ';') |
| 294 | return b - a + 1; |
| 295 | } |
| 296 | else if (a[1] == 'l' && a[2] == 't' && a[3] == ';') { |
| 297 | *c = '<'; |
| 298 | return 4; |
| 299 | } |
| 300 | else if (a[1] == 'g' && a[2] == 't' && a[3] == ';') { |
| 301 | *c = '>'; |
| 302 | return 4; |
| 303 | } |
| 304 | else if (a[1] == 'a' && a[2] == 'm' && a[3] == 'p' && a[4] == ';') { |
| 305 | *c = '&'; |
| 306 | return 5; |
| 307 | } |
| 308 | else if (a[1] == 'a' && a[2] == 'p' && a[3] == 'o' && a[4] == 's' && a[5] == ';') { |
| 309 | *c = '\''; |
| 310 | return 6; |
| 311 | } |
| 312 | else if (a[1] == 'q' && a[2] == 'u' && a[3] == 'o' && a[4] == 't' && a[5] == ';') { |
| 313 | *c = '"'; |
| 314 | return 6; |
| 315 | } |
| 316 | |
| 317 | /* We should only be doing this for XHTML, but it shouldn't be a problem. */ |
| 318 | for (i = 0; i < nelem(html_entities); ++i) { |
| 319 | size_t n = strlen(html_entities[i].name); |
| 320 | if (!strncmp(a+1, html_entities[i].name, n) && a[n+1] == ';') { |
| 321 | *c = html_entities[i].c; |
| 322 | return n + 2; |
| 323 | } |
| 324 | } |
| 325 | |
| 326 | *c = *a; |
| 327 | return 1; |
| 328 | } |
| 329 | |
/*
	Return 1 if c may appear in a tag or attribute name as accepted by this
	parser (ASCII letters, digits, '.', '-', '_' and ':'), otherwise 0.
*/
static inline int isname(int c)
{
	if (c >= 'a' && c <= 'z')
		return 1;
	if (c >= 'A' && c <= 'Z')
		return 1;
	if (c >= '0' && c <= '9')
		return 1;
	switch (c) {
	case '.':
	case '-':
	case '_':
	case ':':
		return 1;
	default:
		return 0;
	}
}
| 337 | |
/*
	Return 1 if c is a space, tab, line feed or carriage return,
	otherwise 0.
*/
static inline int iswhite(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return 1;
	default:
		return 0;
	}
}
| 342 | |
| 343 | static void xml_emit_open_tag(fz_context *ctx, struct parser *parser, char *a, char *b) |
| 344 | { |
| 345 | fz_xml *head, *tail; |
| 346 | char *ns; |
| 347 | |
| 348 | /* skip namespace prefix */ |
| 349 | for (ns = a; ns < b; ++ns) |
| 350 | if (*ns == ':') |
| 351 | a = ns + 1; |
| 352 | |
| 353 | head = fz_pool_alloc(ctx, parser->pool, sizeof *head); |
| 354 | if (b - a > sizeof(head->name) - 1) |
| 355 | b = a + sizeof(head->name) - 1; |
| 356 | memcpy(head->name, a, b - a); |
| 357 | head->name[b - a] = 0; |
| 358 | |
| 359 | head->atts = NULL; |
| 360 | head->text = NULL; |
| 361 | head->up = parser->head; |
| 362 | head->down = NULL; |
| 363 | head->prev = NULL; |
| 364 | head->next = NULL; |
| 365 | |
| 366 | if (!parser->head->down) { |
| 367 | parser->head->down = head; |
| 368 | parser->head->tail = head; |
| 369 | } |
| 370 | else { |
| 371 | tail = parser->head->tail; |
| 372 | tail->next = head; |
| 373 | head->prev = tail; |
| 374 | parser->head->tail = head; |
| 375 | } |
| 376 | |
| 377 | parser->head = head; |
| 378 | parser->depth++; |
| 379 | } |
| 380 | |
| 381 | static void xml_emit_att_name(fz_context *ctx, struct parser *parser, char *a, char *b) |
| 382 | { |
| 383 | fz_xml *head = parser->head; |
| 384 | struct attribute *att; |
| 385 | |
| 386 | att = fz_pool_alloc(ctx, parser->pool, sizeof *att); |
| 387 | if (b - a > sizeof(att->name) - 1) |
| 388 | b = a + sizeof(att->name) - 1; |
| 389 | memcpy(att->name, a, b - a); |
| 390 | att->name[b - a] = 0; |
| 391 | att->value = NULL; |
| 392 | att->next = head->atts; |
| 393 | head->atts = att; |
| 394 | } |
| 395 | |
| 396 | static void xml_emit_att_value(fz_context *ctx, struct parser *parser, char *a, char *b) |
| 397 | { |
| 398 | fz_xml *head = parser->head; |
| 399 | struct attribute *att = head->atts; |
| 400 | char *s; |
| 401 | int c; |
| 402 | |
| 403 | /* entities are all longer than UTFmax so runetochar is safe */ |
| 404 | s = att->value = fz_pool_alloc(ctx, parser->pool, b - a + 1); |
| 405 | while (a < b) { |
| 406 | if (*a == '&') { |
| 407 | a += xml_parse_entity(&c, a); |
| 408 | s += fz_runetochar(s, c); |
| 409 | } |
| 410 | else { |
| 411 | *s++ = *a++; |
| 412 | } |
| 413 | } |
| 414 | *s = 0; |
| 415 | } |
| 416 | |
| 417 | static void xml_emit_close_tag(fz_context *ctx, struct parser *parser) |
| 418 | { |
| 419 | parser->depth--; |
| 420 | if (parser->head->up) |
| 421 | parser->head = parser->head->up; |
| 422 | } |
| 423 | |
| 424 | static void xml_emit_text(fz_context *ctx, struct parser *parser, char *a, char *b) |
| 425 | { |
| 426 | static char *empty = "" ; |
| 427 | fz_xml *head; |
| 428 | char *s; |
| 429 | int c; |
| 430 | |
| 431 | /* Skip text outside the root tag */ |
| 432 | if (parser->depth == 0) |
| 433 | return; |
| 434 | |
| 435 | /* Skip all-whitespace text nodes */ |
| 436 | if (!parser->preserve_white) |
| 437 | { |
| 438 | for (s = a; s < b; s++) |
| 439 | if (!iswhite(*s)) |
| 440 | break; |
| 441 | if (s == b) |
| 442 | return; |
| 443 | } |
| 444 | |
| 445 | xml_emit_open_tag(ctx, parser, empty, empty); |
| 446 | head = parser->head; |
| 447 | |
| 448 | /* entities are all longer than UTFmax so runetochar is safe */ |
| 449 | s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + 1); |
| 450 | while (a < b) { |
| 451 | if (*a == '&') { |
| 452 | a += xml_parse_entity(&c, a); |
| 453 | s += fz_runetochar(s, c); |
| 454 | } |
| 455 | else { |
| 456 | *s++ = *a++; |
| 457 | } |
| 458 | } |
| 459 | *s = 0; |
| 460 | |
| 461 | xml_emit_close_tag(ctx, parser); |
| 462 | } |
| 463 | |
| 464 | static void xml_emit_cdata(fz_context *ctx, struct parser *parser, char *a, char *b) |
| 465 | { |
| 466 | static char *empty = "" ; |
| 467 | fz_xml *head; |
| 468 | char *s; |
| 469 | |
| 470 | xml_emit_open_tag(ctx, parser, empty, empty); |
| 471 | head = parser->head; |
| 472 | |
| 473 | s = head->text = fz_pool_alloc(ctx, parser->pool, b - a + 1); |
| 474 | while (a < b) |
| 475 | *s++ = *a++; |
| 476 | *s = 0; |
| 477 | |
| 478 | xml_emit_close_tag(ctx, parser); |
| 479 | } |
| 480 | |
| 481 | static char *xml_parse_document_imp(fz_context *ctx, struct parser *parser, char *p) |
| 482 | { |
| 483 | char *mark; |
| 484 | int quote; |
| 485 | |
| 486 | parse_text: |
| 487 | mark = p; |
| 488 | while (*p && *p != '<') ++p; |
| 489 | if (*p == '<') { |
| 490 | /* skip trailing newline before closing tag */ |
| 491 | if (p[1] == '/' && p - 1 >= mark && p[-1] == '\n') |
| 492 | xml_emit_text(ctx, parser, mark, p - 1); |
| 493 | else if (mark < p) |
| 494 | xml_emit_text(ctx, parser, mark, p); |
| 495 | ++p; |
| 496 | goto parse_element; |
| 497 | } else if (mark < p) |
| 498 | xml_emit_text(ctx, parser, mark, p); |
| 499 | return NULL; |
| 500 | |
| 501 | parse_element: |
| 502 | if (*p == '/') { ++p; goto parse_closing_element; } |
| 503 | if (*p == '!') { ++p; goto parse_comment; } |
| 504 | if (*p == '?') { ++p; goto parse_processing_instruction; } |
| 505 | while (iswhite(*p)) ++p; |
| 506 | if (isname(*p)) |
| 507 | goto parse_element_name; |
| 508 | return "syntax error in element" ; |
| 509 | |
| 510 | : |
| 511 | if (p[0]=='D' && p[1]=='O' && p[2]=='C' && p[3]=='T' && p[4]=='Y' && p[5]=='P' && p[6]=='E') |
| 512 | goto parse_declaration; |
| 513 | if (p[0]=='E' && p[1]=='N' && p[2]=='T' && p[3]=='I' && p[4]=='T' && p[5]=='Y') |
| 514 | goto parse_declaration; |
| 515 | if (*p == '[') goto parse_cdata; |
| 516 | if (*p++ != '-') return "syntax error in comment (<! not followed by --)" ; |
| 517 | if (*p++ != '-') return "syntax error in comment (<!- not followed by -)" ; |
| 518 | while (*p) { |
| 519 | if (p[0] == '-' && p[1] == '-' && p[2] == '>') { |
| 520 | p += 3; |
| 521 | goto parse_text; |
| 522 | } |
| 523 | ++p; |
| 524 | } |
| 525 | return "end of data in comment" ; |
| 526 | |
| 527 | parse_declaration: |
| 528 | while (*p) if (*p++ == '>') goto parse_text; |
| 529 | return "end of data in declaration" ; |
| 530 | |
| 531 | parse_cdata: |
| 532 | if (p[1] != 'C' || p[2] != 'D' || p[3] != 'A' || p[4] != 'T' || p[5] != 'A' || p[6] != '[') |
| 533 | return "syntax error in CDATA section" ; |
| 534 | p += 7; |
| 535 | mark = p; |
| 536 | while (*p) { |
| 537 | if (p[0] == ']' && p[1] == ']' && p[2] == '>') { |
| 538 | xml_emit_cdata(ctx, parser, mark, p); |
| 539 | p += 3; |
| 540 | goto parse_text; |
| 541 | } |
| 542 | ++p; |
| 543 | } |
| 544 | return "end of data in CDATA section" ; |
| 545 | |
| 546 | parse_processing_instruction: |
| 547 | while (*p) { |
| 548 | if (p[0] == '?' && p[1] == '>') { |
| 549 | p += 2; |
| 550 | goto parse_text; |
| 551 | } |
| 552 | ++p; |
| 553 | } |
| 554 | return "end of data in processing instruction" ; |
| 555 | |
| 556 | parse_closing_element: |
| 557 | while (iswhite(*p)) ++p; |
| 558 | while (isname(*p)) ++p; |
| 559 | while (iswhite(*p)) ++p; |
| 560 | if (*p != '>') |
| 561 | return "syntax error in closing element" ; |
| 562 | xml_emit_close_tag(ctx, parser); |
| 563 | ++p; |
| 564 | goto parse_text; |
| 565 | |
| 566 | parse_element_name: |
| 567 | mark = p; |
| 568 | while (isname(*p)) ++p; |
| 569 | xml_emit_open_tag(ctx, parser, mark, p); |
| 570 | if (*p == '>') { |
| 571 | ++p; |
| 572 | if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */ |
| 573 | goto parse_text; |
| 574 | } |
| 575 | if (p[0] == '/' && p[1] == '>') { |
| 576 | xml_emit_close_tag(ctx, parser); |
| 577 | p += 2; |
| 578 | goto parse_text; |
| 579 | } |
| 580 | if (iswhite(*p)) |
| 581 | goto parse_attributes; |
| 582 | return "syntax error after element name" ; |
| 583 | |
| 584 | parse_attributes: |
| 585 | while (iswhite(*p)) ++p; |
| 586 | if (isname(*p)) |
| 587 | goto parse_attribute_name; |
| 588 | if (*p == '>') { |
| 589 | ++p; |
| 590 | if (*p == '\n') ++p; /* must skip linebreak immediately after an opening tag */ |
| 591 | goto parse_text; |
| 592 | } |
| 593 | if (p[0] == '/' && p[1] == '>') { |
| 594 | xml_emit_close_tag(ctx, parser); |
| 595 | p += 2; |
| 596 | goto parse_text; |
| 597 | } |
| 598 | return "syntax error in attributes" ; |
| 599 | |
| 600 | parse_attribute_name: |
| 601 | mark = p; |
| 602 | while (isname(*p)) ++p; |
| 603 | xml_emit_att_name(ctx, parser, mark, p); |
| 604 | while (iswhite(*p)) ++p; |
| 605 | if (*p == '=') { ++p; goto parse_attribute_value; } |
| 606 | return "syntax error after attribute name" ; |
| 607 | |
| 608 | parse_attribute_value: |
| 609 | while (iswhite(*p)) ++p; |
| 610 | quote = *p++; |
| 611 | if (quote != '"' && quote != '\'') |
| 612 | return "missing quote character" ; |
| 613 | mark = p; |
| 614 | while (*p && *p != quote) ++p; |
| 615 | if (*p == quote) { |
| 616 | xml_emit_att_value(ctx, parser, mark, p++); |
| 617 | goto parse_attributes; |
| 618 | } |
| 619 | return "end of data in attribute value" ; |
| 620 | } |
| 621 | |
/*
	Return non-zero if a begins with b according to fz_strncasecmp,
	comparing only the first strlen(b) characters.
*/
static int startswith(const char *a, const char *b)
{
	size_t prefix_len = strlen(b);
	return fz_strncasecmp(a, b, prefix_len) == 0;
}
| 626 | |
| 627 | static const unsigned short *find_xml_encoding(char *s) |
| 628 | { |
| 629 | const unsigned short *table = NULL; |
| 630 | char *end, *xml, *enc; |
| 631 | |
| 632 | end = strchr(s, '>'); |
| 633 | if (end) |
| 634 | { |
| 635 | *end = 0; |
| 636 | xml = strstr(s, "<?xml" ); |
| 637 | if (xml) |
| 638 | { |
| 639 | enc = strstr(xml, "encoding=" ); |
| 640 | if (enc) |
| 641 | { |
| 642 | enc += 10; |
| 643 | if (startswith(enc, "iso-8859-1" ) || startswith(enc, "latin1" )) |
| 644 | table = fz_unicode_from_iso8859_1; |
| 645 | else if (startswith(enc, "iso-8859-7" ) || startswith(enc, "greek" )) |
| 646 | table = fz_unicode_from_iso8859_7; |
| 647 | else if (startswith(enc, "koi8" )) |
| 648 | table = fz_unicode_from_koi8u; |
| 649 | else if (startswith(enc, "windows-1250" )) |
| 650 | table = fz_unicode_from_windows_1250; |
| 651 | else if (startswith(enc, "windows-1251" )) |
| 652 | table = fz_unicode_from_windows_1251; |
| 653 | else if (startswith(enc, "windows-1252" )) |
| 654 | table = fz_unicode_from_windows_1252; |
| 655 | } |
| 656 | } |
| 657 | *end = '>'; |
| 658 | } |
| 659 | |
| 660 | return table; |
| 661 | } |
| 662 | |
| 663 | static char *convert_to_utf8(fz_context *ctx, unsigned char *s, size_t n, int *dofree) |
| 664 | { |
| 665 | const unsigned short *table; |
| 666 | const unsigned char *e = s + n; |
| 667 | char *dst, *d; |
| 668 | int c; |
| 669 | |
| 670 | if (s[0] == 0xFE && s[1] == 0xFF) { |
| 671 | s += 2; |
| 672 | dst = d = fz_malloc(ctx, n * FZ_UTFMAX); |
| 673 | while (s + 1 < e) { |
| 674 | c = s[0] << 8 | s[1]; |
| 675 | d += fz_runetochar(d, c); |
| 676 | s += 2; |
| 677 | } |
| 678 | *d = 0; |
| 679 | *dofree = 1; |
| 680 | return dst; |
| 681 | } |
| 682 | |
| 683 | if (s[0] == 0xFF && s[1] == 0xFE) { |
| 684 | s += 2; |
| 685 | dst = d = fz_malloc(ctx, n * FZ_UTFMAX); |
| 686 | while (s + 1 < e) { |
| 687 | c = s[0] | s[1] << 8; |
| 688 | d += fz_runetochar(d, c); |
| 689 | s += 2; |
| 690 | } |
| 691 | *d = 0; |
| 692 | *dofree = 1; |
| 693 | return dst; |
| 694 | } |
| 695 | |
| 696 | table = find_xml_encoding((char*)s); |
| 697 | if (table) { |
| 698 | dst = d = fz_malloc(ctx, n * FZ_UTFMAX); |
| 699 | while (*s) { |
| 700 | c = table[*s++]; |
| 701 | d += fz_runetochar(d, c); |
| 702 | } |
| 703 | *d = 0; |
| 704 | *dofree = 1; |
| 705 | return dst; |
| 706 | } |
| 707 | |
| 708 | *dofree = 0; |
| 709 | |
| 710 | if (s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF) |
| 711 | return (char*)s+3; |
| 712 | |
| 713 | return (char*)s; |
| 714 | } |
| 715 | |
| 716 | /* |
| 717 | Parse the contents of buffer into a tree of xml nodes. |
| 718 | |
| 719 | preserve_white: whether to keep or delete all-whitespace nodes. |
| 720 | */ |
| 721 | fz_xml_doc * |
| 722 | fz_parse_xml(fz_context *ctx, fz_buffer *buf, int preserve_white) |
| 723 | { |
| 724 | struct parser parser; |
| 725 | fz_xml_doc *xml = NULL; |
| 726 | fz_xml root, *node; |
| 727 | char *p = NULL; |
| 728 | char *error; |
| 729 | int dofree; |
| 730 | unsigned char *s; |
| 731 | size_t n; |
| 732 | |
| 733 | fz_var(p); |
| 734 | |
| 735 | /* ensure we are zero-terminated */ |
| 736 | fz_terminate_buffer(ctx, buf); |
| 737 | n = fz_buffer_storage(ctx, buf, &s); |
| 738 | |
| 739 | memset(&root, 0, sizeof(root)); |
| 740 | parser.pool = fz_new_pool(ctx); |
| 741 | parser.head = &root; |
| 742 | parser.preserve_white = preserve_white; |
| 743 | parser.depth = 0; |
| 744 | |
| 745 | fz_try(ctx) |
| 746 | { |
| 747 | p = convert_to_utf8(ctx, s, n, &dofree); |
| 748 | |
| 749 | error = xml_parse_document_imp(ctx, &parser, p); |
| 750 | if (error) |
| 751 | fz_throw(ctx, FZ_ERROR_GENERIC, "%s" , error); |
| 752 | |
| 753 | for (node = root.down; node; node = node->next) |
| 754 | node->up = NULL; |
| 755 | |
| 756 | xml = fz_pool_alloc(ctx, parser.pool, sizeof *xml); |
| 757 | xml->pool = parser.pool; |
| 758 | xml->root = root.down; |
| 759 | } |
| 760 | fz_always(ctx) |
| 761 | { |
| 762 | if (dofree) |
| 763 | fz_free(ctx, p); |
| 764 | } |
| 765 | fz_catch(ctx) |
| 766 | { |
| 767 | fz_drop_pool(ctx, parser.pool); |
| 768 | fz_rethrow(ctx); |
| 769 | } |
| 770 | |
| 771 | return xml; |
| 772 | } |
| 773 | |