1 | /* $Id$ $Revision$ */ |
2 | /* vim:set shiftwidth=4 ts=8: */ |
3 | |
4 | /************************************************************************* |
5 | * Copyright (c) 2011 AT&T Intellectual Property |
6 | * All rights reserved. This program and the accompanying materials |
7 | * are made available under the terms of the Eclipse Public License v1.0 |
8 | * which accompanies this distribution, and is available at |
9 | * http://www.eclipse.org/legal/epl-v10.html |
10 | * |
11 | * Contributors: See CVS logs. Details at http://www.graphviz.org/ |
12 | *************************************************************************/ |
13 | |
14 | |
15 | #include "render.h" |
16 | #include "htmltable.h" |
17 | #include "htmlparse.h" |
18 | #include "htmllex.h" |
19 | #include "cdt.h" |
20 | #include <ctype.h> |
21 | |
22 | #ifdef HAVE_EXPAT |
23 | #include <expat.h> |
24 | #endif |
25 | |
26 | #ifndef XML_STATUS_ERROR |
27 | #define XML_STATUS_ERROR 0 |
28 | #endif |
29 | |
/* lexstate_t:
 * All state of the HTML-label lexer. A single file-scope instance,
 * `state`, is shared by the expat callbacks and the lexer entry points.
 */
typedef struct {
#ifdef HAVE_EXPAT
    XML_Parser parser;	/* expat parser created in initHTMLlexer */
#endif
    char* ptr; /* input source; htmllex advances it past each consumed unit */
    int tok; /* token type; reset to 0 by htmllex, set by the callbacks */
    agxbuf* xb; /* buffer to gather T_string data (also scratch for error_context) */
    agxbuf lb; /* buffer for translating lexical data (filled by findNext) */
    char warn; /* set if warning given */
    char error; /* set if error given */
    char inCell; /* set if in TD to allow T_string */
    char mode; /* for handling artificial <HTML>..</HTML>:
		 * 0 = <HTML> not yet sent, 1 = scanning input, 2 = </HTML> sent */
    char *currtok; /* for error reporting: start of the unit last handed to the parser */
    char *prevtok; /* for error reporting: start of the unit before currtok */
    int currtoklen;	/* length of currtok in bytes */
    int prevtoklen;	/* length of prevtok in bytes */
} lexstate_t;
static lexstate_t state;
48 | |
49 | /* error_context: |
50 | * Print the last 2 "token"s seen. |
51 | */ |
52 | static void error_context(void) |
53 | { |
54 | agxbclear(state.xb); |
55 | if (state.prevtoklen > 0) |
56 | agxbput_n(state.xb, state.prevtok, state.prevtoklen); |
57 | agxbput_n(state.xb, state.currtok, state.currtoklen); |
58 | agerr(AGPREV, "... %s ...\n" , agxbuse(state.xb)); |
59 | } |
60 | |
61 | /* htmlerror: |
62 | * yyerror - called by yacc output |
63 | */ |
64 | void htmlerror(const char *msg) |
65 | { |
66 | if (state.error) |
67 | return; |
68 | state.error = 1; |
69 | agerr(AGERR, "%s in line %d \n" , msg, htmllineno()); |
70 | error_context(); |
71 | } |
72 | |
73 | #ifdef HAVE_EXPAT |
74 | /* lexerror: |
75 | * called by lexer when unknown <..> is found. |
76 | */ |
77 | static void lexerror(const char *name) |
78 | { |
79 | state.tok = T_error; |
80 | state.error = 1; |
81 | agerr(AGERR, "Unknown HTML element <%s> on line %d \n" , |
82 | name, htmllineno()); |
83 | } |
84 | |
/* attrFn: signature through which every attribute action is invoked
 * (object being filled in, attribute value); returns non-zero if a
 * warning was issued.
 * bcmpfn: comparison function type expected by bsearch.
 */
typedef int (*attrFn) (void *, char *);
typedef int (*bcmpfn) (const void *, const void *);

/* Limits of the narrow integer types used to store attribute values. */
#define MAX_CHAR (((unsigned char)(~0)) >> 1)
#define MIN_CHAR ((signed char)(~MAX_CHAR))
#define MAX_UCHAR ((unsigned char)(~0))
#define MAX_USHORT ((unsigned short)(~0))

/* Mechanism for automatically processing attributes */
typedef struct {
    char *name;			/* attribute name */
    attrFn action;		/* action to perform if name matches */
} attr_item;

#define ISIZE (sizeof(attr_item))

/* icmp:
 * Compare two attr_item by name, ignoring case. Used in bsearch.
 * The parameters are const void * so that the function has exactly the
 * type bsearch calls it with; calling a function through a pointer of
 * an incompatible type is undefined behavior. Existing call sites that
 * cast it to bcmpfn keep working unchanged.
 * Returns <0, 0 or >0 as strcasecmp does.
 */
static int icmp(const void *i, const void *j)
{
    const attr_item *lhs = i;
    const attr_item *rhs = j;

    return strcasecmp(lhs->name, rhs->name);
}
108 | |
109 | static int bgcolorfn(htmldata_t * p, char *v) |
110 | { |
111 | p->bgcolor = strdup(v); |
112 | return 0; |
113 | } |
114 | |
115 | static int pencolorfn(htmldata_t * p, char *v) |
116 | { |
117 | p->pencolor = strdup(v); |
118 | return 0; |
119 | } |
120 | |
121 | static int hreffn(htmldata_t * p, char *v) |
122 | { |
123 | p->href = strdup(v); |
124 | return 0; |
125 | } |
126 | |
127 | static int sidesfn(htmldata_t * p, char *v) |
128 | { |
129 | unsigned short flags = 0; |
130 | char c; |
131 | |
132 | while ((c = *v++)) { |
133 | switch (tolower(c)) { |
134 | case 'l' : |
135 | flags |= BORDER_LEFT; |
136 | break; |
137 | case 't' : |
138 | flags |= BORDER_TOP; |
139 | break; |
140 | case 'r' : |
141 | flags |= BORDER_RIGHT; |
142 | break; |
143 | case 'b' : |
144 | flags |= BORDER_BOTTOM; |
145 | break; |
146 | default : |
147 | agerr(AGWARN, "Unrecognized character '%c' (%d) in sides attribute\n" , c, c); |
148 | break; |
149 | } |
150 | } |
151 | if (flags != BORDER_MASK) |
152 | p->flags |= flags; |
153 | return 0; |
154 | } |
155 | |
156 | static int titlefn(htmldata_t * p, char *v) |
157 | { |
158 | p->title = strdup(v); |
159 | return 0; |
160 | } |
161 | |
162 | static int portfn(htmldata_t * p, char *v) |
163 | { |
164 | p->port = strdup(v); |
165 | return 0; |
166 | } |
167 | |
168 | #define DELIM " ," |
169 | |
170 | static int stylefn(htmldata_t * p, char *v) |
171 | { |
172 | int rv = 0; |
173 | char c; |
174 | char* tk; |
175 | char* buf = strdup (v); |
176 | for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) { |
177 | c = (char) toupper(*tk); |
178 | if (c == 'R') { |
179 | if (!strcasecmp(tk + 1, "OUNDED" )) p->style |= ROUNDED; |
180 | else if (!strcasecmp(tk + 1, "ADIAL" )) p->style |= RADIAL; |
181 | else { |
182 | agerr(AGWARN, "Illegal value %s for STYLE - ignored\n" , tk); |
183 | rv = 1; |
184 | } |
185 | } |
186 | else if(!strcasecmp(tk,"SOLID" )) p->style &= ~(DOTTED|DASHED); |
187 | else if(!strcasecmp(tk,"INVISIBLE" ) || !strcasecmp(tk,"INVIS" )) p->style |= INVISIBLE; |
188 | else if(!strcasecmp(tk,"DOTTED" )) p->style |= DOTTED; |
189 | else if(!strcasecmp(tk,"DASHED" )) p->style |= DASHED; |
190 | else { |
191 | agerr(AGWARN, "Illegal value %s for STYLE - ignored\n" , tk); |
192 | rv = 1; |
193 | } |
194 | } |
195 | free (buf); |
196 | return rv; |
197 | } |
198 | |
199 | static int targetfn(htmldata_t * p, char *v) |
200 | { |
201 | p->target = strdup(v); |
202 | return 0; |
203 | } |
204 | |
205 | static int idfn(htmldata_t * p, char *v) |
206 | { |
207 | p->id = strdup(v); |
208 | return 0; |
209 | } |
210 | |
211 | |
212 | /* doInt: |
213 | * Scan v for integral value. Check that |
214 | * the value is >= min and <= max. Return value in ul. |
215 | * String s is name of value. |
216 | * Return 0 if okay; 1 otherwise. |
217 | */ |
218 | static int doInt(char *v, char *s, int min, int max, long *ul) |
219 | { |
220 | int rv = 0; |
221 | char *ep; |
222 | long b = strtol(v, &ep, 10); |
223 | |
224 | if (ep == v) { |
225 | agerr(AGWARN, "Improper %s value %s - ignored" , s, v); |
226 | rv = 1; |
227 | } else if (b > max) { |
228 | agerr(AGWARN, "%s value %s > %d - too large - ignored" , s, v, max); |
229 | rv = 1; |
230 | } else if (b < min) { |
231 | agerr(AGWARN, "%s value %s < %d - too small - ignored" , s, v, min); |
232 | rv = 1; |
233 | } else |
234 | *ul = b; |
235 | return rv; |
236 | } |
237 | |
238 | |
239 | static int gradientanglefn(htmldata_t * p, char *v) |
240 | { |
241 | long u; |
242 | |
243 | if (doInt(v, "GRADIENTANGLE" , 0, 360, &u)) |
244 | return 1; |
245 | p->gradientangle = (unsigned short) u; |
246 | return 0; |
247 | } |
248 | |
249 | |
250 | static int borderfn(htmldata_t * p, char *v) |
251 | { |
252 | long u; |
253 | |
254 | if (doInt(v, "BORDER" , 0, MAX_UCHAR, &u)) |
255 | return 1; |
256 | p->border = (unsigned char) u; |
257 | p->flags |= BORDER_SET; |
258 | return 0; |
259 | } |
260 | |
261 | static int cellpaddingfn(htmldata_t * p, char *v) |
262 | { |
263 | long u; |
264 | |
265 | if (doInt(v, "CELLPADDING" , 0, MAX_UCHAR, &u)) |
266 | return 1; |
267 | p->pad = (unsigned char) u; |
268 | p->flags |= PAD_SET; |
269 | return 0; |
270 | } |
271 | |
272 | static int cellspacingfn(htmldata_t * p, char *v) |
273 | { |
274 | long u; |
275 | |
276 | if (doInt(v, "CELLSPACING" , MIN_CHAR, MAX_CHAR, &u)) |
277 | return 1; |
278 | p->space = (signed char) u; |
279 | p->flags |= SPACE_SET; |
280 | return 0; |
281 | } |
282 | |
283 | static int cellborderfn(htmltbl_t * p, char *v) |
284 | { |
285 | long u; |
286 | |
287 | if (doInt(v, "CELLSBORDER" , 0, MAX_CHAR, &u)) |
288 | return 1; |
289 | p->cb = (unsigned char) u; |
290 | return 0; |
291 | } |
292 | |
293 | static int columnsfn(htmltbl_t * p, char *v) |
294 | { |
295 | if (*v != '*') { |
296 | agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n" , v); |
297 | return 1; |
298 | } |
299 | p->flags |= HTML_VRULE; |
300 | return 0; |
301 | } |
302 | |
303 | static int rowsfn(htmltbl_t * p, char *v) |
304 | { |
305 | if (*v != '*') { |
306 | agerr(AGWARN, "Unknown value %s for ROWS - ignored\n" , v); |
307 | return 1; |
308 | } |
309 | p->flags |= HTML_HRULE; |
310 | return 0; |
311 | } |
312 | |
313 | static int fixedsizefn(htmldata_t * p, char *v) |
314 | { |
315 | int rv = 0; |
316 | char c = (char) toupper(*(unsigned char *) v); |
317 | if ((c == 'T') && !strcasecmp(v + 1, "RUE" )) |
318 | p->flags |= FIXED_FLAG; |
319 | else if ((c != 'F') || strcasecmp(v + 1, "ALSE" )) { |
320 | agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n" , v); |
321 | rv = 1; |
322 | } |
323 | return rv; |
324 | } |
325 | |
326 | static int valignfn(htmldata_t * p, char *v) |
327 | { |
328 | int rv = 0; |
329 | char c = (char) toupper(*v); |
330 | if ((c == 'B') && !strcasecmp(v + 1, "OTTOM" )) |
331 | p->flags |= VALIGN_BOTTOM; |
332 | else if ((c == 'T') && !strcasecmp(v + 1, "OP" )) |
333 | p->flags |= VALIGN_TOP; |
334 | else if ((c != 'M') || strcasecmp(v + 1, "IDDLE" )) { |
335 | agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n" , v); |
336 | rv = 1; |
337 | } |
338 | return rv; |
339 | } |
340 | |
341 | static int halignfn(htmldata_t * p, char *v) |
342 | { |
343 | int rv = 0; |
344 | char c = (char) toupper(*v); |
345 | if ((c == 'L') && !strcasecmp(v + 1, "EFT" )) |
346 | p->flags |= HALIGN_LEFT; |
347 | else if ((c == 'R') && !strcasecmp(v + 1, "IGHT" )) |
348 | p->flags |= HALIGN_RIGHT; |
349 | else if ((c != 'C') || strcasecmp(v + 1, "ENTER" )) { |
350 | agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n" , v); |
351 | rv = 1; |
352 | } |
353 | return rv; |
354 | } |
355 | |
356 | static int cell_halignfn(htmldata_t * p, char *v) |
357 | { |
358 | int rv = 0; |
359 | char c = (char) toupper(*v); |
360 | if ((c == 'L') && !strcasecmp(v + 1, "EFT" )) |
361 | p->flags |= HALIGN_LEFT; |
362 | else if ((c == 'R') && !strcasecmp(v + 1, "IGHT" )) |
363 | p->flags |= HALIGN_RIGHT; |
364 | else if ((c == 'T') && !strcasecmp(v + 1, "EXT" )) |
365 | p->flags |= HALIGN_TEXT; |
366 | else if ((c != 'C') || strcasecmp(v + 1, "ENTER" )) |
367 | rv = 1; |
368 | if (rv) |
369 | agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n" , v); |
370 | return rv; |
371 | } |
372 | |
373 | static int balignfn(htmldata_t * p, char *v) |
374 | { |
375 | int rv = 0; |
376 | char c = (char) toupper(*v); |
377 | if ((c == 'L') && !strcasecmp(v + 1, "EFT" )) |
378 | p->flags |= BALIGN_LEFT; |
379 | else if ((c == 'R') && !strcasecmp(v + 1, "IGHT" )) |
380 | p->flags |= BALIGN_RIGHT; |
381 | else if ((c != 'C') || strcasecmp(v + 1, "ENTER" )) |
382 | rv = 1; |
383 | if (rv) |
384 | agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n" , v); |
385 | return rv; |
386 | } |
387 | |
388 | static int heightfn(htmldata_t * p, char *v) |
389 | { |
390 | long u; |
391 | |
392 | if (doInt(v, "HEIGHT" , 0, MAX_USHORT, &u)) |
393 | return 1; |
394 | p->height = (unsigned short) u; |
395 | return 0; |
396 | } |
397 | |
398 | static int widthfn(htmldata_t * p, char *v) |
399 | { |
400 | long u; |
401 | |
402 | if (doInt(v, "WIDTH" , 0, MAX_USHORT, &u)) |
403 | return 1; |
404 | p->width = (unsigned short) u; |
405 | return 0; |
406 | } |
407 | |
408 | static int rowspanfn(htmlcell_t * p, char *v) |
409 | { |
410 | long u; |
411 | |
412 | if (doInt(v, "ROWSPAN" , 0, MAX_USHORT, &u)) |
413 | return 1; |
414 | if (u == 0) { |
415 | agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n" ); |
416 | return 1; |
417 | } |
418 | p->rspan = (unsigned short) u; |
419 | return 0; |
420 | } |
421 | |
422 | static int colspanfn(htmlcell_t * p, char *v) |
423 | { |
424 | long u; |
425 | |
426 | if (doInt(v, "COLSPAN" , 0, MAX_USHORT, &u)) |
427 | return 1; |
428 | if (u == 0) { |
429 | agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n" ); |
430 | return 1; |
431 | } |
432 | p->cspan = (unsigned short) u; |
433 | return 0; |
434 | } |
435 | |
436 | static int fontcolorfn(textfont_t * p, char *v) |
437 | { |
438 | p->color = v; |
439 | return 0; |
440 | } |
441 | |
442 | static int facefn(textfont_t * p, char *v) |
443 | { |
444 | p->name = v; |
445 | return 0; |
446 | } |
447 | |
448 | static int ptsizefn(textfont_t * p, char *v) |
449 | { |
450 | long u; |
451 | |
452 | if (doInt(v, "POINT-SIZE" , 0, MAX_UCHAR, &u)) |
453 | return 1; |
454 | p->size = (double) u; |
455 | return 0; |
456 | } |
457 | |
458 | static int srcfn(htmlimg_t * p, char *v) |
459 | { |
460 | p->src = strdup(v); |
461 | return 0; |
462 | } |
463 | |
464 | static int scalefn(htmlimg_t * p, char *v) |
465 | { |
466 | p->scale = strdup(v); |
467 | return 0; |
468 | } |
469 | |
470 | static int alignfn(int *p, char *v) |
471 | { |
472 | int rv = 0; |
473 | char c = (char) toupper(*v); |
474 | if ((c == 'R') && !strcasecmp(v + 1, "IGHT" )) |
475 | *p = 'r'; |
476 | else if ((c == 'L') || !strcasecmp(v + 1, "EFT" )) |
477 | *p = 'l'; |
478 | else if ((c == 'C') || strcasecmp(v + 1, "ENTER" )) |
479 | *p = 'n'; |
480 | else { |
481 | agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n" , v); |
482 | rv = 1; |
483 | } |
484 | return rv; |
485 | } |
486 | |
/* Tables used in binary search; MUST be alphabetized.
 * doAttrs looks names up with bsearch using icmp (case-insensitive),
 * so each table has to stay sorted by name.
 *
 * tbl_items: attributes accepted by <TABLE> (see mkTbl).
 * NOTE(review): several actions here are declared for htmldata_t but
 * are invoked on the htmltbl_t created by mkTbl - presumably htmltbl_t
 * begins with its htmldata_t; confirm against htmltable.h.
 */
static attr_item tbl_items[] = {
    {"align" , (attrFn) halignfn},
    {"bgcolor" , (attrFn) bgcolorfn},
    {"border" , (attrFn) borderfn},
    {"cellborder" , (attrFn) cellborderfn},
    {"cellpadding" , (attrFn) cellpaddingfn},
    {"cellspacing" , (attrFn) cellspacingfn},
    {"color" , (attrFn) pencolorfn},
    {"columns" , (attrFn) columnsfn},
    {"fixedsize" , (attrFn) fixedsizefn},
    {"gradientangle" , (attrFn) gradientanglefn},
    {"height" , (attrFn) heightfn},
    {"href" , (attrFn) hreffn},
    {"id" , (attrFn) idfn},
    {"port" , (attrFn) portfn},
    {"rows" , (attrFn) rowsfn},
    {"sides" , (attrFn) sidesfn},
    {"style" , (attrFn) stylefn},
    {"target" , (attrFn) targetfn},
    {"title" , (attrFn) titlefn},
    {"tooltip" , (attrFn) titlefn},	/* same action as "title" */
    {"valign" , (attrFn) valignfn},
    {"width" , (attrFn) widthfn},
};
512 | |
/* cell_items: attributes accepted by <TD> (see mkCell).
 * Searched with bsearch, so entries must stay sorted by name.
 * NOTE(review): several actions here are declared for htmldata_t but
 * are invoked on the htmlcell_t created by mkCell - presumably
 * htmlcell_t begins with its htmldata_t; confirm against htmltable.h.
 */
static attr_item cell_items[] = {
    {"align" , (attrFn) cell_halignfn},	/* TD variant: also accepts "text" */
    {"balign" , (attrFn) balignfn},
    {"bgcolor" , (attrFn) bgcolorfn},
    {"border" , (attrFn) borderfn},
    {"cellpadding" , (attrFn) cellpaddingfn},
    {"cellspacing" , (attrFn) cellspacingfn},
    {"color" , (attrFn) pencolorfn},
    {"colspan" , (attrFn) colspanfn},
    {"fixedsize" , (attrFn) fixedsizefn},
    {"gradientangle" , (attrFn) gradientanglefn},
    {"height" , (attrFn) heightfn},
    {"href" , (attrFn) hreffn},
    {"id" , (attrFn) idfn},
    {"port" , (attrFn) portfn},
    {"rowspan" , (attrFn) rowspanfn},
    {"sides" , (attrFn) sidesfn},
    {"style" , (attrFn) stylefn},
    {"target" , (attrFn) targetfn},
    {"title" , (attrFn) titlefn},
    {"tooltip" , (attrFn) titlefn},	/* same action as "title" */
    {"valign" , (attrFn) valignfn},
    {"width" , (attrFn) widthfn},
};
537 | |
/* font_items: attributes accepted by <FONT> (see mkFont).
 * Searched with bsearch, so entries must stay sorted by name.
 */
static attr_item font_items[] = {
    {"color" , (attrFn) fontcolorfn},
    {"face" , (attrFn) facefn},
    {"point-size" , (attrFn) ptsizefn},
};
543 | |
/* img_items: attributes accepted by <IMG> (see mkImg).
 * Searched with bsearch, so entries must stay sorted by name.
 */
static attr_item img_items[] = {
    {"scale" , (attrFn) scalefn},
    {"src" , (attrFn) srcfn},
};
548 | |
/* br_items: attributes accepted by <BR> (see mkBR).
 * Searched with bsearch, so entries must stay sorted by name.
 */
static attr_item br_items[] = {
    {"align" , (attrFn) alignfn},
};
552 | |
553 | /* doAttrs: |
554 | * General function for processing list of name/value attributes. |
555 | * Do binary search on items table. If match found, invoke action |
556 | * passing it tp and attribute value. |
557 | * Table size is given by nel |
558 | * Name/value pairs are in array atts, which is null terminated. |
559 | * s is the name of the HTML element being processed. |
560 | */ |
561 | static void |
562 | doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s) |
563 | { |
564 | char *name; |
565 | char *val; |
566 | attr_item *ip; |
567 | attr_item key; |
568 | |
569 | while ((name = *atts++) != NULL) { |
570 | val = *atts++; |
571 | key.name = name; |
572 | ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp); |
573 | if (ip) |
574 | state.warn |= ip->action(tp, val); |
575 | else { |
576 | agerr(AGWARN, "Illegal attribute %s in %s - ignored\n" , name, |
577 | s); |
578 | state.warn = 1; |
579 | } |
580 | } |
581 | } |
582 | |
583 | static void mkBR(char **atts) |
584 | { |
585 | htmllval.i = UNSET_ALIGN; |
586 | doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>" ); |
587 | } |
588 | |
589 | static htmlimg_t *mkImg(char **atts) |
590 | { |
591 | htmlimg_t *img = NEW(htmlimg_t); |
592 | |
593 | doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>" ); |
594 | |
595 | return img; |
596 | } |
597 | |
598 | static textfont_t *mkFont(GVC_t *gvc, char **atts, int flags, int ul) |
599 | { |
600 | textfont_t tf = {NULL,NULL,NULL,0.0,0,0}; |
601 | |
602 | tf.size = -1.0; /* unassigned */ |
603 | tf.flags = flags; |
604 | if (atts) |
605 | doAttrs(&tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>" ); |
606 | |
607 | return dtinsert(gvc->textfont_dt, &tf); |
608 | } |
609 | |
610 | static htmlcell_t *mkCell(char **atts) |
611 | { |
612 | htmlcell_t *cell = NEW(htmlcell_t); |
613 | |
614 | cell->cspan = 1; |
615 | cell->rspan = 1; |
616 | doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>" ); |
617 | |
618 | return cell; |
619 | } |
620 | |
621 | static htmltbl_t *mkTbl(char **atts) |
622 | { |
623 | htmltbl_t *tbl = NEW(htmltbl_t); |
624 | |
625 | tbl->rc = -1; /* flag that table is a raw, parsed table */ |
626 | tbl->cb = -1; /* unset cell border attribute */ |
627 | doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>" ); |
628 | |
629 | return tbl; |
630 | } |
631 | |
632 | static void startElement(void *user, const char *name, char **atts) |
633 | { |
634 | GVC_t *gvc = (GVC_t*)user; |
635 | |
636 | if (strcasecmp(name, "TABLE" ) == 0) { |
637 | htmllval.tbl = mkTbl(atts); |
638 | state.inCell = 0; |
639 | state.tok = T_table; |
640 | } else if ((strcasecmp(name, "TR" ) == 0) |
641 | || (strcasecmp(name, "TH" ) == 0)) { |
642 | state.inCell = 0; |
643 | state.tok = T_row; |
644 | } else if (strcasecmp(name, "TD" ) == 0) { |
645 | state.inCell = 1; |
646 | htmllval.cell = mkCell(atts); |
647 | state.tok = T_cell; |
648 | } else if (strcasecmp(name, "FONT" ) == 0) { |
649 | htmllval.font = mkFont(gvc, atts, 0, 0); |
650 | state.tok = T_font; |
651 | } else if (strcasecmp(name, "B" ) == 0) { |
652 | htmllval.font = mkFont(gvc, 0, HTML_BF, 0); |
653 | state.tok = T_bold; |
654 | } else if (strcasecmp(name, "S" ) == 0) { |
655 | htmllval.font = mkFont(gvc, 0, HTML_S, 0); |
656 | state.tok = T_s; |
657 | } else if (strcasecmp(name, "U" ) == 0) { |
658 | htmllval.font = mkFont(gvc, 0, HTML_UL, 1); |
659 | state.tok = T_underline; |
660 | } else if (strcasecmp(name, "O" ) == 0) { |
661 | htmllval.font = mkFont(gvc, 0, HTML_OL, 1); |
662 | state.tok = T_overline; |
663 | } else if (strcasecmp(name, "I" ) == 0) { |
664 | htmllval.font = mkFont(gvc, 0, HTML_IF, 0); |
665 | state.tok = T_italic; |
666 | } else if (strcasecmp(name, "SUP" ) == 0) { |
667 | htmllval.font = mkFont(gvc, 0, HTML_SUP, 0); |
668 | state.tok = T_sup; |
669 | } else if (strcasecmp(name, "SUB" ) == 0) { |
670 | htmllval.font = mkFont(gvc, 0, HTML_SUB, 0); |
671 | state.tok = T_sub; |
672 | } else if (strcasecmp(name, "BR" ) == 0) { |
673 | mkBR(atts); |
674 | state.tok = T_br; |
675 | } else if (strcasecmp(name, "HR" ) == 0) { |
676 | state.tok = T_hr; |
677 | } else if (strcasecmp(name, "VR" ) == 0) { |
678 | state.tok = T_vr; |
679 | } else if (strcasecmp(name, "IMG" ) == 0) { |
680 | htmllval.img = mkImg(atts); |
681 | state.tok = T_img; |
682 | } else if (strcasecmp(name, "HTML" ) == 0) { |
683 | state.tok = T_html; |
684 | } else { |
685 | lexerror(name); |
686 | } |
687 | } |
688 | |
689 | static void endElement(void *user, const char *name) |
690 | { |
691 | if (strcasecmp(name, "TABLE" ) == 0) { |
692 | state.tok = T_end_table; |
693 | state.inCell = 1; |
694 | } else if ((strcasecmp(name, "TR" ) == 0) |
695 | || (strcasecmp(name, "TH" ) == 0)) { |
696 | state.tok = T_end_row; |
697 | } else if (strcasecmp(name, "TD" ) == 0) { |
698 | state.tok = T_end_cell; |
699 | state.inCell = 0; |
700 | } else if (strcasecmp(name, "HTML" ) == 0) { |
701 | state.tok = T_end_html; |
702 | } else if (strcasecmp(name, "FONT" ) == 0) { |
703 | state.tok = T_end_font; |
704 | } else if (strcasecmp(name, "B" ) == 0) { |
705 | state.tok = T_n_bold; |
706 | } else if (strcasecmp(name, "U" ) == 0) { |
707 | state.tok = T_n_underline; |
708 | } else if (strcasecmp(name, "O" ) == 0) { |
709 | state.tok = T_n_overline; |
710 | } else if (strcasecmp(name, "I" ) == 0) { |
711 | state.tok = T_n_italic; |
712 | } else if (strcasecmp(name, "SUP" ) == 0) { |
713 | state.tok = T_n_sup; |
714 | } else if (strcasecmp(name, "SUB" ) == 0) { |
715 | state.tok = T_n_sub; |
716 | } else if (strcasecmp(name, "S" ) == 0) { |
717 | state.tok = T_n_s; |
718 | } else if (strcasecmp(name, "BR" ) == 0) { |
719 | if (state.tok == T_br) |
720 | state.tok = T_BR; |
721 | else |
722 | state.tok = T_end_br; |
723 | } else if (strcasecmp(name, "HR" ) == 0) { |
724 | if (state.tok == T_hr) |
725 | state.tok = T_HR; |
726 | else |
727 | state.tok = T_end_hr; |
728 | } else if (strcasecmp(name, "VR" ) == 0) { |
729 | if (state.tok == T_vr) |
730 | state.tok = T_VR; |
731 | else |
732 | state.tok = T_end_vr; |
733 | } else if (strcasecmp(name, "IMG" ) == 0) { |
734 | if (state.tok == T_img) |
735 | state.tok = T_IMG; |
736 | else |
737 | state.tok = T_end_img; |
738 | } else { |
739 | lexerror(name); |
740 | } |
741 | } |
742 | |
743 | /* characterData: |
744 | * Generate T_string token. Do this only when immediately in |
745 | * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true. |
746 | * Strip out formatting characters but keep spaces. |
747 | * Distinguish between all whitespace vs. strings with non-whitespace |
748 | * characters. |
749 | */ |
750 | static void characterData(void *user, const char *s, int length) |
751 | { |
752 | int i, cnt = 0; |
753 | unsigned char c; |
754 | |
755 | if (state.inCell) { |
756 | for (i = length; i; i--) { |
757 | c = *s++; |
758 | if (c >= ' ') { |
759 | cnt++; |
760 | agxbputc(state.xb, c); |
761 | } |
762 | } |
763 | if (cnt) state.tok = T_string; |
764 | } |
765 | } |
766 | #endif |
767 | |
768 | int initHTMLlexer(char *src, agxbuf * xb, htmlenv_t *env) |
769 | { |
770 | #ifdef HAVE_EXPAT |
771 | state.xb = xb; |
772 | agxbinit (&state.lb, SMALLBUF, NULL); |
773 | state.ptr = src; |
774 | state.mode = 0; |
775 | state.warn = 0; |
776 | state.error = 0; |
777 | state.currtoklen = 0; |
778 | state.prevtoklen = 0; |
779 | state.inCell = 1; |
780 | state.parser = XML_ParserCreate(charsetToStr(GD_charset(env->g))); |
781 | XML_SetUserData(state.parser, GD_gvc(env->g)); |
782 | XML_SetElementHandler(state.parser, |
783 | (XML_StartElementHandler) startElement, |
784 | endElement); |
785 | XML_SetCharacterDataHandler(state.parser, characterData); |
786 | return 0; |
787 | #else |
788 | static int first; |
789 | if (!first) { |
790 | agerr(AGWARN, |
791 | "Not built with libexpat. Table formatting is not available.\n" ); |
792 | first++; |
793 | } |
794 | return 1; |
795 | #endif |
796 | } |
797 | |
/* clearHTMLlexer:
 * Release what initHTMLlexer set up: the XML parser and the
 * translation buffer state.lb.
 * Returns non-zero if a warning or an error was recorded while lexing;
 * always 1 when built without expat.
 */
int clearHTMLlexer()
{
#ifdef HAVE_EXPAT
    int status = state.warn | state.error;

    agxbfree(&state.lb);
    XML_ParserFree(state.parser);
    return status;
#else
    return 1;
#endif
}
809 | |
810 | #ifdef HAVE_EXPAT |
811 | /* eatComment: |
812 | * Given first character after open comment, eat characters |
813 | * up to comment close, returning pointer to closing > if it exists, |
814 | * or null character otherwise. |
815 | * We rely on HTML strings having matched nested <>. |
816 | */ |
817 | static char *(char *p) |
818 | { |
819 | int depth = 1; |
820 | char *s = p; |
821 | char c; |
822 | |
823 | while (depth && (c = *s++)) { |
824 | if (c == '<') |
825 | depth++; |
826 | else if (c == '>') |
827 | depth--; |
828 | } |
829 | s--; /* move back to '\0' or '>' */ |
830 | if (*s) { |
831 | char *t = s - 2; |
832 | if ((t < p) || strncmp(t, "--" , 2)) { |
833 | agerr(AGWARN, "Unclosed comment\n" ); |
834 | state.warn = 1; |
835 | } |
836 | } |
837 | return s; |
838 | } |
839 | |
840 | /* findNext: |
841 | * Return next XML unit. This is either <..>, an HTML |
842 | * comment <!-- ... -->, or characters up to next <. |
843 | */ |
844 | static char *findNext(char *s, agxbuf* xb) |
845 | { |
846 | char* t = s + 1; |
847 | char c; |
848 | |
849 | if (*s == '<') { |
850 | if ((*t == '!') && !strncmp(t + 1, "--" , 2)) |
851 | t = eatComment(t + 3); |
852 | else |
853 | while (*t && (*t != '>')) |
854 | t++; |
855 | if (*t != '>') { |
856 | agerr(AGWARN, "Label closed before end of HTML element\n" ); |
857 | state.warn = 1; |
858 | } else |
859 | t++; |
860 | } else { |
861 | t = s; |
862 | while ((c = *t) && (c != '<')) { |
863 | if ((c == '&') && (*(t+1) != '#')) { |
864 | t = scanEntity(t + 1, xb); |
865 | } |
866 | else { |
867 | agxbputc(xb, c); |
868 | t++; |
869 | } |
870 | } |
871 | } |
872 | return t; |
873 | } |
874 | #endif |
875 | |
/* htmllineno:
 * Return the line number the XML parser is currently at, or 0 when
 * built without expat.
 */
int htmllineno()
{
#ifdef HAVE_EXPAT
    int line = XML_GetCurrentLineNumber(state.parser);

    return line;
#else
    return 0;
#endif
}
884 | |
#ifdef DEBUG
/* tokName:
 * Return the printable name of token tok, or "<unknown>".
 */
static const char *tokName(int tok)
{
    switch (tok) {
    case T_end_br:      return "T_end_br";
    case T_end_img:     return "T_end_img";
    case T_row:         return "T_row";
    case T_end_row:     return "T_end_row";
    case T_html:        return "T_html";
    case T_end_html:    return "T_end_html";
    case T_end_table:   return "T_end_table";
    case T_end_cell:    return "T_end_cell";
    case T_end_font:    return "T_end_font";
    case T_string:      return "T_string";
    case T_error:       return "T_error";
    case T_n_italic:    return "T_n_italic";
    case T_n_bold:      return "T_n_bold";
    case T_n_underline: return "T_n_underline";
    case T_n_overline:  return "T_n_overline";
    case T_n_sup:       return "T_n_sup";
    case T_n_sub:       return "T_n_sub";
    case T_n_s:         return "T_n_s";
    case T_HR:          return "T_HR";
    case T_hr:          return "T_hr";
    case T_end_hr:      return "T_end_hr";
    case T_VR:          return "T_VR";
    case T_vr:          return "T_vr";
    case T_end_vr:      return "T_end_vr";
    case T_BR:          return "T_BR";
    case T_br:          return "T_br";
    case T_IMG:         return "T_IMG";
    case T_img:         return "T_img";
    case T_table:       return "T_table";
    case T_cell:        return "T_cell";
    case T_font:        return "T_font";
    case T_italic:      return "T_italic";
    case T_bold:        return "T_bold";
    case T_underline:   return "T_underline";
    case T_overline:    return "T_overline";
    case T_sup:         return "T_sup";
    case T_sub:         return "T_sub";
    case T_s:           return "T_s";
    default:            return "<unknown>";
    }
}

/* printTok:
 * Debugging aid: write the name of token tok to stderr. For T_string
 * the text currently held in state.xb follows in double quotes.
 */
static void printTok(int tok)
{
    const char *label = tokName(tok);

    if (tok != T_string) {
        fprintf(stderr, "%s\n", label);
        return;
    }
    fprintf(stderr, "%s \"", label);
    fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr);
    fprintf(stderr, "\"\n");
}

#endif
1017 | |
/* htmllex:
 * Lexer entry point: return the next token.
 * With expat, the input is fed to the XML parser one lexical unit at a
 * time until one of the handlers (startElement, endElement,
 * characterData) or an error sets state.tok.
 * state.mode drives the artificial wrapper around the label:
 *   0 - nothing sent yet: feed "<HTML>" and move to mode 1;
 *   1 - feed the next unit of the input; at end of input feed
 *       "</HTML>" and move to mode 2;
 *   2 - everything has been sent: return EOF.
 * Without expat, always returns EOF.
 */
int htmllex()
{
#ifdef HAVE_EXPAT
    static char *begin_html = "<HTML>" ;
    static char *end_html = "</HTML>" ;

    char *s;
    char *endp = 0;
    int len, llen;
    int rv;

    state.tok = 0;
    do {
	if (state.mode == 2)
	    return EOF;
	if (state.mode == 0) {
	    /* first call: open the artificial wrapper */
	    state.mode = 1;
	    s = begin_html;
	    len = strlen(s);
	    endp = 0;
	} else {
	    s = state.ptr;
	    if (*s == '\0') {
		/* input exhausted: close the artificial wrapper */
		state.mode = 2;
		s = end_html;
		len = strlen(s);
	    } else {
		/* findNext may put translated text into state.lb */
		endp = findNext(s,&state.lb);
		len = endp - s;
	    }
	}
	/* remember the last two units for error_context */
	state.prevtok = state.currtok;
	state.prevtoklen = state.currtoklen;
	state.currtok = s;
	state.currtoklen = len;
	/* if findNext produced translated text, parse that instead of
	 * the raw input */
	if ((llen = agxblen(&state.lb)))
	    rv = XML_Parse(state.parser, agxbuse(&state.lb),llen, 0);
	else
	    rv = XML_Parse(state.parser, s, len, (len ? 0 : 1));
	if (rv == XML_STATUS_ERROR) {
	    /* only the first error is reported */
	    if (!state.error) {
		agerr(AGERR, "%s in line %d \n" ,
		      XML_ErrorString(XML_GetErrorCode(state.parser)),
		      htmllineno());
		error_context();
		state.error = 1;
		state.tok = T_error;
	    }
	}
	/* advance past the unit just consumed */
	if (endp)
	    state.ptr = endp;
    } while (state.tok == 0);	/* repeat until a token was produced */
#if DEBUG
    printTok (state.tok);
#endif
    return state.tok;
#else
    return EOF;
#endif
}
1078 | |
1079 | |