1/* $Id$ $Revision$ */
2/* vim:set shiftwidth=4 ts=8: */
3
4/*************************************************************************
5 * Copyright (c) 2011 AT&T Intellectual Property
6 * All rights reserved. This program and the accompanying materials
7 * are made available under the terms of the Eclipse Public License v1.0
8 * which accompanies this distribution, and is available at
9 * http://www.eclipse.org/legal/epl-v10.html
10 *
11 * Contributors: See CVS logs. Details at http://www.graphviz.org/
12 *************************************************************************/
13
14
15#include "render.h"
16#include "htmltable.h"
17#include "htmlparse.h"
18#include "htmllex.h"
19#include "cdt.h"
20#include <ctype.h>
21
22#ifdef HAVE_EXPAT
23#include <expat.h>
24#endif
25
26#ifndef XML_STATUS_ERROR
27#define XML_STATUS_ERROR 0
28#endif
29
30typedef struct {
31#ifdef HAVE_EXPAT
32 XML_Parser parser;
33#endif
34 char* ptr; /* input source */
35 int tok; /* token type */
36 agxbuf* xb; /* buffer to gather T_string data */
37 agxbuf lb; /* buffer for translating lexical data */
38 char warn; /* set if warning given */
39 char error; /* set if error given */
40 char inCell; /* set if in TD to allow T_string */
41 char mode; /* for handling artificial <HTML>..</HTML> */
42 char *currtok; /* for error reporting */
43 char *prevtok; /* for error reporting */
44 int currtoklen;
45 int prevtoklen;
46} lexstate_t;
47static lexstate_t state;
48
49/* error_context:
50 * Print the last 2 "token"s seen.
51 */
52static void error_context(void)
53{
54 agxbclear(state.xb);
55 if (state.prevtoklen > 0)
56 agxbput_n(state.xb, state.prevtok, state.prevtoklen);
57 agxbput_n(state.xb, state.currtok, state.currtoklen);
58 agerr(AGPREV, "... %s ...\n", agxbuse(state.xb));
59}
60
61/* htmlerror:
62 * yyerror - called by yacc output
63 */
64void htmlerror(const char *msg)
65{
66 if (state.error)
67 return;
68 state.error = 1;
69 agerr(AGERR, "%s in line %d \n", msg, htmllineno());
70 error_context();
71}
72
73#ifdef HAVE_EXPAT
74/* lexerror:
75 * called by lexer when unknown <..> is found.
76 */
77static void lexerror(const char *name)
78{
79 state.tok = T_error;
80 state.error = 1;
81 agerr(AGERR, "Unknown HTML element <%s> on line %d \n",
82 name, htmllineno());
83}
84
/* Handler run for a matched attribute: gets the object being filled in
 * and the attribute value. A nonzero result is or-ed into state.warn by
 * doAttrs.
 */
typedef int (*attrFn) (void *, char *);
/* Comparison function type taken by bsearch. */
typedef int (*bcmpfn) (const void *, const void *);

/* Limits of the narrow integer types that attribute values are stored in. */
#define MAX_CHAR (((unsigned char)(~0)) >> 1)
#define MIN_CHAR ((signed char)(~MAX_CHAR))
#define MAX_UCHAR ((unsigned char)(~0))
#define MAX_USHORT ((unsigned short)(~0))

/* Mechanism for automatically processing attributes */
typedef struct {
    char *name;                 /* attribute name */
    attrFn action;              /* action to perform if name matches */
} attr_item;

#define ISIZE (sizeof(attr_item))

/* icmp:
 * Compare two attr_item by name, ignoring case. Used in bsearch.
 * The parameters are const void * so the function has exactly the type
 * bsearch calls it through; calling a function through a pointer to an
 * incompatible function type is undefined behavior.
 * Returns <0, 0 or >0 as strcasecmp does.
 */
static int icmp(const void *i, const void *j)
{
    const attr_item *lhs = i;
    const attr_item *rhs = j;

    return strcasecmp(lhs->name, rhs->name);
}
108
109static int bgcolorfn(htmldata_t * p, char *v)
110{
111 p->bgcolor = strdup(v);
112 return 0;
113}
114
115static int pencolorfn(htmldata_t * p, char *v)
116{
117 p->pencolor = strdup(v);
118 return 0;
119}
120
121static int hreffn(htmldata_t * p, char *v)
122{
123 p->href = strdup(v);
124 return 0;
125}
126
127static int sidesfn(htmldata_t * p, char *v)
128{
129 unsigned short flags = 0;
130 char c;
131
132 while ((c = *v++)) {
133 switch (tolower(c)) {
134 case 'l' :
135 flags |= BORDER_LEFT;
136 break;
137 case 't' :
138 flags |= BORDER_TOP;
139 break;
140 case 'r' :
141 flags |= BORDER_RIGHT;
142 break;
143 case 'b' :
144 flags |= BORDER_BOTTOM;
145 break;
146 default :
147 agerr(AGWARN, "Unrecognized character '%c' (%d) in sides attribute\n", c, c);
148 break;
149 }
150 }
151 if (flags != BORDER_MASK)
152 p->flags |= flags;
153 return 0;
154}
155
156static int titlefn(htmldata_t * p, char *v)
157{
158 p->title = strdup(v);
159 return 0;
160}
161
162static int portfn(htmldata_t * p, char *v)
163{
164 p->port = strdup(v);
165 return 0;
166}
167
168#define DELIM " ,"
169
170static int stylefn(htmldata_t * p, char *v)
171{
172 int rv = 0;
173 char c;
174 char* tk;
175 char* buf = strdup (v);
176 for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) {
177 c = (char) toupper(*tk);
178 if (c == 'R') {
179 if (!strcasecmp(tk + 1, "OUNDED")) p->style |= ROUNDED;
180 else if (!strcasecmp(tk + 1, "ADIAL")) p->style |= RADIAL;
181 else {
182 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
183 rv = 1;
184 }
185 }
186 else if(!strcasecmp(tk,"SOLID")) p->style &= ~(DOTTED|DASHED);
187 else if(!strcasecmp(tk,"INVISIBLE") || !strcasecmp(tk,"INVIS")) p->style |= INVISIBLE;
188 else if(!strcasecmp(tk,"DOTTED")) p->style |= DOTTED;
189 else if(!strcasecmp(tk,"DASHED")) p->style |= DASHED;
190 else {
191 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
192 rv = 1;
193 }
194 }
195 free (buf);
196 return rv;
197}
198
199static int targetfn(htmldata_t * p, char *v)
200{
201 p->target = strdup(v);
202 return 0;
203}
204
205static int idfn(htmldata_t * p, char *v)
206{
207 p->id = strdup(v);
208 return 0;
209}
210
211
212/* doInt:
213 * Scan v for integral value. Check that
214 * the value is >= min and <= max. Return value in ul.
215 * String s is name of value.
216 * Return 0 if okay; 1 otherwise.
217 */
218static int doInt(char *v, char *s, int min, int max, long *ul)
219{
220 int rv = 0;
221 char *ep;
222 long b = strtol(v, &ep, 10);
223
224 if (ep == v) {
225 agerr(AGWARN, "Improper %s value %s - ignored", s, v);
226 rv = 1;
227 } else if (b > max) {
228 agerr(AGWARN, "%s value %s > %d - too large - ignored", s, v, max);
229 rv = 1;
230 } else if (b < min) {
231 agerr(AGWARN, "%s value %s < %d - too small - ignored", s, v, min);
232 rv = 1;
233 } else
234 *ul = b;
235 return rv;
236}
237
238
239static int gradientanglefn(htmldata_t * p, char *v)
240{
241 long u;
242
243 if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
244 return 1;
245 p->gradientangle = (unsigned short) u;
246 return 0;
247}
248
249
250static int borderfn(htmldata_t * p, char *v)
251{
252 long u;
253
254 if (doInt(v, "BORDER", 0, MAX_UCHAR, &u))
255 return 1;
256 p->border = (unsigned char) u;
257 p->flags |= BORDER_SET;
258 return 0;
259}
260
261static int cellpaddingfn(htmldata_t * p, char *v)
262{
263 long u;
264
265 if (doInt(v, "CELLPADDING", 0, MAX_UCHAR, &u))
266 return 1;
267 p->pad = (unsigned char) u;
268 p->flags |= PAD_SET;
269 return 0;
270}
271
272static int cellspacingfn(htmldata_t * p, char *v)
273{
274 long u;
275
276 if (doInt(v, "CELLSPACING", MIN_CHAR, MAX_CHAR, &u))
277 return 1;
278 p->space = (signed char) u;
279 p->flags |= SPACE_SET;
280 return 0;
281}
282
283static int cellborderfn(htmltbl_t * p, char *v)
284{
285 long u;
286
287 if (doInt(v, "CELLSBORDER", 0, MAX_CHAR, &u))
288 return 1;
289 p->cb = (unsigned char) u;
290 return 0;
291}
292
293static int columnsfn(htmltbl_t * p, char *v)
294{
295 if (*v != '*') {
296 agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n", v);
297 return 1;
298 }
299 p->flags |= HTML_VRULE;
300 return 0;
301}
302
303static int rowsfn(htmltbl_t * p, char *v)
304{
305 if (*v != '*') {
306 agerr(AGWARN, "Unknown value %s for ROWS - ignored\n", v);
307 return 1;
308 }
309 p->flags |= HTML_HRULE;
310 return 0;
311}
312
313static int fixedsizefn(htmldata_t * p, char *v)
314{
315 int rv = 0;
316 char c = (char) toupper(*(unsigned char *) v);
317 if ((c == 'T') && !strcasecmp(v + 1, "RUE"))
318 p->flags |= FIXED_FLAG;
319 else if ((c != 'F') || strcasecmp(v + 1, "ALSE")) {
320 agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n", v);
321 rv = 1;
322 }
323 return rv;
324}
325
326static int valignfn(htmldata_t * p, char *v)
327{
328 int rv = 0;
329 char c = (char) toupper(*v);
330 if ((c == 'B') && !strcasecmp(v + 1, "OTTOM"))
331 p->flags |= VALIGN_BOTTOM;
332 else if ((c == 'T') && !strcasecmp(v + 1, "OP"))
333 p->flags |= VALIGN_TOP;
334 else if ((c != 'M') || strcasecmp(v + 1, "IDDLE")) {
335 agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n", v);
336 rv = 1;
337 }
338 return rv;
339}
340
341static int halignfn(htmldata_t * p, char *v)
342{
343 int rv = 0;
344 char c = (char) toupper(*v);
345 if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
346 p->flags |= HALIGN_LEFT;
347 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
348 p->flags |= HALIGN_RIGHT;
349 else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) {
350 agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
351 rv = 1;
352 }
353 return rv;
354}
355
356static int cell_halignfn(htmldata_t * p, char *v)
357{
358 int rv = 0;
359 char c = (char) toupper(*v);
360 if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
361 p->flags |= HALIGN_LEFT;
362 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
363 p->flags |= HALIGN_RIGHT;
364 else if ((c == 'T') && !strcasecmp(v + 1, "EXT"))
365 p->flags |= HALIGN_TEXT;
366 else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
367 rv = 1;
368 if (rv)
369 agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n", v);
370 return rv;
371}
372
373static int balignfn(htmldata_t * p, char *v)
374{
375 int rv = 0;
376 char c = (char) toupper(*v);
377 if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
378 p->flags |= BALIGN_LEFT;
379 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
380 p->flags |= BALIGN_RIGHT;
381 else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
382 rv = 1;
383 if (rv)
384 agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n", v);
385 return rv;
386}
387
388static int heightfn(htmldata_t * p, char *v)
389{
390 long u;
391
392 if (doInt(v, "HEIGHT", 0, MAX_USHORT, &u))
393 return 1;
394 p->height = (unsigned short) u;
395 return 0;
396}
397
398static int widthfn(htmldata_t * p, char *v)
399{
400 long u;
401
402 if (doInt(v, "WIDTH", 0, MAX_USHORT, &u))
403 return 1;
404 p->width = (unsigned short) u;
405 return 0;
406}
407
408static int rowspanfn(htmlcell_t * p, char *v)
409{
410 long u;
411
412 if (doInt(v, "ROWSPAN", 0, MAX_USHORT, &u))
413 return 1;
414 if (u == 0) {
415 agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n");
416 return 1;
417 }
418 p->rspan = (unsigned short) u;
419 return 0;
420}
421
422static int colspanfn(htmlcell_t * p, char *v)
423{
424 long u;
425
426 if (doInt(v, "COLSPAN", 0, MAX_USHORT, &u))
427 return 1;
428 if (u == 0) {
429 agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n");
430 return 1;
431 }
432 p->cspan = (unsigned short) u;
433 return 0;
434}
435
436static int fontcolorfn(textfont_t * p, char *v)
437{
438 p->color = v;
439 return 0;
440}
441
442static int facefn(textfont_t * p, char *v)
443{
444 p->name = v;
445 return 0;
446}
447
448static int ptsizefn(textfont_t * p, char *v)
449{
450 long u;
451
452 if (doInt(v, "POINT-SIZE", 0, MAX_UCHAR, &u))
453 return 1;
454 p->size = (double) u;
455 return 0;
456}
457
458static int srcfn(htmlimg_t * p, char *v)
459{
460 p->src = strdup(v);
461 return 0;
462}
463
464static int scalefn(htmlimg_t * p, char *v)
465{
466 p->scale = strdup(v);
467 return 0;
468}
469
470static int alignfn(int *p, char *v)
471{
472 int rv = 0;
473 char c = (char) toupper(*v);
474 if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
475 *p = 'r';
476 else if ((c == 'L') || !strcasecmp(v + 1, "EFT"))
477 *p = 'l';
478 else if ((c == 'C') || strcasecmp(v + 1, "ENTER"))
479 *p = 'n';
480 else {
481 agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
482 rv = 1;
483 }
484 return rv;
485}
486
487/* Tables used in binary search; MUST be alphabetized */
488static attr_item tbl_items[] = {
489 {"align", (attrFn) halignfn},
490 {"bgcolor", (attrFn) bgcolorfn},
491 {"border", (attrFn) borderfn},
492 {"cellborder", (attrFn) cellborderfn},
493 {"cellpadding", (attrFn) cellpaddingfn},
494 {"cellspacing", (attrFn) cellspacingfn},
495 {"color", (attrFn) pencolorfn},
496 {"columns", (attrFn) columnsfn},
497 {"fixedsize", (attrFn) fixedsizefn},
498 {"gradientangle", (attrFn) gradientanglefn},
499 {"height", (attrFn) heightfn},
500 {"href", (attrFn) hreffn},
501 {"id", (attrFn) idfn},
502 {"port", (attrFn) portfn},
503 {"rows", (attrFn) rowsfn},
504 {"sides", (attrFn) sidesfn},
505 {"style", (attrFn) stylefn},
506 {"target", (attrFn) targetfn},
507 {"title", (attrFn) titlefn},
508 {"tooltip", (attrFn) titlefn},
509 {"valign", (attrFn) valignfn},
510 {"width", (attrFn) widthfn},
511};
512
513static attr_item cell_items[] = {
514 {"align", (attrFn) cell_halignfn},
515 {"balign", (attrFn) balignfn},
516 {"bgcolor", (attrFn) bgcolorfn},
517 {"border", (attrFn) borderfn},
518 {"cellpadding", (attrFn) cellpaddingfn},
519 {"cellspacing", (attrFn) cellspacingfn},
520 {"color", (attrFn) pencolorfn},
521 {"colspan", (attrFn) colspanfn},
522 {"fixedsize", (attrFn) fixedsizefn},
523 {"gradientangle", (attrFn) gradientanglefn},
524 {"height", (attrFn) heightfn},
525 {"href", (attrFn) hreffn},
526 {"id", (attrFn) idfn},
527 {"port", (attrFn) portfn},
528 {"rowspan", (attrFn) rowspanfn},
529 {"sides", (attrFn) sidesfn},
530 {"style", (attrFn) stylefn},
531 {"target", (attrFn) targetfn},
532 {"title", (attrFn) titlefn},
533 {"tooltip", (attrFn) titlefn},
534 {"valign", (attrFn) valignfn},
535 {"width", (attrFn) widthfn},
536};
537
538static attr_item font_items[] = {
539 {"color", (attrFn) fontcolorfn},
540 {"face", (attrFn) facefn},
541 {"point-size", (attrFn) ptsizefn},
542};
543
544static attr_item img_items[] = {
545 {"scale", (attrFn) scalefn},
546 {"src", (attrFn) srcfn},
547};
548
549static attr_item br_items[] = {
550 {"align", (attrFn) alignfn},
551};
552
553/* doAttrs:
554 * General function for processing list of name/value attributes.
555 * Do binary search on items table. If match found, invoke action
556 * passing it tp and attribute value.
557 * Table size is given by nel
558 * Name/value pairs are in array atts, which is null terminated.
559 * s is the name of the HTML element being processed.
560 */
561static void
562doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s)
563{
564 char *name;
565 char *val;
566 attr_item *ip;
567 attr_item key;
568
569 while ((name = *atts++) != NULL) {
570 val = *atts++;
571 key.name = name;
572 ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp);
573 if (ip)
574 state.warn |= ip->action(tp, val);
575 else {
576 agerr(AGWARN, "Illegal attribute %s in %s - ignored\n", name,
577 s);
578 state.warn = 1;
579 }
580 }
581}
582
583static void mkBR(char **atts)
584{
585 htmllval.i = UNSET_ALIGN;
586 doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
587}
588
589static htmlimg_t *mkImg(char **atts)
590{
591 htmlimg_t *img = NEW(htmlimg_t);
592
593 doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
594
595 return img;
596}
597
598static textfont_t *mkFont(GVC_t *gvc, char **atts, int flags, int ul)
599{
600 textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
601
602 tf.size = -1.0; /* unassigned */
603 tf.flags = flags;
604 if (atts)
605 doAttrs(&tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
606
607 return dtinsert(gvc->textfont_dt, &tf);
608}
609
610static htmlcell_t *mkCell(char **atts)
611{
612 htmlcell_t *cell = NEW(htmlcell_t);
613
614 cell->cspan = 1;
615 cell->rspan = 1;
616 doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
617
618 return cell;
619}
620
621static htmltbl_t *mkTbl(char **atts)
622{
623 htmltbl_t *tbl = NEW(htmltbl_t);
624
625 tbl->rc = -1; /* flag that table is a raw, parsed table */
626 tbl->cb = -1; /* unset cell border attribute */
627 doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
628
629 return tbl;
630}
631
632static void startElement(void *user, const char *name, char **atts)
633{
634 GVC_t *gvc = (GVC_t*)user;
635
636 if (strcasecmp(name, "TABLE") == 0) {
637 htmllval.tbl = mkTbl(atts);
638 state.inCell = 0;
639 state.tok = T_table;
640 } else if ((strcasecmp(name, "TR") == 0)
641 || (strcasecmp(name, "TH") == 0)) {
642 state.inCell = 0;
643 state.tok = T_row;
644 } else if (strcasecmp(name, "TD") == 0) {
645 state.inCell = 1;
646 htmllval.cell = mkCell(atts);
647 state.tok = T_cell;
648 } else if (strcasecmp(name, "FONT") == 0) {
649 htmllval.font = mkFont(gvc, atts, 0, 0);
650 state.tok = T_font;
651 } else if (strcasecmp(name, "B") == 0) {
652 htmllval.font = mkFont(gvc, 0, HTML_BF, 0);
653 state.tok = T_bold;
654 } else if (strcasecmp(name, "S") == 0) {
655 htmllval.font = mkFont(gvc, 0, HTML_S, 0);
656 state.tok = T_s;
657 } else if (strcasecmp(name, "U") == 0) {
658 htmllval.font = mkFont(gvc, 0, HTML_UL, 1);
659 state.tok = T_underline;
660 } else if (strcasecmp(name, "O") == 0) {
661 htmllval.font = mkFont(gvc, 0, HTML_OL, 1);
662 state.tok = T_overline;
663 } else if (strcasecmp(name, "I") == 0) {
664 htmllval.font = mkFont(gvc, 0, HTML_IF, 0);
665 state.tok = T_italic;
666 } else if (strcasecmp(name, "SUP") == 0) {
667 htmllval.font = mkFont(gvc, 0, HTML_SUP, 0);
668 state.tok = T_sup;
669 } else if (strcasecmp(name, "SUB") == 0) {
670 htmllval.font = mkFont(gvc, 0, HTML_SUB, 0);
671 state.tok = T_sub;
672 } else if (strcasecmp(name, "BR") == 0) {
673 mkBR(atts);
674 state.tok = T_br;
675 } else if (strcasecmp(name, "HR") == 0) {
676 state.tok = T_hr;
677 } else if (strcasecmp(name, "VR") == 0) {
678 state.tok = T_vr;
679 } else if (strcasecmp(name, "IMG") == 0) {
680 htmllval.img = mkImg(atts);
681 state.tok = T_img;
682 } else if (strcasecmp(name, "HTML") == 0) {
683 state.tok = T_html;
684 } else {
685 lexerror(name);
686 }
687}
688
689static void endElement(void *user, const char *name)
690{
691 if (strcasecmp(name, "TABLE") == 0) {
692 state.tok = T_end_table;
693 state.inCell = 1;
694 } else if ((strcasecmp(name, "TR") == 0)
695 || (strcasecmp(name, "TH") == 0)) {
696 state.tok = T_end_row;
697 } else if (strcasecmp(name, "TD") == 0) {
698 state.tok = T_end_cell;
699 state.inCell = 0;
700 } else if (strcasecmp(name, "HTML") == 0) {
701 state.tok = T_end_html;
702 } else if (strcasecmp(name, "FONT") == 0) {
703 state.tok = T_end_font;
704 } else if (strcasecmp(name, "B") == 0) {
705 state.tok = T_n_bold;
706 } else if (strcasecmp(name, "U") == 0) {
707 state.tok = T_n_underline;
708 } else if (strcasecmp(name, "O") == 0) {
709 state.tok = T_n_overline;
710 } else if (strcasecmp(name, "I") == 0) {
711 state.tok = T_n_italic;
712 } else if (strcasecmp(name, "SUP") == 0) {
713 state.tok = T_n_sup;
714 } else if (strcasecmp(name, "SUB") == 0) {
715 state.tok = T_n_sub;
716 } else if (strcasecmp(name, "S") == 0) {
717 state.tok = T_n_s;
718 } else if (strcasecmp(name, "BR") == 0) {
719 if (state.tok == T_br)
720 state.tok = T_BR;
721 else
722 state.tok = T_end_br;
723 } else if (strcasecmp(name, "HR") == 0) {
724 if (state.tok == T_hr)
725 state.tok = T_HR;
726 else
727 state.tok = T_end_hr;
728 } else if (strcasecmp(name, "VR") == 0) {
729 if (state.tok == T_vr)
730 state.tok = T_VR;
731 else
732 state.tok = T_end_vr;
733 } else if (strcasecmp(name, "IMG") == 0) {
734 if (state.tok == T_img)
735 state.tok = T_IMG;
736 else
737 state.tok = T_end_img;
738 } else {
739 lexerror(name);
740 }
741}
742
743/* characterData:
744 * Generate T_string token. Do this only when immediately in
745 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
746 * Strip out formatting characters but keep spaces.
747 * Distinguish between all whitespace vs. strings with non-whitespace
748 * characters.
749 */
750static void characterData(void *user, const char *s, int length)
751{
752 int i, cnt = 0;
753 unsigned char c;
754
755 if (state.inCell) {
756 for (i = length; i; i--) {
757 c = *s++;
758 if (c >= ' ') {
759 cnt++;
760 agxbputc(state.xb, c);
761 }
762 }
763 if (cnt) state.tok = T_string;
764 }
765}
766#endif
767
768int initHTMLlexer(char *src, agxbuf * xb, htmlenv_t *env)
769{
770#ifdef HAVE_EXPAT
771 state.xb = xb;
772 agxbinit (&state.lb, SMALLBUF, NULL);
773 state.ptr = src;
774 state.mode = 0;
775 state.warn = 0;
776 state.error = 0;
777 state.currtoklen = 0;
778 state.prevtoklen = 0;
779 state.inCell = 1;
780 state.parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
781 XML_SetUserData(state.parser, GD_gvc(env->g));
782 XML_SetElementHandler(state.parser,
783 (XML_StartElementHandler) startElement,
784 endElement);
785 XML_SetCharacterDataHandler(state.parser, characterData);
786 return 0;
787#else
788 static int first;
789 if (!first) {
790 agerr(AGWARN,
791 "Not built with libexpat. Table formatting is not available.\n");
792 first++;
793 }
794 return 1;
795#endif
796}
797
/* clearHTMLlexer:
 * Release the parser and the translation buffer.
 * With expat: returns state.warn | state.error, i.e. nonzero if any
 * warning or error was given. Without expat: returns 1.
 */
int clearHTMLlexer()
{
#ifdef HAVE_EXPAT
    const int status = state.warn | state.error;

    agxbfree (&state.lb);
    XML_ParserFree(state.parser);
    return status;
#else
    return 1;
#endif
}
809
810#ifdef HAVE_EXPAT
811/* eatComment:
812 * Given first character after open comment, eat characters
813 * up to comment close, returning pointer to closing > if it exists,
814 * or null character otherwise.
815 * We rely on HTML strings having matched nested <>.
816 */
817static char *eatComment(char *p)
818{
819 int depth = 1;
820 char *s = p;
821 char c;
822
823 while (depth && (c = *s++)) {
824 if (c == '<')
825 depth++;
826 else if (c == '>')
827 depth--;
828 }
829 s--; /* move back to '\0' or '>' */
830 if (*s) {
831 char *t = s - 2;
832 if ((t < p) || strncmp(t, "--", 2)) {
833 agerr(AGWARN, "Unclosed comment\n");
834 state.warn = 1;
835 }
836 }
837 return s;
838}
839
840/* findNext:
841 * Return next XML unit. This is either <..>, an HTML
842 * comment <!-- ... -->, or characters up to next <.
843 */
844static char *findNext(char *s, agxbuf* xb)
845{
846 char* t = s + 1;
847 char c;
848
849 if (*s == '<') {
850 if ((*t == '!') && !strncmp(t + 1, "--", 2))
851 t = eatComment(t + 3);
852 else
853 while (*t && (*t != '>'))
854 t++;
855 if (*t != '>') {
856 agerr(AGWARN, "Label closed before end of HTML element\n");
857 state.warn = 1;
858 } else
859 t++;
860 } else {
861 t = s;
862 while ((c = *t) && (c != '<')) {
863 if ((c == '&') && (*(t+1) != '#')) {
864 t = scanEntity(t + 1, xb);
865 }
866 else {
867 agxbputc(xb, c);
868 t++;
869 }
870 }
871 }
872 return t;
873}
874#endif
875
/* htmllineno:
 * Return the current line number as reported by the expat parser,
 * or 0 when built without expat.
 */
int htmllineno()
{
    int line = 0;

#ifdef HAVE_EXPAT
    line = XML_GetCurrentLineNumber(state.parser);
#endif
    return line;
}
884
885#ifdef DEBUG
886static void printTok(int tok)
887{
888 char *s;
889
890 switch (tok) {
891 case T_end_br:
892 s = "T_end_br";
893 break;
894 case T_end_img:
895 s = "T_end_img";
896 break;
897 case T_row:
898 s = "T_row";
899 break;
900 case T_end_row:
901 s = "T_end_row";
902 break;
903 case T_html:
904 s = "T_html";
905 break;
906 case T_end_html:
907 s = "T_end_html";
908 break;
909 case T_end_table:
910 s = "T_end_table";
911 break;
912 case T_end_cell:
913 s = "T_end_cell";
914 break;
915 case T_end_font:
916 s = "T_end_font";
917 break;
918 case T_string:
919 s = "T_string";
920 break;
921 case T_error:
922 s = "T_error";
923 break;
924 case T_n_italic:
925 s = "T_n_italic";
926 break;
927 case T_n_bold:
928 s = "T_n_bold";
929 break;
930 case T_n_underline:
931 s = "T_n_underline";
932 break;
933 case T_n_overline:
934 s = "T_n_overline";
935 break;
936 case T_n_sup:
937 s = "T_n_sup";
938 break;
939 case T_n_sub:
940 s = "T_n_sub";
941 break;
942 case T_n_s:
943 s = "T_n_s";
944 break;
945 case T_HR:
946 s = "T_HR";
947 break;
948 case T_hr:
949 s = "T_hr";
950 break;
951 case T_end_hr:
952 s = "T_end_hr";
953 break;
954 case T_VR:
955 s = "T_VR";
956 break;
957 case T_vr:
958 s = "T_vr";
959 break;
960 case T_end_vr:
961 s = "T_end_vr";
962 break;
963 case T_BR:
964 s = "T_BR";
965 break;
966 case T_br:
967 s = "T_br";
968 break;
969 case T_IMG:
970 s = "T_IMG";
971 break;
972 case T_img:
973 s = "T_img";
974 break;
975 case T_table:
976 s = "T_table";
977 break;
978 case T_cell:
979 s = "T_cell";
980 break;
981 case T_font:
982 s = "T_font";
983 break;
984 case T_italic:
985 s = "T_italic";
986 break;
987 case T_bold:
988 s = "T_bold";
989 break;
990 case T_underline:
991 s = "T_underline";
992 break;
993 case T_overline:
994 s = "T_overline";
995 break;
996 case T_sup:
997 s = "T_sup";
998 break;
999 case T_sub:
1000 s = "T_sub";
1001 break;
1002 case T_s:
1003 s = "T_s";
1004 break;
1005 default:
1006 s = "<unknown>";
1007 }
1008 if (tok == T_string) {
1009 fprintf(stderr, "%s \"", s);
1010 fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr);
1011 fprintf(stderr, "\"\n");
1012 } else
1013 fprintf(stderr, "%s\n", s);
1014}
1015
1016#endif
1017
/* htmllex:
 * Lexer entry point: return the next token type, or EOF.
 *
 * The input is wrapped in an artificial <HTML>..</HTML> pair, tracked
 * by state.mode:
 *   0 - nothing sent yet; "<HTML>" is fed to the parser first
 *   1 - scanning the input one unit (see findNext) at a time
 *   2 - "</HTML>" has been fed; the next call returns EOF
 * Each unit is passed to XML_Parse, whose callbacks set state.tok;
 * the loop repeats until some callback has produced a token.
 * On the first parse error the error and its context are reported and
 * T_error is returned.
 * Without expat, always returns EOF.
 */
int htmllex()
{
#ifdef HAVE_EXPAT
    static char *begin_html = "<HTML>";
    static char *end_html = "</HTML>";

    char *s;
    char *endp = 0;
    int len, llen;
    int rv;

    state.tok = 0;
    do {
        if (state.mode == 2)
            return EOF;
        if (state.mode == 0) {
            state.mode = 1;
            s = begin_html;
            len = (int) strlen(s);
            endp = 0;
        } else {
            s = state.ptr;
            if (*s == '\0') {
                state.mode = 2;
                s = end_html;
                len = (int) strlen(s);
            } else {
                endp = findNext(s, &state.lb);
                len = (int) (endp - s);
            }
        }
        /* remember the last two units for error_context */
        state.prevtok = state.currtok;
        state.prevtoklen = state.currtoklen;
        state.currtok = s;
        state.currtoklen = len;
        /* if findNext put translated text in lb, parse that instead of
         * the raw input */
        if ((llen = agxblen(&state.lb)))
            rv = XML_Parse(state.parser, agxbuse(&state.lb), llen, 0);
        else
            rv = XML_Parse(state.parser, s, len, (len ? 0 : 1));
        if (rv == XML_STATUS_ERROR) {
            if (!state.error) {
                agerr(AGERR, "%s in line %d \n",
                      XML_ErrorString(XML_GetErrorCode(state.parser)),
                      htmllineno());
                error_context();
                state.error = 1;
                state.tok = T_error;
            }
        }
        if (endp)
            state.ptr = endp;
    } while (state.tok == 0);
    /* same test as the one guarding printTok's definition, so the call
     * and the definition are always compiled in or out together */
#ifdef DEBUG
    printTok (state.tok);
#endif
    return state.tok;
#else
    return EOF;
#endif
}
1078
1079