/* Graphviz 2.35.20130930.0449 -- htmllex.c
 * (Doxygen source listing of this file; see the generated documentation.)
 */
1 /* $Id$ $Revision$ */
2 /* vim:set shiftwidth=4 ts=8: */
3 
4 /*************************************************************************
5  * Copyright (c) 2011 AT&T Intellectual Property
6  * All rights reserved. This program and the accompanying materials
7  * are made available under the terms of the Eclipse Public License v1.0
8  * which accompanies this distribution, and is available at
9  * http://www.eclipse.org/legal/epl-v10.html
10  *
11  * Contributors: See CVS logs. Details at http://www.graphviz.org/
12  *************************************************************************/
13 
14 
15 #include "render.h"
16 #include "htmltable.h"
17 #include "htmlparse.h"
18 #include "htmllex.h"
19 #include <ctype.h>
20 
21 #ifdef HAVE_EXPAT
22 #include <expat.h>
23 #endif
24 
25 #ifndef XML_STATUS_ERROR
26 #define XML_STATUS_ERROR 0
27 #endif
28 
29 typedef struct {
30 #ifdef HAVE_EXPAT
31  XML_Parser parser;
32 #endif
33  char* ptr; /* input source */
34  int tok; /* token type */
35  agxbuf* xb; /* buffer to gather T_string data */
36  agxbuf lb; /* buffer for translating lexical data */
37  char warn; /* set if warning given */
38  char error; /* set if error given */
39  char inCell; /* set if in TD to allow T_string */
40  char mode; /* for handling artificial <HTML>..</HTML> */
41  char *currtok; /* for error reporting */
42  char *prevtok; /* for error reporting */
45 } lexstate_t;
46 static lexstate_t state;
47 
48 /* error_context:
49  * Print the last 2 "token"s seen.
50  */
51 static void error_context(void)
52 {
53  agxbclear(state.xb);
54  if (state.prevtoklen > 0)
55  agxbput_n(state.xb, state.prevtok, state.prevtoklen);
56  agxbput_n(state.xb, state.currtok, state.currtoklen);
57  agerr(AGPREV, "... %s ...\n", agxbuse(state.xb));
58 }
59 
60 /* htmlerror:
61  * yyerror - called by yacc output
62  */
63 void htmlerror(const char *msg)
64 {
65  if (state.error)
66  return;
67  state.error = 1;
68  agerr(AGERR, "%s in line %d \n", msg, htmllineno());
69  error_context();
70 }
71 
72 #ifdef HAVE_EXPAT
73 /* lexerror:
74  * called by lexer when unknown <..> is found.
75  */
76 static void lexerror(const char *name)
77 {
78  state.tok = T_error;
79  state.error = 1;
80  agerr(AGERR, "Unknown HTML element <%s> on line %d \n",
81  name, htmllineno());
82 }
83 
/* Generic signatures used by the attribute tables and by bsearch. */
typedef int (*attrFn) (void *, char *);
typedef int (*bcmpfn) (const void *, const void *);

/* Limits of the small integer types used to store attribute values. */
#define MAX_CHAR    (((unsigned char)(~0)) >> 1)
#define MIN_CHAR    ((signed char)(~MAX_CHAR))
#define MAX_UCHAR   ((unsigned char)(~0))
#define MAX_USHORT  ((unsigned short)(~0))

/* Mechanism for automatically processing attributes */
typedef struct {
    char *name;			/* attribute name */
    attrFn action;		/* action to perform if name matches */
} attr_item;

#define ISIZE (sizeof(attr_item))

/* icmp:
 * Compare two attr_item by name, ignoring case. Used in bsearch.
 * The function has the exact comparator signature bsearch expects, so
 * it is never invoked through an incompatible function-pointer type;
 * existing "(bcmpfn) icmp" casts at call sites remain valid.
 * Returns <0, 0 or >0 as strcasecmp does.
 */
static int icmp(const void *i, const void *j)
{
    const attr_item *lhs = i;
    const attr_item *rhs = j;

    return strcasecmp(lhs->name, rhs->name);
}
107 
108 static int bgcolorfn(htmldata_t * p, char *v)
109 {
110  p->bgcolor = strdup(v);
111  return 0;
112 }
113 
114 static int pencolorfn(htmldata_t * p, char *v)
115 {
116  p->pencolor = strdup(v);
117  return 0;
118 }
119 
120 static int hreffn(htmldata_t * p, char *v)
121 {
122  p->href = strdup(v);
123  return 0;
124 }
125 
126 static int titlefn(htmldata_t * p, char *v)
127 {
128  p->title = strdup(v);
129  return 0;
130 }
131 
132 static int portfn(htmldata_t * p, char *v)
133 {
134  p->port = strdup(v);
135  return 0;
136 }
137 
138 #define DELIM " ,"
139 
140 static int stylefn(htmldata_t * p, char *v)
141 {
142  int rv = 0;
143  char c;
144  char* tk;
145  char* buf = strdup (v);
146  for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) {
147  c = toupper(*tk);
148  if (c == 'R') {
149  if (!strcasecmp(tk + 1, "OUNDED")) p->style |= ROUNDED;
150  else if (!strcasecmp(tk + 1, "ADIAL")) p->style |= RADIAL;
151  else {
152  agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
153  rv = 1;
154  }
155  }
156  else if(!strcasecmp(tk,"SOLID")) p->style &= ~(DOTTED|DASHED);
157  else if(!strcasecmp(tk,"INVISIBLE") || !strcasecmp(tk,"INVIS")) p->style |= INVISIBLE;
158  else if(!strcasecmp(tk,"DOTTED")) p->style |= DOTTED;
159  else if(!strcasecmp(tk,"DASHED")) p->style |= DASHED;
160  else {
161  agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
162  rv = 1;
163  }
164  }
165  free (buf);
166  return rv;
167 }
168 
169 static int targetfn(htmldata_t * p, char *v)
170 {
171  p->target = strdup(v);
172  return 0;
173 }
174 
175 static int idfn(htmldata_t * p, char *v)
176 {
177  p->id = strdup(v);
178  return 0;
179 }
180 
181 
182 /* doInt:
183  * Scan v for integral value. Check that
184  * the value is >= min and <= max. Return value in ul.
185  * String s is name of value.
186  * Return 0 if okay; 1 otherwise.
187  */
188 static int doInt(char *v, char *s, int min, int max, long *ul)
189 {
190  int rv = 0;
191  char *ep;
192  long b = strtol(v, &ep, 10);
193 
194  if (ep == v) {
195  agerr(AGWARN, "Improper %s value %s - ignored", s, v);
196  rv = 1;
197  } else if (b > max) {
198  agerr(AGWARN, "%s value %s > %d - too large - ignored", s, v, max);
199  rv = 1;
200  } else if (b < min) {
201  agerr(AGWARN, "%s value %s < %d - too small - ignored", s, v, min);
202  rv = 1;
203  } else
204  *ul = b;
205  return rv;
206 }
207 
208 
209 static int gradientanglefn(htmldata_t * p, char *v)
210 {
211  long u;
212 
213  if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
214  return 1;
215  p->gradientangle = (unsigned short) u;
216  return 0;
217 }
218 
219 
220 static int borderfn(htmldata_t * p, char *v)
221 {
222  long u;
223 
224  if (doInt(v, "BORDER", 0, MAX_UCHAR, &u))
225  return 1;
226  p->border = (unsigned char) u;
227  p->flags |= BORDER_SET;
228  return 0;
229 }
230 
231 static int cellpaddingfn(htmldata_t * p, char *v)
232 {
233  long u;
234 
235  if (doInt(v, "CELLPADDING", 0, MAX_UCHAR, &u))
236  return 1;
237  p->pad = (unsigned char) u;
238  p->flags |= PAD_SET;
239  return 0;
240 }
241 
242 static int cellspacingfn(htmldata_t * p, char *v)
243 {
244  long u;
245 
246  if (doInt(v, "CELLSPACING", MIN_CHAR, MAX_CHAR, &u))
247  return 1;
248  p->space = (signed char) u;
249  p->flags |= SPACE_SET;
250  return 0;
251 }
252 
253 static int cellborderfn(htmltbl_t * p, char *v)
254 {
255  long u;
256 
257  if (doInt(v, "CELLSBORDER", 0, MAX_CHAR, &u))
258  return 1;
259  p->cb = (unsigned char) u;
260  return 0;
261 }
262 
263 static int columnsfn(htmltbl_t * p, char *v)
264 {
265  if (*v != '*') {
266  agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n", v);
267  return 1;
268  }
269  p->flags |= HTML_VRULE;
270  return 0;
271 }
272 
273 static int rowsfn(htmltbl_t * p, char *v)
274 {
275  if (*v != '*') {
276  agerr(AGWARN, "Unknown value %s for ROWS - ignored\n", v);
277  return 1;
278  }
279  p->flags |= HTML_HRULE;
280  return 0;
281 }
282 
283 static int fixedsizefn(htmldata_t * p, char *v)
284 {
285  int rv = 0;
286  char c = toupper(*(unsigned char *) v);
287  if ((c == 'T') && !strcasecmp(v + 1, "RUE"))
288  p->flags |= FIXED_FLAG;
289  else if ((c != 'F') || strcasecmp(v + 1, "ALSE")) {
290  agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n", v);
291  rv = 1;
292  }
293  return rv;
294 }
295 
296 static int valignfn(htmldata_t * p, char *v)
297 {
298  int rv = 0;
299  char c = toupper(*v);
300  if ((c == 'B') && !strcasecmp(v + 1, "OTTOM"))
301  p->flags |= VALIGN_BOTTOM;
302  else if ((c == 'T') && !strcasecmp(v + 1, "OP"))
303  p->flags |= VALIGN_TOP;
304  else if ((c != 'M') || strcasecmp(v + 1, "IDDLE")) {
305  agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n", v);
306  rv = 1;
307  }
308  return rv;
309 }
310 
311 static int halignfn(htmldata_t * p, char *v)
312 {
313  int rv = 0;
314  char c = toupper(*v);
315  if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
316  p->flags |= HALIGN_LEFT;
317  else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
318  p->flags |= HALIGN_RIGHT;
319  else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) {
320  agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
321  rv = 1;
322  }
323  return rv;
324 }
325 
326 static int cell_halignfn(htmldata_t * p, char *v)
327 {
328  int rv = 0;
329  char c = toupper(*v);
330  if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
331  p->flags |= HALIGN_LEFT;
332  else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
333  p->flags |= HALIGN_RIGHT;
334  else if ((c == 'T') && !strcasecmp(v + 1, "EXT"))
335  p->flags |= HALIGN_TEXT;
336  else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
337  rv = 1;
338  if (rv)
339  agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n", v);
340  return rv;
341 }
342 
343 static int balignfn(htmldata_t * p, char *v)
344 {
345  int rv = 0;
346  char c = toupper(*v);
347  if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
348  p->flags |= BALIGN_LEFT;
349  else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
350  p->flags |= BALIGN_RIGHT;
351  else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
352  rv = 1;
353  if (rv)
354  agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n", v);
355  return rv;
356 }
357 
358 static int heightfn(htmldata_t * p, char *v)
359 {
360  long u;
361 
362  if (doInt(v, "HEIGHT", 0, MAX_USHORT, &u))
363  return 1;
364  p->height = (unsigned short) u;
365  return 0;
366 }
367 
368 static int widthfn(htmldata_t * p, char *v)
369 {
370  long u;
371 
372  if (doInt(v, "WIDTH", 0, MAX_USHORT, &u))
373  return 1;
374  p->width = (unsigned short) u;
375  return 0;
376 }
377 
378 static int rowspanfn(htmlcell_t * p, char *v)
379 {
380  long u;
381 
382  if (doInt(v, "ROWSPAN", 0, MAX_USHORT, &u))
383  return 1;
384  if (u == 0) {
385  agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n");
386  return 1;
387  }
388  p->rspan = (unsigned short) u;
389  return 0;
390 }
391 
392 static int colspanfn(htmlcell_t * p, char *v)
393 {
394  long u;
395 
396  if (doInt(v, "COLSPAN", 0, MAX_USHORT, &u))
397  return 1;
398  if (u == 0) {
399  agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n");
400  return 1;
401  }
402  p->cspan = (unsigned short) u;
403  return 0;
404 }
405 
406 static int fontcolorfn(htmlfont_t * p, char *v)
407 {
408  p->color = strdup(v);
409  return 0;
410 }
411 
412 static int facefn(htmlfont_t * p, char *v)
413 {
414  p->name = strdup(v);
415  return 0;
416 }
417 
418 static int ptsizefn(htmlfont_t * p, char *v)
419 {
420  long u;
421 
422  if (doInt(v, "POINT-SIZE", 0, MAX_UCHAR, &u))
423  return 1;
424  p->size = (double) u;
425  return 0;
426 }
427 
428 static int srcfn(htmlimg_t * p, char *v)
429 {
430  p->src = strdup(v);
431  return 0;
432 }
433 
434 static int scalefn(htmlimg_t * p, char *v)
435 {
436  p->scale = strdup(v);
437  return 0;
438 }
439 
440 static int alignfn(int *p, char *v)
441 {
442  int rv = 0;
443  char c = toupper(*v);
444  if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
445  *p = 'r';
446  else if ((c == 'L') || !strcasecmp(v + 1, "EFT"))
447  *p = 'l';
448  else if ((c == 'C') || strcasecmp(v + 1, "ENTER"))
449  *p = 'n';
450  else {
451  agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
452  rv = 1;
453  }
454  return rv;
455 }
456 
457 /* Tables used in binary search; MUST be alphabetized */
458 static attr_item tbl_items[] = {
459  {"align", (attrFn) halignfn},
460  {"bgcolor", (attrFn) bgcolorfn},
461  {"border", (attrFn) borderfn},
462  {"cellborder", (attrFn) cellborderfn},
463  {"cellpadding", (attrFn) cellpaddingfn},
464  {"cellspacing", (attrFn) cellspacingfn},
465  {"color", (attrFn) pencolorfn},
466  {"columns", (attrFn) columnsfn},
467  {"fixedsize", (attrFn) fixedsizefn},
468  {"gradientangle", (attrFn) gradientanglefn},
469  {"height", (attrFn) heightfn},
470  {"href", (attrFn) hreffn},
471  {"id", (attrFn) idfn},
472  {"port", (attrFn) portfn},
473  {"rows", (attrFn) rowsfn},
474  {"style", (attrFn) stylefn},
475  {"target", (attrFn) targetfn},
476  {"title", (attrFn) titlefn},
477  {"tooltip", (attrFn) titlefn},
478  {"valign", (attrFn) valignfn},
479  {"width", (attrFn) widthfn},
480 };
481 
482 static attr_item cell_items[] = {
483  {"align", (attrFn) cell_halignfn},
484  {"balign", (attrFn) balignfn},
485  {"bgcolor", (attrFn) bgcolorfn},
486  {"border", (attrFn) borderfn},
487  {"cellpadding", (attrFn) cellpaddingfn},
488  {"cellspacing", (attrFn) cellspacingfn},
489  {"color", (attrFn) pencolorfn},
490  {"colspan", (attrFn) colspanfn},
491  {"fixedsize", (attrFn) fixedsizefn},
492  {"gradientangle", (attrFn) gradientanglefn},
493  {"height", (attrFn) heightfn},
494  {"href", (attrFn) hreffn},
495  {"id", (attrFn) idfn},
496  {"port", (attrFn) portfn},
497  {"rowspan", (attrFn) rowspanfn},
498  {"style", (attrFn) stylefn},
499  {"target", (attrFn) targetfn},
500  {"title", (attrFn) titlefn},
501  {"tooltip", (attrFn) titlefn},
502  {"valign", (attrFn) valignfn},
503  {"width", (attrFn) widthfn},
504 };
505 
506 static attr_item font_items[] = {
507  {"color", (attrFn) fontcolorfn},
508  {"face", (attrFn) facefn},
509  {"point-size", (attrFn) ptsizefn},
510 };
511 
512 static attr_item img_items[] = {
513  {"scale", (attrFn) scalefn},
514  {"src", (attrFn) srcfn},
515 };
516 
517 static attr_item br_items[] = {
518  {"align", (attrFn) alignfn},
519 };
520 
521 /* doAttrs:
522  * General function for processing list of name/value attributes.
523  * Do binary search on items table. If match found, invoke action
524  * passing it tp and attribute value.
525  * Table size is given by nel
526  * Name/value pairs are in array atts, which is null terminated.
527  * s is the name of the HTML element being processed.
528  */
529 static void
530 doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s)
531 {
532  char *name;
533  char *val;
534  attr_item *ip;
535  attr_item key;
536 
537  while ((name = *atts++) != NULL) {
538  val = *atts++;
539  key.name = name;
540  ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp);
541  if (ip)
542  state.warn |= ip->action(tp, val);
543  else {
544  agerr(AGWARN, "Illegal attribute %s in %s - ignored\n", name,
545  s);
546  state.warn = 1;
547  }
548  }
549 }
550 
551 static void mkBR(char **atts)
552 {
554  doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
555 }
556 
557 static htmlimg_t *mkImg(char **atts)
558 {
559  htmlimg_t *img = NEW(htmlimg_t);
560 
561  doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
562 
563  return img;
564 }
565 
566 static htmlfont_t *mkFont(char **atts, int flags, int ul)
567 {
568  htmlfont_t *font = NEW(htmlfont_t);
569 
570  font->size = -1.0; /* unassigned */
571  font->flags = flags;
572  if (atts)
573  doAttrs(font, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
574 
575  return font;
576 }
577 
578 static htmlcell_t *mkCell(char **atts)
579 {
581 
582  cell->cspan = 1;
583  cell->rspan = 1;
584  doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
585 
586  return cell;
587 }
588 
589 static htmltbl_t *mkTbl(char **atts)
590 {
591  htmltbl_t *tbl = NEW(htmltbl_t);
592 
593  tbl->rc = -1; /* flag that table is a raw, parsed table */
594  tbl->cb = -1; /* unset cell border attribute */
595  doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
596 
597  return tbl;
598 }
599 
600 static void startElement(void *user, const char *name, char **atts)
601 {
602  if (strcasecmp(name, "TABLE") == 0) {
603  htmllval.tbl = mkTbl(atts);
604  state.inCell = 0;
605  state.tok = T_table;
606  } else if ((strcasecmp(name, "TR") == 0)
607  || (strcasecmp(name, "TH") == 0)) {
608  state.inCell = 0;
609  state.tok = T_row;
610  } else if (strcasecmp(name, "TD") == 0) {
611  state.inCell = 1;
612  htmllval.cell = mkCell(atts);
613  state.tok = T_cell;
614  } else if (strcasecmp(name, "FONT") == 0) {
615  htmllval.font = mkFont(atts, 0, 0);
616  state.tok = T_font;
617  } else if (strcasecmp(name, "B") == 0) {
618  htmllval.font = mkFont(0, HTML_BF, 0);
619  state.tok = T_bold;
620  } else if (strcasecmp(name, "S") == 0) {
621  htmllval.font = mkFont(0, HTML_S, 0);
622  state.tok = T_s;
623  } else if (strcasecmp(name, "U") == 0) {
624  htmllval.font = mkFont(0, HTML_UL, 1);
625  state.tok = T_underline;
626  } else if (strcasecmp(name, "I") == 0) {
627  htmllval.font = mkFont(0, HTML_IF, 0);
628  state.tok = T_italic;
629  } else if (strcasecmp(name, "SUP") == 0) {
630  htmllval.font = mkFont(0, HTML_SUP, 0);
631  state.tok = T_sup;
632  } else if (strcasecmp(name, "SUB") == 0) {
633  htmllval.font = mkFont(0, HTML_SUB, 0);
634  state.tok = T_sub;
635  } else if (strcasecmp(name, "BR") == 0) {
636  mkBR(atts);
637  state.tok = T_br;
638  } else if (strcasecmp(name, "HR") == 0) {
639  state.tok = T_hr;
640  } else if (strcasecmp(name, "VR") == 0) {
641  state.tok = T_vr;
642  } else if (strcasecmp(name, "IMG") == 0) {
643  htmllval.img = mkImg(atts);
644  state.tok = T_img;
645  } else if (strcasecmp(name, "HTML") == 0) {
646  state.tok = T_html;
647  } else {
648  lexerror(name);
649  }
650 }
651 
652 static void endElement(void *user, const char *name)
653 {
654  if (strcasecmp(name, "TABLE") == 0) {
655  state.tok = T_end_table;
656  state.inCell = 1;
657  } else if ((strcasecmp(name, "TR") == 0)
658  || (strcasecmp(name, "TH") == 0)) {
659  state.tok = T_end_row;
660  } else if (strcasecmp(name, "TD") == 0) {
661  state.tok = T_end_cell;
662  state.inCell = 0;
663  } else if (strcasecmp(name, "HTML") == 0) {
664  state.tok = T_end_html;
665  } else if (strcasecmp(name, "FONT") == 0) {
666  state.tok = T_end_font;
667  } else if (strcasecmp(name, "B") == 0) {
668  state.tok = T_n_bold;
669  } else if (strcasecmp(name, "U") == 0) {
670  state.tok = T_n_underline;
671  } else if (strcasecmp(name, "I") == 0) {
672  state.tok = T_n_italic;
673  } else if (strcasecmp(name, "SUP") == 0) {
674  state.tok = T_n_sup;
675  } else if (strcasecmp(name, "SUB") == 0) {
676  state.tok = T_n_sub;
677  } else if (strcasecmp(name, "S") == 0) {
678  state.tok = T_n_s;
679  } else if (strcasecmp(name, "BR") == 0) {
680  if (state.tok == T_br)
681  state.tok = T_BR;
682  else
683  state.tok = T_end_br;
684  } else if (strcasecmp(name, "HR") == 0) {
685  if (state.tok == T_hr)
686  state.tok = T_HR;
687  else
688  state.tok = T_end_hr;
689  } else if (strcasecmp(name, "VR") == 0) {
690  if (state.tok == T_vr)
691  state.tok = T_VR;
692  else
693  state.tok = T_end_vr;
694  } else if (strcasecmp(name, "IMG") == 0) {
695  if (state.tok == T_img)
696  state.tok = T_IMG;
697  else
698  state.tok = T_end_img;
699  } else {
700  lexerror(name);
701  }
702 }
703 
704 /* characterData:
705  * Generate T_string token. Do this only when immediately in
706  * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
707  * Strip out formatting characters but keep spaces.
708  * Distinguish between all whitespace vs. strings with non-whitespace
709  * characters.
710  */
711 static void characterData(void *user, const char *s, int length)
712 {
713  int i, rc, cnt = 0;
714  unsigned char c;
715 
716  if (state.inCell) {
717  for (i = length; i; i--) {
718  c = *s++;
719  if (c >= ' ') {
720  cnt++;
721  rc = agxbputc(state.xb, c);
722  }
723  }
724  if (cnt) state.tok = T_string;
725  }
726 }
727 #endif
728 
729 int initHTMLlexer(char *src, agxbuf * xb, int charset)
730 {
731 #ifdef HAVE_EXPAT
732  state.xb = xb;
733  agxbinit (&state.lb, SMALLBUF, NULL);
734  state.ptr = src;
735  state.mode = 0;
736  state.warn = 0;
737  state.error = 0;
738  state.currtoklen = 0;
739  state.prevtoklen = 0;
740  state.inCell = 1;
741  state.parser = XML_ParserCreate(charsetToStr(charset));
742  XML_SetElementHandler(state.parser,
743  (XML_StartElementHandler) startElement,
744  endElement);
745  XML_SetCharacterDataHandler(state.parser, characterData);
746  return 0;
747 #else
748  static int first;
749  if (!first) {
750  agerr(AGWARN,
751  "Not built with libexpat. Table formatting is not available.\n");
752  first++;
753  }
754  return 1;
755 #endif
756 }
757 
/* clearHTMLlexer:
 * Release the resources acquired by initHTMLlexer: the expat parser
 * and the translation buffer state.lb.
 * With expat, returns non-zero if any warning or error was recorded
 * while lexing (state.warn | state.error); without expat, returns 1.
 *
 * NOTE(review): the function header was missing from the listing and
 * has been restored; the name is not visible elsewhere in this file -
 * confirm it against the declaration in htmllex.h.
 */
int clearHTMLlexer()
{
#ifdef HAVE_EXPAT
    int rv = state.warn | state.error;
    XML_ParserFree(state.parser);
    agxbfree (&state.lb);
    return rv;
#else
    return 1;
#endif
}
769 
770 #ifdef HAVE_EXPAT
771 /* eatComment:
772  * Given first character after open comment, eat characters
773  * upto comment close, returning pointer to closing > if it exists,
774  * or null character otherwise.
775  * We rely on HTML strings having matched nested <>.
776  */
777 static char *eatComment(char *p)
778 {
779  int depth = 1;
780  char *s = p;
781  char c;
782 
783  while (depth && (c = *s++)) {
784  if (c == '<')
785  depth++;
786  else if (c == '>')
787  depth--;
788  }
789  s--; /* move back to '\0' or '>' */
790  if (*s) {
791  char *t = s - 2;
792  if ((t < p) || strncmp(t, "--", 2)) {
793  agerr(AGWARN, "Unclosed comment\n");
794  state.warn = 1;
795  }
796  }
797  return s;
798 }
799 
800 /* findNext:
801  * Return next XML unit. This is either <..>, an HTML
802  * comment <!-- ... -->, or characters up to next <.
803  */
804 static char *findNext(char *s, agxbuf* xb)
805 {
806  char* t = s + 1;
807  char c;
808  int rc;
809 
810  if (*s == '<') {
811  if ((*t == '!') && !strncmp(t + 1, "--", 2))
812  t = eatComment(t + 3);
813  else
814  while (*t && (*t != '>'))
815  t++;
816  if (*t != '>') {
817  agerr(AGWARN, "Label closed before end of HTML element\n");
818  state.warn = 1;
819  } else
820  t++;
821  } else {
822  t = s;
823  while ((c = *t) && (c != '<')) {
824  if ((c == '&') && (*(t+1) != '#')) {
825  t = scanEntity(t + 1, xb);
826  }
827  else {
828  rc = agxbputc(xb, c);
829  t++;
830  }
831  }
832  }
833  return t;
834 }
835 #endif
836 
/* htmllineno:
 * Return the current line number reported by the expat parser, for
 * use in diagnostics; returns 0 when built without expat.
 *
 * The function header was missing from the listing and has been
 * restored from its call sites, which pass the result to a "%d"
 * format.
 */
int htmllineno()
{
#ifdef HAVE_EXPAT
    return XML_GetCurrentLineNumber(state.parser);
#else
    return 0;
#endif
}
845 
846 #ifdef DEBUG
847 static void printTok(int tok)
848 {
849  char *s;
850 
851  switch (tok) {
852  case T_VR:
853  s = "T_VR";
854  break;
855  case T_vr:
856  s = "T_vr";
857  break;
858  case T_end_vr:
859  s = "T_end_vr";
860  break;
861  case T_HR:
862  s = "T_HR";
863  break;
864  case T_hr:
865  s = "T_hr";
866  break;
867  case T_end_hr:
868  s = "T_end_hr";
869  break;
870  case T_BR:
871  s = "T_BR";
872  break;
873  case T_br:
874  s = "T_br";
875  break;
876  case T_end_br:
877  s = "T_end_br";
878  break;
879  case T_end_table:
880  s = "T_end_table";
881  break;
882  case T_row:
883  s = "T_row";
884  break;
885  case T_end_row:
886  s = "T_end_row";
887  break;
888  case T_end_cell:
889  s = "T_end_cell";
890  break;
891  case T_html:
892  s = "T_html";
893  break;
894  case T_end_html:
895  s = "T_end_html";
896  break;
897  case T_string:
898  s = "T_string";
899  break;
900  case T_error:
901  s = "T_error";
902  break;
903  case T_table:
904  s = "T_table";
905  break;
906  case T_cell:
907  s = "T_cell";
908  break;
909  case T_img:
910  s = "T_img";
911  break;
912  case T_end_img:
913  s = "T_end_img";
914  break;
915  case T_IMG:
916  s = "T_IMG";
917  break;
918  case T_underline:
919  s = "T_underline";
920  break;
921  case T_n_underline:
922  s = "T_n_underline";
923  break;
924  case T_italic:
925  s = "T_italic";
926  break;
927  case T_n_italic:
928  s = "T_n_italic";
929  break;
930  case T_bold:
931  s = "T_bold";
932  break;
933  case T_n_bold:
934  s = "T_n_bold";
935  break;
936  case T_s:
937  s = "T_s";
938  break;
939  case T_n_s:
940  s = "T_n_s";
941  break;
942  default:
943  s = "<unknown>";
944  }
945  if (tok == T_string) {
946  fprintf(stderr, "%s \"", s);
947  fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr);
948  fprintf(stderr, "\"\n");
949  } else
950  fprintf(stderr, "%s\n", s);
951 }
952 
953 #endif
954 
/* htmllex:
 * Lexer entry point for the HTML-label grammar. Returns the next
 * token, or EOF once the closing </HTML> wrapper has been consumed
 * (and always EOF when built without expat).
 *
 * The label text is wrapped in an artificial <HTML>..</HTML> pair,
 * tracked by state.mode:
 *   0 - nothing fed yet: feed "<HTML>" and move to mode 1
 *   1 - feeding label text, one unit (see findNext) per iteration;
 *       at end of text feed "</HTML>" and move to mode 2
 *   2 - everything has been fed: return EOF
 * Each unit is passed to XML_Parse; the expat callbacks set state.tok
 * when the unit yields a token, and the loop repeats until one does.
 * If findNext put translated text into state.lb, that text is parsed
 * instead of the raw unit. The raw unit is remembered in
 * currtok/prevtok for error_context(). The first parse error is
 * reported and turned into T_error.
 */
int htmllex()
{
#ifdef HAVE_EXPAT
    static char *begin_html = "<HTML>";
    static char *end_html = "</HTML>";

    char *unit;
    char *next = 0;		/* where scanning resumes; 0 if unchanged */
    int unitlen, buflen;
    int status;

    state.tok = 0;
    do {
	switch (state.mode) {
	case 2:
	    return EOF;
	case 0:
	    state.mode = 1;
	    unit = begin_html;
	    unitlen = strlen(unit);
	    next = 0;
	    break;
	default:
	    unit = state.ptr;
	    if (*unit == '\0') {
		state.mode = 2;
		unit = end_html;
		unitlen = strlen(unit);
	    } else {
		next = findNext(unit, &state.lb);
		unitlen = next - unit;
	    }
	    break;
	}

	/* shift the token history used for error context */
	state.prevtok = state.currtok;
	state.prevtoklen = state.currtoklen;
	state.currtok = unit;
	state.currtoklen = unitlen;

	buflen = agxblen(&state.lb);
	if (buflen != 0)
	    status = XML_Parse(state.parser, agxbuse(&state.lb), buflen, 0);
	else
	    status = XML_Parse(state.parser, unit, unitlen,
			       (unitlen ? 0 : 1));

	if (status == XML_STATUS_ERROR && !state.error) {
	    agerr(AGERR, "%s in line %d \n",
		  XML_ErrorString(XML_GetErrorCode(state.parser)),
		  htmllineno());
	    error_context();
	    state.error = 1;
	    state.tok = T_error;
	}
	if (next)
	    state.ptr = next;
    } while (state.tok == 0);
    /* printTok (state.tok); */
    return state.tok;
#else
    return EOF;
#endif
}
1013