Graphviz  2.39.20141222.0545
htmllex.c
Go to the documentation of this file.
1 /* $Id$ $Revision$ */
2 /* vim:set shiftwidth=4 ts=8: */
3 
4 /*************************************************************************
5  * Copyright (c) 2011 AT&T Intellectual Property
6  * All rights reserved. This program and the accompanying materials
7  * are made available under the terms of the Eclipse Public License v1.0
8  * which accompanies this distribution, and is available at
9  * http://www.eclipse.org/legal/epl-v10.html
10  *
11  * Contributors: See CVS logs. Details at http://www.graphviz.org/
12  *************************************************************************/
13 
14 
15 #include "render.h"
16 #include "htmltable.h"
17 #include "htmlparse.h"
18 #include "htmllex.h"
19 #include "cdt.h"
20 #include <ctype.h>
21 
22 #ifdef HAVE_EXPAT
23 #include <expat.h>
24 #endif
25 
26 #ifndef XML_STATUS_ERROR
27 #define XML_STATUS_ERROR 0
28 #endif
29 
30 typedef struct {
31 #ifdef HAVE_EXPAT
32  XML_Parser parser;
33 #endif
34  char* ptr; /* input source */
35  int tok; /* token type */
36  agxbuf* xb; /* buffer to gather T_string data */
37  agxbuf lb; /* buffer for translating lexical data */
38  char warn; /* set if warning given */
39  char error; /* set if error given */
40  char inCell; /* set if in TD to allow T_string */
41  char mode; /* for handling artificial <HTML>..</HTML> */
42  char *currtok; /* for error reporting */
43  char *prevtok; /* for error reporting */
46 } lexstate_t;
47 static lexstate_t state;
48 
49 /* error_context:
50  * Print the last 2 "token"s seen.
51  */
52 static void error_context(void)
53 {
54  agxbclear(state.xb);
55  if (state.prevtoklen > 0)
56  agxbput_n(state.xb, state.prevtok, state.prevtoklen);
57  agxbput_n(state.xb, state.currtok, state.currtoklen);
58  agerr(AGPREV, "... %s ...\n", agxbuse(state.xb));
59 }
60 
61 /* htmlerror:
62  * yyerror - called by yacc output
63  */
64 void htmlerror(const char *msg)
65 {
66  if (state.error)
67  return;
68  state.error = 1;
69  agerr(AGERR, "%s in line %d \n", msg, htmllineno());
70  error_context();
71 }
72 
73 #ifdef HAVE_EXPAT
74 /* lexerror:
75  * called by lexer when unknown <..> is found.
76  */
77 static void lexerror(const char *name)
78 {
79  state.tok = T_error;
80  state.error = 1;
81  agerr(AGERR, "Unknown HTML element <%s> on line %d \n",
82  name, htmllineno());
83 }
84 
/* Attribute action: receives the object being filled in and the
 * attribute value. The result is OR-ed into state.warn by doAttrs,
 * so non-zero means a warning was issued.
 */
typedef int (*attrFn) (void *, char *);
/* Comparison function type expected by bsearch */
typedef int (*bcmpfn) (const void *, const void *);

/* Limits of the narrow integer types used to range-check attribute values */
#define MAX_UCHAR ((unsigned char)(~0))
#define MAX_USHORT ((unsigned short)(~0))
#define MAX_CHAR (MAX_UCHAR >> 1)
#define MIN_CHAR ((signed char)(~MAX_CHAR))
92 
93 /* Mechanism for automatically processing attributes */
94 typedef struct {
95  char *name; /* attribute name */
96  attrFn action; /* action to perform if name matches */
97 } attr_item;
98 
99 #define ISIZE (sizeof(attr_item))
100 
101 /* icmp:
102  * Compare two attr_item. Used in bsearch
103  */
104 static int icmp(attr_item * i, attr_item * j)
105 {
106  return strcasecmp(i->name, j->name);
107 }
108 
109 static int bgcolorfn(htmldata_t * p, char *v)
110 {
111  p->bgcolor = strdup(v);
112  return 0;
113 }
114 
115 static int pencolorfn(htmldata_t * p, char *v)
116 {
117  p->pencolor = strdup(v);
118  return 0;
119 }
120 
121 static int hreffn(htmldata_t * p, char *v)
122 {
123  p->href = strdup(v);
124  return 0;
125 }
126 
127 static int sidesfn(htmldata_t * p, char *v)
128 {
129  unsigned short flags = 0;
130  char c;
131 
132  while ((c = *v++)) {
133  switch (tolower(c)) {
134  case 'l' :
135  flags |= BORDER_LEFT;
136  break;
137  case 't' :
138  flags |= BORDER_TOP;
139  break;
140  case 'r' :
141  flags |= BORDER_RIGHT;
142  break;
143  case 'b' :
144  flags |= BORDER_BOTTOM;
145  break;
146  default :
147  agerr(AGWARN, "Unrecognized character '%c' (%d) in sides attribute\n", c, c);
148  break;
149  }
150  }
151  if (flags != BORDER_MASK)
152  p->flags |= flags;
153  return 0;
154 }
155 
156 static int titlefn(htmldata_t * p, char *v)
157 {
158  p->title = strdup(v);
159  return 0;
160 }
161 
162 static int portfn(htmldata_t * p, char *v)
163 {
164  p->port = strdup(v);
165  return 0;
166 }
167 
168 #define DELIM " ,"
169 
170 static int stylefn(htmldata_t * p, char *v)
171 {
172  int rv = 0;
173  char c;
174  char* tk;
175  char* buf = strdup (v);
176  for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) {
177  c = toupper(*tk);
178  if (c == 'R') {
179  if (!strcasecmp(tk + 1, "OUNDED")) p->style |= ROUNDED;
180  else if (!strcasecmp(tk + 1, "ADIAL")) p->style |= RADIAL;
181  else {
182  agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
183  rv = 1;
184  }
185  }
186  else if(!strcasecmp(tk,"SOLID")) p->style &= ~(DOTTED|DASHED);
187  else if(!strcasecmp(tk,"INVISIBLE") || !strcasecmp(tk,"INVIS")) p->style |= INVISIBLE;
188  else if(!strcasecmp(tk,"DOTTED")) p->style |= DOTTED;
189  else if(!strcasecmp(tk,"DASHED")) p->style |= DASHED;
190  else {
191  agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
192  rv = 1;
193  }
194  }
195  free (buf);
196  return rv;
197 }
198 
199 static int targetfn(htmldata_t * p, char *v)
200 {
201  p->target = strdup(v);
202  return 0;
203 }
204 
205 static int idfn(htmldata_t * p, char *v)
206 {
207  p->id = strdup(v);
208  return 0;
209 }
210 
211 
212 /* doInt:
213  * Scan v for integral value. Check that
214  * the value is >= min and <= max. Return value in ul.
215  * String s is name of value.
216  * Return 0 if okay; 1 otherwise.
217  */
218 static int doInt(char *v, char *s, int min, int max, long *ul)
219 {
220  int rv = 0;
221  char *ep;
222  long b = strtol(v, &ep, 10);
223 
224  if (ep == v) {
225  agerr(AGWARN, "Improper %s value %s - ignored", s, v);
226  rv = 1;
227  } else if (b > max) {
228  agerr(AGWARN, "%s value %s > %d - too large - ignored", s, v, max);
229  rv = 1;
230  } else if (b < min) {
231  agerr(AGWARN, "%s value %s < %d - too small - ignored", s, v, min);
232  rv = 1;
233  } else
234  *ul = b;
235  return rv;
236 }
237 
238 
239 static int gradientanglefn(htmldata_t * p, char *v)
240 {
241  long u;
242 
243  if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
244  return 1;
245  p->gradientangle = (unsigned short) u;
246  return 0;
247 }
248 
249 
250 static int borderfn(htmldata_t * p, char *v)
251 {
252  long u;
253 
254  if (doInt(v, "BORDER", 0, MAX_UCHAR, &u))
255  return 1;
256  p->border = (unsigned char) u;
257  p->flags |= BORDER_SET;
258  return 0;
259 }
260 
261 static int cellpaddingfn(htmldata_t * p, char *v)
262 {
263  long u;
264 
265  if (doInt(v, "CELLPADDING", 0, MAX_UCHAR, &u))
266  return 1;
267  p->pad = (unsigned char) u;
268  p->flags |= PAD_SET;
269  return 0;
270 }
271 
272 static int cellspacingfn(htmldata_t * p, char *v)
273 {
274  long u;
275 
276  if (doInt(v, "CELLSPACING", MIN_CHAR, MAX_CHAR, &u))
277  return 1;
278  p->space = (signed char) u;
279  p->flags |= SPACE_SET;
280  return 0;
281 }
282 
283 static int cellborderfn(htmltbl_t * p, char *v)
284 {
285  long u;
286 
287  if (doInt(v, "CELLSBORDER", 0, MAX_CHAR, &u))
288  return 1;
289  p->cb = (unsigned char) u;
290  return 0;
291 }
292 
293 static int columnsfn(htmltbl_t * p, char *v)
294 {
295  if (*v != '*') {
296  agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n", v);
297  return 1;
298  }
299  p->flags |= HTML_VRULE;
300  return 0;
301 }
302 
303 static int rowsfn(htmltbl_t * p, char *v)
304 {
305  if (*v != '*') {
306  agerr(AGWARN, "Unknown value %s for ROWS - ignored\n", v);
307  return 1;
308  }
309  p->flags |= HTML_HRULE;
310  return 0;
311 }
312 
313 static int fixedsizefn(htmldata_t * p, char *v)
314 {
315  int rv = 0;
316  char c = toupper(*(unsigned char *) v);
317  if ((c == 'T') && !strcasecmp(v + 1, "RUE"))
318  p->flags |= FIXED_FLAG;
319  else if ((c != 'F') || strcasecmp(v + 1, "ALSE")) {
320  agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n", v);
321  rv = 1;
322  }
323  return rv;
324 }
325 
326 static int valignfn(htmldata_t * p, char *v)
327 {
328  int rv = 0;
329  char c = toupper(*v);
330  if ((c == 'B') && !strcasecmp(v + 1, "OTTOM"))
331  p->flags |= VALIGN_BOTTOM;
332  else if ((c == 'T') && !strcasecmp(v + 1, "OP"))
333  p->flags |= VALIGN_TOP;
334  else if ((c != 'M') || strcasecmp(v + 1, "IDDLE")) {
335  agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n", v);
336  rv = 1;
337  }
338  return rv;
339 }
340 
341 static int halignfn(htmldata_t * p, char *v)
342 {
343  int rv = 0;
344  char c = toupper(*v);
345  if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
346  p->flags |= HALIGN_LEFT;
347  else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
348  p->flags |= HALIGN_RIGHT;
349  else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) {
350  agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
351  rv = 1;
352  }
353  return rv;
354 }
355 
356 static int cell_halignfn(htmldata_t * p, char *v)
357 {
358  int rv = 0;
359  char c = toupper(*v);
360  if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
361  p->flags |= HALIGN_LEFT;
362  else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
363  p->flags |= HALIGN_RIGHT;
364  else if ((c == 'T') && !strcasecmp(v + 1, "EXT"))
365  p->flags |= HALIGN_TEXT;
366  else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
367  rv = 1;
368  if (rv)
369  agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n", v);
370  return rv;
371 }
372 
373 static int balignfn(htmldata_t * p, char *v)
374 {
375  int rv = 0;
376  char c = toupper(*v);
377  if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
378  p->flags |= BALIGN_LEFT;
379  else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
380  p->flags |= BALIGN_RIGHT;
381  else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
382  rv = 1;
383  if (rv)
384  agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n", v);
385  return rv;
386 }
387 
388 static int heightfn(htmldata_t * p, char *v)
389 {
390  long u;
391 
392  if (doInt(v, "HEIGHT", 0, MAX_USHORT, &u))
393  return 1;
394  p->height = (unsigned short) u;
395  return 0;
396 }
397 
398 static int widthfn(htmldata_t * p, char *v)
399 {
400  long u;
401 
402  if (doInt(v, "WIDTH", 0, MAX_USHORT, &u))
403  return 1;
404  p->width = (unsigned short) u;
405  return 0;
406 }
407 
408 static int rowspanfn(htmlcell_t * p, char *v)
409 {
410  long u;
411 
412  if (doInt(v, "ROWSPAN", 0, MAX_USHORT, &u))
413  return 1;
414  if (u == 0) {
415  agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n");
416  return 1;
417  }
418  p->rspan = (unsigned short) u;
419  return 0;
420 }
421 
422 static int colspanfn(htmlcell_t * p, char *v)
423 {
424  long u;
425 
426  if (doInt(v, "COLSPAN", 0, MAX_USHORT, &u))
427  return 1;
428  if (u == 0) {
429  agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n");
430  return 1;
431  }
432  p->cspan = (unsigned short) u;
433  return 0;
434 }
435 
436 static int fontcolorfn(textfont_t * p, char *v)
437 {
438  p->color = v;
439  return 0;
440 }
441 
442 static int facefn(textfont_t * p, char *v)
443 {
444  p->name = v;
445  return 0;
446 }
447 
448 static int ptsizefn(textfont_t * p, char *v)
449 {
450  long u;
451 
452  if (doInt(v, "POINT-SIZE", 0, MAX_UCHAR, &u))
453  return 1;
454  p->size = (double) u;
455  return 0;
456 }
457 
458 static int srcfn(htmlimg_t * p, char *v)
459 {
460  p->src = strdup(v);
461  return 0;
462 }
463 
464 static int scalefn(htmlimg_t * p, char *v)
465 {
466  p->scale = strdup(v);
467  return 0;
468 }
469 
470 static int alignfn(int *p, char *v)
471 {
472  int rv = 0;
473  char c = toupper(*v);
474  if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
475  *p = 'r';
476  else if ((c == 'L') || !strcasecmp(v + 1, "EFT"))
477  *p = 'l';
478  else if ((c == 'C') || strcasecmp(v + 1, "ENTER"))
479  *p = 'n';
480  else {
481  agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
482  rv = 1;
483  }
484  return rv;
485 }
486 
487 /* Tables used in binary search; MUST be alphabetized */
488 static attr_item tbl_items[] = {
489  {"align", (attrFn) halignfn},
490  {"bgcolor", (attrFn) bgcolorfn},
491  {"border", (attrFn) borderfn},
492  {"cellborder", (attrFn) cellborderfn},
493  {"cellpadding", (attrFn) cellpaddingfn},
494  {"cellspacing", (attrFn) cellspacingfn},
495  {"color", (attrFn) pencolorfn},
496  {"columns", (attrFn) columnsfn},
497  {"fixedsize", (attrFn) fixedsizefn},
498  {"gradientangle", (attrFn) gradientanglefn},
499  {"height", (attrFn) heightfn},
500  {"href", (attrFn) hreffn},
501  {"id", (attrFn) idfn},
502  {"port", (attrFn) portfn},
503  {"rows", (attrFn) rowsfn},
504  {"sides", (attrFn) sidesfn},
505  {"style", (attrFn) stylefn},
506  {"target", (attrFn) targetfn},
507  {"title", (attrFn) titlefn},
508  {"tooltip", (attrFn) titlefn},
509  {"valign", (attrFn) valignfn},
510  {"width", (attrFn) widthfn},
511 };
512 
513 static attr_item cell_items[] = {
514  {"align", (attrFn) cell_halignfn},
515  {"balign", (attrFn) balignfn},
516  {"bgcolor", (attrFn) bgcolorfn},
517  {"border", (attrFn) borderfn},
518  {"cellpadding", (attrFn) cellpaddingfn},
519  {"cellspacing", (attrFn) cellspacingfn},
520  {"color", (attrFn) pencolorfn},
521  {"colspan", (attrFn) colspanfn},
522  {"fixedsize", (attrFn) fixedsizefn},
523  {"gradientangle", (attrFn) gradientanglefn},
524  {"height", (attrFn) heightfn},
525  {"href", (attrFn) hreffn},
526  {"id", (attrFn) idfn},
527  {"port", (attrFn) portfn},
528  {"rowspan", (attrFn) rowspanfn},
529  {"sides", (attrFn) sidesfn},
530  {"style", (attrFn) stylefn},
531  {"target", (attrFn) targetfn},
532  {"title", (attrFn) titlefn},
533  {"tooltip", (attrFn) titlefn},
534  {"valign", (attrFn) valignfn},
535  {"width", (attrFn) widthfn},
536 };
537 
538 static attr_item font_items[] = {
539  {"color", (attrFn) fontcolorfn},
540  {"face", (attrFn) facefn},
541  {"point-size", (attrFn) ptsizefn},
542 };
543 
544 static attr_item img_items[] = {
545  {"scale", (attrFn) scalefn},
546  {"src", (attrFn) srcfn},
547 };
548 
549 static attr_item br_items[] = {
550  {"align", (attrFn) alignfn},
551 };
552 
553 /* doAttrs:
554  * General function for processing list of name/value attributes.
555  * Do binary search on items table. If match found, invoke action
556  * passing it tp and attribute value.
557  * Table size is given by nel
558  * Name/value pairs are in array atts, which is null terminated.
559  * s is the name of the HTML element being processed.
560  */
561 static void
562 doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s)
563 {
564  char *name;
565  char *val;
566  attr_item *ip;
567  attr_item key;
568 
569  while ((name = *atts++) != NULL) {
570  val = *atts++;
571  key.name = name;
572  ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp);
573  if (ip)
574  state.warn |= ip->action(tp, val);
575  else {
576  agerr(AGWARN, "Illegal attribute %s in %s - ignored\n", name,
577  s);
578  state.warn = 1;
579  }
580  }
581 }
582 
583 static void mkBR(char **atts)
584 {
586  doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
587 }
588 
589 static htmlimg_t *mkImg(char **atts)
590 {
591  htmlimg_t *img = NEW(htmlimg_t);
592 
593  doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
594 
595  return img;
596 }
597 
598 static textfont_t *mkFont(GVC_t *gvc, char **atts, int flags, int ul)
599 {
600  textfont_t tf = {NULL,NULL,NULL,0.0,0,0};
601 
602  tf.size = -1.0; /* unassigned */
603  tf.flags = flags;
604  if (atts)
605  doAttrs(&tf, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
606 
607  return dtinsert(gvc->textfont_dt, &tf);
608 }
609 
610 static htmlcell_t *mkCell(char **atts)
611 {
613 
614  cell->cspan = 1;
615  cell->rspan = 1;
616  doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
617 
618  return cell;
619 }
620 
621 static htmltbl_t *mkTbl(char **atts)
622 {
623  htmltbl_t *tbl = NEW(htmltbl_t);
624 
625  tbl->rc = -1; /* flag that table is a raw, parsed table */
626  tbl->cb = -1; /* unset cell border attribute */
627  doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
628 
629  return tbl;
630 }
631 
632 static void startElement(void *user, const char *name, char **atts)
633 {
634  GVC_t *gvc = (GVC_t*)user;
635 
636  if (strcasecmp(name, "TABLE") == 0) {
637  htmllval.tbl = mkTbl(atts);
638  state.inCell = 0;
639  state.tok = T_table;
640  } else if ((strcasecmp(name, "TR") == 0)
641  || (strcasecmp(name, "TH") == 0)) {
642  state.inCell = 0;
643  state.tok = T_row;
644  } else if (strcasecmp(name, "TD") == 0) {
645  state.inCell = 1;
646  htmllval.cell = mkCell(atts);
647  state.tok = T_cell;
648  } else if (strcasecmp(name, "FONT") == 0) {
649  htmllval.font = mkFont(gvc, atts, 0, 0);
650  state.tok = T_font;
651  } else if (strcasecmp(name, "B") == 0) {
652  htmllval.font = mkFont(gvc, 0, HTML_BF, 0);
653  state.tok = T_bold;
654  } else if (strcasecmp(name, "S") == 0) {
655  htmllval.font = mkFont(gvc, 0, HTML_S, 0);
656  state.tok = T_s;
657  } else if (strcasecmp(name, "U") == 0) {
658  htmllval.font = mkFont(gvc, 0, HTML_UL, 1);
659  state.tok = T_underline;
660  } else if (strcasecmp(name, "O") == 0) {
661  htmllval.font = mkFont(gvc, 0, HTML_OL, 1);
662  state.tok = T_overline;
663  } else if (strcasecmp(name, "I") == 0) {
664  htmllval.font = mkFont(gvc, 0, HTML_IF, 0);
665  state.tok = T_italic;
666  } else if (strcasecmp(name, "SUP") == 0) {
667  htmllval.font = mkFont(gvc, 0, HTML_SUP, 0);
668  state.tok = T_sup;
669  } else if (strcasecmp(name, "SUB") == 0) {
670  htmllval.font = mkFont(gvc, 0, HTML_SUB, 0);
671  state.tok = T_sub;
672  } else if (strcasecmp(name, "BR") == 0) {
673  mkBR(atts);
674  state.tok = T_br;
675  } else if (strcasecmp(name, "HR") == 0) {
676  state.tok = T_hr;
677  } else if (strcasecmp(name, "VR") == 0) {
678  state.tok = T_vr;
679  } else if (strcasecmp(name, "IMG") == 0) {
680  htmllval.img = mkImg(atts);
681  state.tok = T_img;
682  } else if (strcasecmp(name, "HTML") == 0) {
683  state.tok = T_html;
684  } else {
685  lexerror(name);
686  }
687 }
688 
689 static void endElement(void *user, const char *name)
690 {
691  if (strcasecmp(name, "TABLE") == 0) {
692  state.tok = T_end_table;
693  state.inCell = 1;
694  } else if ((strcasecmp(name, "TR") == 0)
695  || (strcasecmp(name, "TH") == 0)) {
696  state.tok = T_end_row;
697  } else if (strcasecmp(name, "TD") == 0) {
698  state.tok = T_end_cell;
699  state.inCell = 0;
700  } else if (strcasecmp(name, "HTML") == 0) {
701  state.tok = T_end_html;
702  } else if (strcasecmp(name, "FONT") == 0) {
703  state.tok = T_end_font;
704  } else if (strcasecmp(name, "B") == 0) {
705  state.tok = T_n_bold;
706  } else if (strcasecmp(name, "U") == 0) {
707  state.tok = T_n_underline;
708  } else if (strcasecmp(name, "O") == 0) {
709  state.tok = T_n_overline;
710  } else if (strcasecmp(name, "I") == 0) {
711  state.tok = T_n_italic;
712  } else if (strcasecmp(name, "SUP") == 0) {
713  state.tok = T_n_sup;
714  } else if (strcasecmp(name, "SUB") == 0) {
715  state.tok = T_n_sub;
716  } else if (strcasecmp(name, "S") == 0) {
717  state.tok = T_n_s;
718  } else if (strcasecmp(name, "BR") == 0) {
719  if (state.tok == T_br)
720  state.tok = T_BR;
721  else
722  state.tok = T_end_br;
723  } else if (strcasecmp(name, "HR") == 0) {
724  if (state.tok == T_hr)
725  state.tok = T_HR;
726  else
727  state.tok = T_end_hr;
728  } else if (strcasecmp(name, "VR") == 0) {
729  if (state.tok == T_vr)
730  state.tok = T_VR;
731  else
732  state.tok = T_end_vr;
733  } else if (strcasecmp(name, "IMG") == 0) {
734  if (state.tok == T_img)
735  state.tok = T_IMG;
736  else
737  state.tok = T_end_img;
738  } else {
739  lexerror(name);
740  }
741 }
742 
743 /* characterData:
744  * Generate T_string token. Do this only when immediately in
745  * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
746  * Strip out formatting characters but keep spaces.
747  * Distinguish between all whitespace vs. strings with non-whitespace
748  * characters.
749  */
750 static void characterData(void *user, const char *s, int length)
751 {
752  int i, rc, cnt = 0;
753  unsigned char c;
754 
755  if (state.inCell) {
756  for (i = length; i; i--) {
757  c = *s++;
758  if (c >= ' ') {
759  cnt++;
760  rc = agxbputc(state.xb, c);
761  }
762  }
763  if (cnt) state.tok = T_string;
764  }
765 }
766 #endif
767 
768 int initHTMLlexer(char *src, agxbuf * xb, htmlenv_t *env)
769 {
770 #ifdef HAVE_EXPAT
771  state.xb = xb;
772  agxbinit (&state.lb, SMALLBUF, NULL);
773  state.ptr = src;
774  state.mode = 0;
775  state.warn = 0;
776  state.error = 0;
777  state.currtoklen = 0;
778  state.prevtoklen = 0;
779  state.inCell = 1;
780  state.parser = XML_ParserCreate(charsetToStr(GD_charset(env->g)));
781  XML_SetUserData(state.parser, GD_gvc(env->g));
782  XML_SetElementHandler(state.parser,
783  (XML_StartElementHandler) startElement,
784  endElement);
785  XML_SetCharacterDataHandler(state.parser, characterData);
786  return 0;
787 #else
788  static int first;
789  if (!first) {
790  agerr(AGWARN,
791  "Not built with libexpat. Table formatting is not available.\n");
792  first++;
793  }
794  return 1;
795 #endif
796 }
797 
/* clearHTMLlexer:
 * Release the resources acquired by initHTMLlexer: the XML parser
 * and the translation buffer state.lb.
 * Returns non-zero if a warning or error was recorded while lexing,
 * 0 otherwise. Without expat, always returns 1.
 */
int clearHTMLlexer()
{
#ifdef HAVE_EXPAT
    int rv = state.warn | state.error;
    XML_ParserFree(state.parser);
    agxbfree(&state.lb);
    return rv;
#else
    return 1;
#endif
}
809 
810 #ifdef HAVE_EXPAT
811 /* eatComment:
812  * Given first character after open comment, eat characters
813  * upto comment close, returning pointer to closing > if it exists,
814  * or null character otherwise.
815  * We rely on HTML strings having matched nested <>.
816  */
817 static char *eatComment(char *p)
818 {
819  int depth = 1;
820  char *s = p;
821  char c;
822 
823  while (depth && (c = *s++)) {
824  if (c == '<')
825  depth++;
826  else if (c == '>')
827  depth--;
828  }
829  s--; /* move back to '\0' or '>' */
830  if (*s) {
831  char *t = s - 2;
832  if ((t < p) || strncmp(t, "--", 2)) {
833  agerr(AGWARN, "Unclosed comment\n");
834  state.warn = 1;
835  }
836  }
837  return s;
838 }
839 
840 /* findNext:
841  * Return next XML unit. This is either <..>, an HTML
842  * comment <!-- ... -->, or characters up to next <.
843  */
844 static char *findNext(char *s, agxbuf* xb)
845 {
846  char* t = s + 1;
847  char c;
848  int rc;
849 
850  if (*s == '<') {
851  if ((*t == '!') && !strncmp(t + 1, "--", 2))
852  t = eatComment(t + 3);
853  else
854  while (*t && (*t != '>'))
855  t++;
856  if (*t != '>') {
857  agerr(AGWARN, "Label closed before end of HTML element\n");
858  state.warn = 1;
859  } else
860  t++;
861  } else {
862  t = s;
863  while ((c = *t) && (c != '<')) {
864  if ((c == '&') && (*(t+1) != '#')) {
865  t = scanEntity(t + 1, xb);
866  }
867  else {
868  rc = agxbputc(xb, c);
869  t++;
870  }
871  }
872  }
873  return t;
874 }
875 #endif
876 
/* htmllineno:
 * Return the line number the XML parser is currently at, for use in
 * diagnostics. Without expat, always returns 0.
 */
int htmllineno()
{
#ifdef HAVE_EXPAT
    return XML_GetCurrentLineNumber(state.parser);
#else
    return 0;
#endif
}
885 
886 #ifdef DEBUG
887 static void printTok(int tok)
888 {
889  char *s;
890 
891  switch (tok) {
892  case T_VR:
893  s = "T_VR";
894  break;
895  case T_vr:
896  s = "T_vr";
897  break;
898  case T_end_vr:
899  s = "T_end_vr";
900  break;
901  case T_HR:
902  s = "T_HR";
903  break;
904  case T_hr:
905  s = "T_hr";
906  break;
907  case T_end_hr:
908  s = "T_end_hr";
909  break;
910  case T_BR:
911  s = "T_BR";
912  break;
913  case T_br:
914  s = "T_br";
915  break;
916  case T_end_br:
917  s = "T_end_br";
918  break;
919  case T_end_table:
920  s = "T_end_table";
921  break;
922  case T_row:
923  s = "T_row";
924  break;
925  case T_end_row:
926  s = "T_end_row";
927  break;
928  case T_end_cell:
929  s = "T_end_cell";
930  break;
931  case T_html:
932  s = "T_html";
933  break;
934  case T_end_html:
935  s = "T_end_html";
936  break;
937  case T_string:
938  s = "T_string";
939  break;
940  case T_error:
941  s = "T_error";
942  break;
943  case T_table:
944  s = "T_table";
945  break;
946  case T_cell:
947  s = "T_cell";
948  break;
949  case T_img:
950  s = "T_img";
951  break;
952  case T_end_img:
953  s = "T_end_img";
954  break;
955  case T_IMG:
956  s = "T_IMG";
957  break;
958  case T_underline:
959  s = "T_underline";
960  break;
961  case T_n_underline:
962  s = "T_n_underline";
963  break;
964  case T_overline:
965  s = "T_overline";
966  break;
967  case T_n_overline:
968  s = "T_n_overline";
969  break;
970  case T_italic:
971  s = "T_italic";
972  break;
973  case T_n_italic:
974  s = "T_n_italic";
975  break;
976  case T_bold:
977  s = "T_bold";
978  break;
979  case T_n_bold:
980  s = "T_n_bold";
981  break;
982  case T_s:
983  s = "T_s";
984  break;
985  case T_n_s:
986  s = "T_n_s";
987  break;
988  default:
989  s = "<unknown>";
990  }
991  if (tok == T_string) {
992  fprintf(stderr, "%s \"", s);
993  fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr);
994  fprintf(stderr, "\"\n");
995  } else
996  fprintf(stderr, "%s\n", s);
997 }
998 
999 #endif
1000 
/* htmllex:
 * Lexer entry point: return the next token, or EOF at end of input.
 * The label text is wrapped in an artificial <HTML>..</HTML> pair,
 * tracked by state.mode: 0 = "<HTML>" not yet fed to the parser,
 * 1 = scanning the label, 2 = "</HTML>" has been fed.
 * Each pass feeds one unit (as delimited by findNext) to the XML
 * parser, whose handlers set state.tok; units that produce no token
 * are skipped. If findNext left translated text in state.lb, that
 * text is parsed instead of the raw input.
 * The first parse error is reported with its context and yields
 * T_error.
 * Without expat, always returns EOF.
 */
int htmllex()
{
#ifdef HAVE_EXPAT
    static char *begin_html = "<HTML>";
    static char *end_html = "</HTML>";

    char *s;
    char *endp = 0;
    int len, llen;
    int rv;

    state.tok = 0;
    while (state.tok == 0) {
        switch (state.mode) {
        case 2:
            return EOF;
        case 0:
            state.mode = 1;
            s = begin_html;
            len = strlen(s);
            endp = 0;
            break;
        default:
            s = state.ptr;
            if (*s != '\0') {
                endp = findNext(s, &state.lb);
                len = endp - s;
            } else {
                state.mode = 2;
                s = end_html;
                len = strlen(s);
            }
            break;
        }

        /* remember the last two units for error_context */
        state.prevtok = state.currtok;
        state.prevtoklen = state.currtoklen;
        state.currtok = s;
        state.currtoklen = len;

        llen = agxblen(&state.lb);
        if (llen != 0)
            rv = XML_Parse(state.parser, agxbuse(&state.lb), llen, 0);
        else
            rv = XML_Parse(state.parser, s, len, (len == 0));

        if ((rv == XML_STATUS_ERROR) && !state.error) {
            agerr(AGERR, "%s in line %d \n",
                  XML_ErrorString(XML_GetErrorCode(state.parser)),
                  htmllineno());
            error_context();
            state.error = 1;
            state.tok = T_error;
        }
        if (endp)
            state.ptr = endp;
    }
    /* printTok (state.tok); */
    return state.tok;
#else
    return EOF;
#endif
}
1059 
#define DOTTED
Definition: const.h:216
#define HTML_VRULE
Definition: htmltable.h:95
#define T_font
Definition: htmlparse.c:575
char warn
Definition: htmllex.c:38
unsigned short rspan
Definition: htmltable.h:134
Definition: cgraph.h:389
#define T_hr
Definition: htmlparse.c:564
#define PAD_SET
Definition: htmltable.h:30
#define INVISIBLE
Definition: const.h:214
#define agxbuse(X)
Definition: agxbuf.h:74
#define T_end_font
Definition: htmlparse.c:553
#define HTML_S
Definition: textspan.h:29
#define HTML_IF
Definition: textspan.h:25
#define BORDER_LEFT
Definition: htmltable.h:35
#define T_end_cell
Definition: htmlparse.c:552
#define agxbstart(X)
Definition: agxbuf.h:80
#define UNSET_ALIGN
Definition: htmltable.h:41
#define T_html
Definition: htmlparse.c:549
#define SMALLBUF
Definition: const.h:17
int initHTMLlexer(char *src, agxbuf *xb, htmlenv_t *env)
Definition: htmllex.c:768
Dt_t * textfont_dt
Definition: gvcint.h:96
signed char space
Definition: htmltable.h:79
#define HTML_BF
Definition: textspan.h:24
double size
Definition: textspan.h:52
char * prevtok
Definition: htmllex.c:43
#define T_br
Definition: htmlparse.c:570
#define T_n_underline
Definition: htmlparse.c:558
char * port
Definition: htmltable.h:72
#define T_table
Definition: htmlparse.c:573
int agxbput_n(agxbuf *xb, const char *s, unsigned int ssz)
Definition: agxbuf.c:72
int htmllineno()
Definition: htmllex.c:877
textfont_t * font
Definition: htmlparse.c:596
unsigned short cspan
Definition: htmltable.h:133
#define HALIGN_TEXT
Definition: htmltable.h:25
unsigned char border
Definition: htmltable.h:80
int agerr(agerrlevel_t level, const char *fmt,...)
Definition: agerror.c:142
#define RADIAL
Definition: const.h:210
#define T_overline
Definition: htmlparse.c:579
#define BORDER_TOP
Definition: htmltable.h:36
char * currtok
Definition: htmllex.c:42
int i
Definition: grammar.c:203
#define BORDER_RIGHT
Definition: htmltable.h:37
#define HALIGN_RIGHT
Definition: htmltable.h:22
char * scale
Definition: htmltable.h:67
char error
Definition: htmllex.c:39
char * name
Definition: textspan.h:49
#define T_vr
Definition: htmlparse.c:567
#define HTML_SUB
Definition: textspan.h:28
#define T_BR
Definition: htmlparse.c:569
agxbuf lb
Definition: htmllex.c:37
#define GD_gvc(g)
Definition: types.h:338
#define T_end_hr
Definition: htmlparse.c:565
#define HALIGN_LEFT
Definition: htmltable.h:23
Definition: cgraph.h:389
#define agxbputc(X, C)
Definition: agxbuf.h:67
char * pencolor
Definition: htmltable.h:77
#define BALIGN_RIGHT
Definition: htmltable.h:32
#define T_HR
Definition: htmlparse.c:563
graph_t * g
Definition: htmltable.h:160
#define agxblen(X)
Definition: agxbuf.h:86
unsigned short flags
Definition: htmltable.h:83
int gradientangle
Definition: htmltable.h:78
#define T_n_italic
Definition: htmlparse.c:556
unsigned int flags
Definition: textspan.h:53
unsigned short width
Definition: htmltable.h:84
void free()
int i
Definition: gvdevice.c:448
char * href
Definition: htmltable.h:71
#define max(x, y)
Definition: stress.c:794
int tok
Definition: htmllex.c:35
#define BORDER_BOTTOM
Definition: htmltable.h:38
int prevtoklen
Definition: htmllex.c:45
#define ROUNDED
Definition: const.h:211
Definition: gvcint.h:70
#define T_cell
Definition: htmlparse.c:574
#define T_row
Definition: htmlparse.c:547
#define HTML_OL
Definition: textspan.h:30
#define T_IMG
Definition: htmlparse.c:571
Definition: grid.h:39
unsigned char flags
Definition: htmltable.h:119
htmlcell_t * cell
Definition: htmlparse.c:594
#define VALIGN_BOTTOM
Definition: htmltable.h:27
#define SPACE_SET
Definition: htmltable.h:31
char * scanEntity(char *t, agxbuf *xb)
Definition: utils.c:1303
unsigned short style
Definition: htmltable.h:86
#define T_bold
Definition: htmlparse.c:577
void agxbinit(agxbuf *xb, unsigned int hint, unsigned char *init)
Definition: agxbuf.c:25
#define HTML_UL
Definition: textspan.h:26
Definition: grammar.c:79
#define T_end_img
Definition: htmlparse.c:546
char * color
Definition: textspan.h:50
signed char cb
Definition: htmltable.h:113
char * src
Definition: htmltable.h:66
#define T_end_table
Definition: htmlparse.c:551
#define dtinsert(d, o)
Definition: cdt.h:311
unsigned char pad
Definition: htmltable.h:81
#define T_end_br
Definition: htmlparse.c:545
#define T_n_sup
Definition: htmlparse.c:560
#define XML_STATUS_ERROR
Definition: htmllex.c:27
#define T_n_overline
Definition: htmlparse.c:559
htmlimg_t * img
Definition: htmlparse.c:597
#define NULL
Definition: logic.h:50
#define BALIGN_LEFT
Definition: htmltable.h:33
#define GD_charset(g)
Definition: types.h:351
#define T_img
Definition: htmlparse.c:572
#define HTML_SUP
Definition: textspan.h:27
#define HTML_HRULE
Definition: htmltable.h:96
htmltbl_t * tbl
Definition: htmlparse.c:595
GVC_t * gvc
Definition: htmlparse.c:87
int currtoklen
Definition: htmllex.c:44
int strcasecmp(const char *s1, const char *s2)
Definition: strcasecmp.c:23
char * title
Definition: htmltable.h:74
unsigned short height
Definition: htmltable.h:85
YYSTYPE htmllval
#define FIXED_FLAG
Definition: htmltable.h:21
#define T_n_sub
Definition: htmlparse.c:561
#define T_end_row
Definition: htmlparse.c:548
char mode
Definition: htmllex.c:41
agxbuf * xb
Definition: htmllex.c:36
#define DASHED
Definition: const.h:217
#define T_end_html
Definition: htmlparse.c:550
#define T_end_vr
Definition: htmlparse.c:568
void htmlerror(const char *msg)
Definition: htmllex.c:64
#define T_sub
Definition: htmlparse.c:581
#define T_n_s
Definition: htmlparse.c:562
char * target
Definition: htmltable.h:73
char * charsetToStr(int c)
Definition: input.c:860
Definition: agxbuf.h:24
#define T_s
Definition: htmlparse.c:582
#define T_italic
Definition: htmlparse.c:576
#define BORDER_MASK
Definition: htmltable.h:39
#define agxbclear(X)
Definition: agxbuf.h:92
#define T_underline
Definition: htmlparse.c:578
char inCell
Definition: htmllex.c:40
#define BORDER_SET
Definition: htmltable.h:29
#define T_sup
Definition: htmlparse.c:580
char * bgcolor
Definition: htmltable.h:76
Definition: cgraph.h:389
#define T_error
Definition: htmlparse.c:555
#define T_string
Definition: htmlparse.c:554
void agxbfree(agxbuf *xb)
Definition: agxbuf.c:94
#define VALIGN_TOP
Definition: htmltable.h:26
int htmllex()
Definition: htmllex.c:1001
int clearHTMLlexer()
Definition: htmllex.c:798
char * ptr
Definition: htmllex.c:34
#define T_VR
Definition: htmlparse.c:566
#define T_n_bold
Definition: htmlparse.c:557
char * id
Definition: htmltable.h:75
#define NEW(t)
Definition: memory.h:35