Graphviz  2.31.20130521.0446
lib/common/htmllex.c
Go to the documentation of this file.
00001 /* $Id$ $Revision$ */
00002 /* vim:set shiftwidth=4 ts=8: */
00003 
00004 /*************************************************************************
00005  * Copyright (c) 2011 AT&T Intellectual Property 
00006  * All rights reserved. This program and the accompanying materials
00007  * are made available under the terms of the Eclipse Public License v1.0
00008  * which accompanies this distribution, and is available at
00009  * http://www.eclipse.org/legal/epl-v10.html
00010  *
00011  * Contributors: See CVS logs. Details at http://www.graphviz.org/
00012  *************************************************************************/
00013 
00014 
00015 #include "render.h"
00016 #include "htmltable.h"
00017 #include "htmlparse.h"
00018 #include "htmllex.h"
00019 #include <ctype.h>
00020 
00021 #ifdef HAVE_EXPAT
00022 #include <expat.h>
00023 #endif
00024 
/* Fallback for builds in which no included header defines
 * XML_STATUS_ERROR (e.g. when <expat.h> is not included).
 */
#if !defined(XML_STATUS_ERROR)
#define XML_STATUS_ERROR 0
#endif
00028 
00029 typedef struct {
00030 #ifdef HAVE_EXPAT
00031     XML_Parser parser;
00032 #endif
00033     char* ptr;                  /* input source */
00034     int tok;                    /* token type   */
00035     agxbuf* xb;                 /* buffer to gather T_string data */
00036     agxbuf  lb;                 /* buffer for translating lexical data */
00037     char warn;                  /* set if warning given */
00038     char error;                 /* set if error given */
00039     char inCell;                /* set if in TD to allow T_string */
00040     char mode;                  /* for handling artificial <HTML>..</HTML> */
00041     char *currtok;              /* for error reporting */
00042     char *prevtok;              /* for error reporting */
00043     int currtoklen;
00044     int prevtoklen;
00045 } lexstate_t;
00046 static lexstate_t state;
00047 
00048 /* error_context:
00049  * Print the last 2 "token"s seen.
00050  */
00051 static void error_context(void)
00052 {
00053     agxbclear(state.xb);
00054     if (state.prevtoklen > 0)
00055         agxbput_n(state.xb, state.prevtok, state.prevtoklen);
00056     agxbput_n(state.xb, state.currtok, state.currtoklen);
00057     agerr(AGPREV, "... %s ...\n", agxbuse(state.xb));
00058 }
00059 
00060 /* htmlerror:
00061  * yyerror - called by yacc output
00062  */
00063 void htmlerror(const char *msg)
00064 {
00065     if (state.error)
00066         return;
00067     state.error = 1;
00068     agerr(AGERR, "%s in line %d \n", msg, htmllineno());
00069     error_context();
00070 }
00071 
00072 #ifdef HAVE_EXPAT
00073 /* lexerror:
00074  * called by lexer when unknown <..> is found.
00075  */
00076 static void lexerror(const char *name)
00077 {
00078     state.tok = T_error;
00079     state.error = 1;
00080     agerr(AGERR, "Unknown HTML element <%s> on line %d \n",
00081           name, htmllineno());
00082 }
00083 
/* Signature of an attribute action: first argument is the object
 * being filled in, second is the attribute's value string.
 */
typedef int (*attrFn) (void *, char *);
/* Comparator signature expected by bsearch */
typedef int (*bcmpfn) (const void *, const void *);

/* Limits of the narrow integer types, used to range-check values */
#define MAX_UCHAR   ((unsigned char) ~0u)
#define MAX_USHORT  ((unsigned short) ~0u)
#define MAX_CHAR    (MAX_UCHAR >> 1)
#define MIN_CHAR    ((signed char) ~MAX_CHAR)

/* Mechanism for automatically processing attributes */
typedef struct {
    char *name;                 /* attribute name */
    attrFn action;              /* action to perform if name matches */
} attr_item;

/* Size of one table entry, as passed to bsearch */
#define ISIZE (sizeof(attr_item))
00099 
00100 /* icmp:
00101  * Compare two attr_item. Used in bsearch
00102  */
00103 static int icmp(attr_item * i, attr_item * j)
00104 {
00105     return strcasecmp(i->name, j->name);
00106 }
00107 
00108 static int bgcolorfn(htmldata_t * p, char *v)
00109 {
00110     p->bgcolor = strdup(v);
00111     return 0;
00112 }
00113 
00114 static int pencolorfn(htmldata_t * p, char *v)
00115 {
00116     p->pencolor = strdup(v);
00117     return 0;
00118 }
00119 
00120 static int hreffn(htmldata_t * p, char *v)
00121 {
00122     p->href = strdup(v);
00123     return 0;
00124 }
00125 
00126 static int titlefn(htmldata_t * p, char *v)
00127 {
00128     p->title = strdup(v);
00129     return 0;
00130 }
00131 
00132 static int portfn(htmldata_t * p, char *v)
00133 {
00134     p->port = strdup(v);
00135     return 0;
00136 }
00137 
00138 #define DELIM " ,"
00139 
00140 static int stylefn(htmldata_t * p, char *v)
00141 {
00142     int rv = 0;
00143     char c;
00144     char* tk;
00145     char* buf = strdup (v);
00146     for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) {
00147         c = toupper(*tk);
00148         if (c == 'R') {
00149             if (!strcasecmp(tk + 1, "OUNDED")) p->style |= ROUNDED;
00150             else if (!strcasecmp(tk + 1, "ADIAL")) p->style |= RADIAL;
00151             else {
00152                 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
00153                 rv = 1;
00154             }
00155         }
00156         else if(!strcasecmp(tk,"SOLID")) p->style &= ~(DOTTED|DASHED);
00157         else if(!strcasecmp(tk,"INVISIBLE") || !strcasecmp(tk,"INVIS")) p->style |= INVISIBLE;
00158         else if(!strcasecmp(tk,"DOTTED")) p->style |= DOTTED;
00159         else if(!strcasecmp(tk,"DASHED")) p->style |= DASHED;
00160         else {
00161             agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
00162             rv = 1;
00163         }
00164     }
00165     free (buf);
00166     return rv;
00167 }
00168 
00169 static int targetfn(htmldata_t * p, char *v)
00170 {
00171     p->target = strdup(v);
00172     return 0;
00173 }
00174 
00175 static int idfn(htmldata_t * p, char *v)
00176 {
00177     p->id = strdup(v);
00178     return 0;
00179 }
00180 
00181 
00182 /* doInt:
00183  * Scan v for integral value. Check that
00184  * the value is >= min and <= max. Return value in ul.
00185  * String s is name of value.
00186  * Return 0 if okay; 1 otherwise.
00187  */
00188 static int doInt(char *v, char *s, int min, int max, long *ul)
00189 {
00190     int rv = 0;
00191     char *ep;
00192     long b = strtol(v, &ep, 10);
00193 
00194     if (ep == v) {
00195         agerr(AGWARN, "Improper %s value %s - ignored", s, v);
00196         rv = 1;
00197     } else if (b > max) {
00198         agerr(AGWARN, "%s value %s > %d - too large - ignored", s, v, max);
00199         rv = 1;
00200     } else if (b < min) {
00201         agerr(AGWARN, "%s value %s < %d - too small - ignored", s, v, min);
00202         rv = 1;
00203     } else
00204         *ul = b;
00205     return rv;
00206 }
00207 
00208 
00209 static int gradientanglefn(htmldata_t * p, char *v)
00210 {
00211     long u;
00212 
00213     if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
00214         return 1;
00215     p->gradientangle = (unsigned short) u;
00216     return 0;
00217 }
00218 
00219 
00220 static int borderfn(htmldata_t * p, char *v)
00221 {
00222     long u;
00223 
00224     if (doInt(v, "BORDER", 0, MAX_UCHAR, &u))
00225         return 1;
00226     p->border = (unsigned char) u;
00227     p->flags |= BORDER_SET;
00228     return 0;
00229 }
00230 
00231 static int cellpaddingfn(htmldata_t * p, char *v)
00232 {
00233     long u;
00234 
00235     if (doInt(v, "CELLPADDING", 0, MAX_UCHAR, &u))
00236         return 1;
00237     p->pad = (unsigned char) u;
00238     p->flags |= PAD_SET;
00239     return 0;
00240 }
00241 
00242 static int cellspacingfn(htmldata_t * p, char *v)
00243 {
00244     long u;
00245 
00246     if (doInt(v, "CELLSPACING", MIN_CHAR, MAX_CHAR, &u))
00247         return 1;
00248     p->space = (signed char) u;
00249     p->flags |= SPACE_SET;
00250     return 0;
00251 }
00252 
00253 static int cellborderfn(htmltbl_t * p, char *v)
00254 {
00255     long u;
00256 
00257     if (doInt(v, "CELLSBORDER", 0, MAX_CHAR, &u))
00258         return 1;
00259     p->cb = (unsigned char) u;
00260     return 0;
00261 }
00262 
00263 static int columnsfn(htmltbl_t * p, char *v)
00264 {
00265     if (*v != '*') {
00266         agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n", v);
00267         return 1;
00268     }
00269     p->flags |= HTML_VRULE;
00270     return 0;
00271 }
00272 
00273 static int rowsfn(htmltbl_t * p, char *v)
00274 {
00275     if (*v != '*') {
00276         agerr(AGWARN, "Unknown value %s for ROWS - ignored\n", v);
00277         return 1;
00278     }
00279     p->flags |= HTML_HRULE;
00280     return 0;
00281 }
00282 
00283 static int fixedsizefn(htmldata_t * p, char *v)
00284 {
00285     int rv = 0;
00286     char c = toupper(*(unsigned char *) v);
00287     if ((c == 'T') && !strcasecmp(v + 1, "RUE"))
00288         p->flags |= FIXED_FLAG;
00289     else if ((c != 'F') || strcasecmp(v + 1, "ALSE")) {
00290         agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n", v);
00291         rv = 1;
00292     }
00293     return rv;
00294 }
00295 
00296 static int valignfn(htmldata_t * p, char *v)
00297 {
00298     int rv = 0;
00299     char c = toupper(*v);
00300     if ((c == 'B') && !strcasecmp(v + 1, "OTTOM"))
00301         p->flags |= VALIGN_BOTTOM;
00302     else if ((c == 'T') && !strcasecmp(v + 1, "OP"))
00303         p->flags |= VALIGN_TOP;
00304     else if ((c != 'M') || strcasecmp(v + 1, "IDDLE")) {
00305         agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n", v);
00306         rv = 1;
00307     }
00308     return rv;
00309 }
00310 
00311 static int halignfn(htmldata_t * p, char *v)
00312 {
00313     int rv = 0;
00314     char c = toupper(*v);
00315     if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
00316         p->flags |= HALIGN_LEFT;
00317     else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00318         p->flags |= HALIGN_RIGHT;
00319     else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) {
00320         agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
00321         rv = 1;
00322     }
00323     return rv;
00324 }
00325 
00326 static int cell_halignfn(htmldata_t * p, char *v)
00327 {
00328     int rv = 0;
00329     char c = toupper(*v);
00330     if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
00331         p->flags |= HALIGN_LEFT;
00332     else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00333         p->flags |= HALIGN_RIGHT;
00334     else if ((c == 'T') && !strcasecmp(v + 1, "EXT"))
00335         p->flags |= HALIGN_TEXT;
00336     else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
00337         rv = 1;
00338     if (rv)
00339         agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n", v);
00340     return rv;
00341 }
00342 
00343 static int balignfn(htmldata_t * p, char *v)
00344 {
00345     int rv = 0;
00346     char c = toupper(*v);
00347     if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
00348         p->flags |= BALIGN_LEFT;
00349     else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00350         p->flags |= BALIGN_RIGHT;
00351     else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
00352         rv = 1;
00353     if (rv)
00354         agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n", v);
00355     return rv;
00356 }
00357 
00358 static int heightfn(htmldata_t * p, char *v)
00359 {
00360     long u;
00361 
00362     if (doInt(v, "HEIGHT", 0, MAX_USHORT, &u))
00363         return 1;
00364     p->height = (unsigned short) u;
00365     return 0;
00366 }
00367 
00368 static int widthfn(htmldata_t * p, char *v)
00369 {
00370     long u;
00371 
00372     if (doInt(v, "WIDTH", 0, MAX_USHORT, &u))
00373         return 1;
00374     p->width = (unsigned short) u;
00375     return 0;
00376 }
00377 
00378 static int rowspanfn(htmlcell_t * p, char *v)
00379 {
00380     long u;
00381 
00382     if (doInt(v, "ROWSPAN", 0, MAX_USHORT, &u))
00383         return 1;
00384     if (u == 0) {
00385         agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n");
00386         return 1;
00387     }
00388     p->rspan = (unsigned short) u;
00389     return 0;
00390 }
00391 
00392 static int colspanfn(htmlcell_t * p, char *v)
00393 {
00394     long u;
00395 
00396     if (doInt(v, "COLSPAN", 0, MAX_USHORT, &u))
00397         return 1;
00398     if (u == 0) {
00399         agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n");
00400         return 1;
00401     }
00402     p->cspan = (unsigned short) u;
00403     return 0;
00404 }
00405 
00406 static int fontcolorfn(htmlfont_t * p, char *v)
00407 {
00408     p->color = strdup(v);
00409     return 0;
00410 }
00411 
00412 static int facefn(htmlfont_t * p, char *v)
00413 {
00414     p->name = strdup(v);
00415     return 0;
00416 }
00417 
00418 static int ptsizefn(htmlfont_t * p, char *v)
00419 {
00420     long u;
00421 
00422     if (doInt(v, "POINT-SIZE", 0, MAX_UCHAR, &u))
00423         return 1;
00424     p->size = (double) u;
00425     return 0;
00426 }
00427 
00428 static int srcfn(htmlimg_t * p, char *v)
00429 {
00430     p->src = strdup(v);
00431     return 0;
00432 }
00433 
00434 static int scalefn(htmlimg_t * p, char *v)
00435 {
00436     p->scale = strdup(v);
00437     return 0;
00438 }
00439 
00440 static int alignfn(int *p, char *v)
00441 {
00442     int rv = 0;
00443     char c = toupper(*v);
00444     if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00445         *p = 'r';
00446     else if ((c == 'L') || !strcasecmp(v + 1, "EFT"))
00447         *p = 'l';
00448     else if ((c == 'C') || strcasecmp(v + 1, "ENTER")) 
00449         *p = 'n';
00450     else {
00451         agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
00452         rv = 1;
00453     }
00454     return rv;
00455 }
00456 
00457 /* Tables used in binary search; MUST be alphabetized */
00458 static attr_item tbl_items[] = {
00459     {"align", (attrFn) halignfn},
00460     {"bgcolor", (attrFn) bgcolorfn},
00461     {"border", (attrFn) borderfn},
00462     {"cellborder", (attrFn) cellborderfn},
00463     {"cellpadding", (attrFn) cellpaddingfn},
00464     {"cellspacing", (attrFn) cellspacingfn},
00465     {"color", (attrFn) pencolorfn},
00466     {"columns", (attrFn) columnsfn},
00467     {"fixedsize", (attrFn) fixedsizefn},
00468     {"gradientangle", (attrFn) gradientanglefn},
00469     {"height", (attrFn) heightfn},
00470     {"href", (attrFn) hreffn},
00471     {"id", (attrFn) idfn},
00472     {"port", (attrFn) portfn},
00473     {"rows", (attrFn) rowsfn},
00474     {"style", (attrFn) stylefn},
00475     {"target", (attrFn) targetfn},
00476     {"title", (attrFn) titlefn},
00477     {"tooltip", (attrFn) titlefn},
00478     {"valign", (attrFn) valignfn},
00479     {"width", (attrFn) widthfn},
00480 };
00481 
00482 static attr_item cell_items[] = {
00483     {"align", (attrFn) cell_halignfn},
00484     {"balign", (attrFn) balignfn},
00485     {"bgcolor", (attrFn) bgcolorfn},
00486     {"border", (attrFn) borderfn},
00487     {"cellpadding", (attrFn) cellpaddingfn},
00488     {"cellspacing", (attrFn) cellspacingfn},
00489     {"color", (attrFn) pencolorfn},
00490     {"colspan", (attrFn) colspanfn},
00491     {"fixedsize", (attrFn) fixedsizefn},
00492     {"gradientangle", (attrFn) gradientanglefn},
00493     {"height", (attrFn) heightfn},
00494     {"href", (attrFn) hreffn},
00495     {"id", (attrFn) idfn},
00496     {"port", (attrFn) portfn},
00497     {"rowspan", (attrFn) rowspanfn},
00498     {"style", (attrFn) stylefn},
00499     {"target", (attrFn) targetfn},
00500     {"title", (attrFn) titlefn},
00501     {"tooltip", (attrFn) titlefn},
00502     {"valign", (attrFn) valignfn},
00503     {"width", (attrFn) widthfn},
00504 };
00505 
00506 static attr_item font_items[] = {
00507     {"color", (attrFn) fontcolorfn},
00508     {"face", (attrFn) facefn},
00509     {"point-size", (attrFn) ptsizefn},
00510 };
00511 
00512 static attr_item img_items[] = {
00513     {"scale", (attrFn) scalefn},
00514     {"src", (attrFn) srcfn},
00515 };
00516 
00517 static attr_item br_items[] = {
00518     {"align", (attrFn) alignfn},
00519 };
00520 
00521 /* doAttrs:
00522  * General function for processing list of name/value attributes.
00523  * Do binary search on items table. If match found, invoke action
00524  * passing it tp and attribute value.
00525  * Table size is given by nel
00526  * Name/value pairs are in array atts, which is null terminated.
00527  * s is the name of the HTML element being processed.
00528  */
00529 static void
00530 doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s)
00531 {
00532     char *name;
00533     char *val;
00534     attr_item *ip;
00535     attr_item key;
00536 
00537     while ((name = *atts++) != NULL) {
00538         val = *atts++;
00539         key.name = name;
00540         ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp);
00541         if (ip)
00542             state.warn |= ip->action(tp, val);
00543         else {
00544             agerr(AGWARN, "Illegal attribute %s in %s - ignored\n", name,
00545                   s);
00546             state.warn = 1;
00547         }
00548     }
00549 }
00550 
00551 static void mkBR(char **atts)
00552 {
00553     htmllval.i = UNSET_ALIGN;
00554     doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
00555 }
00556 
00557 static htmlimg_t *mkImg(char **atts)
00558 {
00559     htmlimg_t *img = NEW(htmlimg_t);
00560 
00561     doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
00562 
00563     return img;
00564 }
00565 
00566 static htmlfont_t *mkFont(char **atts, int flags, int ul)
00567 {
00568     htmlfont_t *font = NEW(htmlfont_t);
00569 
00570     font->size = -1.0;          /* unassigned */
00571     font->flags = flags;
00572     if (atts)
00573         doAttrs(font, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
00574 
00575     return font;
00576 }
00577 
00578 static htmlcell_t *mkCell(char **atts)
00579 {
00580     htmlcell_t *cell = NEW(htmlcell_t);
00581 
00582     cell->cspan = 1;
00583     cell->rspan = 1;
00584     doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
00585 
00586     return cell;
00587 }
00588 
00589 static htmltbl_t *mkTbl(char **atts)
00590 {
00591     htmltbl_t *tbl = NEW(htmltbl_t);
00592 
00593     tbl->rc = -1;               /* flag that table is a raw, parsed table */
00594     tbl->cb = -1;               /* unset cell border attribute */
00595     doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
00596 
00597     return tbl;
00598 }
00599 
00600 static void startElement(void *user, const char *name, char **atts)
00601 {
00602     if (strcasecmp(name, "TABLE") == 0) {
00603         htmllval.tbl = mkTbl(atts);
00604         state.inCell = 0;
00605         state.tok = T_table;
00606     } else if ((strcasecmp(name, "TR") == 0)
00607                || (strcasecmp(name, "TH") == 0)) {
00608         state.inCell = 0;
00609         state.tok = T_row;
00610     } else if (strcasecmp(name, "TD") == 0) {
00611         state.inCell = 1;
00612         htmllval.cell = mkCell(atts);
00613         state.tok = T_cell;
00614     } else if (strcasecmp(name, "FONT") == 0) {
00615         htmllval.font = mkFont(atts, 0, 0);
00616         state.tok = T_font;
00617     } else if (strcasecmp(name, "B") == 0) {
00618         htmllval.font = mkFont(0, HTML_BF, 0);
00619         state.tok = T_bold;
00620     } else if (strcasecmp(name, "U") == 0) {
00621         htmllval.font = mkFont(0, HTML_UL, 1);
00622         state.tok = T_underline;
00623     } else if (strcasecmp(name, "I") == 0) {
00624         htmllval.font = mkFont(0, HTML_IF, 0);
00625         state.tok = T_italic;
00626     } else if (strcasecmp(name, "SUP") == 0) {
00627         htmllval.font = mkFont(0, HTML_SUP, 0);
00628         state.tok = T_sup;
00629     } else if (strcasecmp(name, "SUB") == 0) {
00630         htmllval.font = mkFont(0, HTML_SUB, 0);
00631         state.tok = T_sub;
00632     } else if (strcasecmp(name, "BR") == 0) {
00633         mkBR(atts);
00634         state.tok = T_br;
00635     } else if (strcasecmp(name, "HR") == 0) {
00636         state.tok = T_hr;
00637     } else if (strcasecmp(name, "VR") == 0) {
00638         state.tok = T_vr;
00639     } else if (strcasecmp(name, "IMG") == 0) {
00640         htmllval.img = mkImg(atts);
00641         state.tok = T_img;
00642     } else if (strcasecmp(name, "HTML") == 0) {
00643         state.tok = T_html;
00644     } else {
00645         lexerror(name);
00646     }
00647 }
00648 
00649 static void endElement(void *user, const char *name)
00650 {
00651     if (strcasecmp(name, "TABLE") == 0) {
00652         state.tok = T_end_table;
00653         state.inCell = 1;
00654     } else if ((strcasecmp(name, "TR") == 0)
00655                || (strcasecmp(name, "TH") == 0)) {
00656         state.tok = T_end_row;
00657     } else if (strcasecmp(name, "TD") == 0) {
00658         state.tok = T_end_cell;
00659         state.inCell = 0;
00660     } else if (strcasecmp(name, "HTML") == 0) {
00661         state.tok = T_end_html;
00662     } else if (strcasecmp(name, "FONT") == 0) {
00663         state.tok = T_end_font;
00664     } else if (strcasecmp(name, "B") == 0) {
00665         state.tok = T_n_bold;
00666     } else if (strcasecmp(name, "U") == 0) {
00667         state.tok = T_n_underline;
00668     } else if (strcasecmp(name, "I") == 0) {
00669         state.tok = T_n_italic;
00670     } else if (strcasecmp(name, "SUP") == 0) {
00671         state.tok = T_n_sup;
00672     } else if (strcasecmp(name, "SUB") == 0) {
00673         state.tok = T_n_sub;
00674     } else if (strcasecmp(name, "BR") == 0) {
00675         if (state.tok == T_br)
00676             state.tok = T_BR;
00677         else
00678             state.tok = T_end_br;
00679     } else if (strcasecmp(name, "HR") == 0) {
00680         if (state.tok == T_hr)
00681             state.tok = T_HR;
00682         else
00683             state.tok = T_end_hr;
00684     } else if (strcasecmp(name, "VR") == 0) {
00685         if (state.tok == T_vr)
00686             state.tok = T_VR;
00687         else
00688             state.tok = T_end_vr;
00689     } else if (strcasecmp(name, "IMG") == 0) {
00690         if (state.tok == T_img)
00691             state.tok = T_IMG;
00692         else
00693             state.tok = T_end_img;
00694     } else {
00695         lexerror(name);
00696     }
00697 }
00698 
00699 /* characterData:
00700  * Generate T_string token. Do this only when immediately in
00701  * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
00702  * Strip out formatting characters but keep spaces.
00703  * Distinguish between all whitespace vs. strings with non-whitespace
00704  * characters.
00705  */
00706 static void characterData(void *user, const char *s, int length)
00707 {
00708     int i, rc, cnt = 0;
00709     unsigned char c;
00710 
00711     if (state.inCell) {
00712         for (i = length; i; i--) {
00713             c = *s++;
00714             if (c >= ' ') {
00715                 cnt++;
00716                 rc = agxbputc(state.xb, c);
00717             }
00718         }
00719         if (cnt) state.tok = T_string;
00720     }
00721 }
00722 #endif
00723 
00724 int initHTMLlexer(char *src, agxbuf * xb, int charset)
00725 {
00726 #ifdef HAVE_EXPAT
00727     state.xb = xb;
00728     agxbinit (&state.lb, SMALLBUF, NULL);
00729     state.ptr = src;
00730     state.mode = 0;
00731     state.warn = 0;
00732     state.error = 0;
00733     state.currtoklen = 0;
00734     state.prevtoklen = 0;
00735     state.inCell = 1;
00736     state.parser = XML_ParserCreate(charsetToStr(charset));
00737     XML_SetElementHandler(state.parser,
00738                           (XML_StartElementHandler) startElement,
00739                           endElement);
00740     XML_SetCharacterDataHandler(state.parser, characterData);
00741     return 0;
00742 #else
00743     static int first;
00744     if (!first) {
00745         agerr(AGWARN,
00746               "Not built with libexpat. Table formatting is not available.\n");
00747         first++;
00748     }
00749     return 1;
00750 #endif
00751 }
00752 
/* clearHTMLlexer:
 * Release the parser and the translation buffer.
 * With expat: returns non-zero if a warning or an error was
 * recorded since initHTMLlexer.
 * Without expat: always returns 1.
 */
int clearHTMLlexer()
{
#ifdef HAVE_EXPAT
    int problems = state.warn | state.error;

    XML_ParserFree(state.parser);
    agxbfree(&state.lb);
    return problems;
#else
    return 1;
#endif
}
00764 
00765 #ifdef HAVE_EXPAT
00766 /* eatComment:
00767  * Given first character after open comment, eat characters
00768  * upto comment close, returning pointer to closing > if it exists,
00769  * or null character otherwise.
00770  * We rely on HTML strings having matched nested <>.
00771  */
00772 static char *eatComment(char *p)
00773 {
00774     int depth = 1;
00775     char *s = p;
00776     char c;
00777 
00778     while (depth && (c = *s++)) {
00779         if (c == '<')
00780             depth++;
00781         else if (c == '>')
00782             depth--;
00783     }
00784     s--;                        /* move back to '\0' or '>' */
00785     if (*s) {
00786         char *t = s - 2;
00787         if ((t < p) || strncmp(t, "--", 2)) {
00788             agerr(AGWARN, "Unclosed comment\n");
00789             state.warn = 1;
00790         }
00791     }
00792     return s;
00793 }
00794 
00795 /* findNext:
00796  * Return next XML unit. This is either <..>, an HTML 
00797  * comment <!-- ... -->, or characters up to next <.
00798  */
00799 static char *findNext(char *s, agxbuf* xb)
00800 {
00801     char* t = s + 1;
00802     char c;
00803     int rc;
00804 
00805     if (*s == '<') {
00806         if ((*t == '!') && !strncmp(t + 1, "--", 2))
00807             t = eatComment(t + 3);
00808         else
00809             while (*t && (*t != '>'))
00810                 t++;
00811         if (*t != '>') {
00812             agerr(AGWARN, "Label closed before end of HTML element\n");
00813             state.warn = 1;
00814         } else
00815             t++;
00816     } else {
00817         t = s;
00818         while ((c = *t) && (c != '<')) {
00819             if ((c == '&') && (*(t+1) != '#')) {
00820                 t = scanEntity(t + 1, xb);
00821             }
00822             else {
00823                 rc = agxbputc(xb, c);
00824                 t++;
00825             }
00826         }
00827     }
00828     return t;
00829 }
00830 #endif
00831 
/* htmllineno:
 * Return the parser's current line number, or 0 when built
 * without expat.
 */
int htmllineno()
{
#ifdef HAVE_EXPAT
    int line = XML_GetCurrentLineNumber(state.parser);

    return line;
#else
    return 0;
#endif
}
00840 
00841 #ifdef DEBUG
00842 static void printTok(int tok)
00843 {
00844     char *s;
00845 
00846     switch (tok) {
00847     case T_VR:
00848         s = "T_VR";
00849         break;
00850     case T_vr:
00851         s = "T_vr";
00852         break;
00853     case T_end_vr:
00854         s = "T_end_vr";
00855         break;
00856     case T_HR:
00857         s = "T_HR";
00858         break;
00859     case T_hr:
00860         s = "T_hr";
00861         break;
00862     case T_end_hr:
00863         s = "T_end_hr";
00864         break;
00865     case T_BR:
00866         s = "T_BR";
00867         break;
00868     case T_br:
00869         s = "T_br";
00870         break;
00871     case T_end_br:
00872         s = "T_end_br";
00873         break;
00874     case T_end_table:
00875         s = "T_end_table";
00876         break;
00877     case T_row:
00878         s = "T_row";
00879         break;
00880     case T_end_row:
00881         s = "T_end_row";
00882         break;
00883     case T_end_cell:
00884         s = "T_end_cell";
00885         break;
00886     case T_html:
00887         s = "T_html";
00888         break;
00889     case T_end_html:
00890         s = "T_end_html";
00891         break;
00892     case T_string:
00893         s = "T_string";
00894         break;
00895     case T_error:
00896         s = "T_error";
00897         break;
00898     case T_table:
00899         s = "T_table";
00900         break;
00901     case T_cell:
00902         s = "T_cell";
00903         break;
00904     case T_img:
00905         s = "T_img";
00906         break;
00907     case T_end_img:
00908         s = "T_end_img";
00909         break;
00910     case T_IMG:
00911         s = "T_IMG";
00912         break;
00913     case T_underline:
00914         s = "T_underline";
00915         break;
00916     case T_n_underline:
00917         s = "T_underline";
00918         break;
00919     case T_italic:
00920         s = "T_italic";
00921         break;
00922     case T_n_italic:
00923         s = "T_italic";
00924         break;
00925     case T_bold:
00926         s = "T_bold";
00927         break;
00928     case T_n_bold:
00929         s = "T_bold";
00930         break;
00931     default:
00932         s = "<unknown>";
00933     }
00934     if (tok == T_string) {
00935         fprintf(stderr, "%s \"", s);
00936         fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr);
00937         fprintf(stderr, "\"\n");
00938     } else
00939         fprintf(stderr, "%s\n", s);
00940 }
00941 
00942 #endif
00943 
/* htmllex:
 * Return the next token of the HTML label, or EOF at the end.
 * Without expat, always returns EOF.
 *
 * The label is fed to the parser one unit at a time until one of
 * the handlers (or an error) sets state.tok. state.mode tracks the
 * artificial wrapper put around the label:
 *   0 - nothing fed yet; feed "<HTML>" and move to mode 1
 *   1 - feeding units of the label, as split by findNext; at the
 *       end of the input feed "</HTML>" and move to mode 2
 *   2 - everything has been fed; return EOF
 * The two most recent units are remembered for error_context.
 * A parse error is reported once and yields T_error.
 */
int htmllex()
{
#ifdef HAVE_EXPAT
    static char *begin_html = "<HTML>";
    static char *end_html = "</HTML>";

    char *unit;                 /* text handed to the parser */
    char *next = 0;             /* where scanning resumes, if known */
    int unitlen, buffered;
    int status;

    state.tok = 0;
    while (state.tok == 0) {
        if (state.mode == 2)
            return EOF;

        if (state.mode != 0) {
            unit = state.ptr;
            if (*unit != '\0') {
                next = findNext(unit, &state.lb);
                unitlen = next - unit;
            } else {
                state.mode = 2;
                unit = end_html;
                unitlen = strlen(unit);
            }
        } else {
            state.mode = 1;
            unit = begin_html;
            unitlen = strlen(unit);
            next = 0;
        }

        state.prevtok = state.currtok;
        state.prevtoklen = state.currtoklen;
        state.currtok = unit;
        state.currtoklen = unitlen;

        /* text translated into state.lb by findNext takes precedence
         * over the raw input */
        buffered = agxblen(&state.lb);
        if (buffered)
            status = XML_Parse(state.parser, agxbuse(&state.lb),
                               buffered, 0);
        else
            status = XML_Parse(state.parser, unit, unitlen,
                               (unitlen ? 0 : 1));

        if ((status == XML_STATUS_ERROR) && !state.error) {
            agerr(AGERR, "%s in line %d \n",
                  XML_ErrorString(XML_GetErrorCode(state.parser)),
                  htmllineno());
            error_context();
            state.error = 1;
            state.tok = T_error;
        }
        if (next)
            state.ptr = next;
    }
    /* printTok (state.tok); */
    return state.tok;
#else
    return EOF;
#endif
}
01002