Graphviz  2.29.20120524.0446
lib/common/htmllex.c
Go to the documentation of this file.
00001 /* $Id$ $Revision$ */
00002 /* vim:set shiftwidth=4 ts=8: */
00003 
00004 /*************************************************************************
00005  * Copyright (c) 2011 AT&T Intellectual Property 
00006  * All rights reserved. This program and the accompanying materials
00007  * are made available under the terms of the Eclipse Public License v1.0
00008  * which accompanies this distribution, and is available at
00009  * http://www.eclipse.org/legal/epl-v10.html
00010  *
00011  * Contributors: See CVS logs. Details at http://www.graphviz.org/
00012  *************************************************************************/
00013 
00014 
00015 #include "render.h"
00016 #include "htmltable.h"
00017 #include "htmlparse.h"
00018 #include "htmllex.h"
00019 #include <ctype.h>
00020 
00021 #ifdef HAVE_EXPAT
00022 #include <expat.h>
00023 #endif
00024 
00025 #ifndef XML_STATUS_ERROR
00026 #define XML_STATUS_ERROR 0
00027 #endif
00028 
00029 typedef struct {
00030 #ifdef HAVE_EXPAT
00031     XML_Parser parser;
00032 #endif
00033     char* ptr;                  /* input source */
00034     int tok;                    /* token type   */
00035     agxbuf* xb;                 /* buffer to gather T_string data */
00036     agxbuf  lb;                 /* buffer for translating lexical data */
00037     char warn;                  /* set if warning given */
00038     char error;                 /* set if error given */
00039     char inCell;                /* set if in TD to allow T_string */
00040     char mode;                  /* for handling artificial <HTML>..</HTML> */
00041     char *currtok;              /* for error reporting */
00042     char *prevtok;              /* for error reporting */
00043     int currtoklen;
00044     int prevtoklen;
00045 } lexstate_t;
00046 static lexstate_t state;
00047 
00048 /* error_context:
00049  * Print the last 2 "token"s seen.
00050  */
00051 static void error_context(void)
00052 {
00053     agxbclear(state.xb);
00054     if (state.prevtoklen > 0)
00055         agxbput_n(state.xb, state.prevtok, state.prevtoklen);
00056     agxbput_n(state.xb, state.currtok, state.currtoklen);
00057     agerr(AGPREV, "... %s ...\n", agxbuse(state.xb));
00058 }
00059 
00060 /* htmlerror:
00061  * yyerror - called by yacc output
00062  */
00063 void htmlerror(const char *msg)
00064 {
00065     if (state.error)
00066         return;
00067     state.error = 1;
00068     agerr(AGERR, "%s in line %d \n", msg, htmllineno());
00069     error_context();
00070 }
00071 
00072 #ifdef HAVE_EXPAT
00073 /* lexerror:
00074  * called by lexer when unknown <..> is found.
00075  */
00076 static void lexerror(const char *name)
00077 {
00078     state.tok = T_error;
00079     state.error = 1;
00080     agerr(AGERR, "Unknown HTML element <%s> on line %d \n",
00081           name, htmllineno());
00082 }
00083 
/* Signature of an attribute handler: (target object, attribute value). */
typedef int (*attrFn) (void *, char *);
/* Signature of the comparison function handed to bsearch. */
typedef int (*bcmpfn) (const void *, const void *);

/* Limits used when range-checking integer attribute values. */
#define MAX_UCHAR   ((unsigned char) (~0))
#define MAX_USHORT  ((unsigned short) (~0))
#define MAX_CHAR    (((unsigned char) (~0)) >> 1)
#define MIN_CHAR    ((signed char) (~MAX_CHAR))

/* Mechanism for automatically processing attributes:
 * each entry pairs an attribute name with the handler to run for it.
 */
typedef struct {
    char *name;                 /* attribute name */
    attrFn action;              /* handler invoked when name matches */
} attr_item;

/* Size of one table entry, as needed by bsearch. */
#define ISIZE (sizeof(attr_item))
00099 
00100 /* icmp:
00101  * Compare two attr_item. Used in bsearch
00102  */
00103 static int icmp(attr_item * i, attr_item * j)
00104 {
00105     return strcasecmp(i->name, j->name);
00106 }
00107 
00108 static int bgcolorfn(htmldata_t * p, char *v)
00109 {
00110     p->bgcolor = strdup(v);
00111     return 0;
00112 }
00113 
00114 static int pencolorfn(htmldata_t * p, char *v)
00115 {
00116     p->pencolor = strdup(v);
00117     return 0;
00118 }
00119 
00120 static int hreffn(htmldata_t * p, char *v)
00121 {
00122     p->href = strdup(v);
00123     return 0;
00124 }
00125 
00126 static int titlefn(htmldata_t * p, char *v)
00127 {
00128     p->title = strdup(v);
00129     return 0;
00130 }
00131 
00132 static int portfn(htmldata_t * p, char *v)
00133 {
00134     p->port = strdup(v);
00135     return 0;
00136 }
00137 
00138 #define DELIM " ,"
00139 
00140 static int stylefn(htmldata_t * p, char *v)
00141 {
00142     int rv = 0;
00143     char c;
00144     char* tk;
00145     char* buf = strdup (v);
00146     for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) {
00147         c = toupper(*tk);
00148         if (c == 'R') {
00149             if (!strcasecmp(tk + 1, "OUNDED")) p->style |= ROUNDED;
00150             else if (!strcasecmp(tk + 1, "ADIAL")) p->style |= RADIAL;
00151             else {
00152                 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
00153                 rv = 1;
00154             }
00155         }
00156         else {
00157             agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk);
00158             rv = 1;
00159         }
00160     }
00161     free (buf);
00162     return rv;
00163 }
00164 
00165 static int targetfn(htmldata_t * p, char *v)
00166 {
00167     p->target = strdup(v);
00168     return 0;
00169 }
00170 
00171 static int idfn(htmldata_t * p, char *v)
00172 {
00173     p->id = strdup(v);
00174     return 0;
00175 }
00176 
00177 
00178 /* doInt:
00179  * Scan v for integral value. Check that
00180  * the value is >= min and <= max. Return value in ul.
00181  * String s is name of value.
00182  * Return 0 if okay; 1 otherwise.
00183  */
00184 static int doInt(char *v, char *s, int min, int max, long *ul)
00185 {
00186     int rv = 0;
00187     char *ep;
00188     long b = strtol(v, &ep, 10);
00189 
00190     if (ep == v) {
00191         agerr(AGWARN, "Improper %s value %s - ignored", s, v);
00192         rv = 1;
00193     } else if (b > max) {
00194         agerr(AGWARN, "%s value %s > %d - too large - ignored", s, v, max);
00195         rv = 1;
00196     } else if (b < min) {
00197         agerr(AGWARN, "%s value %s < %d - too small - ignored", s, v, min);
00198         rv = 1;
00199     } else
00200         *ul = b;
00201     return rv;
00202 }
00203 
00204 
00205 static int gradientanglefn(htmldata_t * p, char *v)
00206 {
00207     long u;
00208 
00209     if (doInt(v, "GRADIENTANGLE", 0, 360, &u))
00210         return 1;
00211     p->gradientangle = (unsigned short) u;
00212     return 0;
00213 }
00214 
00215 
00216 static int borderfn(htmldata_t * p, char *v)
00217 {
00218     long u;
00219 
00220     if (doInt(v, "BORDER", 0, MAX_UCHAR, &u))
00221         return 1;
00222     p->border = (unsigned char) u;
00223     p->flags |= BORDER_SET;
00224     return 0;
00225 }
00226 
00227 static int cellpaddingfn(htmldata_t * p, char *v)
00228 {
00229     long u;
00230 
00231     if (doInt(v, "CELLPADDING", 0, MAX_UCHAR, &u))
00232         return 1;
00233     p->pad = (unsigned char) u;
00234     p->flags |= PAD_SET;
00235     return 0;
00236 }
00237 
00238 static int cellspacingfn(htmldata_t * p, char *v)
00239 {
00240     long u;
00241 
00242     if (doInt(v, "CELLSPACING", MIN_CHAR, MAX_CHAR, &u))
00243         return 1;
00244     p->space = (signed char) u;
00245     p->flags |= SPACE_SET;
00246     return 0;
00247 }
00248 
00249 static int cellborderfn(htmltbl_t * p, char *v)
00250 {
00251     long u;
00252 
00253     if (doInt(v, "CELLSBORDER", 0, MAX_CHAR, &u))
00254         return 1;
00255     p->cb = (unsigned char) u;
00256     return 0;
00257 }
00258 
00259 static int columnsfn(htmltbl_t * p, char *v)
00260 {
00261     if (*v != '*') {
00262         agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n", v);
00263         return 1;
00264     }
00265     p->flags |= HTML_VRULE;
00266     return 0;
00267 }
00268 
00269 static int rowsfn(htmltbl_t * p, char *v)
00270 {
00271     if (*v != '*') {
00272         agerr(AGWARN, "Unknown value %s for ROWS - ignored\n", v);
00273         return 1;
00274     }
00275     p->flags |= HTML_HRULE;
00276     return 0;
00277 }
00278 
00279 static int fixedsizefn(htmldata_t * p, char *v)
00280 {
00281     int rv = 0;
00282     char c = toupper(*(unsigned char *) v);
00283     if ((c == 'T') && !strcasecmp(v + 1, "RUE"))
00284         p->flags |= FIXED_FLAG;
00285     else if ((c != 'F') || strcasecmp(v + 1, "ALSE")) {
00286         agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n", v);
00287         rv = 1;
00288     }
00289     return rv;
00290 }
00291 
00292 static int valignfn(htmldata_t * p, char *v)
00293 {
00294     int rv = 0;
00295     char c = toupper(*v);
00296     if ((c == 'B') && !strcasecmp(v + 1, "OTTOM"))
00297         p->flags |= VALIGN_BOTTOM;
00298     else if ((c == 'T') && !strcasecmp(v + 1, "OP"))
00299         p->flags |= VALIGN_TOP;
00300     else if ((c != 'M') || strcasecmp(v + 1, "IDDLE")) {
00301         agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n", v);
00302         rv = 1;
00303     }
00304     return rv;
00305 }
00306 
00307 static int halignfn(htmldata_t * p, char *v)
00308 {
00309     int rv = 0;
00310     char c = toupper(*v);
00311     if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
00312         p->flags |= HALIGN_LEFT;
00313     else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00314         p->flags |= HALIGN_RIGHT;
00315     else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) {
00316         agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
00317         rv = 1;
00318     }
00319     return rv;
00320 }
00321 
00322 static int cell_halignfn(htmldata_t * p, char *v)
00323 {
00324     int rv = 0;
00325     char c = toupper(*v);
00326     if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
00327         p->flags |= HALIGN_LEFT;
00328     else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00329         p->flags |= HALIGN_RIGHT;
00330     else if ((c == 'T') && !strcasecmp(v + 1, "EXT"))
00331         p->flags |= HALIGN_TEXT;
00332     else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
00333         rv = 1;
00334     if (rv)
00335         agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n", v);
00336     return rv;
00337 }
00338 
00339 static int balignfn(htmldata_t * p, char *v)
00340 {
00341     int rv = 0;
00342     char c = toupper(*v);
00343     if ((c == 'L') && !strcasecmp(v + 1, "EFT"))
00344         p->flags |= BALIGN_LEFT;
00345     else if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00346         p->flags |= BALIGN_RIGHT;
00347     else if ((c != 'C') || strcasecmp(v + 1, "ENTER"))
00348         rv = 1;
00349     if (rv)
00350         agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n", v);
00351     return rv;
00352 }
00353 
00354 static int heightfn(htmldata_t * p, char *v)
00355 {
00356     long u;
00357 
00358     if (doInt(v, "HEIGHT", 0, MAX_USHORT, &u))
00359         return 1;
00360     p->height = (unsigned short) u;
00361     return 0;
00362 }
00363 
00364 static int widthfn(htmldata_t * p, char *v)
00365 {
00366     long u;
00367 
00368     if (doInt(v, "WIDTH", 0, MAX_USHORT, &u))
00369         return 1;
00370     p->width = (unsigned short) u;
00371     return 0;
00372 }
00373 
00374 static int rowspanfn(htmlcell_t * p, char *v)
00375 {
00376     long u;
00377 
00378     if (doInt(v, "ROWSPAN", 0, MAX_USHORT, &u))
00379         return 1;
00380     if (u == 0) {
00381         agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n");
00382         return 1;
00383     }
00384     p->rspan = (unsigned short) u;
00385     return 0;
00386 }
00387 
00388 static int colspanfn(htmlcell_t * p, char *v)
00389 {
00390     long u;
00391 
00392     if (doInt(v, "COLSPAN", 0, MAX_USHORT, &u))
00393         return 1;
00394     if (u == 0) {
00395         agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n");
00396         return 1;
00397     }
00398     p->cspan = (unsigned short) u;
00399     return 0;
00400 }
00401 
00402 static int fontcolorfn(htmlfont_t * p, char *v)
00403 {
00404     p->color = strdup(v);
00405     return 0;
00406 }
00407 
00408 static int facefn(htmlfont_t * p, char *v)
00409 {
00410     p->name = strdup(v);
00411     return 0;
00412 }
00413 
00414 static int ptsizefn(htmlfont_t * p, char *v)
00415 {
00416     long u;
00417 
00418     if (doInt(v, "POINT-SIZE", 0, MAX_UCHAR, &u))
00419         return 1;
00420     p->size = (double) u;
00421     return 0;
00422 }
00423 
00424 static int srcfn(htmlimg_t * p, char *v)
00425 {
00426     p->src = strdup(v);
00427     return 0;
00428 }
00429 
00430 static int scalefn(htmlimg_t * p, char *v)
00431 {
00432     p->scale = strdup(v);
00433     return 0;
00434 }
00435 
00436 static int alignfn(int *p, char *v)
00437 {
00438     int rv = 0;
00439     char c = toupper(*v);
00440     if ((c == 'R') && !strcasecmp(v + 1, "IGHT"))
00441         *p = 'r';
00442     else if ((c == 'L') || !strcasecmp(v + 1, "EFT"))
00443         *p = 'l';
00444     else if ((c == 'C') || strcasecmp(v + 1, "ENTER")) 
00445         *p = 'n';
00446     else {
00447         agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v);
00448         rv = 1;
00449     }
00450     return rv;
00451 }
00452 
00453 /* Tables used in binary search; MUST be alphabetized */
00454 static attr_item tbl_items[] = {
00455     {"align", (attrFn) halignfn},
00456     {"bgcolor", (attrFn) bgcolorfn},
00457     {"border", (attrFn) borderfn},
00458     {"cellborder", (attrFn) cellborderfn},
00459     {"cellpadding", (attrFn) cellpaddingfn},
00460     {"cellspacing", (attrFn) cellspacingfn},
00461     {"color", (attrFn) pencolorfn},
00462     {"columns", (attrFn) columnsfn},
00463     {"fixedsize", (attrFn) fixedsizefn},
00464     {"gradientangle", (attrFn) gradientanglefn},
00465     {"height", (attrFn) heightfn},
00466     {"href", (attrFn) hreffn},
00467     {"id", (attrFn) idfn},
00468     {"port", (attrFn) portfn},
00469     {"rows", (attrFn) rowsfn},
00470     {"style", (attrFn) stylefn},
00471     {"target", (attrFn) targetfn},
00472     {"title", (attrFn) titlefn},
00473     {"tooltip", (attrFn) titlefn},
00474     {"valign", (attrFn) valignfn},
00475     {"width", (attrFn) widthfn},
00476 };
00477 
00478 static attr_item cell_items[] = {
00479     {"align", (attrFn) cell_halignfn},
00480     {"balign", (attrFn) balignfn},
00481     {"bgcolor", (attrFn) bgcolorfn},
00482     {"border", (attrFn) borderfn},
00483     {"cellpadding", (attrFn) cellpaddingfn},
00484     {"cellspacing", (attrFn) cellspacingfn},
00485     {"color", (attrFn) pencolorfn},
00486     {"colspan", (attrFn) colspanfn},
00487     {"fixedsize", (attrFn) fixedsizefn},
00488     {"gradientangle", (attrFn) gradientanglefn},
00489     {"height", (attrFn) heightfn},
00490     {"href", (attrFn) hreffn},
00491     {"id", (attrFn) idfn},
00492     {"port", (attrFn) portfn},
00493     {"rowspan", (attrFn) rowspanfn},
00494     {"style", (attrFn) stylefn},
00495     {"target", (attrFn) targetfn},
00496     {"title", (attrFn) titlefn},
00497     {"tooltip", (attrFn) titlefn},
00498     {"valign", (attrFn) valignfn},
00499     {"width", (attrFn) widthfn},
00500 };
00501 
00502 static attr_item font_items[] = {
00503     {"color", (attrFn) fontcolorfn},
00504     {"face", (attrFn) facefn},
00505     {"point-size", (attrFn) ptsizefn},
00506 };
00507 
00508 static attr_item img_items[] = {
00509     {"scale", (attrFn) scalefn},
00510     {"src", (attrFn) srcfn},
00511 };
00512 
00513 static attr_item br_items[] = {
00514     {"align", (attrFn) alignfn},
00515 };
00516 
00517 /* doAttrs:
00518  * General function for processing list of name/value attributes.
00519  * Do binary search on items table. If match found, invoke action
00520  * passing it tp and attribute value.
00521  * Table size is given by nel
00522  * Name/value pairs are in array atts, which is null terminated.
00523  * s is the name of the HTML element being processed.
00524  */
00525 static void
00526 doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s)
00527 {
00528     char *name;
00529     char *val;
00530     attr_item *ip;
00531     attr_item key;
00532 
00533     while ((name = *atts++) != NULL) {
00534         val = *atts++;
00535         key.name = name;
00536         ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp);
00537         if (ip)
00538             state.warn |= ip->action(tp, val);
00539         else {
00540             agerr(AGWARN, "Illegal attribute %s in %s - ignored\n", name,
00541                   s);
00542             state.warn = 1;
00543         }
00544     }
00545 }
00546 
00547 static void mkBR(char **atts)
00548 {
00549     htmllval.i = UNSET_ALIGN;
00550     doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>");
00551 }
00552 
00553 static htmlimg_t *mkImg(char **atts)
00554 {
00555     htmlimg_t *img = NEW(htmlimg_t);
00556 
00557     doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>");
00558 
00559     return img;
00560 }
00561 
00562 static htmlfont_t *mkFont(char **atts, int flags, int ul)
00563 {
00564     htmlfont_t *font = NEW(htmlfont_t);
00565 
00566     font->size = -1.0;          /* unassigned */
00567     font->flags = flags;
00568     if (atts)
00569         doAttrs(font, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>");
00570 
00571     return font;
00572 }
00573 
00574 static htmlcell_t *mkCell(char **atts)
00575 {
00576     htmlcell_t *cell = NEW(htmlcell_t);
00577 
00578     cell->cspan = 1;
00579     cell->rspan = 1;
00580     doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>");
00581 
00582     return cell;
00583 }
00584 
00585 static htmltbl_t *mkTbl(char **atts)
00586 {
00587     htmltbl_t *tbl = NEW(htmltbl_t);
00588 
00589     tbl->rc = -1;               /* flag that table is a raw, parsed table */
00590     tbl->cb = -1;               /* unset cell border attribute */
00591     doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>");
00592 
00593     return tbl;
00594 }
00595 
00596 static void startElement(void *user, const char *name, char **atts)
00597 {
00598     if (strcasecmp(name, "TABLE") == 0) {
00599         htmllval.tbl = mkTbl(atts);
00600         state.inCell = 0;
00601         state.tok = T_table;
00602     } else if ((strcasecmp(name, "TR") == 0)
00603                || (strcasecmp(name, "TH") == 0)) {
00604         state.inCell = 0;
00605         state.tok = T_row;
00606     } else if (strcasecmp(name, "TD") == 0) {
00607         state.inCell = 1;
00608         htmllval.cell = mkCell(atts);
00609         state.tok = T_cell;
00610     } else if (strcasecmp(name, "FONT") == 0) {
00611         htmllval.font = mkFont(atts, 0, 0);
00612         state.tok = T_font;
00613     } else if (strcasecmp(name, "B") == 0) {
00614         htmllval.font = mkFont(0, HTML_BF, 0);
00615         state.tok = T_bold;
00616     } else if (strcasecmp(name, "U") == 0) {
00617         htmllval.font = mkFont(0, HTML_UL, 1);
00618         state.tok = T_underline;
00619     } else if (strcasecmp(name, "I") == 0) {
00620         htmllval.font = mkFont(0, HTML_IF, 0);
00621         state.tok = T_italic;
00622     } else if (strcasecmp(name, "SUP") == 0) {
00623         htmllval.font = mkFont(0, HTML_SUP, 0);
00624         state.tok = T_sup;
00625     } else if (strcasecmp(name, "SUB") == 0) {
00626         htmllval.font = mkFont(0, HTML_SUB, 0);
00627         state.tok = T_sub;
00628     } else if (strcasecmp(name, "BR") == 0) {
00629         mkBR(atts);
00630         state.tok = T_br;
00631     } else if (strcasecmp(name, "HR") == 0) {
00632         state.tok = T_hr;
00633     } else if (strcasecmp(name, "VR") == 0) {
00634         state.tok = T_vr;
00635     } else if (strcasecmp(name, "IMG") == 0) {
00636         htmllval.img = mkImg(atts);
00637         state.tok = T_img;
00638     } else if (strcasecmp(name, "HTML") == 0) {
00639         state.tok = T_html;
00640     } else {
00641         lexerror(name);
00642     }
00643 }
00644 
00645 static void endElement(void *user, const char *name)
00646 {
00647     if (strcasecmp(name, "TABLE") == 0) {
00648         state.tok = T_end_table;
00649         state.inCell = 1;
00650     } else if ((strcasecmp(name, "TR") == 0)
00651                || (strcasecmp(name, "TH") == 0)) {
00652         state.tok = T_end_row;
00653     } else if (strcasecmp(name, "TD") == 0) {
00654         state.tok = T_end_cell;
00655         state.inCell = 0;
00656     } else if (strcasecmp(name, "HTML") == 0) {
00657         state.tok = T_end_html;
00658     } else if (strcasecmp(name, "FONT") == 0) {
00659         state.tok = T_end_font;
00660     } else if (strcasecmp(name, "B") == 0) {
00661         state.tok = T_n_bold;
00662     } else if (strcasecmp(name, "U") == 0) {
00663         state.tok = T_n_underline;
00664     } else if (strcasecmp(name, "I") == 0) {
00665         state.tok = T_n_italic;
00666     } else if (strcasecmp(name, "SUP") == 0) {
00667         state.tok = T_n_sup;
00668     } else if (strcasecmp(name, "SUB") == 0) {
00669         state.tok = T_n_sub;
00670     } else if (strcasecmp(name, "BR") == 0) {
00671         if (state.tok == T_br)
00672             state.tok = T_BR;
00673         else
00674             state.tok = T_end_br;
00675     } else if (strcasecmp(name, "HR") == 0) {
00676         if (state.tok == T_hr)
00677             state.tok = T_HR;
00678         else
00679             state.tok = T_end_hr;
00680     } else if (strcasecmp(name, "VR") == 0) {
00681         if (state.tok == T_vr)
00682             state.tok = T_VR;
00683         else
00684             state.tok = T_end_vr;
00685     } else if (strcasecmp(name, "IMG") == 0) {
00686         if (state.tok == T_img)
00687             state.tok = T_IMG;
00688         else
00689             state.tok = T_end_img;
00690     } else {
00691         lexerror(name);
00692     }
00693 }
00694 
00695 /* characterData:
00696  * Generate T_string token. Do this only when immediately in
00697  * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true.
00698  * Strip out formatting characters but keep spaces.
00699  * Distinguish between all whitespace vs. strings with non-whitespace
00700  * characters.
00701  */
00702 static void characterData(void *user, const char *s, int length)
00703 {
00704     int i, rc, cnt = 0;
00705     unsigned char c;
00706 
00707     if (state.inCell) {
00708         for (i = length; i; i--) {
00709             c = *s++;
00710             if (c >= ' ') {
00711                 cnt++;
00712                 rc = agxbputc(state.xb, c);
00713             }
00714         }
00715         if (cnt) state.tok = T_string;
00716     }
00717 }
00718 #endif
00719 
00720 int initHTMLlexer(char *src, agxbuf * xb, int charset)
00721 {
00722 #ifdef HAVE_EXPAT
00723     state.xb = xb;
00724     agxbinit (&state.lb, SMALLBUF, NULL);
00725     state.ptr = src;
00726     state.mode = 0;
00727     state.warn = 0;
00728     state.error = 0;
00729     state.currtoklen = 0;
00730     state.prevtoklen = 0;
00731     state.inCell = 1;
00732     state.parser = XML_ParserCreate(charsetToStr(charset));
00733     XML_SetElementHandler(state.parser,
00734                           (XML_StartElementHandler) startElement,
00735                           endElement);
00736     XML_SetCharacterDataHandler(state.parser, characterData);
00737     return 0;
00738 #else
00739     static int first;
00740     if (!first) {
00741         agerr(AGWARN,
00742               "Not built with libexpat. Table formatting is not available.\n");
00743         first++;
00744     }
00745     return 1;
00746 #endif
00747 }
00748 
/* clearHTMLlexer:
 * Release the parser and the translation buffer.
 * Returns the or of the warn and error flags, so nonzero means a
 * warning or error was seen. Without expat, always returns 1.
 */
int clearHTMLlexer()
{
#ifdef HAVE_EXPAT
    int status;

    status = state.warn | state.error;
    XML_ParserFree(state.parser);
    agxbfree(&state.lb);
    return status;
#else
    return 1;
#endif
}
00760 
00761 #ifdef HAVE_EXPAT
00762 /* eatComment:
00763  * Given first character after open comment, eat characters
00764  * upto comment close, returning pointer to closing > if it exists,
00765  * or null character otherwise.
00766  * We rely on HTML strings having matched nested <>.
00767  */
00768 static char *eatComment(char *p)
00769 {
00770     int depth = 1;
00771     char *s = p;
00772     char c;
00773 
00774     while (depth && (c = *s++)) {
00775         if (c == '<')
00776             depth++;
00777         else if (c == '>')
00778             depth--;
00779     }
00780     s--;                        /* move back to '\0' or '>' */
00781     if (*s) {
00782         char *t = s - 2;
00783         if ((t < p) || strncmp(t, "--", 2)) {
00784             agerr(AGWARN, "Unclosed comment\n");
00785             state.warn = 1;
00786         }
00787     }
00788     return s;
00789 }
00790 
00791 /* findNext:
00792  * Return next XML unit. This is either <..>, an HTML 
00793  * comment <!-- ... -->, or characters up to next <.
00794  */
00795 static char *findNext(char *s, agxbuf* xb)
00796 {
00797     char* t = s + 1;
00798     char c;
00799     int rc;
00800 
00801     if (*s == '<') {
00802         if ((*t == '!') && !strncmp(t + 1, "--", 2))
00803             t = eatComment(t + 3);
00804         else
00805             while (*t && (*t != '>'))
00806                 t++;
00807         if (*t != '>') {
00808             agerr(AGWARN, "Label closed before end of HTML element\n");
00809             state.warn = 1;
00810         } else
00811             t++;
00812     } else {
00813         t = s;
00814         while ((c = *t) && (c != '<')) {
00815             if ((c == '&') && (*(t+1) != '#')) {
00816                 t = scanEntity(t + 1, xb);
00817             }
00818             else {
00819                 rc = agxbputc(xb, c);
00820                 t++;
00821             }
00822         }
00823     }
00824     return t;
00825 }
00826 #endif
00827 
/* htmllineno:
 * Return the line number the XML parser is currently at,
 * or 0 when built without expat.
 */
int htmllineno()
{
#ifndef HAVE_EXPAT
    return 0;
#else
    return XML_GetCurrentLineNumber(state.parser);
#endif
}
00836 
00837 #ifdef DEBUG
00838 static void printTok(int tok)
00839 {
00840     char *s;
00841 
00842     switch (tok) {
00843     case T_VR:
00844         s = "T_VR";
00845         break;
00846     case T_vr:
00847         s = "T_vr";
00848         break;
00849     case T_end_vr:
00850         s = "T_end_vr";
00851         break;
00852     case T_HR:
00853         s = "T_HR";
00854         break;
00855     case T_hr:
00856         s = "T_hr";
00857         break;
00858     case T_end_hr:
00859         s = "T_end_hr";
00860         break;
00861     case T_BR:
00862         s = "T_BR";
00863         break;
00864     case T_br:
00865         s = "T_br";
00866         break;
00867     case T_end_br:
00868         s = "T_end_br";
00869         break;
00870     case T_end_table:
00871         s = "T_end_table";
00872         break;
00873     case T_row:
00874         s = "T_row";
00875         break;
00876     case T_end_row:
00877         s = "T_end_row";
00878         break;
00879     case T_end_cell:
00880         s = "T_end_cell";
00881         break;
00882     case T_html:
00883         s = "T_html";
00884         break;
00885     case T_end_html:
00886         s = "T_end_html";
00887         break;
00888     case T_string:
00889         s = "T_string";
00890         break;
00891     case T_error:
00892         s = "T_error";
00893         break;
00894     case T_table:
00895         s = "T_table";
00896         break;
00897     case T_cell:
00898         s = "T_cell";
00899         break;
00900     case T_img:
00901         s = "T_img";
00902         break;
00903     case T_end_img:
00904         s = "T_end_img";
00905         break;
00906     case T_IMG:
00907         s = "T_IMG";
00908         break;
00909     case T_underline:
00910         s = "T_underline";
00911         break;
00912     case T_n_underline:
00913         s = "T_underline";
00914         break;
00915     case T_italic:
00916         s = "T_italic";
00917         break;
00918     case T_n_italic:
00919         s = "T_italic";
00920         break;
00921     case T_bold:
00922         s = "T_bold";
00923         break;
00924     case T_n_bold:
00925         s = "T_bold";
00926         break;
00927     default:
00928         s = "<unknown>";
00929     }
00930     if (tok == T_string) {
00931         fprintf(stderr, "%s \"", s);
00932         fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr);
00933         fprintf(stderr, "\"\n");
00934     } else
00935         fprintf(stderr, "%s\n", s);
00936 }
00937 
00938 #endif
00939 
/* htmllex:
 * Lexer entry point: return the next token, or EOF once the input and
 * the artificial closing tag have been consumed (always EOF when built
 * without expat).
 *
 * The input is fed to the XML parser one unit at a time, and the
 * parser's handlers set state.tok; units are fed until a token has
 * been produced. state.mode tracks the artificial wrapper:
 *   0 - nothing fed yet; feed "<HTML>"
 *   1 - feeding units of the real input found by findNext
 *   2 - "</HTML>" has been fed; report EOF
 * The text of the current and previous unit is remembered for error
 * reporting. A parse failure is reported once and yields T_error.
 */
int htmllex()
{
#ifdef HAVE_EXPAT
    static char *begin_html = "<HTML>";
    static char *end_html = "</HTML>";

    char *unit;
    char *next = 0;
    int len, llen;
    int rv;

    state.tok = 0;
    for (;;) {
        switch (state.mode) {
        case 2:
            return EOF;
        case 0:
            state.mode = 1;
            unit = begin_html;
            len = strlen(unit);
            next = 0;
            break;
        default:
            unit = state.ptr;
            if (*unit != '\0') {
                next = findNext(unit, &state.lb);
                len = next - unit;
            } else {
                /* real input exhausted: close the artificial wrapper */
                state.mode = 2;
                unit = end_html;
                len = strlen(unit);
            }
            break;
        }

        state.prevtok = state.currtok;
        state.prevtoklen = state.currtoklen;
        state.currtok = unit;
        state.currtoklen = len;

        /* text translated into state.lb takes the place of the raw unit */
        llen = agxblen(&state.lb);
        if (llen != 0) {
            char *text = agxbuse(&state.lb);

            rv = XML_Parse(state.parser, text, llen, 0);
        } else
            rv = XML_Parse(state.parser, unit, len, (len == 0));

        if ((rv == XML_STATUS_ERROR) && !state.error) {
            agerr(AGERR, "%s in line %d \n",
                  XML_ErrorString(XML_GetErrorCode(state.parser)),
                  htmllineno());
            error_context();
            state.error = 1;
            state.tok = T_error;
        }
        if (next != 0)
            state.ptr = next;

        if (state.tok != 0)
            break;
    }
    /* printTok (state.tok); */
    return state.tok;
#else
    return EOF;
#endif
}
00998