|
Graphviz
2.29.20120524.0446
|
00001 /* $Id$ $Revision$ */ 00002 /* vim:set shiftwidth=4 ts=8: */ 00003 00004 /************************************************************************* 00005 * Copyright (c) 2011 AT&T Intellectual Property 00006 * All rights reserved. This program and the accompanying materials 00007 * are made available under the terms of the Eclipse Public License v1.0 00008 * which accompanies this distribution, and is available at 00009 * http://www.eclipse.org/legal/epl-v10.html 00010 * 00011 * Contributors: See CVS logs. Details at http://www.graphviz.org/ 00012 *************************************************************************/ 00013 00014 00015 #include "render.h" 00016 #include "htmltable.h" 00017 #include "htmlparse.h" 00018 #include "htmllex.h" 00019 #include <ctype.h> 00020 00021 #ifdef HAVE_EXPAT 00022 #include <expat.h> 00023 #endif 00024 00025 #ifndef XML_STATUS_ERROR 00026 #define XML_STATUS_ERROR 0 00027 #endif 00028 00029 typedef struct { 00030 #ifdef HAVE_EXPAT 00031 XML_Parser parser; 00032 #endif 00033 char* ptr; /* input source */ 00034 int tok; /* token type */ 00035 agxbuf* xb; /* buffer to gather T_string data */ 00036 agxbuf lb; /* buffer for translating lexical data */ 00037 char warn; /* set if warning given */ 00038 char error; /* set if error given */ 00039 char inCell; /* set if in TD to allow T_string */ 00040 char mode; /* for handling artificial <HTML>..</HTML> */ 00041 char *currtok; /* for error reporting */ 00042 char *prevtok; /* for error reporting */ 00043 int currtoklen; 00044 int prevtoklen; 00045 } lexstate_t; 00046 static lexstate_t state; 00047 00048 /* error_context: 00049 * Print the last 2 "token"s seen. 00050 */ 00051 static void error_context(void) 00052 { 00053 agxbclear(state.xb); 00054 if (state.prevtoklen > 0) 00055 agxbput_n(state.xb, state.prevtok, state.prevtoklen); 00056 agxbput_n(state.xb, state.currtok, state.currtoklen); 00057 agerr(AGPREV, "... %s ...\n", agxbuse(state.xb)); 00058 } 00059 00060 /* htmlerror: 00061 * yyerror - called by yacc output 00062 */ 00063 void htmlerror(const char *msg) 00064 { 00065 if (state.error) 00066 return; 00067 state.error = 1; 00068 agerr(AGERR, "%s in line %d \n", msg, htmllineno()); 00069 error_context(); 00070 } 00071 00072 #ifdef HAVE_EXPAT 00073 /* lexerror: 00074 * called by lexer when unknown <..> is found. 00075 */ 00076 static void lexerror(const char *name) 00077 { 00078 state.tok = T_error; 00079 state.error = 1; 00080 agerr(AGERR, "Unknown HTML element <%s> on line %d \n", 00081 name, htmllineno()); 00082 } 00083 00084 typedef int (*attrFn) (void *, char *); 00085 typedef int (*bcmpfn) (const void *, const void *); 00086 00087 #define MAX_CHAR (((unsigned char)(~0)) >> 1) 00088 #define MIN_CHAR ((signed char)(~MAX_CHAR)) 00089 #define MAX_UCHAR ((unsigned char)(~0)) 00090 #define MAX_USHORT ((unsigned short)(~0)) 00091 00092 /* Mechanism for automatically processing attributes */ 00093 typedef struct { 00094 char *name; /* attribute name */ 00095 attrFn action; /* action to perform if name matches */ 00096 } attr_item; 00097 00098 #define ISIZE (sizeof(attr_item)) 00099 00100 /* icmp: 00101 * Compare two attr_item. Used in bsearch 00102 */ 00103 static int icmp(attr_item * i, attr_item * j) 00104 { 00105 return strcasecmp(i->name, j->name); 00106 } 00107 00108 static int bgcolorfn(htmldata_t * p, char *v) 00109 { 00110 p->bgcolor = strdup(v); 00111 return 0; 00112 } 00113 00114 static int pencolorfn(htmldata_t * p, char *v) 00115 { 00116 p->pencolor = strdup(v); 00117 return 0; 00118 } 00119 00120 static int hreffn(htmldata_t * p, char *v) 00121 { 00122 p->href = strdup(v); 00123 return 0; 00124 } 00125 00126 static int titlefn(htmldata_t * p, char *v) 00127 { 00128 p->title = strdup(v); 00129 return 0; 00130 } 00131 00132 static int portfn(htmldata_t * p, char *v) 00133 { 00134 p->port = strdup(v); 00135 return 0; 00136 } 00137 00138 #define DELIM " ," 00139 00140 static int stylefn(htmldata_t * p, char *v) 00141 { 00142 int rv = 0; 00143 char c; 00144 char* tk; 00145 char* buf = strdup (v); 00146 for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) { 00147 c = toupper(*tk); 00148 if (c == 'R') { 00149 if (!strcasecmp(tk + 1, "OUNDED")) p->style |= ROUNDED; 00150 else if (!strcasecmp(tk + 1, "ADIAL")) p->style |= RADIAL; 00151 else { 00152 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk); 00153 rv = 1; 00154 } 00155 } 00156 else { 00157 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk); 00158 rv = 1; 00159 } 00160 } 00161 free (buf); 00162 return rv; 00163 } 00164 00165 static int targetfn(htmldata_t * p, char *v) 00166 { 00167 p->target = strdup(v); 00168 return 0; 00169 } 00170 00171 static int idfn(htmldata_t * p, char *v) 00172 { 00173 p->id = strdup(v); 00174 return 0; 00175 } 00176 00177 00178 /* doInt: 00179 * Scan v for integral value. Check that 00180 * the value is >= min and <= max. Return value in ul. 00181 * String s is name of value. 00182 * Return 0 if okay; 1 otherwise. 00183 */ 00184 static int doInt(char *v, char *s, int min, int max, long *ul) 00185 { 00186 int rv = 0; 00187 char *ep; 00188 long b = strtol(v, &ep, 10); 00189 00190 if (ep == v) { 00191 agerr(AGWARN, "Improper %s value %s - ignored", s, v); 00192 rv = 1; 00193 } else if (b > max) { 00194 agerr(AGWARN, "%s value %s > %d - too large - ignored", s, v, max); 00195 rv = 1; 00196 } else if (b < min) { 00197 agerr(AGWARN, "%s value %s < %d - too small - ignored", s, v, min); 00198 rv = 1; 00199 } else 00200 *ul = b; 00201 return rv; 00202 } 00203 00204 00205 static int gradientanglefn(htmldata_t * p, char *v) 00206 { 00207 long u; 00208 00209 if (doInt(v, "GRADIENTANGLE", 0, 360, &u)) 00210 return 1; 00211 p->gradientangle = (unsigned short) u; 00212 return 0; 00213 } 00214 00215 00216 static int borderfn(htmldata_t * p, char *v) 00217 { 00218 long u; 00219 00220 if (doInt(v, "BORDER", 0, MAX_UCHAR, &u)) 00221 return 1; 00222 p->border = (unsigned char) u; 00223 p->flags |= BORDER_SET; 00224 return 0; 00225 } 00226 00227 static int cellpaddingfn(htmldata_t * p, char *v) 00228 { 00229 long u; 00230 00231 if (doInt(v, "CELLPADDING", 0, MAX_UCHAR, &u)) 00232 return 1; 00233 p->pad = (unsigned char) u; 00234 p->flags |= PAD_SET; 00235 return 0; 00236 } 00237 00238 static int cellspacingfn(htmldata_t * p, char *v) 00239 { 00240 long u; 00241 00242 if (doInt(v, "CELLSPACING", MIN_CHAR, MAX_CHAR, &u)) 00243 return 1; 00244 p->space = (signed char) u; 00245 p->flags |= SPACE_SET; 00246 return 0; 00247 } 00248 00249 static int cellborderfn(htmltbl_t * p, char *v) 00250 { 00251 long u; 00252 00253 if (doInt(v, "CELLSBORDER", 0, MAX_CHAR, &u)) 00254 return 1; 00255 p->cb = (unsigned char) u; 00256 return 0; 00257 } 00258 00259 static int columnsfn(htmltbl_t * p, char *v) 00260 { 00261 if (*v != '*') { 00262 agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n", v); 00263 return 1; 00264 } 00265 p->flags |= HTML_VRULE; 00266 return 0; 00267 } 00268 00269 static int rowsfn(htmltbl_t * p, char *v) 00270 { 00271 if (*v != '*') { 00272 agerr(AGWARN, "Unknown value %s for ROWS - ignored\n", v); 00273 return 1; 00274 } 00275 p->flags |= HTML_HRULE; 00276 return 0; 00277 } 00278 00279 static int fixedsizefn(htmldata_t * p, char *v) 00280 { 00281 int rv = 0; 00282 char c = toupper(*(unsigned char *) v); 00283 if ((c == 'T') && !strcasecmp(v + 1, "RUE")) 00284 p->flags |= FIXED_FLAG; 00285 else if ((c != 'F') || strcasecmp(v + 1, "ALSE")) { 00286 agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n", v); 00287 rv = 1; 00288 } 00289 return rv; 00290 } 00291 00292 static int valignfn(htmldata_t * p, char *v) 00293 { 00294 int rv = 0; 00295 char c = toupper(*v); 00296 if ((c == 'B') && !strcasecmp(v + 1, "OTTOM")) 00297 p->flags |= VALIGN_BOTTOM; 00298 else if ((c == 'T') && !strcasecmp(v + 1, "OP")) 00299 p->flags |= VALIGN_TOP; 00300 else if ((c != 'M') || strcasecmp(v + 1, "IDDLE")) { 00301 agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n", v); 00302 rv = 1; 00303 } 00304 return rv; 00305 } 00306 00307 static int halignfn(htmldata_t * p, char *v) 00308 { 00309 int rv = 0; 00310 char c = toupper(*v); 00311 if ((c == 'L') && !strcasecmp(v + 1, "EFT")) 00312 p->flags |= HALIGN_LEFT; 00313 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00314 p->flags |= HALIGN_RIGHT; 00315 else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) { 00316 agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v); 00317 rv = 1; 00318 } 00319 return rv; 00320 } 00321 00322 static int cell_halignfn(htmldata_t * p, char *v) 00323 { 00324 int rv = 0; 00325 char c = toupper(*v); 00326 if ((c == 'L') && !strcasecmp(v + 1, "EFT")) 00327 p->flags |= HALIGN_LEFT; 00328 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00329 p->flags |= HALIGN_RIGHT; 00330 else if ((c == 'T') && !strcasecmp(v + 1, "EXT")) 00331 p->flags |= HALIGN_TEXT; 00332 else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) 00333 rv = 1; 00334 if (rv) 00335 agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n", v); 00336 return rv; 00337 } 00338 00339 static int balignfn(htmldata_t * p, char *v) 00340 { 00341 int rv = 0; 00342 char c = toupper(*v); 00343 if ((c == 'L') && !strcasecmp(v + 1, "EFT")) 00344 p->flags |= BALIGN_LEFT; 00345 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00346 p->flags |= BALIGN_RIGHT; 00347 else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) 00348 rv = 1; 00349 if (rv) 00350 agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n", v); 00351 return rv; 00352 } 00353 00354 static int heightfn(htmldata_t * p, char *v) 00355 { 00356 long u; 00357 00358 if (doInt(v, "HEIGHT", 0, MAX_USHORT, &u)) 00359 return 1; 00360 p->height = (unsigned short) u; 00361 return 0; 00362 } 00363 00364 static int widthfn(htmldata_t * p, char *v) 00365 { 00366 long u; 00367 00368 if (doInt(v, "WIDTH", 0, MAX_USHORT, &u)) 00369 return 1; 00370 p->width = (unsigned short) u; 00371 return 0; 00372 } 00373 00374 static int rowspanfn(htmlcell_t * p, char *v) 00375 { 00376 long u; 00377 00378 if (doInt(v, "ROWSPAN", 0, MAX_USHORT, &u)) 00379 return 1; 00380 if (u == 0) { 00381 agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n"); 00382 return 1; 00383 } 00384 p->rspan = (unsigned short) u; 00385 return 0; 00386 } 00387 00388 static int colspanfn(htmlcell_t * p, char *v) 00389 { 00390 long u; 00391 00392 if (doInt(v, "COLSPAN", 0, MAX_USHORT, &u)) 00393 return 1; 00394 if (u == 0) { 00395 agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n"); 00396 return 1; 00397 } 00398 p->cspan = (unsigned short) u; 00399 return 0; 00400 } 00401 00402 static int fontcolorfn(htmlfont_t * p, char *v) 00403 { 00404 p->color = strdup(v); 00405 return 0; 00406 } 00407 00408 static int facefn(htmlfont_t * p, char *v) 00409 { 00410 p->name = strdup(v); 00411 return 0; 00412 } 00413 00414 static int ptsizefn(htmlfont_t * p, char *v) 00415 { 00416 long u; 00417 00418 if (doInt(v, "POINT-SIZE", 0, MAX_UCHAR, &u)) 00419 return 1; 00420 p->size = (double) u; 00421 return 0; 00422 } 00423 00424 static int srcfn(htmlimg_t * p, char *v) 00425 { 00426 p->src = strdup(v); 00427 return 0; 00428 } 00429 00430 static int scalefn(htmlimg_t * p, char *v) 00431 { 00432 p->scale = strdup(v); 00433 return 0; 00434 } 00435 00436 static int alignfn(int *p, char *v) 00437 { 00438 int rv = 0; 00439 char c = toupper(*v); 00440 if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00441 *p = 'r'; 00442 else if ((c == 'L') || !strcasecmp(v + 1, "EFT")) 00443 *p = 'l'; 00444 else if ((c == 'C') || strcasecmp(v + 1, "ENTER")) 00445 *p = 'n'; 00446 else { 00447 agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v); 00448 rv = 1; 00449 } 00450 return rv; 00451 } 00452 00453 /* Tables used in binary search; MUST be alphabetized */ 00454 static attr_item tbl_items[] = { 00455 {"align", (attrFn) halignfn}, 00456 {"bgcolor", (attrFn) bgcolorfn}, 00457 {"border", (attrFn) borderfn}, 00458 {"cellborder", (attrFn) cellborderfn}, 00459 {"cellpadding", (attrFn) cellpaddingfn}, 00460 {"cellspacing", (attrFn) cellspacingfn}, 00461 {"color", (attrFn) pencolorfn}, 00462 {"columns", (attrFn) columnsfn}, 00463 {"fixedsize", (attrFn) fixedsizefn}, 00464 {"gradientangle", (attrFn) gradientanglefn}, 00465 {"height", (attrFn) heightfn}, 00466 {"href", (attrFn) hreffn}, 00467 {"id", (attrFn) idfn}, 00468 {"port", (attrFn) portfn}, 00469 {"rows", (attrFn) rowsfn}, 00470 {"style", (attrFn) stylefn}, 00471 {"target", (attrFn) targetfn}, 00472 {"title", (attrFn) titlefn}, 00473 {"tooltip", (attrFn) titlefn}, 00474 {"valign", (attrFn) valignfn}, 00475 {"width", (attrFn) widthfn}, 00476 }; 00477 00478 static attr_item cell_items[] = { 00479 {"align", (attrFn) cell_halignfn}, 00480 {"balign", (attrFn) balignfn}, 00481 {"bgcolor", (attrFn) bgcolorfn}, 00482 {"border", (attrFn) borderfn}, 00483 {"cellpadding", (attrFn) cellpaddingfn}, 00484 {"cellspacing", (attrFn) cellspacingfn}, 00485 {"color", (attrFn) pencolorfn}, 00486 {"colspan", (attrFn) colspanfn}, 00487 {"fixedsize", (attrFn) fixedsizefn}, 00488 {"gradientangle", (attrFn) gradientanglefn}, 00489 {"height", (attrFn) heightfn}, 00490 {"href", (attrFn) hreffn}, 00491 {"id", (attrFn) idfn}, 00492 {"port", (attrFn) portfn}, 00493 {"rowspan", (attrFn) rowspanfn}, 00494 {"style", (attrFn) stylefn}, 00495 {"target", (attrFn) targetfn}, 00496 {"title", (attrFn) titlefn}, 00497 {"tooltip", (attrFn) titlefn}, 00498 {"valign", (attrFn) valignfn}, 00499 {"width", (attrFn) widthfn}, 00500 }; 00501 00502 static attr_item font_items[] = { 00503 {"color", (attrFn) fontcolorfn}, 00504 {"face", (attrFn) facefn}, 00505 {"point-size", (attrFn) ptsizefn}, 00506 }; 00507 00508 static attr_item img_items[] = { 00509 {"scale", (attrFn) scalefn}, 00510 {"src", (attrFn) srcfn}, 00511 }; 00512 00513 static attr_item br_items[] = { 00514 {"align", (attrFn) alignfn}, 00515 }; 00516 00517 /* doAttrs: 00518 * General function for processing list of name/value attributes. 00519 * Do binary search on items table. If match found, invoke action 00520 * passing it tp and attribute value. 00521 * Table size is given by nel 00522 * Name/value pairs are in array atts, which is null terminated. 00523 * s is the name of the HTML element being processed. 00524 */ 00525 static void 00526 doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s) 00527 { 00528 char *name; 00529 char *val; 00530 attr_item *ip; 00531 attr_item key; 00532 00533 while ((name = *atts++) != NULL) { 00534 val = *atts++; 00535 key.name = name; 00536 ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp); 00537 if (ip) 00538 state.warn |= ip->action(tp, val); 00539 else { 00540 agerr(AGWARN, "Illegal attribute %s in %s - ignored\n", name, 00541 s); 00542 state.warn = 1; 00543 } 00544 } 00545 } 00546 00547 static void mkBR(char **atts) 00548 { 00549 htmllval.i = UNSET_ALIGN; 00550 doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>"); 00551 } 00552 00553 static htmlimg_t *mkImg(char **atts) 00554 { 00555 htmlimg_t *img = NEW(htmlimg_t); 00556 00557 doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>"); 00558 00559 return img; 00560 } 00561 00562 static htmlfont_t *mkFont(char **atts, int flags, int ul) 00563 { 00564 htmlfont_t *font = NEW(htmlfont_t); 00565 00566 font->size = -1.0; /* unassigned */ 00567 font->flags = flags; 00568 if (atts) 00569 doAttrs(font, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>"); 00570 00571 return font; 00572 } 00573 00574 static htmlcell_t *mkCell(char **atts) 00575 { 00576 htmlcell_t *cell = NEW(htmlcell_t); 00577 00578 cell->cspan = 1; 00579 cell->rspan = 1; 00580 doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>"); 00581 00582 return cell; 00583 } 00584 00585 static htmltbl_t *mkTbl(char **atts) 00586 { 00587 htmltbl_t *tbl = NEW(htmltbl_t); 00588 00589 tbl->rc = -1; /* flag that table is a raw, parsed table */ 00590 tbl->cb = -1; /* unset cell border attribute */ 00591 doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>"); 00592 00593 return tbl; 00594 } 00595 00596 static void startElement(void *user, const char *name, char **atts) 00597 { 00598 if (strcasecmp(name, "TABLE") == 0) { 00599 htmllval.tbl = mkTbl(atts); 00600 state.inCell = 0; 00601 state.tok = T_table; 00602 } else if ((strcasecmp(name, "TR") == 0) 00603 || (strcasecmp(name, "TH") == 0)) { 00604 state.inCell = 0; 00605 state.tok = T_row; 00606 } else if (strcasecmp(name, "TD") == 0) { 00607 state.inCell = 1; 00608 htmllval.cell = mkCell(atts); 00609 state.tok = T_cell; 00610 } else if (strcasecmp(name, "FONT") == 0) { 00611 htmllval.font = mkFont(atts, 0, 0); 00612 state.tok = T_font; 00613 } else if (strcasecmp(name, "B") == 0) { 00614 htmllval.font = mkFont(0, HTML_BF, 0); 00615 state.tok = T_bold; 00616 } else if (strcasecmp(name, "U") == 0) { 00617 htmllval.font = mkFont(0, HTML_UL, 1); 00618 state.tok = T_underline; 00619 } else if (strcasecmp(name, "I") == 0) { 00620 htmllval.font = mkFont(0, HTML_IF, 0); 00621 state.tok = T_italic; 00622 } else if (strcasecmp(name, "SUP") == 0) { 00623 htmllval.font = mkFont(0, HTML_SUP, 0); 00624 state.tok = T_sup; 00625 } else if (strcasecmp(name, "SUB") == 0) { 00626 htmllval.font = mkFont(0, HTML_SUB, 0); 00627 state.tok = T_sub; 00628 } else if (strcasecmp(name, "BR") == 0) { 00629 mkBR(atts); 00630 state.tok = T_br; 00631 } else if (strcasecmp(name, "HR") == 0) { 00632 state.tok = T_hr; 00633 } else if (strcasecmp(name, "VR") == 0) { 00634 state.tok = T_vr; 00635 } else if (strcasecmp(name, "IMG") == 0) { 00636 htmllval.img = mkImg(atts); 00637 state.tok = T_img; 00638 } else if (strcasecmp(name, "HTML") == 0) { 00639 state.tok = T_html; 00640 } else { 00641 lexerror(name); 00642 } 00643 } 00644 00645 static void endElement(void *user, const char *name) 00646 { 00647 if (strcasecmp(name, "TABLE") == 0) { 00648 state.tok = T_end_table; 00649 state.inCell = 1; 00650 } else if ((strcasecmp(name, "TR") == 0) 00651 || (strcasecmp(name, "TH") == 0)) { 00652 state.tok = T_end_row; 00653 } else if (strcasecmp(name, "TD") == 0) { 00654 state.tok = T_end_cell; 00655 state.inCell = 0; 00656 } else if (strcasecmp(name, "HTML") == 0) { 00657 state.tok = T_end_html; 00658 } else if (strcasecmp(name, "FONT") == 0) { 00659 state.tok = T_end_font; 00660 } else if (strcasecmp(name, "B") == 0) { 00661 state.tok = T_n_bold; 00662 } else if (strcasecmp(name, "U") == 0) { 00663 state.tok = T_n_underline; 00664 } else if (strcasecmp(name, "I") == 0) { 00665 state.tok = T_n_italic; 00666 } else if (strcasecmp(name, "SUP") == 0) { 00667 state.tok = T_n_sup; 00668 } else if (strcasecmp(name, "SUB") == 0) { 00669 state.tok = T_n_sub; 00670 } else if (strcasecmp(name, "BR") == 0) { 00671 if (state.tok == T_br) 00672 state.tok = T_BR; 00673 else 00674 state.tok = T_end_br; 00675 } else if (strcasecmp(name, "HR") == 0) { 00676 if (state.tok == T_hr) 00677 state.tok = T_HR; 00678 else 00679 state.tok = T_end_hr; 00680 } else if (strcasecmp(name, "VR") == 0) { 00681 if (state.tok == T_vr) 00682 state.tok = T_VR; 00683 else 00684 state.tok = T_end_vr; 00685 } else if (strcasecmp(name, "IMG") == 0) { 00686 if (state.tok == T_img) 00687 state.tok = T_IMG; 00688 else 00689 state.tok = T_end_img; 00690 } else { 00691 lexerror(name); 00692 } 00693 } 00694 00695 /* characterData: 00696 * Generate T_string token. Do this only when immediately in 00697 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true. 00698 * Strip out formatting characters but keep spaces. 00699 * Distinguish between all whitespace vs. strings with non-whitespace 00700 * characters. 00701 */ 00702 static void characterData(void *user, const char *s, int length) 00703 { 00704 int i, rc, cnt = 0; 00705 unsigned char c; 00706 00707 if (state.inCell) { 00708 for (i = length; i; i--) { 00709 c = *s++; 00710 if (c >= ' ') { 00711 cnt++; 00712 rc = agxbputc(state.xb, c); 00713 } 00714 } 00715 if (cnt) state.tok = T_string; 00716 } 00717 } 00718 #endif 00719 00720 int initHTMLlexer(char *src, agxbuf * xb, int charset) 00721 { 00722 #ifdef HAVE_EXPAT 00723 state.xb = xb; 00724 agxbinit (&state.lb, SMALLBUF, NULL); 00725 state.ptr = src; 00726 state.mode = 0; 00727 state.warn = 0; 00728 state.error = 0; 00729 state.currtoklen = 0; 00730 state.prevtoklen = 0; 00731 state.inCell = 1; 00732 state.parser = XML_ParserCreate(charsetToStr(charset)); 00733 XML_SetElementHandler(state.parser, 00734 (XML_StartElementHandler) startElement, 00735 endElement); 00736 XML_SetCharacterDataHandler(state.parser, characterData); 00737 return 0; 00738 #else 00739 static int first; 00740 if (!first) { 00741 agerr(AGWARN, 00742 "Not built with libexpat. Table formatting is not available.\n"); 00743 first++; 00744 } 00745 return 1; 00746 #endif 00747 } 00748 00749 int clearHTMLlexer() 00750 { 00751 #ifdef HAVE_EXPAT 00752 int rv = state.warn | state.error; 00753 XML_ParserFree(state.parser); 00754 agxbfree (&state.lb); 00755 return rv; 00756 #else 00757 return 1; 00758 #endif 00759 } 00760 00761 #ifdef HAVE_EXPAT 00762 /* eatComment: 00763 * Given first character after open comment, eat characters 00764 * upto comment close, returning pointer to closing > if it exists, 00765 * or null character otherwise. 00766 * We rely on HTML strings having matched nested <>. 00767 */ 00768 static char *eatComment(char *p) 00769 { 00770 int depth = 1; 00771 char *s = p; 00772 char c; 00773 00774 while (depth && (c = *s++)) { 00775 if (c == '<') 00776 depth++; 00777 else if (c == '>') 00778 depth--; 00779 } 00780 s--; /* move back to '\0' or '>' */ 00781 if (*s) { 00782 char *t = s - 2; 00783 if ((t < p) || strncmp(t, "--", 2)) { 00784 agerr(AGWARN, "Unclosed comment\n"); 00785 state.warn = 1; 00786 } 00787 } 00788 return s; 00789 } 00790 00791 /* findNext: 00792 * Return next XML unit. This is either <..>, an HTML 00793 * comment <!-- ... -->, or characters up to next <. 00794 */ 00795 static char *findNext(char *s, agxbuf* xb) 00796 { 00797 char* t = s + 1; 00798 char c; 00799 int rc; 00800 00801 if (*s == '<') { 00802 if ((*t == '!') && !strncmp(t + 1, "--", 2)) 00803 t = eatComment(t + 3); 00804 else 00805 while (*t && (*t != '>')) 00806 t++; 00807 if (*t != '>') { 00808 agerr(AGWARN, "Label closed before end of HTML element\n"); 00809 state.warn = 1; 00810 } else 00811 t++; 00812 } else { 00813 t = s; 00814 while ((c = *t) && (c != '<')) { 00815 if ((c == '&') && (*(t+1) != '#')) { 00816 t = scanEntity(t + 1, xb); 00817 } 00818 else { 00819 rc = agxbputc(xb, c); 00820 t++; 00821 } 00822 } 00823 } 00824 return t; 00825 } 00826 #endif 00827 00828 int htmllineno() 00829 { 00830 #ifdef HAVE_EXPAT 00831 return XML_GetCurrentLineNumber(state.parser); 00832 #else 00833 return 0; 00834 #endif 00835 } 00836 00837 #ifdef DEBUG 00838 static void printTok(int tok) 00839 { 00840 char *s; 00841 00842 switch (tok) { 00843 case T_VR: 00844 s = "T_VR"; 00845 break; 00846 case T_vr: 00847 s = "T_vr"; 00848 break; 00849 case T_end_vr: 00850 s = "T_end_vr"; 00851 break; 00852 case T_HR: 00853 s = "T_HR"; 00854 break; 00855 case T_hr: 00856 s = "T_hr"; 00857 break; 00858 case T_end_hr: 00859 s = "T_end_hr"; 00860 break; 00861 case T_BR: 00862 s = "T_BR"; 00863 break; 00864 case T_br: 00865 s = "T_br"; 00866 break; 00867 case T_end_br: 00868 s = "T_end_br"; 00869 break; 00870 case T_end_table: 00871 s = "T_end_table"; 00872 break; 00873 case T_row: 00874 s = "T_row"; 00875 break; 00876 case T_end_row: 00877 s = "T_end_row"; 00878 break; 00879 case T_end_cell: 00880 s = "T_end_cell"; 00881 break; 00882 case T_html: 00883 s = "T_html"; 00884 break; 00885 case T_end_html: 00886 s = "T_end_html"; 00887 break; 00888 case T_string: 00889 s = "T_string"; 00890 break; 00891 case T_error: 00892 s = "T_error"; 00893 break; 00894 case T_table: 00895 s = "T_table"; 00896 break; 00897 case T_cell: 00898 s = "T_cell"; 00899 break; 00900 case T_img: 00901 s = "T_img"; 00902 break; 00903 case T_end_img: 00904 s = "T_end_img"; 00905 break; 00906 case T_IMG: 00907 s = "T_IMG"; 00908 break; 00909 case T_underline: 00910 s = "T_underline"; 00911 break; 00912 case T_n_underline: 00913 s = "T_underline"; 00914 break; 00915 case T_italic: 00916 s = "T_italic"; 00917 break; 00918 case T_n_italic: 00919 s = "T_italic"; 00920 break; 00921 case T_bold: 00922 s = "T_bold"; 00923 break; 00924 case T_n_bold: 00925 s = "T_bold"; 00926 break; 00927 default: 00928 s = "<unknown>"; 00929 } 00930 if (tok == T_string) { 00931 fprintf(stderr, "%s \"", s); 00932 fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr); 00933 fprintf(stderr, "\"\n"); 00934 } else 00935 fprintf(stderr, "%s\n", s); 00936 } 00937 00938 #endif 00939 00940 int htmllex() 00941 { 00942 #ifdef HAVE_EXPAT 00943 static char *begin_html = "<HTML>"; 00944 static char *end_html = "</HTML>"; 00945 00946 char *s; 00947 char *endp = 0; 00948 int len, llen; 00949 int rv; 00950 00951 state.tok = 0; 00952 do { 00953 if (state.mode == 2) 00954 return EOF; 00955 if (state.mode == 0) { 00956 state.mode = 1; 00957 s = begin_html; 00958 len = strlen(s); 00959 endp = 0; 00960 } else { 00961 s = state.ptr; 00962 if (*s == '\0') { 00963 state.mode = 2; 00964 s = end_html; 00965 len = strlen(s); 00966 } else { 00967 endp = findNext(s,&state.lb); 00968 len = endp - s; 00969 } 00970 } 00971 state.prevtok = state.currtok; 00972 state.prevtoklen = state.currtoklen; 00973 state.currtok = s; 00974 state.currtoklen = len; 00975 if ((llen = agxblen(&state.lb))) 00976 rv = XML_Parse(state.parser, agxbuse(&state.lb),llen, 0); 00977 else 00978 rv = XML_Parse(state.parser, s, len, (len ? 0 : 1)); 00979 if (rv == XML_STATUS_ERROR) { 00980 if (!state.error) { 00981 agerr(AGERR, "%s in line %d \n", 00982 XML_ErrorString(XML_GetErrorCode(state.parser)), 00983 htmllineno()); 00984 error_context(); 00985 state.error = 1; 00986 state.tok = T_error; 00987 } 00988 } 00989 if (endp) 00990 state.ptr = endp; 00991 } while (state.tok == 0); 00992 /* printTok (state.tok); */ 00993 return state.tok; 00994 #else 00995 return EOF; 00996 #endif 00997 } 00998
1.7.5