|
Graphviz
2.31.20130521.0446
|
00001 /* $Id$ $Revision$ */ 00002 /* vim:set shiftwidth=4 ts=8: */ 00003 00004 /************************************************************************* 00005 * Copyright (c) 2011 AT&T Intellectual Property 00006 * All rights reserved. This program and the accompanying materials 00007 * are made available under the terms of the Eclipse Public License v1.0 00008 * which accompanies this distribution, and is available at 00009 * http://www.eclipse.org/legal/epl-v10.html 00010 * 00011 * Contributors: See CVS logs. Details at http://www.graphviz.org/ 00012 *************************************************************************/ 00013 00014 00015 #include "render.h" 00016 #include "htmltable.h" 00017 #include "htmlparse.h" 00018 #include "htmllex.h" 00019 #include <ctype.h> 00020 00021 #ifdef HAVE_EXPAT 00022 #include <expat.h> 00023 #endif 00024 00025 #ifndef XML_STATUS_ERROR 00026 #define XML_STATUS_ERROR 0 00027 #endif 00028 00029 typedef struct { 00030 #ifdef HAVE_EXPAT 00031 XML_Parser parser; 00032 #endif 00033 char* ptr; /* input source */ 00034 int tok; /* token type */ 00035 agxbuf* xb; /* buffer to gather T_string data */ 00036 agxbuf lb; /* buffer for translating lexical data */ 00037 char warn; /* set if warning given */ 00038 char error; /* set if error given */ 00039 char inCell; /* set if in TD to allow T_string */ 00040 char mode; /* for handling artificial <HTML>..</HTML> */ 00041 char *currtok; /* for error reporting */ 00042 char *prevtok; /* for error reporting */ 00043 int currtoklen; 00044 int prevtoklen; 00045 } lexstate_t; 00046 static lexstate_t state; 00047 00048 /* error_context: 00049 * Print the last 2 "token"s seen. 00050 */ 00051 static void error_context(void) 00052 { 00053 agxbclear(state.xb); 00054 if (state.prevtoklen > 0) 00055 agxbput_n(state.xb, state.prevtok, state.prevtoklen); 00056 agxbput_n(state.xb, state.currtok, state.currtoklen); 00057 agerr(AGPREV, "... %s ...\n", agxbuse(state.xb)); 00058 } 00059 00060 /* htmlerror: 00061 * yyerror - called by yacc output 00062 */ 00063 void htmlerror(const char *msg) 00064 { 00065 if (state.error) 00066 return; 00067 state.error = 1; 00068 agerr(AGERR, "%s in line %d \n", msg, htmllineno()); 00069 error_context(); 00070 } 00071 00072 #ifdef HAVE_EXPAT 00073 /* lexerror: 00074 * called by lexer when unknown <..> is found. 00075 */ 00076 static void lexerror(const char *name) 00077 { 00078 state.tok = T_error; 00079 state.error = 1; 00080 agerr(AGERR, "Unknown HTML element <%s> on line %d \n", 00081 name, htmllineno()); 00082 } 00083 00084 typedef int (*attrFn) (void *, char *); 00085 typedef int (*bcmpfn) (const void *, const void *); 00086 00087 #define MAX_CHAR (((unsigned char)(~0)) >> 1) 00088 #define MIN_CHAR ((signed char)(~MAX_CHAR)) 00089 #define MAX_UCHAR ((unsigned char)(~0)) 00090 #define MAX_USHORT ((unsigned short)(~0)) 00091 00092 /* Mechanism for automatically processing attributes */ 00093 typedef struct { 00094 char *name; /* attribute name */ 00095 attrFn action; /* action to perform if name matches */ 00096 } attr_item; 00097 00098 #define ISIZE (sizeof(attr_item)) 00099 00100 /* icmp: 00101 * Compare two attr_item. Used in bsearch 00102 */ 00103 static int icmp(attr_item * i, attr_item * j) 00104 { 00105 return strcasecmp(i->name, j->name); 00106 } 00107 00108 static int bgcolorfn(htmldata_t * p, char *v) 00109 { 00110 p->bgcolor = strdup(v); 00111 return 0; 00112 } 00113 00114 static int pencolorfn(htmldata_t * p, char *v) 00115 { 00116 p->pencolor = strdup(v); 00117 return 0; 00118 } 00119 00120 static int hreffn(htmldata_t * p, char *v) 00121 { 00122 p->href = strdup(v); 00123 return 0; 00124 } 00125 00126 static int titlefn(htmldata_t * p, char *v) 00127 { 00128 p->title = strdup(v); 00129 return 0; 00130 } 00131 00132 static int portfn(htmldata_t * p, char *v) 00133 { 00134 p->port = strdup(v); 00135 return 0; 00136 } 00137 00138 #define DELIM " ," 00139 00140 static int stylefn(htmldata_t * p, char *v) 00141 { 00142 int rv = 0; 00143 char c; 00144 char* tk; 00145 char* buf = strdup (v); 00146 for (tk = strtok (buf, DELIM); tk; tk = strtok (NULL, DELIM)) { 00147 c = toupper(*tk); 00148 if (c == 'R') { 00149 if (!strcasecmp(tk + 1, "OUNDED")) p->style |= ROUNDED; 00150 else if (!strcasecmp(tk + 1, "ADIAL")) p->style |= RADIAL; 00151 else { 00152 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk); 00153 rv = 1; 00154 } 00155 } 00156 else if(!strcasecmp(tk,"SOLID")) p->style &= ~(DOTTED|DASHED); 00157 else if(!strcasecmp(tk,"INVISIBLE") || !strcasecmp(tk,"INVIS")) p->style |= INVISIBLE; 00158 else if(!strcasecmp(tk,"DOTTED")) p->style |= DOTTED; 00159 else if(!strcasecmp(tk,"DASHED")) p->style |= DASHED; 00160 else { 00161 agerr(AGWARN, "Illegal value %s for STYLE - ignored\n", tk); 00162 rv = 1; 00163 } 00164 } 00165 free (buf); 00166 return rv; 00167 } 00168 00169 static int targetfn(htmldata_t * p, char *v) 00170 { 00171 p->target = strdup(v); 00172 return 0; 00173 } 00174 00175 static int idfn(htmldata_t * p, char *v) 00176 { 00177 p->id = strdup(v); 00178 return 0; 00179 } 00180 00181 00182 /* doInt: 00183 * Scan v for integral value. Check that 00184 * the value is >= min and <= max. Return value in ul. 00185 * String s is name of value. 00186 * Return 0 if okay; 1 otherwise. 00187 */ 00188 static int doInt(char *v, char *s, int min, int max, long *ul) 00189 { 00190 int rv = 0; 00191 char *ep; 00192 long b = strtol(v, &ep, 10); 00193 00194 if (ep == v) { 00195 agerr(AGWARN, "Improper %s value %s - ignored", s, v); 00196 rv = 1; 00197 } else if (b > max) { 00198 agerr(AGWARN, "%s value %s > %d - too large - ignored", s, v, max); 00199 rv = 1; 00200 } else if (b < min) { 00201 agerr(AGWARN, "%s value %s < %d - too small - ignored", s, v, min); 00202 rv = 1; 00203 } else 00204 *ul = b; 00205 return rv; 00206 } 00207 00208 00209 static int gradientanglefn(htmldata_t * p, char *v) 00210 { 00211 long u; 00212 00213 if (doInt(v, "GRADIENTANGLE", 0, 360, &u)) 00214 return 1; 00215 p->gradientangle = (unsigned short) u; 00216 return 0; 00217 } 00218 00219 00220 static int borderfn(htmldata_t * p, char *v) 00221 { 00222 long u; 00223 00224 if (doInt(v, "BORDER", 0, MAX_UCHAR, &u)) 00225 return 1; 00226 p->border = (unsigned char) u; 00227 p->flags |= BORDER_SET; 00228 return 0; 00229 } 00230 00231 static int cellpaddingfn(htmldata_t * p, char *v) 00232 { 00233 long u; 00234 00235 if (doInt(v, "CELLPADDING", 0, MAX_UCHAR, &u)) 00236 return 1; 00237 p->pad = (unsigned char) u; 00238 p->flags |= PAD_SET; 00239 return 0; 00240 } 00241 00242 static int cellspacingfn(htmldata_t * p, char *v) 00243 { 00244 long u; 00245 00246 if (doInt(v, "CELLSPACING", MIN_CHAR, MAX_CHAR, &u)) 00247 return 1; 00248 p->space = (signed char) u; 00249 p->flags |= SPACE_SET; 00250 return 0; 00251 } 00252 00253 static int cellborderfn(htmltbl_t * p, char *v) 00254 { 00255 long u; 00256 00257 if (doInt(v, "CELLSBORDER", 0, MAX_CHAR, &u)) 00258 return 1; 00259 p->cb = (unsigned char) u; 00260 return 0; 00261 } 00262 00263 static int columnsfn(htmltbl_t * p, char *v) 00264 { 00265 if (*v != '*') { 00266 agerr(AGWARN, "Unknown value %s for COLUMNS - ignored\n", v); 00267 return 1; 00268 } 00269 p->flags |= HTML_VRULE; 00270 return 0; 00271 } 00272 00273 static int rowsfn(htmltbl_t * p, char *v) 00274 { 00275 if (*v != '*') { 00276 agerr(AGWARN, "Unknown value %s for ROWS - ignored\n", v); 00277 return 1; 00278 } 00279 p->flags |= HTML_HRULE; 00280 return 0; 00281 } 00282 00283 static int fixedsizefn(htmldata_t * p, char *v) 00284 { 00285 int rv = 0; 00286 char c = toupper(*(unsigned char *) v); 00287 if ((c == 'T') && !strcasecmp(v + 1, "RUE")) 00288 p->flags |= FIXED_FLAG; 00289 else if ((c != 'F') || strcasecmp(v + 1, "ALSE")) { 00290 agerr(AGWARN, "Illegal value %s for FIXEDSIZE - ignored\n", v); 00291 rv = 1; 00292 } 00293 return rv; 00294 } 00295 00296 static int valignfn(htmldata_t * p, char *v) 00297 { 00298 int rv = 0; 00299 char c = toupper(*v); 00300 if ((c == 'B') && !strcasecmp(v + 1, "OTTOM")) 00301 p->flags |= VALIGN_BOTTOM; 00302 else if ((c == 'T') && !strcasecmp(v + 1, "OP")) 00303 p->flags |= VALIGN_TOP; 00304 else if ((c != 'M') || strcasecmp(v + 1, "IDDLE")) { 00305 agerr(AGWARN, "Illegal value %s for VALIGN - ignored\n", v); 00306 rv = 1; 00307 } 00308 return rv; 00309 } 00310 00311 static int halignfn(htmldata_t * p, char *v) 00312 { 00313 int rv = 0; 00314 char c = toupper(*v); 00315 if ((c == 'L') && !strcasecmp(v + 1, "EFT")) 00316 p->flags |= HALIGN_LEFT; 00317 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00318 p->flags |= HALIGN_RIGHT; 00319 else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) { 00320 agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v); 00321 rv = 1; 00322 } 00323 return rv; 00324 } 00325 00326 static int cell_halignfn(htmldata_t * p, char *v) 00327 { 00328 int rv = 0; 00329 char c = toupper(*v); 00330 if ((c == 'L') && !strcasecmp(v + 1, "EFT")) 00331 p->flags |= HALIGN_LEFT; 00332 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00333 p->flags |= HALIGN_RIGHT; 00334 else if ((c == 'T') && !strcasecmp(v + 1, "EXT")) 00335 p->flags |= HALIGN_TEXT; 00336 else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) 00337 rv = 1; 00338 if (rv) 00339 agerr(AGWARN, "Illegal value %s for ALIGN in TD - ignored\n", v); 00340 return rv; 00341 } 00342 00343 static int balignfn(htmldata_t * p, char *v) 00344 { 00345 int rv = 0; 00346 char c = toupper(*v); 00347 if ((c == 'L') && !strcasecmp(v + 1, "EFT")) 00348 p->flags |= BALIGN_LEFT; 00349 else if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00350 p->flags |= BALIGN_RIGHT; 00351 else if ((c != 'C') || strcasecmp(v + 1, "ENTER")) 00352 rv = 1; 00353 if (rv) 00354 agerr(AGWARN, "Illegal value %s for BALIGN in TD - ignored\n", v); 00355 return rv; 00356 } 00357 00358 static int heightfn(htmldata_t * p, char *v) 00359 { 00360 long u; 00361 00362 if (doInt(v, "HEIGHT", 0, MAX_USHORT, &u)) 00363 return 1; 00364 p->height = (unsigned short) u; 00365 return 0; 00366 } 00367 00368 static int widthfn(htmldata_t * p, char *v) 00369 { 00370 long u; 00371 00372 if (doInt(v, "WIDTH", 0, MAX_USHORT, &u)) 00373 return 1; 00374 p->width = (unsigned short) u; 00375 return 0; 00376 } 00377 00378 static int rowspanfn(htmlcell_t * p, char *v) 00379 { 00380 long u; 00381 00382 if (doInt(v, "ROWSPAN", 0, MAX_USHORT, &u)) 00383 return 1; 00384 if (u == 0) { 00385 agerr(AGWARN, "ROWSPAN value cannot be 0 - ignored\n"); 00386 return 1; 00387 } 00388 p->rspan = (unsigned short) u; 00389 return 0; 00390 } 00391 00392 static int colspanfn(htmlcell_t * p, char *v) 00393 { 00394 long u; 00395 00396 if (doInt(v, "COLSPAN", 0, MAX_USHORT, &u)) 00397 return 1; 00398 if (u == 0) { 00399 agerr(AGWARN, "COLSPAN value cannot be 0 - ignored\n"); 00400 return 1; 00401 } 00402 p->cspan = (unsigned short) u; 00403 return 0; 00404 } 00405 00406 static int fontcolorfn(htmlfont_t * p, char *v) 00407 { 00408 p->color = strdup(v); 00409 return 0; 00410 } 00411 00412 static int facefn(htmlfont_t * p, char *v) 00413 { 00414 p->name = strdup(v); 00415 return 0; 00416 } 00417 00418 static int ptsizefn(htmlfont_t * p, char *v) 00419 { 00420 long u; 00421 00422 if (doInt(v, "POINT-SIZE", 0, MAX_UCHAR, &u)) 00423 return 1; 00424 p->size = (double) u; 00425 return 0; 00426 } 00427 00428 static int srcfn(htmlimg_t * p, char *v) 00429 { 00430 p->src = strdup(v); 00431 return 0; 00432 } 00433 00434 static int scalefn(htmlimg_t * p, char *v) 00435 { 00436 p->scale = strdup(v); 00437 return 0; 00438 } 00439 00440 static int alignfn(int *p, char *v) 00441 { 00442 int rv = 0; 00443 char c = toupper(*v); 00444 if ((c == 'R') && !strcasecmp(v + 1, "IGHT")) 00445 *p = 'r'; 00446 else if ((c == 'L') || !strcasecmp(v + 1, "EFT")) 00447 *p = 'l'; 00448 else if ((c == 'C') || strcasecmp(v + 1, "ENTER")) 00449 *p = 'n'; 00450 else { 00451 agerr(AGWARN, "Illegal value %s for ALIGN - ignored\n", v); 00452 rv = 1; 00453 } 00454 return rv; 00455 } 00456 00457 /* Tables used in binary search; MUST be alphabetized */ 00458 static attr_item tbl_items[] = { 00459 {"align", (attrFn) halignfn}, 00460 {"bgcolor", (attrFn) bgcolorfn}, 00461 {"border", (attrFn) borderfn}, 00462 {"cellborder", (attrFn) cellborderfn}, 00463 {"cellpadding", (attrFn) cellpaddingfn}, 00464 {"cellspacing", (attrFn) cellspacingfn}, 00465 {"color", (attrFn) pencolorfn}, 00466 {"columns", (attrFn) columnsfn}, 00467 {"fixedsize", (attrFn) fixedsizefn}, 00468 {"gradientangle", (attrFn) gradientanglefn}, 00469 {"height", (attrFn) heightfn}, 00470 {"href", (attrFn) hreffn}, 00471 {"id", (attrFn) idfn}, 00472 {"port", (attrFn) portfn}, 00473 {"rows", (attrFn) rowsfn}, 00474 {"style", (attrFn) stylefn}, 00475 {"target", (attrFn) targetfn}, 00476 {"title", (attrFn) titlefn}, 00477 {"tooltip", (attrFn) titlefn}, 00478 {"valign", (attrFn) valignfn}, 00479 {"width", (attrFn) widthfn}, 00480 }; 00481 00482 static attr_item cell_items[] = { 00483 {"align", (attrFn) cell_halignfn}, 00484 {"balign", (attrFn) balignfn}, 00485 {"bgcolor", (attrFn) bgcolorfn}, 00486 {"border", (attrFn) borderfn}, 00487 {"cellpadding", (attrFn) cellpaddingfn}, 00488 {"cellspacing", (attrFn) cellspacingfn}, 00489 {"color", (attrFn) pencolorfn}, 00490 {"colspan", (attrFn) colspanfn}, 00491 {"fixedsize", (attrFn) fixedsizefn}, 00492 {"gradientangle", (attrFn) gradientanglefn}, 00493 {"height", (attrFn) heightfn}, 00494 {"href", (attrFn) hreffn}, 00495 {"id", (attrFn) idfn}, 00496 {"port", (attrFn) portfn}, 00497 {"rowspan", (attrFn) rowspanfn}, 00498 {"style", (attrFn) stylefn}, 00499 {"target", (attrFn) targetfn}, 00500 {"title", (attrFn) titlefn}, 00501 {"tooltip", (attrFn) titlefn}, 00502 {"valign", (attrFn) valignfn}, 00503 {"width", (attrFn) widthfn}, 00504 }; 00505 00506 static attr_item font_items[] = { 00507 {"color", (attrFn) fontcolorfn}, 00508 {"face", (attrFn) facefn}, 00509 {"point-size", (attrFn) ptsizefn}, 00510 }; 00511 00512 static attr_item img_items[] = { 00513 {"scale", (attrFn) scalefn}, 00514 {"src", (attrFn) srcfn}, 00515 }; 00516 00517 static attr_item br_items[] = { 00518 {"align", (attrFn) alignfn}, 00519 }; 00520 00521 /* doAttrs: 00522 * General function for processing list of name/value attributes. 00523 * Do binary search on items table. If match found, invoke action 00524 * passing it tp and attribute value. 00525 * Table size is given by nel 00526 * Name/value pairs are in array atts, which is null terminated. 00527 * s is the name of the HTML element being processed. 00528 */ 00529 static void 00530 doAttrs(void *tp, attr_item * items, int nel, char **atts, char *s) 00531 { 00532 char *name; 00533 char *val; 00534 attr_item *ip; 00535 attr_item key; 00536 00537 while ((name = *atts++) != NULL) { 00538 val = *atts++; 00539 key.name = name; 00540 ip = (attr_item *) bsearch(&key, items, nel, ISIZE, (bcmpfn) icmp); 00541 if (ip) 00542 state.warn |= ip->action(tp, val); 00543 else { 00544 agerr(AGWARN, "Illegal attribute %s in %s - ignored\n", name, 00545 s); 00546 state.warn = 1; 00547 } 00548 } 00549 } 00550 00551 static void mkBR(char **atts) 00552 { 00553 htmllval.i = UNSET_ALIGN; 00554 doAttrs(&htmllval.i, br_items, sizeof(br_items) / ISIZE, atts, "<BR>"); 00555 } 00556 00557 static htmlimg_t *mkImg(char **atts) 00558 { 00559 htmlimg_t *img = NEW(htmlimg_t); 00560 00561 doAttrs(img, img_items, sizeof(img_items) / ISIZE, atts, "<IMG>"); 00562 00563 return img; 00564 } 00565 00566 static htmlfont_t *mkFont(char **atts, int flags, int ul) 00567 { 00568 htmlfont_t *font = NEW(htmlfont_t); 00569 00570 font->size = -1.0; /* unassigned */ 00571 font->flags = flags; 00572 if (atts) 00573 doAttrs(font, font_items, sizeof(font_items) / ISIZE, atts, "<FONT>"); 00574 00575 return font; 00576 } 00577 00578 static htmlcell_t *mkCell(char **atts) 00579 { 00580 htmlcell_t *cell = NEW(htmlcell_t); 00581 00582 cell->cspan = 1; 00583 cell->rspan = 1; 00584 doAttrs(cell, cell_items, sizeof(cell_items) / ISIZE, atts, "<TD>"); 00585 00586 return cell; 00587 } 00588 00589 static htmltbl_t *mkTbl(char **atts) 00590 { 00591 htmltbl_t *tbl = NEW(htmltbl_t); 00592 00593 tbl->rc = -1; /* flag that table is a raw, parsed table */ 00594 tbl->cb = -1; /* unset cell border attribute */ 00595 doAttrs(tbl, tbl_items, sizeof(tbl_items) / ISIZE, atts, "<TABLE>"); 00596 00597 return tbl; 00598 } 00599 00600 static void startElement(void *user, const char *name, char **atts) 00601 { 00602 if (strcasecmp(name, "TABLE") == 0) { 00603 htmllval.tbl = mkTbl(atts); 00604 state.inCell = 0; 00605 state.tok = T_table; 00606 } else if ((strcasecmp(name, "TR") == 0) 00607 || (strcasecmp(name, "TH") == 0)) { 00608 state.inCell = 0; 00609 state.tok = T_row; 00610 } else if (strcasecmp(name, "TD") == 0) { 00611 state.inCell = 1; 00612 htmllval.cell = mkCell(atts); 00613 state.tok = T_cell; 00614 } else if (strcasecmp(name, "FONT") == 0) { 00615 htmllval.font = mkFont(atts, 0, 0); 00616 state.tok = T_font; 00617 } else if (strcasecmp(name, "B") == 0) { 00618 htmllval.font = mkFont(0, HTML_BF, 0); 00619 state.tok = T_bold; 00620 } else if (strcasecmp(name, "U") == 0) { 00621 htmllval.font = mkFont(0, HTML_UL, 1); 00622 state.tok = T_underline; 00623 } else if (strcasecmp(name, "I") == 0) { 00624 htmllval.font = mkFont(0, HTML_IF, 0); 00625 state.tok = T_italic; 00626 } else if (strcasecmp(name, "SUP") == 0) { 00627 htmllval.font = mkFont(0, HTML_SUP, 0); 00628 state.tok = T_sup; 00629 } else if (strcasecmp(name, "SUB") == 0) { 00630 htmllval.font = mkFont(0, HTML_SUB, 0); 00631 state.tok = T_sub; 00632 } else if (strcasecmp(name, "BR") == 0) { 00633 mkBR(atts); 00634 state.tok = T_br; 00635 } else if (strcasecmp(name, "HR") == 0) { 00636 state.tok = T_hr; 00637 } else if (strcasecmp(name, "VR") == 0) { 00638 state.tok = T_vr; 00639 } else if (strcasecmp(name, "IMG") == 0) { 00640 htmllval.img = mkImg(atts); 00641 state.tok = T_img; 00642 } else if (strcasecmp(name, "HTML") == 0) { 00643 state.tok = T_html; 00644 } else { 00645 lexerror(name); 00646 } 00647 } 00648 00649 static void endElement(void *user, const char *name) 00650 { 00651 if (strcasecmp(name, "TABLE") == 0) { 00652 state.tok = T_end_table; 00653 state.inCell = 1; 00654 } else if ((strcasecmp(name, "TR") == 0) 00655 || (strcasecmp(name, "TH") == 0)) { 00656 state.tok = T_end_row; 00657 } else if (strcasecmp(name, "TD") == 0) { 00658 state.tok = T_end_cell; 00659 state.inCell = 0; 00660 } else if (strcasecmp(name, "HTML") == 0) { 00661 state.tok = T_end_html; 00662 } else if (strcasecmp(name, "FONT") == 0) { 00663 state.tok = T_end_font; 00664 } else if (strcasecmp(name, "B") == 0) { 00665 state.tok = T_n_bold; 00666 } else if (strcasecmp(name, "U") == 0) { 00667 state.tok = T_n_underline; 00668 } else if (strcasecmp(name, "I") == 0) { 00669 state.tok = T_n_italic; 00670 } else if (strcasecmp(name, "SUP") == 0) { 00671 state.tok = T_n_sup; 00672 } else if (strcasecmp(name, "SUB") == 0) { 00673 state.tok = T_n_sub; 00674 } else if (strcasecmp(name, "BR") == 0) { 00675 if (state.tok == T_br) 00676 state.tok = T_BR; 00677 else 00678 state.tok = T_end_br; 00679 } else if (strcasecmp(name, "HR") == 0) { 00680 if (state.tok == T_hr) 00681 state.tok = T_HR; 00682 else 00683 state.tok = T_end_hr; 00684 } else if (strcasecmp(name, "VR") == 0) { 00685 if (state.tok == T_vr) 00686 state.tok = T_VR; 00687 else 00688 state.tok = T_end_vr; 00689 } else if (strcasecmp(name, "IMG") == 0) { 00690 if (state.tok == T_img) 00691 state.tok = T_IMG; 00692 else 00693 state.tok = T_end_img; 00694 } else { 00695 lexerror(name); 00696 } 00697 } 00698 00699 /* characterData: 00700 * Generate T_string token. Do this only when immediately in 00701 * <TD>..</TD> or <HTML>..</HTML>, i.e., when inCell is true. 00702 * Strip out formatting characters but keep spaces. 00703 * Distinguish between all whitespace vs. strings with non-whitespace 00704 * characters. 00705 */ 00706 static void characterData(void *user, const char *s, int length) 00707 { 00708 int i, rc, cnt = 0; 00709 unsigned char c; 00710 00711 if (state.inCell) { 00712 for (i = length; i; i--) { 00713 c = *s++; 00714 if (c >= ' ') { 00715 cnt++; 00716 rc = agxbputc(state.xb, c); 00717 } 00718 } 00719 if (cnt) state.tok = T_string; 00720 } 00721 } 00722 #endif 00723 00724 int initHTMLlexer(char *src, agxbuf * xb, int charset) 00725 { 00726 #ifdef HAVE_EXPAT 00727 state.xb = xb; 00728 agxbinit (&state.lb, SMALLBUF, NULL); 00729 state.ptr = src; 00730 state.mode = 0; 00731 state.warn = 0; 00732 state.error = 0; 00733 state.currtoklen = 0; 00734 state.prevtoklen = 0; 00735 state.inCell = 1; 00736 state.parser = XML_ParserCreate(charsetToStr(charset)); 00737 XML_SetElementHandler(state.parser, 00738 (XML_StartElementHandler) startElement, 00739 endElement); 00740 XML_SetCharacterDataHandler(state.parser, characterData); 00741 return 0; 00742 #else 00743 static int first; 00744 if (!first) { 00745 agerr(AGWARN, 00746 "Not built with libexpat. Table formatting is not available.\n"); 00747 first++; 00748 } 00749 return 1; 00750 #endif 00751 } 00752 00753 int clearHTMLlexer() 00754 { 00755 #ifdef HAVE_EXPAT 00756 int rv = state.warn | state.error; 00757 XML_ParserFree(state.parser); 00758 agxbfree (&state.lb); 00759 return rv; 00760 #else 00761 return 1; 00762 #endif 00763 } 00764 00765 #ifdef HAVE_EXPAT 00766 /* eatComment: 00767 * Given first character after open comment, eat characters 00768 * upto comment close, returning pointer to closing > if it exists, 00769 * or null character otherwise. 00770 * We rely on HTML strings having matched nested <>. 00771 */ 00772 static char *eatComment(char *p) 00773 { 00774 int depth = 1; 00775 char *s = p; 00776 char c; 00777 00778 while (depth && (c = *s++)) { 00779 if (c == '<') 00780 depth++; 00781 else if (c == '>') 00782 depth--; 00783 } 00784 s--; /* move back to '\0' or '>' */ 00785 if (*s) { 00786 char *t = s - 2; 00787 if ((t < p) || strncmp(t, "--", 2)) { 00788 agerr(AGWARN, "Unclosed comment\n"); 00789 state.warn = 1; 00790 } 00791 } 00792 return s; 00793 } 00794 00795 /* findNext: 00796 * Return next XML unit. This is either <..>, an HTML 00797 * comment <!-- ... -->, or characters up to next <. 00798 */ 00799 static char *findNext(char *s, agxbuf* xb) 00800 { 00801 char* t = s + 1; 00802 char c; 00803 int rc; 00804 00805 if (*s == '<') { 00806 if ((*t == '!') && !strncmp(t + 1, "--", 2)) 00807 t = eatComment(t + 3); 00808 else 00809 while (*t && (*t != '>')) 00810 t++; 00811 if (*t != '>') { 00812 agerr(AGWARN, "Label closed before end of HTML element\n"); 00813 state.warn = 1; 00814 } else 00815 t++; 00816 } else { 00817 t = s; 00818 while ((c = *t) && (c != '<')) { 00819 if ((c == '&') && (*(t+1) != '#')) { 00820 t = scanEntity(t + 1, xb); 00821 } 00822 else { 00823 rc = agxbputc(xb, c); 00824 t++; 00825 } 00826 } 00827 } 00828 return t; 00829 } 00830 #endif 00831 00832 int htmllineno() 00833 { 00834 #ifdef HAVE_EXPAT 00835 return XML_GetCurrentLineNumber(state.parser); 00836 #else 00837 return 0; 00838 #endif 00839 } 00840 00841 #ifdef DEBUG 00842 static void printTok(int tok) 00843 { 00844 char *s; 00845 00846 switch (tok) { 00847 case T_VR: 00848 s = "T_VR"; 00849 break; 00850 case T_vr: 00851 s = "T_vr"; 00852 break; 00853 case T_end_vr: 00854 s = "T_end_vr"; 00855 break; 00856 case T_HR: 00857 s = "T_HR"; 00858 break; 00859 case T_hr: 00860 s = "T_hr"; 00861 break; 00862 case T_end_hr: 00863 s = "T_end_hr"; 00864 break; 00865 case T_BR: 00866 s = "T_BR"; 00867 break; 00868 case T_br: 00869 s = "T_br"; 00870 break; 00871 case T_end_br: 00872 s = "T_end_br"; 00873 break; 00874 case T_end_table: 00875 s = "T_end_table"; 00876 break; 00877 case T_row: 00878 s = "T_row"; 00879 break; 00880 case T_end_row: 00881 s = "T_end_row"; 00882 break; 00883 case T_end_cell: 00884 s = "T_end_cell"; 00885 break; 00886 case T_html: 00887 s = "T_html"; 00888 break; 00889 case T_end_html: 00890 s = "T_end_html"; 00891 break; 00892 case T_string: 00893 s = "T_string"; 00894 break; 00895 case T_error: 00896 s = "T_error"; 00897 break; 00898 case T_table: 00899 s = "T_table"; 00900 break; 00901 case T_cell: 00902 s = "T_cell"; 00903 break; 00904 case T_img: 00905 s = "T_img"; 00906 break; 00907 case T_end_img: 00908 s = "T_end_img"; 00909 break; 00910 case T_IMG: 00911 s = "T_IMG"; 00912 break; 00913 case T_underline: 00914 s = "T_underline"; 00915 break; 00916 case T_n_underline: 00917 s = "T_underline"; 00918 break; 00919 case T_italic: 00920 s = "T_italic"; 00921 break; 00922 case T_n_italic: 00923 s = "T_italic"; 00924 break; 00925 case T_bold: 00926 s = "T_bold"; 00927 break; 00928 case T_n_bold: 00929 s = "T_bold"; 00930 break; 00931 default: 00932 s = "<unknown>"; 00933 } 00934 if (tok == T_string) { 00935 fprintf(stderr, "%s \"", s); 00936 fwrite(agxbstart(state.xb), 1, agxblen(state.xb), stderr); 00937 fprintf(stderr, "\"\n"); 00938 } else 00939 fprintf(stderr, "%s\n", s); 00940 } 00941 00942 #endif 00943 00944 int htmllex() 00945 { 00946 #ifdef HAVE_EXPAT 00947 static char *begin_html = "<HTML>"; 00948 static char *end_html = "</HTML>"; 00949 00950 char *s; 00951 char *endp = 0; 00952 int len, llen; 00953 int rv; 00954 00955 state.tok = 0; 00956 do { 00957 if (state.mode == 2) 00958 return EOF; 00959 if (state.mode == 0) { 00960 state.mode = 1; 00961 s = begin_html; 00962 len = strlen(s); 00963 endp = 0; 00964 } else { 00965 s = state.ptr; 00966 if (*s == '\0') { 00967 state.mode = 2; 00968 s = end_html; 00969 len = strlen(s); 00970 } else { 00971 endp = findNext(s,&state.lb); 00972 len = endp - s; 00973 } 00974 } 00975 state.prevtok = state.currtok; 00976 state.prevtoklen = state.currtoklen; 00977 state.currtok = s; 00978 state.currtoklen = len; 00979 if ((llen = agxblen(&state.lb))) 00980 rv = XML_Parse(state.parser, agxbuse(&state.lb),llen, 0); 00981 else 00982 rv = XML_Parse(state.parser, s, len, (len ? 0 : 1)); 00983 if (rv == XML_STATUS_ERROR) { 00984 if (!state.error) { 00985 agerr(AGERR, "%s in line %d \n", 00986 XML_ErrorString(XML_GetErrorCode(state.parser)), 00987 htmllineno()); 00988 error_context(); 00989 state.error = 1; 00990 state.tok = T_error; 00991 } 00992 } 00993 if (endp) 00994 state.ptr = endp; 00995 } while (state.tok == 0); 00996 /* printTok (state.tok); */ 00997 return state.tok; 00998 #else 00999 return EOF; 01000 #endif 01001 } 01002
1.7.5