Graphviz  2.29.20120524.0446
lib/graph/lexer.c
Go to the documentation of this file.
00001 /* $Id$ $Revision$ */
00002 /* vim:set shiftwidth=4 ts=8: */
00003 
00004 /*************************************************************************
00005  * Copyright (c) 2011 AT&T Intellectual Property 
00006  * All rights reserved. This program and the accompanying materials
00007  * are made available under the terms of the Eclipse Public License v1.0
00008  * which accompanies this distribution, and is available at
00009  * http://www.eclipse.org/legal/epl-v10.html
00010  *
00011  * Contributors: See CVS logs. Details at http://www.graphviz.org/
00012  *************************************************************************/
00013 
00014 
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
#include "libgraph.h"
#include "parser.h"
#include "triefa.cP"
#include "agxbuf.h"
00021 
00022 #ifdef DMALLOC
00023 #include "dmalloc.h"
00024 #endif
00025 
/* Name of the input file for use in diagnostics; a placeholder is
 * substituted when no name has been recorded.
 */
#define InfileName (InputFile?InputFile:"<unknown>")

static FILE *Lexer_fp;          /* stream most recently passed to aglexinit */
static char *LexPtr;            /* current scan position within the line buffer */
static char *TokenBuf;          /* scratch buffer that receives the current token */
static int LineBufSize;         /* allocated size of the line and token buffers */
static unsigned char In_comment;        /* nonzero while inside a block comment */
/* The next two hold copies of Line_number, so they must be as wide as
 * Line_number itself; a narrower type would report the wrong line in
 * diagnostics once the input passes line 255.
 */
static int Comment_start;       /* line on which the open block comment began */
static int Start_html_string;   /* line on which the open HTML string began */
int Line_number;                /* current input line number */
static char *InputFile;         /* current input file name, or NULL */
static int agmaxerr;            /* highest error level seen so far */
00037 
00038 static void
00039 storeFileName (char* fname, int len)
00040 {
00041     static int cnt;
00042     static char* buf;
00043 
00044     if (len > cnt) {
00045         if (cnt) buf = (char*)realloc (buf, len+1);
00046         else buf = (char*)malloc (len+1);
00047         cnt = len;
00048     }
00049     strcpy (buf, fname);
00050     InputFile = buf;
00051 }
00052 
00053   /* Reset line number.
00054    * Argument n is indexed from 1, so we decrement it.
00055    */
00056 void agreadline(int n)
00057 {
00058     Line_number = n - 1;
00059 }
00060 
00061 int aglinenumber ()
00062 {
00063     return Line_number;
00064 }
00065 
00066   /* (Re)set file:
00067    */
00068 void agsetfile(char *f)
00069 {
00070     InputFile = f;
00071     Line_number = 0;
00072 }
00073 
00074 void aglexinit(FILE * fp, gets_f mygets)
00075 {
00076     if (Lexer_fp != fp)
00077         LexPtr = NULL;
00078     Lexer_fp = fp;
00079     if (mygets)
00080         AG.fgets = mygets;
00081     if (AG.fgets == NULL)
00082         AG.fgets = fgets;
00083     if (AG.linebuf == NULL) {
00084         LineBufSize = BUFSIZ;
00085         AG.linebuf = N_NEW(LineBufSize, char);
00086         TokenBuf = N_NEW(LineBufSize, char);
00087     }
00088     AG.fgets (AG.linebuf, 0, fp);       /* reset mygets */
00089     AG.syntax_errors = 0;
00090 }
00091 
/* ISSPACE(c): true when c is a non-NUL whitespace or control character.
 * The argument is parenthesized so that expression arguments expand as
 * intended.  It is still evaluated more than once, so it must not have
 * side effects.
 */
#define ISSPACE(c) (((c) != 0) && (isspace(c) || iscntrl(c)))
00093 
00094 /* skip leading white space and comments in a string p
00095  * whitespace includes control characters
00096  */
00097 static char *skip_wscomments(char *pp)
00098 {
00099     unsigned char *p = (unsigned char *) pp;
00100     do {
00101         while (ISSPACE(*p))
00102             p++;
00103         while (In_comment && p[0]) {
00104             while (p[0] && (p[0] != '*'))
00105                 p++;
00106             if (p[0]) {
00107                 if (p[1] == '/') {
00108                     In_comment = FALSE;
00109                     p += 2;
00110                     break;
00111                 } else
00112                     p++;
00113             }
00114         }
00115         if (p[0] == '/') {
00116             if (p[1] == '/')
00117                 while (*p)
00118                     p++;        /* skip to end of line */
00119             else {
00120                 if (p[1] == '*') {
00121                     In_comment = TRUE;
00122                     Comment_start = Line_number;
00123                     p += 2;
00124                     continue;
00125                 } else
00126                     break;      /* return a slash */
00127             }
00128         } else {
00129             if (!ISSPACE(*p))
00130                 break;
00131         }
00132     } while (p[0]);
00133     return (char *) p;
00134 }
00135 
/* scan_token:
 * Copy the leading run of ISALNUM characters from p into token and
 * NUL-terminate token.
 * Returns the position of the first character that was not copied,
 * or NULL when p itself is a null pointer.
 * The caller must supply a token buffer large enough for the run.
 */
static char *scan_token(unsigned char *p, unsigned char *token)
{
    unsigned char *q = token;

    /* this is a null-pointer test, so it is spelled with NULL rather
     * than a character constant */
    if (p == NULL)
        return NULL;
    while (ISALNUM(*p))
        *q++ = *p++;
    *q = '\0';
    /* explicit cast: the scan works on unsigned char, the result is char * */
    return (char *) p;
}
00150 
00151 static char *scan_num(char *p, char *token)
00152 {
00153     unsigned char *q, *z;
00154     int saw_rp = FALSE;
00155     int saw_digit = FALSE;
00156 
00157     z = (unsigned char *) p;
00158     q = (unsigned char *) token;
00159     if (*z == '-')
00160         *q++ = *z++;
00161     if (*z == '.') {
00162         saw_rp = TRUE;
00163         *q++ = *z++;
00164     }
00165     while (isdigit(*z)) {
00166         saw_digit = TRUE;
00167         *q++ = *z++;
00168     }
00169     if ((*z == '.') && (saw_rp == FALSE)) {
00170         saw_rp = TRUE;
00171         *q++ = *z++;
00172         while (isdigit(*z)) {
00173             saw_digit = TRUE;
00174             *q++ = *z++;
00175         }
00176     }
00177     *q = '\0';
00178     if (saw_digit && *z && ((isalpha(*z)) || (*z == '_'))) {
00179         unsigned char *endp = z + 1;
00180         unsigned char c;
00181         while ((c = *endp) && ((isalpha(c)) || (c == '_')))
00182             endp++;
00183         *endp = '\0';
00184         agerr(AGWARN,
00185               "%s:%d: ambiguous \"%s\" splits into two names: \"%s\" and \"%s\"\n",
00186               InfileName, Line_number, p, token, z);
00187         *endp = c;
00188     }
00189 
00190     if (saw_digit == FALSE)
00191         z = NULL;
00192     return (char *) z;
00193 }
00194 
00195 /* scan a quoted string and return the position after its terminator */
00196 static char *quoted_string(char *p, char *token)
00197 {
00198     char quote, *q;
00199 
00200     quote = *p++;
00201     q = token;
00202     while ((*p) && (*p != quote)) {
00203         if (*p == '\\') {
00204             if (*(p + 1) == quote)
00205                 p++;
00206             else {
00207                 if (*(p + 1) == '\\')
00208                     *q++ = *p++;
00209             }
00210         }
00211         *q++ = *p++;
00212     }
00213     if (*p == '\0')
00214         agerr(AGWARN, "%s:%d: string ran past end of line\n",
00215               InfileName, Line_number);
00216     else
00217         p++;
00218     *q = 0;
00219     return p;
00220 }
00221 
00222 int myaglex(void)
00223 {                               /* for debugging */
00224     int rv = aglex();
00225     fprintf(stderr, "returning %d\n", rv);
00226     if (rv == T_symbol)
00227         fprintf(stderr, "string val is %s\n", aglval.str);
00228     return rv;
00229 }
00230 
00231 /*
00232  * Return a logical line in AG.linebuf.
00233  * In particular, the buffer will contain a '\n' as the last non-null char.
00234  * Ignore lines beginning with '#'; update cpp line number if applicable.
00235  * Fold long lines, i.e., ignore escaped newlines.
00236  * Assume the AG.fgets function reads upto newline or buffer length
00237  * like fgets.
00238  * Need to be careful that AG.fgets might not return full physical line
00239  * because buffer is too small to hold it.
00240  */
00241 static char *lex_gets(void)
00242 {
00243     char *clp;
00244     int len, curlen;
00245 
00246     len = curlen = 0;
00247 
00248     do {
00249         /* make sure there is room for at least another SMALLBUF worth */
00250         if (curlen + SMALLBUF >= LineBufSize) {
00251             LineBufSize += BUFSIZ;
00252             AG.linebuf = (char*)realloc(AG.linebuf, LineBufSize);
00253             TokenBuf = (char*)realloc(TokenBuf, LineBufSize);
00254         }
00255 
00256         /* off by one so we can back up in LineBuf */
00257         clp = AG.fgets (AG.linebuf + curlen + 1,
00258                           LineBufSize - curlen - 1, Lexer_fp);
00259         if (clp == NULL)
00260             break;
00261 
00262 
00263         len = strlen(clp);      /* since clp != NULL, len > 0 */
00264         if (clp[len - 1] == '\n') {     /* have physical line */
00265             if ((clp[0] == '#') && (curlen == 0)) {
00266                 /* comment line or cpp line sync */
00267                 int r, cnt;
00268                 char buf[2];
00269                 char* s = clp + 1;
00270 
00271                 if (strncmp(s, "line", 4) == 0) s += 4;
00272                 r = sscanf(s, "%d %1[\"]%n", &Line_number, buf, &cnt);
00273                 if (r <= 0) Line_number++;
00274                 else { /* got line number */ 
00275                     Line_number--;
00276                     if (r > 1) { /* saw quote */
00277                         char* p = s + cnt;
00278                         char* e = p;
00279                         while (*e && (*e != '"')) e++; 
00280                         if (e != p) {
00281                             *e = '\0';
00282                             storeFileName (p, e-p);
00283                         }
00284                     }
00285                 }
00286                 clp[0] = 0;
00287                 len = 1;    /* this will make the while test below succeed */
00288                 continue;
00289             }
00290             Line_number++;
00291                 /* Note it is possible len == 1 and last character in
00292                  * previous read was '\\'
00293                  * It is also possible to have curlen=0, and read in
00294                  * "\\\n". 
00295                  */
00296             if (clp[len - 2] == '\\') { /* escaped newline */
00297                 len = len - 2;
00298                 clp[len] = '\0';
00299             }
00300         }
00301         curlen += len;
00302         /* the following test relies on having AG.linebuf[0] == '\0' */
00303     } while (clp[len - 1] != '\n');
00304 
00305     if (curlen > 0)
00306         return AG.linebuf + 1;
00307     else
00308         return NULL;
00309 }
00310 
00311 /* html_pair:
00312  * Iteratively scan nested "<...>"
00313  * p points to first character after initial '<'
00314  * Store characters up to but not including matching '>'
00315  * Return pointer to matching '>'
00316  * We do not check for any escape sequences; pure HTML is
00317  * expected, so special characters need to be HTML escapes.
00318  * We read them in and allow the HTML parser to convert them.
00319  */
00320 static char *html_pair(char *p, agxbuf * tokp)
00321 {
00322     unsigned char c;
00323     int rc, depth = 1;
00324 
00325     while (1) {
00326         while ((c = *p)) {
00327             if (c == '>') {
00328                 depth--;
00329                 if (depth == 0)
00330                     return p;   /* p points to closing > */
00331             } else if (c == '<')
00332                 depth++;
00333             rc = agxbputc(tokp, c);
00334             p++;
00335         }
00336         if ((p = lex_gets()) == NULL) {
00337             agerr(AGWARN,
00338                   "non-terminated HTML string starting line %d, file %s\n",
00339                   Start_html_string, InfileName);
00340             return 0;
00341         }
00342     }
00343 }
00344 
00345 /* html_string:
00346  * scan an html string and return the position after its terminator 
00347  * The string is stored in token.
00348  * p points to the opening <.
00349  */
00350 
00351 static char *html_string(char *p, agxbuf * token)
00352 {
00353     Start_html_string = Line_number;
00354     p = html_pair(p + 1, token);
00355     if (p)
00356         p++;                    /* skip closing '>' */
00357     return p;
00358 }
00359 
/* agtoken:
 * Feed the string p, one character at a time, through the TFA_*
 * recognizer and return the value of TFA_Definition() for it.
 * (aglex treats a result of -1 as "not a reserved word".)
 */
int agtoken(char *p)
{
    char ch;

    TFA_Init();
    for (; (ch = *p) != '\0'; p++) {
        /* any non-ascii characters converted to ascii DEL (127) */
        TFA_Advance(ch & ~127 ? 127 : ch);
    }
    return TFA_Definition();
}
00371 
00372 int aglex(void)
00373 {
00374     int token;
00375     char *tbuf, *p;
00376     static unsigned char BOM[] = { 0xEF, 0xBB, 0xBF };  /* UTF-8 byte order marker */
00377 
00378     /* if the parser has accepted a graph, reset and return EOF */
00379     if (AG.accepting_state) {
00380         AG.accepting_state = FALSE;
00381         return EOF;
00382     }
00383 
00384     /* get a nonempty lex buffer */
00385     do {
00386         if ((LexPtr == NULL) || (LexPtr[0] == '\0'))
00387             if ((LexPtr = lex_gets()) == NULL) {
00388                 if (In_comment)
00389                     agerr(AGWARN, "nonterminated comment in line %d\n",
00390                           Comment_start);
00391                 return EOF;
00392             }
00393         /* skip UTF-8 Byte Order Marker if at beginning of file */
00394         if ((Line_number == 1) && !strncmp(LexPtr, (char *) BOM, 3))
00395             LexPtr += 3;
00396         LexPtr = (char *) skip_wscomments(LexPtr);
00397     } while (LexPtr[0] == '\0');
00398 
00399     tbuf = TokenBuf;
00400 
00401     /* scan quoted strings */
00402     if (LexPtr[0] == '\"') {
00403         LexPtr = quoted_string(LexPtr, tbuf);
00404         aglval.str = agstrdup(tbuf);
00405         return T_qsymbol;
00406     }
00407 
00408     /* scan HTML strings */
00409     if (LexPtr[0] == '<') {
00410         agxbuf xb;
00411         unsigned char htmlbuf[BUFSIZ];
00412         agxbinit(&xb, BUFSIZ, htmlbuf);
00413         LexPtr = html_string(LexPtr, &xb);
00414         aglval.str = agstrdup_html(agxbuse(&xb));
00415         agxbfree(&xb);
00416         return T_symbol;
00417     }
00418 
00419     /* scan edge operator */
00420     if (AG.edge_op
00421         && (strncmp(LexPtr, AG.edge_op, strlen(AG.edge_op)) == 0)) {
00422         LexPtr += strlen(AG.edge_op);
00423         return T_edgeop;
00424     }
00425 
00426     /* scan numbers */
00427     if ((p = scan_num(LexPtr, tbuf))) {
00428         LexPtr = p;
00429         aglval.str = agstrdup(tbuf);
00430         return T_symbol;
00431     } else {
00432         unsigned char uc = *(unsigned char *) LexPtr;
00433         if (ispunct(uc) && (uc != '_'))
00434             return *LexPtr++;
00435         else
00436             LexPtr = scan_token(LexPtr, tbuf);
00437     }
00438 
00439     /* scan other tokens */
00440     token = agtoken(tbuf);
00441     if (token == -1) {
00442         aglval.str = agstrdup(tbuf);
00443         token = T_symbol;
00444     }
00445     return token;
00446 }
00447 
00448 static void error_context(void)
00449 {
00450     char *p;
00451     char c;
00452     char *buf = AG.linebuf + 1; /* characters are always put at AG.linebuf[1] */
00453     /* or later; AG.linebuf[0] = '\0' */
00454 
00455     if (LexPtr == NULL)
00456         return;
00457     agerr(AGPREV, "context: ");
00458     for (p = LexPtr - 1; (p > buf) && (!isspace(*(unsigned char *) p));
00459          p--);
00460     if (buf < p) {
00461         c = *p;
00462         *p = '\0';
00463         agerr(AGPREV, buf);
00464         *p = c;
00465     }
00466     agerr(AGPREV, " >>> ");
00467     c = *LexPtr;
00468     *LexPtr = '\0';
00469     agerr(AGPREV, p);
00470     *LexPtr = c;
00471     agerr(AGPREV, " <<< ");
00472     agerr(AGPREV, LexPtr);
00473 }
00474 
00475 void agerror(char *msg)
00476 {
00477     if (AG.syntax_errors++)
00478         return;
00479     agerr(AGERR, "%s:%d: %s near line %d\n",
00480           InfileName, Line_number, msg, Line_number);
00481     error_context();
00482 }
00483 
00484 agerrlevel_t agerrno;           /* Last error */
00485 static agerrlevel_t agerrlevel = AGWARN;        /* Report errors >= agerrlevel */
00486 static long aglast;             /* Last message */
00487 static FILE *agerrout;          /* Message file */
00488 static agusererrf usererrf;     /* User-set error function */
00489 
00490 agusererrf 
00491 agseterrf (agusererrf newf)
00492 {
00493     agusererrf oldf = usererrf;
00494     usererrf = newf;
00495     return oldf;
00496 }
00497 
00498 void agseterr(agerrlevel_t lvl)
00499 {
00500     agerrlevel = lvl;
00501 }
00502 
00503 int agerrors(void)
00504 {
00505     return MAX(agmaxerr, AG.syntax_errors);
00506 }
00507 
00508 int agreseterrors(void)
00509 {
00510     int rc = MAX(agmaxerr, AG.syntax_errors);
00511     agmaxerr = 0;
00512     return rc;
00513 }
00514 
00515 char *aglasterr()
00516 {
00517     long endpos;
00518     long len;
00519     char *buf;
00520 
00521     if (!agerrout)
00522         return 0;
00523     fflush(agerrout);
00524     endpos = ftell(agerrout);
00525     len = endpos - aglast;
00526     buf = (char*)malloc(len + 1);
00527     fseek(agerrout, aglast, SEEK_SET);
00528     fread(buf, sizeof(char), len, agerrout);
00529     buf[len] = '\0';
00530     fseek(agerrout, endpos, SEEK_SET);
00531 
00532     return buf;
00533 }
00534 
00535 static void
00536 userout (agerrlevel_t level, const char *fmt, va_list args)
00537 {
00538     static char* buf;
00539     static int bufsz = 1024;
00540     char* np;
00541     int n;
00542 
00543     if (!buf) {
00544         buf = (char*)malloc(bufsz);
00545         if (!buf) {
00546             fputs("userout: could not allocate memory\n", stderr );
00547             return;
00548         }
00549     }
00550 
00551     if (level != AGPREV) {
00552         usererrf ((level == AGERR) ? "Error" : "Warning");
00553         usererrf (": ");
00554     }
00555 
00556     while (1) {
00557         n = vsnprintf(buf, bufsz, fmt, args);
00558         if ((n > -1) && (n < bufsz)) {
00559             usererrf (buf);
00560             break;
00561         }
00562         bufsz = MAX(bufsz*2,n+1);
00563         if ((np = (char*)realloc(buf, bufsz)) == NULL) {
00564             fputs("userout: could not allocate memory\n", stderr );
00565             return;
00566         }
00567     }
00568     va_end(args);
00569 }
00570 
00571 /* agerr_va:
00572  * Main error reporting function
00573  */
00574 static int agerr_va(agerrlevel_t level, const char *fmt, va_list args)
00575 {
00576     agerrlevel_t lvl;
00577 
00578     /* Use previous error level if continuation message;
00579      * Convert AGMAX to AGERROR;
00580      * else use input level
00581      */
00582     lvl = (level == AGPREV ? agerrno : (level == AGMAX) ? AGERR : level);
00583 
00584     /* store this error level and maximum error level used */
00585     agerrno = lvl;
00586     agmaxerr = MAX(agmaxerr, agerrno);
00587 
00588     /* We report all messages whose level is bigger than the user set agerrlevel
00589      * Setting agerrlevel to AGMAX turns off immediate error reporting.
00590      */
00591     if (lvl >= agerrlevel) {
00592         if (usererrf)
00593             userout (level, fmt, args);
00594         else {
00595             if (level != AGPREV)
00596                 fprintf(stderr, "%s: ", (level == AGERR) ? "Error" : "Warning");
00597             vfprintf(stderr, fmt, args);
00598             va_end(args);
00599         }
00600         return 0;
00601     }
00602 
00603     /* If error is not immediately reported, store in log file */
00604     if (!agerrout) {
00605         agerrout = tmpfile();
00606         if (!agerrout)
00607             return 1;
00608     }
00609 
00610     if (level != AGPREV)
00611         aglast = ftell(agerrout);
00612     vfprintf(agerrout, fmt, args);
00613     va_end(args);
00614     return 0;
00615 }
00616 
00617 /* agerr:
00618  * Varargs function for reporting errors with level argument
00619  */
00620 int agerr(agerrlevel_t level, char *fmt, ...)
00621 {
00622     va_list args;
00623 
00624     va_start(args, fmt);
00625     return agerr_va(level, fmt, args);
00626 }
00627 
00628 /* agerrorf:
00629  * Varargs function for reporting errors
00630  */
00631 void agerrorf(const char *fmt, ...)
00632 {
00633     va_list args;
00634 
00635     va_start(args, fmt);
00636     agerr_va(AGERR, fmt, args);
00637 }
00638 
00639 /* agwarningf:
00640  * Varargs function for reporting warnings
00641  */
00642 void agwarningf(char *fmt, ...)
00643 {
00644     va_list args;
00645 
00646     va_start(args, fmt);
00647     agerr_va(AGWARN, fmt, args);
00648 }