/*
 * Read an unsigned decimal number from the input.
 *
 * ch is the first character of the number, already consumed by the caller.
 * Further characters are read while they are digits; the character that
 * ends the run (possibly EOF) is handed to lex_ungetc().
 *
 * The digit string is converted with strtonum() over the range
 * 0..LLONG_MAX. A digit string that fills the local buffer, or one that
 * strtonum() rejects, is reported through yyerror().
 *
 * NOTE(review): the buffer check relies on yyerror() not returning -
 * confirm against its declaration.
 */
long long
read_number(int ch)
{
	char		 digits[32];
	size_t		 used = 0;
	const char	*errstr;
	long long	 result;

	digits[used++] = ch;
	for (;;) {
		ch = lex_getc();
		if (ch == EOF || !isdigit((u_char) ch))
			break;
		digits[used++] = ch;
		/* Keep one byte free for the terminating NUL. */
		if (used == (sizeof digits) - 1)
			yyerror("number too long");
	}
	digits[used] = '\0';
	lex_ungetc(ch);

	result = strtonum(digits, 0, LLONG_MAX, &errstr);
	if (errstr != NULL)
		yyerror("number is %s", errstr);

	return (result);
}
/*-------------------------------------------------------------------------
 * Function:    lex_special
 *
 * Purpose:     Special parsing for the next token.  For instance, the
 *              next token might be an unquoted file name `file.pdb'
 *              which would normally be returned as (SYM DOT SYM).
 *              Instead, this function parses it as a single string and
 *              stores it as a TOK_STR token in F.
 *
 *              Leading white space is skipped; line-feeds are skipped
 *              too when SKIPNL is non-zero.  If the first non-space
 *              character cannot start such a string, no token is stored
 *              and the character is pushed back.
 *
 * Return:      void
 *
 * Programmer:  Robb Matzke
 *              [email protected]
 *              Feb 7 1997
 *
 * Modifications:
 *
 *-------------------------------------------------------------------------
 */
void
lex_special(lex_t *f, int skipnl)
{
    /* Punctuation accepted, besides letters and digits, inside a word. */
    static const char word_punct[] = "!@$%^&*-_=+,.?/;:~";
    int         c;
    int         len = 0;

    /* With SKIPNL set, any pending token is discarded first. */
    if (skipnl && f->tok) {
        f->tok = 0;
        f->lexeme[0] = 0;
    }
    assert(0 == f->tok);        /*too late for special lexical analysis*/

    /* Skip leading space.  Skip line-feeds too if SKIPNL is non-zero. */
    f->prompt = skipnl ? LEX_PROMPT2 : LEX_PROMPT;
    do {
        c = lex_getc(f);
    } while (c != EOF && isspace(c) && (skipnl || c != '\n'));
    if (c == EOF) {
        return;
    }

    if (isalnum(c) || strchr(word_punct, c)) {
        f->lexeme[0] = c;
        f->lexeme[1] = '\0';
        len = 1;

        /* Characters beyond the lexeme capacity are consumed but dropped. */
        for (c = lex_getc(f);
             c != EOF && (isalnum(c) || strchr(word_punct, c));
             c = lex_getc(f)) {
            if (len + 1 < sizeof(f->lexeme)) {
                f->lexeme[len++] = c;
                f->lexeme[len] = '\0';
            }
        }
        f->tok = TOK_STR;
    }

    /* Give back the character that ended (or failed to start) the word. */
    lex_ungetc(f, c);
}
/*
 * Read a macro name from the input and return it as a newly allocated
 * string (via xstrdup).
 *
 * type is the sigil character and becomes the first character of the
 * returned name. ch is the character following the sigil: either '{',
 * in which case the name is bracketed and must be closed by '}', or the
 * first character of a bare name.
 *
 * Name characters after the first are alphanumerics, '-' and '_'. For a
 * bare name the terminating character is pushed back; for a bracketed name
 * the closing '}' is consumed. Errors are reported through yyerror().
 *
 * NOTE(review): the length check relies on yyerror() not returning -
 * confirm against its declaration.
 */
char *
read_macro(int type, int ch)
{
	char	 buf[MAXNAMESIZE];
	size_t	 used;
	int	 braced;

	braced = (ch == '{');
	if (braced) {
		/* The first character inside the braces must be alphanumeric. */
		ch = lex_getc();
		if (!isalnum((u_char) ch))
			yyerror("invalid macro name");
	}

	used = 0;
	buf[used++] = type;
	buf[used++] = ch;
	for (;;) {
		ch = lex_getc();
		if (ch == EOF)
			break;
		if (ch != '-' && ch != '_' && !isalnum((u_char) ch))
			break;
		buf[used++] = ch;
		/* Keep one byte free for the terminating NUL. */
		if (used == (sizeof buf) - 1)
			yyerror("macro name too long");
	}
	buf[used] = '\0';

	if (braced) {
		if (ch != '}')
			yyerror("missing }");
	} else
		lex_ungetc(ch);

	if (buf[0] == '\0')
		yyerror("empty macro name");

	return (xstrdup(buf));
}
/*-------------------------------------------------------------------------
 * Function:    lex_token
 *
 * Purpose:     Figures out what token is next on the input stream.  If
 *              SKIPNL is non-zero then the new-line token is skipped.
 *              The token is cached in F, so repeated calls return the
 *              same token until the cached token is cleared elsewhere.
 *
 * Return:      Success:        Token number, optional lexeme returned
 *                              through the LEXEME argument.
 *
 *              Failure:        TOK_INVALID
 *
 * Programmer:  Robb Matzke
 *              [email protected]
 *              Dec 4 1996
 *
 * Modifications:
 *
 *      Robb Matzke, 3 Feb 1997
 *      Cleaned up error messages.
 *
 *      Robb Matzke, 7 Feb 1997
 *      Added the `=' token.
 *
 *      Robb Matzke, 7 Feb 1997
 *      The `*' and `?' characters are now legal as part of a symbol name
 *      so we can give those pattern matching characters to the `ls'
 *      command.
 *
 *      Robb Matzke, 12 Mar 1997
 *      Since we don't have mathematical expressions yet, a numeric
 *      constant is allowed to begin with a `-'.
 *
 *      Robb Matzke, 2000-06-06
 *      Symbol names may include `-'.  Something that starts with a `-' is
 *      only a number if it's followed by a digit.
 *
 *      Mark C. Miller, Mon Nov  9 18:08:05 PST 2009
 *      Added logic to support parsing of '#nnnnnn' dataset names,
 *      but only when in '/.silo' dir.
 *-------------------------------------------------------------------------
 */
int
lex_token(lex_t *f, char **lexeme, int skipnl)
{
    static const char *plain_symchars = "_$/*?";
    static const char *silo_symchars = "_$/*?#";
    const char  *symchars = plain_symchars;
    int         c, len, quote;
    int         in_dot_silo = 0;

    /* Hand back the cached token unless it is a new-line we must skip. */
    if (f->tok && !(skipnl && TOK_EOL == f->tok)) {
        if (lexeme) {
            *lexeme = f->lexeme;
        }
        return f->tok;
    }

    /* Skip leading space, but stop at a line-feed. */
    f->prompt = skipnl ? LEX_PROMPT2 : LEX_PROMPT;
    do {
        c = lex_getc(f);
    } while (c != EOF && c != '\n' && isspace(c));

    /*
     * A leading '#' is normally a comment.  It is treated as a symbol
     * character instead when the directory reported by DBGetDir() for the
     * file bound to `$1' begins with "/.silo".
     */
    if (c == '#') {
        obj_t   f1, val;
        DBfile  *file;
        char    cwd[1024];

        f1 = obj_new(C_SYM, "$1");
        val = sym_vboundp(f1);
        f1 = obj_dest(f1);
        file = file_file(val);
        if (file != NULL && DBGetDir(file, cwd) >= 0 &&
            !strncmp(cwd, "/.silo", 6)) {
            in_dot_silo = 1;
            symchars = silo_symchars;
        }
    }

    /* Comment: discard through end of line and start over. */
    if (c == '#' && !in_dot_silo) {
        do {
            c = lex_getc(f);
        } while (c != EOF && c != '\n');
        lex_ungetc(f, c);
        return lex_token(f, lexeme, skipnl);
    }

    /* Store the next token. */
    if (c == EOF) {
        f->lexeme[0] = '\0';
        f->tok = EOF;

    } else if (c == '\n') {
        if (skipnl) {
            f->tok = lex_token(f, NULL, true);
        } else {
            f->lexeme[0] = '\n';
            f->lexeme[1] = '\0';
            f->tok = TOK_EOL;
        }

    } else if (c == '>') {
        /* Either `>' or `>>'. */
        c = lex_getc(f);
        if (c == '>') {
            strcpy(f->lexeme, ">>");
            f->tok = TOK_RTRT;
        } else {
            lex_ungetc(f, c);
            strcpy(f->lexeme, ">");
            f->tok = TOK_RT;
        }

    } else if (strchr("|.()[]{}:,=", c)) {
        /* Single-character punctuation is its own token number. */
        f->lexeme[0] = c;
        f->lexeme[1] = '\0';
        f->tok = c;

    } else if (isalpha(c) || strchr(symchars, c)) {
        /* A symbol. */
        f->lexeme[0] = c;
        f->lexeme[1] = '\0';
        len = 1;
        for (c = lex_getc(f);
             c != EOF && (isalnum(c) || strchr(symchars, c));
             c = lex_getc(f)) {
            if (len + 1 < sizeof(f->lexeme)) {
                f->lexeme[len++] = c;
                f->lexeme[len] = '\0';
            }
        }
        lex_ungetc(f, c);
        f->tok = TOK_SYM;

    } else if (c == '-') {
        /* Could be a number or a symbol; the next character decides. */
        f->lexeme[0] = c;
        f->lexeme[1] = '\0';
        c = lex_getc(f);
        if (c != EOF && (c == '.' || isdigit(c))) {
            f->lexeme[1] = c;
            f->lexeme[2] = '\0';
            len = 2;
            for (c = lex_getc(f);
                 c != EOF && (isdigit(c) || strchr("+-.eE", c));
                 c = lex_getc(f)) {
                if (len + 1 < sizeof(f->lexeme)) {
                    f->lexeme[len++] = c;
                    f->lexeme[len] = '\0';
                }
            }
            lex_ungetc(f, c);
            f->tok = TOK_NUM;
        } else {
            /* The character already read is the first candidate. */
            len = 1;
            for (/*void*/;
                 c != EOF && (isalnum(c) || strchr("_$/*?-", c));
                 c = lex_getc(f)) {
                if (len + 1 < sizeof(f->lexeme)) {
                    f->lexeme[len++] = c;
                    f->lexeme[len] = '\0';
                }
            }
            lex_ungetc(f, c);
            f->tok = TOK_SYM;
        }

    } else if (isdigit(c)) {
        /* A number (a leading `-' was handled by the branch above). */
        f->lexeme[0] = c;
        f->lexeme[1] = '\0';
        len = 1;
        for (c = lex_getc(f);
             c != EOF && (isdigit(c) || strchr("+-.eE", c));
             c = lex_getc(f)) {
            if (len + 1 < sizeof(f->lexeme)) {
                f->lexeme[len++] = c;
                f->lexeme[len] = '\0';
            }
        }
        lex_ungetc(f, c);
        f->tok = TOK_NUM;

    } else if (c == '"' || c == '\'') {
        /* A string, ended by the same quote that opened it. */
        quote = c;
        len = 0;
        f->lexeme[0] = '\0';
        for (c = lex_getc(f);
             c != EOF && c != quote && c != '\n';
             c = lex_getc(f)) {
            if (c == '\\') {
                c = lex_getc(f);
                switch (c) {
                case 'b':
                    c = '\b';
                    break;
                case 'n':
                    c = '\n';
                    break;
                case 'r':
                    c = '\r';
                    break;
                case 't':
                    c = '\t';
                    break;
                case EOF:
                    /* Backslash at end of input stands for itself. */
                    c = '\\';
                    break;
                default:
                    if (c >= '0' && c <= '7') {
                        /* Octal escape of one to three digits. */
                        int value = c - '0';
                        int ndigits;

                        for (ndigits = 1; ndigits < 3; ndigits++) {
                            int d = lex_getc(f);
                            if (d < '0' || d > '7') {
                                lex_ungetc(f, d);
                                break;
                            }
                            value = value * 8 + (d - '0');
                        }
                        c = value;
                    }
                    /* Any other escaped character stands for itself. */
                    break;
                }
            }
            if (len + 1 < sizeof(f->lexeme)) {
                f->lexeme[len++] = c;
                f->lexeme[len] = '\0';
            }
        }
        if (c == '\n') {
            out_errorn("linefeed inside string constant (truncated at EOL)");
            lex_ungetc(f, c);
        } else if (c < 0) {
            out_errorn("EOF inside string constant (truncated at EOF)");
        }
        f->tok = TOK_STR;

    } else {
        /* Invalid character.  Don't print an error message since a
         * syntax error will result in the parser anyway. */
        f->lexeme[0] = c;
        f->lexeme[1] = '\0';
        f->tok = TOK_INVALID;
    }

    if (lexeme) {
        *lexeme = f->lexeme;
    }
    return f->tok;
}
/*-------------------------------------------------------------------------
 * Function:    lex_getc
 *
 * Purpose:     Similar to getc(3) except uses the GNU readline library
 *              and issues prompts as necessary.
 *
 *              Characters come from, in order of preference: the
 *              pending line buffer F->s, an interactive stream (a whole
 *              line is fetched into F->s first), a non-interactive
 *              stream, or the top of the stack of nested input streams.
 *              A backslash immediately followed by a line-feed is
 *              swallowed and the character after the line-feed is
 *              returned instead.
 *
 *              Characters taken from the line buffer are returned as
 *              unsigned char values, as getc(3) does, so a byte with
 *              the high bit set is never negative and cannot be
 *              mistaken for EOF.
 *
 * Return:      Success:        Next character
 *
 *              Failure:        EOF (also returned if memory for an
 *                              interactive line cannot be allocated)
 *
 * Programmer:  Robb Matzke
 *              [email protected]
 *              Dec 10 1996
 *
 * Modifications:
 *      Robb Matzke, 29 Jul 1997
 *      If the line-feed is escaped with a backslash, then the backslash
 *      and line-feed are both ignored.
 *
 *      Jeremy Meredith, Thu Aug 26 09:59:44 PDT 1999
 *      Changed use of strdup() to safe_strdup().
 *
 *      Robb Matzke, 2000-07-10
 *      Modified to work with stacked input streams.
 *-------------------------------------------------------------------------
 */
int
lex_getc(lex_t *f)
{
    int         c=EOF;

#ifdef HAVE_READLINE_HISTORY
    static char buf[1024];
#endif

    if (f->s) {
        /* Go through unsigned char so that bytes >= 0x80 are returned as
         * positive values, matching getc(3). */
        c = (unsigned char)f->s[f->at++];
        if (!f->s[f->at]) {
            /* Line exhausted: release it so the next call refills. */
            free(f->s);
            f->s = NULL;
            f->at = 0;
        }

    } else if (f->f && isatty(fileno(f->f))) {
        /* Input is from the standard input stream.  Use readline() to
         * get it and add it to the history if different than the
         * previous line. */
#if defined(HAVE_READLINE_READLINE_H) && defined(HAVE_LIBREADLINE)
        char *temp = readline(f->prompt);
        if (temp) {
            size_t n = strlen(temp);

            /* Room for the line, the line-feed readline() strips, and
             * the terminating NUL. */
            f->s = malloc(n+2);
            if (f->s) {
                memcpy(f->s, temp, n);
                f->s[n] = '\n';
                f->s[n+1] = '\0';
            }
            /* readline() hands over ownership of its buffer. */
            free(temp);
        }
#else
        char temp[4096];
        fputs(f->prompt, stdout);
        if (fgets(temp, sizeof(temp), f->f)) {
            f->s = safe_strdup(temp);
        } else {
            f->s = NULL;
        }
#endif
        f->at = 0;
#if defined(HAVE_READLINE_READLINE_H) && defined(HAVE_READLINE_HISTORY)
        if (f->s && f->s[0] && strncmp(buf, f->s, sizeof(buf))) {
            add_history(f->s);
            strncpy(buf, f->s, sizeof(buf));
        }
#endif
        c = (f->s ? lex_getc(f) : EOF);

    } else if (f->f) {
        /* Input is from a non-interactive stream. */
        c = getc(f->f);

    } else if (f->nstack) {
        /* Read from the innermost stacked stream, closing and popping
         * each stream that reports EOF.  The recursive call has already
         * handled any escaped line-feed. */
        while (f->nstack && EOF==(c=lex_getc(f->stack[f->nstack-1]))) {
            lex_close(f->stack[f->nstack-1]);
            f->stack[--f->nstack] = NULL;
        }
        return c;

    } else {
        return EOF;
    }

    /* If this character is a backslash and the following character
     * is a line-feed, then ignore both of them and return the following
     * character instead.  This allows us to always continue a line by
     * escaping the line-feed. */
    if ('\\'==c) {
        int peek = lex_getc(f);
        if ('\n'!=peek) lex_ungetc(f, peek);
        else c = lex_getc(f);
    }

    return c;
}
char * read_string(char endch, int esc) { int ch, oldch; size_t pos, len, slen; char *name, *s, *buf; struct macro *macro; len = 24; buf = xmalloc(len + 1); pos = 0; while ((ch = lex_getc()) != endch) { switch (ch) { case EOF: yyerror("missing %c", endch); case '\\': if (!esc) break; switch (ch = lex_getc()) { case EOF: yyerror("missing %c", endch); case 'r': ch = '\r'; break; case 'n': ch = '\n'; break; case 't': ch = '\t'; break; } break; case '$': case '%': if (!esc) break; oldch = ch; ch = lex_getc(); if (ch == EOF) yyerror("missing %c", endch); if (ch != '{') { lex_ungetc(ch); ch = oldch; break; } name = read_macro(oldch, '{'); if ((macro = find_macro(name)) == NULL) { xfree(name); continue; } xfree(name); if (macro->type == MACRO_NUMBER) xasprintf(&s, "%lld", macro->value.num); else s = macro->value.str; slen = strlen(s); ENSURE_FOR(buf, len, pos, slen + 1); memcpy(buf + pos, s, slen); pos += slen; if (macro->type == MACRO_NUMBER) xfree(s); continue; } buf[pos++] = ch; ENSURE_SIZE(buf, len, pos); } buf[pos] = '\0'; return (buf); }
int read_token(int ch) { int ch2; char token[128], *name; size_t tlen; struct token *ptr; struct macro *macro; tlen = 0; token[tlen++] = ch; while ((ch = lex_getc()) != EOF) { if (!isalnum((u_char) ch) && ch != '-' && ch != '_') break; token[tlen++] = ch; if (tlen == (sizeof token) - 1) yyerror("token too long"); } token[tlen] = '\0'; lex_ungetc(ch); /* * ifdef/ifndef/endif is special-cased here since it is really really * hard to make work with yacc. */ if (strcmp(token, "ifdef") == 0 || strcmp(token, "ifndef") == 0) { while ((ch = lex_getc()) != EOF && isspace((u_char) ch)) ; if (ch != '$' && ch != '%') yyerror("syntax error"); ch2 = lex_getc(); if (ch2 != '{' && !isalnum((u_char) ch2)) yyerror("invalid macro name"); name = read_macro(ch, ch2); macro = find_macro(name); xfree(name); if (token[2] == 'n' && macro != NULL) lex_skip = 1; if (token[2] != 'n' && macro == NULL) lex_skip = 1; lex_ifdef++; return (NONE); } if (strcmp(token, "endif") == 0) { if (lex_ifdef == 0) yyerror("spurious endif"); lex_ifdef--; if (lex_ifdef == 0) lex_skip = 0; return (NONE); } if (strcmp(token, "include") == 0) { /* * This is a bit strange. * * yacc may have symbols buffered and be waiting for more to * decide which production to match, so we can't just switch * file now. So, we set a flag that tells yylex to switch files * next time it's called and return the NONE symbol. This is a * placeholder not used in any real productions, so it should * cause yacc to match using whatever it has (assuming it * can). If we don't do this, there are problems with things * like: * * $file = "abc" * include "${file}" * * The include token is seen before yacc has matched the * previous line, so the macro doesn't exist when we try to * build the include file path. */ lex_include = 1; return (NONE); } ptr = bsearch(token, tokens, (sizeof tokens)/(sizeof tokens[0]), sizeof tokens[0], cmp_token); if (ptr == NULL) yyerror("unknown token: %s", token); return (ptr->value); }
int yylex(void) { int ch, value; char *path; struct replpath rp; /* Switch to new file. See comment in read_token below. */ if (lex_include) { while ((ch = lex_getc()) != EOF && isspace((u_char) ch)) ; if (ch != '"' && ch != '\'') yyerror("syntax error"); if (ch == '"') rp.str = read_string('"', 1); else rp.str = read_string('\'', 0); path = replacepath(&rp, parse_tags, NULL, NULL, conf.user_home); xfree(rp.str); include_start(path); lex_include = 0; } restart: while ((ch = lex_getc()) != EOF) { switch (ch) { case '#': /* Comment: discard until EOL. */ while ((ch = lex_getc()) != '\n' && ch != EOF) ; parse_file->line++; break; case '\'': yylval.string = read_string('\'', 0); value = STRING; goto out; case '"': yylval.string = read_string('"', 1); value = STRING; goto out; case '$': ch = lex_getc(); if (ch == '(') { yylval.string = read_command(); value = STRCOMMAND; goto out; } if (ch == '{' || isalnum((u_char) ch)) { yylval.string = read_macro('$', ch); value = STRMACRO; goto out; } yyerror("invalid macro name"); case '%': ch = lex_getc(); if (ch == '(') { yylval.string = read_command(); value = NUMCOMMAND; goto out; } if (ch == '{' || isalnum((u_char) ch)) { yylval.string = read_macro('%', ch); value = NUMMACRO; goto out; } yyerror("invalid macro name"); case '=': ch = lex_getc(); if (ch == '=') { value = TOKEQ; goto out; } lex_ungetc(ch); value = '='; goto out; case '!': ch = lex_getc(); if (ch == '=') { value = TOKNE; goto out; } lex_ungetc(ch); value = '!'; goto out; case '~': case '+': case '(': case ')': case ',': case '<': case '>': case '{': case '}': case '*': value = ch; goto out; case '\n': parse_file->line++; break; case ' ': case '\t': break; default: if (ch != '_' && ch != '-' && !isalnum((u_char) ch)) yyerror("unexpected character: %c", ch); if (isdigit((u_char) ch)) { yylval.number = read_number(ch); value = NUMBER; goto out; } value = read_token(ch); goto out; } } if (!include_finish()) goto restart; if (lex_ifdef != 0) yyerror("missing endif"); 
return (EOF); out: if (lex_skip) goto restart; return (value); }
/*
 * Scan the next token from l.
 *
 * Spaces and tabs are skipped; each line-feed increments l->lineno.
 *
 * Returns:
 *   NUMBER  for a run of decimal digits; the value is stored in l->val.
 *           No overflow check is made on the accumulated value.
 *   IDENT   for a letter followed by alphanumerics; the text is left in
 *           l->lexbuf and l->val holds the result of sym_lookup(), or of
 *           sym_insert() when the lookup returns 0.
 *   END     at end of input.
 *   Otherwise the character itself, with l->val set to NONE.
 *
 * An identifier is stored NUL-terminated in l->lexbuf, so at most
 * l->lexbufsz - 1 characters are kept. When a name is longer, scanning
 * stops there, the next character is pushed back, and the remainder is
 * scanned as a separate token by the following call.
 *
 * NOTE(review): assumes l->lexbufsz is at least 2 (one character plus the
 * terminator) - confirm where the buffer is allocated.
 */
Token
lex_scan(lex *l)
{
    int t;

    for (;;) {
        t = lex_getc(l);
        if (t == ' ' || t == '\t') {
            ; /* strip whitespace */
        } else if (t == '\n') {
            l->lineno++;
        } else if (isdigit(t)) {
            int v;

            v = 0;
            do {
                v = 10 * v + (t - '0');
                t = lex_getc(l);
            } while (isdigit(t));
            if (t != EOF)
                lex_ungetc(l, t);
            l->val = v;
            return NUMBER;
        } else if (isalpha(t)) {
            char *end;
            char *p;
            int sym;

            /*
             * Stop one byte short of the end of the buffer so the
             * terminating NUL written below always lands inside it.
             */
            end = l->lexbuf + l->lexbufsz - 1;
            p = l->lexbuf;
            do {
                *p++ = t;
                t = lex_getc(l);
            } while (p < end && isalnum(t));
            *p = '\0';
            if (t != EOF)
                lex_ungetc(l, t);

            sym = sym_lookup(l->sym, l->lexbuf);
            if (!sym)
                sym = sym_insert(l->sym, l->lexbuf, IDENT);
            l->val = sym;
            return IDENT;
        } else if (t == EOF) {
            return END;
        } else {
            l->val = NONE;
            return t;
        }
    }
}