/*
 * Lex a simple (unquoted) symbol.
 *
 * Preconditions (checked by assert):
 * - the lexer's buffer is empty
 * - the reader's current character satisfies issimple
 *
 * The current character and every following character accepted by
 * issimple are copied into the buffer; the reader is left on the first
 * character that is not simple.
 *
 * Returns the token id of the matching entry found by in_smt2_tk when
 * the text is a reserved word, and SMT2_TK_SYMBOL otherwise.
 */
static smt2_token_t smt2_read_symbol(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  const keyword_t *entry;
  int ch;

  ch = reader_current_char(in);
  assert(string_buffer_length(buf) == 0 && issimple(ch));

  // the first character is known to be simple: copy first, test after
  for (;;) {
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
    if (!issimple(ch)) {
      break;
    }
  }
  string_buffer_close(buf);

  // reserved words take precedence over plain symbols
  entry = in_smt2_tk(buf->data, buf->index);
  if (entry == NULL) {
    return SMT2_TK_SYMBOL;
  }
  return entry->tk;
}
/*
 * Lex a Yices symbol or keyword.
 *
 * On entry lex->buffer already holds the first character of the token
 * and the reader's current character is the one that follows it.
 * Characters are appended to the buffer until is_yices_sep accepts the
 * current character; that separator is left unread.
 *
 * Returns the token id of the keyword found by in_yices_kw, or
 * TK_SYMBOL when the text is not a keyword.
 */
static yices_token_t read_symbol(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  const keyword_t *entry;
  token_t result;
  int ch;

  for (ch = reader_current_char(in); !is_yices_sep(ch); ch = reader_next_char(in)) {
    string_buffer_append_char(buf, ch);
  }
  string_buffer_close(buf);

  entry = in_yices_kw(buf->data, buf->index);
  result = (entry != NULL) ? entry->tk : TK_SYMBOL;

  return result;
}
/*
 * Lex a quoted symbol: the text enclosed between two '|' characters.
 *
 * Preconditions (checked by assert):
 * - the lexer's buffer is empty
 * - the reader's current character is the opening '|'
 *
 * Characters after the opening '|' are copied into the buffer until one
 * of the following is met:
 * - the closing '|'           (success)
 * - a backslash               (not allowed in a quoted symbol)
 * - a character rejected by ok_char
 * The delimiters are never stored in the buffer.
 *
 * Returns SMT2_TK_QSYMBOL when the closing '|' is found (it is consumed),
 * and SMT2_TK_INVALID_SYMBOL otherwise (the offending character is left
 * as the reader's current character).
 */
static smt2_token_t smt2_read_quoted_symbol(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  int ch;

  assert(string_buffer_length(buf) == 0 && reader_current_char(in) == '|');

  // ok_char is used instead of a strict isprint test so that existing
  // benchmarks can be parsed
  ch = reader_next_char(in);
  while (ch != '|' && ch != '\\' && ok_char(ch)) {
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
  }
  string_buffer_close(buf);

  if (ch != '|') {
    // stopped on a forbidden character
    return SMT2_TK_INVALID_SYMBOL;
  }

  // skip the closing '|'
  reader_next_char(in);
  return SMT2_TK_QSYMBOL;
}
/*
 * Lex a hexadecimal literal of the form #x<hex digits>.
 *
 * Preconditions (checked by assert):
 * - the buffer contains exactly "#"
 * - the reader's current character is 'x'
 *
 * The 'x' and all hexadecimal digits that follow it are appended to the
 * buffer. The reader stops on the first character that isxdigit rejects.
 *
 * Returns:
 * - SMT2_TK_HEXADECIMAL if at least one digit follows "#x"
 * - SMT2_TK_INVALID_HEXADECIMAL if the buffer holds only "#x"
 */
static smt2_token_t smt2_read_hexa(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  int ch;

  ch = reader_current_char(in);
  assert(string_buffer_length(buf) == 1 && buf->data[0] == '#' && ch == 'x');

  // store the 'x' marker, then the digits
  string_buffer_append_char(buf, ch);
  for (ch = reader_next_char(in); isxdigit(ch); ch = reader_next_char(in)) {
    string_buffer_append_char(buf, ch);
  }
  string_buffer_close(buf);

  // "#x" alone occupies two characters: anything longer has digits
  if (string_buffer_length(buf) > 2) {
    return SMT2_TK_HEXADECIMAL;
  }
  return SMT2_TK_INVALID_HEXADECIMAL;
}
/*
 * Lex a keyword: ':' followed by a sequence of simple characters.
 *
 * Preconditions (checked by assert):
 * - the buffer is empty
 * - the reader's current character is ':'
 *
 * The ':' and every following character accepted by issimple are
 * appended to the buffer.
 *
 * Returns:
 * - SMT2_TK_KEYWORD if at least one simple character follows the ':'
 * - SMT2_TK_INVALID_KEYWORD if the buffer holds only ":"
 */
static smt2_token_t smt2_read_keyword(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  int ch;

  ch = reader_current_char(in);
  assert(string_buffer_length(buf) == 0 && ch == ':');

  // store the ':' then the keyword's name
  string_buffer_append_char(buf, ch);
  for (ch = reader_next_char(in); issimple(ch); ch = reader_next_char(in)) {
    string_buffer_append_char(buf, ch);
  }
  string_buffer_close(buf);

  if (string_buffer_length(buf) > 1) {
    return SMT2_TK_KEYWORD;
  }
  return SMT2_TK_INVALID_KEYWORD;
}
/*
 * Lex a binary literal of the form #b<bits>.
 *
 * Preconditions (checked by assert):
 * - the buffer contains exactly "#"
 * - the reader's current character is 'b'
 *
 * The 'b' and all '0'/'1' characters that follow it are appended to the
 * buffer. The reader stops on the first character that is neither '0'
 * nor '1'.
 *
 * Returns:
 * - SMT2_TK_BINARY if at least one bit follows "#b"
 * - SMT2_TK_INVALID_BINARY if the buffer holds only "#b"
 */
static smt2_token_t smt2_read_binary(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  int ch;

  ch = reader_current_char(in);
  assert(string_buffer_length(buf) == 1 && buf->data[0] == '#' && ch == 'b');

  // store the 'b' marker, then the bits
  string_buffer_append_char(buf, ch);
  ch = reader_next_char(in);
  while (ch == '0' || ch == '1') {
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
  }
  string_buffer_close(buf);

  // "#b" alone occupies two characters: anything longer has bits
  if (string_buffer_length(buf) > 2) {
    return SMT2_TK_BINARY;
  }
  return SMT2_TK_INVALID_BINARY;
}
/*
 * Lex a string literal using the SMT-LIB 2.5 escape convention.
 *
 * Precondition (checked by assert): the reader's current character is
 * the opening double quote.
 *
 * Inside the literal, two consecutive double quotes stand for a single
 * double-quote character. A double quote that is not immediately
 * followed by another one terminates the literal; in that case the
 * reader is left on the character that follows the closing quote.
 *
 * Any character with code below 32 that is not a space character
 * (per isspace) aborts the scan. This includes EOF.
 *
 * The buffer receives the contents of the literal without delimiters.
 *
 * Returns SMT2_TK_STRING on success and SMT2_TK_INVALID_STRING if the
 * scan was aborted.
 */
static smt2_token_t smt2_read_string_var(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  smt2_token_t result;
  int ch;

  assert(reader_current_char(in) == '"');

  // assume failure until a proper closing quote is seen
  result = SMT2_TK_INVALID_STRING;

  for (;;) {
    ch = reader_next_char(in);

    if (ch == '"') {
      // look one character ahead to tell "" (escape) from " (end)
      ch = reader_next_char(in);
      if (ch != '"') {
        result = SMT2_TK_STRING;
        break;
      }
      // doubled quote: ch is '"' and is stored below
    }

    if (ch < 32 && !isspace(ch)) {
      // control character or EOF inside the literal
      break;
    }

    string_buffer_append_char(buf, ch);
  }

  string_buffer_close(buf);

  return result;
}
/*
 * Read a string literal:
 * - on entry, the reader's current character is the opening '"'
 *   (checked by assert)
 * - the content of the literal, without the delimiting quotes and with
 *   escape sequences replaced, is stored in lex->buffer
 *
 * Escape sequences:
 * - \n is replaced by a newline and \t by a tab
 * - \ followed by one to three octal digits is replaced by the
 *   character with that octal code
 * - \ followed by any other character is replaced by that character
 *
 * Return code:
 * - TK_STRING if the closing '"' is found (it is consumed)
 * - TK_OPEN_STRING if a newline or EOF is found before the closing '"'.
 *   This includes EOF right after a backslash: in that case, nothing is
 *   added to the buffer for the incomplete escape sequence.
 */
static yices_token_t read_string(lexer_t *lex) {
  yices_token_t tk;
  int c, x;
  reader_t *rd;
  string_buffer_t *buffer;

  rd = &lex->reader;
  buffer = lex->buffer;
  assert(reader_current_char(rd) == '"');

  c = reader_next_char(rd);

  for (;;) {
    if (c == '"') { // end of string
      // consume the closing quote
      reader_next_char(rd);
      tk = TK_STRING;
      break;
    }

    if (c == '\n' || c == EOF) { // missing quotes
      tk = TK_OPEN_STRING;
      break;
    }

    if (c == '\\') {
      // escape sequence
      c = reader_next_char(rd);

      if (c == EOF) {
        // input ends in the middle of an escape sequence:
        // don't store EOF in the buffer as if it were a character
        tk = TK_OPEN_STRING;
        break;
      }

      switch (c) {
      case 'n':
        c = '\n';
        break;

      case 't':
        c = '\t';
        break;

      default:
        if ('0' <= c && c <= '7') {
          // read at most 2 more octal digits
          x = c - '0';
          c = reader_next_char(rd);
          if ('0' <= c && c <= '7') {
            x = 8 * x + (c - '0');
            c = reader_next_char(rd);
            if ('0' <= c && c <= '7') {
              x = 8 * x + (c - '0');
              c = reader_next_char(rd);
            }
          }
          // x = character built from the octal digits
          // c = character after octal digit
          // NOTE(review): x can be as large as 0777, which does not fit
          // in 8 bits; confirm how string_buffer_append_char handles it.
          string_buffer_append_char(buffer, x);
          continue;
        }
        // else skip '\': copy c in the buffer
        break;
      }
    }

    string_buffer_append_char(buffer, c);
    c = reader_next_char(rd);
  }

  string_buffer_close(buffer);

  return tk;
}
/*
 * Lex a number whose first digit is '0'.
 *
 * Preconditions (checked by assert):
 * - the buffer is empty
 * - the reader's current character is '0'
 *
 * Three shapes are recognized:
 * - "0" alone                  --> SMT2_TK_NUMERAL
 * - "0." followed by digits    --> SMT2_TK_DECIMAL
 *   ("0." with no digit after the dot gives SMT2_TK_INVALID_DECIMAL)
 * - "0" followed by digits     --> SMT2_TK_INVALID_NUMERAL
 *
 * In every case, the characters read are stored in the buffer.
 */
static smt2_token_t smt2_read_number0(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  smt2_token_t result;
  int ch;

  ch = reader_current_char(in);
  assert(string_buffer_length(buf) == 0 && ch == '0');

  // leading zero
  string_buffer_append_char(buf, ch);
  ch = reader_next_char(in);

  result = SMT2_TK_NUMERAL;

  if (ch == '.') {
    // decimal: store the dot then the fractional digits
    string_buffer_append_char(buf, ch);
    for (ch = reader_next_char(in); isdigit(ch); ch = reader_next_char(in)) {
      string_buffer_append_char(buf, ch);
    }
    // "0." has length 2: a valid decimal must be longer
    if (string_buffer_length(buf) > 2) {
      result = SMT2_TK_DECIMAL;
    } else {
      result = SMT2_TK_INVALID_DECIMAL;
    }

  } else if (isdigit(ch)) {
    // malformed numeral with a leading zero (e.g., "00" or "05"):
    // keep all the digits so that the error message shows the full token
    while (isdigit(ch)) {
      string_buffer_append_char(buf, ch);
      ch = reader_next_char(in);
    }
    result = SMT2_TK_INVALID_NUMERAL;
  }

  string_buffer_close(buf);

  return result;
}
/*
 * Lex a hexadecimal constant of the form 0x<hex digits>.
 *
 * On entry, lex->buffer contains "0" and the reader's current character
 * is 'x'. The 'x' and all hexadecimal digits that follow are appended
 * to the buffer; the reader stops on the first character rejected by
 * isxdigit.
 *
 * Returns TK_HEX_CONSTANT if the buffer is longer than two characters
 * (i.e., at least one digit follows "0x") and TK_EMPTY_HEXCONST
 * otherwise.
 */
static yices_token_t read_hex_constant(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  int ch;

  // store the 'x' unconditionally, then the digits
  ch = reader_current_char(in);
  string_buffer_append_char(buf, ch);
  for (ch = reader_next_char(in); isxdigit(ch); ch = reader_next_char(in)) {
    string_buffer_append_char(buf, ch);
  }
  string_buffer_close(buf);

  if (string_buffer_length(buf) > 2) {
    return TK_HEX_CONSTANT;
  }
  return TK_EMPTY_HEXCONST; // nothing after the prefix
}
/*
 * Lex a number whose first digit is not '0'.
 *
 * Preconditions (checked by assert):
 * - the buffer is empty
 * - the reader's current character is a digit between '1' and '9'
 *
 * The leading run of digits is stored in the buffer. If it is followed
 * by '.', the dot and the digits after it are stored too.
 *
 * Returns:
 * - SMT2_TK_NUMERAL if no '.' follows the integer part
 * - SMT2_TK_DECIMAL if '.' is followed by at least one digit
 * - SMT2_TK_INVALID_DECIMAL if '.' is not followed by a digit
 */
static smt2_token_t smt2_read_number(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  smt2_token_t result;
  uint32_t int_len;
  int ch;

  ch = reader_current_char(in);
  assert(string_buffer_length(buf) == 0 && isdigit(ch) && ch != '0');

  // integer part: the first character is known to be a digit
  while (isdigit(ch)) {
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
  }

  result = SMT2_TK_NUMERAL;

  if (ch == '.') {
    // remember where the integer part ends
    int_len = string_buffer_length(buf);

    // store the dot then the fractional digits
    string_buffer_append_char(buf, ch);
    for (ch = reader_next_char(in); isdigit(ch); ch = reader_next_char(in)) {
      string_buffer_append_char(buf, ch);
    }

    // the buffer must contain more than <integer part> + '.'
    if (string_buffer_length(buf) > int_len + 1) {
      result = SMT2_TK_DECIMAL;
    } else {
      result = SMT2_TK_INVALID_DECIMAL;
    }
  }

  string_buffer_close(buf);

  return result;
}
/*
 * Read a string literal
 * - current char is " (checked by assert)
 * - read all characters until the closing " or until a character
 *   that's neither printable nor a space (as defined by isprint and
 *   isspace)
 * - replace escape sequences \" by " and \\ by \
 * - a backslash followed by any other character is kept as is
 *
 * Result: the lexer's buffer contains the string literal
 * without the delimiting quotes.
 * - return code:
 *   SMT2_TK_STRING if the string is valid (the closing quote is
 *     consumed)
 *   SMT2_TK_INVALID_STRING if the string is terminated by
 *     a non-printable character (including EOF). The same check is
 *     applied to the character that follows a backslash, so EOF or a
 *     control character is never copied into the buffer.
 *
 * NOTE: this is not strictly compliant with the SMT-LIB 2.0
 * standard as we may include non-ascii printable characters
 * in the string.
 *
 * NOTE2: the SMT-LIB2 standard says 'a string is any sequence of
 * printable ASCII characters delimited by double quotes ...' But it
 * does not define 'printable ASCII character'. Several benchmarks in
 * SMT-LIB include line breaks inside a string (which are not
 * printable characters), so the loop below allows both printable
 * characters and spaces.
 */
static smt2_token_t smt2_read_string(lexer_t *lex) {
  reader_t *rd;
  string_buffer_t *buffer;
  int c;
  smt2_token_t tk;

  rd = &lex->reader;
  buffer = lex->buffer;
  assert(reader_current_char(rd) == '"');

  for (;;) {
    c = reader_next_char(rd);
    if (c == '"') {
      // consume the closing quote
      reader_next_char(rd);
      tk = SMT2_TK_STRING;
      break;
    }

    if (!isprint(c) && !isspace(c)) {
      // error
      tk = SMT2_TK_INVALID_STRING;
      break;
    }

    if (c == '\\') {
      c = reader_next_char(rd);
      if (c != '"' && c != '\\') {
        // not an escape sequence: keep the backslash
        string_buffer_append_char(buffer, '\\');
        if (!isprint(c) && !isspace(c)) {
          // the character after the backslash is subject to the same
          // restriction as any other character (this also catches EOF)
          tk = SMT2_TK_INVALID_STRING;
          break;
        }
      }
    }
    string_buffer_append_char(buffer, c);
  }

  string_buffer_close(buffer);

  return tk;
}
/*
 * Read the next Yices token and return its code.
 *
 * Side effects on lex:
 * - lex->buffer is reset, then filled by whichever reader handles the
 *   token (strings, numbers, bitvector/hex constants, symbols) or with
 *   ":" when a lone colon is reported as TK_ERROR
 * - lex->tk_pos, lex->tk_line, lex->tk_column record the reader's
 *   position at the start of the token
 * - lex->token is set to the returned code
 *
 * White space and comments (from ';' to the end of the line) that
 * precede the token are skipped.
 */
yices_token_t next_yices_token(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  yices_token_t result;
  int consume_current; // non-zero if the current char must be skipped before returning
  int ch;

  ch = reader_current_char(in);
  string_buffer_reset(buf);

  // skip white space and ';' comments
  while (isspace(ch) || ch == ';') {
    if (ch == ';') {
      // comment: drop everything up to the end of the line or EOF
      do {
        ch = reader_next_char(in);
      } while (ch != '\n' && ch != EOF);
    } else {
      ch = reader_next_char(in);
    }
  }

  // the token starts here
  lex->tk_pos = in->pos;
  lex->tk_line = in->line;
  lex->tk_column = in->column;

  consume_current = 0;

  switch (ch) {
  case '(':
    result = TK_LP;
    consume_current = 1;
    break;

  case ')':
    result = TK_RP;
    consume_current = 1;
    break;

  case EOF:
    result = TK_EOS;
    break;

  case ':':
    if (reader_next_char(in) == ':') {
      result = TK_COLON_COLON;
      consume_current = 1;
    } else {
      // a single ':' is an error: keep it in the buffer for the
      // benefit of error reporting
      string_buffer_append_char(buf, ':');
      string_buffer_close(buf);
      result = TK_ERROR;
    }
    break;

  case '"':
    result = read_string(lex);
    break;

  case '+':
  case '-':
    // a sign starts a number only if a digit comes right after it
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
    if (isdigit(ch)) {
      string_buffer_append_char(buf, ch);
      reader_next_char(in);
      result = read_number(lex);
    } else {
      result = read_symbol(lex);
    }
    break;

  case '0':
    // "0b..." and "0x..." are constants, anything else is a number
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
    if (ch == 'b') {
      result = read_bv_constant(lex);
    } else if (ch == 'x') {
      result = read_hex_constant(lex);
    } else {
      result = read_number(lex);
    }
    break;

  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9':
    string_buffer_append_char(buf, ch);
    reader_next_char(in);
    result = read_number(lex);
    break;

  default:
    // everything else starts a symbol or a keyword
    string_buffer_append_char(buf, ch);
    reader_next_char(in);
    result = read_symbol(lex);
    break;
  }

  if (consume_current) {
    // single-character tokens: move past the token's last character
    reader_next_char(in);
  }

  lex->token = result;

  return result;
}
/*
 * Lex the rest of a Yices number.
 *
 * On entry, lex->buffer contains an optional sign followed by one
 * digit, and the reader's current character is what comes after that
 * digit. All the characters that belong to the number are appended to
 * the buffer.
 *
 * Accepted shapes after the leading digits:
 * - '/' <digits>                      rational
 * - '.' <digits>                      floating point
 * - ('e' | 'E') [sign] <digits>       exponent (may follow a fraction)
 *
 * Returns:
 * - TK_NUM_RATIONAL for an integer or a fraction with a non-zero
 *   denominator
 * - TK_ZERO_DIVISOR if the denominator consists only of '0' digits
 * - TK_NUM_FLOAT if there is a fractional part or an exponent
 * - TK_INVALID_NUM if '/', '.', or the exponent marker is not followed
 *   by a digit
 */
static yices_token_t read_number(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  yices_token_t result;
  int ch, nonzero_seen, well_formed;

  ch = reader_current_char(in);
  result = TK_NUM_RATIONAL; // unless proved otherwise

  // remaining digits of the integer part
  for (; isdigit(ch); ch = reader_next_char(in)) {
    string_buffer_append_char(buf, ch);
  }

  if (ch == '/') {
    // fraction: nothing may follow the denominator
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
    if (isdigit(ch)) {
      nonzero_seen = false;
      do {
        if (ch != '0') {
          nonzero_seen = true;
        }
        string_buffer_append_char(buf, ch);
        ch = reader_next_char(in);
      } while (isdigit(ch));
      if (!nonzero_seen) {
        result = TK_ZERO_DIVISOR;
      }
    } else {
      result = TK_INVALID_NUM;
    }

  } else {
    well_formed = true;

    if (ch == '.') {
      // fractional part: at least one digit is required
      result = TK_NUM_FLOAT;
      string_buffer_append_char(buf, ch);
      ch = reader_next_char(in);
      if (isdigit(ch)) {
        do {
          string_buffer_append_char(buf, ch);
          ch = reader_next_char(in);
        } while (isdigit(ch));
      } else {
        result = TK_INVALID_NUM;
        well_formed = false;
      }
    }

    if (well_formed && (ch == 'e' || ch == 'E')) {
      // exponent: optional sign then at least one digit
      result = TK_NUM_FLOAT;
      string_buffer_append_char(buf, ch);
      ch = reader_next_char(in);
      if (ch == '+' || ch == '-') {
        string_buffer_append_char(buf, ch);
        ch = reader_next_char(in);
      }
      if (isdigit(ch)) {
        do {
          string_buffer_append_char(buf, ch);
          ch = reader_next_char(in);
        } while (isdigit(ch));
      } else {
        result = TK_INVALID_NUM;
      }
    }
  }

  string_buffer_close(buf);

  return result;
}
/*
 * Read the next SMT-LIB 2 token and return its code.
 *
 * Side effects on lex:
 * - lex->buffer is reset, then filled by whichever reader handles the
 *   token; for SMT2_TK_ERROR it holds the offending character(s)
 * - lex->tk_pos, lex->tk_line, lex->tk_column record the reader's
 *   position at the start of the token
 * - lex->token is set to the returned code
 *
 * White space and comments (from ';' to the end of the line) that
 * precede the token are skipped.
 *
 * String literals are read by smt2_read_string_var when the flag
 * two_dot_five_variant is set, and by smt2_read_string otherwise.
 */
smt2_token_t next_smt2_token(lexer_t *lex) {
  reader_t *in = &lex->reader;
  string_buffer_t *buf = lex->buffer;
  smt2_token_t result;
  int consume_current; // non-zero if the current char must be skipped before returning
  int ch;

  ch = reader_current_char(in);
  string_buffer_reset(buf);

  // skip white space and ';' comments
  while (isspace(ch) || ch == ';') {
    if (ch == ';') {
      // comment: drop everything up to the end of the line or EOF
      do {
        ch = reader_next_char(in);
      } while (ch != '\n' && ch != EOF);
    } else {
      ch = reader_next_char(in);
    }
  }

  // the token starts here
  lex->tk_pos = in->pos;
  lex->tk_line = in->line;
  lex->tk_column = in->column;

  consume_current = 0;

  switch (ch) {
  case '(':
    result = SMT2_TK_LP;
    consume_current = 1;
    break;

  case ')':
    result = SMT2_TK_RP;
    consume_current = 1;
    break;

  case EOF:
    result = SMT2_TK_EOS;
    break;

  case '"':
    result = two_dot_five_variant ? smt2_read_string_var(lex) : smt2_read_string(lex);
    break;

  case '#':
    // "#b..." is a binary literal, "#x..." a hexadecimal literal
    string_buffer_append_char(buf, ch);
    ch = reader_next_char(in);
    if (ch == 'b') {
      result = smt2_read_binary(lex);
    } else if (ch == 'x') {
      result = smt2_read_hexa(lex);
    } else {
      result = SMT2_TK_ERROR;
      string_buffer_close(buf);
    }
    break;

  case '0':
    result = smt2_read_number0(lex);
    break;

  case '1':
  case '2':
  case '3':
  case '4':
  case '5':
  case '6':
  case '7':
  case '8':
  case '9':
    result = smt2_read_number(lex);
    break;

  case ':':
    result = smt2_read_keyword(lex);
    break;

  case '|':
    result = smt2_read_quoted_symbol(lex);
    break;

  default:
    if (issimple(ch)) {
      result = smt2_read_symbol(lex);
    } else {
      // unexpected character: store it in the buffer so that it can be
      // shown in the error message, then skip it
      result = SMT2_TK_ERROR;
      string_buffer_append_char(buf, ch);
      string_buffer_close(buf);
      consume_current = 1;
    }
    break;
  }

  if (consume_current) {
    // move past the last character of the token
    reader_next_char(in);
  }

  lex->token = result;

  return result;
}