/*
 * This is the lexer.  Build a token up in r->buf, null terminate it, and
 * return a pointer to r->buf.  Clients must copy the token out of the
 * buffer before making any more calls to gettoken.
 *
 * Tokens are:
 *   - one of the single characters ( ) ' ` ,
 *   - the two-character sequence ,@
 *   - any other run of characters, ended by whitespace, a parenthesis,
 *     a single quote, or end of input.
 *
 * Whitespace and comments (from ';' to end of line) before a token are
 * skipped.
 *
 * Returns NULL if the character reader reports EOF before the first
 * character of a token has been read.
 */
static char *gettoken(reader *r)
{
    int ch;

    /* If we have a valid lookahead, just return the buffer again. */
    if (r->token_la_valid) {
        r->token_la_valid = 0;
        return r->buf;
    }

    /* Empty the buffer. */
    r->bufused = 0;

    /* Skip whitespace and comments.  Comments begin with a ; and continue
       to the end of the line.  The '\n' that ends a comment is itself
       whitespace, so the outer loop keeps going after a comment. */
    do {
        ch = r->char_reader(r);
        if (ch == EOF)
            return NULL;

        if (ch == ';') {
            do {
                ch = r->char_reader(r);
            } while (ch != '\n' && ch != EOF);

            if (ch == EOF)
                return NULL;
        }
    } while (isspace(ch));

    add_to_buf(r, ch);

    /* If the token begins with one of these characters, then we know we
       are about to return.  (strchr also matches the string's terminating
       NUL, so a NUL input character ends up on this path as well.) */
    if (strchr("()\'`,;", ch)) {
        if (ch == ',') {
            /* Treat ,@ as a single token.  It will be converted into
               unquote_splicing. */
            ch = r->char_reader(r);
            if (ch == '@')
                add_to_buf(r, ch);
            else
                put_back_char(r, ch);
        }
        r->buf[r->bufused] = '\0';
        return r->buf;
    }

    /* Keep reading characters until we find whitespace, a character that
       begins a new token, or the end of the input. */
    for (;;) {
        ch = r->char_reader(r);

        /* EOF ends the token just like a delimiter does.  Returning NULL
           here would silently discard the characters already collected
           (e.g. a final symbol with no trailing newline).  The EOF value
           is pushed back like any other terminator; this assumes
           put_back_char accepts EOF, which the ',' branch above already
           relies on. */
        if (ch == EOF || strchr("()\'", ch) || isspace(ch)) {
            put_back_char(r, ch);
            r->buf[r->bufused] = '\0';
            return r->buf;
        }

        add_to_buf(r, ch);
    }
}
int parse_get_token (char *buf, int limit) { int token_index = 0; int escape_state = FALSE; int quoted_state = FALSE; int comment_state = FALSE; int token_truncated = FALSE; char current_ch, ch; do { current_ch = get_next_char(); if (comment_state) { /* A comment is ended by an EOLN or EOF. If cont_line is not 0, we keep looking for a token (or until a EOF is hit). If it is, we return EOF or EOLN. */ if (current_ch == '\n') { if (cont_line == 0) { first_word = TRUE; return GW_EOLN; } comment_state = FALSE; } else if (current_ch == '\0') { first_word = TRUE; if (cont_line) return GW_ERRCONTINUATION; else return GW_EOF; } } else if (escape_state) { /* An escape character was hit. The next character is copied in (re-escaped) with the exception of the end of line - which is copied unescaped. If a digit follows the escape character there are supposed to be two more, but that error checking is left to the next level of parsing. */ if (current_ch == '\n') { ADD_CHAR (buf, current_ch); } else if (current_ch == 0x7F || current_ch < 0x20) { do ch = get_next_char(); while (ch != '\n' && ch != '\0'); if (ch== '\0') put_back_char (ch); first_word = TRUE; cont_line = 0; return GW_ERRILLEGALCHAR; } else { ADD_CHAR (buf, '\\'); ADD_CHAR (buf, current_ch); } escape_state = FALSE; } else if (quoted_state) { /* In the middle of a quote, escapes matter, and we don't return the enclosing quote marks. 
*/ if (current_ch == '\\') escape_state = TRUE; else if (current_ch == '\n' || current_ch == '\0') { buf[token_index]='\0'; if (cont_line > 0) { while (cont_line > 0) { do ch = get_next_char(); while (ch != '\n' && ch != '\0'); cont_line--; } put_back_char (ch); } first_word = TRUE; cont_line = 0; return GW_ERRDANGLINGQUOTE; } else if (current_ch == '"') { buf[token_index]='\0'; return GW_MAYBE_WORD; } else if (current_ch!='\t' && (current_ch<0x20 || current_ch==0x7F)) { do ch = get_next_char(); while (ch != '\n' && ch != '\0'); first_word = TRUE; cont_line = 0; return GW_ERRILLEGALCHAR; } else { ADD_CHAR (buf, current_ch); } } else if (token_index > 0) { /* In this case, there is a token being built. It is either added to or a separating character ends it and itself may be left (e.g., a ; beginning a comment ends the previous token and is left to signal the start of the comment next time around. */ if (isspace(current_ch)&¤t_ch!='\n'&¤t_ch!='\0') { first_word = FALSE; buf[token_index]='\0'; if (current_ch== '\0') put_back_char (current_ch); return GW_MAYBE_WORD; } else if (current_ch == '\\') { escape_state = TRUE; } else if (SEPARATOR(current_ch)) { first_word = FALSE; put_back_char (current_ch); buf[token_index]='\0'; return GW_MAYBE_WORD; } else if (current_ch < 0x20 || current_ch == 0x7F) { do ch = get_next_char(); while (ch != '\n' && ch != '\0'); if (ch== '\0') put_back_char (ch); first_word = TRUE; cont_line = 0; return GW_ERRILLEGALCHAR; } else ADD_CHAR (buf, current_ch); } else /* The character is potentially the start of a token. Some of these may have been seen by the above code and returned to the input stream. 
*/ switch (current_ch) { case ';': buf[token_index]='\0'; comment_state = TRUE; break; case '(': cont_line++; do ch = get_next_char(); while (ch != '\n' && ch != '\0'); if (ch== '\0') put_back_char (ch); first_word = FALSE; break; case ')': cont_line--; while ((ch = get_next_char()) != '\n') { if (ch == '\0') return GW_EOF; if (ch == ')') { put_back_char (ch); break; } } if (cont_line==0) { first_word = TRUE; return GW_EOLN; } if (cont_line < 0) { /* We've already read to the end of the line */ first_word = TRUE; cont_line = 0; return GW_ERRCONTINUATION; } break; case ' ': case '\t': if (first_word) { ADD_CHAR (buf, current_ch); buf[token_index]='\0'; first_word = FALSE; return GW_MAYBE_WORD; } break; case '\\': escape_state = TRUE; break; case '"': quoted_state = TRUE; break; case '\n': if (cont_line==0) { first_word = TRUE; return GW_EOLN; } break; case '\0': if (cont_line == 0) return GW_EOF; else { cont_line = 0; return GW_ERRCONTINUATION; } default: if (current_ch<0x20 || current_ch==0x7F) { do ch = get_next_char(); while (ch != '\n' && ch != '\0'); if (ch== '\0') put_back_char (ch); first_word = TRUE; cont_line = 0; return GW_ERRILLEGALCHAR; } else { ADD_CHAR (buf, current_ch); } } /* end switch body */ } while (TRUE); /* end the do-while loop */ }