/**
 * Advances the lexer to the next character of its input.
 *
 * Reports an error and leaves the lexer untouched when pos is already past
 * inputLength. Does nothing once the current character marks the end of
 * input. Stepping over a '\n' increments lineNumber and restarts charNumber.
 *
 * @param self the lexer instance to advance
 */
void consumeCharacter(Lexer *self) {
    if (self->pos > (int) self->inputLength) {
        errorMessage("Reached end of input, pos(%d) ... len(%d)", self->pos, self->inputLength);
        return;
    }

    // nothing left to consume once the end of the input has been reached
    if (isEndOfInput(self->currentChar)) {
        return;
    }

    if (self->currentChar == '\n') {
        // charNumber is incremented below, so the new line starts at 1
        self->lineNumber += 1;
        self->charNumber = 0;
    }

    self->pos += 1;
    self->currentChar = self->input[self->pos];
    self->charNumber += 1;
}
/**
 * Lexes a double-quoted string literal and pushes a TOKEN_STRING.
 *
 * Everything between the opening quote and the next character accepted by
 * isString() is consumed verbatim. If the input ends first, an
 * "Unterminated string literal" error is reported at the position recorded
 * just after the opening quote was expected, and no token is pushed.
 *
 * @param self the lexer instance; currentChar is expected to be '"'
 */
void recognizeStringToken(Lexer *self) {
    expectCharacter(self, '"');

    // remember where the literal started so the diagnostic points there
    int errpos = self->charNumber;
    int errline = self->lineNumber;

    // just consume everything up to the closing quote
    while (!isString(self->currentChar)) {
        // consumeCharacter() is a no-op at end of input, so bail out here;
        // otherwise this loop could never make progress again
        if (isEndOfInput(self->currentChar)) {
            errorMessageWithPosition(self->fileName, errline, errpos, "Unterminated string literal");
            return;
        }
        consumeCharacter(self);
    }

    expectCharacter(self, '"');
    pushToken(self, TOKEN_STRING);
}
/**
 * Lexes a single-quoted character literal and pushes a TOKEN_CHARACTER.
 *
 * An immediately closed literal ('') is reported as "Empty character
 * literal" but still produces a token. A quote preceded by an unescaped
 * backslash does not terminate the literal; a backslash that is itself
 * escaped (as in '\\') does not escape the quote that follows it. If the
 * input ends before the closing quote, "Unterminated character literal" is
 * reported at the position recorded just after the opening quote was
 * expected, and no token is pushed.
 *
 * @param self the lexer instance; currentChar is expected to be '\''
 */
void recognizeCharacterToken(Lexer *self) {
    expectCharacter(self, '\'');

    // remember where the literal started so the diagnostic points there
    int errpos = self->charNumber;
    int errline = self->lineNumber;

    if (self->currentChar == '\'') {
        errorMessageWithPosition(self->fileName, self->lineNumber, self->charNumber, "Empty character literal");
    }

    // true when the previous character was a backslash that starts an escape
    bool escaped = false;

    while (escaped || self->currentChar != '\'') {
        // consumeCharacter() is a no-op at end of input, so bail out here;
        // otherwise this loop could never make progress again
        if (isEndOfInput(self->currentChar)) {
            errorMessageWithPosition(self->fileName, errline, errpos, "Unterminated character literal");
            return;
        }

        // a backslash only opens an escape if it is not itself escaped
        escaped = !escaped && self->currentChar == '\\';
        consumeCharacter(self);
    }

    expectCharacter(self, '\'');
    pushToken(self, TOKEN_CHARACTER);
}
/**
 * Produces the next token by dispatching on the current character.
 *
 * Layout and comments are skipped first, then startPos is set to the
 * position where the token begins. The checks run in a fixed order: end of
 * input, number, identifier, string, character, operator, end of line,
 * separator; anything else is handed to the erroneous-token recognizer.
 * Reaching the end of input also clears self->running so lexing stops.
 *
 * @param self the lexer instance
 */
void getNextToken(Lexer *self) {
    self->startPos = 0;
    skipLayoutAndComments(self);
    self->startPos = self->pos;

    // end of input: emit the final token and stop lexing
    if (isEndOfInput(self->currentChar)) {
        recognizeEndOfInputToken(self);
        self->running = false;
        return;
    }

    // number: a digit, or a '.' immediately followed by a digit
    if (isDigit(self->currentChar)
            || (self->currentChar == '.' && isDigit(peekAhead(self, 1)))) {
        recognizeNumberToken(self);
        return;
    }

    // identifier
    if (isLetterOrDigit(self->currentChar) || self->currentChar == '_') {
        recognizeIdentifierToken(self);
        return;
    }

    // string literal
    if (isString(self->currentChar)) {
        recognizeStringToken(self);
        return;
    }

    // character literal
    if (isCharacter(self->currentChar)) {
        recognizeCharacterToken(self);
        return;
    }

    // operator
    if (isOperator(self->currentChar)) {
        recognizeOperatorToken(self);
        return;
    }

    // end of line
    if (isEndOfLine(self->currentChar)) {
        recognizeEndOfLineToken(self);
        return;
    }

    // separator
    if (isSeparator(self->currentChar)) {
        recognizeSeparatorToken(self);
        return;
    }

    // nothing matched: erroneous input
    recognizeErroneousToken(self);
}
/**
 * Skips leading layout and then at most one comment with its trailing layout.
 *
 * Three comment forms are recognised: '#' to the comment closer, a block
 * comment, and '//' to the comment closer.
 *
 * @param self the lexer instance
 * @return true when a complete comment was skipped (the caller may want to
 *         try again); false when no comment was found, or when the input
 *         ended inside a comment. An unterminated block comment is also
 *         reported via errorMessage().
 */
static bool skipLayoutAndCommentsOnce(Lexer *self) {
    while (isLayout(self->currentChar)) {
        consumeCharacter(self);
    }

    // consume a '#' line comment
    if (self->currentChar == '#') {
        consumeCharacter(self);

        while (!isCommentCloser(self->currentChar)) {
            if (isEndOfInput(self->currentChar)) {
                return false;
            }
            consumeCharacter(self);
        }

        while (isLayout(self->currentChar)) {
            consumeCharacter(self);
        }
        return true;
    }

    // consume a block comment and its contents
    if (self->currentChar == '/' && peekAhead(self, 1) == '*') {
        // consume the opening comment symbols
        consumeCharacter(self);
        consumeCharacter(self);

        while (true) {
            if (isEndOfInput(self->currentChar)) {
                errorMessage("Unterminated block comment");
                return false;
            }

            // Test for the closer BEFORE consuming anything, so that the
            // empty comment (opener immediately followed by closer) is
            // recognised instead of having its '*' skipped.
            if (self->currentChar == '*' && peekAhead(self, 1) == '/') {
                // consume the closing comment symbols
                consumeCharacter(self);
                consumeCharacter(self);

                // eat layout stuff like space etc
                while (isLayout(self->currentChar)) {
                    consumeCharacter(self);
                }
                break;
            }

            consumeCharacter(self);
        }
        return true;
    }

    // consume a single line comment
    if (self->currentChar == '/' && peekAhead(self, 1) == '/') {
        consumeCharacter(self); // eat the /
        consumeCharacter(self); // eat the /

        while (!isCommentCloser(self->currentChar)) {
            if (isEndOfInput(self->currentChar)) {
                return false;
            }
            consumeCharacter(self);
        }

        while (isLayout(self->currentChar)) {
            consumeCharacter(self);
        }
        return true;
    }

    return false;
}
int nextLexeme(char *s, int *length, int *type){ char *p = s; if(isEndOfInput(*p)){ *length = 0; *type = LT_INV; return -1; } else if(isNewLine(*p)){ *length = 1; *type = LT_NL; return 0; } else if(isWhiteSpace(*p)){ while(isWhiteSpace(*p)){ p++; } *length = p - s; *type = LT_WS; return 0; } else if(isNumberBegining(*p, *(p+1))){ while(isPPNumber(*p)){ if(isPPNumberExp(*p) && isPPNumberExpSign(*(p+1))){ p += 2; } else { p++; } } *length = p - s; *type = LT_NUM; return 0; } else if(isIdentifierStart(*p)){ while(isIdentifier(*p)){ p++; } *length = p - s; *type = LT_ID; return 0; } else if(*p == '\''){ p++; while(isSourceCharSet(*p) && *p != '\''){ if(!(*p == '\\' && *(p+1) == '\'')){ p++; } else { p += 2; } } if(*p == '\''){ p++; } else { fprintf(stderr, "invalid token: %02x\n", *p); exit(1); } *length = p - s; *type = LT_CC; return 0; } else if(*p == '\"'){ p++; while(isSourceCharSet(*p) && *p != '\"'){ if(!(*p == '\\' && *(p+1) == '\"')){ p++; } else { p += 2; } } if(*p == '\"'){ p++; } else { fprintf(stderr, "invalid string literal token\n"); exit(1); } *length = p - s; *type = LT_STRLIT; return 0; } else if(*p == '('){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ')'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ','){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ';'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '?'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '['){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ']'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '{'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '}'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '~'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '!'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '%'){ p++; if(*p == ':'){ p++; *length = p - s; 
*type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '>'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '&'){ p++; if(*p == '&'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '*'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '+'){ p++; if(*p == '+'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '-'){ p++; if(*p == '-'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '>'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '.' && isDigit){ p++; if(*p == '.' 
&& *(p+1) == '.'){ p += 2; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '/'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == ':'){ p++; if(*p == '>'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '<'){ p++; if(*p == '%'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == ':'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '<'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '='){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '>'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '>'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '^'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '|'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '|'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else { p++; *length = p - s; *type = LT_PUNC; return 0; } return 0; }