/// Constructs a lexer for `filename`.
///
/// Calls initKeywordMap(), readFile(filename) and tokenize() in that order,
/// then positions `currentToken` on the first token in `tokenList` that is
/// not a newline.
///
/// If `tokenList` is empty, or holds nothing but newline tokens,
/// `currentToken` is left equal to `tokenList.end()`; callers must not
/// dereference it in that case.
///
/// @param filename  Passed to readFile() and used to initialise `location`
///                  and `start`.
Lexer::Lexer(const std::string& filename)
    : tokenList(),
      state(Idle),
      currentToken(tokenList.end()),
      storedPosition(tokenList.end()),
      keywordMap(),
      location(filename),
      start(filename),
      eof(nullptr)
{
    initKeywordMap();
    readFile(filename);
    tokenize();

    // Skip leading newline tokens. The end() comparison must come first:
    // dereferencing a past-the-end iterator is undefined behaviour, and
    // nothing visible here guarantees that tokenize() produced at least one
    // non-newline token.
    currentToken = tokenList.begin();
    while (currentToken != tokenList.end() && currentToken->isNewline())
    {
        ++currentToken;
    }
}
TokenType getNextToken(const util::FastInsertVector<string_view>& lines, size_t* line, size_t* offset, const string_view& whole, Location& pos, Token* out, bool crlf) { bool flag = true; if(*line == lines.size()) { out->loc = pos; out->type = TokenType::EndOfFile; return TokenType::EndOfFile; } string_view stream = lines[*line].substr(*offset); if(stream.empty()) { out->loc = pos; out->type = TokenType::EndOfFile; return TokenType::EndOfFile; } size_t read = 0; size_t unicodeLength = 0; // first eat all whitespace skipWhitespace(stream, pos, offset); Token& tok = *out; tok.loc = pos; tok.type = TokenType::Invalid; // check compound symbols first. if(hasPrefix(stream, "//")) { tok.type = TokenType::Comment; // stream = stream.substr(0, 0); (*line)++; pos.line++; pos.col = 0; (*offset) = 0; // don't assign lines[line] = stream, since over here we've changed 'line' to be the next one. flag = false; tok.text = ""; } else if(hasPrefix(stream, "==")) { tok.type = TokenType::EqualsTo; tok.text = "=="; read = 2; } else if(hasPrefix(stream, ">=")) { tok.type = TokenType::GreaterEquals; tok.text = ">="; read = 2; } else if(hasPrefix(stream, "<=")) { tok.type = TokenType::LessThanEquals; tok.text = "<="; read = 2; } else if(hasPrefix(stream, "!=")) { tok.type = TokenType::NotEquals; tok.text = "!="; read = 2; } else if(hasPrefix(stream, "||")) { tok.type = TokenType::LogicalOr; tok.text = "||"; read = 2; } else if(hasPrefix(stream, "&&")) { tok.type = TokenType::LogicalAnd; tok.text = "&&"; read = 2; } else if(hasPrefix(stream, "<-")) { tok.type = TokenType::LeftArrow; tok.text = "<-"; read = 2; } else if(hasPrefix(stream, "->")) { tok.type = TokenType::RightArrow; tok.text = "->"; read = 2; } else if(hasPrefix(stream, "<=")) { tok.type = TokenType::FatLeftArrow; tok.text = "<="; read = 2; } else if(hasPrefix(stream, "=>")) { tok.type = TokenType::FatRightArrow; tok.text = "=>"; read = 2; } else if(hasPrefix(stream, "++")) { tok.type = TokenType::DoublePlus; tok.text = "++"; 
read = 2; } else if(hasPrefix(stream, "--")) { tok.type = TokenType::DoubleMinus; tok.text = "--"; read = 2; } else if(hasPrefix(stream, "+=")) { tok.type = TokenType::PlusEq; tok.text = "+="; read = 2; } else if(hasPrefix(stream, "-=")) { tok.type = TokenType::MinusEq; tok.text = "-="; read = 2; } else if(hasPrefix(stream, "*=")) { tok.type = TokenType::MultiplyEq; tok.text = "*="; read = 2; } else if(hasPrefix(stream, "/=")) { tok.type = TokenType::DivideEq; tok.text = "/="; read = 2; } else if(hasPrefix(stream, "%=")) { tok.type = TokenType::ModEq; tok.text = "%="; read = 2; } else if(hasPrefix(stream, "&=")) { tok.type = TokenType::AmpersandEq; tok.text = "&="; read = 2; } else if(hasPrefix(stream, "|=")) { tok.type = TokenType::PipeEq; tok.text = "|="; read = 2; } else if(hasPrefix(stream, "^=")) { tok.type = TokenType::CaretEq; tok.text = "^="; read = 2; } else if(hasPrefix(stream, "::")) { tok.type = TokenType::DoubleColon; tok.text = "::"; read = 2; } else if(hasPrefix(stream, "...")) { tok.type = TokenType::Ellipsis; tok.text = "..."; read = 3; } else if(hasPrefix(stream, "..<")) { tok.type = TokenType::HalfOpenEllipsis; tok.text = "..<"; read = 3; } else if(hasPrefix(stream, "/*")) { int currentNest = 1; // support nested, so basically we have to loop until we find either a /* or a */ stream.remove_prefix(2); (*offset) += 2; pos.col += 2; Location opening = pos; Location curpos = pos; size_t k = 0; while(currentNest > 0) { // we can do this, because we know the closing token (*/) is 2 chars long // so if we have 1 char left, gg. if(k + 1 == stream.size() || stream[k] == '\n') { if(*line + 1 == lines.size()) error(opening, "expected closing */ (reached EOF), for block comment started here:"); // else, get the next line. // also note: if we're in this loop, we're inside a block comment. // since the ending token cannot be split across lines, we know that this last char // must also be part of the comment. hence, just skip over it. 
k = 0; curpos.line++; curpos.col = 0; (*offset) = 0; (*line)++; stream = lines[*line]; continue; } if(stream[k] == '/' && stream[k + 1] == '*') currentNest++, k++, curpos.col++, opening = curpos; else if(stream[k] == '*' && stream[k + 1] == '/') currentNest--, k++, curpos.col++; k++; curpos.col++; } if(currentNest != 0) error(opening, "expected closing */ (reached EOF), for block comment started here:"); pos = curpos; // don't actually store the text, because it's pointless and memory-wasting // tok.text = "/* I used to be a comment like you, until I took a memory-leak to the knee. */"; tok.type = TokenType::Comment; tok.text = ""; read = k; } else if(hasPrefix(stream, "*/")) { unexpected(tok.loc, "'*/'"); } // attrs else if(hasPrefix(stream, "@nomangle")) { tok.type = TokenType::Attr_NoMangle; tok.text = "@nomangle"; read = 9; } else if(hasPrefix(stream, "@entry")) { tok.type = TokenType::Attr_EntryFn; tok.text = "@entry"; read = 6; } else if(hasPrefix(stream, "@raw")) { tok.type = TokenType::Attr_Raw; tok.text = "@raw"; read = 4; } else if(hasPrefix(stream, "@operator")) { tok.type = TokenType::Attr_Operator; tok.text = "@operator"; read = 9; } // unicode stuff else if(hasPrefix(stream, "ƒ")) { tok.type = TokenType::Func; read = std::string("ƒ").length(); tok.text = "ƒ"; unicodeLength = 1; } else if(hasPrefix(stream, "fi")) { tok.type = TokenType::ForeignFunc; read = std::string("fi").length(); tok.text = "fi"; unicodeLength = 1; } else if(hasPrefix(stream, "÷")) { tok.type = TokenType::Divide; read = std::string("÷").length(); tok.text = "÷"; unicodeLength = 1; } else if(hasPrefix(stream, "≠")) { tok.type = TokenType::NotEquals; read = std::string("≠").length(); tok.text = "≠"; unicodeLength = 1; } else if(hasPrefix(stream, "≤")) { tok.type = TokenType::LessThanEquals; read = std::string("≤").length(); tok.text = "≤"; unicodeLength = 1; } else if(hasPrefix(stream, "≥")) { tok.type = TokenType::GreaterEquals; read = std::string("≥").length(); tok.text = "≥"; 
unicodeLength = 1; } // note some special-casing is needed to differentiate between unary +/- and binary +/- // cases where we want binary: // ...) + 3 // ...] + 3 // ident + 3 // number + 3 // string + 3 // so in every other case we want unary +/-. // note: this dumb '<=255' thing is because windows likes to assert useless things. else if((!stream.empty() && ((stream[0] >= 1 && (int) stream[0] <= 255 && isdigit(stream[0])) || shouldConsiderUnaryLiteral(stream, pos))) /* handle cases like '+ 3' or '- 14' (ie. space between sign and number) */ && ((isdigit(stream[0]) ? true : false) || (stream.size() > 1 && isdigit(stream[1])))) { // copy it. auto tmp = stream; if(tmp.find('-') == 0 || tmp.find('+') == 0) tmp.remove_prefix(1); int base = 10; if(tmp.find("0x") == 0 || tmp.find("0X") == 0) base = 16, tmp.remove_prefix(2); else if(tmp.find("0b") == 0 || tmp.find("0B") == 0) base = 2, tmp.remove_prefix(2); // find that shit auto end = std::find_if_not(tmp.begin(), tmp.end(), [base](const char& c) -> bool { if(base == 10) return isdigit(c); if(base == 16) return isdigit(c) || (toupper(c) >= 'A' && toupper(c) <= 'F'); else return (c == '0' || c == '1'); }); tmp.remove_prefix((end - tmp.begin())); // check if we have 'e' or 'E' bool hadExp = false; if(tmp.size() > 0 && (tmp[0] == 'e' || tmp[0] == 'E')) { if(base != 10) error("exponential form is supported with neither hexadecimal nor binary literals"); // find that shit auto next = std::find_if_not(tmp.begin() + 1, tmp.end(), isdigit); // this does the 'e' as well. tmp.remove_prefix(next - tmp.begin()); hadExp = true; } size_t didRead = stream.size() - tmp.size(); auto post = stream.substr(didRead); if(!post.empty() && post[0] == '.') { if(base != 10) error("invalid floating point literal; only valid in base 10"); else if(hadExp) error("invalid floating point literal; decimal point cannot occur after the exponent ('e' or 'E')."); // if the previous token was a '.' as well, then we're doing some tuple access // eg. 
x.0.1 (we would be at '0', having a period both ahead and behind us) // if the next token is not a number, then same thing, eg. // x.0.z, where the first tuple element of 'x' is a struct or something. // so -- lex a floating point *iff* the previous token was not '.', and the next token is a digit. if(prevType != TokenType::Period && post.size() > 1 && isdigit(post[1])) { // yes, parse a floating point post.remove_prefix(1), didRead++; while(post.size() > 0 && isdigit(post.front())) post.remove_prefix(1), didRead++; // ok. } else { // no, just return the integer token. // (which we do below, so just do nothing here) } } tok.text = stream.substr(0, didRead); tok.type = TokenType::Number; tok.loc.len = didRead; read = didRead; } else if(!stream.empty() && (stream[0] == '_' || utf8iscategory(stream.data(), stream.size(), UTF8_CATEGORY_LETTER) > 0)) { // get as many letters as possible first size_t identLength = utf8iscategory(stream.data(), stream.size(), UTF8_CATEGORY_LETTER | UTF8_CATEGORY_PUNCTUATION_CONNECTOR | UTF8_CATEGORY_NUMBER); read = identLength; tok.text = stream.substr(0, identLength); initKeywordMap(); if(auto it = keywordMap.find(tok.text); it != keywordMap.end()) tok.type = it->second; else