PERF_TEST_F(CategoryBig, Incremental)
{
	const char* src = m_contents.c_str();
	const char* src_start = src;
	size_t src_size = m_contents.length();

	while (1)
	{
		size_t f = utf8iscategory(src, src_size, UTF8_CATEGORY_LETTER);
		if (f > 0)
		{
			if (src_size < f)
			{
				break;
			}

			src += f;
			src_size -= f;
		}
		else
		{
			const char* n = utf8seek(src, src_size, src_start, 1, SEEK_CUR);

			size_t d = n - src;
			if (d == 0)
			{
				break;
			}

			src = n;
			src_size -= d;
		}
	}
}
Ejemplo n.º 2
0
	TokenType getNextToken(const util::FastInsertVector<string_view>& lines, size_t* line, size_t* offset, const string_view& whole,
		Location& pos, Token* out, bool crlf)
	{
		bool flag = true;

		if(*line == lines.size())
		{
			out->loc = pos;
			out->type = TokenType::EndOfFile;
			return TokenType::EndOfFile;
		}

		string_view stream = lines[*line].substr(*offset);
		if(stream.empty())
		{
			out->loc = pos;
			out->type = TokenType::EndOfFile;
			return TokenType::EndOfFile;
		}



		size_t read = 0;
		size_t unicodeLength = 0;

		// first eat all whitespace
		skipWhitespace(stream, pos, offset);

		Token& tok = *out;
		tok.loc = pos;
		tok.type = TokenType::Invalid;


		// check compound symbols first.
		if(hasPrefix(stream, "//"))
		{
			tok.type = TokenType::Comment;
			// stream = stream.substr(0, 0);
			(*line)++;
			pos.line++;
			pos.col = 0;


			(*offset) = 0;

			// don't assign lines[line] = stream, since over here we've changed 'line' to be the next one.
			flag = false;
			tok.text = "";
		}
		else if(hasPrefix(stream, "=="))
		{
			tok.type = TokenType::EqualsTo;
			tok.text = "==";
			read = 2;
		}
		else if(hasPrefix(stream, ">="))
		{
			tok.type = TokenType::GreaterEquals;
			tok.text = ">=";
			read = 2;
		}
		else if(hasPrefix(stream, "<="))
		{
			tok.type = TokenType::LessThanEquals;
			tok.text = "<=";
			read = 2;
		}
		else if(hasPrefix(stream, "!="))
		{
			tok.type = TokenType::NotEquals;
			tok.text = "!=";
			read = 2;
		}
		else if(hasPrefix(stream, "||"))
		{
			tok.type = TokenType::LogicalOr;
			tok.text = "||";
			read = 2;
		}
		else if(hasPrefix(stream, "&&"))
		{
			tok.type = TokenType::LogicalAnd;
			tok.text = "&&";
			read = 2;
		}
		else if(hasPrefix(stream, "<-"))
		{
			tok.type = TokenType::LeftArrow;
			tok.text = "<-";
			read = 2;
		}
		else if(hasPrefix(stream, "->"))
		{
			tok.type = TokenType::RightArrow;
			tok.text = "->";
			read = 2;
		}
		else if(hasPrefix(stream, "<="))
		{
			tok.type = TokenType::FatLeftArrow;
			tok.text = "<=";
			read = 2;
		}
		else if(hasPrefix(stream, "=>"))
		{
			tok.type = TokenType::FatRightArrow;
			tok.text = "=>";
			read = 2;
		}
		else if(hasPrefix(stream, "++"))
		{
			tok.type = TokenType::DoublePlus;
			tok.text = "++";
			read = 2;
		}
		else if(hasPrefix(stream, "--"))
		{
			tok.type = TokenType::DoubleMinus;
			tok.text = "--";
			read = 2;
		}
		else if(hasPrefix(stream, "+="))
		{
			tok.type = TokenType::PlusEq;
			tok.text = "+=";
			read = 2;
		}
		else if(hasPrefix(stream, "-="))
		{
			tok.type = TokenType::MinusEq;
			tok.text = "-=";
			read = 2;
		}
		else if(hasPrefix(stream, "*="))
		{
			tok.type = TokenType::MultiplyEq;
			tok.text = "*=";
			read = 2;
		}
		else if(hasPrefix(stream, "/="))
		{
			tok.type = TokenType::DivideEq;
			tok.text = "/=";
			read = 2;
		}
		else if(hasPrefix(stream, "%="))
		{
			tok.type = TokenType::ModEq;
			tok.text = "%=";
			read = 2;
		}
		else if(hasPrefix(stream, "&="))
		{
			tok.type = TokenType::AmpersandEq;
			tok.text = "&=";
			read = 2;
		}
		else if(hasPrefix(stream, "|="))
		{
			tok.type = TokenType::PipeEq;
			tok.text = "|=";
			read = 2;
		}
		else if(hasPrefix(stream, "^="))
		{
			tok.type = TokenType::CaretEq;
			tok.text = "^=";
			read = 2;
		}
		else if(hasPrefix(stream, "::"))
		{
			tok.type = TokenType::DoubleColon;
			tok.text = "::";
			read = 2;
		}
		else if(hasPrefix(stream, "..."))
		{
			tok.type = TokenType::Ellipsis;
			tok.text = "...";
			read = 3;
		}
		else if(hasPrefix(stream, "..<"))
		{
			tok.type = TokenType::HalfOpenEllipsis;
			tok.text = "..<";
			read = 3;
		}
		else if(hasPrefix(stream, "/*"))
		{
			int currentNest = 1;

			// support nested, so basically we have to loop until we find either a /* or a */
			stream.remove_prefix(2);
			(*offset) += 2;
			pos.col += 2;


			Location opening = pos;
			Location curpos = pos;

			size_t k = 0;
			while(currentNest > 0)
			{
				// we can do this, because we know the closing token (*/) is 2 chars long
				// so if we have 1 char left, gg.
				if(k + 1 == stream.size() || stream[k] == '\n')
				{
					if(*line + 1 == lines.size())
						error(opening, "expected closing */ (reached EOF), for block comment started here:");

					// else, get the next line.
					// also note: if we're in this loop, we're inside a block comment.
					// since the ending token cannot be split across lines, we know that this last char
					// must also be part of the comment. hence, just skip over it.

					k = 0;
					curpos.line++;
					curpos.col = 0;
					(*offset) = 0;
					(*line)++;

					stream = lines[*line];
					continue;
				}


				if(stream[k] == '/' && stream[k + 1] == '*')
					currentNest++, k++, curpos.col++, opening = curpos;

				else if(stream[k] == '*' && stream[k + 1] == '/')
					currentNest--, k++, curpos.col++;

				k++;
				curpos.col++;
			}


			if(currentNest != 0)
				error(opening, "expected closing */ (reached EOF), for block comment started here:");

			pos = curpos;

			// don't actually store the text, because it's pointless and memory-wasting
			// tok.text = "/* I used to be a comment like you, until I took a memory-leak to the knee. */";
			tok.type = TokenType::Comment;
			tok.text = "";
			read = k;
		}
		else if(hasPrefix(stream, "*/"))
		{
			unexpected(tok.loc, "'*/'");
		}

		// attrs
		else if(hasPrefix(stream, "@nomangle"))
		{
			tok.type = TokenType::Attr_NoMangle;
			tok.text = "@nomangle";
			read = 9;
		}
		else if(hasPrefix(stream, "@entry"))
		{
			tok.type = TokenType::Attr_EntryFn;
			tok.text = "@entry";
			read = 6;
		}
		else if(hasPrefix(stream, "@raw"))
		{
			tok.type = TokenType::Attr_Raw;
			tok.text = "@raw";
			read = 4;
		}
		else if(hasPrefix(stream, "@operator"))
		{
			tok.type = TokenType::Attr_Operator;
			tok.text = "@operator";
			read = 9;
		}



		// unicode stuff
		else if(hasPrefix(stream, "ƒ"))
		{
			tok.type = TokenType::Func;
			read = std::string("ƒ").length();
			tok.text = "ƒ";

			unicodeLength = 1;
		}
		else if(hasPrefix(stream, "fi"))
		{
			tok.type = TokenType::ForeignFunc;
			read = std::string("fi").length();
			tok.text = "fi";

			unicodeLength = 1;
		}
		else if(hasPrefix(stream, "÷"))
		{
			tok.type = TokenType::Divide;
			read = std::string("÷").length();
			tok.text = "÷";

			unicodeLength = 1;
		}
		else if(hasPrefix(stream, "≠"))
		{
			tok.type = TokenType::NotEquals;
			read = std::string("≠").length();
			tok.text = "≠";

			unicodeLength = 1;
		}
		else if(hasPrefix(stream, "≤"))
		{
			tok.type = TokenType::LessThanEquals;
			read = std::string("≤").length();
			tok.text = "≤";

			unicodeLength = 1;
		}
		else if(hasPrefix(stream, "≥"))
		{
			tok.type = TokenType::GreaterEquals;
			read = std::string("≥").length();
			tok.text = "≥";

			unicodeLength = 1;
		}

		// note some special-casing is needed to differentiate between unary +/- and binary +/-
		// cases where we want binary:
		// ...) + 3
		// ...] + 3
		// ident + 3
		// number + 3
		// string + 3
		// so in every other case we want unary +/-.
		// note: this dumb '<=255' thing is because windows likes to assert useless things.
		else if((!stream.empty() && ((stream[0] >= 1 && (int) stream[0] <= 255 && isdigit(stream[0])) || shouldConsiderUnaryLiteral(stream, pos)))
			/* handle cases like '+ 3' or '- 14' (ie. space between sign and number) */
			&& ((isdigit(stream[0]) ? true : false) || (stream.size() > 1 && isdigit(stream[1]))))
		{
			// copy it.
			auto tmp = stream;

			if(tmp.find('-') == 0 || tmp.find('+') == 0)
				tmp.remove_prefix(1);

			int base = 10;
			if(tmp.find("0x") == 0 || tmp.find("0X") == 0)
				base = 16, tmp.remove_prefix(2);

			else if(tmp.find("0b") == 0 || tmp.find("0B") == 0)
				base = 2, tmp.remove_prefix(2);


			// find that shit
			auto end = std::find_if_not(tmp.begin(), tmp.end(), [base](const char& c) -> bool {
				if(base == 10)	return isdigit(c);
				if(base == 16)	return isdigit(c) || (toupper(c) >= 'A' && toupper(c) <= 'F');
				else			return (c == '0' || c == '1');
			});

			tmp.remove_prefix((end - tmp.begin()));

			// check if we have 'e' or 'E'
			bool hadExp = false;
			if(tmp.size() > 0 && (tmp[0] == 'e' || tmp[0] == 'E'))
			{
				if(base != 10)
					error("exponential form is supported with neither hexadecimal nor binary literals");

				// find that shit
				auto next = std::find_if_not(tmp.begin() + 1, tmp.end(), isdigit);

				// this does the 'e' as well.
				tmp.remove_prefix(next - tmp.begin());

				hadExp = true;
			}

			size_t didRead = stream.size() - tmp.size();
			auto post = stream.substr(didRead);

			if(!post.empty() && post[0] == '.')
			{
				if(base != 10)
					error("invalid floating point literal; only valid in base 10");

				else if(hadExp)
					error("invalid floating point literal; decimal point cannot occur after the exponent ('e' or 'E').");

				// if the previous token was a '.' as well, then we're doing some tuple access
				// eg. x.0.1 (we would be at '0', having a period both ahead and behind us)

				// if the next token is not a number, then same thing, eg.
				// x.0.z, where the first tuple element of 'x' is a struct or something.

				// so -- lex a floating point *iff* the previous token was not '.', and the next token is a digit.
				if(prevType != TokenType::Period && post.size() > 1 && isdigit(post[1]))
				{
					// yes, parse a floating point
					post.remove_prefix(1), didRead++;

					while(post.size() > 0 && isdigit(post.front()))
						post.remove_prefix(1), didRead++;

					// ok.
				}
				else
				{
					// no, just return the integer token.
					// (which we do below, so just do nothing here)
				}
			}

			tok.text = stream.substr(0, didRead);

			tok.type = TokenType::Number;
			tok.loc.len = didRead;

			read = didRead;
		}
		else if(!stream.empty() && (stream[0] == '_'  || utf8iscategory(stream.data(), stream.size(), UTF8_CATEGORY_LETTER) > 0))
		{
			// get as many letters as possible first
			size_t identLength = utf8iscategory(stream.data(), stream.size(),
				UTF8_CATEGORY_LETTER | UTF8_CATEGORY_PUNCTUATION_CONNECTOR | UTF8_CATEGORY_NUMBER);

			read = identLength;
			tok.text = stream.substr(0, identLength);

			initKeywordMap();
			if(auto it = keywordMap.find(tok.text); it != keywordMap.end())
				tok.type = it->second;

			else
PERF_TEST_F(CategoryBig, All)
{
	static const size_t flags = 0xFFFFFFFF & (~UTF8_CATEGORY_COMPATIBILITY);

	utf8iscategory(m_contents.c_str(), m_contents.length(), flags);
}