std::list<Token> jsonnet_lex(const std::string &filename, const char *input) { unsigned long line_number = 1; const char *line_start = input; std::list<Token> r; const char *c = input; for ( ; *c!='\0' ; ++c) { Location begin(line_number, c - line_start + 1); Token::Kind kind; std::string data; switch (*c) { // Skip non-\n whitespace case ' ': case '\t': case '\r': continue; // Skip \n and maintain line numbers case '\n': line_number++; line_start = c+1; continue; case '{': kind = Token::BRACE_L; break; case '}': kind = Token::BRACE_R; break; case '[': kind = Token::BRACKET_L; break; case ']': kind = Token::BRACKET_R; break; case ':': kind = Token::COLON; break; case ',': kind = Token::COMMA; break; case '$': kind = Token::DOLLAR; break; case '.': kind = Token::DOT; break; case '(': kind = Token::PAREN_L; break; case ')': kind = Token::PAREN_R; break; case ';': kind = Token::SEMICOLON; break; // Special cases for unary operators. case '!': kind = Token::OPERATOR; if (*(c+1) == '=') { c++; data = "!="; } else { data = "!"; } break; case '~': kind = Token::OPERATOR; data = "~"; break; case '+': kind = Token::OPERATOR; data = "+"; break; case '-': kind = Token::OPERATOR; data = "-"; break; // Numeric literals. case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': kind = Token::NUMBER; data = lex_number(c, filename, begin); break; // String literals. case '"': { c++; for (; ; ++c) { if (*c == '\0') { throw StaticError(filename, begin, "Unterminated string"); } if (*c == '"') { break; } switch (*c) { case '\\': switch (*(++c)) { case '"': data += *c; break; case '\\': data += *c; break; case '/': data += *c; break; case 'b': data += '\b'; break; case 'f': data += '\f'; break; case 'n': data += '\n'; break; case 'r': data += '\r'; break; case 't': data += '\t'; break; case 'u': { ++c; // Consume the 'u'. unsigned long codepoint = 0; // Expect 4 hex digits. 
for (unsigned i=0 ; i<4 ; ++i) { auto x = (unsigned char)(c[i]); unsigned digit; if (x == '\0') { auto msg = "Unterminated string"; throw StaticError(filename, begin, msg); } else if (x == '"') { auto msg = "Truncated unicode escape sequence in " "string literal."; throw StaticError(filename, begin, msg); } else if (x >= '0' && x <= '9') { digit = x - '0'; } else if (x >= 'a' && x <= 'f') { digit = x - 'a' + 10; } else if (x >= 'A' && x <= 'F') { digit = x - 'A' + 10; } else { std::stringstream ss; ss << "Malformed unicode escape character, " << "should be hex: '" << x << "'"; throw StaticError(filename, begin, ss.str()); } codepoint *= 16; codepoint += digit; } encode_utf8(codepoint, data); // Leave us on the last char, ready for the ++c at // the outer for loop. c += 3; } break; case '\0': { auto msg = "Truncated escape sequence in string literal."; throw StaticError(filename, begin, msg); } default: { std::stringstream ss; ss << "Unknown escape sequence in string literal: '" << *c << "'"; throw StaticError(filename, begin, ss.str()); } } break; // Treat as a regular letter, but maintain line/column counters. case '\n': line_number++; line_start = c+1; data += *c; break; default: // Just a regular letter. 
data += *c; } } kind = Token::STRING; } break; // Keywords default: if (is_identifier_first(*c)) { std::string id; for (; *c != '\0' ; ++c) { if (!is_identifier(*c)) { break; } id += *c; } --c; if (id == "assert") { kind = Token::ASSERT; } else if (id == "else") { kind = Token::ELSE; } else if (id == "error") { kind = Token::ERROR; } else if (id == "false") { kind = Token::FALSE; } else if (id == "for") { kind = Token::FOR; } else if (id == "function") { kind = Token::FUNCTION; } else if (id == "if") { kind = Token::IF; } else if (id == "import") { kind = Token::IMPORT; } else if (id == "importstr") { kind = Token::IMPORTSTR; } else if (id == "in") { kind = Token::IN; } else if (id == "local") { kind = Token::LOCAL; } else if (id == "null") { kind = Token::NULL_LIT; } else if (id == "self") { kind = Token::SELF; } else if (id == "super") { kind = Token::SUPER; } else if (id == "tailstrict") { kind = Token::TAILSTRICT; } else if (id == "then") { kind = Token::THEN; } else if (id == "true") { kind = Token::TRUE; } else { // Not a keyword, must be an identifier. kind = Token::IDENTIFIER; data = id; } } else if (is_symbol(*c)) { // Single line C++ style comment if (*c == '/' && *(c+1) == '/') { while (*c != '\0' && *c != '\n') { ++c; } // Leaving it on the \n allows processing of \n on next iteration, // i.e. managing of the line & column counter. c--; continue; } // Single line # comment if (*c == '#') { while (*c != '\0' && *c != '\n') { ++c; } // Leaving it on the \n allows processing of \n on next iteration, // i.e. managing of the line & column counter. c--; continue; } // Multi-line comment. if (*c == '/' && *(c+1) == '*') { c += 2; // Avoid matching /*/: skip the /* before starting the search for */. while (*c != '\0' && !(*c == '*' && *(c+1) == '/')) { if (*c == '\n') { // Just keep track of the line / column counters. 
line_number++; line_start = c+1; } ++c; } if (*c == '\0') { auto msg = "Multi-line comment has no terminating */."; throw StaticError(filename, begin, msg); } // Leave the counter on the closing /. c++; continue; } // Text block if (*c == '|' && *(c+1) == '|' && *(c+2) == '|' && *(c+3) == '\n') { std::stringstream block; c += 4; // Skip the "|||\n" line_number++; line_start = c; const char *first_line = c; int ws_chars = whitespace_check(first_line, c); if (ws_chars == 0) { auto msg = "Text block's first line must start with whitespace."; throw StaticError(filename, begin, msg); } while (true) { assert(ws_chars > 0); // Read up to the \n for (c = &c[ws_chars]; *c != '\n' ; ++c) { if (*c == '\0') throw StaticError(filename, begin, "Unexpected EOF"); block << *c; } // Add the \n block << '\n'; ++c; line_number++; line_start = c; // Examine next line ws_chars = whitespace_check(first_line, c); if (ws_chars == 0) { // End of text block // Skip over any whitespace while (*c == ' ' || *c == '\t') ++c; // Expect ||| if (!(*c == '|' && *(c+1) == '|' && *(c+2) == '|')) { auto msg = "Text block not terminated with |||"; throw StaticError(filename, begin, msg); } c += 2; // Leave on the last | data = block.str(); kind = Token::STRING; break; } } break; // Out of the switch. } for (; *c != '\0' ; ++c) { if (!is_symbol(*c)) { break; } data += *c; } --c; kind = Token::OPERATOR; } else { std::stringstream ss; ss << "Could not lex the character "; auto uc = (unsigned char)(*c); if (*c < 32) ss << "code " << unsigned(uc); else ss << "'" << *c << "'"; throw StaticError(filename, begin, ss.str()); } break; } Location end(line_number, c - line_start + 1); r.push_back(Token(kind, data, LocationRange(filename, begin, end))); } Location end(line_number, c - line_start + 1); r.push_back(Token(Token::END_OF_FILE, "", LocationRange(filename, end, end))); return r; }
// True when `c` may appear inside an identifier after its first character:
// anything is_identifier_first() accepts, or anything is_number() accepts.
static bool is_identifier(char c)
{
    if (is_identifier_first(c)) {
        return true;
    }
    return is_number(c);
}
// Lex Jsonnet source text into tokens, keeping whitespace and comments as
// "fodder" attached to the token that follows them.
//
// Parameters:
//   filename - name attached to every LocationRange and StaticError produced here.
//   input    - NUL-terminated source text; scanning stops at the first '\0'.
//
// Returns the tokens in source order, always followed by one END_OF_FILE token
// that carries any fodder left over after the last real token.
//
// Throws StaticError (located at the start of the offending token) for
// unterminated strings/comments/text blocks, a "|||" that is not directly
// followed by a newline, and characters that cannot start any token.
//
// Quoted strings are stored raw: a backslash and the character after it are
// copied verbatim into the token data, not decoded.
//
// Cursor convention: every case must leave `c` just PAST the token it consumed
// (the main loop does not advance c itself).
Tokens jsonnet_lex(const std::string &filename, const char *input)
{
    unsigned long line_number = 1;
    const char *line_start = input;

    Tokens r;

    const char *c = input;

    Fodder fodder;
    bool fresh_line = true;  // Are we tokenizing from the beginning of a new line?

    while (*c!='\0') {
        Token::Kind kind;
        std::string data;
        std::string string_block_indent;
        std::string string_block_term_indent;

        // NOTE(review): lex_ws is defined elsewhere; presumably it skips whitespace,
        // reports the newlines crossed and the final indentation, and keeps
        // line_start / line_number up to date - confirm.
        unsigned new_lines, indent;
        lex_ws(c, new_lines, indent, line_start, line_number);

        // If it's the end of the file, discard final whitespace.
        if (*c == '\0')
            break;

        if (new_lines > 0) {
            // Otherwise store whitespace in fodder.
            unsigned blanks = new_lines - 1;
            fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY);
            fresh_line = true;
        }

        Location begin(line_number, c - line_start + 1);

        switch (*c) {

            // The following operators should never be combined with subsequent symbols.
            case '{': kind = Token::BRACE_L; c++; break;
            case '}': kind = Token::BRACE_R; c++; break;
            case '[': kind = Token::BRACKET_L; c++; break;
            case ']': kind = Token::BRACKET_R; c++; break;
            case ',': kind = Token::COMMA; c++; break;
            case '.': kind = Token::DOT; c++; break;
            case '(': kind = Token::PAREN_L; c++; break;
            case ')': kind = Token::PAREN_R; c++; break;
            case ';': kind = Token::SEMICOLON; c++; break;

            // Numeric literals.
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            kind = Token::NUMBER;
            // NOTE(review): lex_number is defined elsewhere; presumably it
            // advances c past the number - confirm.
            data = lex_number(c, filename, begin);
            break;

            // String literals.
            case '"': {
                c++;
                for (; ; ++c) {
                    if (*c == '\0') {
                        throw StaticError(filename, begin, "Unterminated string");
                    }
                    if (*c == '"') {
                        break;
                    }
                    if (*c == '\\' && *(c+1) != '\0') {
                        // Keep the backslash and fall through to copy the escaped
                        // character too, so an escaped quote does not end the string.
                        data += *c;
                        ++c;
                    }
                    if (*c == '\n') {
                        // Maintain line/column counters.
                        line_number++;
                        line_start = c+1;
                    }
                    data += *c;
                }
                c++;  // Advance beyond the ".
                kind = Token::STRING_DOUBLE;
            }
            break;

            // String literals.
            case '\'': {
                c++;
                for (; ; ++c) {
                    if (*c == '\0') {
                        throw StaticError(filename, begin, "Unterminated string");
                    }
                    if (*c == '\'') {
                        break;
                    }
                    if (*c == '\\' && *(c+1) != '\0') {
                        // Same raw-escape handling as for double-quoted strings.
                        data += *c;
                        ++c;
                    }
                    if (*c == '\n') {
                        // Maintain line/column counters.
                        line_number++;
                        line_start = c+1;
                    }
                    data += *c;
                }
                c++;  // Advance beyond the '.
                kind = Token::STRING_SINGLE;
            }
            break;

            // Keywords
            default:
            if (is_identifier_first(*c)) {
                std::string id;
                for (; is_identifier(*c); ++c)
                    id += *c;
                if (id == "assert") {
                    kind = Token::ASSERT;
                } else if (id == "else") {
                    kind = Token::ELSE;
                } else if (id == "error") {
                    kind = Token::ERROR;
                } else if (id == "false") {
                    kind = Token::FALSE;
                } else if (id == "for") {
                    kind = Token::FOR;
                } else if (id == "function") {
                    kind = Token::FUNCTION;
                } else if (id == "if") {
                    kind = Token::IF;
                } else if (id == "import") {
                    kind = Token::IMPORT;
                } else if (id == "importstr") {
                    kind = Token::IMPORTSTR;
                } else if (id == "in") {
                    kind = Token::IN;
                } else if (id == "local") {
                    kind = Token::LOCAL;
                } else if (id == "null") {
                    kind = Token::NULL_LIT;
                } else if (id == "self") {
                    kind = Token::SELF;
                } else if (id == "super") {
                    kind = Token::SUPER;
                } else if (id == "tailstrict") {
                    kind = Token::TAILSTRICT;
                } else if (id == "then") {
                    kind = Token::THEN;
                } else if (id == "true") {
                    kind = Token::TRUE;
                } else {
                    // Not a keyword, must be an identifier.
                    kind = Token::IDENTIFIER;
                }
                // Keywords keep their spelling in data as well.
                data = id;
            } else if (is_symbol(*c) || *c == '#') {

                // Single line C++ and Python style comments.
                if (*c == '#' || (*c == '/' && *(c+1) == '/')) {
                    std::vector<std::string> comment(1);
                    unsigned blanks;
                    unsigned comment_indent;
                    lex_until_newline(c, comment[0], blanks, comment_indent, line_start, line_number);
                    // A comment alone on its line is a paragraph; one trailing
                    // other content is recorded as that line's end.
                    auto comment_kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END;
                    fodder.emplace_back(comment_kind, blanks, comment_indent, comment);
                    fresh_line = true;
                    continue;  // We've not got a token, just fodder, so keep scanning.
                }

                // Multi-line C style comment.
                if (*c == '/' && *(c+1) == '*') {
                    unsigned margin = c - line_start;

                    const char *initial_c = c;
                    c += 2;  // Avoid matching /*/: skip the /* before starting the search for */.

                    while (!(*c == '*' && *(c+1) == '/')) {
                        if (*c == '\0') {
                            auto msg = "Multi-line comment has no terminating */.";
                            throw StaticError(filename, begin, msg);
                        }
                        if (*c == '\n') {
                            // Just keep track of the line / column counters.
                            line_number++;
                            line_start = c+1;
                        }
                        ++c;
                    }
                    c += 2;  // Move the pointer to the char after the closing '/'.

                    std::string comment(initial_c, c - initial_c);  // Includes the "/*" and "*/".

                    // Lex whitespace after comment
                    unsigned new_lines_after, indent_after;
                    lex_ws(c, new_lines_after, indent_after, line_start, line_number);
                    std::vector<std::string> lines;
                    if (comment.find('\n') == std::string::npos) {
                        // Comment looks like /* foo */
                        lines.push_back(comment);
                        fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines);
                        if (new_lines_after > 0) {
                            fodder.emplace_back(FodderElement::LINE_END, new_lines_after - 1,
                                                indent_after, EMPTY);
                            fresh_line = true;
                        }
                    } else {
                        lines = line_split(comment, margin);
                        assert(lines[0][0] == '/');
                        // Little hack to support PARAGRAPHs with * down the LHS:
                        // Add a space to lines that start with a '*'
                        // NOTE(review): lines[0] is asserted above to start with '/',
                        // so this scan appears to always clear all_star and the
                        // padding below never runs.  Left as-is to preserve current
                        // output; confirm whether the scan should skip lines[0].
                        bool all_star = true;
                        for (auto &l : lines) {
                            if (l[0] != '*')
                                all_star = false;
                        }
                        if (all_star) {
                            for (auto &l : lines) {
                                if (l[0] == '*')
                                    l = " " + l;
                            }
                        }
                        if (new_lines_after == 0) {
                            // Ensure a line end after the paragraph.
                            new_lines_after = 1;
                            indent_after = 0;
                        }
                        if (!fresh_line)
                            // Ensure a line end before the comment.
                            fodder.emplace_back(FodderElement::LINE_END, 0, 0, EMPTY);
                        fodder.emplace_back(FodderElement::PARAGRAPH, new_lines_after - 1,
                                            indent_after, lines);
                        fresh_line = true;
                    }
                    continue;  // We've not got a token, just fodder, so keep scanning.
                }

                // Text block
                if (*c == '|' && *(c+1) == '|' && *(c+2) == '|' && *(c+3) == '\n') {
                    std::stringstream block;
                    c += 4;  // Skip the "|||\n"
                    line_number++;
                    // Skip any blank lines at the beginning of the block.
                    while (*c == '\n') {
                        line_number++;
                        ++c;
                        block << '\n';
                    }
                    line_start = c;
                    const char *first_line = c;
                    // NOTE(review): whitespace_check is defined elsewhere; presumably it
                    // returns how many leading whitespace characters of the line at c
                    // match the indentation of first_line (0 when they do not) - confirm.
                    int ws_chars = whitespace_check(first_line, c);
                    string_block_indent = std::string(first_line, ws_chars);
                    if (ws_chars == 0) {
                        auto msg = "Text block's first line must start with whitespace.";
                        throw StaticError(filename, begin, msg);
                    }
                    while (true) {
                        assert(ws_chars > 0);
                        // Read up to the \n
                        for (c = &c[ws_chars]; *c != '\n' ; ++c) {
                            if (*c == '\0')
                                throw StaticError(filename, begin, "Unexpected EOF");
                            block << *c;
                        }
                        // Add the \n
                        block << '\n';
                        ++c;
                        line_number++;
                        // Skip any blank lines
                        while (*c == '\n') {
                            line_number++;
                            ++c;
                            block << '\n';
                        }
                        // Record the line start only after the blank lines have been
                        // skipped, so it matches line_number; otherwise columns on the
                        // terminating line would be measured from the first blank line.
                        line_start = c;
                        // Examine next line
                        ws_chars = whitespace_check(first_line, c);
                        if (ws_chars == 0) {
                            // End of text block
                            // Skip over any whitespace
                            while (*c == ' ' || *c == '\t') {
                                string_block_term_indent += *c;
                                ++c;
                            }
                            // Expect |||
                            if (!(*c == '|' && *(c+1) == '|' && *(c+2) == '|')) {
                                auto msg = "Text block not terminated with |||";
                                throw StaticError(filename, begin, msg);
                            }
                            c += 3;  // Leave after the last |
                            data = block.str();
                            kind = Token::STRING_BLOCK;
                            break;  // Out of the while loop.
                        }
                    }

                    break;  // Out of the switch.
                }

                const char *operator_begin = c;
                for (; is_symbol(*c) ; ++c) {
                    // Not allowed // in operators
                    if (*c == '/' && *(c+1) == '/') break;
                    // Not allowed /* in operators
                    if (*c == '/' && *(c+1) == '*') break;
                    // Not allowed ||| in operators
                    if (*c == '|' && *(c+1) == '|' && *(c+2) == '|') break;
                }
                if (c == operator_begin) {
                    // Nothing was consumed.  Both comment openers were handled above,
                    // so this is a "|||" that is not directly followed by '\n'.
                    // Emitting an empty operator here would leave c unchanged and
                    // make the main loop spin forever, so report it instead.
                    auto msg = "Text block syntax requires new line after |||.";
                    throw StaticError(filename, begin, msg);
                }
                // Not allowed to end with a + - ~ ! unless a single char.
                // So, wind it back if we need to (but not too far).
                while (c > operator_begin + 1
                       && (*(c-1) == '+' || *(c-1) == '-' || *(c-1) == '~' || *(c-1) == '!')) {
                    c--;
                }
                data += std::string(operator_begin, c);
                if (data == "$") {
                    kind = Token::DOLLAR;
                    data = "";
                } else {
                    kind = Token::OPERATOR;
                }
            } else {
                std::stringstream ss;
                ss << "Could not lex the character ";
                const auto uc = static_cast<unsigned char>(*c);
                // Decide on the unsigned value: whether plain char is signed is
                // implementation-defined, so testing *c directly would report bytes
                // >= 0x80 differently from one platform to the next.  This form
                // reproduces the signed-char behaviour everywhere.
                if (uc < 32 || uc >= 128)
                    ss << "code " << unsigned(uc);
                else
                    ss << "'" << *c << "'";
                throw StaticError(filename, begin, ss.str());
            }
        }

        // c is one past the token, so c - line_start is the 1-based column of
        // the token's last character.
        Location end(line_number, c - line_start);
        r.emplace_back(kind, fodder, data, string_block_indent, string_block_term_indent,
                       LocationRange(filename, begin, end));
        fodder.clear();
        fresh_line = false;
    }

    Location end(line_number, c - line_start + 1);
    r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, end, end));
    return r;
}