Exemple #1
0
Tokens jsonnet_lex(const std::string &filename, const char *input)
{
    unsigned long line_number = 1;
    const char *line_start = input;

    Tokens r;

    const char *c = input;

    Fodder fodder;
    bool fresh_line = true;  // Are we tokenizing from the beginning of a new line?

    while (*c!='\0') {
        Token::Kind kind;
        std::string data;
        std::string string_block_indent;
        std::string string_block_term_indent;

        unsigned new_lines, indent;
        lex_ws(c, new_lines, indent, line_start, line_number);

        // If it's the end of the file, discard final whitespace.
        if (*c == '\0')
            break;

        if (new_lines > 0) {
            // Otherwise store whitespace in fodder.
            unsigned blanks = new_lines - 1;
            fodder.emplace_back(FodderElement::LINE_END, blanks, indent, EMPTY);
            fresh_line = true;
        }

        Location begin(line_number, c - line_start + 1);

        switch (*c) {

            // The following operators should never be combined with subsequent symbols.
            case '{':
            kind = Token::BRACE_L;
            c++;
            break;

            case '}':
            kind = Token::BRACE_R;
            c++;
            break;

            case '[':
            kind = Token::BRACKET_L;
            c++;
            break;

            case ']':
            kind = Token::BRACKET_R;
            c++;
            break;

            case ',':
            kind = Token::COMMA;
            c++;
            break;

            case '.':
            kind = Token::DOT;
            c++;
            break;

            case '(':
            kind = Token::PAREN_L;
            c++;
            break;

            case ')':
            kind = Token::PAREN_R;
            c++;
            break;

            case ';':
            kind = Token::SEMICOLON;
            c++;
            break;

            // Numeric literals.
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
            kind = Token::NUMBER;
            data = lex_number(c, filename, begin);
            break;

            // String literals.
            case '"': {
                c++;
                for (; ; ++c) {
                    if (*c == '\0') {
                        throw StaticError(filename, begin, "Unterminated string");
                    }
                    if (*c == '"') {
                        break;
                    }
                    if (*c == '\\' && *(c+1) != '\0') {
                        data += *c;
                        ++c;
                    }
                    if (*c == '\n') {
                        // Maintain line/column counters.
                        line_number++;
                        line_start = c+1;
                    }
                    data += *c;
                }
                c++;  // Advance beyond the ".
                kind = Token::STRING_DOUBLE;
            }
            break;

            // String literals.
            case '\'': {
                c++;
                for (; ; ++c) {
                    if (*c == '\0') {
                        throw StaticError(filename, begin, "Unterminated string");
                    }
                    if (*c == '\'') {
                        break;
                    }
                    if (*c == '\\' && *(c+1) != '\0') {
                        data += *c;
                        ++c;
                    }
                    if (*c == '\n') {
                        // Maintain line/column counters.
                        line_number++;
                        line_start = c+1;
                    }
                    data += *c;
                }
                c++;  // Advance beyond the '.
                kind = Token::STRING_SINGLE;
            }
            break;

            // Keywords
            default:
            if (is_identifier_first(*c)) {
                std::string id;
                for (; is_identifier(*c); ++c)
                    id += *c;
                if (id == "assert") {
                    kind = Token::ASSERT;
                } else if (id == "else") {
                    kind = Token::ELSE;
                } else if (id == "error") {
                    kind = Token::ERROR;
                } else if (id == "false") {
                    kind = Token::FALSE;
                } else if (id == "for") {
                    kind = Token::FOR;
                } else if (id == "function") {
                    kind = Token::FUNCTION;
                } else if (id == "if") {
                    kind = Token::IF;
                } else if (id == "import") {
                    kind = Token::IMPORT;
                } else if (id == "importstr") {
                    kind = Token::IMPORTSTR;
                } else if (id == "in") {
                    kind = Token::IN;
                } else if (id == "local") {
                    kind = Token::LOCAL;
                } else if (id == "null") {
                    kind = Token::NULL_LIT;
                } else if (id == "self") {
                    kind = Token::SELF;
                } else if (id == "super") {
                    kind = Token::SUPER;
                } else if (id == "tailstrict") {
                    kind = Token::TAILSTRICT;
                } else if (id == "then") {
                    kind = Token::THEN;
                } else if (id == "true") {
                    kind = Token::TRUE;
                } else {
                    // Not a keyword, must be an identifier.
                    kind = Token::IDENTIFIER;
                }
                data = id;

            } else if (is_symbol(*c) || *c == '#') {

                // Single line C++ and Python style comments.
                if (*c == '#' || (*c == '/' && *(c+1) == '/')) {
                    std::vector<std::string> comment(1);
                    unsigned blanks;
                    unsigned indent;
                    lex_until_newline(c, comment[0], blanks, indent, line_start, line_number);
                    auto kind = fresh_line ? FodderElement::PARAGRAPH : FodderElement::LINE_END;
                    fodder.emplace_back(kind, blanks, indent, comment);
                    fresh_line = true;
                    continue;  // We've not got a token, just fodder, so keep scanning.
                }

                // Multi-line C style comment.
                if (*c == '/' && *(c+1) == '*') {

                    unsigned margin = c - line_start;
 
                    const char *initial_c = c;
                    c += 2;  // Avoid matching /*/: skip the /* before starting the search for */.

                    while (!(*c == '*' && *(c+1) == '/')) {
                        if (*c == '\0') {
                            auto msg = "Multi-line comment has no terminating */.";
                            throw StaticError(filename, begin, msg);
                        }
                        if (*c == '\n') {
                            // Just keep track of the line / column counters.
                            line_number++;
                            line_start = c+1;
                        }
                        ++c;
                    }
                    c += 2;  // Move the pointer to the char after the closing '/'.

                    std::string comment(initial_c, c - initial_c);  // Includes the "/*" and "*/".

                    // Lex whitespace after comment
                    unsigned new_lines_after, indent_after;
                    lex_ws(c, new_lines_after, indent_after, line_start, line_number);
                    std::vector<std::string> lines;
                    if (comment.find('\n') >= comment.length()) {
                        // Comment looks like /* foo */
                        lines.push_back(comment);
                        fodder.emplace_back(FodderElement::INTERSTITIAL, 0, 0, lines);
                        if (new_lines_after > 0) {
                            fodder.emplace_back(FodderElement::LINE_END, new_lines_after - 1,
                                                indent_after, EMPTY);
                            fresh_line = true;
                        }
                    } else {
                        lines = line_split(comment, margin);
                        assert(lines[0][0] == '/');
                        // Little hack to support PARAGRAPHs with * down the LHS:
                        // Add a space to lines that start with a '*'
                        bool all_star = true;
                        for (auto &l : lines) {
                            if (l[0] != '*')
                                all_star = false;
                        }
                        if (all_star) {
                            for (auto &l : lines) {
                                if (l[0] == '*') l = " " + l;
                            }
                        }
                        if (new_lines_after == 0) {
                            // Ensure a line end after the paragraph.
                            new_lines_after = 1;
                            indent_after = 0;
                        }
                        if (!fresh_line)
                            // Ensure a line end before the comment.
                            fodder.emplace_back(FodderElement::LINE_END, 0, 0, EMPTY);
                        fodder.emplace_back(FodderElement::PARAGRAPH, new_lines_after - 1,
                                            indent_after, lines);
                        fresh_line = true;
                    }
                    continue;  // We've not got a token, just fodder, so keep scanning.
                }

                // Text block
                if (*c == '|' && *(c+1) == '|' && *(c+2) == '|' && *(c+3) == '\n') {
                    std::stringstream block;
                    c += 4; // Skip the "|||\n"
                    line_number++;
                    // Skip any blank lines at the beginning of the block.
                    while (*c == '\n') {
                        line_number++;
                        ++c;
                        block << '\n';
                    }
                    line_start = c;
                    const char *first_line = c;
                    int ws_chars = whitespace_check(first_line, c);
                    string_block_indent = std::string(first_line, ws_chars);
                    if (ws_chars == 0) {
                        auto msg = "Text block's first line must start with whitespace.";
                        throw StaticError(filename, begin, msg);
                    }
                    while (true) {
                        assert(ws_chars > 0);
                        // Read up to the \n
                        for (c = &c[ws_chars]; *c != '\n' ; ++c) {
                            if (*c == '\0')
                                throw StaticError(filename, begin, "Unexpected EOF");
                            block << *c;
                        }
                        // Add the \n
                        block << '\n';
                        ++c;
                        line_number++;
                        line_start = c;
                        // Skip any blank lines
                        while (*c == '\n') {
                            line_number++;
                            ++c;
                            block << '\n';
                        }
                        // Examine next line
                        ws_chars = whitespace_check(first_line, c);
                        if (ws_chars == 0) {
                            // End of text block
                            // Skip over any whitespace
                            while (*c == ' ' || *c == '\t') {
                                string_block_term_indent += *c;
                                ++c;
                            }
                            // Expect |||
                            if (!(*c == '|' && *(c+1) == '|' && *(c+2) == '|')) {
                                auto msg = "Text block not terminated with |||";
                                throw StaticError(filename, begin, msg);
                            }
                            c += 3;  // Leave after the last |
                            data = block.str();
                            kind = Token::STRING_BLOCK;
                            break;  // Out of the while loop.
                        }
                    }

                    break;  // Out of the switch.
                }

                const char *operator_begin = c;
                for (; is_symbol(*c) ; ++c) {
                    // Not allowed // in operators
                    if (*c == '/' && *(c+1) == '/') break;
                    // Not allowed /* in operators
                    if (*c == '/' && *(c+1) == '*') break;
                    // Not allowed ||| in operators
                    if (*c == '|' && *(c+1) == '|' && *(c+2) == '|') break;
                }
                // Not allowed to end with a + - ~ ! unless a single char.
                // So, wind it back if we need to (but not too far).
                while (c > operator_begin + 1
                       && (*(c-1) == '+' || *(c-1) == '-' || *(c-1) == '~' || *(c-1) == '!')) {
                    c--;
                }
                data += std::string(operator_begin, c);
                if (data == "$") {
                    kind = Token::DOLLAR;
                    data = "";
                } else {
                    kind = Token::OPERATOR;
                }
            } else {
                std::stringstream ss;
                ss << "Could not lex the character ";
                auto uc = (unsigned char)(*c);
                if (*c < 32)
                    ss << "code " << unsigned(uc);
                else
                    ss << "'" << *c << "'";
                throw StaticError(filename, begin, ss.str());
            }
        }

        Location end(line_number, c - line_start);
        r.emplace_back(kind, fodder, data, string_block_indent, string_block_term_indent,
                       LocationRange(filename, begin, end));
        fodder.clear();
        fresh_line = false;
    }

    Location end(line_number, c - line_start + 1);
    r.emplace_back(Token::END_OF_FILE, fodder, "", "", "", LocationRange(filename, end, end));
    return r;
}