Ejemplo n.º 1
0
static Symbols tokenize(const QByteArray &input, int lineNum = 1, TokenizeMode mode = TokenizeCpp)
{
    Symbols symbols;
    const char *begin = input;
    const char *data = begin;
    while (*data) {
        if (mode == TokenizeCpp) {
            int column = 0;

            const char *lexem = data;
            int state = 0;
            Token token = NOTOKEN;
            for (;;) {
                if (static_cast<signed char>(*data) < 0) {
                    ++data;
                    continue;
                }
                int nextindex = keywords[state].next;
                int next = 0;
                if (*data == keywords[state].defchar)
                    next = keywords[state].defnext;
                else if (!state || nextindex)
                    next = keyword_trans[nextindex][(int)*data];
                if (!next)
                    break;
                state = next;
                token = keywords[state].token;
                ++data;
            }

            // suboptimal, is_ident_char  should use a table
            if (keywords[state].ident && is_ident_char(*data))
                token = keywords[state].ident;

            if (token == NOTOKEN) {
                // an error really
                ++data;
                continue;
            }

            ++column;

            if (token > SPECIAL_TREATMENT_MARK) {
                switch (token) {
                case QUOTE:
                    data = skipQuote(data);
                    token = STRING_LITERAL;
                    // concatenate multi-line strings for easier
                    // STRING_LITERAAL handling in moc
                    if (!Preprocessor::preprocessOnly
                        && !symbols.isEmpty()
                        && symbols.last().token == STRING_LITERAL) {

                        QByteArray newString = symbols.last().unquotedLexem();
                        newString += input.mid(lexem - begin + 1, data - lexem - 2);
                        newString.prepend('\"');
                        newString.append('\"');
                        symbols.last() = Symbol(symbols.last().lineNum,
                                                STRING_LITERAL,
                                                newString);
                        continue;
                    }
                    break;
                case SINGLEQUOTE:
                    while (*data && (*data != '\''
                                     || (*(data-1)=='\\'
                                         && *(data-2)!='\\')))
                        ++data;
                    if (*data)
                        ++data;
                    token = CHARACTER_LITERAL;
                    break;
                case LANGLE_SCOPE:
                    // split <:: into two tokens, < and ::
                    token = LANGLE;
                    data -= 2;
                    break;
                case DIGIT:
                    while (is_digit_char(*data))
                        ++data;
                    if (!*data || *data != '.') {
                        token = INTEGER_LITERAL;
                        if (data - lexem == 1 &&
                            (*data == 'x' || *data == 'X')
                            && *lexem == '0') {
                            ++data;
                            while (is_hex_char(*data))
                                ++data;
                        }
                        break;
                    }
                    token = FLOATING_LITERAL;
                    ++data;
                    // fall through
                case FLOATING_LITERAL:
                    while (is_digit_char(*data))
                        ++data;
                    if (*data == '+' || *data == '-')
                        ++data;
                    if (*data == 'e' || *data == 'E') {
                        ++data;
                        while (is_digit_char(*data))
                            ++data;
                    }
                    if (*data == 'f' || *data == 'F'
                        || *data == 'l' || *data == 'L')
                        ++data;
                    break;
                case HASH:
                    if (column == 1) {
                        mode = PreparePreprocessorStatement;
                        while (*data && (*data == ' ' || *data == '\t'))
                            ++data;
                        if (is_ident_char(*data))
                            mode = TokenizePreprocessorStatement;
                        continue;
                    }
                    break;
                case NEWLINE:
                    ++lineNum;
                    continue;
                case BACKSLASH:
                {
                    const char *rewind = data;
                    while (*data && (*data == ' ' || *data == '\t'))
                        ++data;
                    if (*data && *data == '\n') {
                        ++data;
                        continue;
                    }
                    data = rewind;
                } break;
                case CHARACTER:
                    while (is_ident_char(*data))
                        ++data;
                    token = IDENTIFIER;
                    break;
                case C_COMMENT:
                    if (*data) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                        if (*data) {
                            if (*data == '\n')
                                ++lineNum;
                            ++data;
                        }
                    }
                    while (*data && (*(data-1) != '/' || *(data-2) != '*')) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                    }
                    token = WHITESPACE; // one comment, one whitespace
                    // fall through;
                case WHITESPACE:
                    if (column == 1)
                        column = 0;
                    while (*data && (*data == ' ' || *data == '\t'))
                        ++data;
                    if (Preprocessor::preprocessOnly) // tokenize whitespace
                        break;
                    continue;
                case CPP_COMMENT:
                    while (*data && *data != '\n')
                        ++data;
                    continue; // ignore safely, the newline is a separator
                default:
                    continue; //ignore
                }
            }
#ifdef USE_LEXEM_STORE
            if (!Preprocessor::preprocessOnly
                && token != IDENTIFIER
                && token != STRING_LITERAL
                && token != FLOATING_LITERAL
                && token != INTEGER_LITERAL)
                symbols += Symbol(lineNum, token);
            else
#endif
                symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem);

        } else { //   Preprocessor

            const char *lexem = data;
            int state = 0;
            Token token = NOTOKEN;
            if (mode == TokenizePreprocessorStatement) {
                state = pp_keyword_trans[0][(int)'#'];
                mode = TokenizePreprocessor;
            }
            for (;;) {
                if (static_cast<signed char>(*data) < 0) {
                    ++data;
                    continue;
                }

                int nextindex = pp_keywords[state].next;
                int next = 0;
                if (*data == pp_keywords[state].defchar)
                    next = pp_keywords[state].defnext;
                else if (!state || nextindex)
                    next = pp_keyword_trans[nextindex][(int)*data];
                if (!next)
                    break;
                state = next;
                token = pp_keywords[state].token;
                ++data;
            }
            // suboptimal, is_ident_char  should use a table
            if (pp_keywords[state].ident && is_ident_char(*data))
                token = pp_keywords[state].ident;

            switch (token) {
            case NOTOKEN:
                ++data;
                break;
            case PP_IFDEF:
                symbols += Symbol(lineNum, PP_IF);
                symbols += Symbol(lineNum, PP_DEFINED);
                continue;
            case PP_IFNDEF:
                symbols += Symbol(lineNum, PP_IF);
                symbols += Symbol(lineNum, PP_NOT);
                symbols += Symbol(lineNum, PP_DEFINED);
                continue;
            case PP_INCLUDE:
                mode = TokenizeInclude;
                break;
            case PP_QUOTE:
                data = skipQuote(data);
                token = PP_STRING_LITERAL;
                break;
            case PP_SINGLEQUOTE:
                while (*data && (*data != '\''
                                 || (*(data-1)=='\\'
                                     && *(data-2)!='\\')))
                    ++data;
                if (*data)
                    ++data;
                token = PP_CHARACTER_LITERAL;
                break;
            case PP_DIGIT:
                while (is_digit_char(*data))
                    ++data;
                if (!*data || *data != '.') {
                    token = PP_INTEGER_LITERAL;
                    if (data - lexem == 1 &&
                        (*data == 'x' || *data == 'X')
                        && *lexem == '0') {
                        ++data;
                        while (is_hex_char(*data))
                            ++data;
                    }
                    break;
                }
                token = PP_FLOATING_LITERAL;
                ++data;
                // fall through
            case PP_FLOATING_LITERAL:
                while (is_digit_char(*data))
                    ++data;
                if (*data == '+' || *data == '-')
                    ++data;
                if (*data == 'e' || *data == 'E') {
                    ++data;
                    while (is_digit_char(*data))
                        ++data;
                }
                if (*data == 'f' || *data == 'F'
                    || *data == 'l' || *data == 'L')
                    ++data;
                break;
            case PP_CHARACTER:
                if (mode == PreparePreprocessorStatement) {
                    // rewind entire token to begin
                    data = lexem;
                    mode = TokenizePreprocessorStatement;
                    continue;
                }
                while (is_ident_char(*data))
                    ++data;
                token = PP_IDENTIFIER;
                break;
            case PP_C_COMMENT:
                if (*data) {
                    if (*data == '\n')
                        ++lineNum;
                    ++data;
                    if (*data) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                    }
                }
                while (*data && (*(data-1) != '/' || *(data-2) != '*')) {
                    if (*data == '\n')
                        ++lineNum;
                    ++data;
                }
                token = PP_WHITESPACE; // one comment, one whitespace
                // fall through;
            case PP_WHITESPACE:
                while (*data && (*data == ' ' || *data == '\t'))
                    ++data;
                continue; // the preprocessor needs no whitespace
            case PP_CPP_COMMENT:
                while (*data && *data != '\n')
                    ++data;
                continue; // ignore safely, the newline is a separator
            case PP_NEWLINE:
                ++lineNum;
                mode = TokenizeCpp;
                break;
            case PP_BACKSLASH:
            {
                const char *rewind = data;
                while (*data && (*data == ' ' || *data == '\t'))
                    ++data;
                if (*data && *data == '\n') {
                    ++data;
                    continue;
                }
                data = rewind;
            } break;
            case PP_LANGLE:
                if (mode != TokenizeInclude)
                    break;
                token = PP_STRING_LITERAL;
                while (*data && *data != '\n' && *(data-1) != '>')
                    ++data;
                break;
            default:
                break;
            }
            if (mode == PreparePreprocessorStatement)
                continue;
#ifdef USE_LEXEM_STORE
            if (token != PP_IDENTIFIER
                && token != PP_STRING_LITERAL
                && token != PP_FLOATING_LITERAL
                && token != PP_INTEGER_LITERAL)
                symbols += Symbol(lineNum, token);
            else
#endif
                symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem);
        }
    }
    symbols += Symbol(); // eof symbol
    return symbols;
}
Ejemplo n.º 2
0
void RE2NFA::tokenize(const QString &input)
{
    symbols.clear();
#if 1
    RegExpTokenizer tokenizer(input);
    Symbol sym;
    int tok = tokenizer.lex();
    while (tok != -1) {
        Symbol sym;
        sym.token = static_cast<Token>(tok);
        sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength);

        if (sym.token == TOK_QUOTED_STRING) {
            sym.lexem.chop(1);
            sym.lexem.remove(0, 1);
            sym.token = TOK_STRING;
        }

        if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) {
            for (int i = 0; i < sym.lexem.length(); ++i) {
                if (sym.lexem.at(i) == '\\') {
                    if (i >= sym.lexem.length() - 1)
                        break;
                    QChar ch = sym.lexem.at(i + 1);
                    if (ch == QLatin1Char('n')) {
                        ch = '\n';
                    } else if (ch == QLatin1Char('r')) {
                        ch = '\r';
                    } else if (ch == QLatin1Char('t')) {
                        ch = '\t';
                    } else if (ch == QLatin1Char('f')) {
                        ch = '\f';
                    }
                    sym.lexem.replace(i, 2, ch);
                }
            }
        }

        /*
        if (sym.token == TOK_SEQUENCE) {
            Symbol s;
            s.token = TOK_LBRACKET;
            s.lexem = "[";
            symbols.append(s);

            for (int i = 1; i < sym.lexem.length() - 1; ++i) {
                s.token = TOK_STRING;
                s.lexem = sym.lexem.at(i);
                symbols.append(s);
            }

            s.token = TOK_RBRACKET;
            s.lexem = "]";
            symbols.append(s);

            tok = tokenizer.lex();
            continue;
        }
        */

        symbols.append(sym);
        tok = tokenizer.lex();
    }
#else
    int pos = 0;
    bool insideSet = false;
    while (pos < input.length()) {
        QChar ch = input.at(pos);

        Symbol sym;
        sym.column = pos;
        sym.token = TOK_INVALID;
        sym.lexem = QString(ch);
        switch (ch.toLatin1()) {
            case '"': {
                if (insideSet) {
                    sym.token = TOK_STRING;
                    sym.lexem = QString(ch);
                    symbols += sym;
                    ++pos;
                    continue;
                }
                if (pos + 1 >= input.length())
                    return;
                int quoteEnd = skipQuote(input, pos + 1);
                sym.token = TOK_STRING;
                sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2);
                symbols += sym;
                pos = quoteEnd;
                continue;
            }
            case '{':
                sym.token = (insideSet ? TOK_STRING : TOK_LBRACE);
                break;
            case '}':
                sym.token = (insideSet ? TOK_STRING : TOK_RBRACE);
                break;
            case '[':
                insideSet = true;
                sym.token = TOK_LBRACKET;
                break;
            case ']':
                insideSet = false;
                sym.token = TOK_RBRACKET;
                break;
            case '(':
                sym.token = (insideSet ? TOK_STRING : TOK_LPAREN);
                break;
            case ')':
                sym.token = (insideSet ? TOK_STRING : TOK_RPAREN);
                break;
            case ',':
                sym.token = (insideSet ? TOK_STRING : TOK_COMMA);
                break;
            case '*':
                sym.token = (insideSet ? TOK_STRING : TOK_STAR);
                break;
            case '|':
                sym.token = (insideSet ? TOK_STRING : TOK_OR);
                break;
            case '?':
                sym.token = (insideSet ? TOK_STRING : TOK_QUESTION);
                break;
            case '.':
                sym.token = (insideSet ? TOK_STRING : TOK_DOT);
                break;
            case '+':
                sym.token = (insideSet ? TOK_STRING : TOK_PLUS);
                break;
            case '\\':
                ++pos;
                if (pos >= input.length())
                    return;
                ch = input.at(pos);
                if (ch == QLatin1Char('n')) {
                    ch = '\n';
                } else if (ch == QLatin1Char('r')) {
                    ch = '\r';
                } else if (ch == QLatin1Char('t')) {
                    ch = '\t';
                } else if (ch == QLatin1Char('f')) {
                    ch = '\f';
                }
                // fall through
            default:
                sym.token = TOK_STRING;
                sym.lexem = QString(ch);
                symbols += sym;
                ++pos;
                continue;
        }
        symbols += sym;
        ++pos;
    }
#endif
#if 0
    foreach (Symbol s, symbols) {
        qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem;
    }