Exemple #1
0
static Symbols tokenize(const QByteArray &input, int lineNum = 1, TokenizeMode mode = TokenizeCpp)
{
    Symbols symbols;
    const char *begin = input;
    const char *data = begin;
    while (*data) {
        if (mode == TokenizeCpp) {
            int column = 0;

            const char *lexem = data;
            int state = 0;
            Token token = NOTOKEN;
            for (;;) {
                if (static_cast<signed char>(*data) < 0) {
                    ++data;
                    continue;
                }
                int nextindex = keywords[state].next;
                int next = 0;
                if (*data == keywords[state].defchar)
                    next = keywords[state].defnext;
                else if (!state || nextindex)
                    next = keyword_trans[nextindex][(int)*data];
                if (!next)
                    break;
                state = next;
                token = keywords[state].token;
                ++data;
            }

            // suboptimal, is_ident_char  should use a table
            if (keywords[state].ident && is_ident_char(*data))
                token = keywords[state].ident;

            if (token == NOTOKEN) {
                // an error really
                ++data;
                continue;
            }

            ++column;

            if (token > SPECIAL_TREATMENT_MARK) {
                switch (token) {
                case QUOTE:
                    data = skipQuote(data);
                    token = STRING_LITERAL;
                    // concatenate multi-line strings for easier
                    // STRING_LITERAAL handling in moc
                    if (!Preprocessor::preprocessOnly
                        && !symbols.isEmpty()
                        && symbols.last().token == STRING_LITERAL) {

                        QByteArray newString = symbols.last().unquotedLexem();
                        newString += input.mid(lexem - begin + 1, data - lexem - 2);
                        newString.prepend('\"');
                        newString.append('\"');
                        symbols.last() = Symbol(symbols.last().lineNum,
                                                STRING_LITERAL,
                                                newString);
                        continue;
                    }
                    break;
                case SINGLEQUOTE:
                    while (*data && (*data != '\''
                                     || (*(data-1)=='\\'
                                         && *(data-2)!='\\')))
                        ++data;
                    if (*data)
                        ++data;
                    token = CHARACTER_LITERAL;
                    break;
                case LANGLE_SCOPE:
                    // split <:: into two tokens, < and ::
                    token = LANGLE;
                    data -= 2;
                    break;
                case DIGIT:
                    while (is_digit_char(*data))
                        ++data;
                    if (!*data || *data != '.') {
                        token = INTEGER_LITERAL;
                        if (data - lexem == 1 &&
                            (*data == 'x' || *data == 'X')
                            && *lexem == '0') {
                            ++data;
                            while (is_hex_char(*data))
                                ++data;
                        }
                        break;
                    }
                    token = FLOATING_LITERAL;
                    ++data;
                    // fall through
                case FLOATING_LITERAL:
                    while (is_digit_char(*data))
                        ++data;
                    if (*data == '+' || *data == '-')
                        ++data;
                    if (*data == 'e' || *data == 'E') {
                        ++data;
                        while (is_digit_char(*data))
                            ++data;
                    }
                    if (*data == 'f' || *data == 'F'
                        || *data == 'l' || *data == 'L')
                        ++data;
                    break;
                case HASH:
                    if (column == 1) {
                        mode = PreparePreprocessorStatement;
                        while (*data && (*data == ' ' || *data == '\t'))
                            ++data;
                        if (is_ident_char(*data))
                            mode = TokenizePreprocessorStatement;
                        continue;
                    }
                    break;
                case NEWLINE:
                    ++lineNum;
                    continue;
                case BACKSLASH:
                {
                    const char *rewind = data;
                    while (*data && (*data == ' ' || *data == '\t'))
                        ++data;
                    if (*data && *data == '\n') {
                        ++data;
                        continue;
                    }
                    data = rewind;
                } break;
                case CHARACTER:
                    while (is_ident_char(*data))
                        ++data;
                    token = IDENTIFIER;
                    break;
                case C_COMMENT:
                    if (*data) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                        if (*data) {
                            if (*data == '\n')
                                ++lineNum;
                            ++data;
                        }
                    }
                    while (*data && (*(data-1) != '/' || *(data-2) != '*')) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                    }
                    token = WHITESPACE; // one comment, one whitespace
                    // fall through;
                case WHITESPACE:
                    if (column == 1)
                        column = 0;
                    while (*data && (*data == ' ' || *data == '\t'))
                        ++data;
                    if (Preprocessor::preprocessOnly) // tokenize whitespace
                        break;
                    continue;
                case CPP_COMMENT:
                    while (*data && *data != '\n')
                        ++data;
                    continue; // ignore safely, the newline is a separator
                default:
                    continue; //ignore
                }
            }
#ifdef USE_LEXEM_STORE
            if (!Preprocessor::preprocessOnly
                && token != IDENTIFIER
                && token != STRING_LITERAL
                && token != FLOATING_LITERAL
                && token != INTEGER_LITERAL)
                symbols += Symbol(lineNum, token);
            else
#endif
                symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem);

        } else { //   Preprocessor

            const char *lexem = data;
            int state = 0;
            Token token = NOTOKEN;
            if (mode == TokenizePreprocessorStatement) {
                state = pp_keyword_trans[0][(int)'#'];
                mode = TokenizePreprocessor;
            }
            for (;;) {
                if (static_cast<signed char>(*data) < 0) {
                    ++data;
                    continue;
                }

                int nextindex = pp_keywords[state].next;
                int next = 0;
                if (*data == pp_keywords[state].defchar)
                    next = pp_keywords[state].defnext;
                else if (!state || nextindex)
                    next = pp_keyword_trans[nextindex][(int)*data];
                if (!next)
                    break;
                state = next;
                token = pp_keywords[state].token;
                ++data;
            }
            // suboptimal, is_ident_char  should use a table
            if (pp_keywords[state].ident && is_ident_char(*data))
                token = pp_keywords[state].ident;

            switch (token) {
            case NOTOKEN:
                ++data;
                break;
            case PP_IFDEF:
                symbols += Symbol(lineNum, PP_IF);
                symbols += Symbol(lineNum, PP_DEFINED);
                continue;
            case PP_IFNDEF:
                symbols += Symbol(lineNum, PP_IF);
                symbols += Symbol(lineNum, PP_NOT);
                symbols += Symbol(lineNum, PP_DEFINED);
                continue;
            case PP_INCLUDE:
                mode = TokenizeInclude;
                break;
            case PP_QUOTE:
                data = skipQuote(data);
                token = PP_STRING_LITERAL;
                break;
            case PP_SINGLEQUOTE:
                while (*data && (*data != '\''
                                 || (*(data-1)=='\\'
                                     && *(data-2)!='\\')))
                    ++data;
                if (*data)
                    ++data;
                token = PP_CHARACTER_LITERAL;
                break;
            case PP_DIGIT:
                while (is_digit_char(*data))
                    ++data;
                if (!*data || *data != '.') {
                    token = PP_INTEGER_LITERAL;
                    if (data - lexem == 1 &&
                        (*data == 'x' || *data == 'X')
                        && *lexem == '0') {
                        ++data;
                        while (is_hex_char(*data))
                            ++data;
                    }
                    break;
                }
                token = PP_FLOATING_LITERAL;
                ++data;
                // fall through
            case PP_FLOATING_LITERAL:
                while (is_digit_char(*data))
                    ++data;
                if (*data == '+' || *data == '-')
                    ++data;
                if (*data == 'e' || *data == 'E') {
                    ++data;
                    while (is_digit_char(*data))
                        ++data;
                }
                if (*data == 'f' || *data == 'F'
                    || *data == 'l' || *data == 'L')
                    ++data;
                break;
            case PP_CHARACTER:
                if (mode == PreparePreprocessorStatement) {
                    // rewind entire token to begin
                    data = lexem;
                    mode = TokenizePreprocessorStatement;
                    continue;
                }
                while (is_ident_char(*data))
                    ++data;
                token = PP_IDENTIFIER;
                break;
            case PP_C_COMMENT:
                if (*data) {
                    if (*data == '\n')
                        ++lineNum;
                    ++data;
                    if (*data) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                    }
                }
                while (*data && (*(data-1) != '/' || *(data-2) != '*')) {
                    if (*data == '\n')
                        ++lineNum;
                    ++data;
                }
                token = PP_WHITESPACE; // one comment, one whitespace
                // fall through;
            case PP_WHITESPACE:
                while (*data && (*data == ' ' || *data == '\t'))
                    ++data;
                continue; // the preprocessor needs no whitespace
            case PP_CPP_COMMENT:
                while (*data && *data != '\n')
                    ++data;
                continue; // ignore safely, the newline is a separator
            case PP_NEWLINE:
                ++lineNum;
                mode = TokenizeCpp;
                break;
            case PP_BACKSLASH:
            {
                const char *rewind = data;
                while (*data && (*data == ' ' || *data == '\t'))
                    ++data;
                if (*data && *data == '\n') {
                    ++data;
                    continue;
                }
                data = rewind;
            } break;
            case PP_LANGLE:
                if (mode != TokenizeInclude)
                    break;
                token = PP_STRING_LITERAL;
                while (*data && *data != '\n' && *(data-1) != '>')
                    ++data;
                break;
            default:
                break;
            }
            if (mode == PreparePreprocessorStatement)
                continue;
#ifdef USE_LEXEM_STORE
            if (token != PP_IDENTIFIER
                && token != PP_STRING_LITERAL
                && token != PP_FLOATING_LITERAL
                && token != PP_INTEGER_LITERAL)
                symbols += Symbol(lineNum, token);
            else
#endif
                symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem);
        }
    }
    symbols += Symbol(); // eof symbol
    return symbols;
}
Exemple #2
0
Symbols Preprocessor::macroExpandIdentifier(Preprocessor *that, SymbolStack &symbols, int lineNum, QByteArray *macroName)
{
    Symbol s = symbols.symbol();

    // not a macro
    if (s.token != PP_IDENTIFIER || !that->macros.contains(s) || symbols.dontReplaceSymbol(s.lexem())) {
        Symbols syms;
        syms += s;
        syms.last().lineNum = lineNum;
        return syms;
    }

    const Macro &macro = that->macros.value(s);
    *macroName = s.lexem();

    Symbols expansion;
    if (!macro.isFunction) {
        expansion = macro.symbols;
    } else {
        bool haveSpace = false;
        while (symbols.test(PP_WHITESPACE)) {
            haveSpace = true;
        }
        if (!symbols.test(PP_LPAREN)) {
            *macroName = QByteArray();
            Symbols syms;
            if (haveSpace)
                syms += Symbol(lineNum, PP_WHITESPACE);
            syms += s;
            syms.last().lineNum = lineNum;
            return syms;
        }
        QList<Symbols> arguments;
        while (symbols.hasNext()) {
            Symbols argument;
            // strip leading space
            while (symbols.test(PP_WHITESPACE)) {}
            int nesting = 0;
            bool vararg = macro.isVariadic && (arguments.size() == macro.arguments.size() - 1);
            while (symbols.hasNext()) {
                Token t = symbols.next();
                if (t == PP_LPAREN) {
                    ++nesting;
                } else if (t == PP_RPAREN) {
                    --nesting;
                    if (nesting < 0)
                        break;
                } else if (t == PP_COMMA && nesting == 0) {
                    if (!vararg)
                        break;
                }
                argument += symbols.symbol();
            }
            arguments += argument;

            if (nesting < 0)
                break;
        }

        // empty VA_ARGS
        if (macro.isVariadic && arguments.size() == macro.arguments.size() - 1)
            arguments += Symbols();

        if (arguments.size() != macro.arguments.size() &&
                // 0 argument macros are a bit special. They are ok if the
                // argument is pure whitespace or empty
                (macro.arguments.size() != 0 || arguments.size() != 1 || !arguments.at(0).isEmpty()))
            that->error("Macro argument mismatch.");

        // now replace the macro arguments with the expanded arguments
        enum Mode {
            Normal,
            Hash,
            HashHash
        } mode = Normal;

        for (int i = 0; i < macro.symbols.size(); ++i) {
            const Symbol &s = macro.symbols.at(i);
            if (s.token == HASH || s.token == PP_HASHHASH) {
                mode = (s.token == HASH ? Hash : HashHash);
                continue;
            }
            int index = macro.arguments.indexOf(s);
            if (mode == Normal) {
                if (index >= 0) {
                    // each argument undoergoes macro expansion if it's not used as part of a # or ##
                    if (i == macro.symbols.size() - 1 || macro.symbols.at(i + 1).token != PP_HASHHASH) {
                        Symbols arg = arguments.at(index);
                        int idx = 1;
                        expansion += macroExpand(that, arg, idx, lineNum, false);
                    } else {
                        expansion += arguments.at(index);
                    }
                } else {
                    expansion += s;
                }
            } else if (mode == Hash) {
                if (index < 0)
                    that->error("'#' is not followed by a macro parameter");

                const Symbols &arg = arguments.at(index);
                QByteArray stringified;
                for (int i = 0; i < arg.size(); ++i) {
                    stringified += arg.at(i).lexem();
                }
                stringified.replace('"', "\\\"");
                stringified.prepend('"');
                stringified.append('"');
                expansion += Symbol(lineNum, STRING_LITERAL, stringified);
            } else if (mode == HashHash) {
                if (s.token == WHITESPACE)
                    continue;

                while (expansion.size() && expansion.last().token == PP_WHITESPACE)
                    expansion.pop_back();

                Symbol next = s;
                if (index >= 0) {
                    const Symbols &arg = arguments.at(index);
                    if (arg.size() == 0) {
                        mode = Normal;
                        continue;
                    }
                    next = arg.at(0);
                }

                if (!expansion.isEmpty() && expansion.last().token == s.token) {
                    Symbol last = expansion.last();
                    expansion.pop_back();

                    if (last.token == STRING_LITERAL || s.token == STRING_LITERAL)
                        that->error("Can't concatenate non identifier tokens");

                    QByteArray lexem = last.lexem() + next.lexem();
                    expansion += Symbol(lineNum, last.token, lexem);
                } else {
                    expansion += next;
                }

                if (index >= 0) {
                    const Symbols &arg = arguments.at(index);
                    for (int i = 1; i < arg.size(); ++i)
                        expansion += arg.at(i);
                }
            }
            mode = Normal;
        }
        if (mode != Normal)
            that->error("'#' or '##' found at the end of a macro argument");

    }

    return expansion;
}