/*
    Splits \a input into a list of symbols.

    \a input   the text to tokenize; it is read through a NUL-terminated
               const char pointer.
    \a lineNum line number assigned to the first line of \a input; it is
               incremented for every newline that is consumed.
    \a mode    initial tokenizer mode. In TokenizeCpp mode the keyword tables
               (keywords / keyword_trans) are used; in every other mode the
               preprocessor tables (pp_keywords / pp_keyword_trans) are used.
               A '#' switches from C++ mode to a preprocessor mode, and a
               newline inside a preprocessor statement switches back.

    Returns the symbols found, followed by one default-constructed Symbol
    that marks the end of input.

    Bytes with the high bit set (non-ASCII) are skipped by the table-driven
    scanner because the transition tables are indexed by the character value.
*/
static Symbols tokenize(const QByteArray &input, int lineNum = 1, TokenizeMode mode = TokenizeCpp)
{
    Symbols symbols;
    const char *begin = input;
    const char *data = begin;
    while (*data) {
        if (mode == TokenizeCpp) {
            // NOTE(review): column is re-initialised for every token, so it is
            // always 1 when the HASH case below tests it - confirm whether it
            // was meant to persist across the tokens of a line.
            int column = 0;

            const char *lexem = data;
            int state = 0;
            Token token = NOTOKEN;
            // Walk the keyword state machine for as long as there is a
            // transition for the current character.
            for (;;) {
                if (static_cast<signed char>(*data) < 0) {
                    // non-ASCII byte: cannot be used as a table index
                    ++data;
                    continue;
                }
                int nextindex = keywords[state].next;
                int next = 0;
                if (*data == keywords[state].defchar)
                    next = keywords[state].defnext;
                else if (!state || nextindex)
                    next = keyword_trans[nextindex][(int)*data];
                if (!next)
                    break;
                state = next;
                token = keywords[state].token;
                ++data;
            }

            // suboptimal, is_ident_char should use a table
            if (keywords[state].ident && is_ident_char(*data))
                token = keywords[state].ident;

            if (token == NOTOKEN) {
                // an error really
                // The scan loop above may have skipped trailing non-ASCII
                // bytes and stopped on the terminating NUL; never step past it.
                if (*data)
                    ++data;
                continue;
            }

            ++column;

            if (token > SPECIAL_TREATMENT_MARK) {
                switch (token) {
                case QUOTE:
                    data = skipQuote(data);
                    token = STRING_LITERAL;
                    // concatenate multi-line strings for easier
                    // STRING_LITERAL handling in moc
                    if (!Preprocessor::preprocessOnly
                        && !symbols.isEmpty()
                        && symbols.last().token == STRING_LITERAL) {

                        QByteArray newString = symbols.last().unquotedLexem();
                        // append the new literal without its enclosing quotes
                        newString += input.mid(lexem - begin + 1, data - lexem - 2);
                        newString.prepend('\"');
                        newString.append('\"');
                        symbols.last() = Symbol(symbols.last().lineNum,
                                                STRING_LITERAL,
                                                newString);
                        continue;
                    }
                    break;
                case SINGLEQUOTE:
                    // scan to the closing quote; a quote preceded by a single
                    // (unescaped) backslash does not terminate the literal
                    while (*data && (*data != '\''
                                     || (*(data-1)=='\\'
                                         && *(data-2)!='\\')))
                        ++data;
                    if (*data)
                        ++data;
                    token = CHARACTER_LITERAL;
                    break;
                case LANGLE_SCOPE:
                    // split <:: into two tokens, < and ::
                    token = LANGLE;
                    data -= 2;
                    break;
                case DIGIT:
                    while (is_digit_char(*data))
                        ++data;
                    if (!*data || *data != '.') {
                        token = INTEGER_LITERAL;
                        // hexadecimal literal: exactly "0" followed by x/X
                        if (data - lexem == 1 &&
                            (*data == 'x' || *data == 'X')
                            && *lexem == '0') {
                            ++data;
                            while (is_hex_char(*data))
                                ++data;
                        }
                        break;
                    }
                    token = FLOATING_LITERAL;
                    ++data;
                    // fall through
                case FLOATING_LITERAL:
                    while (is_digit_char(*data))
                        ++data;
                    if (*data == 'e' || *data == 'E') {
                        ++data;
                        // The optional sign belongs to the exponent, i.e. it
                        // follows the 'e'. Accepting it before the 'e' would
                        // swallow the operator of an expression like "1.0+2".
                        if (*data == '+' || *data == '-')
                            ++data;
                        while (is_digit_char(*data))
                            ++data;
                    }
                    if (*data == 'f' || *data == 'F'
                        || *data == 'l' || *data == 'L')
                        ++data;
                    break;
                case HASH:
                    if (column == 1) {
                        mode = PreparePreprocessorStatement;
                        while (*data && (*data == ' ' || *data == '\t'))
                            ++data;
                        if (is_ident_char(*data))
                            mode = TokenizePreprocessorStatement;
                        continue;
                    }
                    break;
                case NEWLINE:
                    ++lineNum;
                    continue;
                case BACKSLASH:
                {
                    // a backslash followed only by blanks and a newline is a
                    // line continuation and produces no symbol
                    const char *rewind = data;
                    while (*data && (*data == ' ' || *data == '\t'))
                        ++data;
                    if (*data && *data == '\n') {
                        ++data;
                        continue;
                    }
                    data = rewind;
                } break;
                case CHARACTER:
                    while (is_ident_char(*data))
                        ++data;
                    token = IDENTIFIER;
                    break;
                case C_COMMENT:
                    // Skip (up to) two characters first so that the look-behind
                    // in the loop below cannot match the opening "/*" itself.
                    if (*data) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                        if (*data) {
                            if (*data == '\n')
                                ++lineNum;
                            ++data;
                        }
                    }
                    while (*data && (*(data-1) != '/' || *(data-2) != '*')) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                    }
                    token = WHITESPACE; // one comment, one whitespace
                    // fall through
                case WHITESPACE:
                    if (column == 1)
                        column = 0;
                    while (*data && (*data == ' ' || *data == '\t'))
                        ++data;
                    if (Preprocessor::preprocessOnly) // tokenize whitespace
                        break;
                    continue;
                case CPP_COMMENT:
                    while (*data && *data != '\n')
                        ++data;
                    continue; // ignore safely, the newline is a separator
                default:
                    continue; // ignore
                }
            }
#ifdef USE_LEXEM_STORE
            if (!Preprocessor::preprocessOnly
                && token != IDENTIFIER
                && token != STRING_LITERAL
                && token != FLOATING_LITERAL
                && token != INTEGER_LITERAL)
                symbols += Symbol(lineNum, token);
            else
#endif
                symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem);

        } else { // Preprocessor

            const char *lexem = data;
            int state = 0;
            Token token = NOTOKEN;
            if (mode == TokenizePreprocessorStatement) {
                // start the state machine as if the '#' had just been read
                state = pp_keyword_trans[0][(int)'#'];
                mode = TokenizePreprocessor;
            }
            for (;;) {
                if (static_cast<signed char>(*data) < 0) {
                    // non-ASCII byte: cannot be used as a table index
                    ++data;
                    continue;
                }

                int nextindex = pp_keywords[state].next;
                int next = 0;
                if (*data == pp_keywords[state].defchar)
                    next = pp_keywords[state].defnext;
                else if (!state || nextindex)
                    next = pp_keyword_trans[nextindex][(int)*data];
                if (!next)
                    break;
                state = next;
                token = pp_keywords[state].token;
                ++data;
            }
            // suboptimal, is_ident_char should use a table
            if (pp_keywords[state].ident && is_ident_char(*data))
                token = pp_keywords[state].ident;

            switch (token) {
            case NOTOKEN:
                // The scan loop above may have skipped trailing non-ASCII
                // bytes and stopped on the terminating NUL; never step past it.
                if (*data)
                    ++data;
                break;
            case PP_IFDEF:
                // #ifdef X is emitted as: #if defined X
                symbols += Symbol(lineNum, PP_IF);
                symbols += Symbol(lineNum, PP_DEFINED);
                continue;
            case PP_IFNDEF:
                // #ifndef X is emitted as: #if ! defined X
                symbols += Symbol(lineNum, PP_IF);
                symbols += Symbol(lineNum, PP_NOT);
                symbols += Symbol(lineNum, PP_DEFINED);
                continue;
            case PP_INCLUDE:
                mode = TokenizeInclude;
                break;
            case PP_QUOTE:
                data = skipQuote(data);
                token = PP_STRING_LITERAL;
                break;
            case PP_SINGLEQUOTE:
                // scan to the closing quote; a quote preceded by a single
                // (unescaped) backslash does not terminate the literal
                while (*data && (*data != '\''
                                 || (*(data-1)=='\\'
                                     && *(data-2)!='\\')))
                    ++data;
                if (*data)
                    ++data;
                token = PP_CHARACTER_LITERAL;
                break;
            case PP_DIGIT:
                while (is_digit_char(*data))
                    ++data;
                if (!*data || *data != '.') {
                    token = PP_INTEGER_LITERAL;
                    // hexadecimal literal: exactly "0" followed by x/X
                    if (data - lexem == 1 &&
                        (*data == 'x' || *data == 'X')
                        && *lexem == '0') {
                        ++data;
                        while (is_hex_char(*data))
                            ++data;
                    }
                    break;
                }
                token = PP_FLOATING_LITERAL;
                ++data;
                // fall through
            case PP_FLOATING_LITERAL:
                while (is_digit_char(*data))
                    ++data;
                if (*data == 'e' || *data == 'E') {
                    ++data;
                    // The optional sign belongs to the exponent, i.e. it
                    // follows the 'e' (see the C++ branch above).
                    if (*data == '+' || *data == '-')
                        ++data;
                    while (is_digit_char(*data))
                        ++data;
                }
                if (*data == 'f' || *data == 'F'
                    || *data == 'l' || *data == 'L')
                    ++data;
                break;
            case PP_CHARACTER:
                if (mode == PreparePreprocessorStatement) {
                    // rewind entire token to begin
                    data = lexem;
                    mode = TokenizePreprocessorStatement;
                    continue;
                }
                while (is_ident_char(*data))
                    ++data;
                token = PP_IDENTIFIER;
                break;
            case PP_C_COMMENT:
                // Skip (up to) two characters first so that the look-behind
                // in the loop below cannot match the opening "/*" itself.
                if (*data) {
                    if (*data == '\n')
                        ++lineNum;
                    ++data;
                    if (*data) {
                        if (*data == '\n')
                            ++lineNum;
                        ++data;
                    }
                }
                while (*data && (*(data-1) != '/' || *(data-2) != '*')) {
                    if (*data == '\n')
                        ++lineNum;
                    ++data;
                }
                token = PP_WHITESPACE; // one comment, one whitespace
                // fall through
            case PP_WHITESPACE:
                while (*data && (*data == ' ' || *data == '\t'))
                    ++data;
                continue; // the preprocessor needs no whitespace
            case PP_CPP_COMMENT:
                while (*data && *data != '\n')
                    ++data;
                continue; // ignore safely, the newline is a separator
            case PP_NEWLINE:
                // a newline ends the preprocessor statement
                ++lineNum;
                mode = TokenizeCpp;
                break;
            case PP_BACKSLASH:
            {
                // a backslash followed only by blanks and a newline is a
                // line continuation and produces no symbol
                const char *rewind = data;
                while (*data && (*data == ' ' || *data == '\t'))
                    ++data;
                if (*data && *data == '\n') {
                    ++data;
                    continue;
                }
                data = rewind;
            } break;
            case PP_LANGLE:
                if (mode != TokenizeInclude)
                    break;
                // #include <file>: take everything up to and including '>'
                // (or up to the end of the line) as one string literal
                token = PP_STRING_LITERAL;
                while (*data && *data != '\n' && *(data-1) != '>')
                    ++data;
                break;
            default:
                break;
            }
            if (mode == PreparePreprocessorStatement)
                continue;
#ifdef USE_LEXEM_STORE
            if (token != PP_IDENTIFIER
                && token != PP_STRING_LITERAL
                && token != PP_FLOATING_LITERAL
                && token != PP_INTEGER_LITERAL)
                symbols += Symbol(lineNum, token);
            else
#endif
                symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem);
        }
    }
    symbols += Symbol(); // eof symbol
    return symbols;
}
/*
    Expands the symbol that \a symbols currently points at if it names a macro.

    \a that      preprocessor whose macro table is consulted and whose error()
                 is used for reporting.
    \a symbols   symbol stack positioned on the candidate identifier; for a
                 function-like macro the argument list is read from it.
    \a lineNum   line number stamped onto symbols created here.
    \a macroName out parameter. Set to the macro's name when an expansion is
                 attempted; reset to an empty QByteArray when a function-like
                 macro name is not followed by '('. It is left untouched when
                 the symbol is not a macro at all.

    Returns the replacement symbols. If the current symbol is not a
    replaceable macro, the result is that symbol alone (preceded by one
    whitespace symbol if whitespace was skipped while looking for '(').

    NOTE(review): the code after each that->error() call keeps using the
    values that were just diagnosed (e.g. arguments.at(index)), so it looks
    like error() is expected not to return - confirm.
*/
Symbols Preprocessor::macroExpandIdentifier(Preprocessor *that, SymbolStack &symbols, int lineNum, QByteArray *macroName)
{
    Symbol s = symbols.symbol();

    // not a macro
    if (s.token != PP_IDENTIFIER || !that->macros.contains(s) || symbols.dontReplaceSymbol(s.lexem())) {
        Symbols syms;
        syms += s;
        syms.last().lineNum = lineNum;
        return syms;
    }

    const Macro &macro = that->macros.value(s);
    *macroName = s.lexem();

    Symbols expansion;
    if (!macro.isFunction) {
        // object-like macro: the replacement list is used as is
        expansion = macro.symbols;
    } else {
        bool haveSpace = false;
        while (symbols.test(PP_WHITESPACE)) { haveSpace = true; }
        if (!symbols.test(PP_LPAREN)) {
            // function-like macro name without an argument list: no expansion
            *macroName = QByteArray();
            Symbols syms;
            if (haveSpace)
                syms += Symbol(lineNum, PP_WHITESPACE);
            syms += s;
            syms.last().lineNum = lineNum;
            return syms;
        }

        // Collect the arguments up to the closing parenthesis.
        QList<Symbols> arguments;
        while (symbols.hasNext()) {
            Symbols argument;
            // strip leading space
            while (symbols.test(PP_WHITESPACE)) {}
            int nesting = 0;
            // the last parameter of a variadic macro also takes the commas
            bool vararg = macro.isVariadic && (arguments.size() == macro.arguments.size() - 1);
            while (symbols.hasNext()) {
                Token t = symbols.next();
                if (t == PP_LPAREN) {
                    ++nesting;
                } else if (t == PP_RPAREN) {
                    --nesting;
                    if (nesting < 0)
                        break; // closing parenthesis of the invocation
                } else if (t == PP_COMMA && nesting == 0) {
                    if (!vararg)
                        break; // top-level comma: next argument
                }
                argument += symbols.symbol();
            }
            arguments += argument;

            if (nesting < 0)
                break;
        }

        // empty VA_ARGS
        if (macro.isVariadic && arguments.size() == macro.arguments.size() - 1)
            arguments += Symbols();

        if (arguments.size() != macro.arguments.size() &&
            // 0 argument macros are a bit special. They are ok if the
            // argument is pure whitespace or empty
            (macro.arguments.size() != 0 || arguments.size() != 1 || !arguments.at(0).isEmpty()))
            that->error("Macro argument mismatch.");

        // now replace the macro arguments with the expanded arguments
        enum Mode {
            Normal,
            Hash,
            HashHash
        } mode = Normal;

        for (int i = 0; i < macro.symbols.size(); ++i) {
            const Symbol &sym = macro.symbols.at(i);
            if (sym.token == HASH || sym.token == PP_HASHHASH) {
                // remember the operator; it applies to the next symbol
                mode = (sym.token == HASH ? Hash : HashHash);
                continue;
            }
            // position of sym in the parameter list, or -1 if it is not a parameter
            int index = macro.arguments.indexOf(sym);
            if (mode == Normal) {
                if (index >= 0) {
                    // each argument undergoes macro expansion if it's not used as part of a # or ##
                    if (i == macro.symbols.size() - 1 || macro.symbols.at(i + 1).token != PP_HASHHASH) {
                        Symbols arg = arguments.at(index);
                        int idx = 1;
                        expansion += macroExpand(that, arg, idx, lineNum, false);
                    } else {
                        // left operand of ##: inserted unexpanded
                        expansion += arguments.at(index);
                    }
                } else {
                    expansion += sym;
                }
            } else if (mode == Hash) {
                if (index < 0)
                    that->error("'#' is not followed by a macro parameter");

                // stringify: join the argument's lexems and quote the result
                const Symbols &arg = arguments.at(index);
                QByteArray stringified;
                for (int j = 0; j < arg.size(); ++j) {
                    stringified += arg.at(j).lexem();
                }
                stringified.replace('"', "\\\"");
                stringified.prepend('"');
                stringified.append('"');
                expansion += Symbol(lineNum, STRING_LITERAL, stringified);
            } else if (mode == HashHash) {
                // whitespace after ## is skipped; the operator stays pending
                if (sym.token == WHITESPACE)
                    continue;

                // drop whitespace in front of the ##
                while (expansion.size() && expansion.last().token == PP_WHITESPACE)
                    expansion.pop_back();

                Symbol next = sym;
                if (index >= 0) {
                    const Symbols &arg = arguments.at(index);
                    if (arg.size() == 0) {
                        // pasting an empty argument leaves the left side as it is
                        mode = Normal;
                        continue;
                    }
                    next = arg.at(0);
                }

                if (!expansion.isEmpty() && expansion.last().token == sym.token) {
                    // merge the last expanded symbol with the first symbol
                    // of the right operand
                    Symbol last = expansion.last();
                    expansion.pop_back();

                    if (last.token == STRING_LITERAL || sym.token == STRING_LITERAL)
                        that->error("Can't concatenate non identifier tokens");

                    QByteArray lexem = last.lexem() + next.lexem();
                    expansion += Symbol(lineNum, last.token, lexem);
                } else {
                    expansion += next;
                }

                if (index >= 0) {
                    // the remaining symbols of the argument follow unchanged
                    const Symbols &arg = arguments.at(index);
                    for (int j = 1; j < arg.size(); ++j)
                        expansion += arg.at(j);
                }
            }
            mode = Normal;
        }
        if (mode != Normal)
            that->error("'#' or '##' found at the end of a macro argument");

    }

    return expansion;
}