static Symbols tokenize(const QByteArray &input, int lineNum = 1, TokenizeMode mode = TokenizeCpp) { Symbols symbols; const char *begin = input; const char *data = begin; while (*data) { if (mode == TokenizeCpp) { int column = 0; const char *lexem = data; int state = 0; Token token = NOTOKEN; for (;;) { if (static_cast<signed char>(*data) < 0) { ++data; continue; } int nextindex = keywords[state].next; int next = 0; if (*data == keywords[state].defchar) next = keywords[state].defnext; else if (!state || nextindex) next = keyword_trans[nextindex][(int)*data]; if (!next) break; state = next; token = keywords[state].token; ++data; } // suboptimal, is_ident_char should use a table if (keywords[state].ident && is_ident_char(*data)) token = keywords[state].ident; if (token == NOTOKEN) { // an error really ++data; continue; } ++column; if (token > SPECIAL_TREATMENT_MARK) { switch (token) { case QUOTE: data = skipQuote(data); token = STRING_LITERAL; // concatenate multi-line strings for easier // STRING_LITERAAL handling in moc if (!Preprocessor::preprocessOnly && !symbols.isEmpty() && symbols.last().token == STRING_LITERAL) { QByteArray newString = symbols.last().unquotedLexem(); newString += input.mid(lexem - begin + 1, data - lexem - 2); newString.prepend('\"'); newString.append('\"'); symbols.last() = Symbol(symbols.last().lineNum, STRING_LITERAL, newString); continue; } break; case SINGLEQUOTE: while (*data && (*data != '\'' || (*(data-1)=='\\' && *(data-2)!='\\'))) ++data; if (*data) ++data; token = CHARACTER_LITERAL; break; case LANGLE_SCOPE: // split <:: into two tokens, < and :: token = LANGLE; data -= 2; break; case DIGIT: while (is_digit_char(*data)) ++data; if (!*data || *data != '.') { token = INTEGER_LITERAL; if (data - lexem == 1 && (*data == 'x' || *data == 'X') && *lexem == '0') { ++data; while (is_hex_char(*data)) ++data; } break; } token = FLOATING_LITERAL; ++data; // fall through case FLOATING_LITERAL: while (is_digit_char(*data)) ++data; if (*data == '+' || *data == '-') ++data; if (*data == 'e' || *data == 'E') { ++data; while (is_digit_char(*data)) ++data; } if (*data == 'f' || *data == 'F' || *data == 'l' || *data == 'L') ++data; break; case HASH: if (column == 1) { mode = PreparePreprocessorStatement; while (*data && (*data == ' ' || *data == '\t')) ++data; if (is_ident_char(*data)) mode = TokenizePreprocessorStatement; continue; } break; case NEWLINE: ++lineNum; continue; case BACKSLASH: { const char *rewind = data; while (*data && (*data == ' ' || *data == '\t')) ++data; if (*data && *data == '\n') { ++data; continue; } data = rewind; } break; case CHARACTER: while (is_ident_char(*data)) ++data; token = IDENTIFIER; break; case C_COMMENT: if (*data) { if (*data == '\n') ++lineNum; ++data; if (*data) { if (*data == '\n') ++lineNum; ++data; } } while (*data && (*(data-1) != '/' || *(data-2) != '*')) { if (*data == '\n') ++lineNum; ++data; } token = WHITESPACE; // one comment, one whitespace // fall through; case WHITESPACE: if (column == 1) column = 0; while (*data && (*data == ' ' || *data == '\t')) ++data; if (Preprocessor::preprocessOnly) // tokenize whitespace break; continue; case CPP_COMMENT: while (*data && *data != '\n') ++data; continue; // ignore safely, the newline is a separator default: continue; //ignore } } #ifdef USE_LEXEM_STORE if (!Preprocessor::preprocessOnly && token != IDENTIFIER && token != STRING_LITERAL && token != FLOATING_LITERAL && token != INTEGER_LITERAL) symbols += Symbol(lineNum, token); else #endif symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem); } else { // Preprocessor const char *lexem = data; int state = 0; Token token = NOTOKEN; if (mode == TokenizePreprocessorStatement) { state = pp_keyword_trans[0][(int)'#']; mode = TokenizePreprocessor; } for (;;) { if (static_cast<signed char>(*data) < 0) { ++data; continue; } int nextindex = pp_keywords[state].next; int next = 0; if (*data == pp_keywords[state].defchar) next = pp_keywords[state].defnext; else if (!state || nextindex) next = pp_keyword_trans[nextindex][(int)*data]; if (!next) break; state = next; token = pp_keywords[state].token; ++data; } // suboptimal, is_ident_char should use a table if (pp_keywords[state].ident && is_ident_char(*data)) token = pp_keywords[state].ident; switch (token) { case NOTOKEN: ++data; break; case PP_IFDEF: symbols += Symbol(lineNum, PP_IF); symbols += Symbol(lineNum, PP_DEFINED); continue; case PP_IFNDEF: symbols += Symbol(lineNum, PP_IF); symbols += Symbol(lineNum, PP_NOT); symbols += Symbol(lineNum, PP_DEFINED); continue; case PP_INCLUDE: mode = TokenizeInclude; break; case PP_QUOTE: data = skipQuote(data); token = PP_STRING_LITERAL; break; case PP_SINGLEQUOTE: while (*data && (*data != '\'' || (*(data-1)=='\\' && *(data-2)!='\\'))) ++data; if (*data) ++data; token = PP_CHARACTER_LITERAL; break; case PP_DIGIT: while (is_digit_char(*data)) ++data; if (!*data || *data != '.') { token = PP_INTEGER_LITERAL; if (data - lexem == 1 && (*data == 'x' || *data == 'X') && *lexem == '0') { ++data; while (is_hex_char(*data)) ++data; } break; } token = PP_FLOATING_LITERAL; ++data; // fall through case PP_FLOATING_LITERAL: while (is_digit_char(*data)) ++data; if (*data == '+' || *data == '-') ++data; if (*data == 'e' || *data == 'E') { ++data; while (is_digit_char(*data)) ++data; } if (*data == 'f' || *data == 'F' || *data == 'l' || *data == 'L') ++data; break; case PP_CHARACTER: if (mode == PreparePreprocessorStatement) { // rewind entire token to begin data = lexem; mode = TokenizePreprocessorStatement; continue; } while (is_ident_char(*data)) ++data; token = PP_IDENTIFIER; break; case PP_C_COMMENT: if (*data) { if (*data == '\n') ++lineNum; ++data; if (*data) { if (*data == '\n') ++lineNum; ++data; } } while (*data && (*(data-1) != '/' || *(data-2) != '*')) { if (*data == '\n') ++lineNum; ++data; } token = PP_WHITESPACE; // one comment, one whitespace // fall through; case PP_WHITESPACE: while (*data && (*data == ' ' || *data == '\t')) ++data; continue; // the preprocessor needs no whitespace case PP_CPP_COMMENT: while (*data && *data != '\n') ++data; continue; // ignore safely, the newline is a separator case PP_NEWLINE: ++lineNum; mode = TokenizeCpp; break; case PP_BACKSLASH: { const char *rewind = data; while (*data && (*data == ' ' || *data == '\t')) ++data; if (*data && *data == '\n') { ++data; continue; } data = rewind; } break; case PP_LANGLE: if (mode != TokenizeInclude) break; token = PP_STRING_LITERAL; while (*data && *data != '\n' && *(data-1) != '>') ++data; break; default: break; } if (mode == PreparePreprocessorStatement) continue; #ifdef USE_LEXEM_STORE if (token != PP_IDENTIFIER && token != PP_STRING_LITERAL && token != PP_FLOATING_LITERAL && token != PP_INTEGER_LITERAL) symbols += Symbol(lineNum, token); else #endif symbols += Symbol(lineNum, token, input, lexem-begin, data-lexem); } } symbols += Symbol(); // eof symbol return symbols; }
void RE2NFA::tokenize(const QString &input) { symbols.clear(); #if 1 RegExpTokenizer tokenizer(input); Symbol sym; int tok = tokenizer.lex(); while (tok != -1) { Symbol sym; sym.token = static_cast<Token>(tok); sym.lexem = input.mid(tokenizer.lexemStart, tokenizer.lexemLength); if (sym.token == TOK_QUOTED_STRING) { sym.lexem.chop(1); sym.lexem.remove(0, 1); sym.token = TOK_STRING; } if (sym.token == TOK_STRING || sym.token == TOK_SEQUENCE) { for (int i = 0; i < sym.lexem.length(); ++i) { if (sym.lexem.at(i) == '\\') { if (i >= sym.lexem.length() - 1) break; QChar ch = sym.lexem.at(i + 1); if (ch == QLatin1Char('n')) { ch = '\n'; } else if (ch == QLatin1Char('r')) { ch = '\r'; } else if (ch == QLatin1Char('t')) { ch = '\t'; } else if (ch == QLatin1Char('f')) { ch = '\f'; } sym.lexem.replace(i, 2, ch); } } } /* if (sym.token == TOK_SEQUENCE) { Symbol s; s.token = TOK_LBRACKET; s.lexem = "["; symbols.append(s); for (int i = 1; i < sym.lexem.length() - 1; ++i) { s.token = TOK_STRING; s.lexem = sym.lexem.at(i); symbols.append(s); } s.token = TOK_RBRACKET; s.lexem = "]"; symbols.append(s); tok = tokenizer.lex(); continue; } */ symbols.append(sym); tok = tokenizer.lex(); } #else int pos = 0; bool insideSet = false; while (pos < input.length()) { QChar ch = input.at(pos); Symbol sym; sym.column = pos; sym.token = TOK_INVALID; sym.lexem = QString(ch); switch (ch.toLatin1()) { case '"': { if (insideSet) { sym.token = TOK_STRING; sym.lexem = QString(ch); symbols += sym; ++pos; continue; } if (pos + 1 >= input.length()) return; int quoteEnd = skipQuote(input, pos + 1); sym.token = TOK_STRING; sym.lexem = input.mid(pos + 1, quoteEnd - pos - 2); symbols += sym; pos = quoteEnd; continue; } case '{': sym.token = (insideSet ? TOK_STRING : TOK_LBRACE); break; case '}': sym.token = (insideSet ? TOK_STRING : TOK_RBRACE); break; case '[': insideSet = true; sym.token = TOK_LBRACKET; break; case ']': insideSet = false; sym.token = TOK_RBRACKET; break; case '(': sym.token = (insideSet ? TOK_STRING : TOK_LPAREN); break; case ')': sym.token = (insideSet ? TOK_STRING : TOK_RPAREN); break; case ',': sym.token = (insideSet ? TOK_STRING : TOK_COMMA); break; case '*': sym.token = (insideSet ? TOK_STRING : TOK_STAR); break; case '|': sym.token = (insideSet ? TOK_STRING : TOK_OR); break; case '?': sym.token = (insideSet ? TOK_STRING : TOK_QUESTION); break; case '.': sym.token = (insideSet ? TOK_STRING : TOK_DOT); break; case '+': sym.token = (insideSet ? TOK_STRING : TOK_PLUS); break; case '\\': ++pos; if (pos >= input.length()) return; ch = input.at(pos); if (ch == QLatin1Char('n')) { ch = '\n'; } else if (ch == QLatin1Char('r')) { ch = '\r'; } else if (ch == QLatin1Char('t')) { ch = '\t'; } else if (ch == QLatin1Char('f')) { ch = '\f'; } // fall through default: sym.token = TOK_STRING; sym.lexem = QString(ch); symbols += sym; ++pos; continue; } symbols += sym; ++pos; } #endif #if 0 foreach (Symbol s, symbols) { qDebug() << "Tok" << tokStr(s.token) << "lexem" << s.lexem; }