// This is really only used for testing bool tokeniseReservedWord(std::string::const_iterator& s, std::string::const_iterator& e, Token& tok) { std::string::const_iterator p = s; bool r = tokeniseIdentifier(p, e, tok) && tokeniseReservedWord(tok); if (r) s = p; return r; }
bool tokenise(std::string::const_iterator& s, std::string::const_iterator& e, Token& tok) { std::string::const_iterator t = s; // Hand constructed state machine recogniser enum { START, REJECT, IDENTIFIER, ZERO, DIGIT, HEXDIGIT_START, HEXDIGIT, OCTDIGIT, BINDIGIT_START, BINDIGIT, DECIMAL_START, DECIMAL, EXPONENT_SIGN, EXPONENT_START, EXPONENT, ACCEPT_IDENTIFIER, ACCEPT_INC, ACCEPT_NOINC } state = START; TokenType tokType = T_EOS; while (true) switch (state) { case START: if (t==e) {tok = Token(T_EOS, s, "<END>"); return true;} else if (std::isspace(*t)) {++t; ++s; continue;} else switch (*t) { case '(': tokType = T_LPAREN; state = ACCEPT_INC; continue; case ')': tokType = T_RPAREN; state = ACCEPT_INC; continue; case ',': tokType = T_COMMA; state = ACCEPT_INC; continue; case '+': tokType = T_PLUS; state = ACCEPT_INC; continue; case '-': tokType = T_MINUS; state = ACCEPT_INC; continue; case '*': tokType = T_MULT; state = ACCEPT_INC; continue; case '/': tokType = T_DIV; state = ACCEPT_INC; continue; case '=': tokType = T_EQUAL; state = ACCEPT_INC; continue; case '<': ++t; if (t==e || (*t!='>' && *t!='=')) {tokType = T_LESS; state = ACCEPT_NOINC; continue; } else {tokType = (*t=='>') ? 
T_NEQ : T_LSEQ; state = ACCEPT_INC; continue; } case '>': ++t; if (t==e || *t!='=') {tokType = T_GRT; state = ACCEPT_NOINC; continue;} else {tokType = T_GREQ; state = ACCEPT_INC; continue;} default: break; } if (isIdentifierStart(*t)) {++t; state = IDENTIFIER;} else if (*t=='\'') {return processString(s, e, '\'', T_STRING, tok);} else if (*t=='\"') {return processString(s, e, '\"', T_IDENTIFIER, tok);} else if (*t=='0') {++t; state = ZERO;} else if (std::isdigit(*t)) {++t; state = DIGIT;} else if (*t=='.') {++t; state = DECIMAL_START;} else state = REJECT; continue; case IDENTIFIER: if (t==e) {state = ACCEPT_IDENTIFIER;} else if (isIdentifierPart(*t)) {++t; state = IDENTIFIER;} else state = ACCEPT_IDENTIFIER; continue; case DECIMAL_START: if (t==e) {state = REJECT;} else if (std::isdigit(*t)) {++t; state = DECIMAL;} else state = REJECT; continue; case EXPONENT_SIGN: if (t==e) {state = REJECT;} else if (*t=='-' || *t=='+') {++t; state = EXPONENT_START;} else if (std::isdigit(*t)) {++t; state = EXPONENT;} else state = REJECT; continue; case EXPONENT_START: if (t==e) {state = REJECT;} else if (std::isdigit(*t)) {++t; state = EXPONENT;} else state = REJECT; continue; case ZERO: if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} else if (*t=='.') {++t; state = DECIMAL;} else if (*t=='x' || *t=='X') {++t; state = HEXDIGIT_START;} else if (*t=='b' || *t=='B') {++t; state = BINDIGIT_START;} else state = OCTDIGIT; continue; case HEXDIGIT_START: if (t==e) {state = REJECT;} else if (std::isxdigit(*t)) {++t; state = HEXDIGIT;} else state = REJECT; continue; case HEXDIGIT: if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} else if (std::isxdigit(*t) || *t=='_') {++t; state = HEXDIGIT;} else if (*t=='p' || *t=='P') {++t; state = EXPONENT_SIGN;} else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} continue; case BINDIGIT_START: if (t==e) {state = REJECT;} else if (*t=='0' || 
*t=='1') {++t; state = BINDIGIT;} else state = REJECT; continue; case BINDIGIT: if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} else if (*t=='0' || *t=='1' || *t=='_') {++t; state = BINDIGIT;} else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} continue; case OCTDIGIT: if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} else if ((std::isdigit(*t) && *t<'8') || *t=='_') {++t; state = OCTDIGIT;} else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} continue; case DIGIT: if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;} else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;} else if (std::isdigit(*t) || *t=='_') {++t; state = DIGIT;} else if (*t=='.') {++t; state = DECIMAL;} else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;} else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;} continue; case DECIMAL: if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} else if (std::isdigit(*t) || *t=='_') {++t; state = DECIMAL;} else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;} else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;} else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} continue; case EXPONENT: if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} else if (std::isdigit(*t)) {++t; state = EXPONENT;} else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;} else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;} continue; case ACCEPT_INC: ++t; case ACCEPT_NOINC: tok = Token(tokType, s, t); s = t; return true; case ACCEPT_IDENTIFIER: tok = Token(T_IDENTIFIER, s, t); s = t; tokeniseReservedWord(tok); return true; case REJECT: return 
false; }; }
// Reads an identifier via tokeniseIdentifier and, when one was read, lets
// tokeniseReservedWord(tok) inspect the resulting token. The outcome of the
// reserved-word check is deliberately discarded: the return value only says
// whether an identifier was read.
bool tokeniseIdentifierOrReservedWord(std::string::const_iterator& s, std::string::const_iterator& e, Token& tok)
{
    if (!tokeniseIdentifier(s, e, tok)) {
        return false;
    }

    (void) tokeniseReservedWord(tok);
    return true;
}