//////////////////////////////////////////////////////////////////////////////// // Lexer::Type::tag // ^ | <isWhiteSpace> [ +|- ] <isIdentifierStart> [ <isIdentifierNext> ]* bool Lexer::isTag (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; // Lookbehind: ^ | <isWhiteSpace> if (marker > 0 && ! isWhitespace (_text[marker - 1])) return false; if (_text[marker] == '+' || _text[marker] == '-') { ++marker; if (isIdentifierStart (_text[marker])) { utf8_next_char (_text, marker); while (isIdentifierNext (_text[marker])) utf8_next_char (_text, marker); token = _text.substr (_cursor, marker - _cursor); type = Lexer::Type::tag; _cursor = marker; return true; } } return false; }
// Returns TRUE when aChar may appear inside an identifier, i.e. when it is
// either an identifier-start character or a digit; FALSE otherwise.
NABoolean ComSqlTextHandle::isIdentifierPart(const char &aChar) const
{
  return (isIdentifierStart(aChar) OR isDigit(aChar)) ? TRUE : FALSE;
}
/*
 * Parses an identifier (or function name) at the tokenizer's current position.
 * The scanning itself is done by the less-strict `parseName`; this function
 * only classifies the result: a token whose last character is '(' is tagged
 * FUNCTION, anything else IDENTIFIER.
 */
Token *parseIdentifier( Tokenizer *tokenizer ) {
    assert( isIdentifierStart( tokenizer->ss_, 0 ) );

    Token *result = parseName( tokenizer );

    if ( result->value[ result->length - 1 ] == L'(' ) {
        result->type = FUNCTION;
    }
    else {
        result->type = IDENTIFIER;
    }
    return result;
}
// Smoke test for the identifier predicates: prints one "[OK]"/"[FAIL]" line
// per expectation.
int main()
{
    const char* const ok   = "[OK]";
    const char* const fail = "[FAIL]";
    bool passed = false;

    passed = isIdentifierStart('a');
    std::cout << "Ascii letters are identifier start " << (passed ? ok : fail) << std::endl;

    passed = isIdentifierPart('a');
    std::cout << "Ascii letters are identifier part " << (passed ? ok : fail) << std::endl;

    passed = !isIdentifierStart('0');
    std::cout << "Numbers are not identifier start " << (passed ? ok : fail) << std::endl;

    passed = isIdentifierPart('0');
    std::cout << "Numbers are identifier part " << (passed ? ok : fail) << std::endl;
}
/////////////////////////////////////////////////////////////////////////// // // Parsing Interface // Token *tokenizer_next( Tokenizer *tokenizer ) { wchar_t c, next; Token *token = NULL; next = ss_peek( tokenizer->ss_ ); while ( next != WEOF && !token ) { // Whitespace if ( isWhitespaceStart( tokenizer->ss_, 0 ) ) { token = parseWhitespace( tokenizer ); } // Strings else if ( isStringStart( tokenizer->ss_, 0 ) ) { token = parseString( tokenizer ); } // Comments else if ( isCommentStart( tokenizer->ss_, 0 ) ) { token = parseComment( tokenizer ); } // URL else if ( isUrlStart( tokenizer->ss_, 0 ) ) { token = parseUrl( tokenizer ); } // SGML Comments else if ( isSGMLCommentStart( tokenizer->ss_, 0 ) ) { token = parseSGMLComment( tokenizer ); } // Identifier else if ( isIdentifierStart( tokenizer->ss_, 0 ) ) { token = parseIdentifier( tokenizer ); } // @keyword else if ( isAtkeywordStart( tokenizer->ss_, 0 ) ) { token = parseAtkeyword( tokenizer ); } // #keyword else if ( isHashkeywordStart( tokenizer->ss_, 0 ) ) { token = parseHashkeyword( tokenizer ); } // Number else if ( isNumberStart( tokenizer->ss_, 0 ) ) { token = parseNumber( tokenizer ); } // Operators & Delims (everything else) else { token = parseEverythingElse( tokenizer ); } } if ( token ) { return token; } else { return NULL; } }
bool tokeniseIdentifier(std::string::const_iterator& s, std::string::const_iterator& e, Token& tok) { // Be sure that first char is alphanumeric or _ or $ if ( s==e || !isIdentifierStart(*s) ) return false; std::string::const_iterator t = s; while ( s!=e && isIdentifierPart(*++s) ); tok = Token(T_IDENTIFIER, t, s); return true; }
// Produces the next token, dispatching on the current lookahead character
// `next_`. Once end of input has been reached an END_OF_INPUT token with empty
// text is returned. Digits take precedence over identifier starts, which take
// precedence over end-of-line; everything else is parsed as an operator.
Token Lexer::nextToken()
{
    if (atEof_)
        return Token(TokenType::END_OF_INPUT, "");

    if (std::isdigit(next_))
        return parseNumber();

    if (isIdentifierStart(next_))
        return parseIdentifier();

    if (isEol(next_))
        return parseNewLine();

    return parseOperator();
}
//////////////////////////////////////////////////////////////////////////////// // Lexer::Type::identifier // <isIdentifierStart> [ <isIdentifierNext> ]* bool Lexer::isIdentifier (std::string& token, Lexer::Type& type) { std::size_t marker = _cursor; if (isIdentifierStart (_text[marker])) { utf8_next_char (_text, marker); while (isIdentifierNext (_text[marker])) utf8_next_char (_text, marker); token = _text.substr (_cursor, marker - _cursor); type = Lexer::Type::identifier; _cursor = marker; return true; } return false; }
/*
 * Parses a numeric token at the tokenizer's current position.
 *
 * Accepted shape: an optional leading '-', numeric characters, and at most one
 * '.'. A directly following '%' turns the token into a PERCENTAGE; a directly
 * following identifier start is consumed (as name characters) and turns it
 * into a DIMENSION. Otherwise the token is a plain NUMBER.
 */
Token *parseNumber( Tokenizer *tokenizer ) {
    StatefulString *input = tokenizer->ss_;
    assert( isNumberStart( input, 0 ) );

    int                    first     = input->next_index;
    int                    consumed  = 0;
    StatefulStringPosition begin     = input->next_position;
    StatefulStringPosition finish;
    TokenType              kind      = NUMBER;
    int                    seenPoint = 0;

    // Numeric part: digits, a '-' only as the very first character, and a
    // single '.'.
    for ( ;; ) {
        if ( !( isNumeric( ss_peek( input ) ) ||
                ( ss_peek( input ) == L'-' && consumed == 0 ) ||
                ( ss_peek( input ) == L'.' && !seenPoint ) ) ) {
            break;
        }

        wchar_t ch = ss_getchar( input );
        if ( ch == L'.' ) {
            seenPoint = 1;
        }
        consumed++;
    }

    // Is the number followed by a percentage?
    if ( ss_peek( input ) == L'%' ) {
        ss_getchar( input );
        consumed++;
        kind = PERCENTAGE;
    }
    // Is the number followed by a dimension?
    else if ( isIdentifierStart( input, 0 ) ) {
        for ( ; isNameChar( ss_peek( input ) ); consumed++ ) {
            ss_getchar( input );
        }
        kind = DIMENSION;
    }

    finish = input->next_position;
    return token_new( ss_substr( input, first, consumed ), consumed, kind, begin, finish );
}
/* Advances the lexer by one token and returns it (it is also stored in
 * lexer->cur_token).
 *
 * Leading whitespace and comments are consumed first. When any was found and
 * skip_whitspace is false, the whole run is reported as a single
 * TOKEN_WHITESPACE; callers need that to render function signatures properly.
 * Characters that do not start a recognized token are returned literally,
 * e.g. the token may be equal to the character '#'. */
static int advanceToken (lexerState *lexer, boolean skip_whitspace)
{
	boolean saw_blank = FALSE;

	lexer->line = getSourceLineNumber();
	lexer->pos = getInputFilePosition();

	/* Phase 1: swallow whitespace and comments. */
	for (;;)
	{
		if (lexer->cur_c == EOF)
			break;

		if (isWhitespace(lexer->cur_c))
		{
			scanWhitespace(lexer);
			saw_blank = TRUE;
			continue;
		}

		if (lexer->cur_c == '/' && (lexer->next_c == '/' || lexer->next_c == '*'))
		{
			scanComments(lexer);
			saw_blank = TRUE;
			continue;
		}

		if (saw_blank && !skip_whitspace)
		{
			lexer->cur_token = TOKEN_WHITESPACE;
			return lexer->cur_token;
		}
		break;
	}

	/* Phase 2: the real token starts here, so record its position. */
	lexer->line = getSourceLineNumber();
	lexer->pos = getInputFilePosition();

	if (lexer->cur_c == EOF)
	{
		lexer->cur_token = TOKEN_EOF;
		return lexer->cur_token;
	}

	if (lexer->cur_c == '"')
	{
		scanString(lexer);
		lexer->cur_token = TOKEN_STRING;
	}
	else if (lexer->cur_c == 'r' && (lexer->next_c == '#' || lexer->next_c == '"'))
	{
		scanRawString(lexer);
		lexer->cur_token = TOKEN_STRING;
	}
	else if (lexer->cur_c == '\'')
	{
		scanCharacterOrLifetime(lexer);
		lexer->cur_token = TOKEN_STRING;
	}
	else if (isIdentifierStart(lexer->cur_c))
	{
		scanIdentifier(lexer);
		lexer->cur_token = TOKEN_IDENT;
	}
	/* The shift tokens are not important for tag generation as such, but
	 * they would confuse the skipUntil code which tracks the <> pairs. */
	else if (lexer->cur_c == '>' && lexer->next_c == '>')
	{
		advanceNChar(lexer, 2);
		lexer->cur_token = TOKEN_RSHIFT;
	}
	else if (lexer->cur_c == '<' && lexer->next_c == '<')
	{
		advanceNChar(lexer, 2);
		lexer->cur_token = TOKEN_LSHIFT;
	}
	else if (lexer->cur_c == '-' && lexer->next_c == '>')
	{
		advanceNChar(lexer, 2);
		lexer->cur_token = TOKEN_RARROW;
	}
	else
	{
		/* Unrecognized start: the character itself is the token. */
		int literal = lexer->cur_c;
		advanceChar(lexer);
		lexer->cur_token = literal;
	}
	return lexer->cur_token;
}
// Returns true when `candidate` may appear after the first character of an
// identifier: any identifier-start character, or a decimal digit.
bool Lexer::isIdentifierPart(char candidate) const
{
    // std::isdigit requires a value representable as unsigned char (or EOF);
    // a negative plain char would be undefined behaviour, hence the cast.
    return isIdentifierStart(candidate)
        || std::isdigit(static_cast<unsigned char>(candidate)) != 0;
}
/*
 * Reads the next token from lex stream L.
 *
 * Returns the token code (a plain character or a T_* value) and stores any
 * AST node created for the token in *ast_ptr (also remembered in last_ast).
 *
 * Dispatch, in order:
 *   - codes >= 127 coming back from skipSpace are returned unchanged
 *   - a digit starts a base 10 number, '$' base 16, '%' base 2, "%%" base 4
 *   - an identifier start is parsed as an identifier
 *   - ':' is ":=" (T_ASSIGN), a ':'-prefixed identifier, or a bare ':'
 *   - '.' followed by an identifier start (gl_p2, DAT block only)
 *   - the longest run of operator characters found in reservedWords
 *   - '"' starts a string
 */
int getToken(LexStream *L, AST **ast_ptr)
{
    AST *ast = NULL;
    /* must be sampled before skipSpace runs */
    int lineStart = (L->eoln == 1);
    int c = skipSpace(L, &ast);

    if (c >= 127) {
        last_ast = ast;
        *ast_ptr = ast;
        return c;
    }

    if (safe_isdigit(c)) {
        lexungetc(L, c);
        ast = NewAST(AST_INTEGER, NULL, NULL);
        c = parseNumber(L, 10, &ast->d.ival);
        if (c == T_FLOATNUM) {
            ast->kind = AST_FLOAT;
        }
    } else if (c == '$') {
        ast = NewAST(AST_INTEGER, NULL, NULL);
        c = parseNumber(L, 16, &ast->d.ival);
    } else if (c == '%') {
        int radix = 4;

        ast = NewAST(AST_INTEGER, NULL, NULL);
        c = lexgetc(L);
        if (c != '%') {
            /* single '%': binary; give the lookahead back */
            lexungetc(L, c);
            radix = 2;
        }
        c = parseNumber(L, radix, &ast->d.ival);
    } else if (isIdentifierStart(c)) {
        lexungetc(L, c);
        c = parseIdentifier(L, &ast, NULL);
        /* if in pasm, and at start of line, restart temporary labels */
        if (c == T_IDENTIFIER && InDatBlock(L) && lineStart) {
            L->lastGlobal = ast->d.string;
        }
    } else if (c == ':') {
        int lookahead = lexgetc(L);

        if (lookahead == '=') {
            c = T_ASSIGN;
        } else {
            lexungetc(L, lookahead);
            if (!gl_p2 && isIdentifierStart(lookahead) && InDatBlock(L)) {
                c = parseIdentifier(L, &ast, L->lastGlobal ? L->lastGlobal : "");
            }
        }
    } else if (gl_p2 && c == '.' && isIdentifierStart(lexpeekc(L)) && InDatBlock(L)) {
        c = parseIdentifier(L, &ast, L->lastGlobal ? L->lastGlobal : "");
    } else if (strchr(operator_chars, c) != NULL) {
        char op[6];
        int pos;
        int best = c;
        Symbol *match = NULL;

        op[0] = best;
        /* extend the operator while the longer spelling is still a reserved word */
        for (pos = 1; pos < sizeof(op)-1; pos++) {
            c = lexgetc(L);
            if (c < 128 && strchr(operator_chars, c) != NULL) {
                op[pos] = c;
                op[pos+1] = 0;
                match = FindSymbol(&reservedWords, op);
                if (match) {
                    best = INTVAL(match);
                    continue;
                }
            }
            /* not an operator character, or not a known operator: stop here */
            lexungetc(L, c);
            break;
        }
        c = best;
    } else if (c == '"') {
        c = parseString(L, &ast);
    }

    last_ast = ast;
    *ast_ptr = ast;
    return c;
}
/* Returns 1 when c may appear inside an identifier (an identifier-start
 * character or a digit), 0 otherwise. */
int isIdentifierChar(int c)
{
    if (isIdentifierStart(c)) {
        return 1;
    }
    return safe_isdigit(c) ? 1 : 0;
}
// Reads one token from the range [s, e) into `tok`.
//
// Leading whitespace is skipped (advancing `s`). Returns true when a token was
// recognised and false when the input is rejected. On acceptance through the
// ACCEPT_* states `s` is moved to the end of the token. At end of input a
// T_EOS token is produced. Quoted text is delegated to processString: single
// quotes as T_STRING, double quotes as T_IDENTIFIER.
//
// `t` is the scan position; `s` stays at the start of the current token.
bool tokenise(std::string::const_iterator& s, std::string::const_iterator& e, Token& tok)
{
    std::string::const_iterator t = s;
    // Hand constructed state machine recogniser
    enum {
        START,
        REJECT,
        IDENTIFIER,
        ZERO,
        DIGIT,
        HEXDIGIT_START,
        HEXDIGIT,
        OCTDIGIT,
        BINDIGIT_START,
        BINDIGIT,
        DECIMAL_START,
        DECIMAL,
        EXPONENT_SIGN,
        EXPONENT_START,
        EXPONENT,
        ACCEPT_IDENTIFIER,
        ACCEPT_INC,
        ACCEPT_NOINC
    } state = START;
    TokenType tokType = T_EOS;
    // Every `continue` below restarts this loop and re-dispatches on `state`.
    while (true)
    switch (state) {
    // Skip whitespace, then classify the token by its first character.
    case START:
        if (t==e) {tok = Token(T_EOS, s, "<END>"); return true;}
        else if (std::isspace(*t)) {++t; ++s; continue;}
        else switch (*t) {
            case '(': tokType = T_LPAREN; state = ACCEPT_INC; continue;
            case ')': tokType = T_RPAREN; state = ACCEPT_INC; continue;
            case ',': tokType = T_COMMA; state = ACCEPT_INC; continue;
            case '+': tokType = T_PLUS; state = ACCEPT_INC; continue;
            case '-': tokType = T_MINUS; state = ACCEPT_INC; continue;
            case '*': tokType = T_MULT; state = ACCEPT_INC; continue;
            case '/': tokType = T_DIV; state = ACCEPT_INC; continue;
            case '=': tokType = T_EQUAL; state = ACCEPT_INC; continue;
            // '<' alone, or the two-character forms "<>" and "<="
            case '<': ++t;
                if (t==e || (*t!='>' && *t!='=')) {tokType = T_LESS; state = ACCEPT_NOINC; continue; }
                else {tokType = (*t=='>') ? T_NEQ : T_LSEQ; state = ACCEPT_INC; continue; }
            // '>' alone, or ">="
            case '>': ++t;
                if (t==e || *t!='=') {tokType = T_GRT; state = ACCEPT_NOINC; continue;}
                else {tokType = T_GREQ; state = ACCEPT_INC; continue;}
            default: break;
        }
        // Not a punctuation token: identifier, quoted text or number.
        if (isIdentifierStart(*t)) {++t; state = IDENTIFIER;}
        else if (*t=='\'') {return processString(s, e, '\'', T_STRING, tok);}
        else if (*t=='\"') {return processString(s, e, '\"', T_IDENTIFIER, tok);}
        else if (*t=='0') {++t; state = ZERO;}
        else if (std::isdigit(*t)) {++t; state = DIGIT;}
        else if (*t=='.') {++t; state = DECIMAL_START;}
        else state = REJECT;
        continue;
    // Inside an identifier: consume part characters until one does not match.
    case IDENTIFIER:
        if (t==e) {state = ACCEPT_IDENTIFIER;}
        else if (isIdentifierPart(*t)) {++t; state = IDENTIFIER;}
        else state = ACCEPT_IDENTIFIER;
        continue;
    // After a leading '.': at least one digit is required.
    case DECIMAL_START:
        if (t==e) {state = REJECT;}
        else if (std::isdigit(*t)) {++t; state = DECIMAL;}
        else state = REJECT;
        continue;
    // Just after the exponent marker: a sign or the first exponent digit.
    case EXPONENT_SIGN:
        if (t==e) {state = REJECT;}
        else if (*t=='-' || *t=='+') {++t; state = EXPONENT_START;}
        else if (std::isdigit(*t)) {++t; state = EXPONENT;}
        else state = REJECT;
        continue;
    // After the exponent sign: at least one digit is required.
    case EXPONENT_START:
        if (t==e) {state = REJECT;}
        else if (std::isdigit(*t)) {++t; state = EXPONENT;}
        else state = REJECT;
        continue;
    // After a leading '0': decimal point, hex ("0x"), binary ("0b"), else octal.
    case ZERO:
        if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        else if (*t=='.') {++t; state = DECIMAL;}
        else if (*t=='x' || *t=='X') {++t; state = HEXDIGIT_START;}
        else if (*t=='b' || *t=='B') {++t; state = BINDIGIT_START;}
        else state = OCTDIGIT;
        continue;
    // After "0x": at least one hex digit is required.
    case HEXDIGIT_START:
        if (t==e) {state = REJECT;}
        else if (std::isxdigit(*t)) {++t; state = HEXDIGIT;}
        else state = REJECT;
        continue;
    // Hex digits; '_' is accepted between digits, an l/L suffix ends the
    // token, and p/P introduces an exponent.
    case HEXDIGIT:
        if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;}
        else if (std::isxdigit(*t) || *t=='_') {++t; state = HEXDIGIT;}
        else if (*t=='p' || *t=='P') {++t; state = EXPONENT_SIGN;}
        else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        continue;
    // After "0b": at least one binary digit is required.
    case BINDIGIT_START:
        if (t==e) {state = REJECT;}
        else if (*t=='0' || *t=='1') {++t; state = BINDIGIT;}
        else state = REJECT;
        continue;
    // Binary digits, with '_' accepted and an optional l/L suffix.
    case BINDIGIT:
        if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;}
        else if (*t=='0' || *t=='1' || *t=='_') {++t; state = BINDIGIT;}
        else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        continue;
    // Octal digits (0-7), with '_' accepted and an optional l/L suffix.
    case OCTDIGIT:
        if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;}
        else if ((std::isdigit(*t) && *t<'8') || *t=='_') {++t; state = OCTDIGIT;}
        else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        continue;
    // Decimal integer part. An l/L suffix keeps the number exact; f/F/d/D
    // makes it approximate; '.' or e/E continue into the fractional/exponent
    // part.
    case DIGIT:
        if (t==e) {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        else if (*t=='l' || *t=='L') {tokType = T_NUMERIC_EXACT; state = ACCEPT_INC;}
        else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;}
        else if (std::isdigit(*t) || *t=='_') {++t; state = DIGIT;}
        else if (*t=='.') {++t; state = DECIMAL;}
        else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;}
        else {tokType = T_NUMERIC_EXACT; state = ACCEPT_NOINC;}
        continue;
    // Fractional part: always an approximate numeric.
    case DECIMAL:
        if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;}
        else if (std::isdigit(*t) || *t=='_') {++t; state = DECIMAL;}
        else if (*t=='e' || *t=='E') {++t; state = EXPONENT_SIGN;}
        else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;}
        else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;}
        continue;
    // Exponent digits: always an approximate numeric.
    case EXPONENT:
        if (t==e) {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;}
        else if (std::isdigit(*t)) {++t; state = EXPONENT;}
        else if (*t=='f' || *t=='F' || *t=='d' || *t=='D') {tokType = T_NUMERIC_APPROX; state = ACCEPT_INC;}
        else {tokType = T_NUMERIC_APPROX; state = ACCEPT_NOINC;}
        continue;
    // Accept including the character at `t`: consume it, then fall through
    // (deliberately) into ACCEPT_NOINC.
    case ACCEPT_INC:
        ++t;
    // Accept [s, t) as a token of type tokType and advance `s` past it.
    case ACCEPT_NOINC:
        tok = Token(tokType, s, t);
        s = t;
        return true;
    // Accept [s, t) as an identifier, then pass it to tokeniseReservedWord.
    case ACCEPT_IDENTIFIER:
        tok = Token(T_IDENTIFIER, s, t);
        s = t;
        tokeniseReservedWord(tok);
        return true;
    case REJECT:
        return false;
    };
}
int nextLexeme(char *s, int *length, int *type){ char *p = s; if(isEndOfInput(*p)){ *length = 0; *type = LT_INV; return -1; } else if(isNewLine(*p)){ *length = 1; *type = LT_NL; return 0; } else if(isWhiteSpace(*p)){ while(isWhiteSpace(*p)){ p++; } *length = p - s; *type = LT_WS; return 0; } else if(isNumberBegining(*p, *(p+1))){ while(isPPNumber(*p)){ if(isPPNumberExp(*p) && isPPNumberExpSign(*(p+1))){ p += 2; } else { p++; } } *length = p - s; *type = LT_NUM; return 0; } else if(isIdentifierStart(*p)){ while(isIdentifier(*p)){ p++; } *length = p - s; *type = LT_ID; return 0; } else if(*p == '\''){ p++; while(isSourceCharSet(*p) && *p != '\''){ if(!(*p == '\\' && *(p+1) == '\'')){ p++; } else { p += 2; } } if(*p == '\''){ p++; } else { fprintf(stderr, "invalid token: %02x\n", *p); exit(1); } *length = p - s; *type = LT_CC; return 0; } else if(*p == '\"'){ p++; while(isSourceCharSet(*p) && *p != '\"'){ if(!(*p == '\\' && *(p+1) == '\"')){ p++; } else { p += 2; } } if(*p == '\"'){ p++; } else { fprintf(stderr, "invalid string literal token\n"); exit(1); } *length = p - s; *type = LT_STRLIT; return 0; } else if(*p == '('){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ')'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ','){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ';'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '?'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '['){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == ']'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '{'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '}'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '~'){ p++; *length = 1; *type = LT_PUNC; return 0; } else if(*p == '!'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '%'){ p++; if(*p == ':'){ p++; *length = p - s; 
*type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '>'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '&'){ p++; if(*p == '&'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '*'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '+'){ p++; if(*p == '+'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '-'){ p++; if(*p == '-'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '>'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '.' && isDigit){ p++; if(*p == '.' 
&& *(p+1) == '.'){ p += 2; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '/'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == ':'){ p++; if(*p == '>'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '<'){ p++; if(*p == '%'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == ':'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '<'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '='){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '>'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '>'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '^'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else if(*p == '|'){ p++; if(*p == '='){ p++; *length = p - s; *type = LT_PUNC; return 0; } else if(*p == '|'){ p++; *length = p - s; *type = LT_PUNC; return 0; } else { *length = p - s; *type = LT_PUNC; return 0; } } else { p++; *length = p - s; *type = LT_PUNC; return 0; } return 0; }