コード例 #1
0
ファイル: scanner.cpp プロジェクト: stephenchouca/simit
/// Tokenizes the whole program text available on `programStream`.
///
/// Characters are consumed until the stream reports EOF. Identifier-like
/// words (classified through getTokenType()), punctuation and operators,
/// string literals and numeric literals are appended to the returned
/// TokenStream together with line/column positions that start at 1.
/// Whitespace and `%` comments produce no tokens.
///
/// Test directives:
///   - `%!`  emits a TEST token and enters the single-line test state, which
///           is left again at the next line break;
///   - `%{!` emits a TEST token and enters the multi-line test state, which
///           is closed by `%}`.
///
/// Lexical problems are passed to reportError() and scanning carries on. A
/// test that is still open at end of input is reported as "unclosed test".
/// An END token is always appended last.
///
/// @param programStream  stream the program text is read from
/// @return               the tokens found, terminated by an END token
TokenStream Scanner::lex(std::istream &programStream) {
  TokenStream tokens;
  unsigned line = 1;
  unsigned col = 1;
  ScanState state = ScanState::INITIAL;

  // Consumes one character and emits it as a one-character token.
  const auto emitSingle = [&](Token::Type type) {
    programStream.get();
    tokens.addToken(type, line, col);
    ++col;
  };

  // Consumes one character; when it is directly followed by `suffix` the
  // two-character token `pairType` is emitted, otherwise `singleType`.
  const auto emitSingleOrPair = [&](char suffix, Token::Type pairType,
                                    Token::Type singleType) {
    programStream.get();
    if (programStream.peek() == suffix) {
      programStream.get();
      tokens.addToken(pairType, line, col, 2);
      col += 2;
    } else {
      tokens.addToken(singleType, line, col);
      ++col;
    }
  };

  // Error recovery: drops everything up to the next whitespace (or EOF).
  const auto skipToWhitespace = [&]() {
    while (programStream.peek() != EOF &&
           !std::isspace(programStream.peek())) {
      programStream.get();
      ++col;
    }
  };

  // Reports the pending character as unexpected, then resynchronizes.
  const auto rejectUnexpectedSymbol = [&]() {
    std::stringstream errMsg;
    errMsg << "unexpected symbol '"
           << static_cast<char>(programStream.peek()) << "'";
    reportError(errMsg.str(), line, col);
    skipToWhitespace();
  };

  // Appends the run of decimal digits at the stream head to `text`.
  const auto consumeDigits = [&](std::string &text) {
    while (std::isdigit(programStream.peek())) {
      text += static_cast<char>(programStream.get());
      ++col;
    }
  };

  // Bookkeeping shared by every line terminator.
  const auto startNewLine = [&]() {
    if (state == ScanState::SLTEST) {
      state = ScanState::INITIAL;
    }
    ++line;
    col = 1;
  };

  while (programStream.peek() != EOF) {
    const int next = programStream.peek();

    if (next == '_' || std::isalpha(next)) {
      // Identifier or keyword: [_A-Za-z][_A-Za-z0-9]*
      std::string tokenString(1, static_cast<char>(programStream.get()));

      while (programStream.peek() == '_' ||
             std::isalnum(programStream.peek())) {
        tokenString += static_cast<char>(programStream.get());
      }

      Token newToken;
      newToken.type = getTokenType(tokenString);
      newToken.lineBegin = line;
      newToken.colBegin = col;
      newToken.lineEnd = line;
      newToken.colEnd = col + tokenString.length() - 1;
      if (newToken.type == Token::Type::IDENT) {
        // Only identifiers keep their spelling.
        newToken.str = tokenString;
      }
      tokens.addToken(newToken);

      col += tokenString.length();
      continue;
    }

    switch (next) {
      case '(':
        emitSingle(Token::Type::LP);
        break;
      case ')':
        emitSingle(Token::Type::RP);
        break;
      case '[':
        emitSingle(Token::Type::LB);
        break;
      case ']':
        emitSingle(Token::Type::RB);
        break;
      case '{':
        emitSingle(Token::Type::LC);
        break;
      case '}':
        emitSingle(Token::Type::RC);
        break;
      case '<':
        emitSingleOrPair('=', Token::Type::LE, Token::Type::LA);
        break;
      case '>':
        emitSingleOrPair('=', Token::Type::GE, Token::Type::RA);
        break;
      case ',':
        emitSingle(Token::Type::COMMA);
        break;
      case '.':
        programStream.get();
        switch (programStream.peek()) {
          case '*':
            programStream.get();
            tokens.addToken(Token::Type::DOTSTAR, line, col, 2);
            col += 2;
            break;
          case '/':
            programStream.get();
            tokens.addToken(Token::Type::DOTSLASH, line, col, 2);
            col += 2;
            break;
          default:
            tokens.addToken(Token::Type::PERIOD, line, col);
            ++col;
            break;
        }
        break;
      case ':':
        emitSingle(Token::Type::COL);
        break;
      case ';':
        emitSingle(Token::Type::SEMICOL);
        break;
      case '=':
        emitSingleOrPair('=', Token::Type::EQ, Token::Type::ASSIGN);
        break;
      case '*':
        emitSingle(Token::Type::STAR);
        break;
      case '/':
        emitSingle(Token::Type::SLASH);
        break;
      case '\\':
        emitSingle(Token::Type::BACKSLASH);
        break;
      case '^':
        emitSingle(Token::Type::EXP);
        break;
      case '\'':
        emitSingle(Token::Type::TRANSPOSE);
        break;
      case '!':
        programStream.get();
        if (programStream.peek() == '=') {
          programStream.get();
          tokens.addToken(Token::Type::NE, line, col, 2);
          col += 2;
        } else {
          // A lone '!' is not an operator.
          reportError("unexpected symbol '!'", line, col++);
          skipToWhitespace();
        }
        break;
      case '%':
        programStream.get();
        switch (programStream.peek()) {
          case '!':
            // "%!": single-line test.
            programStream.get();
            tokens.addToken(Token::Type::TEST, line, col, 2);
            state = ScanState::SLTEST;
            col += 2;
            break;
          case '{':
            // The '{' has to be consumed before looking for '!'; otherwise
            // the "%{!" opener could never be recognized.
            programStream.get();
            if (programStream.peek() == '!') {
              // "%{!": multi-line test, three characters wide.
              programStream.get();
              tokens.addToken(Token::Type::TEST, line, col, 3);
              state = ScanState::MLTEST;
              col += 3;
            } else {
              // "%{ ... %}": block comment.
              col += 2;

              std::string comment;
              bool closed = false;
              while (programStream.peek() != EOF) {
                const char c = static_cast<char>(programStream.get());

                if (c == '%' && programStream.peek() == '}') {
                  programStream.get();
                  col += 2;
                  closed = true;
                  // TODO: emit COMMENT token
                  break;
                }

                if (c == '\n') {
                  ++line;
                  col = 1;
                } else {
                  ++col;
                }
                comment += c;
              }

              // Decided by the flag rather than by peeking for EOF, so that
              // a comment closed at the very end of the input is accepted.
              if (!closed) {
                reportError("unclosed comment", line, col);
              }
            }
            break;
          case '}':
          {
            // "%}": end of a multi-line test.
            programStream.get();
            if (state == ScanState::MLTEST) {
              state = ScanState::INITIAL;
            } else {
              reportError("could not find corresponding '%{!'", line, col);
            }
            col += 2;
            break;
          }
          default:
          {
            // Single-line comment: runs up to (not including) the line end.
            std::string comment;
            while (programStream.peek() != '\n' &&
                   programStream.peek() != EOF) {
              comment += static_cast<char>(programStream.get());
            }

            col += (comment.length() + 1);  // +1 for the leading '%'
            // TODO: emit COMMENT token
            break;
          }
        }
        break;
      case '"':
      {
        Token newToken;
        newToken.type = Token::Type::STRING_LITERAL;
        newToken.lineBegin = line;
        newToken.colBegin = col;

        programStream.get();  // opening quote
        ++col;

        while (programStream.peek() != EOF && programStream.peek() != '"') {
          if (programStream.peek() != '\\') {
            newToken.str += static_cast<char>(programStream.get());
            ++col;
            continue;
          }

          programStream.get();  // the backslash

          char escaped = '\0';
          bool recognized = true;
          switch (programStream.peek()) {
            case 'a':  escaped = '\a'; break;
            case 'b':  escaped = '\b'; break;
            case 'f':  escaped = '\f'; break;
            case 'n':  escaped = '\n'; break;
            case 'r':  escaped = '\r'; break;
            case 't':  escaped = '\t'; break;
            case 'v':  escaped = '\v'; break;
            case '\\': escaped = '\\'; break;
            case '\'': escaped = '\''; break;
            case '"':  escaped = '"';  break;
            case '?':  escaped = '\?'; break;
            default:   recognized = false; break;
          }

          if (recognized) {
            newToken.str += escaped;
            programStream.get();
            col += 2;
          } else {
            // Only the backslash is dropped; the character after it is
            // picked up as ordinary text by the next iteration.
            reportError("unrecognized escape sequence", line, col);
            ++col;
          }
        }

        newToken.lineEnd = line;
        newToken.colEnd = col;
        tokens.addToken(newToken);

        if (programStream.peek() == '"') {
          programStream.get();  // closing quote
          ++col;
        } else {
          reportError("unclosed string literal", line, col);
        }
        break;
      }
      case '\r':
        // "\r\n" counts as a single line break.
        programStream.get();
        if (programStream.peek() == '\n') {
          programStream.get();
        }
        startNewLine();
        break;
      case '\v':
      case '\f':
      case '\n':
        programStream.get();
        startNewLine();
        break;
      case ' ':
      case '\t':
        programStream.get();
        ++col;
        break;
      case '+':
        emitSingle(Token::Type::PLUS);
        break;
      case '-':
        emitSingleOrPair('>', Token::Type::RARROW, Token::Type::MINUS);
        break;
      default:
      {
        Token newToken;
        newToken.type = Token::Type::INT_LITERAL;
        newToken.lineBegin = line;
        newToken.colBegin = col;

        // '.' has its own case above, so anything that gets here and is not
        // a digit cannot start a token.
        if (!std::isdigit(next)) {
          rejectUnexpectedSymbol();
          break;
        }

        std::string tokenString;
        consumeDigits(tokenString);

        if (programStream.peek() == '.') {
          // Fractional part: '.' followed by at least one digit.
          newToken.type = Token::Type::FLOAT_LITERAL;
          tokenString += static_cast<char>(programStream.get());
          ++col;

          if (!std::isdigit(programStream.peek())) {
            rejectUnexpectedSymbol();
            break;
          }
          consumeDigits(tokenString);
        }

        if (programStream.peek() == 'e' || programStream.peek() == 'E') {
          // Exponent: e/E, optional sign, at least one digit.
          newToken.type = Token::Type::FLOAT_LITERAL;
          tokenString += static_cast<char>(programStream.get());
          ++col;

          if (programStream.peek() == '+' || programStream.peek() == '-') {
            tokenString += static_cast<char>(programStream.get());
            ++col;
          }

          if (!std::isdigit(programStream.peek())) {
            rejectUnexpectedSymbol();
            break;
          }
          consumeDigits(tokenString);
        }

        if (newToken.type == Token::Type::INT_LITERAL) {
          // The lexeme holds decimal digits only. Base 10 is given
          // explicitly so a leading zero is not taken as an octal prefix.
          newToken.num = std::strtol(tokenString.c_str(), nullptr, 10);
        } else {
          newToken.fnum = std::strtod(tokenString.c_str(), nullptr);
        }
        newToken.lineEnd = line;
        newToken.colEnd = col - 1;
        tokens.addToken(newToken);
        break;
      }
    }
  }

  if (state != ScanState::INITIAL) {
    reportError("unclosed test", line, col);
  }

  tokens.addToken(Token::Type::END, line, col);
  return tokens;
}