Example #1
0
  void MapTokens(const std::vector<std::string>& tokens,
                 size_t& row,
                 arma::Mat<eT>& matrix,
                 MapType& maps,
                 std::vector<Datatype>& types)
  {
    // MissingPolicy allows double type matrix only, because it uses NaN.
    static_assert(std::is_same<eT, double>::value, "You must use double type "
        " matrix in order to apply MissingPolicy");

    std::stringstream token;
    for (size_t i = 0; i != tokens.size(); ++i)
    {
      token.str(tokens[i]);
      token>>matrix.at(row, i);
      // if the token is not number, map it.
      // or if token is a number, but is included in the missingSet, map it.
      if (token.fail() || missingSet.find(tokens[i]) != std::end(missingSet))
      {
        const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps,
                                                       types));
        matrix.at(row, i) = val;
      }
      token.clear();
    }
  }
Example #2
0
bool Load(const std::string& filename,
          arma::Mat<eT>& matrix,
          DatasetInfo& info,
          const bool fatal,
          const bool transpose)
{
  // Get the extension and load as necessary.
  Timer::Start("loading_data");

  // Get the extension.
  std::string extension = Extension(filename);

  // Catch nonexistent files by opening the stream ourselves.
  std::fstream stream;
  stream.open(filename.c_str(), std::fstream::in);

  if (!stream.is_open())
  {
    Timer::Stop("loading_data");
    if (fatal)
      Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl;
    else
      Log::Warn << "Cannot open file '" << filename << "'; load failed."
          << std::endl;

    return false;
  }

  if (extension == "csv" || extension == "tsv" || extension == "txt")
  {
    // True if we're looking for commas; if false, we're looking for spaces.
    bool commas = (extension == "csv");

    std::string type;
    if (extension == "csv")
      type = "CSV data";
    else
      type = "raw ASCII-formatted data";

    Log::Info << "Loading '" << filename << "' as " << type << ".  "
        << std::flush;
    std::string separators;
    if (commas)
      separators = ",";
    else
      separators = " \t";

    // We'll load this as CSV (or CSV with spaces or tabs) according to
    // RFC4180.  So the first thing to do is determine the size of the matrix.
    std::string buffer;
    size_t cols = 0;

    std::getline(stream, buffer, '\n');
    // Count commas and whitespace in the line, ignoring anything inside
    // quotes.
    typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer;
    boost::escaped_list_separator<char> sep("\\", separators, "\"");
    Tokenizer tok(buffer, sep);
    for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i)
      ++cols;

    // Now count the number of lines in the file.  We've already counted the
    // first one.
    size_t rows = 1;
    while (!stream.eof() && !stream.bad() && !stream.fail())
    {
      std::getline(stream, buffer, '\n');
      if (!stream.fail())
        ++rows;
    }

    // Now we have the size.  So resize our matrix.
    if (transpose)
    {
      matrix.set_size(cols, rows);
      info = DatasetInfo(cols);
    }
    else
    {
      matrix.set_size(rows, cols);
      info = DatasetInfo(rows);
    }

    stream.close();
    stream.open(filename, std::fstream::in);

    // Extract line by line.
    std::stringstream token;
    size_t row = 0;
    while (!stream.bad() && !stream.fail() && !stream.eof())
    {
      std::getline(stream, buffer, '\n');

      // Look at each token.  Unfortunately we have to do this character by
      // character, because things may be escaped in quotes.
      Tokenizer lineTok(buffer, sep);
      size_t col = 0;
      for (Tokenizer::iterator it = lineTok.begin(); it != lineTok.end(); ++it)
      {
        // Attempt to extract as type eT.  If that fails, we'll assume it's a
        // string and map it (which may involve retroactively mapping everything
        // we've seen so far).
        token.clear();
        token.str(*it);

        eT val = eT(0);
        token >> val;

        if (token.fail())
        {
          // Conversion failed; but it may be a NaN or inf.  Armadillo has
          // convenient functions to check.
          if (!arma::diskio::convert_naninf(val, token.str()))
          {
            // We need to perform a mapping.
            const size_t dim = (transpose) ? col : row;
            if (info.Type(dim) == Datatype::numeric)
            {
              // We must map everything we have seen up to this point and change
              // the values in the matrix.
              if (transpose)
              {
                // Whatever we've seen so far has successfully mapped to an eT.
                // So we need to print it back to a string.  We'll use
                // Armadillo's functionality for that.
                for (size_t i = 0; i < row; ++i)
                {
                  std::stringstream sstr;
                  arma::arma_ostream::print_elem(sstr, matrix.at(i, col),
                      false);
                  eT newVal = info.MapString(sstr.str(), col);
                  matrix.at(i, col) = newVal;
                }
              }
              else
              {
                for (size_t i = 0; i < col; ++i)
                {
                  std::stringstream sstr;
                  arma::arma_ostream::print_elem(sstr, matrix.at(row, i),
                      false);
                  eT newVal = info.MapString(sstr.str(), row);
                  matrix.at(row, i) = newVal;
                }
              }
            }

            // Strip whitespace from either side of the string.
            std::string trimmedToken(token.str());
            boost::trim(trimmedToken);
            val = info.MapString(trimmedToken, dim);
          }
        }

        if (transpose)
          matrix(col, row) = val;
        else
          matrix(row, col) = val;

        ++col;
      }

      ++row;
    }
  }