void MapTokens(const std::vector<std::string>& tokens, size_t& row, arma::Mat<eT>& matrix, MapType& maps, std::vector<Datatype>& types) { // MissingPolicy allows double type matrix only, because it uses NaN. static_assert(std::is_same<eT, double>::value, "You must use double type " " matrix in order to apply MissingPolicy"); std::stringstream token; for (size_t i = 0; i != tokens.size(); ++i) { token.str(tokens[i]); token>>matrix.at(row, i); // if the token is not number, map it. // or if token is a number, but is included in the missingSet, map it. if (token.fail() || missingSet.find(tokens[i]) != std::end(missingSet)) { const eT val = static_cast<eT>(this->MapString(tokens[i], row, maps, types)); matrix.at(row, i) = val; } token.clear(); } }
bool Load(const std::string& filename, arma::Mat<eT>& matrix, DatasetInfo& info, const bool fatal, const bool transpose) { // Get the extension and load as necessary. Timer::Start("loading_data"); // Get the extension. std::string extension = Extension(filename); // Catch nonexistent files by opening the stream ourselves. std::fstream stream; stream.open(filename.c_str(), std::fstream::in); if (!stream.is_open()) { Timer::Stop("loading_data"); if (fatal) Log::Fatal << "Cannot open file '" << filename << "'. " << std::endl; else Log::Warn << "Cannot open file '" << filename << "'; load failed." << std::endl; return false; } if (extension == "csv" || extension == "tsv" || extension == "txt") { // True if we're looking for commas; if false, we're looking for spaces. bool commas = (extension == "csv"); std::string type; if (extension == "csv") type = "CSV data"; else type = "raw ASCII-formatted data"; Log::Info << "Loading '" << filename << "' as " << type << ". " << std::flush; std::string separators; if (commas) separators = ","; else separators = " \t"; // We'll load this as CSV (or CSV with spaces or tabs) according to // RFC4180. So the first thing to do is determine the size of the matrix. std::string buffer; size_t cols = 0; std::getline(stream, buffer, '\n'); // Count commas and whitespace in the line, ignoring anything inside // quotes. typedef boost::tokenizer<boost::escaped_list_separator<char>> Tokenizer; boost::escaped_list_separator<char> sep("\\", separators, "\""); Tokenizer tok(buffer, sep); for (Tokenizer::iterator i = tok.begin(); i != tok.end(); ++i) ++cols; // Now count the number of lines in the file. We've already counted the // first one. size_t rows = 1; while (!stream.eof() && !stream.bad() && !stream.fail()) { std::getline(stream, buffer, '\n'); if (!stream.fail()) ++rows; } // Now we have the size. So resize our matrix. if (transpose) { matrix.set_size(cols, rows); info = DatasetInfo(cols); } else { matrix.set_size(rows, cols); info = DatasetInfo(rows); } stream.close(); stream.open(filename, std::fstream::in); // Extract line by line. std::stringstream token; size_t row = 0; while (!stream.bad() && !stream.fail() && !stream.eof()) { std::getline(stream, buffer, '\n'); // Look at each token. Unfortunately we have to do this character by // character, because things may be escaped in quotes. Tokenizer lineTok(buffer, sep); size_t col = 0; for (Tokenizer::iterator it = lineTok.begin(); it != lineTok.end(); ++it) { // Attempt to extract as type eT. If that fails, we'll assume it's a // string and map it (which may involve retroactively mapping everything // we've seen so far). token.clear(); token.str(*it); eT val = eT(0); token >> val; if (token.fail()) { // Conversion failed; but it may be a NaN or inf. Armadillo has // convenient functions to check. if (!arma::diskio::convert_naninf(val, token.str())) { // We need to perform a mapping. const size_t dim = (transpose) ? col : row; if (info.Type(dim) == Datatype::numeric) { // We must map everything we have seen up to this point and change // the values in the matrix. if (transpose) { // Whatever we've seen so far has successfully mapped to an eT. // So we need to print it back to a string. We'll use // Armadillo's functionality for that. for (size_t i = 0; i < row; ++i) { std::stringstream sstr; arma::arma_ostream::print_elem(sstr, matrix.at(i, col), false); eT newVal = info.MapString(sstr.str(), col); matrix.at(i, col) = newVal; } } else { for (size_t i = 0; i < col; ++i) { std::stringstream sstr; arma::arma_ostream::print_elem(sstr, matrix.at(row, i), false); eT newVal = info.MapString(sstr.str(), row); matrix.at(row, i) = newVal; } } } // Strip whitespace from either side of the string. std::string trimmedToken(token.str()); boost::trim(trimmedToken); val = info.MapString(trimmedToken, dim); } } if (transpose) matrix(col, row) = val; else matrix(row, col) = val; ++col; } ++row; } }