/**
 * Reads the whole content available at \p url into a string.
 *
 * The location is opened through general_ifstream and consumed in 4096-byte
 * chunks for as long as the stream reports good().
 *
 * \param url Location to read from.
 * \return All bytes obtained from the stream, in the order they were read.
 *
 * Failures are reported through log_and_throw_io_failure: "Cannot open ..."
 * when the stream is not good() right after construction, and
 * "Read fail ..." when reading stopped without reaching end-of-file.
 */
std::string unity_global::__read__(const std::string& url) {
  general_ifstream fin(url);
  if (!fin.good()) {
    fin.close();
    log_and_throw_io_failure(std::string("Cannot open " + sanitize_url(url)));
  }

  std::stringstream ss;
  // Automatic storage rather than new[]/delete[]: there is nothing to release
  // on any exit path, including an exception escaping from the stream calls.
  char buf[4096];
  while (fin.good()) {
    fin.read(buf, sizeof(buf));
    // gcount() is the number of bytes the last read() actually delivered,
    // which is smaller than the buffer on the final chunk.
    const size_t bytes_read = fin.gcount();
    ss.write(buf, bytes_read);
  }

  // The loop ends when the stream is no longer good(); anything other than
  // end-of-file at this point is treated as a read error.
  if (!fin.eof()) {
    fin.close();
    log_and_throw_io_failure(std::string("Read fail " + sanitize_url(url)));
  }
  fin.close();
  return ss.str();
}
bool unity_sgraph::save_graph(std::string target, std::string format) { log_func_entry(); try { if (format == "binary") { dir_archive dir; dir.open_directory_for_write(target); dir.set_metadata("contents", "graph"); oarchive oarc(dir); if (dir.get_output_stream()->fail()) { log_and_throw_io_failure("Fail to write"); } save(oarc); dir.close(); } else if (format == "json") { save_sgraph_to_json(get_graph(), target); } else if (format == "csv") { save_sgraph_to_csv(get_graph(), target); } else { log_and_throw("Unable to save to format : " + format); } } catch (std::ios_base::failure& e) { std::string message = "Unable to save graph to " + sanitize_url(target) + ": " + e.what(); log_and_throw_io_failure(message); } catch (std::string& e) { std::string message = "Unable to save graph to " + sanitize_url(target) + ": " + e; log_and_throw(message); } catch (...) { std::string message = "Unable to save graph to " + sanitize_url(target) + ": Unknown Error."; log_and_throw(message); } return true; }
void unity_global::save_model(std::shared_ptr<model_base> model, const std::string& model_wrapper, const std::string& url) { logstream(LOG_INFO) << "Save model to " << sanitize_url(url) << std::endl; logstream(LOG_INFO) << "Model name: " << model->name() << std::endl; try { dir_archive dir; dir.open_directory_for_write(url); dir.set_metadata("contents", "model"); oarchive oarc(dir); oarc.write(CLASS_MAGIC_HEADER, strlen(CLASS_MAGIC_HEADER)); oarc << model->name(); oarc << model_wrapper; oarc << *model; if (dir.get_output_stream()->fail()) { std::string message = "Fail to write."; log_and_throw_io_failure(message); } dir.close(); } catch (std::ios_base::failure& e) { std::string message = "Unable to save model to " + sanitize_url(url) + ": " + e.what(); log_and_throw_io_failure(message); } catch (std::string& e) { log_and_throw(std::string("Unable to save model to ") + sanitize_url(url) + ": " + e); } catch (...) { log_and_throw(std::string("Unknown Error: Unable to save model to ") + sanitize_url(url)); } }
/**
 * Opens \p filename for writing.
 *
 * Written as a function-try-block so that exceptions thrown while
 * initializing the base class or members are caught here and re-reported
 * through log_and_throw_io_failure with a "Cannot open ..." message. The
 * handlers only use the constructor parameter \p filename.
 *
 * \param filename Location to open for writing.
 */
general_ofstream::general_ofstream(std::string filename)
try
    : general_ofstream_base(filename), opened_filename(filename) {
} catch (const std::exception& e) {
  log_and_throw_io_failure("Cannot open " + sanitize_url(filename) +
                           " for write. " + e.what());
} catch (const std::string& e) {
  // Caught by const reference: no copy of the thrown string is made inside
  // the handler.
  log_and_throw_io_failure("Cannot open " + sanitize_url(filename) +
                           " for write. " + e);
} catch (...) {
  log_and_throw_io_failure("Cannot open " + sanitize_url(filename));
}
/**
 * Opens \p filename for reading, forwarding \p gzip_compressed to the base
 * stream.
 *
 * Written as a function-try-block so that exceptions thrown while
 * initializing the base class or members are caught here and re-reported
 * through log_and_throw_io_failure with a "Cannot open ..." message. The
 * handlers only use the constructor parameter \p filename.
 *
 * \param filename        Location to open for reading.
 * \param gzip_compressed Passed unchanged to general_ifstream_base.
 */
general_ifstream::general_ifstream(std::string filename, bool gzip_compressed)
try
    : general_ifstream_base(filename, gzip_compressed),
      opened_filename(filename) {
} catch (const std::exception& e) {
  log_and_throw_io_failure("Cannot open " + sanitize_url(filename) +
                           " for read. " + e.what());
} catch (const std::string& e) {
  // Caught by const reference: no copy of the thrown string is made inside
  // the handler.
  log_and_throw_io_failure("Cannot open " + sanitize_url(filename) +
                           " for read. " + e);
} catch (...) {
  log_and_throw_io_failure("Cannot open " + sanitize_url(filename));
}
general_ifstream::general_ifstream(std::string filename) // this is the function try block syntax which, together with the member // function pointer syntax, is probably the ugliest C++ syntactic element // every conceieved. try :general_ifstream_base(filename), opened_filename(filename) { } catch (const std::exception& e) { log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for read. " + e.what()); } catch (std::string e) { log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for read. " + e); } catch (...) { log_and_throw_io_failure("Cannot open " + sanitize_url(filename)); }
bool unity_sgraph::load_graph(std::string target_dir) { log_func_entry(); try { dir_archive dir; dir.open_directory_for_read(target_dir); std::string contents; if (dir.get_metadata("contents", contents) == false || contents != "graph") { log_and_throw(std::string("Archive does not contain a graph.")); } iarchive iarc(dir); load(iarc); dir.close(); } catch (std::ios_base::failure& e) { std::string message = "Unable to load graph from " + sanitize_url(target_dir) + ": " + e.what(); log_and_throw_io_failure(message); } catch (std::string& e) { std::string message = "Unable to load graph from " + sanitize_url(target_dir) + ": " + e; log_and_throw(message); } catch (...) { std::string message = "Unable to load graph from " + sanitize_url(target_dir) + ": Unknown Error."; log_and_throw(message); } return true; }
/**
 * Loads a model from the directory archive at \p url.
 *
 * The archive must carry the metadata entry "contents" with the value
 * "model". The payload is read in this order: the magic header (compared
 * against CLASS_MAGIC_HEADER), the model name, the model wrapper string and
 * finally the model itself, which is deserialized into the object returned
 * by classes->get_toolkit_class(model_name).
 *
 * \param url Location of the directory archive.
 * \return A map with two entries: "model_base" (the loaded model pointer)
 *         and "model_wrapper" (the wrapper string as a flexible_type).
 *
 * Errors raised while loading are re-reported with an
 * "Unable to load model from <sanitized url>" message. I/O failures go
 * through log_and_throw_io_failure, everything else through log_and_throw.
 * NOTE(review): the function has no return statement after the handlers, so
 * it relies on log_and_throw / log_and_throw_io_failure never returning.
 */
variant_map_type unity_global::load_model(const std::string& url) {
  logstream(LOG_INFO) << "Load model from " << sanitize_url(url) << std::endl;
  try {
    dir_archive dir;
    dir.open_directory_for_read(url);
    std::string contents;
    if (dir.get_metadata("contents", contents) == false ||
        contents != "model") {
      log_and_throw(std::string("Archive does not contain a model."));
    }

    iarchive iarc(dir);
    std::string model_name;
    std::string model_wrapper;

    // The receiving buffer is sized from the header itself, so the read can
    // never run past it regardless of how long CLASS_MAGIC_HEADER is. It
    // starts zero-filled, so a short read leaves a value that cannot equal
    // the header.
    const size_t magic_header_size = strlen(CLASS_MAGIC_HEADER);
    std::string magic_header(magic_header_size, '\0');
    iarc.read(&magic_header[0], magic_header_size);
    if (magic_header != CLASS_MAGIC_HEADER) {
      log_and_throw(std::string("Invalid model file."));
    }

    iarc >> model_name;
    logstream(LOG_INFO) << "Model name: " << model_name << std::endl;
    iarc >> model_wrapper;
    std::shared_ptr<model_base> model_ptr =
        classes->get_toolkit_class(model_name);
    iarc >> *(model_ptr);
    // The stream state is inspected once, after all reads were issued.
    if (dir.get_input_stream()->fail()) {
      std::string message = "Fail to read.";
      log_and_throw_io_failure(message);
    }
    dir.close();

    variant_map_type ret;
    variant_set_value<std::shared_ptr<model_base>>(ret["model_base"],
                                                   model_ptr);
    flexible_type flex_model_wrapper = (flexible_type)model_wrapper;
    variant_set_value<flexible_type>(ret["model_wrapper"], flex_model_wrapper);
    return ret;
  } catch (const std::ios_base::failure& e) {
    std::string message = "Unable to load model from " + sanitize_url(url) +
                          ": " + e.what();
    log_and_throw_io_failure(message);
  } catch (const std::string& e) {
    log_and_throw(std::string("Unable to load model from ") +
                  sanitize_url(url) + ": " + e);
  } catch (const std::exception& e) {
    log_and_throw(std::string("Unable to load model from ") +
                  sanitize_url(url) + ": " + e.what());
  } catch (...) {
    log_and_throw(std::string("Unknown Error: Unable to load model from ") +
                  sanitize_url(url));
  }
}
/**
 * Writes \p content to the location \p url, replacing the stream's content.
 *
 * \param url     Destination location, opened through general_ofstream.
 * \param content Bytes to write.
 *
 * Failures are reported through log_and_throw_io_failure: "Cannot open ..."
 * when the stream is not good() right after construction, and
 * "Write fail ..." when the stream is no longer good() after the content
 * was written.
 */
void unity_global::__write__(const std::string& url,
                             const std::string& content) {
  general_ofstream fout(url);
  if (!fout.good()) {
    fout.close();
    log_and_throw_io_failure(std::string("Cannot open " + sanitize_url(url)));
  }

  fout << content;
  // Surface a failed write instead of silently returning as if the content
  // had been stored.
  if (!fout.good()) {
    fout.close();
    log_and_throw_io_failure(std::string("Write fail " + sanitize_url(url)));
  }
  fout.close();
}
void load(GraphType& g, std::string prefix, line_parser_type<GraphType> line_parser) { if (prefix.length() == 0) return; g.dc().full_barrier(); g.clear(); std::string directory_name; std::string original_path(prefix); boost::filesystem::path path(prefix); std::string search_prefix; if (boost::filesystem::is_directory(path)) { // if this is a directory // force a "/" at the end of the path // make sure to check that the path is non-empty. (you do not // want to make the empty path "" the root path "/" ) directory_name = path.native(); } else { directory_name = path.parent_path().native(); search_prefix = path.filename().native(); directory_name = (directory_name.empty() ? "." : directory_name); } std::vector<std::string> graph_files; fs_util::list_files_with_prefix(directory_name, search_prefix, graph_files); if (graph_files.size() == 0) { logstream(LOG_WARNING) << "No files found matching " << original_path << std::endl; } parallel_for(0, graph_files.size(), [&](size_t i) { if (i % g.numprocs() == g.procid()) { logstream(LOG_EMPH) << "Loading graph from file: " << graph_files[i] << std::endl; general_ifstream fin(graph_files[i]); if(!fin.good()) { log_and_throw_io_failure("Cannot open file: " + graph_files[i]); } const bool success = load_from_stream(g, graph_files[i], fin, line_parser); if(!success) { log_and_throw_io_failure("Fail parsing file: " + graph_files[i]); } } }); g.dc().full_barrier(); g.finalize(); } // end of load
void unity_sgraph::save_reference(std::string target_dir) const { dir_archive dir; dir.open_directory_for_write(target_dir); dir.set_metadata("contents", "graph"); oarchive oarc(dir); if (dir.get_output_stream()->fail()) { log_and_throw_io_failure("Fail to write"); } save_reference(oarc); dir.close(); }
size_t block_writer::write_block(size_t segment_id, size_t column_id, char* data, block_info block) { DASSERT_LT(segment_id, m_index_info.nsegments); DASSERT_LT(column_id, m_index_info.columns.size()); DASSERT_TRUE(m_output_files[segment_id] != nullptr); // try to compress the data size_t compress_bound = LZ4_compressBound(block.block_size); auto compression_buffer = m_buffer_pool.get_new_buffer(); compression_buffer->resize(compress_bound); char* cbuffer = compression_buffer->data(); size_t clen = compress_bound; clen = LZ4_compress(data, cbuffer, block.block_size); char* buffer_to_write = NULL; size_t buffer_to_write_len = 0; if (clen < COMPRESSION_DISABLE_THRESHOLD * block.block_size) { // compression has a benefit! block.flags |= LZ4_COMPRESSION; block.length = clen; buffer_to_write = cbuffer; buffer_to_write_len = clen; } else { // compression has no benefit! do not compress! // unset LZ4 block.flags &= (~(size_t)LZ4_COMPRESSION); block.length = block.block_size; buffer_to_write = data; buffer_to_write_len = block.block_size; } size_t padding = ((buffer_to_write_len + 4095) / 4096) * 4096 - buffer_to_write_len; ASSERT_LT(padding, 4096); // write! m_output_file_locks[segment_id].lock(); block.offset = m_output_bytes_written[segment_id]; m_output_bytes_written[segment_id] += buffer_to_write_len + padding; m_index_info.columns[column_id].segment_sizes[segment_id] += block.num_elem; m_output_files[segment_id]->write(buffer_to_write, buffer_to_write_len); m_output_files[segment_id]->write(padding_bytes, padding); m_blocks[segment_id][column_id].push_back(block); m_output_file_locks[segment_id].unlock(); m_buffer_pool.release_buffer(std::move(compression_buffer)); if (!m_output_files[segment_id]->good()) { log_and_throw_io_failure("Fail to write. Disk may be full."); } return buffer_to_write_len; }
void block_writer::emit_footer(size_t segment_id) { // prepare the footer // write out all the block headers oarchive oarc; oarc << m_blocks[segment_id]; m_output_files[segment_id]->write(oarc.buf, oarc.off); uint64_t footer_size = oarc.off; m_output_files[segment_id]->write(reinterpret_cast<char*>(&footer_size), sizeof(footer_size)); free(oarc.buf); if (!m_output_files[segment_id]->good()) { log_and_throw_io_failure("Fail to write. Disk may be full."); } }
/**
 * Moves the read position of the cache stream.
 *
 * When the data is not held in the in-memory array, the request is forwarded
 * unchanged to the underlying file source. Otherwise the new position is
 * computed from \p way:
 *  - beg: \p off
 *  - cur: current array position + \p off
 *  - end: array_size + \p off - 1
 *
 * \param off Offset relative to \p way.
 * \param way Reference point of the seek.
 * \return The new absolute position.
 *
 * A resulting position below 0 or at/after array_size is reported through
 * log_and_throw_io_failure ("Bad seek. Index out of range.").
 */
std::streampos cache_stream_source::seek(std::streamoff off,
                                         std::ios_base::seekdir way) {
  // File-backed data: let the file source handle the seek.
  if (!in_array) {
    return in_file->seek(off, way);
  }

  std::streampos target;
  switch (way) {
    case std::ios_base::beg:
      target = off;
      break;
    case std::ios_base::cur:
      target = std::streampos(array_cur_pos) + off;
      break;
    case std::ios_base::end:
      target = array_size + off - 1;
      break;
    default:
      // Any other value leaves the default-constructed position untouched.
      break;
  }

  const bool out_of_range =
      target < 0 || target >= (std::streampos)array_size;
  if (out_of_range) {
    log_and_throw_io_failure("Bad seek. Index out of range.");
  }

  array_cur_pos = target;
  return target;
}
/**
 * Opens \p url as either an input or an output stream, picking the backing
 * stream implementation from the URL:
 *  - "hdfs://..."                    -> graphlab::hdfs::fstream   (type HDFS)
 *  - fileio::get_cache_prefix()...   -> fileio cache streams      (type CACHE)
 *  - "s3://..."                      -> s3_fstream                (type STD)
 *  - anything else                   -> std::ifstream / std::ofstream on a
 *                                       local file
 *
 * For input streams the file size is recorded in m_file_size where the
 * backend provides it.
 *
 * \param url   Location to open.
 * \param mode  Must contain exactly one of std::ios_base::in and
 *              std::ios_base::out.
 * \param proxy Not used by this constructor.
 *
 * Invalid modes, an unparsable HDFS URL and files that cannot be opened are
 * reported through log_and_throw_io_failure.
 */
union_fstream::union_fstream(std::string url, std::ios_base::openmode mode,
                             std::string proxy)
    : url(url) {
  input_stream.reset();
  output_stream.reset();

  // Exactly one direction must be requested.
  const bool wants_input = (mode & std::ios_base::in);
  const bool wants_output = (mode & std::ios_base::out);
  if (wants_input && wants_output) {
    log_and_throw_io_failure(
        "Invalid union_fstream open mode: cannot be both in and out");
  } else if (!wants_input && !wants_output) {
    log_and_throw_io_failure(
        "Invalid union_fstream open mode: cannot be neither in nor out");
  }
  const bool is_output_stream = wants_output;

  if (boost::starts_with(url, "hdfs://")) {
    // HDFS file type.
    type = HDFS;
    std::string host, port, path;
    std::tie(host, port, path) = fileio::parse_hdfs_url(url);
    logstream(LOG_INFO) << "HDFS URL parsed: Host: " << host
                        << " Port: " << port
                        << " Path: " << path << std::endl;
    // All three components empty is treated as a failed parse.
    if (host.empty() && port.empty() && path.empty()) {
      log_and_throw_io_failure("Invalid hdfs url: " + url);
    }
    try {
      auto& hdfs_conn = graphlab::hdfs::get_hdfs(host, std::stoi(port));
      ASSERT_TRUE(hdfs_conn.good());
      if (is_output_stream) {
        output_stream.reset(new graphlab::hdfs::fstream(hdfs_conn, path, true));
      } else {
        input_stream.reset(new graphlab::hdfs::fstream(hdfs_conn, path, false));
        m_file_size = hdfs_conn.file_size(path);
      }
    } catch (...) {
      // Anything thrown while connecting or opening is reported uniformly.
      log_and_throw_io_failure("Unable to open " + url);
    }
  } else if (boost::starts_with(url, fileio::get_cache_prefix())) {
    // Cache file type.
    type = CACHE;
    if (is_output_stream) {
      output_stream.reset(new fileio::ocache_stream(url));
    } else {
      auto cache_in = std::make_shared<fileio::icache_stream>(url);
      // Read from the underlying stream when the cache stream exposes one,
      // otherwise from the cache stream itself.
      input_stream = (*cache_in)->get_underlying_stream();
      if (input_stream == nullptr) input_stream = cache_in;
      m_file_size = (*cache_in)->file_size();
      // Hold on to the cache stream object in addition to input_stream.
      original_input_stream_handle =
          std::static_pointer_cast<std::istream>(cache_in);
    }
  } else if (boost::starts_with(url, "s3://")) {
    // The S3 file type currently works by downloading/uploading a local
    // file, i.e. the s3 stream simply remaps a local file stream.
    type = STD;
    if (is_output_stream) {
      output_stream = std::make_shared<s3_fstream>(url, true);
    } else {
      auto s3_in = std::make_shared<s3_fstream>(url, false);
      // Same pattern as for cache streams above.
      input_stream = (*s3_in)->get_underlying_stream();
      if (input_stream == nullptr) input_stream = s3_in;
      m_file_size = (*s3_in)->file_size();
      original_input_stream_handle =
          std::static_pointer_cast<std::istream>(s3_in);
    }
  } else {
    // Must be a local file.
    if (is_output_stream) {
      // Output stream must be a local openable file.
      output_stream.reset(new std::ofstream(url, std::ofstream::binary));
      if (!output_stream->good()) {
        output_stream.reset();
        log_and_throw_io_failure("Cannot open " + url + " for writing");
      }
    } else {
      // Note: this assigns the constructor parameter `url`, which shadows
      // the member of the same name inside this body.
      url = file_download_cache::get_instance().get_file(url);
      input_stream.reset(new std::ifstream(url, std::ifstream::binary));
      if (!input_stream->good()) {
        input_stream.reset();
        log_and_throw_io_failure("Cannot open " + url + " for reading");
      }
      // Get the file size with a separate, short-lived stream so the read
      // position of input_stream is not disturbed.
      {
        std::ifstream size_probe(url.c_str(), std::ifstream::binary);
        if (size_probe.good()) {
          size_probe.seekg(0, std::ios::end);
          m_file_size = size_probe.tellg();
        }
      }
    }
  }

  // Whatever branch was taken, the selected stream must be usable now.
  if (is_output_stream) {
    ASSERT_TRUE(output_stream->good());
  } else {
    ASSERT_TRUE(input_stream->good());
  }
}
std::string file_download_cache::get_file(const std::string& url) { // first check if the file has been downloaded. // if it has, return the downloaded location lock.lock(); if (url_to_file.count(url)) { bool cache_dirty = false; if (boost::starts_with(url, "s3://")) { std::string last_modified = ""; try { last_modified = webstor::get_s3_file_last_modified(url); } catch (...) { lock.unlock(); throw; } if (last_modified != url_to_file[url].last_modified) { cache_dirty = true; } } if (!cache_dirty) { std::string ret = url_to_file[url].filename; lock.unlock(); return ret; } } lock.unlock(); // ok. we need to download the file if (boost::starts_with(url, "s3://")) { // if it is s3. std::string localfile = get_temp_name(); std::string message = webstor::download_from_s3(url, localfile, "").get(); size_t i = 0; // if message contains a permanentredirect error code, we need to try other endpoints. while (boost::algorithm::icontains(message, "PermanentRedirect") && i < webstor::S3_END_POINTS.size()) { message = webstor::download_from_s3(url, localfile, "", webstor::S3_END_POINTS[i]).get(); ++i; } if (!message.empty()) { // OK, we failed. Let's clean up anything that was downloaded // since the real file lives in S3. if(std::remove(localfile.c_str()) != 0) { logstream(LOG_WARNING) << "Could not delete failed cached file: " << localfile << std::endl; } log_and_throw_io_failure("Fail to download from " + webstor::sanitize_s3_url(url) + ". " + webstor::get_s3_error_code(message)); } lock.lock(); url_to_file[url].filename = localfile; try { url_to_file[url].last_modified = webstor::get_s3_file_last_modified(url); } catch (...) { lock.unlock(); throw; } lock.unlock(); return localfile; } else { // Ok, it is either local regular file, file:///, or remote urls http://. // For remote urls, download_url download it into to local file. // For local urls, download_url return as is. 
std::string localfile; int status; bool is_temp; std::tie(status, is_temp, localfile) = download_url(url); if (status) { log_and_throw_io_failure("Fail to download from " + url + ". " + get_curl_error_string(status)); } if (is_temp) { // if it is a remote file, we check the download status code lock.lock(); url_to_file[url].filename = localfile; url_to_file[url].last_modified = ""; lock.unlock(); return localfile; } else { // purely a local file. just return it return localfile; } } }