Example #1
  std::string unity_global::__read__(const std::string& url) {
    general_ifstream fin(url);
    if (!fin.good()) {
      fin.close();
      log_and_throw_io_failure(std::string("Cannot open " + sanitize_url(url)));
    }

    std::stringstream ss;
    char* buf = new char[4096];
    while(fin.good()) {
      fin.read(buf, 4096);
      size_t bytes_read = fin.gcount();
      ss.write(buf, bytes_read);
    }

    delete[] buf;

    if (!fin.eof()) {
      fin.close();
      log_and_throw_io_failure(std::string("Read fail " + sanitize_url(url)));
    }

    fin.close();
    return ss.str();
  }
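
The chunked read above pairs a raw new[] with a delete[], which leaks the buffer if anything between them throws. Below is a minimal sketch of the same 4 KiB read loop with an RAII buffer; it assumes only the general_ifstream members used in the example (good, read, gcount, eof) plus sanitize_url and log_and_throw_io_failure from the surrounding project, and the function name read_all is hypothetical.

// Sketch: the same 4 KiB chunked-read loop as __read__ above, but with a
// std::vector buffer so the memory is released automatically even if an
// exception is thrown mid-loop. Assumes the project headers that provide
// general_ifstream, sanitize_url and log_and_throw_io_failure; everything
// else is standard C++.
#include <sstream>
#include <string>
#include <vector>

std::string read_all(const std::string& url) {
  general_ifstream fin(url);
  if (!fin.good()) {
    log_and_throw_io_failure("Cannot open " + sanitize_url(url));
  }
  std::stringstream ss;
  std::vector<char> buf(4096);
  while (fin.good()) {
    fin.read(buf.data(), buf.size());
    ss.write(buf.data(), fin.gcount());
  }
  if (!fin.eof()) {
    log_and_throw_io_failure("Read fail " + sanitize_url(url));
  }
  return ss.str();
}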
Example #2
bool unity_sgraph::save_graph(std::string target, std::string format) {
  log_func_entry();
  try {
    if (format == "binary") {
      dir_archive dir;
      dir.open_directory_for_write(target);
      dir.set_metadata("contents", "graph");
      oarchive oarc(dir);
      if (dir.get_output_stream()->fail()) {
        log_and_throw_io_failure("Fail to write");
      }
      save(oarc);
      dir.close();
    } else if (format == "json") {
      save_sgraph_to_json(get_graph(), target);
    } else if (format == "csv") {
      save_sgraph_to_csv(get_graph(), target);
    } else {
      log_and_throw("Unable to save to format : " + format);
    }
  } catch (std::ios_base::failure& e) {
    std::string message =
        "Unable to save graph to " + sanitize_url(target) + ": " + e.what();
    log_and_throw_io_failure(message);
  } catch (std::string& e) {
    std::string message =
        "Unable to save graph to " + sanitize_url(target) + ": " + e;
    log_and_throw(message);
  } catch (...) {
    std::string message =
        "Unable to save graph to " + sanitize_url(target) + ": Unknown Error.";
    log_and_throw(message);
  }
  return true;
}
Example #3
 void unity_global::save_model(std::shared_ptr<model_base> model,
                               const std::string& model_wrapper,
                               const std::string& url) {
   logstream(LOG_INFO) << "Save model to " << sanitize_url(url) << std::endl;
   logstream(LOG_INFO) << "Model name: " << model->name() << std::endl;
   try {
     dir_archive dir;
     dir.open_directory_for_write(url);
     dir.set_metadata("contents", "model");
     oarchive oarc(dir);
     oarc.write(CLASS_MAGIC_HEADER, strlen(CLASS_MAGIC_HEADER));
     oarc << model->name();
     oarc << model_wrapper;
     oarc << *model;
     if (dir.get_output_stream()->fail()) {
       std::string message = "Fail to write.";
       log_and_throw_io_failure(message);
     }
     dir.close();
   } catch (std::ios_base::failure& e) {
     std::string message = "Unable to save model to " + sanitize_url(url) + ": " + e.what();
     log_and_throw_io_failure(message);
   } catch (std::string& e) {
     log_and_throw(std::string("Unable to save model to ") + sanitize_url(url) + ": " + e);
   } catch (...) {
     log_and_throw(std::string("Unknown Error: Unable to save model to ") + sanitize_url(url));
   }
 }
Example #4
general_ofstream::general_ofstream(std::string filename)
    try : general_ofstream_base(filename), opened_filename(filename) {
    } catch (const std::exception& e) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for write. " + e.what());
    } catch (std::string e) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for write. " + e);
    } catch (...) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename));
    }
Example #5
general_ifstream::general_ifstream(std::string filename, bool gzip_compressed)
    try :general_ifstream_base(filename, gzip_compressed), 
     opened_filename(filename) { 
     } catch (const std::exception& e) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for read. " + e.what());
    } catch (std::string e) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for read. " + e);
    } catch (...) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename));
    } 
Example #6
general_ifstream::general_ifstream(std::string filename)
    // this is the function try block syntax which, together with the member
    // function pointer syntax, is probably the ugliest C++ syntactic element
    // ever conceived.
    try :general_ifstream_base(filename), opened_filename(filename) 
    { 
    }  catch (const std::exception& e) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for read. " + e.what());
    } catch (std::string e) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename) + " for read. " + e);
    } catch (...) {
      log_and_throw_io_failure("Cannot open " + sanitize_url(filename));
    } 
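
Examples #4 to #6 all use the constructor function-try-block that the comment above complains about: the try keyword comes before the member-initializer list, so exceptions thrown while constructing a base class or member can be caught and translated into a more descriptive I/O error. A minimal self-contained sketch of the syntax, with hypothetical types that are not part of the project:

// Sketch of a constructor function-try-block with hypothetical types.
// Exceptions thrown by base or member initializers land in the catch
// clause, which may translate them; if the handler does not throw, the
// original exception is rethrown automatically.
#include <stdexcept>
#include <string>

struct resource {
  explicit resource(const std::string& name) {
    if (name.empty()) throw std::runtime_error("empty name");
  }
};

class wrapper {
  resource res_;
 public:
  explicit wrapper(const std::string& name)
  try : res_(name) {
    // constructor body
  } catch (const std::exception& e) {
    // translate into a more descriptive error
    throw std::runtime_error("Cannot construct wrapper: " + std::string(e.what()));
  }
};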
Example #7
bool unity_sgraph::load_graph(std::string target_dir) {
  log_func_entry();
  try {
    dir_archive dir;
    dir.open_directory_for_read(target_dir);
    std::string contents;
    if (dir.get_metadata("contents", contents) == false ||
        contents != "graph") {
      log_and_throw(std::string("Archive does not contain a graph."));
    }
    iarchive iarc(dir);
    load(iarc);
    dir.close();
  } catch (std::ios_base::failure& e) {
    std::string message = "Unable to load graph from " + sanitize_url(target_dir)
      + ": " + e.what();
    log_and_throw_io_failure(message);
  } catch (std::string& e) {
    std::string message = "Unable to load graph from " + sanitize_url(target_dir)
      + ": " + e;
    log_and_throw(message);
  } catch (...) {
    std::string message = "Unable to load graph from " + sanitize_url(target_dir)
      + ": Unknown Error.";
    log_and_throw(message);
  }
  return true;
}
Example #8
  variant_map_type unity_global::load_model(const std::string& url) {
    logstream(LOG_INFO) << "Load model from " << sanitize_url(url) << std::endl;
    try {

      dir_archive dir;
      dir.open_directory_for_read(url);
      std::string contents;
      if (dir.get_metadata("contents", contents) == false || contents != "model") {
        log_and_throw(std::string("Archive does not contain a model."));
      }
      iarchive iarc(dir);

      std::string model_name;
      std::string model_wrapper;
      char buf[256] = "";
      size_t magic_header_size = strlen(CLASS_MAGIC_HEADER);
      iarc.read(buf, magic_header_size);
      if (strcmp(buf, CLASS_MAGIC_HEADER)) {
        log_and_throw(std::string("Invalid model file."));
      }
      iarc >> model_name;
      logstream(LOG_INFO) << "Model name: " << model_name << std::endl;
      iarc >> model_wrapper;
      std::shared_ptr<model_base> model_ptr = classes->get_toolkit_class(model_name);
      iarc  >> *(model_ptr);
      if (dir.get_input_stream()->fail()) {
        std::string message = "Fail to read.";
        log_and_throw_io_failure(message);
      }
      dir.close();
      variant_map_type ret;
      variant_set_value<std::shared_ptr<model_base>>(ret["model_base"], model_ptr);
      flexible_type flex_model_wrapper = (flexible_type)model_wrapper;
      variant_set_value<flexible_type>(ret["model_wrapper"], flex_model_wrapper);
      return ret;
    } catch (std::ios_base::failure& e) {
      std::string message = "Unable to load model from " + sanitize_url(url) + ": " + e.what();
      log_and_throw_io_failure(message);
    } catch (std::string& e) {
      log_and_throw(std::string("Unable to load model from ") + sanitize_url(url) + ": " + e);
    } catch (const std::exception& e) {
      log_and_throw(std::string("Unable to load model from ") + sanitize_url(url) + ": " + e.what());
    } catch (...) {
      log_and_throw(std::string("Unknown Error: Unable to load model from ") + sanitize_url(url));
    }
  }
Example #9
 void unity_global::__write__(const std::string& url, const std::string& content) {
   general_ofstream fout(url);
   if (!fout.good()) {
     fout.close();
     log_and_throw_io_failure(std::string("Cannot open " + sanitize_url(url)));
   }
   fout << content;
   fout.close();
 }
Example #10
    void load(GraphType& g, std::string prefix,
              line_parser_type<GraphType> line_parser) {
      if (prefix.length() == 0)
        return;
      g.dc().full_barrier();
      g.clear();
      std::string directory_name; std::string original_path(prefix);
      boost::filesystem::path path(prefix);
      std::string search_prefix;
      if (boost::filesystem::is_directory(path)) {
        // if this is a directory
        // force a "/" at the end of the path
        // make sure to check that the path is non-empty. (you do not
        // want to make the empty path "" the root path "/" )
        directory_name = path.native();
      } else {
        directory_name = path.parent_path().native();
        search_prefix = path.filename().native();
        directory_name = (directory_name.empty() ? "." : directory_name);
      }
      std::vector<std::string> graph_files;
      fs_util::list_files_with_prefix(directory_name, search_prefix, graph_files);
      if (graph_files.size() == 0) {
        logstream(LOG_WARNING) << "No files found matching " << original_path << std::endl;
      }

      parallel_for(0, graph_files.size(), [&](size_t i) {
        if (i % g.numprocs() == g.procid()) {
          logstream(LOG_EMPH) << "Loading graph from file: " << graph_files[i] << std::endl;
          general_ifstream fin(graph_files[i]);
          if(!fin.good()) {
            log_and_throw_io_failure("Cannot open file: " + graph_files[i]);
          }
          const bool success = load_from_stream(g, graph_files[i], fin, line_parser);
          if(!success) {
            log_and_throw_io_failure("Fail parsing file: " + graph_files[i]);
          }
        }
      });
      g.dc().full_barrier();
      g.finalize();
    } // end of load
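
The parallel_for in load assigns file i to the process whose id equals i % numprocs, so each process parses a disjoint, round-robin share of the matching files. A small sketch of that ownership rule in isolation (the names are illustrative, not project API):

// Sketch: the round-robin ownership test used in the load() loop above.
// Process `procid` out of `numprocs` handles exactly those files whose
// index i satisfies i % numprocs == procid.
#include <cstddef>
#include <string>
#include <vector>

std::vector<std::string> my_share(const std::vector<std::string>& files,
                                  size_t procid, size_t numprocs) {
  std::vector<std::string> mine;
  for (size_t i = 0; i < files.size(); ++i) {
    if (i % numprocs == procid) mine.push_back(files[i]);
  }
  return mine;
}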
Example #11
void unity_sgraph::save_reference(std::string target_dir) const {
  dir_archive dir;
  dir.open_directory_for_write(target_dir);
  dir.set_metadata("contents", "graph");
  oarchive oarc(dir);
  if (dir.get_output_stream()->fail()) {
    log_and_throw_io_failure("Fail to write");
  }
  save_reference(oarc);
  dir.close();
}
Example #12
size_t block_writer::write_block(size_t segment_id,
                                 size_t column_id, 
                                 char* data,
                                 block_info block) {
  DASSERT_LT(segment_id, m_index_info.nsegments);
  DASSERT_LT(column_id, m_index_info.columns.size());
  DASSERT_TRUE(m_output_files[segment_id] != nullptr);
  // try to compress the data
  size_t compress_bound = LZ4_compressBound(block.block_size);
  auto compression_buffer = m_buffer_pool.get_new_buffer();
  compression_buffer->resize(compress_bound);
  char* cbuffer = compression_buffer->data();
  size_t clen = compress_bound;
  clen = LZ4_compress(data, cbuffer, block.block_size);

  char* buffer_to_write = NULL;
  size_t buffer_to_write_len = 0;
  if (clen < COMPRESSION_DISABLE_THRESHOLD * block.block_size) {
    // compression has a benefit!
    block.flags |= LZ4_COMPRESSION;
    block.length = clen;
    buffer_to_write = cbuffer;
    buffer_to_write_len = clen;
  } else {
    // compression has no benefit! do not compress!
    // unset LZ4
    block.flags &= (~(size_t)LZ4_COMPRESSION);
    block.length = block.block_size;
    buffer_to_write = data;
    buffer_to_write_len = block.block_size;
  }

  size_t padding = ((buffer_to_write_len + 4095) / 4096) * 4096 - buffer_to_write_len;
  ASSERT_LT(padding, 4096);
  // write!
  m_output_file_locks[segment_id].lock();
  block.offset = m_output_bytes_written[segment_id];
  m_output_bytes_written[segment_id] += buffer_to_write_len + padding;
  m_index_info.columns[column_id].segment_sizes[segment_id] += block.num_elem;
  m_output_files[segment_id]->write(buffer_to_write, buffer_to_write_len);
  m_output_files[segment_id]->write(padding_bytes, padding);
  m_blocks[segment_id][column_id].push_back(block);
  m_output_file_locks[segment_id].unlock();

  m_buffer_pool.release_buffer(std::move(compression_buffer));

  if (!m_output_files[segment_id]->good()) {
    log_and_throw_io_failure("Fail to write. Disk may be full.");
  }
  return buffer_to_write_len;
}
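
write_block pads each compressed or raw block so that the next block starts on a 4096-byte boundary: ((len + 4095) / 4096) * 4096 rounds len up to the next multiple of 4096, and the padding is the difference, which is always less than 4096 (hence the ASSERT_LT). A tiny sketch of that arithmetic, with an illustrative helper name:

// Sketch: round a length up to the next 4096-byte boundary and compute
// the padding that write_block appends after the payload.
// e.g. len = 5000 -> rounded = 8192, padding = 3192; len = 4096 -> padding = 0.
#include <cstddef>

inline size_t padding_to_4096(size_t len) {
  size_t rounded = ((len + 4095) / 4096) * 4096;  // next multiple of 4096
  return rounded - len;                           // always < 4096
}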
Example #13
void block_writer::emit_footer(size_t segment_id) {
  // prepare the footer
  // write out all the block headers
  oarchive oarc;
  oarc << m_blocks[segment_id];
  m_output_files[segment_id]->write(oarc.buf, oarc.off);
  uint64_t footer_size = oarc.off;

  m_output_files[segment_id]->write(reinterpret_cast<char*>(&footer_size),
                                 sizeof(footer_size));
  free(oarc.buf);

  if (!m_output_files[segment_id]->good()) {
    log_and_throw_io_failure("Fail to write. Disk may be full.");
  }
}
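
emit_footer serializes the per-segment block headers and then appends their serialized length as a trailing uint64_t, so the footer can be located from the end of the segment file. The following is a hedged, generic sketch of reading such a length-suffixed footer with a plain std::ifstream; it is not the project's reader, and it assumes the file is opened in binary mode and was written with the same endianness.

// Sketch: locate a length-suffixed footer like the one emit_footer writes.
// The last 8 bytes hold footer_size; the footer itself is the footer_size
// bytes immediately before them. Purely illustrative.
#include <cstdint>
#include <fstream>
#include <vector>

std::vector<char> read_footer(std::ifstream& f) {
  f.seekg(-static_cast<std::streamoff>(sizeof(uint64_t)), std::ios::end);
  uint64_t footer_size = 0;
  f.read(reinterpret_cast<char*>(&footer_size), sizeof(footer_size));
  std::vector<char> footer(footer_size);
  f.seekg(-static_cast<std::streamoff>(sizeof(uint64_t) + footer_size), std::ios::end);
  f.read(footer.data(), footer.size());
  return footer;
}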
Example #14
std::streampos cache_stream_source::seek(std::streamoff off, std::ios_base::seekdir way) {
  if (in_array) {
    std::streampos newpos;
    if (way == std::ios_base::beg) {
      newpos = off;
    } else if (way == std::ios_base::cur) {
      newpos = (std::streampos)(array_cur_pos) + off;
    } else if (way == std::ios_base::end) {
      newpos = array_size + off - 1;
    }

    if (newpos < 0 || newpos >= (std::streampos)array_size) {
      log_and_throw_io_failure("Bad seek. Index out of range.");
    }

    array_cur_pos = newpos;
    return newpos;
  }

  return in_file->seek(off, way);
}
Example #15
/**
 * A simple union of std::fstream, graphlab::hdfs::fstream, and graphlab::fileio::cache_stream.
 */
union_fstream::union_fstream(std::string url,
                             std::ios_base::openmode mode,
                             std::string proxy) : url(url) {
  input_stream = NULL;
  output_stream = NULL;

  if ((mode & std::ios_base::in) && (mode & std::ios_base::out)) {
    // If the mode is both in and out, raise exception.
    log_and_throw_io_failure("Invalid union_fstream open mode: cannot be both in and out");
  }
  else if (!(mode & std::ios_base::in) && !(mode & std::ios_base::out)) {
    // If the mode is neither in nor out, raise exception.
    log_and_throw_io_failure("Invalid union_fstream open mode: cannot be neither in nor out");
  }

  bool is_output_stream = (mode & std::ios_base::out);

  if(boost::starts_with(url, "hdfs://")) {
    // HDFS file type
    type = HDFS;
    std::string host, port, path;
    std::tie(host, port, path) = fileio::parse_hdfs_url(url);
    logstream(LOG_INFO) << "HDFS URL parsed: Host: " << host << " Port: " << port
                        << " Path: " << path << std::endl;
    if (host.empty() && port.empty() && path.empty()) {
      log_and_throw_io_failure("Invalid hdfs url: " + url);
    }
    try {
      auto& hdfs = graphlab::hdfs::get_hdfs(host, std::stoi(port));
      ASSERT_TRUE(hdfs.good());
      if (is_output_stream) {
        output_stream.reset(new graphlab::hdfs::fstream(hdfs, path, true));
      } else {
        input_stream.reset(new graphlab::hdfs::fstream(hdfs, path, false));
        m_file_size = hdfs.file_size(path);
      }
    } catch(...) {
      log_and_throw_io_failure("Unable to open " + url);
    }
  } else if (boost::starts_with(url, fileio::get_cache_prefix())) {
    // Cache file type
    type = CACHE;
    if (is_output_stream) {
      output_stream.reset(new fileio::ocache_stream(url));
    } else {
      auto cachestream = std::make_shared<fileio::icache_stream>(url);
      input_stream = (*cachestream)->get_underlying_stream();
      if (input_stream == nullptr) input_stream = cachestream;
      m_file_size = (*cachestream)->file_size();
      original_input_stream_handle = std::static_pointer_cast<std::istream>(cachestream);
    }
  } else if (boost::starts_with(url, "s3://")) {
    // the S3 file type currently works by download/uploading a local file
    // i.e. the s3_stream simply remaps a local file stream
    type = STD;
    if (is_output_stream) {
      output_stream = std::make_shared<s3_fstream>(url, true);
    } else {
      auto s3stream = std::make_shared<s3_fstream>(url, false);
      input_stream = (*s3stream)->get_underlying_stream();
      if (input_stream == nullptr) input_stream = s3stream;
      m_file_size = (*s3stream)->file_size();
      original_input_stream_handle = std::static_pointer_cast<std::istream>(s3stream);
    }
  } else {
    // must be local file
    if (is_output_stream) {
      // Output stream must be a local openable file.
      output_stream.reset(new std::ofstream(url, std::ofstream::binary));
      if (!output_stream->good()) {
        output_stream.reset();
        log_and_throw_io_failure("Cannot open " + url + " for writing");
      }
    } else {
      url = file_download_cache::get_instance().get_file(url);
      input_stream.reset(new std::ifstream(url, std::ifstream::binary));
      if (!input_stream->good()) {
        input_stream.reset();
        log_and_throw_io_failure("Cannot open " + url + " for reading");
      }
      // get the file size
      {
        std::ifstream fin;
        fin.open(url.c_str(), std::ifstream::binary);
        if (fin.good()) {
          fin.seekg(0, std::ios::end);
          m_file_size = fin.tellg();
        }
      }
    }
  }

  if (is_output_stream) {
    ASSERT_TRUE(output_stream->good());
  } else {
    ASSERT_TRUE(input_stream->good());
  }
}
Example #16
std::string file_download_cache::get_file(const std::string& url) {
  // first check if the file has been downloaded.
  // if it has, return the downloaded location
  lock.lock();
  if (url_to_file.count(url)) {
    bool cache_dirty = false;
    if (boost::starts_with(url, "s3://")) {
      std::string last_modified = "";
      try {
        last_modified = webstor::get_s3_file_last_modified(url);
      } catch (...) {
        lock.unlock();
        throw;
      }
      if (last_modified != url_to_file[url].last_modified) {
        cache_dirty = true;
      }
    }
    if (!cache_dirty) {
      std::string ret = url_to_file[url].filename;
      lock.unlock();
      return ret;
    }
  }
  lock.unlock();

  // ok. we need to download the file
  if (boost::starts_with(url, "s3://")) {
    // if it is s3.
    std::string localfile = get_temp_name();
    std::string message = webstor::download_from_s3(url, localfile, "").get();
    size_t i = 0;
    // if message contains a permanentredirect error code, we need to try other endpoints.
    while (boost::algorithm::icontains(message, "PermanentRedirect") && i < webstor::S3_END_POINTS.size()) {
      message = webstor::download_from_s3(url, localfile, "", webstor::S3_END_POINTS[i]).get();
      ++i;
    }
    if (!message.empty()) {
      // OK, we failed.  Let's clean up anything that was downloaded
      // since the real file lives in S3.
      if(std::remove(localfile.c_str()) != 0) {
        logstream(LOG_WARNING) << "Could not delete failed cached file: " << localfile << std::endl;
      }
      log_and_throw_io_failure("Fail to download from " + webstor::sanitize_s3_url(url) + ". "
                                   + webstor::get_s3_error_code(message));
    }
    lock.lock();
    url_to_file[url].filename = localfile;
    try {
      url_to_file[url].last_modified = webstor::get_s3_file_last_modified(url);
    } catch (...) {
      lock.unlock();
      throw;
    }
    lock.unlock();
    return localfile;
  } else {
    // Ok, it is either local regular file, file:///, or remote urls http://.
    // For remote urls, download_url download it into to local file.
    // For local urls, download_url return as is.
    std::string localfile;
    int status; bool is_temp;
    std::tie(status, is_temp, localfile) = download_url(url);
    if (status) {
      log_and_throw_io_failure("Fail to download from " + url + 
                               ". " + get_curl_error_string(status));
    }
    if (is_temp) {
      // if it is a remote file, we check the download status code
      lock.lock();
      url_to_file[url].filename = localfile;
      url_to_file[url].last_modified = "";
      lock.unlock();
      return localfile;
    } else {
      // purely a local file. just return it
      return localfile;
    }
  }
}
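
get_file follows a check, download, record discipline: probe the cache while holding the lock, release the lock for the slow download, then re-acquire it to record the result (re-checking S3 freshness via the last-modified timestamp along the way). A stripped-down sketch of that locking pattern, using a std::mutex and hypothetical names rather than the project's types:

// Sketch of the locking discipline in file_download_cache::get_file:
// look up under the lock, do the slow download without holding it, then
// record the result under the lock again. Names and types are illustrative.
#include <map>
#include <mutex>
#include <string>

std::mutex cache_lock;
std::map<std::string, std::string> url_to_local;

std::string fetch(const std::string& url) {
  {
    std::lock_guard<std::mutex> guard(cache_lock);
    auto it = url_to_local.find(url);
    if (it != url_to_local.end()) return it->second;  // cache hit
  }
  // slow work happens with the lock released
  std::string localfile = /* download url to a temp file */ "/tmp/example";
  {
    std::lock_guard<std::mutex> guard(cache_lock);
    url_to_local[url] = localfile;  // another thread may have raced us; last write wins
  }
  return localfile;
}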