예제 #1
0
// Import hash data from a stream of JSON text, one record per line.
//
// Empty lines and lines beginning with '#' are skipped.  Every other line is
// handed to manager.import_json().  When that call returns a non-empty
// message the line is reported on std::cerr together with its 1-based line
// number; otherwise the progress tracker is advanced.
//
// Parameters:
//   manager           receives each JSON line
//   progress_tracker  advanced once per line that imported without error
//   in                input stream, read line by line until getline fails
void import_json(hashdb::import_manager_t& manager,
                 progress_tracker_t& progress_tracker,
                 std::istream& in) {
  std::string text;

  // row is the 1-based number of the line currently held in text
  for (size_t row = 1; getline(in, text); ++row) {

    // nothing to import from blank lines or comment lines
    if (text.empty() || text[0] == '#') {
      continue;
    }

    // hand the line to the manager; an empty reply means success
    const std::string failure = manager.import_json(text);
    if (failure.empty()) {
      progress_tracker.track();
      continue;
    }

    // report the rejected line and keep going
    std::cerr << "Invalid line " << row
              << " error: " << failure
              << ": '" << text << "'\n";
  }
}
예제 #2
0
// Import block hashes from tab-delimited text.
//
// Each data line is expected to hold three tab-separated fields: the file
// hexdigest, the block hexdigest, and a sector index that starts at 1 (the
// remainder of the line after the second tab).  Empty lines and lines
// beginning with '#' are skipped.  Malformed lines are reported on std::cerr
// with their 1-based line number and skipped.
//
// A file hash is imported only if manager.has_source() reports it absent the
// first time it is encountered in this call.  Lines whose file hash the
// manager already has are skipped without a message.
//
// Parameters:
//   manager            receives source data, source names and block hashes
//   repository_name    repository name recorded with each imported file hash
//   filename           filename recorded with each imported file hash
//   whitelist_manager  may be NULL; when set, a block hash for which it
//                      reports a count greater than zero is stored with
//                      the flag "w"
//   progress_tracker   advanced once per imported line
//   in                 input stream, read line by line until getline fails
void import_tab(hashdb::import_manager_t& manager,
                const std::string& repository_name,
                const std::string& filename,
                const hashdb::scan_manager_t* const whitelist_manager,
                progress_tracker_t& progress_tracker,
                std::istream& in) {

  // file hashes that were new to the manager when first seen in this call;
  // only lines for these file hashes are imported
  std::set<std::string> importable_sources;

  std::string line;
  size_t line_number = 0;
  while(getline(in, line)) {
    ++line_number;

    // skip empty lines and comment lines; emptiness is tested first so that
    // line[0] is never read from an empty string
    if (line.empty() || line[0] == '#') {
      continue;
    }

    // find tabs
    const size_t tab_index1 = line.find('\t');
    if (tab_index1 == std::string::npos) {
      std::cerr << "Tab not found on line " << line_number << ": '" << line << "'\n";
      continue;
    }
    const size_t tab_index2 = line.find('\t', tab_index1 + 1);
    if (tab_index2 == std::string::npos) {
      std::cerr << "Second tab not found on line " << line_number << ": '" << line << "'\n";
      continue;
    }

    // get file hash; an empty binary hash is treated as an invalid hexdigest
    const std::string file_hash_string = line.substr(0, tab_index1);
    const std::string file_binary_hash = hashdb::hex_to_bin(file_hash_string);
    if (file_binary_hash.size() == 0) {
      std::cerr << "file hexdigest is invalid on line " << line_number
                << ": '" << line << "', '" << file_hash_string << "'\n";
      continue;
    }

    // skip the file hash if it was preexisting else identify it as importable
    if (importable_sources.find(file_binary_hash) == importable_sources.end()) {
      // the file hash has not been seen yet so see if it is preexisting
      if (manager.has_source(file_binary_hash)) {
        // the file is preexisting so skip it
        continue;
      }
      // the file hash is new so identify it as importable
      importable_sources.insert(file_binary_hash);
    }
    // from here on file_binary_hash is always in importable_sources, so no
    // further membership test is needed

    // get block hash
    const std::string block_hashdigest_string = line.substr(
                                  tab_index1+1, tab_index2 - tab_index1 - 1);
    const std::string block_binary_hash =
                                  hashdb::hex_to_bin(block_hashdigest_string);
    if (block_binary_hash == "") {
      std::cerr << "Invalid block hash on line " << line_number
                << ": '" << line << "', '" << block_hashdigest_string << "'\n";
      continue;
    }

    // get file offset; sector_index is kept as uint64_t so the value from
    // s_to_uint64 is not narrowed where size_t is 32 bits wide
    const std::string sector_index_string = line.substr(tab_index2 + 1);
    const uint64_t sector_index = s_to_uint64(sector_index_string);
    if (sector_index == 0) {
      // index starts at 1 so 0 is invalid
      std::cerr << "Invalid sector index on line " << line_number
                << ": '" << line << "', '"
                << sector_index_string << "'\n";
      continue;
    }
    // sector_size is defined outside this function
    const uint64_t file_offset = (sector_index - 1) * sector_size;

    // mark with "w" if in whitelist
    std::string whitelist_flag = "";
    if (whitelist_manager != NULL) {
      if (whitelist_manager->find_hash_count(block_binary_hash) > 0) {
        whitelist_flag = "w";
      }
    }

    // add source data
    manager.insert_source_data(file_binary_hash, 0, "", 0, 0);

    // add name pair
    manager.insert_source_name(file_binary_hash, repository_name, filename);

    // add block hash
    manager.insert_hash(block_binary_hash, 0.0, whitelist_flag,
                        file_binary_hash, file_offset);

    // update progress tracker
    progress_tracker.track();
  }
}
예제 #3
0
  // Hash one source file and queue its contents for ingest.
  //
  // The file is read twice.  The first pass reads it in BUFFER_SIZE pieces
  // to calculate the source file hash.  The second pass cuts it into buffers
  // that start every BUFFER_DATA_SIZE bytes and pushes one ingest job per
  // buffer onto job_queue.  The first buffer read during the hash pass is
  // reused as the first ingest job's buffer.
  //
  // Buffers handed to job_queue->push() are not freed here.
  // NOTE(review): presumably the job takes ownership of its buffer -- confirm
  // against hasher::job_t.
  //
  // Parameters:
  //   file_reader                   source of filename, filesize and data
  //   import_manager                receives the source name; passed to jobs
  //   ingest_tracker                receives the source record; passed to jobs
  //   whitelist_scan_manager        passed through to each job
  //   repository_name               recorded with the file hash; passed to jobs
  //   step_size, block_size         passed through to each job
  //   disable_recursive_processing  when true the maximum recursion depth
  //                                 given to each job is 0
  //   disable_calculate_entropy     passed through to each job
  //   disable_calculate_labels      passed through to each job
  //   job_queue                     queue that the ingest jobs are pushed onto
  //
  // Returns "" on success, else an error message: "bad memory allocation"
  // or the message returned by file_reader.read().  When an error occurs in
  // the second pass, jobs that were already pushed stay on the queue.
  std::string ingest_file(
        const hasher::file_reader_t& file_reader,
        hashdb::import_manager_t& import_manager,
        hasher::ingest_tracker_t& ingest_tracker,
        const hashdb::scan_manager_t* const whitelist_scan_manager,
        const std::string& repository_name,
        const size_t step_size,
        const size_t block_size,
        const bool disable_recursive_processing,
        const bool disable_calculate_entropy,
        const bool disable_calculate_labels,
        hasher::job_queue_t* const job_queue) {

    // identify the maximum recursion depth: no recursion when recursive
    // processing is disabled, else the compiled-in limit
    const size_t max_recursion_depth =
                    (disable_recursive_processing) ? 0 : MAX_RECURSION_DEPTH;

    // create buffer b to read into, sized to the file when the file is
    // smaller than BUFFER_SIZE
    const size_t b_size = (file_reader.filesize <= BUFFER_SIZE) ?
                          file_reader.filesize : BUFFER_SIZE;
    uint8_t* b = new (std::nothrow) uint8_t[b_size]();
    if (b == NULL) {
      return "bad memory allocation";
    }

    // read into buffer b
    size_t bytes_read = 0;
    std::string error_message;
    error_message = file_reader.read(0, b, b_size, &bytes_read);
    if (error_message.size() > 0) {
      // abort
      delete[] b;
      return error_message;
    }

    // get a source file hash calculator
    hasher::hash_calculator_t hash_calculator;
    hash_calculator.init();

    // hash first buffer b
    // NOTE(review): this hashes b_size bytes rather than bytes_read; the two
    // differ only if the read came up short -- confirm this is intended.
    hash_calculator.update(b, b_size, 0, b_size);

    // read and hash subsequent buffers in b2
    if (file_reader.filesize > BUFFER_SIZE) {

      // create b2 to read into; it is reused for every remaining piece
      uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
      if (b2 == NULL) {
        // abort
        delete[] b;
        return "bad memory allocation";
      }

      // read and hash b2 into final source file hash value
      for (uint64_t offset = BUFFER_SIZE;
           offset < file_reader.filesize;
           offset += BUFFER_SIZE) {

        // print status
        std::stringstream ss;
        ss << "# Calculating file hash for file " << file_reader.filename
           << " offset " << offset
           << " size " << file_reader.filesize
           << "\n";
        hashdb::tprint(std::cout, ss.str());

        // read into b2
        size_t b2_bytes_read = 0;
        error_message = file_reader.read(
                              offset, b2, BUFFER_SIZE, &b2_bytes_read);
        if (error_message.size() > 0) {
          // abort
          delete[] b2;
          delete[] b;
          return error_message;
        }

        // hash b2 into final source file hash value
        hash_calculator.update(b2, BUFFER_SIZE, 0, b2_bytes_read);
      }

      delete[] b2;
    }

    // get the source file hash
    const std::string file_hash = hash_calculator.final();

    // store the source repository name and filename
    import_manager.insert_source_name(file_hash, repository_name,
                                      file_reader.filename);

    // define the file type, currently not defined
    const std::string file_type = "";

    // calculate the number of buffer parts required to process this file,
    // rounding up so a trailing partial part is counted
    const size_t parts_total = (file_reader.filesize + (BUFFER_DATA_SIZE - 1)) /
                               BUFFER_DATA_SIZE;

    // add source file information to ingest_tracker
    const bool source_added = ingest_tracker.add_source(file_hash,
                           file_reader.filesize, file_type, parts_total);

    // do not re-ingest hashes from duplicate sources
    const bool disable_ingest_hashes = (source_added == false);

    // build buffers from file sections and push them onto the job queue

    // push buffer b onto the job queue; its data size is capped at
    // BUFFER_DATA_SIZE
    const size_t b_data_size = (b_size > BUFFER_DATA_SIZE)
                               ? BUFFER_DATA_SIZE : b_size;
    job_queue->push(hasher::job_t::new_ingest_job(
                 &import_manager,
                 &ingest_tracker,
                 whitelist_scan_manager,
                 repository_name,
                 step_size,
                 block_size,
                 file_hash,
                 file_reader.filename,
                 file_reader.filesize,
                 0,      // file_offset
                 disable_recursive_processing,
                 disable_calculate_entropy,
                 disable_calculate_labels,
                 disable_ingest_hashes,
                 b,      // buffer
                 b_size, // buffer_size
                 b_data_size, // buffer_data_size,
                 max_recursion_depth,
                 0,      // recursion_depth
                 ""));   // recursion path

    // read and push remaining buffers onto the job queue; each buffer starts
    // BUFFER_DATA_SIZE after the previous one and reads up to BUFFER_SIZE
    for (uint64_t offset = BUFFER_DATA_SIZE;
         offset < file_reader.filesize;
         offset += BUFFER_DATA_SIZE) {

      // create b2 to read into; a fresh buffer is allocated for every job
      uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
      if (b2 == NULL) {
        // abort
        return "bad memory allocation";
      }

      // read into b2
      size_t b2_bytes_read = 0;
      error_message = file_reader.read(
                                  offset, b2, BUFFER_SIZE, &b2_bytes_read);
      if (error_message.size() > 0) {
        // abort submitting jobs for this file
        delete[] b2;
        return error_message;
      }

      // push this buffer b2 onto the job queue
      const size_t b2_data_size = (b2_bytes_read > BUFFER_DATA_SIZE)
                                        ? BUFFER_DATA_SIZE : b2_bytes_read;
      job_queue->push(hasher::job_t::new_ingest_job(
                 &import_manager,
                 &ingest_tracker,
                 whitelist_scan_manager,
                 repository_name,
                 step_size,
                 block_size,
                 file_hash,
                 file_reader.filename,
                 file_reader.filesize,
                 offset,  // file_offset
                 disable_recursive_processing,
                 disable_calculate_entropy,
                 disable_calculate_labels,
                 disable_ingest_hashes,
                 b2,      // buffer
                 b2_bytes_read, // buffer_size
                 b2_data_size,  // buffer_data_size
                 max_recursion_depth,
                 0,      // recursion_depth
                 ""));   // recursion path
    }
    return "";
  }