void import_json(hashdb::import_manager_t& manager,
                 progress_tracker_t& progress_tracker,
                 std::istream& in) {
  std::string line;
  size_t line_number = 0;
  while(getline(in, line)) {
    ++line_number;

    // skip empty lines
    if (line.size() == 0) {
      continue;
    }

    // skip comment lines
    if (line[0] == '#') {
      continue;
    }

    // import JSON
    std::string error_message = manager.import_json(line);
    if (error_message.size() != 0) {
      std::cerr << "Invalid line " << line_number
                << " error: " << error_message
                << ": '" << line << "'\n";
    } else {
      progress_tracker.track();
    }
  }
}
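// import_tab expects one hash record per line in a tab-delimited layout.
// The column meanings below are inferred from the parsing code that
// follows, not from a separate specification:
//
//   <file hash hexdigest> \t <block hash hexdigest> \t <sector index>
//
// The sector index is 1-based and is converted to a byte offset as
// (sector_index - 1) * sector_size.  Lines beginning with '#' and blank
// lines are skipped.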
void import_tab(hashdb::import_manager_t& manager,
                const std::string& repository_name,
                const std::string& filename,
                const hashdb::scan_manager_t* const whitelist_manager,
                progress_tracker_t& progress_tracker,
                std::istream& in) {

  // only import file hashes that are new to the session
  std::set<std::string> importable_sources;

  std::string line;
  size_t line_number = 0;
  while(getline(in, line)) {
    ++line_number;

    // skip empty lines
    if (line.size() == 0) {
      continue;
    }

    // skip comment lines
    if (line[0] == '#') {
      continue;
    }

    // find tabs
    size_t tab_index1 = line.find('\t');
    if (tab_index1 == std::string::npos) {
      std::cerr << "Tab not found on line " << line_number
                << ": '" << line << "'\n";
      continue;
    }
    size_t tab_index2 = line.find('\t', tab_index1 + 1);
    if (tab_index2 == std::string::npos) {
      std::cerr << "Second tab not found on line " << line_number
                << ": '" << line << "'\n";
      continue;
    }

    // get file hash
    std::string file_hash_string = line.substr(0, tab_index1);
    std::string file_binary_hash = hashdb::hex_to_bin(file_hash_string);
    if (file_binary_hash.size() == 0) {
      std::cerr << "Invalid file hexdigest on line " << line_number
                << ": '" << line << "', '" << file_hash_string << "'\n";
      continue;
    }

    // skip the file hash if it is preexisting, else mark it as importable
    if (importable_sources.find(file_binary_hash) ==
                                importable_sources.end()) {
      // the file hash has not been seen yet, so see if it is preexisting
      if (manager.has_source(file_binary_hash)) {
        // the file is preexisting so skip it
        continue;
      } else {
        // the file hash is new so mark it as importable
        importable_sources.insert(file_binary_hash);
      }
    }

    // skip the file hash if it has not been marked as importable
    if (importable_sources.find(file_binary_hash) ==
                                importable_sources.end()) {
      continue;
    }

    // get block hash
    std::string block_hashdigest_string = line.substr(
                         tab_index1 + 1, tab_index2 - tab_index1 - 1);
    std::string block_binary_hash =
                         hashdb::hex_to_bin(block_hashdigest_string);
    if (block_binary_hash == "") {
      std::cerr << "Invalid block hash on line " << line_number
                << ": '" << line << "', '"
                << block_hashdigest_string << "'\n";
      continue;
    }

    // get file offset from the 1-based sector index
    size_t sector_index = s_to_uint64(line.substr(tab_index2 + 1));
    if (sector_index == 0) {
      // the index starts at 1, so 0 is invalid
      std::cerr << "Invalid sector index on line " << line_number
                << ": '" << line << "', '"
                << line.substr(tab_index2 + 1) << "'\n";
      continue;
    }
    uint64_t file_offset = (sector_index - 1) * sector_size;

    // mark with "w" if the block hash is in the whitelist
    std::string whitelist_flag = "";
    if (whitelist_manager != NULL &&
        whitelist_manager->find_hash_count(block_binary_hash) > 0) {
      whitelist_flag = "w";
    }

    // add source data
    manager.insert_source_data(file_binary_hash, 0, "", 0, 0);

    // add source name pair
    manager.insert_source_name(file_binary_hash, repository_name, filename);

    // add block hash
    manager.insert_hash(block_binary_hash, 0.0, whitelist_flag,
                        file_binary_hash, file_offset);

    // update progress tracker
    progress_tracker.track();
  }
}
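// ingest_file computes the full source file hash, registers the source,
// then slices the file into BUFFER_DATA_SIZE chunks and pushes one ingest
// job per chunk onto the job queue.  Each job carries a buffer of up to
// BUFFER_SIZE bytes; BUFFER_SIZE is presumably at least BUFFER_DATA_SIZE
// so that block hashes near a chunk boundary can still be computed from
// the same buffer (both constants are defined elsewhere).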
std::string ingest_file(
      const hasher::file_reader_t& file_reader,
      hashdb::import_manager_t& import_manager,
      hasher::ingest_tracker_t& ingest_tracker,
      const hashdb::scan_manager_t* const whitelist_scan_manager,
      const std::string& repository_name,
      const size_t step_size,
      const size_t block_size,
      const bool disable_recursive_processing,
      const bool disable_calculate_entropy,
      const bool disable_calculate_labels,
      hasher::job_queue_t* const job_queue) {

  // identify the maximum recursion depth
  size_t max_recursion_depth =
            (disable_recursive_processing) ? 0 : MAX_RECURSION_DEPTH;

  // create buffer b to read into
  size_t b_size = (file_reader.filesize <= BUFFER_SIZE)
                  ? file_reader.filesize : BUFFER_SIZE;
  uint8_t* b = new (std::nothrow) uint8_t[b_size]();
  if (b == NULL) {
    return "bad memory allocation";
  }

  // read into buffer b
  size_t bytes_read;
  std::string error_message;
  error_message = file_reader.read(0, b, b_size, &bytes_read);
  if (error_message.size() > 0) {
    // abort
    delete[] b;
    return error_message;
  }

  // get a source file hash calculator
  hasher::hash_calculator_t hash_calculator;
  hash_calculator.init();

  // hash first buffer b
  hash_calculator.update(b, b_size, 0, b_size);

  // read and hash subsequent buffers in b2
  if (file_reader.filesize > BUFFER_SIZE) {

    // create b2 to read into
    uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
    if (b2 == NULL) {
      // abort
      delete[] b;
      return "bad memory allocation";
    }

    // read and hash b2 into the final source file hash value
    for (uint64_t offset = BUFFER_SIZE; offset < file_reader.filesize;
         offset += BUFFER_SIZE) {

      // print status
      std::stringstream ss;
      ss << "# Calculating file hash for file " << file_reader.filename
         << " offset " << offset
         << " size " << file_reader.filesize << "\n";
      hashdb::tprint(std::cout, ss.str());

      // read into b2
      size_t b2_bytes_read = 0;
      error_message = file_reader.read(
                          offset, b2, BUFFER_SIZE, &b2_bytes_read);
      if (error_message.size() > 0) {
        // abort
        delete[] b2;
        delete[] b;
        return error_message;
      }

      // hash b2 into the final source file hash value
      hash_calculator.update(b2, BUFFER_SIZE, 0, b2_bytes_read);
    }
    delete[] b2;
  }

  // get the source file hash
  const std::string file_hash = hash_calculator.final();

  // store the source repository name and filename
  import_manager.insert_source_name(file_hash, repository_name,
                                    file_reader.filename);

  // define the file type, currently not defined
  const std::string file_type = "";

  // calculate the number of buffer parts required to process this file
  const size_t parts_total =
      (file_reader.filesize + (BUFFER_DATA_SIZE - 1)) / BUFFER_DATA_SIZE;

  // add source file information to ingest_tracker
  const bool source_added = ingest_tracker.add_source(file_hash,
                      file_reader.filesize, file_type, parts_total);

  // do not re-ingest hashes from duplicate sources
  const bool disable_ingest_hashes = (source_added == false);

  // build buffers from file sections and push them onto the job queue

  // push buffer b onto the job queue
  size_t b_data_size = (b_size > BUFFER_DATA_SIZE)
                       ? BUFFER_DATA_SIZE : b_size;
  job_queue->push(hasher::job_t::new_ingest_job(
                &import_manager,
                &ingest_tracker,
                whitelist_scan_manager,
                repository_name,
                step_size,
                block_size,
                file_hash,
                file_reader.filename,
                file_reader.filesize,
                0,                 // file_offset
                disable_recursive_processing,
                disable_calculate_entropy,
                disable_calculate_labels,
                disable_ingest_hashes,
                b,                 // buffer
                b_size,            // buffer_size
                b_data_size,       // buffer_data_size
                max_recursion_depth,
                0,                 // recursion_depth
                ""));              // recursion path

  // read and push remaining buffers onto the job queue
  for (uint64_t offset = BUFFER_DATA_SIZE; offset < file_reader.filesize;
       offset += BUFFER_DATA_SIZE) {

    // create b2 to read into
    uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
    if (b2 == NULL) {
      // abort
      return "bad memory allocation";
    }

    // read into b2
    size_t b2_bytes_read = 0;
    error_message = file_reader.read(
                        offset, b2, BUFFER_SIZE, &b2_bytes_read);
    if (error_message.size() > 0) {
      // abort submitting jobs for this file
      delete[] b2;
      return error_message;
    }

    // push this buffer b2 onto the job queue
    size_t b2_data_size = (b2_bytes_read > BUFFER_DATA_SIZE)
                          ? BUFFER_DATA_SIZE : b2_bytes_read;
    job_queue->push(hasher::job_t::new_ingest_job(
                &import_manager,
                &ingest_tracker,
                whitelist_scan_manager,
                repository_name,
                step_size,
                block_size,
                file_hash,
                file_reader.filename,
                file_reader.filesize,
                offset,            // file_offset
                disable_recursive_processing,
                disable_calculate_entropy,
                disable_calculate_labels,
                disable_ingest_hashes,
                b2,                // buffer
                b2_bytes_read,     // buffer_size
                b2_data_size,      // buffer_data_size
                max_recursion_depth,
                0,                 // recursion_depth
                ""));              // recursion path
  }
  return "";
}
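// A minimal, self-contained sketch (not part of the tool) illustrating the
// chunking arithmetic used by ingest_file above.  The buffer_data_size and
// filesize values are illustrative placeholders, not the tool's real
// BUFFER_DATA_SIZE or any real file.
#if 0
#include <cstdint>
#include <iostream>

int main() {
  const uint64_t buffer_data_size = 1024;   // placeholder chunk size
  const uint64_t filesize = 2500;           // example file size

  // ceiling division, as used for parts_total above: yields 3 here
  const uint64_t parts_total =
      (filesize + (buffer_data_size - 1)) / buffer_data_size;

  // per-chunk data size, analogous to b_data_size and b2_data_size above:
  // 1024, 1024, then 452 bytes for the final partial chunk
  for (uint64_t offset = 0; offset < filesize; offset += buffer_data_size) {
    const uint64_t remaining = filesize - offset;
    const uint64_t data_size =
        (remaining > buffer_data_size) ? buffer_data_size : remaining;
    std::cout << "chunk at offset " << offset << " carries " << data_size
              << " of " << filesize << " bytes, "
              << parts_total << " parts total\n";
  }
  return 0;
}
#endif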