// ************************************************************
// scan_file
// ************************************************************
std::string scan_file(
        const hasher::file_reader_t& file_reader,
        hashdb::scan_manager_t& scan_manager,
        hasher::scan_tracker_t& scan_tracker,
        const size_t step_size,
        const size_t block_size,
        const bool process_embedded_data,
        const hashdb::scan_mode_t scan_mode,
        hasher::job_queue_t* const job_queue) {

  // identify the maximum recursion depth
  size_t max_recursion_depth =
                       (process_embedded_data) ? MAX_RECURSION_DEPTH : 0;

  // create buffer b to read into
  size_t b_size = (file_reader.filesize <= BUFFER_SIZE)
                       ? file_reader.filesize : BUFFER_SIZE;
  uint8_t* b = new (std::nothrow) uint8_t[b_size]();
  if (b == NULL) {
    return "bad memory allocation";
  }

  // read into buffer b
  size_t bytes_read;
  std::string error_message;
  error_message = file_reader.read(0, b, b_size, &bytes_read);
  if (error_message.size() > 0) {
    // abort
    delete[] b;
    return error_message;
  }

  // build buffers from file sections and push them onto the job queue

  // push buffer b onto the job queue
  size_t b_data_size = (b_size > BUFFER_DATA_SIZE)
                       ? BUFFER_DATA_SIZE : b_size;
  job_queue->push(hasher::job_t::new_scan_job(
        &scan_manager,
        &scan_tracker,
        step_size,
        block_size,
        file_reader.filename,
        file_reader.filesize,
        0,            // file_offset
        process_embedded_data,
        scan_mode,
        b,            // buffer
        b_size,       // buffer_size
        b_data_size,  // buffer_data_size
        max_recursion_depth,
        0,            // recursion_depth
        ""));         // recursion path

  // read and push remaining buffers onto the job queue
  for (uint64_t offset = BUFFER_DATA_SIZE;
       offset < file_reader.filesize;
       offset += BUFFER_DATA_SIZE) {

    // create b2 to read into
    uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
    if (b2 == NULL) {
      // abort
      return "bad memory allocation";
    }

    // read into b2
    size_t b2_bytes_read = 0;
    error_message = file_reader.read(
                          offset, b2, BUFFER_SIZE, &b2_bytes_read);
    if (error_message.size() > 0) {
      // abort submitting jobs for this file
      delete[] b2;
      return error_message;
    }

    // push this buffer b2 onto the job queue
    size_t b2_data_size = (b2_bytes_read > BUFFER_DATA_SIZE)
                          ? BUFFER_DATA_SIZE : b2_bytes_read;
    job_queue->push(hasher::job_t::new_scan_job(
        &scan_manager,
        &scan_tracker,
        step_size,
        block_size,
        file_reader.filename,
        file_reader.filesize,
        offset,          // file_offset
        process_embedded_data,
        scan_mode,
        b2,              // buffer
        b2_bytes_read,   // buffer_size
        b2_data_size,    // buffer_data_size
        max_recursion_depth,
        0,               // recursion_depth
        ""));            // recursion path
  }
  return "";
}
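// ************************************************************
// ingest_file
// ************************************************************
// Hash the whole source file, record its repository name and filename,
// then split the file into buffers and push one ingest job per buffer
// onto the job queue.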
std::string ingest_file(
        const hasher::file_reader_t& file_reader,
        hashdb::import_manager_t& import_manager,
        hasher::ingest_tracker_t& ingest_tracker,
        const hashdb::scan_manager_t* const whitelist_scan_manager,
        const std::string& repository_name,
        const size_t step_size,
        const size_t block_size,
        const bool disable_recursive_processing,
        const bool disable_calculate_entropy,
        const bool disable_calculate_labels,
        hasher::job_queue_t* const job_queue) {

  // identify the maximum recursion depth
  size_t max_recursion_depth =
                       (disable_recursive_processing) ? 0 : MAX_RECURSION_DEPTH;

  // create buffer b to read into
  size_t b_size = (file_reader.filesize <= BUFFER_SIZE)
                       ? file_reader.filesize : BUFFER_SIZE;
  uint8_t* b = new (std::nothrow) uint8_t[b_size]();
  if (b == NULL) {
    return "bad memory allocation";
  }

  // read into buffer b
  size_t bytes_read;
  std::string error_message;
  error_message = file_reader.read(0, b, b_size, &bytes_read);
  if (error_message.size() > 0) {
    // abort
    delete[] b;
    return error_message;
  }

  // get a source file hash calculator
  hasher::hash_calculator_t hash_calculator;
  hash_calculator.init();

  // hash first buffer b
  hash_calculator.update(b, b_size, 0, b_size);

  // read and hash subsequent buffers in b2
  if (file_reader.filesize > BUFFER_SIZE) {

    // create b2 to read into
    uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
    if (b2 == NULL) {
      // abort
      delete[] b;
      return "bad memory allocation";
    }

    // read and hash b2 into final source file hash value
    for (uint64_t offset = BUFFER_SIZE;
         offset < file_reader.filesize;
         offset += BUFFER_SIZE) {

      // print status
      std::stringstream ss;
      ss << "# Calculating file hash for file " << file_reader.filename
         << " offset " << offset
         << " size " << file_reader.filesize << "\n";
      hashdb::tprint(std::cout, ss.str());

      // read into b2
      size_t b2_bytes_read = 0;
      error_message = file_reader.read(
                          offset, b2, BUFFER_SIZE, &b2_bytes_read);
      if (error_message.size() > 0) {
        // abort
        delete[] b2;
        delete[] b;
        return error_message;
      }

      // hash b2 into final source file hash value
      hash_calculator.update(b2, BUFFER_SIZE, 0, b2_bytes_read);
    }
    delete[] b2;
  }

  // get the source file hash
  const std::string file_hash = hash_calculator.final();

  // store the source repository name and filename
  import_manager.insert_source_name(file_hash, repository_name,
                                    file_reader.filename);

  // define the file type, currently not defined
  const std::string file_type = "";

  // calculate the number of buffer parts required to process this file
  const size_t parts_total = (file_reader.filesize + (BUFFER_DATA_SIZE - 1)) /
                             BUFFER_DATA_SIZE;

  // add source file information to ingest_tracker
  const bool source_added = ingest_tracker.add_source(file_hash,
                          file_reader.filesize, file_type, parts_total);

  // do not re-ingest hashes from duplicate sources
  const bool disable_ingest_hashes = (source_added == false);

  // build buffers from file sections and push them onto the job queue

  // push buffer b onto the job queue
  size_t b_data_size = (b_size > BUFFER_DATA_SIZE)
                       ? BUFFER_DATA_SIZE : b_size;
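  // Note: each job buffer holds up to BUFFER_SIZE bytes but the file offset
  // advances by BUFFER_DATA_SIZE, so consecutive buffers overlap by
  // BUFFER_SIZE - BUFFER_DATA_SIZE bytes; presumably this overlap lets block
  // hashes that straddle a buffer boundary be computed from the buffer that
  // owns their starting offset.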
  job_queue->push(hasher::job_t::new_ingest_job(
        &import_manager,
        &ingest_tracker,
        whitelist_scan_manager,
        repository_name,
        step_size,
        block_size,
        file_hash,
        file_reader.filename,
        file_reader.filesize,
        0,            // file_offset
        disable_recursive_processing,
        disable_calculate_entropy,
        disable_calculate_labels,
        disable_ingest_hashes,
        b,            // buffer
        b_size,       // buffer_size
        b_data_size,  // buffer_data_size
        max_recursion_depth,
        0,            // recursion_depth
        ""));         // recursion path

  // read and push remaining buffers onto the job queue
  for (uint64_t offset = BUFFER_DATA_SIZE;
       offset < file_reader.filesize;
       offset += BUFFER_DATA_SIZE) {

    // create b2 to read into
    uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
    if (b2 == NULL) {
      // abort
      return "bad memory allocation";
    }

    // read into b2
    size_t b2_bytes_read = 0;
    error_message = file_reader.read(
                          offset, b2, BUFFER_SIZE, &b2_bytes_read);
    if (error_message.size() > 0) {
      // abort submitting jobs for this file
      delete[] b2;
      return error_message;
    }

    // push this buffer b2 onto the job queue
    size_t b2_data_size = (b2_bytes_read > BUFFER_DATA_SIZE)
                          ? BUFFER_DATA_SIZE : b2_bytes_read;
    job_queue->push(hasher::job_t::new_ingest_job(
        &import_manager,
        &ingest_tracker,
        whitelist_scan_manager,
        repository_name,
        step_size,
        block_size,
        file_hash,
        file_reader.filename,
        file_reader.filesize,
        offset,          // file_offset
        disable_recursive_processing,
        disable_calculate_entropy,
        disable_calculate_labels,
        disable_ingest_hashes,
        b2,              // buffer
        b2_bytes_read,   // buffer_size
        b2_data_size,    // buffer_data_size
        max_recursion_depth,
        0,               // recursion_depth
        ""));            // recursion path
  }
  return "";
}