Ejemplo n.º 1
0
// ************************************************************
// scan_file
// ************************************************************
// Split one file into buffers and queue one scan job per buffer.
//
// The first buffer holds up to BUFFER_SIZE bytes read from offset 0.  Each
// further buffer is a BUFFER_SIZE read taken at an offset that advances by
// BUFFER_DATA_SIZE, until the offset reaches file_reader.filesize.
//
// Buffers handed to job_queue are not released here (presumably the job
// takes ownership -- confirm against hasher::job_t).  A buffer whose read
// fails is released before returning.
//
// Returns "" when every job was queued, otherwise the message returned by
// file_reader.read() or "bad memory allocation".  Jobs queued before a
// failure remain queued.
std::string scan_file(
    const hasher::file_reader_t& file_reader,
    hashdb::scan_manager_t& scan_manager,
    hasher::scan_tracker_t& scan_tracker,
    const size_t step_size,
    const size_t block_size,
    const bool process_embedded_data,
    const hashdb::scan_mode_t scan_mode,
    hasher::job_queue_t* const job_queue) {

    // recursion is allowed only when embedded data is to be processed
    size_t recursion_limit = 0;
    if (process_embedded_data) {
        recursion_limit = MAX_RECURSION_DEPTH;
    }

    // the first buffer covers the whole file when it fits,
    // else just the leading BUFFER_SIZE bytes
    size_t head_size = BUFFER_SIZE;
    if (file_reader.filesize <= BUFFER_SIZE) {
        head_size = file_reader.filesize;
    }

    // allocate the zero-filled first buffer
    uint8_t* head = new (std::nothrow) uint8_t[head_size]();
    if (!head) {
        return "bad memory allocation";
    }

    // fill the first buffer from the start of the file
    size_t head_bytes_read = 0;
    const std::string head_error =
        file_reader.read(0, head, head_size, &head_bytes_read);
    if (!head_error.empty()) {
        // nothing has been queued yet, so the buffer is still ours to free
        delete[] head;
        return head_error;
    }

    // the data portion of a buffer is capped at BUFFER_DATA_SIZE
    size_t head_data_size = head_size;
    if (head_size > BUFFER_DATA_SIZE) {
        head_data_size = BUFFER_DATA_SIZE;
    }

    // queue the job for the first buffer
    job_queue->push(hasher::job_t::new_scan_job(
                        &scan_manager,
                        &scan_tracker,
                        step_size,
                        block_size,
                        file_reader.filename,
                        file_reader.filesize,
                        0,                // file_offset
                        process_embedded_data,
                        scan_mode,
                        head,             // buffer
                        head_size,        // buffer_size
                        head_data_size,   // buffer_data_size
                        recursion_limit,  // max_recursion_depth
                        0,                // recursion_depth
                        ""));             // recursion path

    // walk the rest of the file in BUFFER_DATA_SIZE strides
    uint64_t offset = BUFFER_DATA_SIZE;
    while (offset < file_reader.filesize) {

        // allocate a zero-filled buffer for this section
        uint8_t* chunk = new (std::nothrow) uint8_t[BUFFER_SIZE]();
        if (!chunk) {
            // give up on the remainder of this file
            return "bad memory allocation";
        }

        // fill the buffer from the current offset
        size_t chunk_bytes_read = 0;
        const std::string chunk_error = file_reader.read(
                            offset, chunk, BUFFER_SIZE, &chunk_bytes_read);
        if (!chunk_error.empty()) {
            // stop submitting jobs for this file
            delete[] chunk;
            return chunk_error;
        }

        // the data portion is what was read, capped at BUFFER_DATA_SIZE
        size_t chunk_data_size = chunk_bytes_read;
        if (chunk_bytes_read > BUFFER_DATA_SIZE) {
            chunk_data_size = BUFFER_DATA_SIZE;
        }

        // queue the job for this section
        job_queue->push(hasher::job_t::new_scan_job(
                            &scan_manager,
                            &scan_tracker,
                            step_size,
                            block_size,
                            file_reader.filename,
                            file_reader.filesize,
                            offset,            // file_offset
                            process_embedded_data,
                            scan_mode,
                            chunk,             // buffer
                            chunk_bytes_read,  // buffer_size
                            chunk_data_size,   // buffer_data_size
                            recursion_limit,   // max_recursion_depth
                            0,                 // recursion_depth
                            ""));              // recursion path

        offset += BUFFER_DATA_SIZE;
    }
    return "";
}
Ejemplo n.º 2
0
  // Ingest one file: compute the hash of the whole file, register the source
  // with import_manager and ingest_tracker, then split the file into buffers
  // and push one ingest job per buffer onto job_queue.
  //
  // Parameters:
  //   file_reader                  - supplies filename, filesize and read().
  //   import_manager               - receives the source name; passed to jobs.
  //   ingest_tracker               - receives the source record; passed to
  //                                  jobs.
  //   whitelist_scan_manager       - passed through to each job unchanged.
  //   repository_name              - stored with the source name and passed
  //                                  to each job.
  //   step_size, block_size        - passed through to each job unchanged.
  //   disable_recursive_processing - when true the maximum recursion depth is
  //                                  0, else it is MAX_RECURSION_DEPTH.
  //   disable_calculate_entropy,
  //   disable_calculate_labels     - passed through to each job unchanged.
  //   job_queue                    - queue the ingest jobs are pushed onto.
  //
  // Returns "" on success, otherwise the message returned by
  // file_reader.read() or "bad memory allocation".  Jobs queued before a
  // failure remain queued.
  //
  // Buffers handed to job_queue are not released here (presumably the job
  // takes ownership -- confirm against hasher::job_t).
  std::string ingest_file(
        const hasher::file_reader_t& file_reader,
        hashdb::import_manager_t& import_manager,
        hasher::ingest_tracker_t& ingest_tracker,
        const hashdb::scan_manager_t* const whitelist_scan_manager,
        const std::string& repository_name,
        const size_t step_size,
        const size_t block_size,
        const bool disable_recursive_processing,
        const bool disable_calculate_entropy,
        const bool disable_calculate_labels,
        hasher::job_queue_t* const job_queue) {

    // identify the maximum recursion depth: disabling recursive processing
    // must yield depth 0 (the original had the two branches swapped)
    size_t max_recursion_depth =
                    (disable_recursive_processing) ? 0 : MAX_RECURSION_DEPTH;

    // create buffer b to read into: the whole file when it fits,
    // else the leading BUFFER_SIZE bytes
    size_t b_size = (file_reader.filesize <= BUFFER_SIZE) ?
                          file_reader.filesize : BUFFER_SIZE;
    uint8_t* b = new (std::nothrow) uint8_t[b_size]();
    if (b == NULL) {
      return "bad memory allocation";
    }

    // read into buffer b
    size_t bytes_read = 0;
    std::string error_message;
    error_message = file_reader.read(0, b, b_size, &bytes_read);
    if (error_message.size() > 0) {
      // abort
      delete[] b;
      return error_message;
    }

    // get a source file hash calculator
    hasher::hash_calculator_t hash_calculator;
    hash_calculator.init();

    // hash first buffer b
    hash_calculator.update(b, b_size, 0, b_size);

    // read and hash subsequent buffers in b2; this pass exists only to
    // obtain the file hash before any job is queued, so b2 is reused for
    // every section and released when the pass ends
    if (file_reader.filesize > BUFFER_SIZE) {

      // create b2 to read into
      uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
      if (b2 == NULL) {
        // abort
        delete[] b;
        return "bad memory allocation";
      }

      // read and hash b2 into final source file hash value;
      // sections here do not overlap, the stride is BUFFER_SIZE
      for (uint64_t offset = BUFFER_SIZE;
           offset < file_reader.filesize;
           offset += BUFFER_SIZE) {

        // print status
        std::stringstream ss;
        ss << "# Calculating file hash for file " << file_reader.filename
           << " offset " << offset
           << " size " << file_reader.filesize
           << "\n";
        hashdb::tprint(std::cout, ss.str());

        // read into b2
        size_t b2_bytes_read = 0;
        error_message = file_reader.read(
                              offset, b2, BUFFER_SIZE, &b2_bytes_read);
        if (error_message.size() > 0) {
          // abort: nothing has been queued yet, so both buffers are ours
          delete[] b2;
          delete[] b;
          return error_message;
        }

        // hash b2 into final source file hash value
        hash_calculator.update(b2, BUFFER_SIZE, 0, b2_bytes_read);
      }

      delete[] b2;
    }

    // get the source file hash
    const std::string file_hash = hash_calculator.final();

    // store the source repository name and filename
    import_manager.insert_source_name(file_hash, repository_name,
                                      file_reader.filename);

    // define the file type, currently not defined
    const std::string file_type = "";

    // calculate the number of buffer parts required to process this file,
    // rounding up to a whole number of BUFFER_DATA_SIZE parts
    const size_t parts_total = (file_reader.filesize + (BUFFER_DATA_SIZE - 1)) /
                               BUFFER_DATA_SIZE;

    // add source file information to ingest_tracker
    const bool source_added = ingest_tracker.add_source(file_hash,
                           file_reader.filesize, file_type, parts_total);

    // do not re-ingest hashes from duplicate sources
    const bool disable_ingest_hashes = (source_added == false);

    // build buffers from file sections and push them onto the job queue

    // push buffer b onto the job queue; its data portion is capped at
    // BUFFER_DATA_SIZE
    size_t b_data_size = (b_size > BUFFER_DATA_SIZE)
                         ? BUFFER_DATA_SIZE : b_size;
    job_queue->push(hasher::job_t::new_ingest_job(
                 &import_manager,
                 &ingest_tracker,
                 whitelist_scan_manager,
                 repository_name,
                 step_size,
                 block_size,
                 file_hash,
                 file_reader.filename,
                 file_reader.filesize,
                 0,      // file_offset
                 disable_recursive_processing,
                 disable_calculate_entropy,
                 disable_calculate_labels,
                 disable_ingest_hashes,
                 b,      // buffer
                 b_size, // buffer_size
                 b_data_size, // buffer_data_size,
                 max_recursion_depth,
                 0,      // recursion_depth
                 ""));   // recursion path

    // read and push remaining buffers onto the job queue; each read asks for
    // BUFFER_SIZE bytes while the offset advances by BUFFER_DATA_SIZE
    for (uint64_t offset = BUFFER_DATA_SIZE;
         offset < file_reader.filesize;
         offset += BUFFER_DATA_SIZE) {

      // create b2 to read into
      uint8_t* b2 = new (std::nothrow) uint8_t[BUFFER_SIZE]();
      if (b2 == NULL) {
        // abort; buffers already queued are not ours to release
        return "bad memory allocation";
      }

      // read into b2
      size_t b2_bytes_read = 0;
      error_message = file_reader.read(
                                  offset, b2, BUFFER_SIZE, &b2_bytes_read);
      if (error_message.size() > 0) {
        // abort submitting jobs for this file
        delete[] b2;
        return error_message;
      }

      // push this buffer b2 onto the job queue
      size_t b2_data_size = (b2_bytes_read > BUFFER_DATA_SIZE)
                                        ? BUFFER_DATA_SIZE : b2_bytes_read;
      job_queue->push(hasher::job_t::new_ingest_job(
                 &import_manager,
                 &ingest_tracker,
                 whitelist_scan_manager,
                 repository_name,
                 step_size,
                 block_size,
                 file_hash,
                 file_reader.filename,
                 file_reader.filesize,
                 offset,  // file_offset
                 disable_recursive_processing,
                 disable_calculate_entropy,
                 disable_calculate_labels,
                 disable_ingest_hashes,
                 b2,      // buffer
                 b2_bytes_read, // buffer_size
                 b2_data_size,  // buffer_data_size
                 max_recursion_depth,
                 0,      // recursion_depth
                 ""));   // recursion path
    }
    return "";
  }