static int read_hashfile(char *hashfile_name) { char buf[MAXLINE]; struct hashfile_handle *handle; const struct chunk_info *ci; uint64_t chunk_count; time_t scan_start_time; int ret; handle = hashfile_open(hashfile_name); if (!handle) { fprintf(stderr, "Error opening hash file: %d", errno); return -1; } /* Print some information about the hash file */ scan_start_time = hashfile_start_time(handle); printf("Collected at [%s] on %s", hashfile_sysid(handle), ctime(&scan_start_time)); ret = hashfile_chunking_method_str(handle, buf, MAXLINE); if (ret < 0) { fprintf(stderr, "Unrecognized chunking method: %d", errno); return -1; } printf("Chunking method: %s", buf); ret = hashfile_hashing_method_str(handle, buf, MAXLINE); if (ret < 0) { fprintf(stderr, "Unrecognized hashing method: %d", errno); return -1; } printf("Hashing method: %s\n", buf); /* Go over the files in a hashfile */ printf("== List of files and hashes ==\n"); while (1) { ret = hashfile_next_file(handle); if (ret < 0) { fprintf(stderr, "Cannot get next file from the hashfile: %d\n", errno); return -1; } /* exit the loop if it was the last file */ if (ret == 0) break; printf("File path: %s\n", hashfile_curfile_path(handle)); printf("File size: %"PRIu64 " B\n", hashfile_curfile_size(handle)); printf("Chunks number: %" PRIu64 "\n", hashfile_curfile_numchunks(handle)); /* Go over the chunks in the current file */ chunk_count = 0; while (1) { ci = hashfile_next_chunk(handle); if (!ci) /* exit the loop if it was the last chunk */ break; chunk_count++; print_chunk_hash(chunk_count, ci->hash, hashfile_hash_size(handle) / 8); } } hashfile_close(handle); return 0; }
static int detect_by_file_minhash(char *hashfile_name) { char buf[MAXLINE]; struct hashfile_handle *handle; const struct chunk_info *ci; time_t scan_start_time; int ret; handle = hashfile_open(hashfile_name); int total_chunks = 0; chunkset = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free); if (!handle) { fprintf(stderr, "Error opening hash file: %d!", errno); return -1; } /* Go over the files in a hashfile */ while (1) { ret = hashfile_next_file(handle); if (ret < 0) { fprintf(stderr, "Cannot get next file from a hashfile: %d!\n", errno); return -1; } /* exit the loop if it was the last file */ if (ret == 0) break; memset(minhash, 0xff, 20); parse_file_suffix(hashfile_curfile_path(handle), suffix, 8); if(strncmp(suffix, "edu,", 4) == 0){ strcpy(suffix, "edu,?"); }else if(strlen(suffix) == 0){ strcpy(suffix, ".None"); } GHashTable *curfile = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free); while (1) { ci = hashfile_next_chunk(handle); if (!ci) /* exit the loop if it was the last chunk */ break; struct chunk_item *chunk = malloc(sizeof(struct chunk_item)); memset(chunk, 0, sizeof(*chunk)); chunk->size = ci->size; memcpy(chunk->hash, ci->hash, hashfile_hash_size(handle)/8); memcpy(chunk->hash+hashfile_hash_size(handle)/8, &chunk->size, sizeof(chunk->size)); chunk->rc = 1; chunk->fsize = hashfile_curfile_size(handle); if(memcmp(chunk->hash, minhash, 20) < 0){ memcpy(minhash, chunk->hash, 20); } struct chunk_item* target = g_hash_table_lookup(curfile, chunk->hash); if(target){ if(target->size != chunk->size){ fprintf(stderr, "+Find an intra-file collision! Cannot be detected! File size = %lld, Type = %s\n", hashfile_curfile_size(handle), suffix); collisions++; } free(chunk); dup_chunks++; }else{ g_hash_table_insert(curfile, chunk->hash, chunk); } total_chunks++; } check_curfile(curfile); g_hash_table_destroy(curfile); file_count++; } hashfile_close(handle); g_hash_table_destroy(chunkset); fprintf(stderr, "# of chunks read back: %d; %.4f of total chunks, %.4f of dup chunks\n", chunks_read_back, 1.0*chunks_read_back/total_chunks, 1.0*chunks_read_back/dup_chunks); fprintf(stderr, "# of hash collisions: %d; %d detected\n", collisions, detected_collisions); printf("%d %d %.4f %.4f\n", collisions, detected_collisions, 1.0*chunks_read_back/total_chunks, 1.0*chunks_read_back/dup_chunks); return 0; }