Пример #1
0
/*
 * Dump the contents of a hash file to stdout.
 *
 * Prints the header information (system id, scan start time, chunking and
 * hashing methods) and then, for every file recorded in the hash file, its
 * path, size, chunk count and the hash of each chunk.
 *
 * hashfile_name: path of the hash file to open.
 *
 * Returns 0 on success and -1 if the hash file cannot be opened, one of the
 * method descriptions cannot be obtained, or iterating over the files fails.
 * The handle is closed on every path once it has been opened.
 */
static int read_hashfile(char *hashfile_name)
{
	char buf[MAXLINE];
	struct hashfile_handle *handle;
	const struct chunk_info *ci;
	uint64_t chunk_count;
	time_t scan_start_time;
	int ret;
	int status = -1;

	handle = hashfile_open(hashfile_name);
	if (!handle) {
		fprintf(stderr, "Error opening hash file: %d", errno);
		return -1;
	}

	/* Print some information about the hash file */
	scan_start_time = hashfile_start_time(handle);
	printf("Collected at [%s] on %s",
			hashfile_sysid(handle),
			ctime(&scan_start_time));

	ret = hashfile_chunking_method_str(handle, buf, MAXLINE);
	if (ret < 0) {
		fprintf(stderr, "Unrecognized chunking method: %d", errno);
		goto out_close;
	}

	printf("Chunking method: %s", buf);

	ret = hashfile_hashing_method_str(handle, buf, MAXLINE);
	if (ret < 0) {
		fprintf(stderr, "Unrecognized hashing method: %d", errno);
		goto out_close;
	}

	printf("Hashing method: %s\n", buf);

	/* Go over the files in a hashfile */
	printf("== List of files and hashes ==\n");
	while (1) {
		ret = hashfile_next_file(handle);
		if (ret < 0) {
			fprintf(stderr,
				"Cannot get next file from the hashfile: %d\n",
				errno);
			goto out_close;
		}

		/* exit the loop if it was the last file */
		if (ret == 0)
			break;

		printf("File path: %s\n", hashfile_curfile_path(handle));
		printf("File size: %"PRIu64 " B\n",
				hashfile_curfile_size(handle));
		printf("Chunks number: %" PRIu64 "\n",
				hashfile_curfile_numchunks(handle));

		/* Go over the chunks in the current file */
		chunk_count = 0;
		while (1) {
			ci = hashfile_next_chunk(handle);
			if (!ci) /* exit the loop if it was the last chunk */
				break;

			chunk_count++;

			/*
			 * The hash size is divided by 8 before being passed
			 * on; presumably it is reported in bits and the
			 * printer expects bytes -- confirm against the API.
			 */
			print_chunk_hash(chunk_count, ci->hash,
					hashfile_hash_size(handle) / 8);
		}
	}

	status = 0;

out_close:
	/* Single exit point so the handle is released on errors as well. */
	hashfile_close(handle);

	return status;
}
Пример #2
0
static int detect_by_file_minhash(char *hashfile_name)
{
    char buf[MAXLINE];
    struct hashfile_handle *handle;
    const struct chunk_info *ci;
    time_t scan_start_time;
    int ret;

    handle = hashfile_open(hashfile_name);

    int total_chunks = 0;

    chunkset = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free);

    if (!handle) {
        fprintf(stderr, "Error opening hash file: %d!", errno);
        return -1;
    }

    /* Go over the files in a hashfile */
    while (1) {
        ret = hashfile_next_file(handle);
        if (ret < 0) {
            fprintf(stderr,
                    "Cannot get next file from a hashfile: %d!\n",
                    errno);
            return -1;
        }

        /* exit the loop if it was the last file */
        if (ret == 0)
            break;

        memset(minhash, 0xff, 20);

        parse_file_suffix(hashfile_curfile_path(handle), suffix, 8);
        if(strncmp(suffix, "edu,", 4) == 0){
            strcpy(suffix, "edu,?");
        }else if(strlen(suffix) == 0){
            strcpy(suffix, ".None");
        }

        GHashTable *curfile = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free);

        while (1) {
            ci = hashfile_next_chunk(handle);
            if (!ci) /* exit the loop if it was the last chunk */
                break;

            struct chunk_item *chunk = malloc(sizeof(struct chunk_item));
            memset(chunk, 0, sizeof(*chunk));

            chunk->size = ci->size;
            memcpy(chunk->hash, ci->hash, hashfile_hash_size(handle)/8);
            memcpy(chunk->hash+hashfile_hash_size(handle)/8, &chunk->size, sizeof(chunk->size));

            chunk->rc = 1;

            chunk->fsize = hashfile_curfile_size(handle);

            if(memcmp(chunk->hash, minhash, 20) < 0){
                memcpy(minhash, chunk->hash, 20);
            }

            struct chunk_item* target = g_hash_table_lookup(curfile, chunk->hash);
            if(target){
                if(target->size != chunk->size){
                    fprintf(stderr, "+Find an intra-file collision! Cannot be detected! File size = %lld, Type = %s\n", 
                            hashfile_curfile_size(handle), suffix);
                    collisions++;
                }
                free(chunk);
                dup_chunks++;
            }else{
                g_hash_table_insert(curfile, chunk->hash, chunk);
            }

            total_chunks++;
        }

        check_curfile(curfile);
        g_hash_table_destroy(curfile);
        file_count++;

    }

    hashfile_close(handle);

    g_hash_table_destroy(chunkset);

    fprintf(stderr, "# of chunks read back: %d; %.4f of total chunks, %.4f of dup chunks\n", 
            chunks_read_back, 1.0*chunks_read_back/total_chunks,
            1.0*chunks_read_back/dup_chunks);
    fprintf(stderr, "# of hash collisions: %d; %d detected\n", collisions, detected_collisions);
    printf("%d %d %.4f %.4f\n", collisions, detected_collisions, 1.0*chunks_read_back/total_chunks, 1.0*chunks_read_back/dup_chunks);
    return 0;
}