Пример #1
0
/*
 * Dump the contents of a hash file to stdout.
 *
 * Prints the system id and scan start time, the chunking-method and
 * hashing-method strings, and then, for every file recorded in the hash
 * file, its path, size, chunk count and the hash of each of its chunks
 * (via print_chunk_hash()).
 *
 * hashfile_name: path of the hash file to open.
 *
 * Returns 0 on success and -1 on failure; a diagnostic containing errno is
 * written to stderr.  Once the handle has been opened it is closed on every
 * exit path, including the error ones.
 */
static int read_hashfile(char *hashfile_name)
{
	char buf[MAXLINE];
	struct hashfile_handle *handle;
	const struct chunk_info *ci;
	uint64_t chunk_count;
	time_t scan_start_time;
	int ret;
	int rc = -1;

	handle = hashfile_open(hashfile_name);
	if (!handle) {
		fprintf(stderr, "Error opening hash file: %d", errno);
		return -1;
	}

	/* Print some information about the hash file */
	scan_start_time = hashfile_start_time(handle);
	/* ctime() output already ends with a newline. */
	printf("Collected at [%s] on %s",
			hashfile_sysid(handle),
			ctime(&scan_start_time));

	ret = hashfile_chunking_method_str(handle, buf, MAXLINE);
	if (ret < 0) {
		fprintf(stderr, "Unrecognized chunking method: %d", errno);
		goto out_close;
	}

	printf("Chunking method: %s", buf);

	ret = hashfile_hashing_method_str(handle, buf, MAXLINE);
	if (ret < 0) {
		fprintf(stderr, "Unrecognized hashing method: %d", errno);
		goto out_close;
	}

	printf("Hashing method: %s\n", buf);

	/* Go over the files in a hashfile */
	printf("== List of files and hashes ==\n");
	while (1) {
		ret = hashfile_next_file(handle);
		if (ret < 0) {
			fprintf(stderr,
				"Cannot get next file from the hashfile: %d\n",
				errno);
			goto out_close;
		}

		/* exit the loop if it was the last file */
		if (ret == 0)
			break;

		printf("File path: %s\n", hashfile_curfile_path(handle));
		printf("File size: %"PRIu64 " B\n",
				hashfile_curfile_size(handle));
		printf("Chunks number: %" PRIu64 "\n",
				hashfile_curfile_numchunks(handle));

		/* Go over the chunks in the current file */
		chunk_count = 0;
		while (1) {
			ci = hashfile_next_chunk(handle);
			if (!ci) /* exit the loop if it was the last chunk */
				break;

			chunk_count++;

			/*
			 * NOTE(review): the "/ 8" presumably converts a hash
			 * size in bits to bytes -- confirm against the
			 * hashfile API.
			 */
			print_chunk_hash(chunk_count, ci->hash,
					hashfile_hash_size(handle) / 8);
		}
	}

	rc = 0;

out_close:
	/* Single release point so error paths no longer leak the handle. */
	hashfile_close(handle);

	return rc;
}
Пример #2
0
/*
 * Chunk level, without file semantics
 * Dedup
 * (no trace for chunk-level no-dedup model)
 *
 * Emits a trace on stdout (partly mirrored on stderr) in two phases:
 *
 *  1. USE part: walks every record returned by iterate_chunk(), calls
 *     print_a_chunk() for each, then prints the ratio of logical bytes
 *     (sum of csize * rcount) to physical bytes (sum of csize).
 *  2. RAID failure part: prints "0", then replays chunks -- first the raw
 *     20-byte hashes read from pophashfile (when non-NULL), then every
 *     chunk of the `count` hash files named in `path` -- and prints one
 *     fraction each time the restored physical bytes cross another whole
 *     percent (1..99) of the physical size.  Finally prints "1.0".
 *
 * path:        array of `count` hash file names.
 * count:       number of entries in path.
 * weighted:    non-zero reports restored logical bytes / total logical
 *              bytes; zero reports restored references / total references.
 * pophashfile: optional file of raw 20-byte hashes restored first; may be
 *              NULL.  If it cannot be opened a warning is printed and the
 *              step is skipped.
 *
 * Exits the process with status -1 when a hash file cannot be opened or
 * iterated, or when memory allocation fails.
 */
void chunk_dedup_simd_trace(char **path, int count, int weighted, char *pophashfile)
{
	if (weighted) {
		fprintf(stderr, "CHUNK:DEDUP:WEIGHTED\n");
		printf("CHUNK:DEDUP:WEIGHTED\n");
	} else {
		fprintf(stderr, "CHUNK:DEDUP:NOT WEIGHTED\n");
		printf("CHUNK:DEDUP:NOT WEIGHTED\n");
	}

	init_iterator("CHUNK");

	struct chunk_rec chunk;
	memset(&chunk, 0, sizeof(chunk));

	int64_t psize = 0;
	int64_t lsize = 0;
	int64_t total_chunks = 0;
	/* USE part */
	int64_t sum4mean = 0;
	int64_t count4mean = 0;
	while (iterate_chunk(&chunk, 0) == 0) {

		/* Widen to 64 bits before multiplying to avoid overflow. */
		int64_t sum = chunk.csize;
		sum *= chunk.rcount;

		lsize += sum;
		psize += chunk.csize;

		total_chunks += chunk.rcount;

		if (weighted) {
			sum4mean += sum * chunk.csize;
			count4mean += chunk.csize;
			print_a_chunk(chunk.csize, sum);
		} else {
			sum4mean += sum;
			count4mean += chunk.csize;
			print_a_chunk(chunk.csize, chunk.rcount);
		}
	}

	printf("%.6f\n", 1.0*lsize/psize);
	fprintf(stderr, "D/F = %.4f, total_chunks = %"PRId64"\n", 1.0*lsize/psize,
			total_chunks);
	fprintf(stderr, "mean = %.4f, per DF = %.6f\n", 1.0*sum4mean/count4mean,
			1.0*sum4mean*psize/count4mean/lsize);

	close_iterator();

	struct hashfile_handle *handle;
	const struct chunk_info *ci;

	int64_t restore_logical_bytes = 0;
	int64_t restore_physical_bytes = 0;
	int64_t restore_chunks = 0;
	/* Set of already restored chunks; keys are heap copies owned by the table. */
	GHashTable* chunks = g_hash_table_new_full(g_int_hash, hash20_equal, free,
			NULL);

	/* RAID Failure part */
	/* 1 - 99 */
	int step = 1;
	/* All chunks lost */
	puts("0");

	if (pophashfile) {
		int popfd = open(pophashfile, O_RDONLY);
		if (popfd < 0) {
			/* Previously read()/close() were silently issued on fd -1. */
			fprintf(stderr, "Cannot open popular hash file %s: %d\n",
					pophashfile, errno);
		} else {
			char pophashbuf[20];
			while (read(popfd, pophashbuf, 20) == 20) {
				char *pophash = malloc(20);
				if (!pophash) {
					fprintf(stderr, "Out of memory\n");
					exit(-1);
				}
				memcpy(pophash, pophashbuf, 20);

				/* restoring a pop chunk */
				memcpy(chunk.hash, pophash, 20);
				/*
				 * The lookup fills in the record used below, so it
				 * must run even when assert() is compiled out.
				 */
				int found = (search_chunk(&chunk) != 0);
				assert(found);
				(void)found;

				int64_t sum = chunk.csize;
				sum *= chunk.rcount;
				restore_chunks += chunk.rcount;

				restore_physical_bytes += chunk.csize;
				restore_logical_bytes += sum;

				int progress = restore_physical_bytes * 100/psize;
				/* Emit one line per whole percent crossed. */
				while (progress >= step && step <= 99) {
					if (weighted) {
						printf("%.6f\n", 1.0*restore_logical_bytes/lsize);
						fprintf(stderr, "%.6f\n", 1.0*restore_logical_bytes/lsize);
					} else {
						printf("%.6f\n", 1.0*restore_chunks/total_chunks);
						fprintf(stderr, "%.6f\n", 1.0*restore_chunks/total_chunks);
					}
					step++;
				}

				assert(!g_hash_table_contains(chunks, pophash));
				g_hash_table_insert(chunks, pophash, NULL);
			}
			close(popfd);
		}
	}

	int pc = 0;
	for (; pc < count; pc++) {
		handle = hashfile_open(path[pc]);

		if (!handle) {
			fprintf(stderr, "Error opening hash file: %d!", errno);
			exit(-1);
		}


		while (1) {
			int ret = hashfile_next_file(handle);
			if (ret < 0) {
				fprintf(stderr,
						"Cannot get next file from a hashfile: %d!\n",
						errno);
				exit(-1);
			}
			if (ret == 0)
				break;

			while (1) {
				ci = hashfile_next_chunk(handle);
				if (!ci) /* exit the loop if it was the last chunk */
					break;

				/* Lookup key = chunk hash followed by the chunk size. */
				int hashsize = hashfile_hash_size(handle)/8;
				int chunksize = ci->size;
				memcpy(chunk.hash, ci->hash, hashsize);
				memcpy(&chunk.hash[hashsize], &chunksize, sizeof(chunksize));
				chunk.hashlen = hashfile_hash_size(handle)/8 + sizeof(chunksize);

				if (!g_hash_table_contains(chunks, chunk.hash)) {
					/* See above: keep the lookup outside assert(). */
					int found = (search_chunk(&chunk) != 0);
					assert(found);
					(void)found;

					int64_t sum = chunk.csize;
					sum *= chunk.rcount;
					restore_chunks += chunk.rcount;

					restore_physical_bytes += chunk.csize;
					restore_logical_bytes += sum;

					int progress = restore_physical_bytes * 100/psize;
					while (progress >= step && step <= 99) {
						if (weighted) {
							printf("%.6f\n", 1.0*restore_logical_bytes/lsize);
							fprintf(stderr, "%.6f\n", 1.0*restore_logical_bytes/lsize);
						} else {
							printf("%.6f\n", 1.0*restore_chunks/total_chunks);
							fprintf(stderr, "%.6f\n", 1.0*restore_chunks/total_chunks);
						}
						step++;
					}
					char* hash = malloc(20);
					if (!hash) {
						fprintf(stderr, "Out of memory\n");
						exit(-1);
					}
					memcpy(hash, chunk.hash, 20);
					g_hash_table_insert(chunks, hash, NULL);
				}
			}
		}
		hashfile_close(handle);
	}
	g_hash_table_destroy(chunks);

	puts("1.0");
}
Пример #3
0
/*
 * File level, no dedup
 * weighted by size?
 */
void file_nodedup_simd_trace(char **path, int count,  int weighted)
{
	if (weighted) {
		printf("FILE:NO DEDUP:WEIGHTED\n");
		fprintf(stderr, "FILE:NO DEDUP:WEIGHTED\n");
	} else {
		printf("FILE:NO DEDUP:NOT WEIGHTED\n");
		fprintf(stderr, "FILE:NO DEDUP:NOT WEIGHTED\n");
	}

	int64_t sys_capacity = 0;
	int64_t sys_file_number = 0;

	init_iterator("CHUNK");

	struct chunk_rec chunk;
	memset(&chunk, 0, sizeof(chunk));
	struct file_rec fr;
	memset(&fr, 0, sizeof(fr));

	/* USE part */
	while (iterate_chunk(&chunk, 0) == 0) {

		int64_t sum = chunk.csize;
		sum *= chunk.rcount;
		sys_capacity += sum;
		int i = 0;
		int prev = -1;
		for (; i<chunk.rcount; i++) {
			int fid = chunk.list[chunk.rcount+i];
			fr.fid = fid;
			search_file(&fr);

			prev = fid;
			if(weighted)
				printf("%"PRId64"\n", fr.fsize);
			else{
				/* A single file is lost */
				/* no need to output */
			}
		}
	}

	close_iterator();

	sys_file_number = get_file_number();

	fprintf(stderr, "capacity = %.4f GB, Files = %"PRId64"\n", 
			1.0*sys_capacity/1024/1024/1024, sys_file_number);

	char buf[4096];
	struct hashfile_handle *handle;
	const struct chunk_info *ci;
	/* RAID Failure part */
	/* All files lost */
	puts("0");

	int64_t restore_bytes = 0;
	int64_t restore_files = 0;
	int64_t restore_file_bytes = 0;

	/* 1 - 99 */
	int step = 1;

	int pc = 0;
	for (; pc < count; pc++) {
		handle = hashfile_open(path[pc]);

		if (!handle) {
			fprintf(stderr, "Error opening hash file: %d!", errno);
			exit(-1);
		}

		while (1) {
			int ret = hashfile_next_file(handle);
			if (ret < 0) {
				fprintf(stderr,
						"Cannot get next file from a hashfile: %d!\n",
						errno);
				exit(-1);
			}
			if (ret == 0)
				break;

			int64_t filesize = 0;
			while (1) {
				ci = hashfile_next_chunk(handle);
				if (!ci) /* exit the loop if it was the last chunk */
					break;

				int progress = restore_bytes * 100 / sys_capacity;
				while(progress >= step && step <= 99){
					if(!weighted)
						printf("%.6f\n", 1.0*restore_files/sys_file_number);
					else
						printf("%.6f\n", 1.0*restore_file_bytes/sys_capacity);
					step++;
				}

				/* It will overflow */
				/*restore_bytes += ci->size;*/
				int size = ci->size;
				restore_bytes += size;
				filesize += size;
			}

			/*if(filesize != hashfile_curfile_size(handle))*/
			/*printf("%"PRId64" is not %"PRIu64"\n", filesize, hashfile_curfile_size(handle));*/
			/*else*/
			/*printf("%"PRId64" == %"PRIu64"\n", filesize, hashfile_curfile_size(handle));*/
			if(filesize == 0)
				continue;

			restore_files++;
			restore_file_bytes += filesize;

		}

		hashfile_close(handle);
	}
	puts("1.0");
}
Пример #4
0
/*
 * File level, with dedup.
 *
 * Phase 1 (USE part) walks every record returned by iterate_chunk() and
 * accumulates logical (csize * rcount) and physical (csize) bytes.  In
 * unweighted mode it prints chunk.fcount per chunk; in weighted mode it
 * prints the summed size of the files listed in the second half of
 * chunk.list, skipping consecutive duplicate file ids.  It then prints the
 * logical/physical ratio.
 *
 * Phase 2 (RAID failure part) prints "0", then replays chunks -- first the
 * raw 20-byte hashes read from pophashfile (when non-NULL), then every
 * chunk of the `count` hash files in `path`.  Each restored chunk
 * decrements the outstanding chunk counter of every file that references
 * it; a file counts as restored when its counter reaches zero.  One
 * fraction is printed for each whole percent (1..99) of the physical size
 * restored, followed by "1.0".
 *
 * path:        array of `count` hash file names.
 * count:       number of entries in path.
 * weighted:    non-zero reports restored file bytes / logical bytes; zero
 *              reports restored files / total files.
 * pophashfile: optional file of raw 20-byte hashes restored first; may be
 *              NULL.  If it cannot be opened a warning is printed and the
 *              step is skipped.
 *
 * Exits the process with status -1 when a hash file cannot be opened or
 * iterated, or when memory allocation fails.
 */
void file_dedup_simd_trace(char** path, int count,  int weighted, char *pophashfile)
{
	if (weighted) {
		printf("FILE:DEDUP:WEIGHTED\n");
		fprintf(stderr, "FILE:DEDUP:WEIGHTED\n");
	} else {
		printf("FILE:DEDUP:NOT WEIGHTED\n");
		fprintf(stderr, "FILE:DEDUP:NOT WEIGHTED\n");
	}

	init_iterator("CHUNK");

	struct chunk_rec chunk;
	memset(&chunk, 0, sizeof(chunk));
	struct file_rec fr;
	memset(&fr, 0, sizeof(fr));

	/* USE part */
	int64_t psize = 0;
	int64_t lsize = 0;
	while (iterate_chunk(&chunk, 0) == 0) {

		/* Widen to 64 bits before multiplying to avoid overflow. */
		int64_t sum = chunk.csize;
		sum *= chunk.rcount;
		lsize += sum;
		psize += chunk.csize;
		if (!weighted) {
			printf("%d\n", chunk.fcount);
		} else {
			int i = 0;
			int prev = -1;
			/* Renamed from "sum": it used to shadow the outer variable. */
			int64_t fsum = 0;
			for (; i<chunk.rcount; i++) {
				int fid = chunk.list[chunk.rcount+i];
				/* Count a file once per run of identical ids. */
				if (fid == prev)
					continue;
				fr.fid = fid;
				search_file(&fr);

				fsum += fr.fsize;
				prev = fid;
			}
			printf("%"PRId64"\n", fsum);
		}
	}

	printf("%.6f\n", 1.0*lsize/psize);
	fprintf(stderr, "LS = %.4f GB, PS = %.4f GB, D/F = %.4f\n",
			1.0*lsize/1024/1024/1024,
			1.0*psize/1024/1024/1024, 1.0*lsize/psize);

	close_iterator();

	struct hashfile_handle *handle;
	const struct chunk_info *ci;

	int64_t sys_file_number = get_file_number();

	/* All files lost */
	puts("0");

	int64_t restore_bytes = 0;
	int64_t restore_files = 0;
	int64_t restore_file_bytes = 0;

	/* RAID Failure part */
	/* 1 - 99 */
	int step = 1;

	/* fid -> restoring_file; the key points into the value, which the table frees. */
	GHashTable* files = g_hash_table_new_full(g_int_hash, g_int_equal,
			NULL, free);
	/* Set of already restored chunks; keys are heap copies owned by the table. */
	GHashTable* chunks = g_hash_table_new_full(g_int_hash, hash20_equal,
			free, NULL);

	if (pophashfile) {
		int popfd = open(pophashfile, O_RDONLY);
		if (popfd < 0) {
			/* Previously read()/close() were silently issued on fd -1. */
			fprintf(stderr, "Cannot open popular hash file %s: %d\n",
					pophashfile, errno);
		} else {
			char pophashbuf[20];
			while (read(popfd, pophashbuf, 20) == 20) {
				char *pophash = malloc(20);
				if (!pophash) {
					fprintf(stderr, "Out of memory\n");
					exit(-1);
				}
				memcpy(pophash, pophashbuf, 20);

				/* restoring a pop chunk */
				memcpy(chunk.hash, pophash, 20);
				/*
				 * The lookup fills in the record used below, so it
				 * must run even when assert() is compiled out.
				 */
				int found = (search_chunk(&chunk) != 0);
				assert(found);
				(void)found;

				int i = 0;
				for (;i < chunk.rcount; i++) {
					int fid = chunk.list[chunk.rcount + i];
					struct restoring_file* rfile = g_hash_table_lookup(files, &fid);
					if (!rfile) {
						fr.fid = fid;
						search_file(&fr);

						rfile = malloc(sizeof(*rfile));
						if (!rfile) {
							fprintf(stderr, "Out of memory\n");
							exit(-1);
						}

						rfile->id = fid;
						rfile->chunk_num = fr.cnum;
						rfile->size = fr.fsize;

						g_hash_table_insert(files, &rfile->id, rfile);
					}
					rfile->chunk_num--;

					if (rfile->chunk_num == 0) {
						/* a file is restored */
						restore_files++;
						restore_file_bytes += rfile->size;
					}
					assert(rfile->chunk_num >= 0);
				}

				restore_bytes += chunk.csize;
				int progress = restore_bytes * 100 / psize;
				/* Emit one line per whole percent crossed. */
				while (progress >= step && step <= 99) {
					if (!weighted)
						printf("%.6f\n", 1.0*restore_files/sys_file_number);
					else
						printf("%.6f\n", 1.0*restore_file_bytes/lsize);
					step++;
				}

				assert(!g_hash_table_contains(chunks, pophash));
				g_hash_table_insert(chunks, pophash, NULL);
			}
			close(popfd);
		}
	}

	int pc = 0;
	for (; pc < count; pc++) {

		handle = hashfile_open(path[pc]);

		if (!handle) {
			fprintf(stderr, "Error opening hash file: %d!", errno);
			exit(-1);
		}

		while (1) {
			int ret = hashfile_next_file(handle);
			if (ret < 0) {
				fprintf(stderr,
						"Cannot get next file from a hashfile: %d!\n",
						errno);
				exit(-1);
			}
			if (ret == 0)
				break;

			while (1) {
				ci = hashfile_next_chunk(handle);
				if (!ci) /* exit the loop if it was the last chunk */
					break;

				/* Lookup key = chunk hash followed by the chunk size. */
				int hashsize = hashfile_hash_size(handle)/8;
				int chunksize = ci->size;
				memcpy(chunk.hash, ci->hash, hashsize);
				memcpy(&chunk.hash[hashsize], &chunksize, sizeof(chunksize));
				chunk.hashlen = hashfile_hash_size(handle)/8 + sizeof(chunksize);

				if (!g_hash_table_contains(chunks, chunk.hash)) {
					/* restore a chunk */
					/* See above: keep the lookup outside assert(). */
					int found = (search_chunk(&chunk) != 0);
					assert(found);
					(void)found;

					int i = 0;
					for (; i < chunk.rcount; i++) {
						int fid = chunk.list[chunk.rcount + i];
						struct restoring_file* rfile =
							g_hash_table_lookup(files, &fid);
						if (!rfile) {
							fr.fid = fid;
							search_file(&fr);

							rfile = malloc(sizeof(*rfile));
							if (!rfile) {
								fprintf(stderr, "Out of memory\n");
								exit(-1);
							}

							rfile->id = fid;
							rfile->chunk_num = fr.cnum;
							rfile->size = fr.fsize;

							g_hash_table_insert(files, &rfile->id, rfile);
						}
						rfile->chunk_num--;

						if(rfile->chunk_num == 0){
							/* a file is restored */
							restore_files++;
							restore_file_bytes += rfile->size;
						}
						assert(rfile->chunk_num >= 0);
					}

					restore_bytes += chunk.csize;
					int progress = restore_bytes * 100/psize;
					while (progress >= step && step <= 99) {
						if (!weighted)
							printf("%.6f\n", 1.0*restore_files/sys_file_number);
						else
							printf("%.6f\n", 1.0*restore_file_bytes/lsize);
						step++;
					}
					char* hash = malloc(20);
					if (!hash) {
						fprintf(stderr, "Out of memory\n");
						exit(-1);
					}
					memcpy(hash, chunk.hash, 20);
					g_hash_table_insert(chunks, hash, hash);
				}
			}
		}

		hashfile_close(handle);
	}
	puts("1.0");

	g_hash_table_destroy(files);
	g_hash_table_destroy(chunks);
	fprintf(stderr, "restore %.4f GB\n", 1.0*restore_file_bytes/1024/1024/1024);

}
Пример #5
0
/*
 * Scan a hash file and look for hash collisions file by file.
 *
 * For every file in the hash file this builds a per-file table of its
 * distinct chunks (key: hash bytes followed by the chunk size), tracks the
 * smallest 20-byte key seen in `minhash`, reports chunks that share a key
 * with an earlier chunk of the same file but differ in size, and then hands
 * the table to check_curfile().  Summary statistics are printed to stderr
 * and stdout at the end.
 *
 * The function reads and updates several variables declared outside of it
 * (chunkset, minhash, suffix, collisions, dup_chunks, file_count,
 * chunks_read_back, detected_collisions).
 *
 * hashfile_name: path of the hash file to scan.
 *
 * Returns 0 on success and -1 on failure; a diagnostic is written to
 * stderr.  The handle and the tables created here are released on the
 * error paths as well.
 */
static int detect_by_file_minhash(char *hashfile_name)
{
    struct hashfile_handle *handle;
    const struct chunk_info *ci;
    int ret;
    int total_chunks = 0;

    handle = hashfile_open(hashfile_name);
    if (!handle) {
        fprintf(stderr, "Error opening hash file: %d!", errno);
        return -1;
    }

    /* Created only after the open succeeded so a failed open leaks nothing. */
    chunkset = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free);

    /* Go over the files in a hashfile */
    while (1) {
        ret = hashfile_next_file(handle);
        if (ret < 0) {
            fprintf(stderr,
                    "Cannot get next file from a hashfile: %d!\n",
                    errno);
            hashfile_close(handle);
            g_hash_table_destroy(chunkset);
            return -1;
        }

        /* exit the loop if it was the last file */
        if (ret == 0)
            break;

        /* Start from the largest possible value so any key compares lower. */
        memset(minhash, 0xff, 20);

        parse_file_suffix(hashfile_curfile_path(handle), suffix, 8);
        if(strncmp(suffix, "edu,", 4) == 0){
            strcpy(suffix, "edu,?");
        }else if(strlen(suffix) == 0){
            strcpy(suffix, ".None");
        }

        /* Distinct chunks of the current file; the table owns the values. */
        GHashTable *curfile = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free);

        while (1) {
            ci = hashfile_next_chunk(handle);
            if (!ci) /* exit the loop if it was the last chunk */
                break;

            /* calloc zero-fills, replacing the former malloc + memset pair. */
            struct chunk_item *chunk = calloc(1, sizeof(*chunk));
            if (!chunk) {
                fprintf(stderr, "Out of memory\n");
                g_hash_table_destroy(curfile);
                hashfile_close(handle);
                g_hash_table_destroy(chunkset);
                return -1;
            }

            /* Key = chunk hash followed by the chunk size. */
            chunk->size = ci->size;
            memcpy(chunk->hash, ci->hash, hashfile_hash_size(handle)/8);
            memcpy(chunk->hash+hashfile_hash_size(handle)/8, &chunk->size, sizeof(chunk->size));

            chunk->rc = 1;

            chunk->fsize = hashfile_curfile_size(handle);

            if(memcmp(chunk->hash, minhash, 20) < 0){
                memcpy(minhash, chunk->hash, 20);
            }

            struct chunk_item* target = g_hash_table_lookup(curfile, chunk->hash);
            if(target){
                if(target->size != chunk->size){
                    /* Cast so the argument matches the %lld conversion. */
                    fprintf(stderr, "+Find an intra-file collision! Cannot be detected! File size = %lld, Type = %s\n", 
                            (long long)hashfile_curfile_size(handle), suffix);
                    collisions++;
                }
                free(chunk);
                dup_chunks++;
            }else{
                g_hash_table_insert(curfile, chunk->hash, chunk);
            }

            total_chunks++;
        }

        check_curfile(curfile);
        g_hash_table_destroy(curfile);
        file_count++;

    }

    hashfile_close(handle);

    g_hash_table_destroy(chunkset);

    fprintf(stderr, "# of chunks read back: %d; %.4f of total chunks, %.4f of dup chunks\n", 
            chunks_read_back, 1.0*chunks_read_back/total_chunks,
            1.0*chunks_read_back/dup_chunks);
    fprintf(stderr, "# of hash collisions: %d; %d detected\n", collisions, detected_collisions);
    printf("%d %d %.4f %.4f\n", collisions, detected_collisions, 1.0*chunks_read_back/total_chunks, 1.0*chunks_read_back/dup_chunks);
    return 0;
}