/*
 * Populate the descriptive fields of an image from a system's hash file.
 *
 * Opens the hash file for `sysname`, looks up the entry for image->hash and,
 * when one exists, copies its longname, manufacturer, year, playable and
 * extrainfo strings onto `image` through image_strdup().
 *
 * Returns 0 when an entry was found and copied, and 1 when either the hash
 * file could not be opened or it holds no entry for this image.
 */
static int read_hash_config(const char *sysname, mess_image *image)
{
	hash_file *hashfile;
	const struct hash_info *info;

	hashfile = hashfile_open(sysname, FALSE, NULL);
	if (hashfile == NULL)
		return 1;

	info = hashfile_lookup(hashfile, image->hash);
	if (info != NULL)
	{
		image->longname     = image_strdup(image, info->longname);
		image->manufacturer = image_strdup(image, info->manufacturer);
		image->year         = image_strdup(image, info->year);
		image->playable     = image_strdup(image, info->playable);
		image->extrainfo    = image_strdup(image, info->extrainfo);
	}

	/* The file is only needed for the lookup; release it on both outcomes. */
	hashfile_close(hashfile);
	return (info == NULL) ? 1 : 0;
}
static int read_hashfile(char *hashfile_name) { char buf[MAXLINE]; struct hashfile_handle *handle; const struct chunk_info *ci; uint64_t chunk_count; time_t scan_start_time; int ret; handle = hashfile_open(hashfile_name); if (!handle) { fprintf(stderr, "Error opening hash file: %d", errno); return -1; } /* Print some information about the hash file */ scan_start_time = hashfile_start_time(handle); printf("Collected at [%s] on %s", hashfile_sysid(handle), ctime(&scan_start_time)); ret = hashfile_chunking_method_str(handle, buf, MAXLINE); if (ret < 0) { fprintf(stderr, "Unrecognized chunking method: %d", errno); return -1; } printf("Chunking method: %s", buf); ret = hashfile_hashing_method_str(handle, buf, MAXLINE); if (ret < 0) { fprintf(stderr, "Unrecognized hashing method: %d", errno); return -1; } printf("Hashing method: %s\n", buf); /* Go over the files in a hashfile */ printf("== List of files and hashes ==\n"); while (1) { ret = hashfile_next_file(handle); if (ret < 0) { fprintf(stderr, "Cannot get next file from the hashfile: %d\n", errno); return -1; } /* exit the loop if it was the last file */ if (ret == 0) break; printf("File path: %s\n", hashfile_curfile_path(handle)); printf("File size: %"PRIu64 " B\n", hashfile_curfile_size(handle)); printf("Chunks number: %" PRIu64 "\n", hashfile_curfile_numchunks(handle)); /* Go over the chunks in the current file */ chunk_count = 0; while (1) { ci = hashfile_next_chunk(handle); if (!ci) /* exit the loop if it was the last chunk */ break; chunk_count++; print_chunk_hash(chunk_count, ci->hash, hashfile_hash_size(handle) / 8); } } hashfile_close(handle); return 0; }
/* * Chunk level, without file semantics * Dedup * (no trace for chunk-level no-dedup model) */ void chunk_dedup_simd_trace(char **path, int count, int weighted, char *pophashfile) { if (weighted) { fprintf(stderr, "CHUNK:DEDUP:WEIGHTED\n"); printf("CHUNK:DEDUP:WEIGHTED\n"); } else { fprintf(stderr, "CHUNK:DEDUP:NOT WEIGHTED\n"); printf("CHUNK:DEDUP:NOT WEIGHTED\n"); } init_iterator("CHUNK"); struct chunk_rec chunk; memset(&chunk, 0, sizeof(chunk)); int64_t psize = 0; int64_t lsize = 0; int64_t total_chunks = 0; /* USE part */ int64_t sum4mean = 0; int64_t count4mean = 0; while (iterate_chunk(&chunk, 0) == 0) { int64_t sum = chunk.csize; sum *= chunk.rcount; lsize += sum; psize += chunk.csize; total_chunks += chunk.rcount; if (weighted) { sum4mean += sum * chunk.csize; count4mean += chunk.csize; print_a_chunk(chunk.csize, sum); } else { sum4mean += sum; count4mean += chunk.csize; print_a_chunk(chunk.csize, chunk.rcount); } } printf("%.6f\n", 1.0*lsize/psize); fprintf(stderr, "D/F = %.4f, total_chunks = %"PRId64"\n", 1.0*lsize/psize, total_chunks); fprintf(stderr, "mean = %.4f, per DF = %.6f\n", 1.0*sum4mean/count4mean, 1.0*sum4mean*psize/count4mean/lsize); close_iterator(); char buf[4096]; struct hashfile_handle *handle; const struct chunk_info *ci; int64_t restore_logical_bytes = 0; int64_t restore_physical_bytes = 0; int64_t restore_chunks = 0; GHashTable* chunks = g_hash_table_new_full(g_int_hash, hash20_equal, free, NULL); /* RAID Failure part */ /* 1 - 99 */ int step = 1; /* All chunks lost */ puts("0"); if (pophashfile) { int popfd = open(pophashfile, O_RDONLY); char pophashbuf[20]; while (read(popfd, pophashbuf, 20) == 20) { char *pophash = malloc(20); memcpy(pophash, pophashbuf, 20); /* restoring a pop chunk */ memcpy(chunk.hash, pophash, 20); assert(search_chunk(&chunk)); int64_t sum = chunk.csize; sum *= chunk.rcount; restore_chunks += chunk.rcount; restore_physical_bytes += chunk.csize; restore_logical_bytes += sum; int progress = restore_physical_bytes 
* 100/psize; while (progress >= step && step <= 99) { if (weighted) { printf("%.6f\n", 1.0*restore_logical_bytes/lsize); fprintf(stderr, "%.6f\n", 1.0*restore_logical_bytes/lsize); } else { printf("%.6f\n", 1.0*restore_chunks/total_chunks); fprintf(stderr, "%.6f\n", 1.0*restore_chunks/total_chunks); } step++; } assert(!g_hash_table_contains(chunks, pophash)); g_hash_table_insert(chunks, pophash, NULL); } close(popfd); } int pc = 0; for (; pc < count; pc++) { handle = hashfile_open(path[pc]); if (!handle) { fprintf(stderr, "Error opening hash file: %d!", errno); exit(-1); } while (1) { int ret = hashfile_next_file(handle); if (ret < 0) { fprintf(stderr, "Cannot get next file from a hashfile: %d!\n", errno); exit(-1); } if (ret == 0) break; while (1) { ci = hashfile_next_chunk(handle); if (!ci) /* exit the loop if it was the last chunk */ break; int hashsize = hashfile_hash_size(handle)/8; int chunksize = ci->size; memcpy(chunk.hash, ci->hash, hashsize); memcpy(&chunk.hash[hashsize], &chunksize, sizeof(chunksize)); chunk.hashlen = hashfile_hash_size(handle)/8 + sizeof(chunksize); if (!g_hash_table_contains(chunks, chunk.hash)) { assert(search_chunk(&chunk)); int64_t sum = chunk.csize; sum *= chunk.rcount; restore_chunks += chunk.rcount; restore_physical_bytes += chunk.csize; restore_logical_bytes += sum; int progress = restore_physical_bytes * 100/psize; while (progress >= step && step <= 99) { if (weighted) { printf("%.6f\n", 1.0*restore_logical_bytes/lsize); fprintf(stderr, "%.6f\n", 1.0*restore_logical_bytes/lsize); } else { printf("%.6f\n", 1.0*restore_chunks/total_chunks); fprintf(stderr, "%.6f\n", 1.0*restore_chunks/total_chunks); } step++; } char* hash = malloc(20); memcpy(hash, chunk.hash, 20); g_hash_table_insert(chunks, hash, NULL); } } } hashfile_close(handle); } g_hash_table_destroy(chunks); puts("1.0"); }
void file_dedup_simd_trace(char** path, int count, int weighted, char *pophashfile) { if (weighted) { printf("FILE:DEDUP:WEIGHTED\n"); fprintf(stderr, "FILE:DEDUP:WEIGHTED\n"); } else { printf("FILE:DEDUP:NOT WEIGHTED\n"); fprintf(stderr, "FILE:DEDUP:NOT WEIGHTED\n"); } init_iterator("CHUNK"); struct chunk_rec chunk; memset(&chunk, 0, sizeof(chunk)); struct file_rec fr; memset(&fr, 0, sizeof(fr)); /* USE part */ int64_t psize = 0; int64_t lsize = 0; while (iterate_chunk(&chunk, 0) == 0) { int64_t sum = chunk.csize; sum *= chunk.rcount; lsize += sum; psize += chunk.csize; if (!weighted) { printf("%d\n", chunk.fcount); } else { int i = 0; int prev = -1; int64_t sum = 0; for (; i<chunk.rcount; i++) { int fid = chunk.list[chunk.rcount+i]; if (fid == prev) continue; fr.fid = fid; search_file(&fr); sum+=fr.fsize; prev = fid; } printf("%"PRId64"\n", sum); } } printf("%.6f\n", 1.0*lsize/psize); fprintf(stderr, "LS = %.4f GB, PS = %.4f GB, D/F = %.4f\n", 1.0*lsize/1024/1024/1024, 1.0*psize/1024/1024/1024, 1.0*lsize/psize); close_iterator(); char buf[4096]; struct hashfile_handle *handle; const struct chunk_info *ci; int64_t sys_file_number = get_file_number(); /* All files lost */ puts("0"); int64_t restore_bytes = 0; int64_t restore_files = 0; int64_t restore_file_bytes = 0; /* RAID Failure part */ /* 1 - 99 */ int step = 1; GHashTable* files = g_hash_table_new_full(g_int_hash, g_int_equal, NULL, free); GHashTable* chunks = g_hash_table_new_full(g_int_hash, hash20_equal, free, NULL); if (pophashfile) { int popfd = open(pophashfile, O_RDONLY); char pophashbuf[20]; while (read(popfd, pophashbuf, 20) == 20) { char *pophash = malloc(20); memcpy(pophash, pophashbuf, 20); /* restoring a pop chunk */ memcpy(chunk.hash, pophash, 20); assert(search_chunk(&chunk)); int i = 0; for (;i < chunk.rcount; i++) { int fid = chunk.list[chunk.rcount + i]; struct restoring_file* rfile = g_hash_table_lookup(files, &fid); if (!rfile) { fr.fid = fid; search_file(&fr); rfile = 
malloc(sizeof(*rfile)); rfile->id = fid; rfile->chunk_num = fr.cnum; rfile->size = fr.fsize; g_hash_table_insert(files, &rfile->id, rfile); } rfile->chunk_num--; if (rfile->chunk_num == 0) { /* a file is restored */ /*fprintf(stderr, "complete file %d\n", fid);*/ restore_files++; restore_file_bytes += rfile->size; } assert(rfile->chunk_num >= 0); } restore_bytes += chunk.csize; int progress = restore_bytes * 100 / psize; while (progress >= step && step <= 99) { if (!weighted) printf("%.6f\n", 1.0*restore_files/sys_file_number); else printf("%.6f\n", 1.0*restore_file_bytes/lsize); step++; } assert(!g_hash_table_contains(chunks, pophash)); g_hash_table_insert(chunks, pophash, NULL); } close(popfd); } int pc = 0; for (; pc < count; pc++) { handle = hashfile_open(path[pc]); if (!handle) { fprintf(stderr, "Error opening hash file: %d!", errno); exit(-1); } while (1) { int ret = hashfile_next_file(handle); if (ret < 0) { fprintf(stderr, "Cannot get next file from a hashfile: %d!\n", errno); exit(-1); } if (ret == 0) break; while (1) { ci = hashfile_next_chunk(handle); if (!ci) /* exit the loop if it was the last chunk */ break; int hashsize = hashfile_hash_size(handle)/8; int chunksize = ci->size; memcpy(chunk.hash, ci->hash, hashsize); memcpy(&chunk.hash[hashsize], &chunksize, sizeof(chunksize)); chunk.hashlen = hashfile_hash_size(handle)/8 + sizeof(chunksize); if (!g_hash_table_contains(chunks, chunk.hash)) { /* restore a chunk */ assert(search_chunk(&chunk)); int i = 0; for (; i < chunk.rcount; i++) { int fid = chunk.list[chunk.rcount + i]; struct restoring_file* rfile = g_hash_table_lookup(files, &fid); if (!rfile) { fr.fid = fid; search_file(&fr); rfile = malloc(sizeof(*rfile)); rfile->id = fid; rfile->chunk_num = fr.cnum; rfile->size = fr.fsize; g_hash_table_insert(files, &rfile->id, rfile); } rfile->chunk_num--; if(rfile->chunk_num == 0){ /* a file is restored */ /*fprintf(stderr, "complete file %d\n", fid);*/ restore_files++; restore_file_bytes += rfile->size; } 
assert(rfile->chunk_num >= 0); } restore_bytes += chunk.csize; int progress = restore_bytes * 100/psize; while (progress >= step && step <= 99) { if (!weighted) printf("%.6f\n", 1.0*restore_files/sys_file_number); else printf("%.6f\n", 1.0*restore_file_bytes/lsize); step++; } char* hash = malloc(20); memcpy(hash, chunk.hash, 20); g_hash_table_insert(chunks, hash, hash); } } } hashfile_close(handle); } puts("1.0"); g_hash_table_destroy(files); g_hash_table_destroy(chunks); fprintf(stderr, "restore %.4f GB\n", 1.0*restore_file_bytes/1024/1024/1024); }
/* * File level, no dedup * weighted by size? */ void file_nodedup_simd_trace(char **path, int count, int weighted) { if (weighted) { printf("FILE:NO DEDUP:WEIGHTED\n"); fprintf(stderr, "FILE:NO DEDUP:WEIGHTED\n"); } else { printf("FILE:NO DEDUP:NOT WEIGHTED\n"); fprintf(stderr, "FILE:NO DEDUP:NOT WEIGHTED\n"); } int64_t sys_capacity = 0; int64_t sys_file_number = 0; init_iterator("CHUNK"); struct chunk_rec chunk; memset(&chunk, 0, sizeof(chunk)); struct file_rec fr; memset(&fr, 0, sizeof(fr)); /* USE part */ while (iterate_chunk(&chunk, 0) == 0) { int64_t sum = chunk.csize; sum *= chunk.rcount; sys_capacity += sum; int i = 0; int prev = -1; for (; i<chunk.rcount; i++) { int fid = chunk.list[chunk.rcount+i]; fr.fid = fid; search_file(&fr); prev = fid; if(weighted) printf("%"PRId64"\n", fr.fsize); else{ /* A single file is lost */ /* no need to output */ } } } close_iterator(); sys_file_number = get_file_number(); fprintf(stderr, "capacity = %.4f GB, Files = %"PRId64"\n", 1.0*sys_capacity/1024/1024/1024, sys_file_number); char buf[4096]; struct hashfile_handle *handle; const struct chunk_info *ci; /* RAID Failure part */ /* All files lost */ puts("0"); int64_t restore_bytes = 0; int64_t restore_files = 0; int64_t restore_file_bytes = 0; /* 1 - 99 */ int step = 1; int pc = 0; for (; pc < count; pc++) { handle = hashfile_open(path[pc]); if (!handle) { fprintf(stderr, "Error opening hash file: %d!", errno); exit(-1); } while (1) { int ret = hashfile_next_file(handle); if (ret < 0) { fprintf(stderr, "Cannot get next file from a hashfile: %d!\n", errno); exit(-1); } if (ret == 0) break; int64_t filesize = 0; while (1) { ci = hashfile_next_chunk(handle); if (!ci) /* exit the loop if it was the last chunk */ break; int progress = restore_bytes * 100 / sys_capacity; while(progress >= step && step <= 99){ if(!weighted) printf("%.6f\n", 1.0*restore_files/sys_file_number); else printf("%.6f\n", 1.0*restore_file_bytes/sys_capacity); step++; } /* It will overflow */ 
/*restore_bytes += ci->size;*/ int size = ci->size; restore_bytes += size; filesize += size; } /*if(filesize != hashfile_curfile_size(handle))*/ /*printf("%"PRId64" is not %"PRIu64"\n", filesize, hashfile_curfile_size(handle));*/ /*else*/ /*printf("%"PRId64" == %"PRIu64"\n", filesize, hashfile_curfile_size(handle));*/ if(filesize == 0) continue; restore_files++; restore_file_bytes += filesize; } hashfile_close(handle); } puts("1.0"); }
static int detect_by_file_minhash(char *hashfile_name) { char buf[MAXLINE]; struct hashfile_handle *handle; const struct chunk_info *ci; time_t scan_start_time; int ret; handle = hashfile_open(hashfile_name); int total_chunks = 0; chunkset = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free); if (!handle) { fprintf(stderr, "Error opening hash file: %d!", errno); return -1; } /* Go over the files in a hashfile */ while (1) { ret = hashfile_next_file(handle); if (ret < 0) { fprintf(stderr, "Cannot get next file from a hashfile: %d!\n", errno); return -1; } /* exit the loop if it was the last file */ if (ret == 0) break; memset(minhash, 0xff, 20); parse_file_suffix(hashfile_curfile_path(handle), suffix, 8); if(strncmp(suffix, "edu,", 4) == 0){ strcpy(suffix, "edu,?"); }else if(strlen(suffix) == 0){ strcpy(suffix, ".None"); } GHashTable *curfile = g_hash_table_new_full(g_int_hash, hash_equal, NULL, free); while (1) { ci = hashfile_next_chunk(handle); if (!ci) /* exit the loop if it was the last chunk */ break; struct chunk_item *chunk = malloc(sizeof(struct chunk_item)); memset(chunk, 0, sizeof(*chunk)); chunk->size = ci->size; memcpy(chunk->hash, ci->hash, hashfile_hash_size(handle)/8); memcpy(chunk->hash+hashfile_hash_size(handle)/8, &chunk->size, sizeof(chunk->size)); chunk->rc = 1; chunk->fsize = hashfile_curfile_size(handle); if(memcmp(chunk->hash, minhash, 20) < 0){ memcpy(minhash, chunk->hash, 20); } struct chunk_item* target = g_hash_table_lookup(curfile, chunk->hash); if(target){ if(target->size != chunk->size){ fprintf(stderr, "+Find an intra-file collision! Cannot be detected! 
File size = %lld, Type = %s\n", hashfile_curfile_size(handle), suffix); collisions++; } free(chunk); dup_chunks++; }else{ g_hash_table_insert(curfile, chunk->hash, chunk); } total_chunks++; } check_curfile(curfile); g_hash_table_destroy(curfile); file_count++; } hashfile_close(handle); g_hash_table_destroy(chunkset); fprintf(stderr, "# of chunks read back: %d; %.4f of total chunks, %.4f of dup chunks\n", chunks_read_back, 1.0*chunks_read_back/total_chunks, 1.0*chunks_read_back/dup_chunks); fprintf(stderr, "# of hash collisions: %d; %d detected\n", collisions, detected_collisions); printf("%d %d %.4f %.4f\n", collisions, detected_collisions, 1.0*chunks_read_back/total_chunks, 1.0*chunks_read_back/dup_chunks); return 0; }