/*
 * Read the backup recipe, pushing a CHUNK_FILE_START chunk (carrying the
 * filename), the file's chunk pointers, and a CHUNK_FILE_END chunk into
 * restore_recipe_queue for each file.
 */
static void* read_recipe_thread(void *arg) {
	int i, j, k;
	for (i = 0; i < jcr.bv->number_of_files; i++) {
		TIMER_DECLARE(1);
		TIMER_BEGIN(1);

		struct fileRecipeMeta *r = read_next_file_recipe_meta(jcr.bv);

		struct chunk *c = new_chunk(sdslen(r->filename) + 1);
		strcpy(c->data, r->filename);
		SET_CHUNK(c, CHUNK_FILE_START);

		TIMER_END(1, jcr.read_recipe_time);

		sync_queue_push(restore_recipe_queue, c);
		jcr.file_num++;

		for (j = 0; j < r->chunknum; j++) {
			TIMER_DECLARE(1);
			TIMER_BEGIN(1);

			struct chunkPointer *cp = read_next_n_chunk_pointers(jcr.bv, 1, &k);

			struct chunk *c = new_chunk(0);
			memcpy(&c->fp, &cp->fp, sizeof(fingerprint));
			c->size = cp->size;
			c->id = cp->id;

			TIMER_END(1, jcr.read_recipe_time);

			jcr.data_size += c->size;
			jcr.chunk_num++;
			sync_queue_push(restore_recipe_queue, c);
			free(cp);
		}

		c = new_chunk(0);
		SET_CHUNK(c, CHUNK_FILE_END);
		sync_queue_push(restore_recipe_queue, c);

		free_file_recipe_meta(r);
	}

	sync_queue_term(restore_recipe_queue);

	return NULL;
}
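/*
 * A minimal sketch (illustrative only, not part of this codebase) of the
 * consumer side of restore_recipe_queue. It relies only on primitives
 * already used in this file: sync_queue_pop() returns NULL after
 * sync_queue_term(), and CHUNK_FILE_START/CHUNK_FILE_END bracket the
 * chunk pointers of each file.
 */
static void* example_recipe_consumer(void *arg) {
	struct chunk *c;
	while ((c = sync_queue_pop(restore_recipe_queue)) != NULL) {
		if (CHECK_CHUNK(c, CHUNK_FILE_START)) {
			/* c->data holds the filename written by read_recipe_thread */
			VERBOSE("Restoring file %s", c->data);
		} else if (!CHECK_CHUNK(c, CHUNK_FILE_END)) {
			/* an ordinary chunk pointer: c->fp, c->size, and c->id */
		}
		free_chunk(c);
	}
	return NULL;
}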
/*
 * HAR (History-Aware Rewriting): mark a duplicate chunk as sparse if it
 * resides in a container that the previous backup identified as sparse.
 */
void har_check(struct chunk *c) {
	if (!CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END)
			&& CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
		if (g_hash_table_lookup(inherited_sparse_containers, &c->id)) {
			SET_CHUNK(c, CHUNK_SPARSE);
			char code[41];
			hash2code(c->fp, code);
			code[40] = 0;
			VERBOSE("chunk %s in sparse container %lld", code, c->id);
		}
	}
}
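/*
 * A sketch, under the assumption (not shown in this file) that
 * inherited_sparse_containers is a GHashTable keyed by container id.
 * The g_hash_table_lookup(..., &c->id) in har_check() above requires
 * keys hashed and compared as 64-bit integers, e.g.:
 */
static GHashTable* example_new_sparse_container_table(void) {
	/* free() the malloc'ed key when an entry is removed */
	GHashTable *t = g_hash_table_new_full(g_int64_hash, g_int64_equal, free, NULL);
	int64_t *id = malloc(sizeof(int64_t));
	*id = 42; /* hypothetical sparse container id */
	/* the value only needs to be non-NULL for the lookup above */
	g_hash_table_insert(t, id, id);
	return t;
}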
void send_segment(struct segment *s) {
	/*
	 * CHUNK_SEGMENT_START and _END are used for
	 * reconstructing the segment in the filter phase.
	 */
	struct chunk *ss = new_chunk(0);
	SET_CHUNK(ss, CHUNK_SEGMENT_START);
	sync_queue_push(dedup_queue, ss);

	GSequenceIter *end = g_sequence_get_end_iter(s->chunks);
	GSequenceIter *begin = g_sequence_get_begin_iter(s->chunks);
	while (begin != end) {
		struct chunk *c = g_sequence_get(begin);
		if (!CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END)) {
			if (CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
				if (c->id == TEMPORARY_ID) {
					DEBUG("Dedup phase: %ldth chunk is identical to a unique chunk",
							chunk_num++);
				} else {
					DEBUG("Dedup phase: %ldth chunk is duplicate in container %lld",
							chunk_num++, c->id);
				}
			} else {
				DEBUG("Dedup phase: %ldth chunk is unique", chunk_num++);
			}
		}

		sync_queue_push(dedup_queue, c);
		g_sequence_remove(begin);
		begin = g_sequence_get_begin_iter(s->chunks);
	}

	struct chunk *se = new_chunk(0);
	SET_CHUNK(se, CHUNK_SEGMENT_END);
	sync_queue_push(dedup_queue, se);

	s->chunk_num = 0;
}
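/*
 * Illustrative usage (hypothetical caller): how a dedup-phase routine
 * might fill a segment and hand it to send_segment(). new_segment()
 * and the GSequence s->chunks come from this codebase; the chunk array
 * is assumed for the example.
 */
static void example_emit_segment(struct chunk **chunks, int n) {
	struct segment *s = new_segment();
	int i;
	for (i = 0; i < n; i++)
		g_sequence_append(s->chunks, chunks[i]);
	s->chunk_num = n;
	/* send_segment() drains s->chunks and brackets the stream with
	 * CHUNK_SEGMENT_START/CHUNK_SEGMENT_END marker chunks. */
	send_segment(s);
}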
static void* read_trace_thread(void *argv) {
	FILE *trace_file = fopen(jcr.path, "r");
	char line[128];

	while (1) {
		TIMER_DECLARE(1);
		TIMER_BEGIN(1);

		char *ret = fgets(line, 128, trace_file);

		TIMER_END(1, jcr.read_time);

		/* strncmp tolerates the trailing '\n' kept by fgets;
		 * an unexpected EOF also terminates the stream. */
		if (ret == NULL || strncmp(line, "stream end", 10) == 0) {
			sync_queue_term(trace_queue);
			break;
		}

		struct chunk *c;

		TIMER_BEGIN(1);
		assert(strncmp(line, "file start ", 11) == 0);
		int filenamelen;
		sscanf(line, "file start %d", &filenamelen);

		/* Besides the filename, an additional '\n' is read */
		c = new_chunk(filenamelen + 2);
		fgets(c->data, filenamelen + 2, trace_file);
		c->data[filenamelen] = 0;
		VERBOSE("Reading: %s", c->data);
		SET_CHUNK(c, CHUNK_FILE_START);

		TIMER_END(1, jcr.read_time);

		sync_queue_push(trace_queue, c);

		TIMER_BEGIN(1);
		fgets(line, 128, trace_file);
		while (strncmp(line, "file end", 8) != 0) {
			c = new_chunk(0);

			char code[41];
			strncpy(code, line, 40);
			code[40] = 0;
			code2hash(code, c->fp);
			c->size = atoi(line + 41);

			jcr.chunk_num++;
			jcr.data_size += c->size;

			TIMER_END(1, jcr.read_time);

			sync_queue_push(trace_queue, c);

			TIMER_BEGIN(1);
			fgets(line, 128, trace_file);
		}

		c = new_chunk(0);
		SET_CHUNK(c, CHUNK_FILE_END);
		sync_queue_push(trace_queue, c);

		jcr.file_num++;
	}

	fclose(trace_file);
	return NULL;
}
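/*
 * The trace format expected above, reconstructed from the parsing code
 * (the field layout follows the fgets/sscanf/atoi calls; the hash value
 * below is a made-up placeholder):
 *
 *   file start 8
 *   /foo/bar
 *   0123456789abcdef0123456789abcdef01234567 4096
 *   ...
 *   file end
 *   ...
 *   stream end
 *
 * Each chunk line is a 40-character fingerprint code, one separator
 * character, and the decimal chunk size (read via atoi(line + 41)).
 */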
void index_lookup_similarity_detection(struct segment *s) {
	assert(s->features);
	top_segment_select(s->features);

	GSequenceIter *iter = g_sequence_get_begin_iter(s->chunks);
	GSequenceIter *end = g_sequence_get_end_iter(s->chunks);
	for (; iter != end; iter = g_sequence_iter_next(iter)) {
		struct chunk *c = g_sequence_get(iter);

		if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END))
			continue;

		/* First, check the container being filled in the storage buffer. */
		if (storage_buffer.container_buffer
				&& lookup_fingerprint_in_container(storage_buffer.container_buffer, &c->fp)) {
			c->id = get_container_id(storage_buffer.container_buffer);
			SET_CHUNK(c, CHUNK_DUPLICATE);
			SET_CHUNK(c, CHUNK_REWRITE_DENIED);
		}

		/*
		 * Next, check the buffered fingerprints,
		 * i.e., the fingerprints of recently processed chunks.
		 */
		GQueue *tq = g_hash_table_lookup(index_buffer.buffered_fingerprints, &c->fp);
		if (!tq) {
			tq = g_queue_new();
		} else if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
			struct indexElem *be = g_queue_peek_head(tq);
			c->id = be->id;
			SET_CHUNK(c, CHUNK_DUPLICATE);
		}

		/* Then search the fingerprint cache. */
		if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
			int64_t id = fingerprint_cache_lookup(&c->fp);
			if (id != TEMPORARY_ID) {
				c->id = id;
				SET_CHUNK(c, CHUNK_DUPLICATE);
			}
		}

		if (destor.index_category[0] == INDEX_CATEGORY_EXACT
				|| destor.index_segment_selection_method[0] == INDEX_SEGMENT_SELECT_MIX) {
			if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
				/* Finally, search the on-disk key-value store. */
				int64_t *ids = kvstore_lookup((char*) &c->fp);
				if (ids) {
					index_overhead.lookup_requests++;
					/* prefetch the target unit into the fingerprint cache */
					fingerprint_cache_prefetch(ids[0]);
					int64_t id = fingerprint_cache_lookup(&c->fp);
					if (id != TEMPORARY_ID) {
						/*
						 * It may be absent from the cache,
						 * since a partial key is possible in near-exact deduplication.
						 */
						c->id = id;
						SET_CHUNK(c, CHUNK_DUPLICATE);
					} else {
						NOTICE("Dedup phase: a key collision occurs");
					}
				} else {
					index_overhead.lookup_requests_for_unique++;
					VERBOSE("Dedup phase: non-existing fingerprint");
				}
			}
		}

		/* Insert the chunk into the index buffer. */
		struct indexElem *ne = (struct indexElem*) malloc(sizeof(struct indexElem));
		ne->id = c->id;
		memcpy(&ne->fp, &c->fp, sizeof(fingerprint));

		g_queue_push_tail(tq, ne);
		g_hash_table_replace(index_buffer.buffered_fingerprints, &ne->fp, tq);

		index_buffer.chunk_num++;
	}
}
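/*
 * The lookup order above, condensed into one sketch for reference (same
 * helpers as the function; flag bookkeeping, index-buffer insertion, and
 * the exact/mix-mode guard are omitted): storage buffer -> buffered
 * fingerprints -> fingerprint cache -> key-value store with unit prefetch.
 */
static int64_t example_lookup_order(struct chunk *c) {
	if (storage_buffer.container_buffer
			&& lookup_fingerprint_in_container(storage_buffer.container_buffer, &c->fp))
		return get_container_id(storage_buffer.container_buffer);

	GQueue *tq = g_hash_table_lookup(index_buffer.buffered_fingerprints, &c->fp);
	if (tq)
		return ((struct indexElem*) g_queue_peek_head(tq))->id;

	int64_t id = fingerprint_cache_lookup(&c->fp);
	if (id != TEMPORARY_ID)
		return id;

	int64_t *ids = kvstore_lookup((char*) &c->fp);
	if (ids) {
		fingerprint_cache_prefetch(ids[0]);
		/* may still miss on a partial-key collision */
		id = fingerprint_cache_lookup(&c->fp);
	}
	return id;
}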
/*
 * When a container buffer is full, we push it into container_queue.
 */
static void* filter_thread(void *arg) {
	int enable_rewrite = 1;
	struct fileRecipeMeta *r = NULL;

	while (1) {
		struct chunk *c = sync_queue_pop(rewrite_queue);

		if (c == NULL)
			/* backup job finished */
			break;

		/* reconstruct a segment */
		struct segment *s = new_segment();

		/* segment head */
		assert(CHECK_CHUNK(c, CHUNK_SEGMENT_START));
		free_chunk(c);

		c = sync_queue_pop(rewrite_queue);
		while (!(CHECK_CHUNK(c, CHUNK_SEGMENT_END))) {
			g_sequence_append(s->chunks, c);
			if (!CHECK_CHUNK(c, CHUNK_FILE_START) && !CHECK_CHUNK(c, CHUNK_FILE_END))
				s->chunk_num++;
			c = sync_queue_pop(rewrite_queue);
		}
		free_chunk(c);

		/*
		 * For self-references within a segment:
		 * if an earlier copy of a chunk in this segment has been rewritten,
		 * the rewrite request for the chunk is denied to avoid rewriting it
		 * repeatedly.
		 */
		GHashTable *recently_rewritten_chunks = g_hash_table_new_full(g_int64_hash,
				g_fingerprint_equal, NULL, free_chunk);
		GHashTable *recently_unique_chunks = g_hash_table_new_full(g_int64_hash,
				g_fingerprint_equal, NULL, free_chunk);

		pthread_mutex_lock(&index_lock.mutex);

		TIMER_DECLARE(1);
		TIMER_BEGIN(1);
		/*
		 * Check the fragmented chunks that would be rewritten later.
		 * If an earlier copy of a chunk in a preceding segment has been
		 * rewritten, the rewrite request for the chunk is denied.
		 */
		index_check_buffer(s);

		GSequenceIter *iter = g_sequence_get_begin_iter(s->chunks);
		GSequenceIter *end = g_sequence_get_end_iter(s->chunks);
		for (; iter != end; iter = g_sequence_iter_next(iter)) {
			c = g_sequence_get(iter);

			if (CHECK_CHUNK(c, CHUNK_FILE_START) || CHECK_CHUNK(c, CHUNK_FILE_END))
				continue;

			VERBOSE("Filter phase: %ldth chunk in %s container %lld", chunk_num,
					CHECK_CHUNK(c, CHUNK_OUT_OF_ORDER) ? "out-of-order" : "", c->id);

			/* Cache-Aware Filter */
			if (destor.rewrite_enable_cache_aware && restore_aware_contains(c->id)) {
				assert(c->id != TEMPORARY_ID);
				VERBOSE("Filter phase: %ldth chunk is cached", chunk_num);
				SET_CHUNK(c, CHUNK_IN_CACHE);
			}

			/* A CFL switch for rewriting out-of-order chunks. */
			if (destor.rewrite_enable_cfl_switch) {
				double cfl = restore_aware_get_cfl();
				if (enable_rewrite && cfl > destor.rewrite_cfl_require) {
					VERBOSE("Filter phase: turn OFF the (out-of-order) rewrite switch at CFL %.3f",
							cfl);
					enable_rewrite = 0;
				} else if (!enable_rewrite && cfl < destor.rewrite_cfl_require) {
					VERBOSE("Filter phase: turn ON the (out-of-order) rewrite switch at CFL %.3f",
							cfl);
					enable_rewrite = 1;
				}
			}

			if (CHECK_CHUNK(c, CHUNK_DUPLICATE) && c->id == TEMPORARY_ID) {
				struct chunk *ruc = g_hash_table_lookup(recently_unique_chunks, &c->fp);
				assert(ruc);
				c->id = ruc->id;
			}
			struct chunk *rwc = g_hash_table_lookup(recently_rewritten_chunks, &c->fp);
			if (rwc) {
				c->id = rwc->id;
				SET_CHUNK(c, CHUNK_REWRITE_DENIED);
			}

			/* A fragmented chunk is denied if it has been rewritten recently. */
			if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)
					|| (!CHECK_CHUNK(c, CHUNK_REWRITE_DENIED)
							&& (CHECK_CHUNK(c, CHUNK_SPARSE)
									|| (enable_rewrite && CHECK_CHUNK(c, CHUNK_OUT_OF_ORDER)
											&& !CHECK_CHUNK(c, CHUNK_IN_CACHE))))) {
				/*
				 * If the chunk is unique, or fragmented (i.e., sparse, or
				 * out-of-order and not in the restore cache) and not denied,
				 * we write it to a container.
				 */
				if (storage_buffer.container_buffer == NULL) {
					storage_buffer.container_buffer = create_container();
					if (destor.index_category[1] == INDEX_CATEGORY_PHYSICAL_LOCALITY)
						storage_buffer.chunks = g_sequence_new(free_chunk);
				}

				if (container_overflow(storage_buffer.container_buffer, c->size)) {

					if (destor.index_category[1] == INDEX_CATEGORY_PHYSICAL_LOCALITY) {
						/* Update the index for physical locality. */
						GHashTable *features = sampling(storage_buffer.chunks,
								g_sequence_get_length(storage_buffer.chunks));
						index_update(features, get_container_id(storage_buffer.container_buffer));
						g_hash_table_destroy(features);
						g_sequence_free(storage_buffer.chunks);
						storage_buffer.chunks = g_sequence_new(free_chunk);
					}

					TIMER_END(1, jcr.filter_time);
					write_container_async(storage_buffer.container_buffer);
					TIMER_BEGIN(1);

					storage_buffer.container_buffer = create_container();
				}

				if (add_chunk_to_container(storage_buffer.container_buffer, c)) {

					struct chunk *wc = new_chunk(0);
					memcpy(&wc->fp, &c->fp, sizeof(fingerprint));
					wc->id = c->id;
					if (!CHECK_CHUNK(c, CHUNK_DUPLICATE)) {
						jcr.unique_chunk_num++;
						jcr.unique_data_size += c->size;
						g_hash_table_insert(recently_unique_chunks, &wc->fp, wc);
						VERBOSE("Filter phase: %ldth chunk is recently unique, size %d",
								chunk_num, g_hash_table_size(recently_unique_chunks));
					} else {
						jcr.rewritten_chunk_num++;
						jcr.rewritten_chunk_size += c->size;
						g_hash_table_insert(recently_rewritten_chunks, &wc->fp, wc);
					}

					if (destor.index_category[1] == INDEX_CATEGORY_PHYSICAL_LOCALITY) {
						struct chunk *ck = new_chunk(0);
						memcpy(&ck->fp, &c->fp, sizeof(fingerprint));
						g_sequence_append(storage_buffer.chunks, ck);
					}

					VERBOSE("Filter phase: write %ldth chunk to container %lld",
							chunk_num, c->id);
				} else {
					VERBOSE("Filter phase: container %lld already has this chunk", c->id);
					assert(destor.index_category[0] != INDEX_CATEGORY_EXACT
							|| destor.rewrite_algorithm[0] != REWRITE_NO);
				}

			} else {
				if (CHECK_CHUNK(c, CHUNK_REWRITE_DENIED)) {
					VERBOSE("Filter phase: %ldth fragmented chunk is denied", chunk_num);
				} else if (CHECK_CHUNK(c, CHUNK_OUT_OF_ORDER)) {
					VERBOSE("Filter phase: %ldth chunk in out-of-order container %lld is already cached",
							chunk_num, c->id);
				}
			}

			assert(c->id != TEMPORARY_ID);

			/* Collect historical information. */
			har_monitor_update(c->id, c->size);
			/* Restore-aware */
			restore_aware_update(c->id, c->size);

			chunk_num++;
		}

		int full = index_update_buffer(s);

		/* Write a segment-begin flag into the recipe. */
		segmentid sid = append_segment_flag(jcr.bv, CHUNK_SEGMENT_START, s->chunk_num);

		/* Write the recipe. */
		iter = g_sequence_get_begin_iter(s->chunks);
		end = g_sequence_get_end_iter(s->chunks);
		for (; iter != end; iter = g_sequence_iter_next(iter)) {
			c = g_sequence_get(iter);
			if (r == NULL) {
				assert(CHECK_CHUNK(c, CHUNK_FILE_START));
				r = new_file_recipe_meta(c->data);
			} else if (!CHECK_CHUNK(c, CHUNK_FILE_END)) {
				struct chunkPointer cp;
				cp.id = c->id;
				assert(cp.id >= 0);
				memcpy(&cp.fp, &c->fp, sizeof(fingerprint));
				cp.size = c->size;
				append_n_chunk_pointers(jcr.bv, &cp, 1);
				r->chunknum++;
				r->filesize += c->size;

				jcr.chunk_num++;
				jcr.data_size += c->size;
			} else {
				assert(CHECK_CHUNK(c, CHUNK_FILE_END));
				append_file_recipe_meta(jcr.bv, r);
				free_file_recipe_meta(r);
				r = NULL;

				jcr.file_num++;
			}
		}

		/* Write a segment-end flag into the recipe. */
		append_segment_flag(jcr.bv, CHUNK_SEGMENT_END, 0);

		if (destor.index_category[1] == INDEX_CATEGORY_LOGICAL_LOCALITY) {
			/* Update the index for logical locality. */
			s->features = sampling(s->chunks, s->chunk_num);
			if (destor.index_category[0] == INDEX_CATEGORY_EXACT) {
				/*
				 * For exact deduplication,
				 * unique fingerprints are inserted.
				 */
				VERBOSE("Filter phase: add %d unique fingerprints to %d features",
						g_hash_table_size(recently_unique_chunks),
						g_hash_table_size(s->features));
				GHashTableIter iter;
				gpointer key, value;
				g_hash_table_iter_init(&iter, recently_unique_chunks);
				while (g_hash_table_iter_next(&iter, &key, &value)) {
					struct chunk *uc = value;
					fingerprint *ft = malloc(sizeof(fingerprint));
					memcpy(ft, &uc->fp, sizeof(fingerprint));
					g_hash_table_insert(s->features, ft, NULL);
				}

				/*
				 * OPTION:
				 * It is still an open problem whether we need to update
				 * rewritten fingerprints.
				 * It would increase the index update overhead, while the
				 * benefit remains unclear.
				 * More experiments are required.
				 */
				VERBOSE("Filter phase: add %d rewritten fingerprints to %d features",
						g_hash_table_size(recently_rewritten_chunks),
						g_hash_table_size(s->features));
				g_hash_table_iter_init(&iter, recently_rewritten_chunks);
				while (g_hash_table_iter_next(&iter, &key, &value)) {
					struct chunk *uc = value;
					fingerprint *ft = malloc(sizeof(fingerprint));
					memcpy(ft, &uc->fp, sizeof(fingerprint));
					g_hash_table_insert(s->features, ft, NULL);
				}
			}
			index_update(s->features, sid);
		}

		free_segment(s);

		if (index_lock.wait_threshold > 0 && full == 0) {
			pthread_cond_broadcast(&index_lock.cond);
		}
		TIMER_END(1, jcr.filter_time);
		pthread_mutex_unlock(&index_lock.mutex);

		g_hash_table_destroy(recently_rewritten_chunks);
		g_hash_table_destroy(recently_unique_chunks);
	}

	/* Flush the last, partially filled container. */
	if (storage_buffer.container_buffer
			&& !container_empty(storage_buffer.container_buffer)) {
		if (destor.index_category[1] == INDEX_CATEGORY_PHYSICAL_LOCALITY) {
			/* Update the index for physical locality. */
			GHashTable *features = sampling(storage_buffer.chunks,
					g_sequence_get_length(storage_buffer.chunks));
			index_update(features, get_container_id(storage_buffer.container_buffer));
			g_hash_table_destroy(features);
			g_sequence_free(storage_buffer.chunks);
		}
		write_container_async(storage_buffer.container_buffer);
	}

	/* All files are done. */
	jcr.status = JCR_STATUS_DONE;
	return NULL;
}
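/*
 * A sketch of how the filter phase might be wired up (hypothetical
 * launcher; filter_thread() itself already matches the pthread entry
 * signature). Completion is signaled by jcr.status == JCR_STATUS_DONE,
 * set above.
 */
static pthread_t example_start_filter_phase(void) {
	pthread_t filter_t;
	pthread_create(&filter_t, NULL, filter_thread, NULL);
	return filter_t;
}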