/** * Reads a block of memory and generates a rabin fingerprint list from it. * Since most of the time we will not end on a border, the function returns * a block struct, which keeps track of the current blocksum and rolling checksum */ struct rab_block_info *read_rabin_block(void *buf, ssize_t size, struct rab_block_info *cur_block) { struct rab_block_info *block; if(cur_block == NULL) { block=init_empty_block(); if(block == NULL) return NULL; } else { block=cur_block; } //We ended on a border, gen a new tail if(block->current_poly_finished) { struct rabin_polynomial *new_poly=gen_new_polynomial(NULL,0,0,0); block->tail->next_polynomial=new_poly; block->tail=new_poly; block->current_poly_finished=0; } ssize_t i; for(i=0;i<size;i++) { char cur_byte=*((char *)(buf+i)); char pushed_out=block->current_window_data[block->window_pos]; block->current_window_data[block->window_pos]=cur_byte; block->cur_roll_checksum=(block->cur_roll_checksum*rabin_polynomial_prime)+cur_byte; block->tail->polynomial=(block->tail->polynomial*rabin_polynomial_prime)+cur_byte; block->cur_roll_checksum-=(pushed_out*polynomial_lookup_buf[rabin_sliding_window_size]); block->window_pos++; block->total_bytes_read++; block->tail->length++; if(block->window_pos == rabin_sliding_window_size) //Loop back around block->window_pos=0; //If we hit our special value or reached the max win size create a new block if((block->tail->length >= rabin_polynomial_min_block_size && (block->cur_roll_checksum % rabin_polynomial_average_block_size) == rabin_polynomial_prime)|| block->tail->length == rabin_polynomial_max_block_size) { block->tail->start=block->total_bytes_read-block->tail->length; struct rabin_polynomial *new_poly=gen_new_polynomial(NULL,0,0,0); block->tail->next_polynomial=new_poly; block->tail=new_poly; if(i==size-1) block->current_poly_finished=1; } } return block; }
//given a file extract blocks and check for duplicates int extract_blocks(char* filename){ //printf("Processing %s \n",filename); int fd = open(filename,O_RDONLY | O_LARGEFILE); uint64_t off=0; if(fd){ struct stat st; stat(filename, &st); total_space+=st.st_size; //declare a block unsigned char block_read[READSIZE]; bzero(block_read,sizeof(block_read)); //read first block from file int aux = pread(fd,block_read,READSIZE,off); off+= READSIZE; //check if the file still has more blocks and if size <bzise discard //the last incomplete block while(aux>0){ //Process fixed size dups all sizes specified int curr_sizes_proc=0; while(curr_sizes_proc<nr_sizes_proc){ int size_block=sizes_proc[curr_sizes_proc]; int size_proced=0; unsigned char *block_proc; while(aux-size_proced>=size_block){ block_proc=malloc(size_block); bzero(block_proc,size_block); memcpy(block_proc,&block_read[size_proced],size_block); //index the block and find duplicates check_duplicates(block_proc,size_block,curr_sizes_proc); free(block_proc); size_proced+=size_block; } if(aux-size_proced > 0){ incomplete_blocks[curr_sizes_proc]++; incomplete_space[curr_sizes_proc]+=aux-size_proced; //process the block anyway block_proc=malloc(aux-size_proced); bzero(block_proc,aux-size_proced); memcpy(block_proc,&block_read[size_proced],aux-size_proced); //index the block and find duplicates check_duplicates(block_proc,aux-size_proced,curr_sizes_proc); free(block_proc); } curr_sizes_proc++; } //process rabin chunks with avg size of 1K,4K,8K,16K //min size = avg size/2 and max size = avg size*2 //Process fixed size dups for 1K,4K,8K,16K int curr_sizes_proc_rabin=0; while(curr_sizes_proc_rabin<nr_sizes_proc_rabin){ rabin_polynomial_min_block_size=sizes_proc_rabin[curr_sizes_proc_rabin]/2; rabin_polynomial_max_block_size=sizes_proc_rabin[curr_sizes_proc_rabin]*2; rabin_polynomial_average_block_size=sizes_proc_rabin[curr_sizes_proc_rabin]; rabin_sliding_window_size=30; struct rab_block_info *block; if(cur_block[curr_sizes_proc_rabin] == NULL) { //printf("Initializing rabin block %d\n",curr_sizes_proc_rabin); cur_block[curr_sizes_proc_rabin]=init_empty_block(); } block=cur_block[curr_sizes_proc_rabin]; int i; for(i=0;i<aux;i++) { char cur_byte=*((char *)(block_read+i)); char pushed_out=block->current_window_data[block->window_pos]; block->current_window_data[block->window_pos]=cur_byte; block->cur_roll_checksum=(block->cur_roll_checksum*rabin_polynomial_prime)+cur_byte; block->tail->polynomial=(block->tail->polynomial*rabin_polynomial_prime)+cur_byte; block->cur_roll_checksum-=(pushed_out*polynomial_lookup_buf[rabin_sliding_window_size]); block->window_pos++; block->total_bytes_read++; block->tail->length++; if(block->window_pos == rabin_sliding_window_size) //Loop back around block->window_pos=0; //If we hit our special value or reached the max win size create a new block if((block->tail->length >= rabin_polynomial_min_block_size && (block->cur_roll_checksum % rabin_polynomial_average_block_size) == rabin_polynomial_prime)|| block->tail->length == rabin_polynomial_max_block_size) { block->tail->start=block->total_bytes_read-block->tail->length; //insert hash in berkDB //index the block and find duplicates check_rabin_duplicates(cur_block[curr_sizes_proc_rabin]->tail,nr_sizes_proc+curr_sizes_proc_rabin); //free oldblock and polinomial //free(cur_block[curr_sizes_proc]->tail); cur_block[curr_sizes_proc_rabin]->tail=gen_new_polynomial(NULL,0,0,0); //if(i==READSIZE-1) //cur_block[curr_sizes_proc_rabin]->current_poly_finished=1; } } curr_sizes_proc_rabin++; } //free this block from memory and process another //free(block_read); //block_read=malloc(sizeof(unsigned char)*READSIZE); aux = pread(fd,block_read,READSIZE,off); off+=READSIZE; } //zero the blocks a new file is being processed int auxc=0; for(auxc=0;auxc<nr_sizes_proc_rabin;auxc++){ struct rab_block_info *lastblock; lastblock = cur_block[auxc]; if( lastblock->tail->length > 0){ if(lastblock->tail->length < sizes_proc_rabin[auxc]/2 ){ incomplete_blocks[nr_sizes_proc+auxc]++; incomplete_space[nr_sizes_proc+auxc]+=lastblock->tail->length; } //insert hash in berkDB //index the block and find duplicates check_rabin_duplicates(lastblock->tail,nr_sizes_proc+auxc); } cur_block[auxc]=NULL; } close(fd); } else{ fprintf(stderr,"error opening file %s\n",filename); exit(1); } return 0; }