/*
 * Allocates a fresh polynomial, links it after the current tail of `block`
 * and makes it the new tail. Returns 1 on success, 0 if the allocation
 * failed (in which case `block` is left untouched).
 */
static int rabin_block_start_new_tail(struct rab_block_info *block) {
    struct rabin_polynomial *fresh = gen_new_polynomial(NULL, 0, 0, 0);

    if (fresh == NULL)
        return 0;

    block->tail->next_polynomial = fresh;
    block->tail = fresh;
    return 1;
}

/**
 * Reads a block of memory and extends the rabin fingerprint list with it.
 * Since most of the time we will not end on a border, the function returns
 * a block struct, which keeps track of the current blocksum and rolling
 * checksum so that the next call can continue where this one stopped.
 *
 * @param buf       bytes to fingerprint; `size` bytes are read from it
 * @param size      number of bytes in `buf`; nothing is read when it is <= 0
 * @param cur_block state returned by a previous call, or NULL to start a new
 *                  block via init_empty_block()
 * @return the updated block (`cur_block` itself when one was passed in), or
 *         NULL when the block or a new polynomial could not be allocated.
 *
 * When a border falls exactly on the last byte of `buf`, no new tail is
 * created yet: current_poly_finished is set instead and the next call starts
 * the new tail. This keeps zero-length polynomials out of the list.
 *
 * NOTE(review): when NULL is returned after a failed polynomial allocation,
 * the bytes before the failure have already been consumed and a block that
 * was allocated by this call is not released here (its destructor is not
 * visible from this function) -- confirm how callers should recover.
 */
struct rab_block_info *read_rabin_block(void *buf, ssize_t size, struct rab_block_info *cur_block) {
    struct rab_block_info *block = cur_block;
    /* Index through a char pointer: arithmetic on void * is not standard C.
     * Plain char is kept on purpose so the computed values stay unchanged. */
    const char *bytes = buf;
    ssize_t i;

    if (block == NULL) {
        block = init_empty_block();
        if (block == NULL)
            return NULL;
    }

    /* The previous call ended on a border, so generate a new tail now. */
    if (block->current_poly_finished) {
        if (!rabin_block_start_new_tail(block))
            return NULL;
        block->current_poly_finished = 0;
    }

    for (i = 0; i < size; i++) {
        char cur_byte = bytes[i];
        char pushed_out = block->current_window_data[block->window_pos];

        /* Slide the window: add the new byte, remove the one pushed out. */
        block->current_window_data[block->window_pos] = cur_byte;
        block->cur_roll_checksum = (block->cur_roll_checksum * rabin_polynomial_prime) + cur_byte;
        block->tail->polynomial = (block->tail->polynomial * rabin_polynomial_prime) + cur_byte;
        block->cur_roll_checksum -= (pushed_out * polynomial_lookup_buf[rabin_sliding_window_size]);

        block->window_pos++;
        block->total_bytes_read++;
        block->tail->length++;

        if (block->window_pos == rabin_sliding_window_size) /* loop back around */
            block->window_pos = 0;

        /* If we hit our special value or reached the max size, close the
         * current polynomial. */
        if ((block->tail->length >= rabin_polynomial_min_block_size
             && (block->cur_roll_checksum % rabin_polynomial_average_block_size) == rabin_polynomial_prime)
            || block->tail->length == rabin_polynomial_max_block_size) {
            block->tail->start = block->total_bytes_read - block->tail->length;

            if (i == size - 1) {
                /* Border on the final byte: let the next call open the tail. */
                block->current_poly_finished = 1;
            } else if (!rabin_block_start_new_tail(block)) {
                /* Mark the tail as closed so a later call does not append
                 * further bytes to an already finished polynomial. */
                block->current_poly_finished = 1;
                return NULL;
            }
        }
    }

    return block;
}
/* Example #2 */
/* 0 */
//given a file extract blocks and check for duplicates
int extract_blocks(char* filename){

	//printf("Processing %s \n",filename);
    int fd = open(filename,O_RDONLY | O_LARGEFILE);


    
    uint64_t off=0;
    if(fd){

      struct stat st;
      stat(filename, &st);
      total_space+=st.st_size;
      
      //declare a block
      unsigned char block_read[READSIZE];
      bzero(block_read,sizeof(block_read));

      //read first block from file
      int aux = pread(fd,block_read,READSIZE,off);
      off+= READSIZE;

      //check if the file still has more blocks and if size <bzise discard
      //the last incomplete block
      while(aux>0){

      	//Process fixed size dups all sizes specified
         int curr_sizes_proc=0;
         while(curr_sizes_proc<nr_sizes_proc){

        	 int size_block=sizes_proc[curr_sizes_proc];
        	 int size_proced=0;
        	 unsigned char *block_proc;

           
        	 while(aux-size_proced>=size_block){

        		 block_proc=malloc(size_block);
        		 bzero(block_proc,size_block);

        		 memcpy(block_proc,&block_read[size_proced],size_block);

        		 //index the block and find duplicates
        		 check_duplicates(block_proc,size_block,curr_sizes_proc);

        		 free(block_proc);

        		 size_proced+=size_block;
        	 }


           if(aux-size_proced > 0){

	    
            incomplete_blocks[curr_sizes_proc]++;
            incomplete_space[curr_sizes_proc]+=aux-size_proced;


		//process the block anyway
		block_proc=malloc(aux-size_proced);
                bzero(block_proc,aux-size_proced);

                memcpy(block_proc,&block_read[size_proced],aux-size_proced);

                //index the block and find duplicates
                check_duplicates(block_proc,aux-size_proced,curr_sizes_proc);
                free(block_proc);

	
           }


        	 curr_sizes_proc++;
       	 }


       	 //process rabin chunks with avg size of 1K,4K,8K,16K
         //min size = avg size/2  and max size = avg size*2
         //Process fixed size dups for 1K,4K,8K,16K
       	 int curr_sizes_proc_rabin=0;
         while(curr_sizes_proc_rabin<nr_sizes_proc_rabin){

           rabin_polynomial_min_block_size=sizes_proc_rabin[curr_sizes_proc_rabin]/2;
           rabin_polynomial_max_block_size=sizes_proc_rabin[curr_sizes_proc_rabin]*2;
           rabin_polynomial_average_block_size=sizes_proc_rabin[curr_sizes_proc_rabin];
           rabin_sliding_window_size=30;

           struct rab_block_info *block;

           if(cur_block[curr_sizes_proc_rabin] == NULL) {

        	  //printf("Initializing rabin block %d\n",curr_sizes_proc_rabin);
              cur_block[curr_sizes_proc_rabin]=init_empty_block();
           }

           block=cur_block[curr_sizes_proc_rabin];

           int i;
           for(i=0;i<aux;i++) {
               	   char cur_byte=*((char *)(block_read+i));
                   char pushed_out=block->current_window_data[block->window_pos];
                   block->current_window_data[block->window_pos]=cur_byte;
                   block->cur_roll_checksum=(block->cur_roll_checksum*rabin_polynomial_prime)+cur_byte;
                   block->tail->polynomial=(block->tail->polynomial*rabin_polynomial_prime)+cur_byte;
                   block->cur_roll_checksum-=(pushed_out*polynomial_lookup_buf[rabin_sliding_window_size]);

                   block->window_pos++;
                   block->total_bytes_read++;
                   block->tail->length++;

                   if(block->window_pos == rabin_sliding_window_size) //Loop back around
                       block->window_pos=0;


                   //If we hit our special value or reached the max win size create a new block
                   if((block->tail->length >= rabin_polynomial_min_block_size && (block->cur_roll_checksum % rabin_polynomial_average_block_size) == rabin_polynomial_prime)|| block->tail->length == rabin_polynomial_max_block_size) {


                	   block->tail->start=block->total_bytes_read-block->tail->length;

                	   //insert hash in berkDB
                	   //index the block and find duplicates
                	   check_rabin_duplicates(cur_block[curr_sizes_proc_rabin]->tail,nr_sizes_proc+curr_sizes_proc_rabin);


                	   //free oldblock and polinomial
                	   //free(cur_block[curr_sizes_proc]->tail);
                	   cur_block[curr_sizes_proc_rabin]->tail=gen_new_polynomial(NULL,0,0,0);

                       //if(i==READSIZE-1)
                    	   //cur_block[curr_sizes_proc_rabin]->current_poly_finished=1;
                   }


           }

           curr_sizes_proc_rabin++;

         }

       	 //free this block from memory and process another
         //free(block_read);
         //block_read=malloc(sizeof(unsigned char)*READSIZE);
         aux = pread(fd,block_read,READSIZE,off);
         off+=READSIZE;



      }

    //zero the blocks a new file is being processed 
    int auxc=0;
    for(auxc=0;auxc<nr_sizes_proc_rabin;auxc++){

        struct rab_block_info *lastblock;
        lastblock = cur_block[auxc];

        if( lastblock->tail->length > 0){
            
	    if(lastblock->tail->length < sizes_proc_rabin[auxc]/2 ){
	        incomplete_blocks[nr_sizes_proc+auxc]++;
            	incomplete_space[nr_sizes_proc+auxc]+=lastblock->tail->length;
            }
	    //insert hash in berkDB
            //index the block and find duplicates
            check_rabin_duplicates(lastblock->tail,nr_sizes_proc+auxc);

	}

        cur_block[auxc]=NULL;
    } 

    close(fd);
    }
    else{
      fprintf(stderr,"error opening file %s\n",filename);
      exit(1);

    }

  return 0;

}