コード例 #1
0
ファイル: fm_extract.c プロジェクト: ndmitchell/index-search
/*
   Walk backwards from the BWT row `row` using the LF mapping and store
   the decoded characters into `dest`, in the order they are visited.
   The walk ends when `len` characters have been stored or when the row
   index->bwt_eof_pos (beginning of the file) is reached.

   When index->skip > 1, occurrences of index->specialchar are stepped
   over: they are neither stored nor counted towards `len`.

   `dest` must have room for at least `len` characters.
   Returns the number of characters actually stored.
*/
ulong go_back(fm_index *index, ulong row, ulong len, uchar *dest) {

  ulong sb_occ[256], bucket_occ[256];
  ulong stored = 0;
  ulong pos;

  // the EOF row itself maps to row 0; any other row is shifted
  if (row == index->bwt_eof_pos) {
    pos = 0;
  } else {
    pos = EOF_shift(row);
  }

  while (stored < len) {
    uchar code, sym;
    ulong occ_before;

    // occurrence counts from the superbucket header
    get_info_sb(pos, sb_occ, index);
    // occurrence counts inside the bucket (remapped) and the remapped
    // code of the char found at position pos
    code = get_info_b(NULL_CHAR, pos, bucket_occ, WHAT_CHAR_IS, index);
    assert(code < index->alpha_size_sb);

    sym = index->inv_map_sb[code];
    assert(sym < index->alpha_size);

    // keep the char unless it is the special char of a marked index
    if (index->skip <= 1 || sym != index->specialchar) {
      dest[stored] = index->inv_char_map[sym];
      stored++;
    }

    // LF step: # of occ of sym before pos, then the row it maps to
    occ_before = sb_occ[sym] + bucket_occ[code];
    pos = index->bwt_occ[sym] + occ_before - 1;
    if (pos == index->bwt_eof_pos) {
      break;
    }
    pos = EOF_shift(pos);
  }

  return stored;
}
コード例 #2
0
ファイル: compr_main.c プロジェクト: peper/pizza
/* ********************************************************************
   Compute the ranks of the dictionary words, so that the inverse
   mapping can be realized (original note by PAOLO, translated from
   Italian).

   If Marked_char_freq is 0 nothing is marked: s->skip, s->chosen_char
   and *rank_range are set to 0 and 0 is returned.

   Otherwise the remapped NULL char (s->char_map[0]) becomes
   s->chosen_char, s->skip is derived from Marked_char_freq and the
   number of NULL occurrences, and s->loc_occ is allocated and filled
   with the bwt position of every s->skip-th NULL (counting NULLs in
   the order given by the mc_pos_cmp sort) plus the last NULL.

   On return *rank_range holds the number of NULL occurrences and the
   return value is the number of marked NULLs stored in s->loc_occ.
   Failures (missing NULL char, out of memory, inconsistent marking)
   are reported through fatal_error().
   ******************************************************************** */
int compute_ranks_dict(bwi_input *s, int *rank_range)
{
  int mc_pos_cmp(const void *, const void *);
  int i,j,null_occ,count;
  int desired, marked_chars;
  uchar null_remap;
  mc_pos *aux;

  // -------- easy case: no marked char
  if(Marked_char_freq==0) {
    s->skip=0; s->chosen_char = 0; *rank_range=0;
    return 0;
  }

  // -------- Make sure that NULL occurs
  if(s->bool_char_map[0]==0) 
    fatal_error("The dictionary does not contain the NULL char!\n");
  // -------- Determine the remap for NULL
  null_remap = s->char_map[0];
  s->chosen_char = null_remap;
  // -------- Determine the # of occ of null_remap
  // (the last char of the alphabet has no successor entry in
  //  pfx_char_occ, so its count is taken against text_size)
  if(null_remap==s->alpha_size-1)
    null_occ=s->text_size - s->pfx_char_occ[null_remap];
  else
    null_occ=s->pfx_char_occ[null_remap+1] - s->pfx_char_occ[null_remap];


  // -------- determine how many occ we would like to mark (at least 1)
  desired = (int) (Marked_char_freq*null_occ);
  if(desired==0) desired=1;

  // compute s->skip
  if(null_occ>desired) {
    // s->skip= int_pow2(int_log2(null_occ/desired)); do we need a pow of 2?
    s->skip = 1+null_occ/(desired+1);
    assert(s->skip > 0);
  }
  else
    s->skip=1;

  // -------- compute number of marked chars: ceil(null_occ / skip)
  marked_chars = null_occ/s->skip;
  if(null_occ % s->skip)
    marked_chars++;

  // --- alloc s->loc_occ and auxiliary struct for text positions
  s->loc_occ = (int *) malloc(sizeof(int) * (marked_chars));
  aux = (mc_pos *) malloc(null_occ*sizeof(mc_pos));
  if(aux==NULL || s->loc_occ==NULL) 
    fatal_error("compute_ranks_dict (aux/loc_occ)");

  // -----  determine the text position of all the \0 chars
  for(i=0; i<null_occ; i++)  {                          

    // SA does not have the suffix starting with EOF, but
    // the bwt matrix has this as the first row
    aux[i].text_pos = s->sa[s->pfx_char_occ[null_remap] + i]; 
    
    assert(aux[i].text_pos>=0 && aux[i].text_pos<s->text_size);
    assert(s->text[aux[i].text_pos] == null_remap);
    
    // We deal with the fact that the first row is the one starting
    // with EOF but actually is not represented. Hence we sum 1
    aux[i].bwt_pos = EOF_shift(s->pfx_char_occ[null_remap] + i);

  }

  // ---- sort all \0 according to their text position 
  qsort(aux, null_occ, sizeof(mc_pos), mc_pos_cmp);

  // ---- Assign ranks to rows ----
  // j is the 1-based "word rank" of the NULL currently seen in the text
  for(i=count=0,j=1;i<s->text_size;i++) {
    if(s->text[i]!=null_remap) continue;

    // every skip-th NULL is marked; the last NULL is always marked
    if(j==null_occ || j % s->skip == 0) {
      s->loc_occ[count] = aux[j-1].bwt_pos;    // write the "word rank"
      count++;                                 // how many marked
    }

    j++;                                       // increase the "word rank"
  }

  free(aux);

  if(Verbose>1) {
    fprintf(stderr,"Rank: Marked %d NULLs out of %d ", count,null_occ); 
    fprintf(stderr,"(one every %d is marked)\n", s->skip);
  }

  if(count != marked_chars)
    fatal_error("No correct marking in compute_ranks_dict!\n");

  *rank_range = null_occ;   // ranks are in the range [0,null_occ)
  return count;
}
コード例 #3
0
ファイル: fm_extract.c プロジェクト: ndmitchell/index-search
/*
 * Extract the text snippet covering positions from..to (inclusive).
 *
 * On success *dest points to a malloc'ed buffer and *snippet_length holds
 * its length. Return values visible in this function:
 *   - FM_OK with *dest == NULL and *snippet_length == 0 when `from` is past
 *     the end of the text or from >= to (treated as an invalid range);
 *   - FM_NOMARKEDCHAR when index->skip == 0;
 *   - FM_OUTMEM when a buffer cannot be allocated;
 *   - whatever fm_unbuild() returns when the request covers the whole text;
 *   - FM_OK otherwise.
 * `to` is clamped to the last text position before use.
 */
int extract(void * indexe, ulong from, ulong to, uchar **dest, 
			ulong *snippet_length) {

	fm_index * index = (fm_index *) indexe;
	ulong j, skip, numchar;
	uchar * text;
	
	if ((from >= index->text_size) || (from >= to)) {
		/* Invalid position: report an empty snippet rather than an error */
		*dest = NULL;
		*snippet_length = 0;
		return FM_OK;
	}

	if (index->skip == 0)
		return FM_NOMARKEDCHAR;

	/* from here on `to` never exceeds the last text position */
	to = MIN(to, index->text_size-1);
	
	if (index->smalltext) {
		/* the plain text is available in index->text: copy the range directly */
		*snippet_length = to-from+1;
		*dest = malloc(sizeof(uchar)*(*snippet_length));
		if (*dest==NULL) return FM_OUTMEM;
		memcpy(*dest, index->text+from, (*snippet_length)*sizeof(uchar));
		return FM_OK;
	}
	
	/* with skip > 1 the marked rows are not counted in the real text size */
	ulong real_text_size;
	if (index->skip>1) real_text_size = index->text_size-index->num_marked_rows;
	else real_text_size = index->text_size;
		
	/* whole text requested: rebuild everything
	   (original note: might be convenient for smaller ranges too) */
	if ((from == 0) && (to == real_text_size-1))
		return fm_unbuild(index, dest, snippet_length);
	
	numchar = to-from+1;
		
	/* get_row: read the stored row entry number to/skip
	   (each entry is index->log2_row bits wide) */
	ulong occ_sb[256], occ_b[256];
	uchar c, c_sb;
	ulong pos_row_compr = to/index->skip;
	ulong offset = (pos_row_compr*index->log2_row)%8;
	fm_init_bit_reader(index->compress+index->pos_marked_row_extr+pos_row_compr*index->log2_row/8);
	if (offset) fm_bit_read(offset);	/* discard the bits before the entry */
	ulong row = fm_bit_read(index->log2_row);
	row = EOF_shift(row);
		
	text = (uchar *) malloc(sizeof(uchar) * numchar);
	if (text == NULL)
		return FM_OUTMEM;
		
	/* number of chars to step over before the walk reaches position `to`;
	   `to` was clamped above, so no out-of-range case is needed here */
	skip = index->skip-to%index->skip-1;

	/* walk backwards with the LF mapping, filling `text` from its end */
	for (j=0; j < numchar+skip;) {
		get_info_sb(row, occ_sb, index);
		c = get_info_b(NULL_CHAR, row, occ_b, WHAT_CHAR_IS, index);
		assert(c < index->alpha_size_sb);
		c_sb = index->inv_map_sb[c];
		assert(c_sb < index->alpha_size);

		/* store the char once the first `skip` chars have been passed;
		   the special char of a marked index is never stored */
		if ((index->skip <= 1) || (c_sb != index->specialchar))
			if (j>=skip)
				text[numchar-1-(j-skip)] = index->inv_char_map[c_sb];

		row = index->bwt_occ[c_sb] + occ_sb[c_sb] + occ_b[c] - 1; /* get next row */
		if (row == index->bwt_eof_pos) break;
		if (c_sb != index->specialchar) j++;	/* special chars are not counted */
		row = EOF_shift(row);
	}
	
	*snippet_length = numchar;
	*dest = text;
	
	return FM_OK;
}
コード例 #4
0
ファイル: fm_extract.c プロジェクト: ndmitchell/index-search
/*
 * Locate every occurrence of pattern[0..length-1] and return, for each one,
 * the text snippet made of the occurrence plus up to `nums` characters on
 * either side (fewer near the text boundaries).
 *
 * Outputs:
 *   *numocc        number of occurrences, as reported by locate();
 *   *snippet_text  one malloc'ed buffer of (length + 2*nums) * (*numocc)
 *                  bytes; snippet i starts at offset i * (length + 2*nums);
 *   *snippet_len   malloc'ed array with the actual length of each snippet.
 * With zero occurrences both output pointers are set to NULL and FM_OK is
 * returned.
 *
 * Returns the error of locate() if it fails, FM_OUTMEM when an allocation
 * fails (both output pointers are then NULL), FM_OK otherwise.
 *
 * NOTE(review): index->skip == 0 would divide by zero below; extract()
 * rejects that case explicitly. Presumably locate() already fails for such
 * an index -- confirm.
 */
int display(void *indexe, uchar *pattern, ulong length, ulong nums, ulong *numocc, 
			uchar **snippet_text, ulong **snippet_len) {
	
	ulong row, i, j, len, skip, to, from, pos_row_compr, offset, occ_sb[256], occ_b[256];
	ulong *occ = NULL;	/* NULL so that free(occ) is safe on every path */
	uchar c, c_sb;
				
	fm_index * index = (fm_index *) indexe;
			
	/* room reserved for each snippet inside *snippet_text */
	len = length + 2*nums;
				
	/* locate */
	int error =	locate (indexe, pattern, length, &occ, numocc);
	if(error!=FM_OK) return error;
		
	if(*numocc ==0) {
		/* nothing to show: do not fall through to zero-sized allocations */
		*snippet_len = NULL;
		*snippet_text = NULL;
		free (occ);
		return FM_OK;
	}

	*snippet_len = (ulong *) malloc (sizeof (ulong) * (*numocc));
	if (*snippet_len == NULL) {
		*snippet_text = NULL;
		free (occ);
		return FM_OUTMEM;
	}

	*snippet_text = (uchar *) malloc (sizeof (uchar) * len *(*numocc));
	if (*snippet_text == NULL) {
		free (*snippet_len);
		*snippet_len = NULL;
		free (occ);
		return FM_OUTMEM;
	}
	
	uchar *text = *snippet_text;	/* start of the current snippet */
	
	ulong pos;
	
	for(i=0; i<*numocc; i++) {
		/* snippet range [from, to], clipped to the text boundaries */
		pos = occ[i];
		if (pos>nums) from = pos-nums;
		else from = 0;
		to = pos+length+nums-1<index->text_size-1 ? pos+length+nums-1:index->text_size-1;
		len = to-from+1;
		
		/* get_row: read the stored row entry number to/skip
		   (each entry is index->log2_row bits wide) */
		pos_row_compr = to/index->skip;
		offset = (pos_row_compr*index->log2_row)%8;
		fm_init_bit_reader(index->compress+index->pos_marked_row_extr+pos_row_compr*index->log2_row/8);
		if(offset) fm_bit_read(offset);	/* discard the bits before the entry */
		row = fm_bit_read(index->log2_row);
		row = EOF_shift(row);
		
		/* number of chars to step over before the walk reaches position `to`;
		   `to` is already clipped, so no out-of-range case is needed here */
		skip = index->skip-to%index->skip-1;

		/* walk backwards with the LF mapping, filling the snippet from its end */
		for(j=0; j < len+skip;) {
			get_info_sb(row, occ_sb, index);
			c = get_info_b(NULL_CHAR, row, occ_b, WHAT_CHAR_IS, index);
			assert(c < index->alpha_size_sb);
			c_sb = index->inv_map_sb[c];
			assert(c_sb < index->alpha_size);

			/* store the char once the first `skip` chars have been passed;
			   the special char of a marked index is never stored */
			if ((index->skip <= 1) || (c_sb != index->specialchar))
				if(j>=skip)
					text[len-1-(j-skip)] = index->inv_char_map[c_sb];

			row = index->bwt_occ[c_sb] + occ_sb[c_sb] + occ_b[c] - 1; /* get next row */
			if(row == index->bwt_eof_pos) break;
			if(c_sb != index->specialchar) j++;	/* special chars are not counted */
			row = EOF_shift(row);
		}

		(*snippet_len)[i] = len;
		text += length+2*nums;	/* advance by the fixed per-snippet stride */
	}		
	
	free (occ);
	return(FM_OK);	
}