/*
 * Walk backwards through the text with the LF mapping, starting from the
 * BWT row `row`, and store up to `len` characters in `dest`.
 *
 * The characters are stored in the order they are met by the backward
 * walk. The walk ends early when the row of the EOF symbol is reached,
 * i.e. when the beginning of the text has been hit.
 *
 * Returns the number of characters actually stored in `dest`.
 */
ulong go_back(fm_index *index, ulong row, ulong len, uchar *dest)
{
	ulong sb_counts[256], b_counts[256];
	ulong stored = 0;
	ulong cur, rank;
	uchar b_code, sym;

	/* the EOF row itself maps to row 0, every other row gets EOF-shifted */
	if (row == index->bwt_eof_pos)
		cur = 0;
	else
		cur = EOF_shift(row);

	while (stored < len) {
		/* occurrence counts taken from the superbucket header */
		get_info_sb(cur, sb_counts, index);

		/* bucket-level counts (remapped) plus the remapped code of the
		   character found at the current row */
		b_code = get_info_b(NULL_CHAR, cur, b_counts, WHAT_CHAR_IS, index);
		assert(b_code < index->alpha_size_sb);

		sym = index->inv_map_sb[b_code];
		assert(sym < index->alpha_size);

		/* the special char is not part of the output when skip > 1 */
		if (index->skip <= 1 || sym != index->specialchar)
			dest[stored++] = index->inv_char_map[sym];

		/* number of occurrences of sym before the current row */
		rank = sb_counts[sym] + b_counts[b_code];

		/* LF step: row of the preceding text character */
		cur = index->bwt_occ[sym] + rank - 1;
		if (cur == index->bwt_eof_pos)
			break;
		cur = EOF_shift(cur);
	}

	return stored;
}
/* ********************************************************************
   PAOLO: computes the ranks of the words, so that the inverse mapping
   can be carried out.

   One NULL every s->skip (in text order) is marked, and the last NULL
   is always marked. For every marked NULL its BWT row is stored in
   s->loc_occ, which is allocated here. s->skip and s->chosen_char are
   set as a side effect.

   *rank_range receives the number of NULLs (ranks lie in [0,null_occ)).
   Returns the number of marked NULLs (0 when Marked_char_freq is 0).
   ******************************************************************** */
int compute_ranks_dict(bwi_input *s, int *rank_range)
{
	int mc_pos_cmp(const void *, const void *);
	int k, word_rank, marked, n_null;
	int wanted, expected_marked;
	uchar null_code;
	mc_pos *positions;

	/* nothing has to be marked */
	if (Marked_char_freq == 0) {
		s->skip = 0;
		s->chosen_char = 0;
		*rank_range = 0;
		return 0;
	}

	/* the NULL char must occur in the dictionary */
	if (s->bool_char_map[0] == 0)
		fatal_error("The dictionary does not contain the NULL char!\n");

	/* remapped code of NULL: this is the char whose occurrences get marked */
	null_code = s->char_map[0];
	s->chosen_char = null_code;

	/* number of occurrences of the remapped NULL */
	if (null_code != s->alpha_size-1)
		n_null = s->pfx_char_occ[null_code+1] - s->pfx_char_occ[null_code];
	else
		n_null = s->text_size - s->pfx_char_occ[null_code];

	/* how many occurrences we would like to mark (at least one) */
	wanted = (int) (Marked_char_freq*n_null);
	if (wanted == 0)
		wanted = 1;

	/* distance between two consecutive marked occurrences */
	s->skip = 1;
	if (n_null > wanted) {
		s->skip = 1+n_null/(wanted+1);
		assert(s->skip > 0);
	}

	/* number of marked occurrences: ceiling of n_null / skip */
	expected_marked = n_null/s->skip;
	if (n_null % s->skip)
		expected_marked = n_null/s->skip + 1;

	/* output array plus a scratch array with one entry per NULL */
	s->loc_occ = (int *) malloc(sizeof(int) * (expected_marked));
	positions = malloc(n_null*sizeof(mc_pos));
	if (positions == NULL || s->loc_occ == NULL)
		fatal_error("compute_locations_dict (aux/loc_occ)");

	/* collect text position and BWT row of every NULL */
	for (k = 0; k < n_null; k++) {
		/* SA has no suffix starting with EOF, while the BWT matrix has
		   it as its first row */
		positions[k].text_pos = s->sa[s->pfx_char_occ[null_code] + k];
		assert(positions[k].text_pos>=0 && positions[k].text_pos<s->text_size);
		assert(s->text[positions[k].text_pos] == null_code);
		/* the row starting with EOF is not represented explicitly,
		   hence the EOF shift of the row number */
		positions[k].bwt_pos = EOF_shift(s->pfx_char_occ[null_code] + k);
	}

	/* put the NULLs in text order */
	qsort(positions, n_null, sizeof(mc_pos), mc_pos_cmp);

	/* scan the text: word_rank is the 1-based rank of the current NULL */
	marked = 0;
	word_rank = 1;
	for (k = 0; k < s->text_size; k++) {
		if (s->text[k] != null_code)
			continue;
		/* the last NULL is always marked, the others one every skip */
		if (word_rank == n_null || word_rank % s->skip == 0)
			s->loc_occ[marked++] = positions[word_rank-1].bwt_pos;
		word_rank++;
	}
	free(positions);

	if (Verbose > 1) {
		fprintf(stderr,"Rank: Marked %d NULLs out of %d ", marked,n_null);
		fprintf(stderr,"(one every %d is marked)\n", s->skip);
	}

	if (marked != expected_marked)
		fatal_error("No correct marking in compute_ranks_dict!\n");

	*rank_range = n_null;   /* ranks are in the range [0,null_occ) */
	return marked;
}
/*
 * Extract the text snippet covering positions [from, to] (inclusive).
 *
 * indexe          index handle, used as an fm_index *
 * from, to        text positions; `to` is clamped to text_size-1.
 *                 Requests with from >= text_size or from >= to are
 *                 treated as invalid and yield an empty snippet.
 * dest            receives a malloc'ed buffer holding the snippet, or
 *                 NULL when nothing was extracted
 * snippet_length  receives the number of bytes stored in *dest
 *
 * Returns FM_OK on success (also for an invalid range), FM_NOMARKEDCHAR
 * when the index has skip == 0, FM_OUTMEM when the buffer cannot be
 * allocated, or whatever fm_unbuild returns when the whole text is
 * requested.
 */
int extract(void * indexe, ulong from, ulong to, uchar **dest, ulong *snippet_length)
{
	fm_index * index = (fm_index *) indexe;
	ulong j, skip, numchar;
	uchar * text;

	/* give the outputs a defined value on every early return */
	*dest = NULL;
	*snippet_length = 0;

	if ((from >= index->text_size) || (from >= to))
		return FM_OK; // Invalid Position

	if (index->skip == 0)
		return FM_NOMARKEDCHAR;

	to = MIN(to, index->text_size-1);

	if (index->smalltext) {
		/* index->text is read directly: just copy the requested range */
		numchar = to-from+1;
		text = malloc(sizeof(uchar)*numchar);
		if (text == NULL)
			return FM_OUTMEM;
		memcpy(text, index->text+from, numchar*sizeof(uchar));
		*dest = text;
		*snippet_length = numchar;
		return FM_OK;
	}

	ulong real_text_size;
	if (index->skip > 1)
		real_text_size = index->text_size-index->num_marked_rows;
	else
		real_text_size = index->text_size;

	if ((from == 0) && (to == real_text_size-1)) {
		/* whole text requested (might be worthwhile even for a
		   smaller range) */
		return fm_unbuild(index, dest, snippet_length);
	}

	numchar = to-from+1;

	/* get_row: read the stored row for the marked slot to/skip */
	ulong occ_sb[256], occ_b[256];
	uchar c, c_sb;
	ulong pos_row_compr = to/index->skip;
	ulong offset = (pos_row_compr*index->log2_row)%8;
	fm_init_bit_reader(index->compress+index->pos_marked_row_extr+pos_row_compr*index->log2_row/8);
	if (offset)
		fm_bit_read(offset);
	ulong row = fm_bit_read(index->log2_row);
	row = EOF_shift(row);

	text = malloc(sizeof(uchar) * numchar);
	if (text == NULL)
		return FM_OUTMEM;

	/* number of characters to walk over before the one at position `to`
	   is reached; `to` was clamped above, so no other case is possible */
	skip = index->skip-to%index->skip-1;

	for (j = 0; j < numchar+skip; ) {
		get_info_sb(row, occ_sb, index);
		c = get_info_b(NULL_CHAR, row, occ_b, WHAT_CHAR_IS, index);
		assert(c < index->alpha_size_sb);
		c_sb = index->inv_map_sb[c];
		assert(c_sb < index->alpha_size);

		/* with skip <= 1 every character counts as text, otherwise
		   the special char is not part of the text (same rule that
		   go_back applies) */
		int is_text_char = (index->skip <= 1) || (c_sb != index->specialchar);

		/* the walk goes backwards, so the buffer is filled from its end */
		if (is_text_char && j >= skip)
			text[numchar-1-(j-skip)] = index->inv_char_map[c_sb];

		row = index->bwt_occ[c_sb] + occ_sb[c_sb] + occ_b[c] - 1; // get next row
		if (row == index->bwt_eof_pos)
			break;

		/* advance exactly when the character counted as text, so that
		   a stored character is never overwritten by the next one */
		if (is_text_char)
			j++;
		row = EOF_shift(row);
	}

	*snippet_length = numchar;
	*dest = text;
	return FM_OK;
}
/*
 * Locate every occurrence of `pattern` and extract, for each of them, a
 * snippet made of the occurrence plus up to `nums` characters on each side.
 *
 * indexe        index handle, used as an fm_index *
 * pattern       pattern to search for, `length` bytes long
 * nums          number of context characters wanted before and after
 * numocc        receives the number of occurrences found by locate
 * snippet_text  receives a malloc'ed buffer in which snippet i starts at
 *               offset i*(length+2*nums); NULL when there is no occurrence
 * snippet_len   receives a malloc'ed array with the actual length of each
 *               snippet (shorter near the text boundaries); NULL when
 *               there is no occurrence
 *
 * Returns FM_OK on success (also with zero occurrences), the error code of
 * locate when it fails, FM_NOMARKEDCHAR when the index has skip == 0, or
 * FM_OUTMEM when an output buffer cannot be allocated. On every error
 * return both output pointers are NULL.
 */
int display(void *indexe, uchar *pattern, ulong length, ulong nums, ulong *numocc,
            uchar **snippet_text, ulong **snippet_len)
{
	ulong row, i, j, len, skip, to, from, pos_row_compr, offset, occ_sb[256], occ_b[256];
	ulong *occ = NULL;   /* stays NULL if locate does not fill it in */
	uchar c, c_sb;
	fm_index * index = (fm_index *) indexe;
	const ulong stride = length + 2*nums;   /* room reserved per snippet */

	*snippet_len = NULL;
	*snippet_text = NULL;

	/* locate */
	int error = locate (indexe, pattern, length, &occ, numocc);
	if (error != FM_OK)
		return error;

	/* nothing found: outputs stay NULL, do not fall through to malloc(0) */
	if (*numocc == 0) {
		free(occ);
		return FM_OK;
	}

	/* without marked rows the lookup below would divide by zero */
	if (index->skip == 0) {
		free(occ);
		return FM_NOMARKEDCHAR;
	}

	ulong *lens = malloc(sizeof(ulong) * (*numocc));
	uchar *buf = malloc(sizeof(uchar) * stride * (*numocc));
	if (lens == NULL || buf == NULL) {
		free(lens);
		free(buf);
		free(occ);
		return FM_OUTMEM;
	}

	uchar *text = buf;
	ulong pos;
	for (i = 0; i < *numocc; i++) {
		pos = occ[i];

		/* snippet boundaries, clipped to the text */
		if (pos > nums)
			from = pos-nums;
		else
			from = 0;
		to = pos+length+nums-1;
		if (to > index->text_size-1)
			to = index->text_size-1;
		len = to-from+1;

		/* get_row: read the stored row for the marked slot to/skip */
		pos_row_compr = to/index->skip;
		offset = (pos_row_compr*index->log2_row)%8;
		fm_init_bit_reader(index->compress+index->pos_marked_row_extr+pos_row_compr*index->log2_row/8);
		if (offset)
			fm_bit_read(offset);
		row = fm_bit_read(index->log2_row);
		row = EOF_shift(row);

		/* number of characters to walk over before the one at position
		   `to` is reached; `to` was clipped above */
		skip = index->skip-to%index->skip-1;

		for (j = 0; j < len+skip; ) {
			get_info_sb(row, occ_sb, index);
			c = get_info_b(NULL_CHAR, row, occ_b, WHAT_CHAR_IS, index);
			assert(c < index->alpha_size_sb);
			c_sb = index->inv_map_sb[c];
			assert(c_sb < index->alpha_size);

			/* with skip <= 1 every character counts as text, otherwise
			   the special char is not part of the text (same rule
			   that go_back applies) */
			int is_text_char = (index->skip <= 1) || (c_sb != index->specialchar);

			/* the walk goes backwards: fill the snippet from its end */
			if (is_text_char && j >= skip)
				text[len-1-(j-skip)] = index->inv_char_map[c_sb];

			row = index->bwt_occ[c_sb] + occ_sb[c_sb] + occ_b[c] - 1; // get next row
			if (row == index->bwt_eof_pos)
				break;

			/* advance exactly when the character counted as text */
			if (is_text_char)
				j++;
			row = EOF_shift(row);
		}

		lens[i] = len;
		text += stride;
	}

	free(occ);
	*snippet_len = lens;
	*snippet_text = buf;
	return FM_OK;
}