/* Function: FM_backtrackSeed() * * Synopsis: Find position(s) in the FM index for a diagonal that meets score threshold * * Details: Follows the BWT/FM-index until finding an entry of the implicit * suffix array that is found in the sampled SA. * * Args: fmf - FM index for finding matches to the input sequence * fm_cfg - FM-index meta data * i - Single position in the BWT * * Returns: <eslOK> on success. */ static uint32_t FM_backtrackSeed(const FM_DATA *fmf, const FM_CFG *fm_cfg, int i) { int j = i; int len = 0; int c; while ( j != fmf->term_loc && (j % fm_cfg->meta->freq_SA)) { //go until we hit a position in the full SA that was sampled during FM index construction c = fm_getChar( fm_cfg->meta->alph_type, j, fmf->BWT); j = fm_getOccCount (fmf, fm_cfg, j-1, c); j += abs(fmf->C[c]); len++; } return len + (j==fmf->term_loc ? 0 : fmf->SA[ j / fm_cfg->meta->freq_SA ]) ; // len is how many backward steps we had to take to find a sampled SA position }
/* Function:  getFMHits()
 * Synopsis:  For a given interval, identify the position in original text
 *            for each element of interval.
 * Purpose:   Implement Algorithm 3.7 (p17) of Firth paper (A Comparison of
 *            BWT Approaches to String Pattern Matching). Most of the work
 *            is in the method of counting characters, fm_getOccCount(),
 *            which (per the original note) depends on compilation choices.
 *
 *            For every BWT position in <interval->lower>..<interval->upper>
 *            (inclusive), one entry of <hits_ptr> is filled in, beginning
 *            at index <hit_offset>. Each entry receives <block_id>,
 *            <fm_direction>, <hit_length>, and a computed <start>.
 *
 * Args:      fm           - FM index data (BWT, C, SA, term_loc, N are read)
 *            cfg          - FM-index configuration (meta->freq_SA and
 *                           meta->alph_type are read)
 *            interval     - range of BWT positions to resolve
 *            block_id     - value stored in each hit's <block> field
 *            hit_offset   - index in <hits_ptr> of the first hit written
 *            hit_length   - value stored in each hit's <length> field
 *            hits_ptr     - output array of hits; caller supplies storage
 *            fm_direction - value stored in each hit's <direction> field;
 *                           compared against <fm_forward> when computing
 *                           <start>
 *
 * Returns:   <eslOK>.
 */
int
getFMHits( FM_DATA *fm, FM_CFG *cfg, FM_INTERVAL *interval, int block_id,
           int hit_offset, int hit_length, FM_HIT *hits_ptr, int fm_direction)
{
  int bwt_idx;         /* walks the interval                         */
  int pos;             /* current position while backtracking        */
  int steps = 0;       /* backward steps taken for this element      */
  int dist_from_end;

  for (bwt_idx = interval->lower; bwt_idx <= interval->upper; bwt_idx++) {
    FM_HIT *hit = &hits_ptr[hit_offset + bwt_idx - interval->lower];

    /* Go backward until we hit the terminator position or a position in
     * the full SA that was sampled during FM index construction. */
    pos   = bwt_idx;
    steps = 0;
    while ( !(pos == fm->term_loc || (pos % cfg->meta->freq_SA) == 0) ) {
      uint8_t sym = fm_getChar(cfg->meta->alph_type, pos, fm->BWT);
      pos  = fm_getOccCount(fm, cfg, pos-1, sym);
      pos += abs(fm->C[sym]);
      steps++;
    }

    hit->block     = block_id;
    hit->direction = fm_direction;
    hit->length    = hit_length;

    /* <steps> is how many backward steps were needed to find a sampled SA
     * position; at the terminator there is no SA value to add. */
    dist_from_end = 1 + steps;
    if (pos != fm->term_loc)
      dist_from_end += fm->SA[ pos / cfg->meta->freq_SA ];

    /* Per the original note: the SA is on the reversed string, so the
     * distance is adjusted by direction before converting it to a
     * position in the unreversed string. */
    dist_from_end += (fm_direction == fm_forward) ? hit_length : 1;

    hit->start = fm->N - dist_from_end;
  }

  return eslOK;
}