/* Function: main() * Synopsis: Run set of queries against an FM * Purpose: Read in a FM and a file of query sequences. * For each query, find matching FM interval, then collect positions in * the original text T for the corresponding occurrences. These positions * are 0-based (so first character is position 0). */ int main(int argc, char *argv[]) { void* tmp; // used for RALLOC calls clock_t t1, t2; struct tms ts1, ts2; char *fname_fm = NULL; char *fname_queries = NULL; FM_HIT *hits = NULL; char *line = NULL; int status = eslOK; int hit_cnt = 0; int hit_indiv_cnt = 0; int miss_cnt = 0; int hit_num = 0; int hit_num2 = 0; int hits_size = 0; int i,j; int count_only = 0; FM_INTERVAL interval; FM_DATA *fmsf = NULL; FM_DATA *fmsb = NULL; FILE* fp_fm = NULL; FILE* fp = NULL; FILE* out = NULL; char *outname = NULL; ESL_GETOPTS *go = NULL; /* command line processing */ FM_CFG *cfg; FM_METADATA *meta; ESL_SQ *tmpseq; // used for sequence validation ESL_ALPHABET *abc = NULL; //start timer t1 = times(&ts1); process_commandline(argc, argv, &go, &fname_fm, &fname_queries); if (esl_opt_IsOn(go, "--out")) { outname = esl_opt_GetString(go, "--out"); if ( esl_strcmp ("-", outname) == 0 ) { out = stdout; outname = "stdout"; } else { out = fopen(outname,"w"); } } if (esl_opt_IsOn(go, "--count_only")) count_only = 1; if((fp_fm = fopen(fname_fm, "rb")) == NULL) esl_fatal("Cannot open file `%s': ", fname_fm); fm_configAlloc(&cfg); cfg->occCallCnt = 0; meta = cfg->meta; meta->fp = fp_fm; fm_readFMmeta( meta); if (meta->alph_type == fm_DNA) abc = esl_alphabet_Create(eslDNA); else if (meta->alph_type == fm_AMINO) abc = esl_alphabet_Create(eslAMINO); tmpseq = esl_sq_CreateDigital(abc); //read in FM-index blocks ESL_ALLOC(fmsf, meta->block_count * sizeof(FM_DATA) ); if (!meta->fwd_only) ESL_ALLOC(fmsb, meta->block_count * sizeof(FM_DATA) ); for (i=0; i<meta->block_count; i++) { fm_FM_read( fmsf+i,meta, TRUE ); if (!meta->fwd_only) { fm_FM_read(fmsb+i, meta, FALSE ); fmsb[i].SA = fmsf[i].SA; fmsb[i].T = fmsf[i].T; } } fclose(fp_fm); output_header(meta, stdout, go, fname_fm, fname_queries); /* initialize a few global variables, then call initGlobals * to do architecture-specific initialization */ fm_configInit(cfg, NULL); fm_alphabetCreate(meta, NULL); // don't override charBits fp = fopen(fname_queries,"r"); if (fp == NULL) esl_fatal("Unable to open file %s\n", fname_queries); ESL_ALLOC(line, FM_MAX_LINE * sizeof(char)); hits_size = 200; ESL_ALLOC(hits, hits_size * sizeof(FM_HIT)); while(fgets(line, FM_MAX_LINE, fp) ) { int qlen=0; while (line[qlen] != '\0' && line[qlen] != '\n') qlen++; if (line[qlen] == '\n') line[qlen] = '\0'; hit_num = 0; for (i=0; i<meta->block_count; i++) { fm_getSARangeReverse(fmsf+i, cfg, line, meta->inv_alph, &interval); if (interval.lower>=0 && interval.lower <= interval.upper) { int new_hit_num = interval.upper - interval.lower + 1; hit_num += new_hit_num; if (!count_only) { if (hit_num > hits_size) { hits_size = 2*hit_num; ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT)); } getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_forward); } } /* find reverse hits, using backward search on the forward FM*/ if (!meta->fwd_only) { fm_getSARangeForward(fmsb+i, cfg, line, meta->inv_alph, &interval);// yes, use the backward fm to produce the equivalent of a forward search on the forward fm if (interval.lower>=0 && interval.lower <= interval.upper) { int new_hit_num = interval.upper - interval.lower + 1; hit_num += new_hit_num; if (!count_only) { if (hit_num > hits_size) { hits_size = 2*hit_num; ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT)); } //even though I used fmsb above, use fmsf here, since we'll now do a backward trace //in the FM-index to find the next sampled SA position getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_backward); } } } } if (hit_num > 0) { if (count_only) { hit_cnt++; hit_indiv_cnt += hit_num; } else { hit_num2 = 0; //for each hit, identify the sequence id and position within that sequence for (i = 0; i< hit_num; i++) { status = fm_getOriginalPosition (fmsf, meta, hits[i].block, hits[i].length, fm_forward, hits[i].start, &(hits[i].block), &(hits[i].start) ); hits[i].sortkey = (status==eslERANGE ? -1 : meta->seq_data[ hits[i].block ].target_id); //validate match - if any characters in orig sequence were ambiguities, reject fm_convertRange2DSQ( fmsf, meta, hits[i].start, hits[i].length, p7_NOCOMPLEMENT, tmpseq, TRUE ); for (j=1; j<=hits[i].length; j++) { if (tmpseq->dsq[j] >= abc->K) { hits[i].sortkey = -1; //reject j = hits[i].length+1; //quit looking } } if (hits[i].sortkey != -1) hit_num2++; // legitimate hit } if (hit_num2 > 0) hit_cnt++; //now sort according the the sequence_id corresponding to that seq_offset qsort(hits, hit_num, sizeof(FM_HIT), hit_sorter); //skim past the skipped entries i = 0; while ( i < hit_num ) { if (hits[i].sortkey != -1 ) break; // i++; } if (i < hit_num) { if (out != NULL) { fprintf (out, "%s\n",line); //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-")); fprintf (out, " %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name); } hit_indiv_cnt++; i++; // skip the first one, since I'll be comparing each to the previous for ( ; i< hit_num; i++) { if ( //meta->seq_data[ hits[i].block ].id != meta->seq_data[ hits[i-1].block ].id || hits[i].sortkey != hits[i-1].sortkey || //sortkey is seq_data[].id hits[i].direction != hits[i-1].direction || hits[i].start != hits[i-1].start ) { if (out != NULL) //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-")); fprintf (out, " %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name); hit_indiv_cnt++; } } if (out != NULL) fprintf (out, "\n"); } } } else { miss_cnt++; } } for (i=0; i<meta->block_count; i++) { fm_FM_destroy( fmsf+i, 1 ); if (!meta->fwd_only) fm_FM_destroy( fmsb+i, 0 ); } free (hits); free (line); fclose(fp); fm_configDestroy(cfg); // compute and print the elapsed time in millisec t2 = times(&ts2); { double clk_ticks = sysconf(_SC_CLK_TCK); double elapsedTime = (t2-t1)/clk_ticks; double throughput = cfg->occCallCnt/elapsedTime; fprintf (stderr, "hit: %-10d (%d)\n", hit_cnt, hit_indiv_cnt); fprintf (stderr, "miss:%-10d\n", miss_cnt); fprintf (stderr, "run time: %.2f seconds\n", elapsedTime); fprintf (stderr, "occ calls: %12s\n", commaprint(cfg->occCallCnt)); fprintf (stderr, "occ/sec: %12s\n", commaprint(throughput)); } exit(eslOK); ERROR: printf ("failure allocating memory for hits\n"); exit(status); }
/* Function: FM_extendSeed() * Synopsis: Extend seed in both diagonal directions, capturing the score * * Details: Given a diagonal seed found using FM-index traversal (typically * around length 16, with a modest score, but not necessarily enough * to pass the SSV threshold), establish a window around that seed, * and extend it to maximize score (with the constraint of not going * through a long negative scoring stretch). Capture the score of * this extended diagonal. * * Args: diag - The initial seed * fm - Data for the FM-index (only need the forward FM from the * calling function). * ssvdata - Compact data required for computing MSV (SSV) scores * cfg - FM-index meta data * tmp_sq - Sequence object that this function uses for calculations. * Must be pre-allocated. * * Returns: <eslOK> on success. */ static int FM_extendSeed(FM_DIAG *diag, const FM_DATA *fm, const P7_SCOREDATA *ssvdata, FM_CFG *cfg, ESL_SQ *tmp_sq) { uint64_t k,n; int32_t model_start, model_end; int64_t target_start, target_end; int64_t hit_start, max_hit_start, max_hit_end; float sc; float max_sc = 0.0; int c; // this will allow a diagonal to be extended at least 10 bases in each direction, an up to as much as required to allow a diag of length cfg->ssv_length int extend = ESL_MAX(10, cfg->ssv_length - diag->length); //this determines the start and end of the window that we think it's possible we'll extend to the window to (which determines the sequence we'll extract) model_start = ESL_MAX ( 1, diag->k - extend + 1) ; model_end = ESL_MIN( ssvdata->M, diag->k + diag->length + extend - 1 ); target_start = diag->n - (diag->k - model_start); target_end = diag->n + (model_end - diag->k); if (target_start < 0) { model_start -= target_start; target_start = 0; } if (target_end > fm->N-2) { model_end -= target_end - (fm->N-2); target_end = fm->N-2; } fm_convertRange2DSQ(fm, cfg->meta, target_start, target_end-target_start+1, diag->complementarity, tmp_sq, FALSE ); //This finds the highest-scoring sub-diag in the just-determined potential diagonal range. k = model_start; n = 1; sc = 0.0; hit_start = n; for ( ; k <= model_end; k++, n++) { c = tmp_sq->dsq[n]; sc += ssvdata->ssv_scores_f[k*tmp_sq->abc->Kp + c]; if (sc < 0) { sc = 0; hit_start = n+1; } else if (sc > max_sc) { max_sc = sc; max_hit_start = hit_start; max_hit_end = n; } } diag->n = target_start + max_hit_start - 1; diag->k = model_start + max_hit_start - 1; diag->length = max_hit_end - max_hit_start + 1; diag->score = max_sc; return eslOK; }