/* Function:  main()
 * Synopsis:  Run set of queries against an FM
 * Purpose:   Read in a FM and a file of query sequences.
 *            For each query, find matching FM interval, then collect positions in
 *            the original text T for the corresponding occurrences. These positions
 *            are 0-based (so first character is position 0).
 */
int
main(int argc,  char *argv[]) 
{
  void* tmp; // used for RALLOC calls
  clock_t t1, t2;
  struct tms ts1, ts2;
  char *fname_fm      = NULL;
  char *fname_queries = NULL;
  FM_HIT *hits        = NULL;
  char *line          = NULL;
  int status        = eslOK;
  int hit_cnt       = 0;
  int hit_indiv_cnt = 0;
  int miss_cnt      = 0;
  int hit_num       = 0;
  int hit_num2       = 0;
  int hits_size     = 0;
  int i,j;
  int count_only    = 0;

  FM_INTERVAL interval;
  FM_DATA *fmsf = NULL;
  FM_DATA *fmsb = NULL;
  FILE* fp_fm   = NULL;
  FILE* fp      = NULL;
  FILE* out     = NULL;
  char *outname = NULL;

  ESL_GETOPTS     *go  = NULL;    /* command line processing                 */
  FM_CFG *cfg;
  FM_METADATA *meta;

  ESL_SQ       *tmpseq;  // used for sequence validation
  ESL_ALPHABET *abc = NULL;


  //start timer
  t1 = times(&ts1);

  process_commandline(argc, argv, &go, &fname_fm, &fname_queries);


  if (esl_opt_IsOn(go, "--out")) {
    outname = esl_opt_GetString(go, "--out");
    if ( esl_strcmp ("-", outname) == 0 ) {
      out = stdout;
      outname = "stdout";
    } else {
      out = fopen(outname,"w");
    }
  }

  if (esl_opt_IsOn(go, "--count_only"))
    count_only = 1;


  if((fp_fm = fopen(fname_fm, "rb")) == NULL)
    esl_fatal("Cannot open file `%s': ", fname_fm);


  fm_configAlloc(&cfg);
  cfg->occCallCnt = 0;
  meta = cfg->meta;
  meta->fp = fp_fm;


  fm_readFMmeta( meta);



  if      (meta->alph_type == fm_DNA)   abc     = esl_alphabet_Create(eslDNA);
  else if (meta->alph_type == fm_AMINO) abc     = esl_alphabet_Create(eslAMINO);
  tmpseq = esl_sq_CreateDigital(abc);



  //read in FM-index blocks
  ESL_ALLOC(fmsf, meta->block_count * sizeof(FM_DATA) );
  if (!meta->fwd_only)
    ESL_ALLOC(fmsb, meta->block_count * sizeof(FM_DATA) );

  for (i=0; i<meta->block_count; i++) {
    fm_FM_read( fmsf+i,meta, TRUE );

    if (!meta->fwd_only) {
      fm_FM_read(fmsb+i, meta, FALSE );
      fmsb[i].SA = fmsf[i].SA;
      fmsb[i].T = fmsf[i].T;
    }
  }
  fclose(fp_fm);

  output_header(meta, stdout, go, fname_fm, fname_queries);


  /* initialize a few global variables, then call initGlobals
   * to do architecture-specific initialization
   */
  fm_configInit(cfg, NULL);

  fm_alphabetCreate(meta, NULL); // don't override charBits

  fp = fopen(fname_queries,"r");
  if (fp == NULL)
    esl_fatal("Unable to open file %s\n", fname_queries);

  ESL_ALLOC(line, FM_MAX_LINE * sizeof(char));

  hits_size = 200;
  ESL_ALLOC(hits, hits_size * sizeof(FM_HIT));

  while(fgets(line, FM_MAX_LINE, fp) ) {
    int qlen=0;
    while (line[qlen] != '\0' && line[qlen] != '\n')  qlen++;
    if (line[qlen] == '\n')  line[qlen] = '\0';

    hit_num = 0;

    for (i=0; i<meta->block_count; i++) {

      fm_getSARangeReverse(fmsf+i, cfg, line, meta->inv_alph, &interval);
      if (interval.lower>=0 && interval.lower <= interval.upper) {
        int new_hit_num =  interval.upper - interval.lower + 1;
        hit_num += new_hit_num;
        if (!count_only) {
          if (hit_num > hits_size) {
            hits_size = 2*hit_num;
            ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT));
          }
          getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_forward);
        }

      }


      /* find reverse hits, using backward search on the forward FM*/
      if (!meta->fwd_only) {
        fm_getSARangeForward(fmsb+i, cfg, line, meta->inv_alph, &interval);// yes, use the backward fm to produce the equivalent of a forward search on the forward fm
        if (interval.lower>=0 && interval.lower <= interval.upper) {
          int new_hit_num =  interval.upper - interval.lower + 1;
          hit_num += new_hit_num;
          if (!count_only) {
            if (hit_num > hits_size) {
              hits_size = 2*hit_num;
              ESL_RALLOC(hits, tmp, hits_size * sizeof(FM_HIT));
            }
            //even though I used fmsb above, use fmsf here, since we'll now do a backward trace
            //in the FM-index to find the next sampled SA position
            getFMHits(fmsf+i, cfg, &interval, i, hit_num-new_hit_num, qlen, hits, fm_backward);
          }
        }

      }

    }


    if (hit_num > 0) {
      if (count_only) {
        hit_cnt++;
        hit_indiv_cnt += hit_num;
      } else {
        hit_num2 = 0;

        //for each hit, identify the sequence id and position within that sequence
        for (i = 0; i< hit_num; i++) {

          status = fm_getOriginalPosition (fmsf, meta, hits[i].block, hits[i].length, fm_forward, hits[i].start,  &(hits[i].block), &(hits[i].start) );
          hits[i].sortkey = (status==eslERANGE ? -1 : meta->seq_data[ hits[i].block ].target_id);

          //validate match - if any characters in orig sequence were ambiguities, reject
          fm_convertRange2DSQ( fmsf, meta, hits[i].start, hits[i].length, p7_NOCOMPLEMENT, tmpseq, TRUE );
          for (j=1; j<=hits[i].length; j++) {
            if (tmpseq->dsq[j] >= abc->K) {
              hits[i].sortkey = -1; //reject
              j = hits[i].length+1; //quit looking
            }
          }

          if (hits[i].sortkey != -1)
            hit_num2++; // legitimate hit

        }
        if (hit_num2 > 0)
          hit_cnt++;

        //now sort according the the sequence_id corresponding to that seq_offset
        qsort(hits, hit_num, sizeof(FM_HIT), hit_sorter);

        //skim past the skipped entries
        i = 0;
        while ( i < hit_num ) {
          if (hits[i].sortkey != -1 )
            break;  //
          i++;
        }


        if (i < hit_num) {
          if (out != NULL) {
            fprintf (out, "%s\n",line);
            //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-"));
            fprintf (out, "    %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name);
          }
          hit_indiv_cnt++;
          i++; // skip the first one, since I'll be comparing each to the previous

          for (  ; i< hit_num; i++) {
            if ( //meta->seq_data[ hits[i].block ].id != meta->seq_data[ hits[i-1].block ].id ||
                 hits[i].sortkey   != hits[i-1].sortkey ||  //sortkey is seq_data[].id
                 hits[i].direction != hits[i-1].direction ||
                 hits[i].start     != hits[i-1].start )
            {
              if (out != NULL)
                //fprintf (out, "\t%10s (%8d %s)\n",meta->seq_data[ hits[i].block ].name, hits[i].start, (hits[i].direction==fm_forward?"+":"-"));
                fprintf (out, "    %8ld %s %10s\n", (long)(hits[i].start), (hits[i].direction==fm_forward?"f":"r"), meta->seq_data[ hits[i].block ].name);
              hit_indiv_cnt++;
            }
          }
          if (out != NULL)
            fprintf (out, "\n");
        }
      }
    } else {
      miss_cnt++;
    }


  }

  for (i=0; i<meta->block_count; i++) {
    fm_FM_destroy( fmsf+i, 1 );
    if (!meta->fwd_only)
      fm_FM_destroy( fmsb+i, 0 );
  }


  free (hits);
  free (line);
  fclose(fp);

  fm_configDestroy(cfg);


  // compute and print the elapsed time in millisec
  t2 = times(&ts2);
  {
    double clk_ticks = sysconf(_SC_CLK_TCK);
    double elapsedTime = (t2-t1)/clk_ticks;
    double throughput = cfg->occCallCnt/elapsedTime;

    fprintf (stderr, "hit: %-10d  (%d)\n", hit_cnt, hit_indiv_cnt);
    fprintf (stderr, "miss:%-10d\n", miss_cnt);
    fprintf (stderr, "run time:  %.2f seconds\n", elapsedTime);
    fprintf (stderr, "occ calls: %12s\n", commaprint(cfg->occCallCnt));
    fprintf (stderr, "occ/sec:   %12s\n", commaprint(throughput));
  }

  exit(eslOK);


ERROR:
  printf ("failure allocating memory for hits\n");
  exit(status);


}
Beispiel #2
0
/* Function:  FM_extendSeed()
 * Synopsis:  Extend seed in both diagonal directions, capturing the score
 *
 * Details:   Given a diagonal seed found using FM-index traversal (typically
 *            around length 16, with a modest score, but not necessarily enough
 *            to pass the SSV threshold), establish a window around that seed,
 *            and extend it to maximize score (with the constraint of not going
 *            through a long negative scoring stretch). Capture the score of
 *            this extended diagonal.
 *
 * Args:      diag    - The initial seed
 *            fm      - Data for the FM-index (only need the forward FM from the
 *                      calling function).
 *            ssvdata - Compact data required for computing MSV (SSV) scores
 *            cfg     - FM-index meta data
 *            tmp_sq  - Sequence object that this function uses for calculations.
 *                      Must be pre-allocated.
 *
 * Returns:   <eslOK> on success.
 */
static int
FM_extendSeed(FM_DIAG *diag, const FM_DATA *fm, const P7_SCOREDATA *ssvdata, FM_CFG *cfg, ESL_SQ  *tmp_sq)
{

  uint64_t k,n;
  int32_t model_start, model_end;
  int64_t target_start, target_end;
  int64_t hit_start, max_hit_start, max_hit_end;
  float sc;
  float max_sc = 0.0;
  int c;
  // this will allow a diagonal to be extended at least 10 bases in each direction, an up to as much as required to allow a diag of length  cfg->ssv_length
  int extend = ESL_MAX(10, cfg->ssv_length - diag->length);

  //this determines the start and end of the window that we think it's possible we'll extend to the window to (which determines the sequence we'll extract)
  model_start      = ESL_MAX ( 1,         diag->k - extend + 1) ;
  model_end        = ESL_MIN( ssvdata->M, diag->k + diag->length + extend - 1 );
  target_start     = diag->n - (diag->k - model_start);
  target_end       = diag->n + (model_end - diag->k);

  if (target_start < 0) {
    model_start -= target_start;
    target_start = 0;
  }
  if (target_end > fm->N-2) {
    model_end -= target_end - (fm->N-2);
    target_end = fm->N-2;
  }

  fm_convertRange2DSQ(fm, cfg->meta, target_start, target_end-target_start+1, diag->complementarity, tmp_sq, FALSE );


  //This finds the highest-scoring sub-diag in the just-determined potential diagonal range.
  k = model_start;
  n = 1;
  sc = 0.0;

  hit_start = n;
  for (  ; k <= model_end; k++, n++) {
      c = tmp_sq->dsq[n];

      sc  += ssvdata->ssv_scores_f[k*tmp_sq->abc->Kp + c];

      if (sc < 0) {
        sc = 0;
        hit_start = n+1;
      } else if (sc > max_sc) {
          max_sc = sc;
          max_hit_start = hit_start;
          max_hit_end   = n;
      }
  }


  diag->n       = target_start + max_hit_start - 1;
  diag->k       = model_start  + max_hit_start - 1;
  diag->length  = max_hit_end - max_hit_start + 1;
  diag->score   = max_sc;


  return eslOK;
}