Пример #1
0
/* Creates a table with two GtMatchReference entries per match in <matches>:
   one carrying the start position of the match on sequence 1, one carrying
   the start position on sequence 2. Both entries store the index of the match
   within <matches>. The table (2 * number of matches entries) is sorted with
   cmpmatchreferences() before it is returned.
   The table is allocated with gt_calloc(); presumably the caller releases
   it -- confirm against the call sites. */
static GtMatchReference* gt_mirror_and_sort_matches(GtArray *matches)
{
  const unsigned long nof_matches = gt_array_size(matches);
  GtMatchReference *table;
  unsigned long idx;

  table = gt_calloc((size_t) (nof_matches * 2), sizeof (GtMatchReference));

  for (idx = 0; idx < nof_matches; idx++) {
    GtMatch *current = *(GtMatch**) gt_array_get(matches, idx);
    /* entries 2*idx and 2*idx+1 belong to the match with index idx */
    GtMatchReference *slot = table + 2 * idx;
    GtRange first_range, second_range;

    gt_match_get_range_seq1(current, &first_range);
    gt_match_get_range_seq2(current, &second_range);

    slot[0].startpos = first_range.start;
    slot[0].matchnum = idx;
    slot[1].startpos = second_range.start;
    slot[1].matchnum = idx;
  }

  qsort(table, (size_t) (2 * nof_matches), sizeof (GtMatchReference),
        cmpmatchreferences);
  return table;
}
/* Runner of the condenser search tool.
   <tool_arguments> must point to a GtCondenserSearchArguments object.
   If neither the blastn nor the blastp option is set, only the name of the
   coarse hit file is built and nothing else happens. Otherwise:
   1. the compressed database is loaded from arguments->dbpath,
   2. if the coarse or the fine E-value is undefined, the average query
      length is determined and a raw E-value is derived from it and from
      arguments->bitscore,
   3. a BLAST database is created from "<dbpath>.fas" and searched with the
      queries (coarse search),
   4. the unique sequences hit by the coarse search are extracted into
      "coarse_<db basename without suffix>.fas",
   5. a BLAST database is created from that file and searched again (fine
      search); each fine hit is written as one tab separated line to
      arguments->outfp.
   Returns 0 on success; on failure a non-zero value is returned and <err> is
   expected to be set by the failing step. */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath;
  GtStr *coarse_fname;
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  /* check the argument object before it is dereferenced for the first time */
  gt_assert(arguments);

  querypath = gt_str_get(arguments->querypath);
  coarse_fname = gt_str_new_cstr("coarse_");
  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file, hence the search for
     the suffix separator starts at the second character */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch              *match;
    GtMatchIterator      *mp = NULL;
    GtNREncseq           *nrencseq = NULL;
    GtStr                *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition          *hits;
    double                eval,
                          raw_eval = 0.0;
    GtUword               coarse_db_len = 0;
    GtMatchIteratorStatus status;
    const int             hit_chunk = 100; /* growth step of the hit table */
    int                   curr_hits = 0,
                          max_hits = hit_chunk;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);

    gt_str_append_cstr(fastaname, ".fas");

    /* every slot of the hit table owns its range; all max_hits slots are
       released again in the cleanup section at the end */
    for (i = 0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range));
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /* load the compressed database */
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }
    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          /* validate the average before it enters the computation */
          gt_assert(avg.avg != 0);
          /* n * 2^-S'; the subject length m is multiplied in later */
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /* create BLAST database from compressed database fasta file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(gt_str_get(fastaname),
                                                          err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(gt_str_get(fastaname),
                                                          err);
    }

    /* perform coarse BLAST search and collect the ids of the hit uniques */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      /* NOTE(review): arguments->ceval is handed over unchanged even if it
         equals GT_UNDEF_DOUBLE; raw_eval is only applied to the fine search
         below -- confirm that this is intended */
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          /* the db header has to consist of a word of at most 6 characters
             directly followed by the number of the unique */
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            gt_match_delete(match);
            curr_hits++;
            if (curr_hits == max_hits) {
              /* hit table is full: enlarge it by another chunk of slots */
              max_hits += hit_chunk;
              hits = gt_realloc(hits, sizeof (*hits) * (size_t) max_hits);
              for (i = max_hits - hit_chunk; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          } else {
            /* dbseqid is obtained from the match, so the message has to be
               composed before the match is released */
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            gt_match_delete(match);
            had_err = -1;
          }
        } else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }

    /* extract the uniques hit by the coarse search */
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;
      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);
      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      /* do not extract into a file that could not be opened */
      if (coarse_hits == NULL)
        had_err = -1;
      /* TODO DW do NOT extract complete uniques! these could be complete
         chromosomes!! just extract something around it? maybe +- max query
         length*/
      if (!had_err) {
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* the length is only meaningful if the extraction succeeded; a
           failed extraction is reported via had_err instead of aborting */
        gt_assert(had_err || coarse_db_len != 0);
      }
      gt_file_delete(coarse_hits);
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err =
          gt_condenser_search_create_nucl_blastdb(gt_str_get(coarse_fname),
                                                  err);
      else
        had_err =
          gt_condenser_search_create_prot_blastdb(gt_str_get(coarse_fname),
                                                  err);
    }

    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      /* without a user defined fine E-value, scale the raw E-value by the
         length of the extracted coarse database */
      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            /* columns: query id, db id, similarity, alignment length,
               query start/end, db start/end, E-value, bit score */
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          } else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);

    }
    if (!had_err && timer != NULL)
      gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i = 0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}
Пример #3
0
/* Derives cluster edges from <matches> and hands them to
   gt_cluster_matches() to update the clustered set <cs>.
   For every match the numbers of both sequences are looked up in
   <seqdesc2seqnum> by their sequence ids (the stored value is the sequence
   number + 1). A match contributes an edge between its two sequences if the
   sequences differ and both match lengths reach at least <plarge> percent of
   the longer and <psmall> percent of the shorter of the two sequences of
   <encseq>.
   Returns 0 on success. Returns -1 and sets <err> if the number of sequences
   in <encseq> differs from the number of elements in <cs>, if a sequence id
   is not contained in <seqdesc2seqnum>, or if gt_cluster_matches() fails. */
static int cluster_sequences(GtArray *matches,
                             GtClusteredSet *cs,
                             GtHashmap *seqdesc2seqnum,
                             unsigned int psmall,
                             unsigned int plarge,
                             GtEncseq *encseq,
                             GtError *err)
{
  GtMatch *match;
  GtMatchEdgeTable matchedgetab;
  GtMatchEdge matchedge;
  GtRange rng_seq1,
          rng_seq2;
  int had_err = 0;
  unsigned long i,
                lsmall,
                llarge,
                min_large,
                min_small,
                matchlen1,
                matchlen2,
                num_of_seq,
                num_of_elems,
                seqlen1,
                seqlen2,
                seqnum1 = 0,
                seqnum2 = 0;
  const char *seqid;
  void *mapped;

  /* validate the arguments before any of them is used */
  gt_assert(matches && cs && seqdesc2seqnum && encseq);

  num_of_seq = gt_encseq_num_of_sequences(encseq);
  num_of_elems = gt_clustered_set_num_of_elements(cs, err);
  if (num_of_elems != num_of_seq) {
    gt_error_set(err,
                 "number of sequences (%lu) unequals number of elements in"
                 " clustered set (%lu)",
                 num_of_seq, num_of_elems);
    return -1;
  }

  matchedgetab.edges = gt_array_new(sizeof (GtMatchEdge));
  matchedgetab.num_of_edges = 0;

  /* stop at the first failure so that <err> keeps describing its cause */
  for (i = 0; !had_err && i < gt_array_size(matches); i++) {
    match = *(GtMatch**) gt_array_get(matches, i);
    gt_match_get_range_seq1(match, &rng_seq1);
    gt_match_get_range_seq2(match, &rng_seq2);

    matchlen1 = gt_range_length(&rng_seq1);
    matchlen2 = gt_range_length(&rng_seq2);

    /* the map stores sequence number + 1, so NULL means "key not found" */
    seqid = gt_match_get_seqid1(match);
    mapped = gt_hashmap_get(seqdesc2seqnum, seqid);
    if (mapped != NULL)
      seqnum1 = ((unsigned long) mapped) - 1;
    else {
      had_err = -1;
      gt_error_set(err, "key %s not found", seqid);
    }

    /* only look up the second id if the first one was found; otherwise the
       error set above would be replaced by one naming the wrong key */
    if (!had_err) {
      seqid = gt_match_get_seqid2(match);
      mapped = gt_hashmap_get(seqdesc2seqnum, seqid);
      if (mapped != NULL)
        seqnum2 = ((unsigned long) mapped) - 1;
      else {
        had_err = -1;
        gt_error_set(err, "key %s not found", seqid);
      }
    }

    if (!had_err) {
      seqlen1 = gt_encseq_seqlength(encseq, seqnum1);
      seqlen2 = gt_encseq_seqlength(encseq, seqnum2);
      if (seqlen1 > seqlen2) {
        llarge = seqlen1;
        lsmall = seqlen2;
      } else {
        lsmall = seqlen1;
        llarge = seqlen2;
      }
      /* minimal match lengths demanded by the two percentages */
      min_large = (llarge * plarge)/100;
      min_small = (lsmall * psmall)/100;
      if (min_large <= matchlen1 && min_small <= matchlen1 &&
          min_large <= matchlen2 && min_small <= matchlen2 &&
          seqnum1 != seqnum2) {
        matchedge.matchnum0 = seqnum1;
        matchedge.matchnum1 = seqnum2;
        gt_array_add(matchedgetab.edges, matchedge);
        matchedgetab.num_of_edges++;
      }
    }
  }

  if (!had_err && gt_cluster_matches(cs, &matchedgetab, err) != 0)
    had_err = -1;

  /* the edge table is allocated on every path that reaches this point, so it
     has to be released on failure as well */
  gt_array_delete(matchedgetab.edges);
  return had_err;
}