示例#1
0
/*read fastasequences from file with GtFastaReader (-ff)*/
static int get_fastasequences(GtSequenceTable *sequence_table,
                              const GtStr *filename,
                              GtError *err)
{
  int had_err = 0;
  GtFastaReader *reader;

  gt_assert(sequence_table != NULL);
  reader = gt_fasta_reader_rec_new (filename);
  had_err = gt_fasta_reader_run(reader, NULL, save_fastasequence,
                                NULL, sequence_table, err);
  gt_fasta_reader_delete(reader);
  return had_err;
}
示例#2
0
int gt_paircmp(int argc, const char **argv, GtError *err)
{
  int parsed_args;
  Cmppairwiseopt cmppairwise;
  GtOPrval oprval;
  GtFastaReader *reader0 = NULL,
                *reader1 = NULL;

  gt_error_check(err);

  oprval = parse_options(&parsed_args, &cmppairwise, argc, argv, err);
  if (oprval == GT_OPTION_PARSER_OK)
  {
    gt_assert(parsed_args == argc);
    showsimpleoptions(&cmppairwise);
    if (cmppairwise.showedist)
    {
      GtUword edist, len1, len2;
      GtStr *s1, *s2;

      gt_assert(gt_str_array_size(cmppairwise.strings) >= 2);
      s1 = gt_str_array_get_str(cmppairwise.strings,0);
      s2 = gt_str_array_get_str(cmppairwise.strings,1UL);
      len1 = gt_str_length(s1);
      len2 = gt_str_length(s2);
      edist = gt_computegreedyunitedist((const GtUchar *) gt_str_get(s1),
                                        len1,
                                        (const GtUchar *) gt_str_get(s2),
                                        len2);
      printf(GT_WU " " GT_WU " " GT_WU " " GT_WU "%% errors\n",
             edist, len1,len2,(200 * edist)/(len1+len2));
    }
    else if (cmppairwise.print)
    {
      const GtStr *str0 = gt_str_array_get_str(cmppairwise.strings,0),
                  *str1 = gt_str_array_get_str(cmppairwise.strings,1);

      gt_print_edist_alignment((const GtUchar *) gt_str_get(str0),0,
                               gt_str_length(str0),
                               (const GtUchar *) gt_str_get(str1),0,
                               gt_str_length(str1));
    } else
    {
      size_t idx;
      Checkfunctiontabentry checkfunction_tab[] = {
        MAKECheckfunctiontabentry(gt_checkgreedyunitedist),
        MAKECheckfunctiontabentry(gt_checklinearspace),
        MAKECheckfunctiontabentry(gt_checklinearspace_local),
        MAKECheckfunctiontabentry(gt_checkaffinelinearspace),
        MAKECheckfunctiontabentry(gt_checkaffinelinearspace_local),
        MAKECheckfunctiontabentry(gt_checkdiagonalbandalign),
        MAKECheckfunctiontabentry(gt_checkdiagonalbandaffinealign)
      };

      if (cmppairwise.fasta)
      {
        gt_assert(gt_str_array_size(cmppairwise.files) == 3);
        cmppairwise.fastasequences0 = gt_str_array_new();
        cmppairwise.fastasequences1 = gt_str_array_new();

        reader0 = gt_fasta_reader_rec_new(gt_str_array_get_str(
                                                        cmppairwise.files,1UL));
        gt_fasta_reader_run(reader0, NULL, save_fastaentry,
                            NULL, cmppairwise.fastasequences0, err);
        reader1 = gt_fasta_reader_rec_new (gt_str_array_get_str(
                                                        cmppairwise.files,2UL));
        gt_fasta_reader_run(reader1, NULL, save_fastaentry,
                            NULL, cmppairwise.fastasequences1, err);
        gt_error_check(err);
      }
      for (idx = 0; idx < sizeof checkfunction_tab/sizeof checkfunction_tab[0];
           idx++)
      {
        GtUword testcases;

        printf("run %s\n",checkfunction_tab[idx].name);
        testcases
          = applycheckfunctiontosimpleoptions(checkfunction_tab[idx].function,
                                              &cmppairwise);
        printf("# number of testcases for %s: " GT_WU "\n",
               checkfunction_tab[idx].name,testcases);
      }
      gt_fasta_reader_delete(reader0);
      gt_fasta_reader_delete(reader1);
    }
  }
  freesimpleoption(&cmppairwise);
  if (oprval == GT_OPTION_PARSER_REQUESTS_EXIT)
  {
    return 0;
  }
  if (oprval == GT_OPTION_PARSER_ERROR)
  {
    return -1;
  }
  return 0;
}
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath = gt_str_get(arguments->querypath);
  GtStr* coarse_fname = gt_str_new_cstr("coarse_");
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch              *match;
    GtMatchIterator      *mp = NULL;
    GtNREncseq           *nrencseq = NULL;
    GtStr                *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition          *hits;
    double                eval,
                          raw_eval = 0.0;
    GtUword               coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int                   curr_hits = 0,
                          max_hits = 100;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);

    gt_str_append_cstr(fastaname, ".fas");

    for (i=0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /*extract sequences from compressed database*/
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }
    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(gt_str_get(fastaname),
                                                          err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(gt_str_get(fastaname),
                                                          err);
    }

    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            gt_match_delete(match);
            curr_hits++;
            if (curr_hits == max_hits) {
              HitPosition *hit_extention;
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hit_extention) * max_hits);
              for (i=max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          } else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
          }
        } else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }
    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;
      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);
      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      /* TODO DW do NOT extract complete uniques! these could be complete
         chromosomes!! just extract something around it? maybe +- max query
         length*/
      for (i = 0; i < curr_hits; i++) {
        gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                             hits[i].idx);
      }
      had_err =
        gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                           decomp,
                                                           &coarse_db_len,
                                                           err);
      gt_assert(coarse_db_len != 0);
      gt_file_delete(coarse_hits);
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err =
          gt_condenser_search_create_nucl_blastdb(gt_str_get(coarse_fname),
                                                  err);
      else
        had_err =
          gt_condenser_search_create_prot_blastdb(gt_str_get(coarse_fname),
                                                  err);
    }
    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            char *dbseqid = gt_malloc(sizeof (*dbseqid) * 50);
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
            gt_free(dbseqid);
          } else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);

    }
    if (!had_err)
      if (timer != NULL)
        gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i=0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}