예제 #1
0
파일: fmi-mkindex.c 프로젝트: 9beckert/TIR
/*
 * Parse the command line into a Mkfmcallinfo and, when parsing returns 0,
 * run the FM-index construction with a non-verbose logger writing to stdout.
 *
 * Returns -1 if option parsing returned a negative value or if
 * runmkfmindex() reported a negative result, 0 in every other case
 * (including a positive parse result, for which nothing is run).
 */
int gt_parseargsandcallmkfmindex(int argc,const char **argv,GtError *err)
{
  Mkfmcallinfo callinfo;
  int status = 0;
  const int parse_result = mkfmindexoptions(&callinfo,argc,argv,err);

  if (parse_result < 0)
  {
    status = -1;
  }
  else if (parse_result == 0)
  {
    GtLogger *log = gt_logger_new(false, GT_LOGGER_DEFLT_PREFIX, stdout);

    if (runmkfmindex(&callinfo,log,err) < 0)
    {
      status = -1;
    }
    gt_logger_delete(log);
  }
  /* the call info is released on every path, as in the parse-failure case */
  freemkfmcallinfo(&callinfo);
  return status;
}
예제 #2
0
/*
 * Load the grape-codons GFF3 test file through a CDS inference stream and
 * append the resulting feature nodes to `queue`.
 *
 * The features are collected into a temporary array, sorted with
 * agn_genome_node_compare, reversed, and then popped off the end of the
 * array one by one into the queue. A failed pull is reported on stderr;
 * whatever was collected up to that point is still queued.
 */
static void infer_cds_visitor_test_data(GtQueue *queue)
{
  const char *infile = "data/gff3/grape-codons.gff3";
  GtError *err = gt_error_new();
  GtNodeStream *input = gt_gff3_in_stream_new_unsorted(1, &infile);
  gt_gff3_in_stream_check_id_attributes((GtGFF3InStream *)input);
  gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream *)input);
  GtLogger *log = gt_logger_new(true, "", stderr);
  GtNodeStream *cds_stream = agn_infer_cds_stream_new(input, NULL, log);
  GtArray *collected = gt_array_new( sizeof(GtFeatureNode *) );
  GtNodeStream *sink = gt_array_out_stream_new(cds_stream, collected, err);

  if(gt_node_stream_pull(sink, err) == -1)
  {
    fprintf(stderr, "[AgnInferCDSVisitor::infer_cds_visitor_test_data] error "
            "processing features: %s\n", gt_error_get(err));
  }

  /* the pipeline is no longer needed once the array has been filled */
  gt_node_stream_delete(input);
  gt_node_stream_delete(cds_stream);
  gt_node_stream_delete(sink);
  gt_logger_delete(log);

  gt_array_sort(collected, (GtCompare)agn_genome_node_compare);
  gt_array_reverse(collected);
  /* drain the array from its end until nothing is left */
  while(gt_array_size(collected) != 0)
  {
    GtFeatureNode **slot = gt_array_pop(collected);
    gt_queue_add(queue, *slot);
  }

  gt_array_delete(collected);
  gt_error_delete(err);
}
예제 #3
0
/*
 * Encode the sequence files in `infiles` into the index `indexname`.
 *
 * An encoder is created from `opts`; if that fails, -1 is returned.
 * Otherwise the encoder gets a logger writing to stderr with prefix "# "
 * (enabled according to `verbose`), the esq header is disabled when
 * `esq_no_header` is set, and the result of gt_encseq_encoder_encode() is
 * returned. Encoder and logger are released on every path.
 */
static int encode_sequence_files(GtStrArray *infiles, GtEncseqOptions *opts,
                                 const char *indexname, bool verbose,
                                 bool esq_no_header,
                                 GtError *err)
{
  GtLogger *log;
  GtEncseqEncoder *encoder;
  int status;

  gt_error_check(err);
  gt_assert(infiles && gt_str_array_size(infiles) > 0 && opts);

  log = gt_logger_new(verbose, "# ", stderr);
  encoder = gt_encseq_encoder_new_from_options(opts, err);
  if (encoder == NULL) {
    status = -1;
  } else {
    gt_encseq_encoder_set_logger(encoder, log);
    if (esq_no_header)
      gt_encseq_encoder_disable_esq_header(encoder);
    status = gt_encseq_encoder_encode(encoder, infiles, indexname, err);
  }

  gt_encseq_encoder_delete(encoder);
  gt_logger_delete(log);
  return status;
}
예제 #4
0
/*
 * Unit test for the coverage and integrity calculations of the GAEVAL
 * visitor.
 *
 * A visitor is built from the alignments in the test GFF3 file; the same
 * file is then read again, filtered down to mRNA features, run through the
 * CDS and exon inference streams and collected into an array. Coverage and
 * integrity are computed for the two collected features and compared
 * against the expected values with a tolerance of 0.001; the outcome is
 * recorded on `test` under the label "calculate integrity".
 *
 * If pulling the features fails, the error is printed to stderr and the
 * function returns without recording a result. All resources are released
 * on that path as well (the original early return leaked them).
 */
static void gv_test_calc_integrity(AgnUnitTest *test)
{
  const char *filename = "data/gff3/gaeval-stream-unit-test-2.gff3";
  GtNodeStream *align_in = gt_gff3_in_stream_new_unsorted(1, &filename);
  AgnGaevalParams params = { 0.6, 0.3, 0.05, 0.05, 400, 200, 100 };
  GtNodeVisitor *nv = agn_gaeval_visitor_new(align_in, params);
  AgnGaevalVisitor *gv = gaeval_visitor_cast(nv);
  gt_node_stream_delete(align_in);

  GtNodeStream *gff3in = gt_gff3_in_stream_new_unsorted(1, &filename);
  GtHashmap *typestokeep = gt_hashmap_new(GT_HASH_STRING, NULL, NULL);
  gt_hashmap_add(typestokeep, "mRNA", "mRNA");
  GtNodeStream *filtstream = agn_filter_stream_new(gff3in, typestokeep);
  GtLogger *logger = gt_logger_new(true, "", stderr);
  GtNodeStream *ics = agn_infer_cds_stream_new(filtstream, NULL, logger);
  GtNodeStream *ies = agn_infer_exons_stream_new(ics, NULL, logger);

  GtError *error = gt_error_new();
  GtArray *feats = gt_array_new( sizeof(GtFeatureNode *) );
  GtNodeStream *featstream = gt_array_out_stream_new(ies, feats, error);
  int result = gt_node_stream_pull(featstream, error);
  if(result == -1)
  {
    fprintf(stderr, "[AgnGaevalVisitor::gv_test_calc_integrity] error "
            "processing GFF3: %s\n", gt_error_get(error));
  }

  /* The stream pipeline is finished with on both the success and the
     failure path, so tear it down before deciding how to continue. */
  gt_node_stream_delete(gff3in);
  gt_node_stream_delete(filtstream);
  gt_node_stream_delete(featstream);
  gt_node_stream_delete(ics);
  gt_node_stream_delete(ies);
  gt_logger_delete(logger);
  gt_hashmap_delete(typestokeep);

  if(result == -1)
  {
    /* Release whatever was collected before the failure; the success path
       below deletes the collected nodes the same way. */
    unsigned long i;
    for(i = 0; i < gt_array_size(feats); i++)
    {
      GtFeatureNode *fn = *(GtFeatureNode **)gt_array_get(feats, i);
      gt_genome_node_delete((GtGenomeNode *)fn);
    }
    gt_array_delete(feats);
    gt_error_delete(error);
    gt_node_visitor_delete(nv);
    return;
  }

  agn_assert(gt_array_size(feats) == 2);
  GtFeatureNode *g1 = *(GtFeatureNode **)gt_array_get(feats, 0);
  GtFeatureNode *g2 = *(GtFeatureNode **)gt_array_get(feats, 1);

  double cov1 = gaeval_visitor_calculate_coverage(gv,  g1, error);
  double cov2 = gaeval_visitor_calculate_coverage(gv,  g2, error);
  double int1 = gaeval_visitor_calculate_integrity(gv, g1, cov1, NULL, error);
  double int2 = gaeval_visitor_calculate_integrity(gv, g2, cov2, NULL, error);

  bool test1 = fabs(cov1 - 1.000) < 0.001 &&
               fabs(cov2 - 0.997) < 0.001 &&
               fabs(int1 - 0.850) < 0.001 &&
               fabs(int2 - 0.863) < 0.001;
  agn_unit_test_result(test, "calculate integrity", test1);

  gt_error_delete(error);
  gt_array_delete(feats);
  gt_genome_node_delete((GtGenomeNode *)g1);
  gt_genome_node_delete((GtGenomeNode *)g2);
  gt_node_visitor_delete(nv);
}
/*
 * Runner of the condenseq hmmsearch tool.
 *
 * Steps, each executed only while no error has occurred:
 *   1. derive the name of the tabular output file ("_tabout.tsv" suffix)
 *      and refuse to continue if it already exists and -force_ow was not
 *      given,
 *   2. read the condenseq archive,
 *   3. run the coarse hmmsearch,
 *   4. process the coarse hits.
 *
 * Returns 0 on success; otherwise the non-zero error value of the failing
 * step (-1 for the steps checked directly here), with the details in `err`.
 */
static int gt_condenseq_hmmsearch_runner(GT_UNUSED int argc,
                                         GT_UNUSED const char **argv,
                                         GT_UNUSED int parsed_args,
                                         void *tool_arguments,
                                         GtError *err)
{
  GtCondenseqHmmsearchArguments *arguments = tool_arguments;
  GtCondenseq *ces = NULL;
  GtStr *table_filename = NULL;
  GtLogger *logger = NULL;
  struct stat buf;
  int had_err = 0;

  /* validate the inputs before `arguments` is dereferenced for the first
     time (the logger creation below reads arguments->csa) */
  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(gt_condenseq_search_arguments_verbose(arguments->csa),
                         GT_LOGGER_DEFLT_PREFIX, stderr);

  table_filename = gt_condenseq_search_arguments_db_filename(arguments->csa,
                                                             "_tabout.tsv");
  /* an existing output table is only overwritten on explicit request */
  if (stat(gt_str_get(table_filename), &buf) == 0 && !arguments->force_ow) {
    gt_error_set(err, "file %s already exists, use option -force_ow to "
                 "overwrite or delete", gt_str_get(table_filename));
    had_err = -1;
  }

  if (!had_err) {
    ces = gt_condenseq_search_arguments_read_condenseq(arguments->csa,
                                                       logger, err);
    if (ces == NULL)
      had_err = -1;
  }
  if (!had_err) {
    had_err =
      hmmsearch_call_coarse_search(ces,
                                   gt_str_get(arguments->hmmsearch_path),
                                   gt_str_get(table_filename),
                                   gt_str_get(arguments->hmm),
                                   logger, err);
  }
  if (!had_err) {
    had_err = hmmsearch_process_coarse_hits(gt_str_get(table_filename),
                                            ces, arguments,
                                            logger, err);
  }

  gt_condenseq_delete(ces);
  gt_logger_delete(logger);
  gt_str_delete(table_filename);

  return had_err;
}
예제 #6
0
/*
 * Entry point of the index merging tool.
 *
 * Parses the options into `storeindex` and `indexnametab`, echoes them to
 * stdout and calls gt_performtheindexmerging() with a non-verbose logger.
 *
 * Returns 0 on success or when the option parser requests an exit, and -1
 * if option parsing or the merging fails. The option strings are released
 * on every path (the original returned early on a requested exit and
 * leaked both of them).
 */
int gt_mergeesa(int argc, const char **argv, GtError *err)
{
    GtStr *storeindex;
    GtStrArray *indexnametab;
    bool haserr = false;
    bool do_merge = true;
    int parsed_args;

    gt_error_check(err);

    storeindex = gt_str_new();
    indexnametab = gt_str_array_new();
    switch (parse_options(storeindex, indexnametab, &parsed_args, argc, argv,
                          err)) {
    case GT_OPTION_PARSER_OK:
        break;
    case GT_OPTION_PARSER_ERROR:
        haserr = true;
        break;
    case GT_OPTION_PARSER_REQUESTS_EXIT:
        /* nothing to do, but still fall through to the shared cleanup */
        do_merge = false;
        break;
    }
    if (!haserr && do_merge)
    {
        GtUword i;
        GtLogger *logger;

        printf("# storeindex=%s\n",gt_str_get(storeindex));
        for (i=0; i<gt_str_array_size(indexnametab); i++)
        {
            printf("# input=%s\n",gt_str_array_get(indexnametab,i));
        }
        logger = gt_logger_new(false, GT_LOGGER_DEFLT_PREFIX, stdout);
        if (gt_performtheindexmerging(storeindex,
                                      indexnametab,
                                      logger,
                                      err) != 0)
        {
            haserr = true;
        }
        gt_logger_delete(logger);
    }
    gt_str_delete(storeindex);
    gt_str_array_delete(indexnametab);
    return haserr ? -1 : 0;
}
예제 #7
0
/*
 * Runner of the spaced seed matching tool.
 *
 * In verbose mode the index type ("esa" or "pck"), the index name and all
 * query file names are echoed to stdout. The matching itself is delegated
 * to gt_matchspacedseed(); any non-zero result is mapped to -1, otherwise
 * 0 is returned.
 */
static int gt_cge_spacedseed_runner(GT_UNUSED int argc,
                                    GT_UNUSED const char **argv,
                                    GT_UNUSED int parsed_args,
                                    void *tool_arguments,
                                    GtError *err)
{
  Cge_spacedseed_options *opts = tool_arguments;
  GtLogger *log;
  int status;

  gt_assert(parsed_args == argc);
  /* the logger is created and released here but not handed to the matcher */
  log = gt_logger_new(opts->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);

  if (opts->verbose)
  {
    const char *indextype = opts->withesa ? "esa" : "pck";
    unsigned long qnum = 0;

    printf("# %sindex=%s\n", indextype, gt_str_get(opts->str_inputindex));
    while (qnum < gt_str_array_size(opts->queryfilenames))
    {
      printf("# queryfile=%s\n",
             gt_str_array_get(opts->queryfilenames, qnum));
      qnum++;
    }
  }

  status = 0;
  if (gt_matchspacedseed(opts->withesa,
                         opts->docompare,
                         gt_str_get(opts->str_inputindex),
                         opts->queryfilenames,
                         opts->verbose,
                         err) != 0)
  {
    status = -1;
  }

  gt_logger_delete(log);
  return status;
}
/*
 * Check a packed (BWT sequence) index against the suffix array of the same
 * project.
 *
 * After option parsing the BWT sequence is loaded and passed through
 * gt_BWTSeqVerifyIntegrity(). Then the suffix array (suffix table and
 * encoded sequence) is mapped and, for params.numOfSamples enumerated
 * patterns, the matches found through the packed index are compared with
 * the matches found by mmsearch on the suffix array: position by position
 * if the index carries locate information, by match count otherwise.
 *
 * Returns 0 on success or when the option parser requests an exit, and -1
 * on any error.
 *
 * The body is a single-pass do { } while (0) so that `break` jumps to the
 * shared cleanup code at the end of the function.
 */
extern int
gt_packedindex_chk_search(int argc, const char *argv[], GtError *err)
{
  struct chkSearchOptions params;
  Suffixarray suffixarray;
  Enumpatterniterator *epi = NULL;
  bool saIsLoaded = false;
  BWTSeq *bwtSeq = NULL;
  GtStr *inputProject = NULL;
  int parsedArgs;
  bool had_err = false;
  BWTSeqExactMatchesIterator EMIter;
  bool EMIterInitialized = false;
  GtLogger *logger = NULL;
  inputProject = gt_str_new();

  do {
    gt_error_check(err);
    {
      bool exitNow = false;
      switch (parseChkBWTOptions(&parsedArgs, argc, argv, &params,
                                 inputProject, err))
      {
      case GT_OPTION_PARSER_OK:
        break;
      case GT_OPTION_PARSER_ERROR:
        had_err = true;
        exitNow = true;
        break;
      case GT_OPTION_PARSER_REQUESTS_EXIT:
        exitNow = true;
        break;
      }
      if (exitNow)
        break;
    }
    gt_str_set(inputProject, argv[parsedArgs]);

    logger = gt_logger_new(params.verboseOutput,
                           GT_LOGGER_DEFLT_PREFIX, stdout);

    bwtSeq = gt_availBWTSeq(&params.idx.final, logger, err);
    if ((had_err = bwtSeq == NULL))
      break;

    {
      enum verifyBWTSeqErrCode retval =
        gt_BWTSeqVerifyIntegrity(bwtSeq, gt_str_get(inputProject), params.flags,
                              params.progressInterval, stderr, logger, err);
      if ((had_err = (retval != VERIFY_BWTSEQ_NO_ERROR)))
      {
        fprintf(stderr, "index integrity check failed: %s\n",
                gt_error_get(err));
        gt_error_set(err, "aborted because of index integrity check fail");
        break;
      }
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      if ((had_err = !gt_initEmptyEMIterator(&EMIter, bwtSeq)))
      {
        gt_error_set(err, "Cannot create matches iterator for sequence index.");
        break;
      }
      EMIterInitialized = true;
    }
    {
      unsigned long totalLen, dbstart;
      unsigned long trial, patternLen;

      if ((had_err =
           gt_mapsuffixarray(&suffixarray, SARR_SUFTAB | SARR_ESQTAB,
                             gt_str_get(inputProject), NULL, err) != 0))
      {
        gt_error_set(err, "Can't load suffix array project with"
                  " demand for encoded sequence and suffix table files\n");
        break;
      }
      totalLen = gt_encseq_total_length(suffixarray.encseq);
      saIsLoaded = true;
      if ((had_err = (params.minPatLen >= 0L && params.maxPatLen >= 0L
                      && params.minPatLen > params.maxPatLen)))
      {
        gt_error_set(err, "Invalid pattern lengths selected: min=%ld, max=%ld;"
                  " min <= max is required.", params.minPatLen,
                  params.maxPatLen);
        break;
      }
      /* negative pattern lengths mean "not set": derive them from the
         recommended prefix length for this alphabet and sequence length */
      if (params.minPatLen < 0 || params.maxPatLen < 0)
      {
        unsigned int numofchars
          = gt_alphabet_num_of_chars(
                               gt_encseq_alphabet(suffixarray.encseq));
        if (params.minPatLen < 0)
          params.minPatLen
            = gt_recommendedprefixlength(numofchars,
                                         totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true);
        if (params.maxPatLen < 0)
          params.maxPatLen =
            MAX(params.minPatLen,
                125 * gt_recommendedprefixlength(numofchars,totalLen,
                                         GT_RECOMMENDED_MULTIPLIER_DEFAULT,
                                         true)/100);
        else
          params.maxPatLen = MAX(params.maxPatLen, params.minPatLen);
      }
      /* NOTE(review): the same fields are printed with %ld in the error
         message above and with %lu here - confirm the declared type */
      fprintf(stderr, "Using patterns of lengths %lu to %lu\n",
              params.minPatLen, params.maxPatLen);
      if ((had_err = totalLen + 1 != BWTSeqLength(bwtSeq)))
      {
        gt_error_set(err, "base suffix array and index have diferrent lengths!"
                          "%lu vs. %lu",  totalLen + 1,
                  BWTSeqLength(bwtSeq));
        break;
      }
      if ((had_err =
           (epi = gt_newenumpatterniterator(params.minPatLen, params.maxPatLen,
                                         suffixarray.encseq,
                                         err)) == NULL))
      {
        fputs("Creation of pattern iterator failed!\n", stderr);
        break;
      }
      for (trial = 0; !had_err && trial < params.numOfSamples; ++trial)
      {
        const GtUchar *pptr = gt_nextEnumpatterniterator(&patternLen, epi);
        GtMMsearchiterator *mmsi =
          gt_mmsearchiterator_new_complete_olain(suffixarray.encseq,
                                            suffixarray.suftab,
                                            0,  /* leftbound */
                                            totalLen, /* rightbound */
                                            0, /* offset */
                                            suffixarray.readmode,
                                            pptr,
                                            patternLen);
        if (BWTSeqHasLocateInformation(bwtSeq))
        {
          if ((had_err = !gt_reinitEMIterator(&EMIter, bwtSeq, pptr, patternLen,
                                           false)))
          {
            fputs("Internal error: failed to reinitialize pattern match"
                  " iterator", stderr);
            abort();
          }
          gt_assert(gt_EMINumMatchesTotal(&EMIter) ==
                    gt_BWTSeqMatchCount(bwtSeq, pptr, patternLen,
                                        false));
          gt_assert(gt_EMINumMatchesTotal(&EMIter)
                      == gt_mmsearchiterator_count(mmsi));
          /* walk both match lists in lockstep; they must agree pairwise */
          while (gt_mmsearchiterator_next(&dbstart,mmsi))
          {
            unsigned long matchPos = 0;
            bool match = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = !match))
            {
              gt_error_set(err,
                           "matches of packedindex expired before mmsearch!");
              break;
            }
            if ((had_err = matchPos != dbstart))
            {
              gt_error_set(err, "packedindex match doesn't equal mmsearch "
                           "match result!\n%lu vs. %lu\n",
                           matchPos, dbstart);
              /* stop here: the next iteration would assign had_err again
                 and could silently discard this mismatch */
              break;
            }
          }
          if (!had_err)
          {
            unsigned long matchPos;
            bool trailingMatch = EMIGetNextMatch(&EMIter, &matchPos, bwtSeq);
            if ((had_err = trailingMatch))
            {
              gt_error_set(err, "matches of mmsearch expired before fmindex!");
              /* this break leaves the for loop and thereby skips the
                 regular delete at the end of the loop body, so release
                 the iterator here */
              gt_mmsearchiterator_delete(mmsi);
              mmsi = NULL;
              break;
            }
          }
        }
        else
        {
          /* without locate information only the match counts can be
             compared */
          unsigned long numFMIMatches = gt_BWTSeqMatchCount(bwtSeq, pptr,
                                                         patternLen,
                                                         false),
            numMMSearchMatches = gt_mmsearchiterator_count(mmsi);
          if ((had_err = numFMIMatches != numMMSearchMatches))
          {
            gt_error_set(err, "Number of matches not equal for suffix array ("
                              "%lu) and fmindex (%lu).\n",
                      numFMIMatches, numMMSearchMatches);
          }
        }
        gt_mmsearchiterator_delete(mmsi);
        mmsi = NULL;
        if (params.progressInterval && !((trial + 1) % params.progressInterval))
          putc('.', stderr);
      }
      if (params.progressInterval)
        putc('\n', stderr);
      fprintf(stderr, "Finished %lu of %lu matchings successfully.\n",
              trial, params.numOfSamples);
    }
  } while (0);
  /* shared cleanup; the flags record which resources were set up */
  if (EMIterInitialized) gt_destructEMIterator(&EMIter);
  if (saIsLoaded) gt_freesuffixarray(&suffixarray);
  gt_freeEnumpatterniterator(epi);
  if (bwtSeq) gt_deleteBWTSeq(bwtSeq);
  if (logger) gt_logger_delete(logger);
  if (inputProject) gt_str_delete(inputProject);
  return had_err?-1:0;
}
/*
 * Runner of the readjoiner assembly tool.
 *
 * Unless arguments->paths2seq is set, the readset is loaded, a string graph
 * is built (or loaded when arguments->load is set), optionally reduced
 * (redtrans), cleaned (errors) and saved (save), and then traversed to
 * spell the contig paths. In every case that did not fail, the contig paths
 * are finally converted to sequences by gt_readjoiner_assembly_paths2seq().
 *
 * Two loggers writing to stdout are used: `default_logger` (silenced by
 * arguments->quiet) and `verbose_logger` (enabled by arguments->verbose).
 *
 * Returns 0 on success, otherwise the non-zero value of the failing step.
 */
static int gt_readjoiner_assembly_runner(GT_UNUSED int argc,
    GT_UNUSED const char **argv, GT_UNUSED int parsed_args,
    void *tool_arguments, GtError *err)
{
  GtReadjoinerAssemblyArguments *arguments = tool_arguments;
  GtLogger *verbose_logger, *default_logger;
  GtEncseqLoader *el;
  GtEncseq *reads;
  GtTimer *timer = NULL;
  GtStrgraph *strgraph = NULL;
  GtBitsequence *contained = NULL;
  const char *readset;
  bool eqlen = true;
  GtUword nreads, tlen, rlen;
  int had_err = 0;

  gt_assert(arguments);
  gt_error_check(err);
  /* only read from `arguments` once it has been checked above */
  readset = gt_str_get(arguments->readset);
  default_logger = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX,
      stdout);
  gt_logger_log(default_logger,
      "gt readjoiner assembly (version "GT_READJOINER_VERSION")");
  verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX,
      stdout);
  gt_logger_log(verbose_logger, "verbose output activated");
  gt_logger_log(verbose_logger, "readset name = %s", readset);
  if (gt_showtime_enabled())
  {
    timer = gt_timer_new_with_progress_description(
        GT_READJOINER_ASSEMBLY_MSG_COUNTSPM);
    gt_timer_start(timer);
    gt_timer_show_cpu_time_by_progress(timer);
  }

  if (!arguments->paths2seq)
  {
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    reads = gt_encseq_loader_load(el, readset, err);
    if (reads == NULL)
    {
      had_err = -1;
    }
    if (had_err == 0)
    {
      eqlen = gt_encseq_accesstype_get(reads) == GT_ACCESS_TYPE_EQUALLENGTH;
      nreads = gt_encseq_num_of_sequences(reads);
      gt_logger_log(default_logger, "number of reads in filtered readset = "
                    GT_WU, nreads);
      tlen = gt_encseq_total_length(reads) - nreads + 1;
      gt_logger_log(verbose_logger, "total length of filtered readset = " GT_WU,
          tlen);

      if (eqlen)
      {
        /* with equal-length reads only the length is kept; the encoded
           sequence is released right away */
        rlen = gt_encseq_seqlength(reads, 0);
        gt_logger_log(verbose_logger, "read length = " GT_WU, rlen);
        gt_encseq_delete(reads);
        reads = NULL;
      }
      else
      {
        had_err = gt_readjoiner_assembly_build_contained_reads_list(
          arguments, &contained, err);
        rlen = 0;
        gt_logger_log(verbose_logger, "read length = variable");
        gt_assert(reads != NULL);
      }
    }

    if (had_err == 0)
    {
      if (!arguments->load)
      {
        had_err = gt_readjoiner_assembly_build_graph(arguments, &strgraph,
            reads, readset, eqlen, rlen, nreads, contained, default_logger,
            verbose_logger, timer, err);
      }
      else
      {
        gt_readjoiner_assembly_load_graph(&strgraph, reads, readset, rlen,
            default_logger, timer);
      }
    }

    /* variable-length reads are released here already unless the error
       correction step below is going to run */
    if (!eqlen && reads != NULL && !arguments->errors)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
        gt_strgraph_set_encseq(strgraph, NULL);
    }

    if (had_err == 0 && arguments->redtrans)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_REDTRANS,
            stdout);
      gt_strgraph_sort_edges_by_len(strgraph, false);
      (void)gt_strgraph_redtrans(strgraph, false);
      (void)gt_strgraph_redself(strgraph, false);
      (void)gt_strgraph_redwithrc(strgraph, false);
      gt_strgraph_log_stats(strgraph, verbose_logger);
    }

    if (had_err == 0 && arguments->errors)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_CLEANSG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_CLEANSG);
      had_err = gt_readjoiner_assembly_error_correction(strgraph,
          arguments->bubble, arguments->deadend, arguments->deadend_depth,
          verbose_logger);
    }

    if (had_err == 0 && arguments->save)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_SAVESG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_SAVESG);
      gt_strgraph_show(strgraph, GT_STRGRAPH_BIN,
          gt_str_get(arguments->readset), GT_READJOINER_SUFFIX_SG, false);
    }

    /* release the reads if they are still held at this point */
    if (!eqlen && reads != NULL)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
        gt_strgraph_set_encseq(strgraph, NULL);
    }

    if (had_err == 0)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG);
      gt_readjoiner_assembly_show_current_space("(before traversal)");
      gt_strgraph_spell(strgraph, (GtUword)arguments->depthcutoff,
          (GtUword)arguments->lengthcutoff, arguments->vd, readset,
          GT_READJOINER_SUFFIX_CONTIG_PATHS, NULL, true,
          arguments->show_contigs_info, false, verbose_logger);
    }

    if (contained != NULL)
      gt_free(contained);
    gt_strgraph_delete(strgraph);
    strgraph = NULL;
    gt_assert(reads == NULL);
    gt_encseq_loader_delete(el);
  }

  if (had_err == 0)
  {
    gt_readjoiner_assembly_show_current_space("(before paths2seq)");
    had_err = gt_readjoiner_assembly_paths2seq(readset,
        (GtUword)arguments->lengthcutoff, arguments->vd,
        arguments->astat, arguments->coverage, arguments->copynum,
        arguments->buffersize, default_logger, &timer, err);
  }

  if (gt_showtime_enabled())
  {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(default_logger);
  gt_logger_delete(verbose_logger);
  return had_err;
}
예제 #10
0
파일: gt_ltrdigest.c 프로젝트: 9beckert/TIR
/*
 * Runner of the LTRdigest tool.
 *
 * argv[parsed_args] is the GFF3 input file and argv[parsed_args+1] the name
 * of the encoded sequence index. The sequence is loaded, the set of tests
 * to run is assembled (PPT always, PBS if a tRNA library is given and, when
 * built with HMMER support, protein domains if HMM files are given), and
 * the features are pulled through this chain of streams:
 *
 *   GFF3 input -> LTRdigest -> [tabular file output] -> GFF3 output
 *
 * The tabular output stream is only attached if an output prefix was given.
 *
 * Returns 0 on success; -1 or the result of the stream pull otherwise.
 */
static int gt_ltrdigest_runner(GT_UNUSED int argc, const char **argv,
                               int parsed_args, void *tool_arguments,
                               GtError *err)
{
  GtLTRdigestOptions *arguments = tool_arguments;
  GtNodeStream *gff3_in_stream   = NULL,
               *gff3_out_stream  = NULL,
               *ltrdigest_stream = NULL,
               *tab_out_stream   = NULL,
               *last_stream      = NULL;
  int had_err      = 0,
      tests_to_run = 0,
      arg = parsed_args;
  const char *indexname = argv[arg+1];
  GtLogger *logger;
  GtEncseqLoader *el;
  GtEncseq *encseq;
  gt_error_check(err);
  gt_assert(arguments);

  /* created only after `arguments` has been checked */
  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stdout);

  /* Set sequence encoder options. Defaults are ok. */
  el = gt_encseq_loader_new();
  gt_encseq_loader_set_logger(el, logger);

  /* Open sequence file */
  encseq = gt_encseq_loader_load(el, indexname, err);
  if (!encseq)
    had_err = -1;

  /* Always search for PPT. */
  tests_to_run |= GT_LTRDIGEST_RUN_PPT;

  /* Open tRNA library if given. */
  if (!had_err && arguments->trna_lib
        && gt_str_length(arguments->trna_lib) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PBS;
    arguments->pbs_opts.trna_lib =
      gt_bioseq_new(gt_str_get(arguments->trna_lib), err);
    if (gt_error_is_set(err))
      had_err = -1;
  }

#ifdef HAVE_HMMER
  /* Open HMMER files if given. */
  if (!had_err && gt_str_array_size(arguments->pdom_opts.hmm_files) > 0)
  {
    tests_to_run |= GT_LTRDIGEST_RUN_PDOM;
    if (!strcmp(gt_str_get(arguments->cutoffs), "GA")) {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_GA;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "TC")) {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_TC;
    } else if (!strcmp(gt_str_get(arguments->cutoffs), "NONE")) {
      arguments->pdom_opts.cutoff = GT_PHMM_CUTOFF_NONE;
    } else {
      gt_error_set(err, "invalid cutoff setting!");
      had_err = -1;
    }
  }
#endif

  if (!had_err)
  {
    /* set up stream flow
     * ------------------*/
    last_stream = gff3_in_stream  = gt_gff3_in_stream_new_sorted(argv[arg]);

    last_stream = ltrdigest_stream = gt_ltrdigest_stream_new(last_stream,
                                                  tests_to_run,
                                                  encseq,
                                                  &arguments->pbs_opts,
                                                  &arguments->ppt_opts,
#ifdef HAVE_HMMER
                                                  &arguments->pdom_opts,
#endif
                                                  err);
    if (!ltrdigest_stream)
      had_err = -1;
  }

  if (!had_err)
  {
    /* attach tabular output stream, if requested */
    if (gt_str_length(arguments->prefix) > 0)
    {
      last_stream = tab_out_stream = gt_ltr_fileout_stream_new(last_stream,
                                              tests_to_run,
                                              encseq,
                                              gt_str_get(arguments->prefix),
                                              &arguments->ppt_opts,
                                              &arguments->pbs_opts,
#ifdef HAVE_HMMER
                                              &arguments->pdom_opts,
#endif
                                              gt_str_get(arguments->trna_lib),
                                              argv[arg+1],
                                              argv[arg],
                                              arguments->seqnamelen,
                                              err);
#ifdef HAVE_HMMER
      /* test the option values themselves; the original tested the
         addresses of the fields, which are never NULL, so both outputs
         were always enabled */
      if (arguments->pdom_opts.write_alignments)
        gt_ltr_fileout_stream_enable_pdom_alignment_output(tab_out_stream);
      if (arguments->pdom_opts.write_aaseqs)
        gt_ltr_fileout_stream_enable_aa_sequence_output(tab_out_stream);
#endif
    }

    last_stream = gff3_out_stream = gt_gff3_out_stream_new(last_stream,
                                                           arguments->outfp);

    /* pull the features through the stream and free them afterwards */
    had_err = gt_node_stream_pull(last_stream, err);
  }

  gt_node_stream_delete(gff3_out_stream);
  gt_node_stream_delete(ltrdigest_stream);
  if (tab_out_stream != NULL)
    gt_node_stream_delete(tab_out_stream);
  gt_node_stream_delete(gff3_in_stream);

  gt_encseq_loader_delete(el);
  gt_encseq_delete(encseq);
  encseq = NULL;
  gt_bioseq_delete(arguments->pbs_opts.trna_lib);
  gt_logger_delete(logger);

  return had_err;
}
/* Tool runner for the condenser search.

   Derives the name of the coarse hit file as "coarse_<basename>.fas", where
   <basename> is the base name of arguments->dbpath with its last suffix
   removed. If neither arguments->blastn nor arguments->blastp is set, nothing
   else is done. Otherwise the runner
     1. loads the compressed database from arguments->dbpath,
     2. (if the coarse or the fine E-value is undefined) scans the query file
        to derive a raw E-value from the average query length and
        arguments->bitscore,
     3. builds a BLAST db from "<dbpath>.fas" and runs the coarse search,
     4. extracts every unique sequence that was hit into the coarse hit file,
     5. builds a BLAST db from that file and runs the fine search,
     6. prints one tab separated line per fine hit to arguments->outfp.

   argc, argv and parsed_args are unused. Returns 0 on success and a negative
   value on failure, in which case <err> is expected to be set by the failing
   step. */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath;
  GtStr* coarse_fname = gt_str_new_cstr("coarse_");
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  querypath = gt_str_get(arguments->querypath);
  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file, so the search for the
     suffix separator starts at the second character */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch              *match;
    GtMatchIterator      *mp = NULL;
    GtNREncseq           *nrencseq = NULL;
    GtStr                *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition          *hits;
    double                eval,
                          raw_eval = 0.0;
    GtUword               coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int                   curr_hits = 0,
                          max_hits = 100;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);

    gt_str_append_cstr(fastaname, ".fas");

    /* every slot owns one pre-allocated range which is filled per hit */
    for (i=0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /*extract sequences from compressed database*/
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }
    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(gt_str_get(fastaname),
                                                          err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(gt_str_get(fastaname),
                                                          err);
    }

    /* coarse BLAST search against the fasta file of the compressed db */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          /* header is expected to be up to 6 non-space characters directly
             followed by the numeric id of the unique sequence */
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            gt_match_delete(match);
            curr_hits++;
            if (curr_hits == max_hits) {
              /* grow by 100 slots and give each new slot its own range */
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hits) * (size_t) max_hits);
              for (i=max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          } else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
            /* dbseqid was obtained from the match, so the match is released
               only after the message has been built */
            gt_match_delete(match);
          }
        } else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }
    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;
      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);
      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      if (coarse_hits == NULL) {
        /* output file could not be opened, do not attempt the extraction */
        had_err = -1;
      }
      else {
        /* TODO DW do NOT extract complete uniques! these could be complete
           chromosomes!! just extract something around it? maybe +- max query
           length*/
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* only a successful extraction must have produced output */
        gt_assert(had_err || coarse_db_len != 0);
        gt_file_delete(coarse_hits);
      }
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err =
          gt_condenser_search_create_nucl_blastdb(gt_str_get(coarse_fname),
                                                  err);
      else
        had_err =
          gt_condenser_search_create_prot_blastdb(gt_str_get(coarse_fname),
                                                  err);
    }
    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      /* without a user supplied fine E-value, scale the raw E-value by the
         length reported by the extraction step */
      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            /* columns: query id, db id, similarity, alignment length,
               query start/end, db start/end, E-value, bit score */
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          } else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);

    }
    if (!had_err)
      if (timer != NULL)
        gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i=0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}
예제 #12
0
/* Tool runner for condenseq compression.

   Loads the encoded sequence named by argv[parsed_args], fills in every
   parameter the user left undefined, validates the parameter combination and
   finally lets a GtCondenseqCreator build the compressed index named
   arguments->indexname (defaulting to the base name of the input).

   If arguments->kdb is set, a second logger is redirected to the file
   "kmer_db.out" in the current directory.

   Returns 0 on success, a negative value on failure with <err> set. */
static int gt_condenseq_compress_runner(GT_UNUSED int argc, const char **argv,
                                        int parsed_args, void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    if (kmer_fp == NULL)
      had_err = -1; /* never hand an unopened stream to the logger */
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  /* no index name given: fall back to the base name of the input */
  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* first pass: derive undefined values top down,
       initsize -> minalignlength -> windowsize */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                  arguments->initsize / (GtUword) 3UL :
                                  GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                              (unsigned int) (arguments->minalignlength / 5U) :
                              GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u",
                    size, arguments->kmersize);
    }

    /* second pass: whatever is still undefined is derived bottom up,
       kmersize -> windowsize -> minalignlength -> initsize */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }
  /* required ordering: kmersize < windowsize <= minalignlength <= initsize */
  if (!had_err &&
      arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c = gt_condenseq_creator_new(
                                       arguments->initsize,
                                       arguments->minalignlength,
                                       arguments->xdrop,
                                       &(arguments->scores),
                                       arguments->kmersize,
                                       arguments->windowsize,
                                       logger,
                                       err);
    if (ces_c == NULL) {
      had_err = -1;
    }
    else {
      /* cutoff: undefined selects the mean based cutoff, 0 disables it */
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      /* NOTE(review): pruning is disabled when arguments->prune is true;
         presumably the flag stores a "disable prune" option -- confirm
         against the option parser */
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* close only what was actually opened */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}
/* Tool runner for extracting sequence data from a condenseq archive.

   The archive is read from argv[parsed_args]. What gets written to
   arguments->outfp depends on the arguments:
   - a position range (arguments->range), or a sequence range combined with
     mode "concat", is written as one stream of characters in which sequences
     are joined by the first character of arguments->sepchar and lines are
     broken after arguments->width characters (no breaking if width is 0);
   - otherwise the selected sequences (arguments->seq, arguments->seqrange or,
     if nothing was given, all of them) are written as fasta entries.

   Returns 0 on success, -1 with <err> set on failure. */
static int gt_condenseq_extract_runner(GT_UNUSED int argc,
                                       const char **argv,
                                       int parsed_args,
                                       void *tool_arguments,
                                       GtError *err)
{
  GtCondenserExtractArguments *arguments = tool_arguments;
  GtCondenseq *condenseq = NULL;
  GtLogger *logger = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  condenseq = gt_condenseq_new_from_file(argv[parsed_args], logger, err);
  if (condenseq == NULL)
    had_err = -1;

  if (!had_err) {
    const GtUword total_len = gt_condenseq_total_length(condenseq),
                  num_seqs = gt_condenseq_num_of_sequences(condenseq);
    const bool concat = (strcmp(gt_str_get(arguments->mode), "concat") == 0);

    /* a single sequence is handled as a sequence range of length one */
    if (arguments->seq != GT_UNDEF_UWORD) {
      arguments->seqrange.start = arguments->seq;
      arguments->seqrange.end = arguments->seq;
    }
    /* neither positions nor sequences selected: take every sequence */
    if (arguments->range.start == GT_UNDEF_UWORD &&
        arguments->seqrange.start == GT_UNDEF_UWORD) {
      arguments->seqrange.start = 0;
      arguments->seqrange.end = num_seqs - 1;
    }
    /* concat mode works on positions, so translate the sequence range */
    if (concat && arguments->seqrange.start != GT_UNDEF_UWORD) {
      if (arguments->seqrange.end < num_seqs) {
        arguments->range.start =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.start);
        arguments->range.end =
          gt_condenseq_seqstartpos(condenseq, arguments->seqrange.end) +
          gt_condenseq_seqlength(condenseq, arguments->seqrange.end) - 1;
      }
      else {
        had_err = -1;
        gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                     GT_WU " (ranges are zero based sequence ids)",
                     arguments->seqrange.end, num_seqs);
      }
    }

    if (!had_err) {
      if (arguments->range.start != GT_UNDEF_UWORD) {
        /* position based extraction, done in chunks */
        const GtUword chunk_limit = ((GtUword) 1) << 17;
        const char sepchar = gt_str_get(arguments->sepchar)[0];

        if (arguments->range.end >= total_len) {
          had_err = -1;
          gt_error_set(err, "range end " GT_WU " excedes length of sequence "
                       GT_WU " (ranges are zero based positions)",
                       arguments->range.end, total_len);
        }
        else {
          const GtUword last_pos = arguments->range.end;
          GtUword pos = arguments->range.start,
                  column = 0;

          while (pos <= last_pos) {
            GtRange chunk;
            GtUword chunk_len,
                    k = 0;
            const char *decoded;

            if (last_pos - pos > chunk_limit) {
              /* too much for one chunk: stop at the separator in front of
                 the sequence that contains pos + chunk_limit */
              GtUword seqnum = gt_condenseq_pos2seqnum(condenseq,
                                                       pos + chunk_limit),
                      closest_sep = gt_condenseq_seqstartpos(condenseq,
                                                             seqnum) - 1;
              gt_assert(closest_sep > pos);
              chunk_len = closest_sep - pos + 1;
            }
            else {
              chunk_len = last_pos - pos + 1;
            }

            chunk.start = pos;
            chunk.end = pos + chunk_len - 1;
            decoded = gt_condenseq_extract_decoded_range(condenseq, chunk,
                                                         sepchar);
            gt_assert(decoded != NULL);

            while (k < chunk_len) {
              /* the column counter runs across chunk borders */
              if (arguments->width && column == arguments->width) {
                gt_file_xfputc('\n', arguments->outfp);
                column = 0;
              }
              gt_file_xfputc(decoded[k], arguments->outfp);
              k++;
              column++;
            }
            pos += chunk_len;
          }
          gt_file_xfputc('\n', arguments->outfp);
        }
      }
      else {
        /* sequence based extraction, output is always fasta */
        if (arguments->seqrange.end >= num_seqs) {
          had_err = -1;
          gt_error_set(err, "range end " GT_WU " excedes number of sequences "
                       GT_WU " (ranges are zero based sequence ids)",
                       arguments->seqrange.end, num_seqs);
        }
        else {
          const GtUword last_seq = arguments->seqrange.end;
          GtUword seqnum;

          for (seqnum = arguments->seqrange.start;
               seqnum <= last_seq;
               ++seqnum) {
            GtUword seqlen,
                    desclen;
            const char *sequence = gt_condenseq_extract_decoded(condenseq,
                                                                &seqlen,
                                                                seqnum);
            const char *header = gt_condenseq_description(condenseq, &desclen,
                                                          seqnum);
            gt_fasta_show_entry_nt(header, desclen,
                                   sequence, seqlen,
                                   arguments->width,
                                   arguments->outfp);
          }
        }
      }
    }
  }
  gt_condenseq_delete(condenseq);
  gt_logger_delete(logger);
  return had_err;
}
예제 #14
0
/* Tool runner for genomediff.

   All arguments from argv[parsed_args] onwards are taken as input file names.
   With more than one file the files are encoded into arguments->indexname
   first (DNA input is enforced). With a single name, that name is used as
   index name and, if an existing esa/pck index is to be used, the project
   file of the index is scanned for a "mirrored=1" entry so that the encoded
   sequence can be loaded mirrored as well.

   Afterwards the shustring length sums are obtained, either from the
   existing index or by running the suffixerator, and handed to the KR
   calculation.

   Returns 0 on success, a negative value on failure with <err> set. */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        gt_timer_start(timer);
        gt_assert(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;
            const char *buffer;
            char **elements = NULL;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            /* scan the "key=value" lines of the project file */
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                buffer = gt_str_get(current_line);
                /* release the pieces of the previous line */
                if (elements != NULL) {
                    gt_free(elements[0]);
                    gt_free(elements[1]);
                }
                gt_free(elements);
                elements = gt_cstr_split(buffer, '=');
                gt_log_log("%s", elements[0]);
                /* a "mirrored" line without '=' has no value to look at */
                if (strcmp("mirrored", elements[0]) == 0 &&
                        elements[1] != NULL) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            if (elements != NULL) {
                gt_free(elements[0]);
                gt_free(elements[1]);
            }
            gt_free(elements);
            /* nothing to close if the project file could not be opened */
            if (prj_fp != NULL)
                gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            /* sums are computed from the already existing index */
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            /* no index available: let the suffixerator fill the table */
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (!had_err && shusums != NULL) {
            had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                            arguments->with_pck, logger, timer, err);
        }
        /* the table is released on the error path as well */
        if (shusums != NULL) {
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
예제 #15
0
// Main method
int main(int argc, char * const *argv)
{
  GtError *error;
  GtLogger *logger;
  GtQueue *streams;
  GtNodeStream *stream, *last_stream;
  CanonGFF3Options options = { NULL, NULL, false };

  gt_lib_init();
  error = gt_error_new();
  canon_gff3_parse_options(argc, argv + 0, &options, error);

  streams = gt_queue_new();
  logger = gt_logger_new(true, "", stderr);

  stream = gt_gff3_in_stream_new_unsorted(argc - optind, (const char **)
                                                          argv+optind);
  gt_gff3_in_stream_check_id_attributes((GtGFF3InStream *)stream);
  gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream *)stream);
  gt_queue_add(streams, stream);
  last_stream = stream;

  if(options.infer)
  {
    GtHashmap *type_parents = gt_hashmap_new(GT_HASH_STRING, gt_free_func,
                                             gt_free_func);
    gt_hashmap_add(type_parents, gt_cstr_dup("mRNA"), gt_cstr_dup("gene"));
    gt_hashmap_add(type_parents, gt_cstr_dup("tRNA"), gt_cstr_dup("gene"));
    stream = agn_infer_parent_stream_new(last_stream,
                                                 type_parents);
    gt_hashmap_delete(type_parents);
    gt_queue_add(streams, stream);
    last_stream = stream;
  }

  stream = agn_gene_stream_new(last_stream, logger);
  gt_queue_add(streams, stream);
  last_stream = stream;

  if(options.source != NULL)
  {
    GtNodeVisitor *ssv = gt_set_source_visitor_new(options.source);
    stream = gt_visitor_stream_new(last_stream, ssv);
    gt_queue_add(streams, stream);
    last_stream = stream;
  }

  stream = gt_gff3_out_stream_new(last_stream, options.outstream);
  if(!options.infer)
    gt_gff3_out_stream_retain_id_attributes((GtGFF3OutStream *)stream);
  gt_queue_add(streams, stream);
  last_stream = stream;

  if(gt_node_stream_pull(last_stream, error) == -1)
  {
    fprintf(stderr, "[CanonGFF3] error processing node stream: %s",
            gt_error_get(error));
  }

  while(gt_queue_size(streams) > 0)
  {
    stream = gt_queue_get(streams);
    gt_node_stream_delete(stream);
  }
  gt_queue_delete(streams);
  if(options.source != NULL)
    gt_str_delete(options.source);
  if(options.outstream != NULL)
    gt_file_delete(options.outstream);
  gt_error_delete(error);
  gt_logger_delete(logger);
  gt_lib_clean();

  return 0;
}
예제 #16
0
static int gt_encseq2spm_runner(GT_UNUSED int argc,
                                GT_UNUSED const char **argv,
                                GT_UNUSED int parsed_args,
                                void *tool_arguments,
                                GtError *err)
{
    GtEncseq2spmArguments *arguments = tool_arguments;
    GtEncseqLoader *el = NULL;
    GtEncseq *encseq = NULL;
    bool haserr = false;

    gt_error_check(err);
    gt_assert(arguments);
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    encseq = gt_encseq_loader_load(el, gt_str_get(arguments->encseqinput),
                                   err);
    if (encseq == NULL)
    {
        haserr = true;
    }
    if (!haserr)
    {
        if (arguments->singlestrand)
        {
            gt_error_set(err,"option -singlestand is not implemented");
            haserr = true;
        } else
        {
            if (gt_encseq_mirror(encseq, err) != 0)
            {
                haserr = true;
            }
        }
    }

    if (!haserr && arguments->singlescan > 0)
    {
        GtTimer *timer = NULL;

        if (gt_showtime_enabled())
        {
            char *outmsg;

            switch (arguments->singlescan)
            {
            case 1:
                outmsg = "to run fast scanning";
                break;
            case 2:
                outmsg = "to run fast scanning with check";
                break;
            case 3:
                outmsg = "to run fast scanning with output";
                break;
            case 4:
                outmsg = "to run old scanning code";
                break;
            default:
                gt_error_set(err,"argument %u to option -singlescan not allowed",
                             arguments->singlescan);
                haserr = true;
            }
            if (!haserr)
            {
                timer = gt_timer_new_with_progress_description(outmsg);
                gt_timer_start(timer);
            }
        }
        if (!haserr)
        {
            unsigned int kmersize = 0;
            haserr = gt_encseq2spm_kmersize(arguments, &kmersize, err);
            if (!haserr)
            {
                if (arguments->singlescan == 4U)
                {
                    gt_rungetencseqkmers(encseq,kmersize);
                } else
                {
                    if (arguments->singlescan > 0)
                    {
                        gt_firstcode_runkmerscan(encseq,arguments->singlescan - 1,kmersize,
                                                 arguments->minmatchlength);
                    }
                }
            }
        }
        if (timer != NULL)
        {
            gt_timer_show_progress_final(timer, stdout);
            gt_timer_delete(timer);
        }
    }
    if (!haserr && arguments->singlescan == 0)
    {
        GtLogger *logger;
        const GtReadmode readmode = GT_READMODE_FORWARD;
        GtBUstate_spmsk **spmsk_states = NULL;
        unsigned int kmersize, threadcount;

#ifdef GT_THREADS_ENABLED
        const unsigned int threads = gt_jobs;
#else
        const unsigned int threads = 1U;
#endif

        if (arguments->countspms || arguments->outputspms)
        {
            spmsk_states = gt_malloc(sizeof (*spmsk_states) * threads);
            for (threadcount = 0; threadcount < threads; threadcount++)
            {
                spmsk_states[threadcount]
                    = gt_spmsk_inl_new(encseq,
                                       readmode,
                                       (unsigned long) arguments->minmatchlength,
                                       arguments->countspms,
                                       arguments->outputspms,
                                       gt_str_get(arguments->encseqinput));
            }
        }
        logger = gt_logger_new(arguments->verbose,GT_LOGGER_DEFLT_PREFIX, stdout);
        haserr = gt_encseq2spm_kmersize(arguments, &kmersize, err);
        if (!haserr)
        {
            if (storefirstcodes_getencseqkmers_twobitencoding(encseq,
                    kmersize,
                    arguments->numofparts,
                    arguments->maximumspace,
                    arguments->minmatchlength,
                    /* use false */  arguments->checksuftab,
                    /* use false */  arguments->onlyaccum,
                    /* use false */  arguments->
                    onlyallfirstcodes,
                    /* use 5U */     arguments->
                    addbscache_depth,
                    /* specify the extra space needed for
                       the function processing the interval */
                    arguments->phase2extra,
                    /* use true */   arguments->radixlarge ?
                    false : true,
                    /* use 2 without threads and
                       use 1 with threads */
                    arguments->radixparts,
                    spmsk_states != NULL
                    ? gt_spmsk_inl_process
                    : NULL,
                    gt_spmsk_inl_process_end,
                    spmsk_states,
                    logger,
                    err) != 0)
            {
                haserr = true;
            }
        }
        if (spmsk_states != NULL)
        {
            unsigned long countmatches = 0;

            for (threadcount = 0; threadcount < threads; threadcount++)
            {
                countmatches += gt_spmsk_inl_delete(spmsk_states[threadcount]);
            }
            if (arguments->countspms)
            {
                printf("number of suffix-prefix matches=%lu\n",countmatches);
            }
            gt_free(spmsk_states);
        }
        gt_logger_delete(logger);
    }
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(el);
    return haserr ? -1 : 0;
}
/*
 * Runner of the kmer database tool.
 *
 * Loads the encoded sequence named by argv[parsed_args], walks over it in
 * randomly sized intervals and feeds the kmers either into a GtKmerDatabase
 * (<db>, filled interval-wise) or, with <use_hash>, into a hashmap. Unless
 * <merge_only>, <use_hash> or <bench> is set, a second database (<compare_db>)
 * is filled kmer by kmer and compared against <db> at the end. With <bench>,
 * the consistency checks are skipped and a fixed number of random lookups is
 * timed afterwards.
 *
 * Returns 0 on success and a non-zero value on failure, in which case <err>
 * has been set by the failing call.
 */
static int gt_kmer_database_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args, void *tool_arguments,
                                   GtError *err)
{
  GtKmerDatabaseArguments *arguments = tool_arguments;
  int had_err = 0;
  /* initialised so that the unconditional cleanup below is always safe */
  GtEncseq       *es = NULL;
  GtUword        es_length = 0,
                 nu_kmer_codes = 0;
  GtKmerDatabase *compare_db = NULL,
                 *db = NULL;
  GtLogger *logger;
  FILE *fp = NULL;
  GtHashmap *kmer_hash = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->use_hash)
    kmer_hash = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                               (GtFree) gt_kmer_database_delete_hash_value);
  if (arguments->bench)
    timer = gt_timer_new_with_progress_description("loading encoded sequence");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  /* optionally redirect the verbose output into a file */
  if (arguments->verbose && gt_str_length(arguments->print_filename) > 0UL) {
    fp = gt_fa_fopen(gt_str_get(arguments->print_filename), "w", err);
    if (fp == NULL)
      had_err = -1; /* do not hand a NULL stream to the logger */
    else
      gt_logger_set_target(logger, fp);
  }

  /* load the encoded sequence */
  if (!had_err) {
    GtEncseqLoader *es_l;
    if (arguments->bench)
      gt_timer_start(timer);
    es_l = gt_encseq_loader_new();
    es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->bench)
      gt_timer_show_progress(timer, "saving kmers (+iterating over file)",
                             stdout);
    if (es == NULL) {
      had_err = -1;
    }
    gt_encseq_loader_delete(es_l);
  }
  if (!had_err) {
    es_length = gt_encseq_total_length(es);
    if (es_length < (GtUword) arguments->kmersize) {
      gt_error_set(err, "Input is too short for used kmersize. File length: "
                   GT_WU " kmersize: %u", es_length, arguments->kmersize);
      had_err = -1;
    }
  }

  /* set up the database(s) */
  if (!had_err) {
    GtAlphabet *alphabet;
    alphabet = gt_encseq_alphabet(es);
    /* the number of possible kmer codes is only needed for the benchmark */
    if (arguments->bench) {
      nu_kmer_codes = gt_power_for_small_exponents(
                                            gt_alphabet_num_of_chars(alphabet),
                                            arguments->kmersize);
    }
    if (!arguments->merge_only && !arguments->use_hash && !arguments->bench) {
      compare_db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize, arguments->sb_size, es);
    }
    if (!arguments->use_hash) {
      db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize,
                                arguments->sb_size, es);
      if (arguments->cutoff) {
        if (arguments->mean_cutoff)
          gt_kmer_database_use_mean_cutoff(db, (GtUword) 2,
                                           arguments->cutoff_value);
        else
          gt_kmer_database_set_cutoff(db, arguments->cutoff_value);
        if (!arguments->prune)
          gt_kmer_database_set_prune(db);
      }
    }
  }

  /* fill the database(s) interval by interval */
  if (!had_err) {
    GtUword startpos = 0,
            endpos;
    GtKmercodeiterator *iter;
    const GtKmercode *kmercode = NULL;
    iter = gt_kmercodeiterator_encseq_new(es, GT_READMODE_FORWARD,
                                          arguments->kmersize, 0);
    while (!had_err && startpos < es_length - (arguments->kmersize - 1)) {
      GtUword startpos_add_kmer = startpos;
      if (arguments->merge_only) {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max((arguments->sb_size - 1) * 2));
        if (endpos > es_length)
          endpos = es_length;
      }
      else {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max(arguments->sb_size - 1));
      }
      gt_kmercodeiterator_reset(iter, GT_READMODE_FORWARD, startpos);
      while ((kmercode = gt_kmercodeiterator_encseq_next(iter)) != NULL &&
             startpos_add_kmer <= endpos - (arguments->kmersize - 1)) {
        if (!arguments->merge_only && !arguments->use_hash &&
            !kmercode->definedspecialposition && !arguments->bench) {
          gt_kmer_database_add_kmer(compare_db, kmercode->code,
                                    startpos_add_kmer);
        }
        if (arguments->use_hash && !kmercode->definedspecialposition) {
          gt_kmer_database_add_to_hash(kmer_hash, kmercode->code,
                                       startpos_add_kmer);
        }
        startpos_add_kmer++;
      }
      if (!arguments->use_hash) {
        gt_kmer_database_add_interval(db, startpos, endpos);
        gt_kmer_database_print_buffer(db, logger);
        if (!arguments->bench)
          had_err = gt_kmer_database_check_consistency(db, err);
      }
      startpos = endpos + 1;
    }
    if (!arguments->use_hash) {
      gt_kmer_database_flush(db);
      gt_kmer_database_print_buffer(db, logger);
      if (!had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(db, err);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(compare_db, err);
      if (!arguments->merge_only && !arguments->bench)
        gt_kmer_database_print(compare_db, logger, true);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_compare(compare_db, db, err);
      gt_kmer_database_print(db, logger, true);
    }
    gt_kmercodeiterator_delete(iter);
  }

  if (arguments->bench) {
    /* The lookups need a filled database/hash and a valid nu_kmer_codes, so
       they are skipped after an error; the timer is released either way. */
    if (!had_err) {
      GtKmerStartpos pos;
      GtArrayGtUword *pos_hash;
      GtUword rand_access = (GtUword) 50000000,
              rand_code,
              i,
              sum = 0;
      gt_timer_show_progress(timer, "random access", stdout);
      for (i = 0; i < rand_access; i++) {
        rand_code = gt_rand_max(nu_kmer_codes - 1);
        if (arguments->use_hash) {
          pos_hash = gt_hashmap_get(kmer_hash, (const void *) rand_code);
          if (pos_hash != NULL)
            sum += pos_hash->spaceGtUword[pos_hash->nextfreeGtUword - 1];
        }
        else {
          pos = gt_kmer_database_get_startpos(db, rand_code);
          if (pos.no_positions > 0)
            sum += pos.startpos[pos.no_positions - 1];
        }
      }
      /* printing the sum keeps the lookups from being optimised away */
      printf("sum: " GT_WU "\n", sum);

      gt_timer_show_progress(timer, "", stdout);
      gt_timer_stop(timer);
    }
    gt_timer_delete(timer);
  }
  if (arguments->use_hash)
    gt_hashmap_delete(kmer_hash);
  gt_encseq_delete(es);
  if (!arguments->use_hash)
    gt_kmer_database_delete(db);
  if (!arguments->merge_only && !arguments->bench)
    gt_kmer_database_delete(compare_db);
  gt_logger_delete(logger);
  gt_fa_fclose(fp);

  return had_err;
}
예제 #18
0
/*
 * Runner of the gdiffcalc tool.
 *
 * Loads the encoded sequence <arguments->indexname>, builds the unit (genome)
 * information for it, reads a ';'-separated table of values from the first
 * file in <arguments->filenames> into a num_of_genomes x num_of_genomes
 * matrix and passes that matrix to gt_genomediff_calculate_div_from_avg().
 * Table elements starting with '#' are logged as names and not stored.
 *
 * Returns 0 on success and a non-zero value on failure; <err> is set for
 * failures detected here.
 */
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq              *encseq = NULL;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  /* all remaining command line arguments are file names */
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          /* the table comes from a file: never write outside the matrix */
          if (lcounter >= unit_info->num_of_genomes ||
              zcounter >= unit_info->num_of_genomes) {
            gt_error_set(err, "table has more rows or columns than the "
                         GT_WU " genomes of the index",
                         (GtUword) unit_info->num_of_genomes);
            had_err = -1;
            break;
          }
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line "GT_WU"", lcounter);
    }
    /* a token fetched right before leaving the loop on error is still ours */
    if (line != NULL)
      gt_str_delete(line);
    /* NOTE(review): per the GenomeTools tokenizer API, the tokenizer owns the
       GtIO passed to gt_tokenizer_new() and releases it here -- confirm */
    gt_tokenizer_delete(tokenizer);
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    /* debug output: which sequence belongs to which file/genome */
    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
  }
  /* release the matrix on the error paths as well */
  if (shusums != NULL) {
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}
예제 #19
0
/*
 * Print the values stored in <so> as "key=value" lines. All lines go through
 * a forced logger writing to stdout, which is created and deleted here.
 */
static void showoptions(const Suffixeratoroptions *so)
{
  GtUword fileidx, numofdbfiles;
  Sfxstrategy strategy;
  unsigned int prefixlength;
  const char *tis, *suf, *lcp, *bwt, *bck, *des, *sds, *ssp, *kys;
  GtLogger *out = gt_logger_new(true, GT_LOGGER_DEFLT_PREFIX, stdout);

  /* sequence encoding options */
  if (gt_str_length(gt_encseq_options_smap_value(so->encopts)) > 0)
  {
    gt_logger_log_force(out, "smap=\"%s\"",
                        gt_str_get(gt_encseq_options_smap_value(so->encopts)));
  }
  if (gt_encseq_options_dna_value(so->encopts))
  {
    gt_logger_log_force(out, "dna=yes");
  }
  if (gt_encseq_options_protein_value(so->encopts))
  {
    gt_logger_log_force(out, "protein=yes");
  }
  if (gt_encseq_options_plain_value(so->encopts))
  {
    gt_logger_log_force(out, "plain=yes");
  }
  gt_logger_log_force(out, "indexname=\"%s\"", gt_str_get(so->indexname));

  /* prefix length and sorting strategy */
  prefixlength = gt_index_options_prefixlength_value(so->idxopts);
  if (prefixlength != GT_PREFIXLENGTH_AUTOMATIC)
  {
    gt_logger_log_force(out, "prefixlength=%u", prefixlength);
  } else
  {
    gt_logger_log_force(out, "prefixlength=automatic");
  }
  strategy = gt_index_options_sfxstrategy_value(so->idxopts);
  gt_logger_log_force(out, "storespecialcodes=%s",
                      strategy.storespecialcodes ? "true" : "false");

  /* input files and index names */
  numofdbfiles = gt_str_array_size(so->db);
  for (fileidx = 0; fileidx < numofdbfiles; fileidx++)
  {
    gt_logger_log_force(out, "inputfile["GT_WU"]=%s", fileidx,
                        gt_str_array_get(so->db, fileidx));
  }
  if (gt_str_length(so->inputindex) > 0)
  {
    gt_logger_log_force(out, "inputindex=%s", gt_str_get(so->inputindex));
  }
  gt_assert(gt_str_length(so->indexname) > 0);
  gt_logger_log_force(out, "indexname=%s", gt_str_get(so->indexname));

  /* which tables are written */
  tis = gt_encseq_options_tis_value(so->encopts) ? "true" : "false";
  suf = gt_index_options_outsuftab_value(so->idxopts) ? "true" : "false";
  lcp = gt_index_options_outlcptab_value(so->idxopts) ? "true" : "false";
  bwt = gt_index_options_outbwttab_value(so->idxopts) ? "true" : "false";
  bck = gt_index_options_outbcktab_value(so->idxopts) ? "true" : "false";
  des = gt_encseq_options_des_value(so->encopts) ? "true" : "false";
  sds = gt_encseq_options_sds_value(so->encopts) ? "true" : "false";
  ssp = gt_encseq_options_ssp_value(so->encopts) ? "true" : "false";
  if (!gt_index_options_outkystab_value(so->idxopts))
  {
    kys = "false";
  } else if (gt_index_options_outkyssort_value(so->idxopts))
  {
    kys = "true with sort";
  } else
  {
    kys = "true";
  }
  gt_logger_log_force(out, "outtistab=%s,outsuftab=%s,outlcptab=%s,"
                           "outbwttab=%s,outbcktab=%s,outdestab=%s,"
                           "outsdstab=%s,outssptab=%s,outkystab=%s",
                      tis, suf, lcp, bwt, bck, des, sds, ssp, kys);

  /* space limit or number of parts, followed by the strategy parameters */
  if (gt_index_options_maximumspace_value(so->idxopts) > 0)
  {
    gt_assert(gt_index_options_numofparts_value(so->idxopts) == 1U);
    gt_logger_log_force(out, "maximumspace=%.0f MB",
            GT_MEGABYTES(gt_index_options_maximumspace_value(so->idxopts)));
  } else
  {
    gt_logger_log_force(out, "parts=%u",
                        gt_index_options_numofparts_value(so->idxopts));
  }
  gt_logger_log_force(out, "maxinsertionsort="GT_WU"",
                      strategy.maxinsertionsort);
  gt_logger_log_force(out, "maxbltriesort="GT_WU"",
                      strategy.maxbltriesort);
  gt_logger_log_force(out, "maxcountingsort="GT_WU"",
                      strategy.maxcountingsort);
  gt_logger_log_force(out, "lcpdist=%s",
                      gt_index_options_lcpdist_value(so->idxopts) ? "true"
                                                                  : "false");
  gt_logger_delete(out);
}
예제 #20
0
/*
 * Runner shared by the matching statistics / unique substring tools.
 *
 * Maps the index selected by <arguments->indextype> (FM-index, enhanced
 * suffix array or packed index), chooses the greedy forward matching function
 * for that index type and for <arguments->doms>, and runs it over the query
 * files via gt_findsubquerygmatchforward(). The mapped index is released
 * before returning.
 *
 * Returns 0 on success and -1 on failure.
 */
static int gt_matstat_runner(GT_UNUSED int argc, GT_UNUSED const char **argv,
                             GT_UNUSED int parsed_args,
                             void *tool_arguments, GtError *err)
{
  Gfmsubcallinfo *arguments = tool_arguments;
  Fmindex fmindex;
  Suffixarray suffixarray;
  void *packedindex = NULL;
  GtLogger *logger = NULL;
  bool haserr = false;
  const GtAlphabet *alphabet = NULL;
#ifdef WITHBCKTAB
  unsigned int prefixlength = 0;
#endif
  GtUword totallength;
  bool gt_mapfmindexfail = false;
  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(false, GT_LOGGER_DEFLT_PREFIX, stdout);
  if (arguments->indextype == Fmindextype)
  {
    if (gt_mapfmindex(&fmindex,gt_str_get(arguments->indexname),
                      logger, err) != 0)
    {
      haserr = true;
      gt_mapfmindexfail = true;
      /* do not read from an index that could not be mapped */
      totallength = 0;
    } else
    {
      alphabet = fmindex.alphabet;
      totallength = fmindex.bwtlength-1;
    }
  } else
  {
    unsigned int mappedbits;

    if (arguments->indextype == Esaindextype)
    {
      /* the #undef below switches off every WITHBCKTAB section that follows
         it in this file */
      mappedbits = SARR_ESQTAB | SARR_SUFTAB
#undef WITHBCKTAB
#ifdef WITHBCKTAB
                   | SARR_BCKTAB
#endif
                   ;
    } else
    {
      if (dotestsequence(arguments))
      {
        mappedbits = SARR_ESQTAB;
      } else
      {
        mappedbits = 0;
      }
    }
    if (gt_mapsuffixarray(&suffixarray,
                       mappedbits,
                       gt_str_get(arguments->indexname),
                       logger,
                       err) != 0)
    {
      haserr = true;
      totallength = 0;
    } else
    {
      alphabet = gt_encseq_alphabet(suffixarray.encseq);
#ifdef WITHBCKTAB
      prefixlength = suffixarray.prefixlength;
#endif
      totallength = gt_encseq_total_length(suffixarray.encseq);
    }
    if (!haserr)
    {
      if (arguments->indextype == Packedindextype)
      {
        packedindex =
          gt_loadvoidBWTSeqForSA(gt_str_get(arguments->indexname),
                                 false,
                                 err);
        if (packedindex == NULL)
        {
          haserr = true;
        }
      }
    }
  }
  if (!haserr)
  {
    const void *theindex;
    Greedygmatchforwardfunction gmatchforwardfunction;

    /* select the index handle and the matching function that goes with it */
    if (arguments->indextype == Fmindextype)
    {
      theindex = (const void *) &fmindex;
      if (arguments->doms)
      {
        gmatchforwardfunction = gt_skfmmstats;
      } else
      {
        gmatchforwardfunction = gt_skfmuniqueforward;
      }
    } else
    {
      if (arguments->indextype == Esaindextype)
      {
        theindex = (const void *) &suffixarray;
        if (arguments->doms)
        {
          gmatchforwardfunction = gt_suffixarraymstats;
        } else
        {
          gmatchforwardfunction = gt_suffixarrayuniqueforward;
        }
      } else
      {
        gt_assert(arguments->indextype == Packedindextype);
        theindex = (const void *) packedindex;
        if (arguments->doms)
        {
          gmatchforwardfunction = gt_voidpackedindexmstatsforward;
        } else
        {
          gmatchforwardfunction = gt_voidpackedindexuniqueforward;
        }
      }
    }
#ifdef WITHBCKTAB
    if (prefixlength > 0 &&
        arguments->indextype == Esaindextype &&
        runsubstringiteration(gmatchforwardfunction,
                              theindex,
                              totallength,
                              suffixarray.bcktab,
                              suffixarray.countspecialcodes,
                              alphabet,
                              prefixlength,
                              arguments->queryfilenames,
                              err) != 0)

    {
      haserr = true;
    }
#endif
    if (!haserr &&
        gt_findsubquerygmatchforward(dotestsequence(arguments)
                                    ? suffixarray.encseq
                                    : NULL,
                                    theindex,
                                    totallength,
                                    gmatchforwardfunction,
                                    alphabet,
                                    arguments->queryfilenames,
                                    arguments->minlength,
                                    arguments->maxlength,
                                    (arguments->showmode & SHOWSEQUENCE)
                                           ? true : false,
                                    (arguments->showmode & SHOWQUERYPOS)
                                           ? true : false,
                                    (arguments->showmode & SHOWSUBJECTPOS)
                                           ? true : false,
                                    err) != 0)
    {
      haserr = true;
    }
  }
  /* release whatever was mapped for the selected index type */
  if (arguments->indextype == Fmindextype)
  {
    if (!gt_mapfmindexfail)
    {
      gt_freefmindex(&fmindex);
    }
  } else
  {
    if (arguments->indextype == Packedindextype && packedindex != NULL)
    {
      gt_deletevoidBWTSeq(packedindex);
    }
    gt_freesuffixarray(&suffixarray);
  }
  gt_logger_delete(logger);

  return haserr ? -1 : 0;
}
예제 #21
0
/*
 * Run the local alignment search described by <idxlocalioptions>.
 *
 * With <doonline> only the encoded sequence is loaded and every query is
 * aligned with gt_multiapplysmithwaterman(); otherwise a generic index is
 * loaded and gt_indexbasedlocali() is used. With <docompare> both methods are
 * run, their matches are stored and checked against each other per query.
 *
 * Returns 0 on success and -1 on failure, in which case <err> has been set by
 * the failing call.
 */
int gt_runidxlocali(const IdxlocaliOptions *idxlocalioptions,GtError *err)
{
  Genericindex *genericindex = NULL;
  bool haserr = false;
  GtLogger *logger;
  const GtEncseq *encseq = NULL;

  logger = gt_logger_new(idxlocalioptions->verbose,
                         GT_LOGGER_DEFLT_PREFIX, stdout);

  if (idxlocalioptions->doonline)
  {
    GtEncseqLoader *el;
    el = gt_encseq_loader_new();
    gt_encseq_loader_require_multiseq_support(el);
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_set_logger(el, logger);
    encseq = gt_encseq_loader_load(el, gt_str_get(idxlocalioptions->indexname),
                                   err);
    gt_encseq_loader_delete(el);
    if (encseq == NULL)
    {
      haserr = true;
    }
  } else
  {
    genericindex = genericindex_new(gt_str_get(idxlocalioptions->indexname),
                                    idxlocalioptions->withesa,
                                    idxlocalioptions->withesa ||
                                    idxlocalioptions->docompare,
                                    false,
                                    true,
                                    0,
                                    logger,
                                    err);
    if (genericindex == NULL)
    {
      haserr = true;
    } else
    {
      encseq = genericindex_getencseq(genericindex);
    }
  }
  if (!haserr)
  {
    GtSeqIterator *seqit;
    const GtUchar *query;
    unsigned long querylen;
    char *desc = NULL;
    int retval;
    Limdfsresources *limdfsresources = NULL;
    const AbstractDfstransformer *dfst;
    SWdpresource *swdpresource = NULL;
    Showmatchinfo showmatchinfo;
    ProcessIdxMatch processmatch;
    GtAlphabet *a;
    void *processmatchinfoonline, *processmatchinfooffline;
    Storematchinfo storeonline, storeoffline;

    a = gt_encseq_alphabet(encseq);
    /* matches are either stored for the comparison or shown directly */
    if (idxlocalioptions->docompare)
    {
      processmatch = storematch;
      gt_initstorematch(&storeonline,encseq);
      gt_initstorematch(&storeoffline,encseq);
      processmatchinfoonline = &storeonline;
      processmatchinfooffline = &storeoffline;
    } else
    {
      processmatch = showmatch;
      showmatchinfo.encseq = encseq;
      showmatchinfo.characters = gt_alphabet_characters(a);
      showmatchinfo.wildcardshow = gt_alphabet_wildcard_show(a);
      showmatchinfo.showalignment = idxlocalioptions->showalignment;
      processmatchinfoonline = processmatchinfooffline = &showmatchinfo;
    }
    if (idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      swdpresource = gt_newSWdpresource(idxlocalioptions->matchscore,
                                     idxlocalioptions->mismatchscore,
                                     idxlocalioptions->gapextend,
                                     idxlocalioptions->threshold,
                                     idxlocalioptions->showalignment,
                                     processmatch,
                                     processmatchinfoonline);
    }
    dfst = gt_locali_AbstractDfstransformer();
    if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
    {
      gt_assert(genericindex != NULL);
      limdfsresources = gt_newLimdfsresources(genericindex,
                                           true,
                                           0,
                                           0,    /* maxpathlength */
                                           true, /* keepexpandedonstack */
                                           processmatch,
                                           processmatchinfooffline,
                                           NULL, /* processresult */
                                           NULL, /* processresult info */
                                           dfst);
    }
    seqit = gt_seq_iterator_sequence_buffer_new(idxlocalioptions->queryfiles,
                                               err);
    if (seqit == NULL)
    {
      haserr = true;
    } else
    {
      gt_seq_iterator_set_symbolmap(seqit, gt_alphabet_symbolmap(a));
      /* queryunit serves as the running number of the current query */
      for (showmatchinfo.queryunit = 0; /* Nothing */;
           showmatchinfo.queryunit++)
      {
        retval = gt_seq_iterator_next(seqit,
                                     &query,
                                     &querylen,
                                     &desc,
                                     err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
        printf("process sequence " Formatuint64_t " of length %lu\n",
                PRINTuint64_tcast(showmatchinfo.queryunit),querylen);
        if (idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_multiapplysmithwaterman(swdpresource,encseq,query,querylen);
        }
        if (!idxlocalioptions->doonline || idxlocalioptions->docompare)
        {
          gt_indexbasedlocali(limdfsresources,
                           idxlocalioptions->matchscore,
                           idxlocalioptions->mismatchscore,
                           idxlocalioptions->gapstart,
                           idxlocalioptions->gapextend,
                           idxlocalioptions->threshold,
                           query,
                           querylen,
                           dfst);
        }
        if (idxlocalioptions->docompare)
        {
          gt_checkandresetstorematch(showmatchinfo.queryunit,
                                  &storeonline,&storeoffline);
        }
      }
    }
    /* The search resources were created before the iterator, so they are
       released here even if the iterator could not be created. */
    if (limdfsresources != NULL)
    {
      gt_freeLimdfsresources(&limdfsresources,dfst);
    }
    if (swdpresource != NULL)
    {
      gt_freeSWdpresource(swdpresource);
      swdpresource = NULL;
    }
    if (seqit != NULL)
    {
      gt_seq_iterator_delete(seqit);
    }
    if (idxlocalioptions->docompare)
    {
      gt_freestorematch(&storeonline);
      gt_freestorematch(&storeoffline);
    }
  }
  /* without a generic index the encoded sequence was loaded here and must be
     deleted here */
  if (genericindex == NULL)
  {
    gt_encseq_delete((GtEncseq *) encseq);
    encseq = NULL;
  } else
  {
    genericindex_delete(genericindex);
  }
  gt_logger_delete(logger);
  logger = NULL;
  return haserr ? -1 : 0;
}
예제 #22
0
/*
 * Runner of the repfind tool. Depending on the options exactly one of the
 * following is done:
 *  - query files given: matches against the queries are enumerated
 *    (gt_callenumquerymatches), optionally with xdrop extension;
 *  - no query files, samples != 0: gt_testmaxpairs is run;
 *  - otherwise: maximal pairs are enumerated in forward direction and/or
 *    self matches in reverse direction, as selected by the options.
 * Arguments left over after option parsing are reported as an error.
 *
 * Returns 0 on success and -1 on failure.
 */
static int gt_repfind_runner(GT_UNUSED int argc,
                             GT_UNUSED const char **argv,
                             GT_UNUSED int parsed_args,
                             void *tool_arguments, GtError *err)
{
  int status = 0;
  Maxpairsoptions *arguments = tool_arguments;
  GtLogger *verboselog = NULL;
  GtQuerymatch *qmspace = gt_querymatch_new();
  GtXdropmatchinfo xdrop;

  gt_error_check(err);

  /* resources and scores for the optional xdrop seed extension */
  xdrop.querymatchspaceptr = qmspace;
  xdrop.useq = gt_seqabstract_new_empty();
  xdrop.vseq = gt_seqabstract_new_empty();
  xdrop.arbitscores.mat = 2;
  xdrop.arbitscores.mis = -2;
  xdrop.arbitscores.ins = -3;
  xdrop.arbitscores.del = -3;
  xdrop.frontresource = gt_frontresource_new(100UL);
  xdrop.res = gt_xdrop_resources_new(&xdrop.arbitscores);
  xdrop.belowscore = 5L;
  verboselog = gt_logger_new(arguments->beverbose, GT_LOGGER_DEFLT_PREFIX,
                             stdout);

  if (parsed_args < argc)
  {
    gt_error_set(err,"superfluous arguments: \"%s\"",argv[argc-1]);
    status = -1;
  } else if (gt_str_array_size(arguments->queryfiles) != 0)
  {
    /* index against query files */
    if (gt_callenumquerymatches(gt_str_get(arguments->indexname),
                                arguments->queryfiles,
                                false,
                                true,
                                false,
                                arguments->userdefinedleastlength,
                                NULL,
                                arguments->extendseed
                                  ? gt_processxdropquerymatches
                                  : gt_querymatch_output,
                                arguments->extendseed
                                  ? (void *) &xdrop
                                  : NULL,
                                verboselog,
                                err) != 0)
    {
      status = -1;
    }
  } else if (arguments->samples != 0)
  {
    /* test mode */
    if (gt_testmaxpairs(gt_str_get(arguments->indexname),
                        arguments->samples,
                        arguments->userdefinedleastlength,
                        (GtUword)
                        (100 * arguments->userdefinedleastlength),
                        verboselog,
                        err) != 0)
    {
      status = -1;
    }
  } else
  {
    if (arguments->forward)
    {
      /* default: exact self matches; overridden for spm search or extension */
      GtProcessmaxpairs callback = gt_simpleexactselfmatchoutput;
      void *callbackdata = (void *) qmspace;

      if (arguments->searchspm)
      {
        callback = gt_simplesuffixprefixmatchoutput;
        callbackdata = NULL;
      } else if (arguments->extendseed)
      {
        callback = gt_simplexdropselfmatchoutput;
        callbackdata = (void *) &xdrop;
      }
      if (gt_callenummaxpairs(gt_str_get(arguments->indexname),
                              arguments->userdefinedleastlength,
                              arguments->scanfile,
                              callback,
                              callbackdata,
                              verboselog,
                              err) != 0)
      {
        status = -1;
      }
    }
    if (status == 0 && arguments->reverse)
    {
      /* reverse matches are always output directly, without seed extension */
      if (gt_callenumselfmatches(gt_str_get(arguments->indexname),
                                 GT_READMODE_REVERSE,
                                 arguments->userdefinedleastlength,
                                 gt_querymatch_output,
                                 NULL,
                                 verboselog,
                                 err) != 0)
      {
        status = -1;
      }
    }
  }

  gt_querymatch_delete(qmspace);
  gt_seqabstract_delete(xdrop.useq);
  gt_seqabstract_delete(xdrop.vseq);
  gt_xdrop_resources_delete(xdrop.res);
  gt_frontresource_delete(xdrop.frontresource);
  gt_logger_delete(verboselog);
  return status;
}
static int gt_readjoiner_cnttest_runner(GT_UNUSED int argc,
    GT_UNUSED const char **argv, GT_UNUSED int parsed_args,
    void *tool_arguments, GT_UNUSED GtError *err)
{
  GtReadjoinerCnttestArguments *arguments = tool_arguments;
  GtEncseqLoader *el = NULL;
  GtEncseq *reads = NULL;
  GtBitsequence *bits = NULL;
  GtUword nofreads;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->test == GT_READJOINER_CNTTEST_SHOWLIST)
  {
    GtStr *fn = NULL;
    fn = gt_str_clone(arguments->readset);
    gt_str_append_cstr(fn, GT_READJOINER_SUFFIX_CNTLIST);
    had_err = gt_cntlist_parse(gt_str_get(fn), true, &bits, &nofreads, err);
    gt_str_delete(fn);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_BRUTEFORCE ||
      arguments->test == GT_READJOINER_CNTTEST_KMP)
  {
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    if (!arguments->singlestrand)
      gt_encseq_loader_mirror(el);
    reads = gt_encseq_loader_load(el, gt_str_get(arguments->readset), err);
    if (reads == NULL)
      had_err = -1;
    else
    {
      gt_rdj_pairwise_exact(GT_OVLFIND_CNT, reads, !arguments->singlestrand,
          false, arguments->test == GT_READJOINER_CNTTEST_KMP, 1UL, true,
          NULL, NULL, false, NULL, &bits, &nofreads);
    }
    gt_encseq_delete(reads);
    gt_encseq_loader_delete(el);
  }
  else if (arguments->test == GT_READJOINER_CNTTEST_ESA)
  {
    Sequentialsuffixarrayreader *ssar = NULL;
    GtUword readlength = 0, firstrevcompl = 0;
    GtLogger *verbose_logger = gt_logger_new(arguments->verbose,
        GT_LOGGER_DEFLT_PREFIX, stderr);
    ssar = gt_newSequentialsuffixarrayreaderfromfile(gt_str_get(
          arguments->readset), SARR_LCPTAB | SARR_SUFTAB | SARR_SSPTAB,
        true, verbose_logger, err);
    if (gt_error_is_set(err))
      had_err = -1;
    else
    {
      nofreads = gt_encseq_num_of_sequences(ssar->encseq);
      if (!arguments->singlestrand)
      {
        nofreads = GT_DIV2(nofreads);
        firstrevcompl = nofreads;
      }
      GT_INITBITTAB(bits, nofreads);
      if (!arguments->singlestrand)
      if (gt_encseq_accesstype_get(ssar->encseq) == GT_ACCESS_TYPE_EQUALLENGTH)
        readlength = gt_encseq_seqlength(ssar->encseq, 0);
      (void)gt_contfind_bottomup(ssar, false, bits, arguments->singlestrand ? 0
          : firstrevcompl, readlength);
    }
    if (ssar != NULL)
      gt_freeSequentialsuffixarrayreader(&ssar);
    gt_logger_delete(verbose_logger);
  }
  else
  {
    gt_assert(false);
  }
  if (!had_err)
    had_err = gt_cntlist_show(bits, nofreads, NULL, false, err);
  gt_free(bits);
  return had_err;
}