/*
 * Build the string graph of <readset> in two passes over the SPM list files:
 * the first pass counts the edges, the second allocates the graph and loads
 * the SPMs into it.
 *
 * The freshly created graph is stored in *strgraph in every case, also when
 * an error is returned. <timer> is only used if showtime is enabled.
 *
 * Returns 0 on success; otherwise the non-zero code of the failing pass (the
 * second pass is not started if the first one fails).
 */
static int gt_readjoiner_assembly_build_graph(
    GtReadjoinerAssemblyArguments *arguments, GtStrgraph **strgraph,
    GtEncseq *reads, const char *readset, bool eqlen, GtUword rlen,
    GtUword nreads, GtBitsequence *contained, GtLogger *default_logger,
    GtLogger *verbose_logger, GtTimer *timer, GtError *err)
{
  int count_status;

  *strgraph = gt_strgraph_new(nreads);

  if (arguments->minmatchlength > 0)
  {
    gt_logger_log(verbose_logger, "SPM length cutoff = %u",
        arguments->minmatchlength);
  }

  /* pass 1: count the edges */
  count_status = gt_readjoiner_assembly_count_spm(readset, eqlen,
      arguments->minmatchlength, arguments->nspmfiles, *strgraph, contained,
      default_logger, err);
  gt_readjoiner_assembly_show_current_space("(edges counted)");
  if (gt_showtime_enabled())
  {
    gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_BUILDSG, stdout);
  }
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_BUILDSG);

  if (count_status != 0)
  {
    return count_status;
  }

  /* pass 2: allocate and fill the graph; either all reads share the length
     rlen (and no encseq is given) or the lengths come from the encseq */
  gt_assert((eqlen && rlen > 0 && reads == NULL) ||
      (!eqlen && rlen == 0 && reads != NULL));
  gt_strgraph_allocate_graph(*strgraph, rlen, reads);
  gt_readjoiner_assembly_show_current_space("(graph allocated)");
  return gt_strgraph_load_spm_from_file(*strgraph,
      (GtUword)arguments->minmatchlength, arguments->redtrans,
      contained, readset, arguments->nspmfiles,
      GT_READJOINER_SUFFIX_SPMLIST, err);
}
Exemple #2
0
/* Report via <logger> the maximal prefix length <maxprefixlen> (the argument
   of option -pl) and the <recommended> prefix length, as three log lines. */
void gt_showmaximalprefixlength(GtLogger *logger,
                             unsigned int maxprefixlen,
                             unsigned int recommended)
{
  gt_logger_log(logger,
                "for this input size and alphabet size, the maximal "
                "prefixlength");
  gt_logger_log(logger, "(argument of option -pl) is %u,", maxprefixlen);
  gt_logger_log(logger, "the recommended prefixlength is %u", recommended);
}
/*
 * Check the stop codon annotation of the visitor's current mRNA (v->mrna)
 * against the stop codon implied by its CDS segments (v->cds).
 *
 * The inferred stop codon spans the last three positions of the last CDS
 * segment, or, for reverse-strand mRNAs, the first three positions of the
 * first CDS segment. Then, depending on the number of entries in v->stops:
 *   - more than one: a message with the number of stop codons is logged;
 *   - exactly one:   a message is logged if its range differs from the
 *                    inferred range;
 *   - none:          a new "stop_codon" feature with the inferred range is
 *                    created, added as child of the mRNA and appended to
 *                    v->stops.
 * Nothing is done if the mRNA has no CDS segments.
 */
static void infer_cds_visitor_check_stop(AgnInferCDSVisitor *v)
{
  if(gt_array_size(v->cds) == 0)
    return;

  const char *mrnaid = gt_feature_node_get_attribute(v->mrna, "ID");
  unsigned int ln = gt_genome_node_get_line_number((GtGenomeNode *)v->mrna);
  GtStrand strand = gt_feature_node_get_strand(v->mrna);

  // Derive the expected stop codon range from the 3'-most CDS segment.
  GtRange stoprange;
  if(strand == GT_STRAND_REVERSE)
  {
    GtGenomeNode **threeprimesegment = gt_array_get(v->cds, 0);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.end = stoprange.start + 2;
  }
  else
  {
    GtUword threeprimeindex = gt_array_size(v->cds) - 1;
    GtGenomeNode **threeprimesegment = gt_array_get(v->cds, threeprimeindex);
    stoprange = gt_genome_node_get_range(*threeprimesegment);
    stoprange.start = stoprange.end - 2;
  }

  if(gt_array_size(v->stops) > 1)
  {
    // Report the number of stop codons (not the number of start codons).
    gt_logger_log(v->logger, "mRNA '%s' (line %u) has %lu stop codons", mrnaid,
                  ln, gt_array_size(v->stops));
  }
  else if(gt_array_size(v->stops) == 1)
  {
    GtGenomeNode **codon = gt_array_get(v->stops, 0);
    GtRange testrange = gt_genome_node_get_range(*codon);
    if(gt_range_compare(&stoprange, &testrange) != 0)
    {
      gt_logger_log(v->logger, "stop codon inferred from CDS [%lu, %lu] does "
                    "not match explicitly provided stop codon [%lu, %lu] for "
                    "mRNA '%s'", stoprange.start, stoprange.end,
                    testrange.start, testrange.end, mrnaid);
    }
  }
  else // agn_assert(gt_array_size(v->stops) == 0)
  {
    // No explicit stop codon: create one from the inferred range.
    GtStr *seqid = gt_genome_node_get_seqid((GtGenomeNode *)v->mrna);
    GtGenomeNode *codonfeature = gt_feature_node_new(seqid, "stop_codon",
                                                     stoprange.start,
                                                     stoprange.end,
                                                     strand);
    if(v->source)
      gt_feature_node_set_source((GtFeatureNode *)codonfeature, v->source);
    GtFeatureNode *cf = (GtFeatureNode *)codonfeature;
    gt_feature_node_add_child(v->mrna, cf);
    gt_array_add(v->stops, cf);
  }
}
/*
 * Count the string graph edges by parsing the <nspmfiles> SPM list files of
 * <readset>. File number i is named
 * <readset>.<i><GT_READJOINER_SUFFIX_SPMLIST>.
 *
 * If <eqlen> is true, every SPM is passed directly to
 * gt_spmproc_strgraph_count with <strgraph> as data. Otherwise the SPMs go
 * through gt_spmproc_skip, which receives <contained> as to_skip set and
 * gt_spmproc_strgraph_count/<strgraph> as output processor.
 *
 * Parsing stops at the first file which fails. Returns 0 on success,
 * otherwise the error code of gt_spmlist_parse for the failing file.
 */
static int gt_readjoiner_assembly_count_spm(const char *readset, bool eqlen,
    unsigned int minmatchlength, unsigned int nspmfiles, GtStrgraph *strgraph,
    GtBitsequence *contained, GtLogger *default_logger, GtError *err)
{
  GtSpmprocSkipData skipdata;
  int had_err = 0;
  unsigned int i;
  GtStr *filename = gt_str_new();
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_COUNTSPM);
  if (!eqlen)
  {
    skipdata.out.e.proc = gt_spmproc_strgraph_count;
    skipdata.to_skip = contained;
    skipdata.out.e.data = strgraph;
  }
  /* stop at the first failure: continuing would let a later successful file
     reset had_err to 0 and hide the error */
  for (i = 0; had_err == 0 && i < nspmfiles; i++)
  {
    gt_str_append_cstr(filename, readset);
    gt_str_append_char(filename, '.');
    gt_str_append_uint(filename, i);
    gt_str_append_cstr(filename, GT_READJOINER_SUFFIX_SPMLIST);
    had_err = gt_spmlist_parse(gt_str_get(filename),
        (GtUword)minmatchlength,
        eqlen ? gt_spmproc_strgraph_count : gt_spmproc_skip,
        eqlen ? (void*)strgraph : (void*)&skipdata, err);
    gt_str_reset(filename);
  }
  gt_str_delete(filename);
  return had_err;
}
/*
 * Load the (mirrored) encoded reads of <readset>, pump them through the
 * cache and write the contigs described by the contig paths file as FASTA
 * using gt_contigpaths_to_fasta.
 *
 * If showtime is enabled, <timer> must not be NULL; a timer is created in
 * *timer if there is none yet, otherwise progress is reported on the
 * existing one.
 *
 * Returns 0 on success. Returns -1 if the encoded reads cannot be loaded
 * (<err> is expected to be set by gt_encseq_loader_load in this case),
 * otherwise the return code of gt_contigpaths_to_fasta.
 */
static int gt_readjoiner_assembly_paths2seq(const char *readset,
    GtUword lengthcutoff, bool showpaths, bool astat,
    double coverage, bool load_copynum, GtUword buffersize,
    GtLogger *default_logger, GtTimer **timer, GtError *err)
{
  int had_err;
  GtEncseqLoader *el = gt_encseq_loader_new();
  GtEncseq *reads;

  if (gt_showtime_enabled())
  {
    gt_assert(timer != NULL);
    if (*timer == NULL) /* paths2seq */
    {
      *timer = gt_timer_new_with_progress_description(
          GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ);
      gt_timer_show_cpu_time_by_progress(*timer);
      gt_timer_start(*timer);
    }
    else
      gt_timer_show_progress(*timer, GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ,
          stdout);
  }
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_PUMPENCSEQ);
  gt_encseq_loader_drop_description_support(el);
  gt_encseq_loader_disable_autosupport(el);
  gt_encseq_loader_mirror(el);
  reads = gt_encseq_loader_load(el, readset, err);
  if (reads == NULL)
  {
    /* loading depends on files on disk and can fail at run time; report it
       as an ordinary error instead of relying on an assertion */
    gt_encseq_loader_delete(el);
    return -1;
  }
  gt_readjoiner_assembly_pump_encseq_through_cache(reads);
  if (gt_showtime_enabled())
    gt_timer_show_progress(*timer, GT_READJOINER_ASSEMBLY_MSG_OUTPUTCONTIGS,
        stdout);
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_OUTPUTCONTIGS);
  had_err = gt_contigpaths_to_fasta(readset, GT_READJOINER_SUFFIX_CONTIG_PATHS,
      GT_READJOINER_SUFFIX_CONTIGS, reads, lengthcutoff, showpaths,
      astat, coverage, load_copynum, (size_t)buffersize, default_logger, err);
  gt_encseq_delete(reads);
  gt_encseq_loader_delete(el);
  return had_err;
}
/*
 * Log the final k-mer statistics for the index named <inputindex>: the total
 * number of mers (sum over the occurrence distribution of <state>) followed
 * by an explanation of the distribution table, and then show the
 * distribution itself. If state->performtest is set, the total is checked
 * with checknumofmers first.
 */
static void showfinalstatistics(const TyrDfsstate *state,
                                const char *inputindex,
                                GtLogger *logger)
{
  const uint64_t total_mers = addupdistribution(&state->occdistribution);
  const GtUword k = (GtUword) state->mersize;

  if (state->performtest)
  {
    checknumofmers(state,total_mers);
  }

  /* header */
  gt_logger_log(logger,
                "the following output refers to the set of all sequences");
  gt_logger_log(logger, "represented by the index \"%s\"", inputindex);

  /* total count */
  gt_logger_log(logger,
                "number of " GT_WU "-mers in the sequences not containing a "
                "wildcard: " Formatuint64_t,
                k,
                PRINTuint64_tcast(total_mers));

  /* description of the distribution rows shown next */
  gt_logger_log(logger,
                "show the distribution of the number of occurrences of "
                GT_WU "-mers", k);
  gt_logger_log(logger,
                "not containing a wildcard as rows of the form i d where");
  gt_logger_log(logger,
                "d is the number of events that a " GT_WU
                "-mer occurs exactly i times", k);

  showmerdistribution(state);
}
/*
 * Run the error correction on <strgraph>: at most <bubble> rounds of
 * p-bubble removal followed by at most <deadend> rounds of dead-end path
 * removal (with depth <deadend_depth>). Each phase ends early as soon as a
 * round removes no edge. The number of removed edges is logged per round
 * and per phase on <verbose_logger>. Always returns 0.
 */
static int gt_readjoiner_assembly_error_correction(GtStrgraph *strgraph,
    unsigned int bubble, unsigned int deadend, unsigned int deadend_depth,
    GtLogger *verbose_logger)
{
  unsigned int round;
  GtUword removed, total;

  /* phase 1: p-bubbles */
  gt_logger_log(verbose_logger, "remove p-bubbles");
  total = 0;
  removed = 1UL; /* non-zero, so that the first round is entered */
  round = 0;
  while (round < bubble && removed > 0)
  {
    removed = gt_strgraph_redpbubbles(strgraph, 0, 1UL, false);
    total += removed;
    round++;
    gt_logger_log(verbose_logger, "removed p-bubble edges [round %u] = "GT_WU,
                  round, removed);
  }
  gt_logger_log(verbose_logger, "removed p-bubble edges [%u rounds] = "GT_WU,
                round, total);

  /* phase 2: dead-end paths */
  gt_logger_log(verbose_logger, "remove dead-end paths");
  total = 0;
  removed = 1UL;
  round = 0;
  while (round < deadend && removed > 0)
  {
    removed = gt_strgraph_reddepaths(strgraph, (GtUword)deadend_depth,
        false);
    total += removed;
    round++;
    gt_logger_log(verbose_logger, "removed dead-end path edges [round %u] = "
       GT_WU, round, removed);
  }
  gt_logger_log(verbose_logger,
      "removed dead-end path edges [%u rounds] = " GT_WU, round, total);

  return 0;
}
/*
 * Load a previously saved string graph of <readset> (file suffix
 * GT_READJOINER_SUFFIX_SG) into *strgraph, then report progress on <timer>
 * (only if showtime is enabled) and on <default_logger>, and show the
 * current space usage.
 */
static void gt_readjoiner_assembly_load_graph(GtStrgraph **strgraph,
    GtEncseq *reads, const char *readset, GtUword rlen,
    GtLogger *default_logger, GtTimer *timer)
{
  GtStrgraph *loaded = gt_strgraph_new_from_file(reads, rlen, readset,
      GT_READJOINER_SUFFIX_SG);

  *strgraph = loaded;

  if (gt_showtime_enabled())
  {
    gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_LOADSG, stdout);
  }
  gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_LOADSG);

  gt_readjoiner_assembly_show_current_space("(graph loaded)");
}
/*
 * Run hmmsearch (<hmmsearch_path>) with the profile file <hmm_filename> on
 * the unique FASTA file of <ces>, using the options --noali, --notextw and
 * --domtblout <table_filename>.
 *
 * If logging is enabled, everything read from the pipe of the child is
 * passed line by line to gt_log_log; afterwards the pipe is closed (the
 * result of closing is ignored).
 *
 * Returns 0 on success and -1 if the pipe could not be opened.
 */
static int hmmsearch_call_coarse_search(GtCondenseq* ces,
                                        char *hmmsearch_path,
                                        char *table_filename,
                                        char *hmm_filename,
                                        GtLogger *logger,
                                        GtError *err) {
  char *hmmenv[] = { NULL };
  char **hmmargs;
  GtSafePipe *pipe;
  GtStr *coarse_fas = gt_condenseq_unique_fasta_file(ces);

  gt_assert(coarse_fas != NULL);

  /* Eight slots for seven arguments: the last slot is never assigned and
     serves as the terminating NULL (relies on gt_calloc zero-filling). */
  hmmargs = gt_calloc((size_t) 8, sizeof (*hmmargs));
  hmmargs[0] = hmmsearch_path;
  hmmargs[1] = gt_cstr_dup("--noali");
  hmmargs[2] = gt_cstr_dup("--notextw");
  hmmargs[3] = gt_cstr_dup("--domtblout");
  hmmargs[4] = table_filename;
  hmmargs[5] = hmm_filename;
  hmmargs[6] = gt_str_get(coarse_fas);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);

  pipe = gt_safe_popen(hmmsearch_path, hmmargs, hmmenv, err);

  /* only the duplicated option strings are owned by the argument vector */
  gt_free(hmmargs[1]);
  gt_free(hmmargs[2]);
  gt_free(hmmargs[3]);
  gt_free(hmmargs);
  gt_str_delete(coarse_fas);

  if (pipe == NULL) {
    return -1;
  }

  if (gt_log_enabled()) {
    GtStr *line = gt_str_new();
    while (gt_str_read_next_line(line, pipe->read_fd) == 0) {
      gt_log_log("%s", gt_str_get(line));
      gt_str_reset(line);
    }
    gt_str_delete(line);
  }
  (void) gt_safe_pclose(pipe);
  return 0;
}
/*
 * Run hmmsearch (<hmmsearch_path>) with the profile file <hmm_filename> on
 * <fine_fasta_filename> and print everything read from the pipe of the
 * child line by line to stdout. If <table_filename> is not NULL, the options
 * "--tblout <table_filename>" are added to the call.
 *
 * Returns 0 on success and -1 if the pipe could not be opened.
 */
static int hmmsearch_call_fine_search(GtStr *table_filename,
                                      char *fine_fasta_filename,
                                      char *hmmsearch_path,
                                      char *hmm_filename,
                                      GtLogger *logger,
                                      GtError *err) {
  char *hmmenv[] = { NULL };
  char **hmmargs;
  GtSafePipe *pipe;
  GtStr *line;
  unsigned int next = 0;
  /* program, profile, fasta file and terminating NULL; plus two slots for
     the optional table output */
  const size_t nslots = (table_filename != NULL) ? (size_t) 6 : (size_t) 4;

  hmmargs = gt_calloc(nslots, sizeof (*hmmargs));
  hmmargs[next++] = hmmsearch_path;
  if (table_filename != NULL) {
    hmmargs[next++] = gt_cstr_dup("--tblout");
    hmmargs[next++] = gt_str_get(table_filename);
  }
  hmmargs[next++] = hmm_filename;
  hmmargs[next++] = fine_fasta_filename;
  gt_assert(hmmargs[next] == NULL);

  gt_logger_log(logger, "calling: %s", hmmsearch_path);

  pipe = gt_safe_popen(hmmsearch_path, hmmargs, hmmenv, err);

  /* the duplicated "--tblout" is the only string owned by the vector */
  if (table_filename != NULL) {
    gt_free(hmmargs[1]);
  }
  gt_free(hmmargs);

  if (pipe == NULL) {
    return -1;
  }

  line = gt_str_new();
  while (gt_str_read_next_line(line, pipe->read_fd) == 0) {
    printf("%s\n", gt_str_get(line));
    gt_str_reset(line);
  }
  gt_str_delete(line);
  (void) gt_safe_pclose(pipe);
  return 0;
}
/*
 * Create a suffix sort space with room for <numofentries> (> 0) entries.
 *
 * gt_decide_to_use_uint(useuint, maxvalue) selects the element type: either
 * the uinttab or the ulongtab is allocated, the other one is set to NULL.
 * The size of the table in bytes is computed with an overflow-checked
 * multiplication which calls gt_suffixsortspace_overflow_abort on overflow.
 * <logger> is only used if _LP64 is defined.
 */
GtSuffixsortspace *gt_suffixsortspace_new(unsigned long numofentries,
                                          unsigned long maxvalue,
                                          bool useuint,
                                          GT_UNUSED GtLogger *logger)
{
  GtSuffixsortspace *sssp;
  unsigned long tablebytes;
  bool use_uint;

  gt_assert(numofentries > 0);
  use_uint = gt_decide_to_use_uint(useuint,maxvalue);

  sssp = gt_malloc(sizeof (*sssp));
  sssp->maxindex = numofentries-1;
  sssp->maxvalue = maxvalue;
  sssp->longestidx.defined = false;
  sssp->longestidx.valueunsignedlong = 0;
  sssp->exportptr.ulongtabsectionptr = NULL;
  sssp->exportptr.uinttabsectionptr = NULL;
  sssp->currentexport = false;
#ifdef _LP64
  gt_logger_log(logger,"suftab uses %dbit values: "
                         "maxvalue=%lu,numofentries=%lu",
                         use_uint ? 32 : 64,
                         maxvalue,numofentries);
#endif
  if (use_uint)
  {
    sssp->basesize = sizeof (*sssp->uinttab);
  } else
  {
    sssp->basesize = sizeof (*sssp->ulongtab);
  }
  tablebytes = gt_safe_mult_ulong_check((unsigned long) sssp->basesize,
                                        numofentries,
                                        gt_suffixsortspace_overflow_abort,
                                        &numofentries);
  gt_log_log("sizeof (suftab)=%lu bytes",tablebytes);
  if (use_uint)
  {
    sssp->ulongtab = NULL;
    sssp->uinttab = gt_malloc((size_t) tablebytes);
  } else
  {
    sssp->uinttab = NULL;
    sssp->ulongtab = gt_malloc((size_t) tablebytes);
  }
  sssp->partoffset = 0;
  sssp->bucketleftidx = 0;
  sssp->unmapsortspace = false;
  return sssp;
}
Exemple #12
0
/*
 * Tool runner of genomediff.
 *
 * The arguments argv[parsed_args..argc-1] are collected as file names.
 *  - With more than one file name, the files are encoded as DNA into an
 *    encoded sequence named arguments->indexname.
 *  - Otherwise the first file name is appended to arguments->indexname and,
 *    if an esa or pck index is used, the project file of the index is
 *    scanned for a line "mirrored=1" to decide whether the encoded sequence
 *    has to be loaded mirrored.
 * Then the encoded sequence is loaded, the unit information is set up
 * (optionally read from arguments->unitfile), the matrix of shulen sums is
 * computed (from the existing index, or by running the suffixerator) and
 * finally passed to gt_genomediff_kr_calc.
 *
 * Returns 0 on success and a non-zero value on error.
 */
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        /* check the timer before it is used for the first time */
        gt_assert(timer);
        gt_timer_start(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            /* scan the "key=value" lines of the project file for
               "mirrored=1" */
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;
            const char *buffer;
            char **elements = NULL;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                buffer = gt_str_get(current_line);
                /* release the tokens of the previous line */
                if (elements != NULL) {
                    gt_free(elements[0]);
                    gt_free(elements[1]);
                }
                gt_free(elements);
                elements = gt_cstr_split(buffer, '=');
                gt_log_log("%s", elements[0]);
                /* a "mirrored" line without '=' has no value token; do not
                   pass a NULL pointer to gt_log_log() or strcmp() */
                if (strcmp("mirrored", elements[0]) == 0 &&
                    elements[1] != NULL) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            if (elements != NULL) {
                gt_free(elements[0]);
                gt_free(elements[1]);
            }
            gt_free(elements);
            gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            /* the index already exists: compute the sums from it */
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            /* no index given: let the suffixerator fill the matrix */
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (!had_err && shusums != NULL) {
            had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                            arguments->with_pck, logger, timer, err);
        }
        /* free the matrix on the error path as well (it was leaked before
           if the suffixerator failed) */
        if (shusums != NULL) {
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
Exemple #13
0
/*
 * Process the characters of <bucketspec2> in the order given by
 * bucketspec2->order. For each such character <source>:
 *  1. every subbucket (source,second) with source != second which is not
 *     yet marked as sorted is counted as "hard work" (its width is added to
 *     the total) and marked as sorted;
 *  2. forwardderive and/or backwardderive are called on <suffixsortspace>
 *     with per-character target offsets, if the respective index range is
 *     not empty;
 *  3. all subbuckets (idx,source) and the superbucket of <source> are
 *     marked as sorted.
 * Finally the total amount of hard work is logged on <logger>, absolute and
 * relative to the total length of bucketspec2->encseq.
 */
void gt_copysort_derivesorting(const GtBucketspec2 *bucketspec2,
                               GtSuffixsortspace *suffixsortspace,
                               GtLogger *logger)
{
  GtUword hardwork = 0,
                *targetoffset;
  unsigned int idx, idxsource, source, second;

  /* Debug output, only compiled if WITHSUFFIXES is defined.
     NOTE(review): readmode and encseq are not declared in this function as
     shown here; presumably this section does not compile unless they are
     provided elsewhere - confirm. */
#ifdef WITHSUFFIXES
  {
    GtUword idx;
    for (idx = 0; idx < bucketspec2->partwidth; idx++)
    {
      gt_encseq_showatstartpos(
                            stdout,
                            GT_ISDIRREVERSE(readmode) ? false : true,
                            GT_ISDIRCOMPLEMENT(readmode) ? true : false,
                            encseq,
                            gt_suffixsortspace_getdirect(suffixsortspace,idx));
    }
  }
#endif
  /* one target offset per character, refilled before each derive call */
  targetoffset = gt_malloc(sizeof (*targetoffset) * bucketspec2->numofchars);
  for (idxsource = 0; idxsource<bucketspec2->numofchars; idxsource++)
  {
    source = bucketspec2->order[idxsource];
    /* account for the subbuckets of source which are not sorted yet */
    for (second = 0; second < bucketspec2->numofchars; second++)
    {
      if (!bucketspec2->subbuckettab[source][second].sorted && source != second)
      {
        gt_assert(bucketspec2->subbuckettab[source][second].hardworktodo);
        gt_logger_log(logger,"hard work for %u %u",source,second);
        hardwork += getendidx(bucketspec2,source,second) -
                    getstartidx(bucketspec2,source,second);
        bucketspec2->subbuckettab[source][second].sorted = true;
      } else
      {
        gt_assert(!bucketspec2->subbuckettab[source][second].hardworktodo);
      }
    }
    /* forward pass: only if subbuckets precede subbucket (source,source) */
    if (getstartidx(bucketspec2,source,0) <
        getstartidx(bucketspec2,source,source))
    {
      for (idx = 0; idx < bucketspec2->numofchars; idx++)
      {
        targetoffset[idx] = getstartidx(bucketspec2,idx,source);
      }
      forwardderive(bucketspec2,
                    suffixsortspace,
                    targetoffset,
                    source,
                    getstartidx(bucketspec2,source,0));
    }
    /* backward pass: only if subbuckets follow subbucket (source,source) */
    if (getendidx(bucketspec2,source,source) <
        getendidx(bucketspec2,source,bucketspec2->numofchars))
    {
      for (idx = 0; idx < bucketspec2->numofchars; idx++)
      {
        /* do not need to assert that getendidx(idx,source)  > 0, as later the
           value stored in targetoffset is incremented */
        targetoffset[idx] = getendidx(bucketspec2,idx,source) - 1;
      }
      gt_assert(getendidx(bucketspec2,source,bucketspec2->numofchars) > 0);
      backwardderive(bucketspec2,
                     suffixsortspace,
                     targetoffset,
                     source,
                     getendidx(bucketspec2,source,bucketspec2->numofchars) - 1);
    }
    /* all subbuckets with second character source are done now */
    for (idx = 0; idx < bucketspec2->numofchars; idx++)
    {
      bucketspec2->subbuckettab[idx][source].sorted = true;
    }
    bucketspec2->superbuckettab[source].sorted = true;
  }
  gt_free(targetoffset);
  gt_logger_log(logger,"hardwork = "GT_WU" (%.2f)",
                hardwork,
                (double) hardwork/gt_encseq_total_length(bucketspec2->encseq));
}
/*
 * Process the hits of the coarse hmmsearch run stored in <table_filename>.
 *
 * Every line not starting with '#' is split at blanks and must consist of
 * exactly 23 non-empty columns. The first column is parsed as the number of
 * the target sequence, the fourth column is the name of the query. For each
 * hit, gt_condenseq_each_redundant_seq is called on <ces> with the target
 * number, collecting sequences in a tree. Each time the number of queries
 * seen reaches arguments->max_queries, and once more after the last line, a
 * FASTA file is created from the collected sequences and the fine search is
 * run on it. If arguments->outtable_filename is not empty, each fine search
 * writes its table to that name with a running number appended.
 *
 * Returns 0 on success and a non-zero value on error; for a malformed table
 * line <err> is set and -1 is returned.
 */
static int hmmsearch_process_coarse_hits(
                                       char *table_filename,
                                       GtCondenseq *ces,
                                       GtCondenseqHmmsearchArguments *arguments,
                                       GtLogger *logger,
                                       GtError *err) {
  int had_err = 0;
  GtStr *line = gt_str_new();
  FILE *table = NULL;
  GtSplitter *splitter = gt_splitter_new();
  GtStr *query = gt_str_new(),
        *fine_fasta_filename = gt_str_new_cstr("condenseq");
  GtRBTree *sequences = NULL;
  GtUword filecount = (GtUword) 1;
  unsigned int querycount = 0;
  const GtUword fine_fasta_name_length = gt_str_length(fine_fasta_filename);
  const GtUword table_name_length = gt_str_length(arguments->outtable_filename);

  table = gt_xfopen(table_filename, "r");

  sequences = gt_rbtree_new(hmmsearch_cmp_seqnum,
                            hmmsearch_tree_free_node, NULL);

  while (!had_err && gt_str_read_next_line(line, table) == 0) {
    char *c_line = gt_str_get(line);
    GtUword uid;
    const GtUword target_column = 0,
          query_column = (GtUword) 3,
          expected_columns = (GtUword) 23;

    if (c_line[0] != '#') {
      gt_splitter_split_non_empty(splitter, c_line, gt_str_length(line), ' ');
      if (gt_splitter_size(splitter) != expected_columns) {
        /* the table is an external file: report a malformed line as an
           error instead of asserting, so that no token outside of the
           splitter is accessed when assertions are disabled */
        gt_error_set(err, "expected " GT_WU " columns in line of hmmsearch "
                     "table, found " GT_WU, expected_columns,
                     gt_splitter_size(splitter));
        had_err = -1;
      }
      else {
        if (sscanf(gt_splitter_get_token(splitter, target_column),
                   GT_WU, &uid) != 1) {
          gt_error_set(err, "couldn't parse target number: %s",
                       gt_splitter_get_token(splitter, target_column));
          had_err = -1;
        }
        if (gt_str_length(query) == 0 ||
            strcmp(gt_str_get(query),
                   gt_splitter_get_token(splitter, query_column)) != 0) {
          gt_str_set(query, gt_splitter_get_token(splitter, query_column));
          gt_logger_log(logger, "new query: %s", gt_str_get(query));
          querycount++;
        }
        if (!had_err && querycount == arguments->max_queries) {
          /* enough queries collected: run the fine search on what has been
             gathered so far and start a new batch */
          hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
          if (table_name_length != 0)
            gt_str_append_uword(arguments->outtable_filename, filecount++);
          had_err =
            hmmsearch_call_fine_search(table_name_length != 0 ?
                                         arguments->outtable_filename :
                                         NULL,
                                       gt_str_get(fine_fasta_filename),
                                       gt_str_get(arguments->hmmsearch_path),
                                       gt_str_get(arguments->hmm),
                                       logger, err);
          gt_rbtree_clear(sequences);
          /* cut both file names back to their original length */
          gt_str_set_length(fine_fasta_filename, fine_fasta_name_length);
          if (table_name_length != 0)
            gt_str_set_length(arguments->outtable_filename, table_name_length);
          querycount = 0;
        }
        if (!had_err) {
          if (gt_condenseq_each_redundant_seq(ces, uid,
                                              hmmsearch_process_seq,
                                              sequences, err) == 0) {
            had_err = -1;
          }
        }
      }
      gt_splitter_reset(splitter);
    }
    gt_str_reset(line);
  }
  gt_splitter_delete(splitter);
  gt_str_delete(line);
  gt_str_delete(query);
  gt_xfclose(table);

  if (!had_err) {
    /* fine search for the remaining sequences */
    hmmsearch_create_fine_fas(fine_fasta_filename, sequences, ces);
    if (table_name_length != 0)
      gt_str_append_uword(arguments->outtable_filename, filecount++);
    had_err =
      hmmsearch_call_fine_search(table_name_length != 0 ?
                                 arguments->outtable_filename :
                                 NULL,
                                 gt_str_get(fine_fasta_filename),
                                 gt_str_get(arguments->hmmsearch_path),
                                 gt_str_get(arguments->hmm),
                                 logger, err);
  }
  gt_log_log("created " GT_WU " files", filecount);
  gt_rbtree_delete(sequences);
  gt_str_delete(fine_fasta_filename);
  return had_err;
}
/*
 * For every query sequence in arguments->queryname compute the SHUstring
 * lengths of all its suffixes with respect to the packed index
 * arguments->indexname.
 *
 * If arguments->shulen_only is set, the sum of the lengths is printed per
 * query. Otherwise the average length and the GC content of the query are
 * used to compute the divergence and from it the Kr value, which is printed.
 * The sequences have to be DNA.
 *
 * Returns 0 on success and 1 on error (index or queries cannot be loaded,
 * alphabet is not DNA, or reading a query failed).
 */
int gt_genomediff_pck_shu_simple(GtLogger *logger,
                                 const GtGenomediffArguments *arguments,
                                 GtError *err)
{
  int had_err = 0;
  int retval;
  GtSeqIterator *queries = NULL;
  const GtUchar *symbolmap, *currentQuery;
  const GtAlphabet *alphabet;
  GtUchar c_sym = 0,
          g_sym = 0;
  uint64_t queryNo;
  char *description = NULL;
  unsigned long queryLength,
                subjectLength = 0,
                currentSuffix;
  double avgShuLength,
         currentShuLength = 0.0,
         /*gc_subject,*/
         gc_query /*, gc*/;
  const FMindex *subjectindex = NULL;
  Genericindex *genericindexSubject;
  const GtEncseq *encseq = NULL;
  double *ln_n_fac;

  /* get the precalculation of ln(n!) for 0<n<max_ln_n_fac */
  ln_n_fac = gt_get_ln_n_fac(arguments->max_ln_n_fac);
  gt_log_log("ln(max_ln_n_fac!) = %f\n",
             ln_n_fac[arguments->max_ln_n_fac]);

  genericindexSubject = genericindex_new(gt_str_get(
                                           arguments->indexname),
                                         arguments->with_esa,
                                         true,
                                         false,
                                         true,
                                         arguments->user_max_depth,
                                         logger,
                                         err);
  if (genericindexSubject == NULL)
  {
    had_err = 1;
  }
  else
  {
    encseq = genericindex_getencseq(genericindexSubject);
  }

  if (!had_err)
  {
    subjectLength = genericindex_get_totallength(genericindexSubject) - 1;
    /*subjectLength /= 2;*/
    /*gt_log_log("subject length: %lu", subjectLength);*/
    subjectindex = genericindex_get_packedindex(genericindexSubject);

    queries = gt_seqiterator_sequence_buffer_new(
                                          arguments->queryname,
                                          err);
    if (queries == NULL)
    {
      /* opening the queries depends on external files: treat a failure as
         an ordinary error instead of asserting; presumably <err> has been
         set by the constructor */
      had_err = 1;
    }
  }

  if (!had_err)
  {
    alphabet = gt_encseq_alphabet(encseq);
    /* makes assumption that alphabet is dna, it has to calculate the gc! */
    if (!gt_alphabet_is_dna(alphabet))
    {
      /* report through <err> like all other errors of this function, so
         that a non-zero return value comes with an error message */
      gt_error_set(err, "Sequences need to be dna");
      had_err = 1;
    }
    else
    {
      symbolmap = gt_alphabet_symbolmap(alphabet);
      gt_seqiterator_set_symbolmap(queries, symbolmap);
      c_sym = gt_alphabet_encode(alphabet, 'c');
      g_sym = gt_alphabet_encode(alphabet, 'g');
    }
  }

  for (queryNo = 0; !had_err; queryNo++)
  {
    retval = gt_seqiterator_next(queries,
                                 &currentQuery,
                                 &queryLength,
                                 &description,
                                 err);
    if ( retval != 1)
    {
      if (retval < 0)
      {
        /* the iterator failed: propagate the error instead of returning
           success */
        gt_free(description);
        had_err = 1;
      }
      break;
    }
    /* NOTE(review): description is only freed on the error path above; if
       the iterator passes ownership to the caller for every query, it
       leaks once per query - confirm against the iterator's contract. */
    gt_logger_log(logger,
                  "found query of length: %lu",
                  queryLength);
    avgShuLength = 0.0;
    gc_query = 0.0;
    /* sum up the SHUstring lengths of all suffixes and count G and C */
    for (currentSuffix = 0; currentSuffix < queryLength; currentSuffix++)
    {
      currentShuLength = (double) gt_pck_getShuStringLength(
                    subjectindex,
                    &currentQuery[currentSuffix],
                    queryLength - currentSuffix);
      avgShuLength += currentShuLength;
      if (currentQuery[currentSuffix] == c_sym ||
          currentQuery[currentSuffix] == g_sym)
      {
        gc_query++;
      }
    }
    if (arguments->shulen_only)
    {
      printf("# Query %d sum of shulen:\n %.0f\n",
             (int) queryNo, avgShuLength);
    }
    else
    {
      double div, kr;

      avgShuLength /= (double) queryLength;
      gc_query /= (double) queryLength;

      gt_logger_log(logger, "Query %d has an average SHUstring length "
                            "of\n# shulength: %f",
                            (int) queryNo, avgShuLength);
      gt_logger_log(logger, "Query description: %s", description);
      gt_log_log("Query (i): %s", description);

      /* XXX add error checks */

      gt_logger_log(logger, "shulen:\n%f", avgShuLength);
      gt_log_log("shu: %f, gc: %f, len: %lu",
          avgShuLength, gc_query, subjectLength);
      div =  gt_divergence(arguments->divergence_rel_err,
                           arguments->divergence_abs_err,
                           arguments->divergence_m,
                           arguments->divergence_threshold,
                           avgShuLength,
                           subjectLength,
                           gc_query,
                           ln_n_fac,
                           arguments->max_ln_n_fac);
      gt_logger_log(logger, "divergence:\n%f", div);

      kr = gt_calculateKr(div);

      printf("# Kr:\n%f\n", kr);
    }
  }
  gt_free(ln_n_fac);
  gt_seqiterator_delete(queries);
  genericindex_delete(genericindexSubject);
  return had_err;
}
/*
 * Runner of the gdiffcalc tool.
 *
 * Steps:
 *  1. append the remaining command line arguments to arguments->filenames,
 *  2. load the encoded sequence named by arguments->indexname,
 *  3. create the unit info (and read the unit file if arguments->with_units),
 *  4. parse the table of shustring sums from the first file name,
 *  5. hand the table to gt_genomediff_calculate_div_from_avg().
 *
 * Every token delivered by the tokenizer is treated as one table row whose
 * fields are separated by ';'. Fields starting with '#' are only logged, all
 * other fields are parsed as doubles into shusums[row][column].
 *
 * Returns 0 on success and a non-zero value if any step failed.
 */
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq              *encseq = NULL;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    /* check the timer before it is used for the first time */
    gt_assert(timer);
    gt_timer_start(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;
    GtUword num_of_genomes;

    gt_assert(unit_info != NULL);
    num_of_genomes = (GtUword) unit_info->num_of_genomes;
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem;

      /* shusums has num_of_genomes rows; refuse to write behind the last */
      if (lcounter >= num_of_genomes) {
        gt_error_set(err, "table contains more than " GT_WU " rows",
                     num_of_genomes);
        had_err = -1;
        break;
      }
      elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL) {
        if (*elem != '#') {
          /* each row of shusums has num_of_genomes columns */
          if (zcounter >= num_of_genomes) {
            gt_error_set(err, "row " GT_WU " of table contains more than "
                         GT_WU " values", lcounter + 1, num_of_genomes);
            had_err = -1;
            break;
          }
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem);
        }
        elem = strtok(NULL, ";");
      }
      if (had_err)
        break;
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line " GT_WU, lcounter);
    }
    /* line is still referenced here if parsing stopped on an error */
    gt_str_delete(line);
    /* gt_tokenizer_new() takes ownership of table_file, hence deleting the
       tokenizer releases the underlying GtIO as well */
    gt_tokenizer_delete(tokenizer);
  }
  if (!had_err) {
    gt_assert(unit_info != NULL);
    /* the table must be square with one row/column per genome; zcounter
       holds the number of values of the last row */
    if (lcounter != unit_info->num_of_genomes ||
        zcounter != unit_info->num_of_genomes) {
      gt_error_set(err, "table with " GT_WU " rows and " GT_WU " values in "
                   "the last row does not match the number of genomes ("
                   GT_WU ")", lcounter, zcounter,
                   (GtUword) unit_info->num_of_genomes);
      had_err = -1;
    }
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    /* debug output only: report for each sequence the file and genome it
       belongs to and check that start positions are non-decreasing */
    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
  }
  /* release the table on success and on failure; the braces are kept since
     gt_array2dim_delete is a macro */
  if (shusums != NULL) {
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}
/*
 * Runner of the condenser search tool.
 *
 * If arguments->blastn or arguments->blastp is set, a two-stage BLAST search
 * is performed:
 *  1. coarse search of the queries against "<dbpath>.fas",
 *  2. extraction of all uniques hit in step 1 into the file
 *     "coarse_<basename of dbpath without suffix>.fas",
 *  3. fine search of the queries against that extracted file; every fine hit
 *     is written as one tab separated line to arguments->outfp.
 *
 * If arguments->ceval or arguments->feval is GT_UNDEF_DOUBLE, a raw E-value
 * is derived from arguments->bitscore and the average query length; the fine
 * E-value is then the raw value multiplied by the length of the extracted
 * database (unless arguments->feval is defined).
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath = NULL;
  GtStr* coarse_fname = NULL;
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  /* arguments is only dereferenced after it has been checked */
  querypath = gt_str_get(arguments->querypath);
  coarse_fname = gt_str_new_cstr("coarse_");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch              *match;
    GtMatchIterator      *mp = NULL;
    GtNREncseq           *nrencseq = NULL;
    GtStr                *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition          *hits;
    double                eval,
                          raw_eval = 0.0;
    GtUword               coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int                   curr_hits = 0,
                          max_hits = 100;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);

    gt_str_append_cstr(fastaname, ".fas");

    /* every slot owns its range so that the cleanup at the end can free all
       max_hits ranges, independent of the number of hits stored */
    for (i=0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /* load the compressed database */
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }
    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(gt_str_get(fastaname),
                                                          err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(gt_str_get(fastaname),
                                                          err);
    }

    /* perform coarse BLAST search and collect the uniques which were hit */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          /* header: up to 6 non-space characters followed by the index */
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            curr_hits++;
            if (curr_hits == max_hits) {
              /* table is full: add 100 slots and allocate their ranges */
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hits) * (size_t) max_hits);
              for (i=max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          } else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
          }
          /* dbseqid is obtained from match, so the match is released only
             after its last use, on the success and on the error path */
          gt_match_delete(match);
        } else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }
    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;
      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);
      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      if (coarse_hits == NULL) {
        /* output file could not be opened; do not extract anything */
        had_err = -1;
      } else {
        /* TODO DW do NOT extract complete uniques! these could be complete
           chromosomes!! just extract something around it? maybe +- max query
           length*/
        for (i = 0; i < curr_hits; i++) {
          gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                               hits[i].idx);
        }
        had_err =
          gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                             decomp,
                                                             &coarse_db_len,
                                                             err);
        /* the length is only expected to be set if extraction succeeded */
        gt_assert(had_err || coarse_db_len != 0);
      }
      gt_file_delete(coarse_hits);
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err =
          gt_condenser_search_create_nucl_blastdb(gt_str_get(coarse_fname),
                                                  err);
      else
        had_err =
          gt_condenser_search_create_prot_blastdb(gt_str_get(coarse_fname),
                                                  err);
    }
    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
          } else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);

    }
    if (!had_err)
      if (timer != NULL)
        gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i=0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}
/*
 * Run a depth first traversal (gt_depthfirstesa) over the enhanced suffix
 * array read via ssar, using the tyr_* callbacks and a freshly set up
 * TyrDfsstate.
 *
 * If storeindex is the empty string, occurrence counts are collected by
 * adddistpos2distribution and showfinalstatistics() is called afterwards.
 * Otherwise the mers are written by outputsortedstring2index to the file
 * storeindex+MERSUFFIX and, if storecounts is true, the large counts are
 * written to storeindex+COUNTSSUFFIX.
 *
 * Returns 0 on success and -1 on failure.
 */
static int enumeratelcpintervals(const char *inputindex,
                                 Sequentialsuffixarrayreader *ssar,
                                 const char *storeindex,
                                 bool storecounts,
                                 GtUword mersize,
                                 GtUword minocc,
                                 GtUword maxocc,
                                 bool performtest,
                                 GtLogger *logger,
                                 GtError *err)
{
  TyrDfsstate *dfs;
  unsigned int numofchars;
  bool failed = false,
       writeindex;

  gt_error_check(err);
  /* an empty storeindex means: no index files are written */
  writeindex = strlen(storeindex) > 0;

  /* set up the state passed through the traversal */
  dfs = gt_malloc(sizeof (*dfs));
  dfs->encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  dfs->readmode = gt_readmodeSequentialsuffixarrayreader(ssar);
  dfs->esrspace = gt_encseq_create_reader_with_readmode(dfs->encseq,
                                                        dfs->readmode,
                                                        0);
  numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(dfs->encseq));
  dfs->totallength = gt_encseq_total_length(dfs->encseq);
  dfs->mersize = mersize;
  dfs->minocc = minocc;
  dfs->maxocc = maxocc;
  dfs->storecounts = storecounts;
  dfs->performtest = performtest;
  dfs->countoutputmers = 0;
  dfs->merindexfpout = NULL;
  dfs->countsfilefpout = NULL;
  GT_INITARRAY(&dfs->occdistribution,Countwithpositions);
  GT_INITARRAY(&dfs->largecounts,Largecount);

  /* the byte buffer is only needed if mers are written to a file */
  if (writeindex)
  {
    dfs->sizeofbuffer = MERBYTES(mersize);
    dfs->bytebuffer = gt_malloc(sizeof *dfs->bytebuffer * dfs->sizeofbuffer);
  } else
  {
    dfs->sizeofbuffer = 0;
    dfs->bytebuffer = NULL;
  }

  /* buffer for the current mer and suftab access only in test mode */
  if (performtest)
  {
    dfs->currentmer = gt_malloc(sizeof *dfs->currentmer * dfs->mersize);
    dfs->suftab = gt_suftabSequentialsuffixarrayreader(ssar);
  } else
  {
    dfs->currentmer = NULL;
    dfs->suftab = NULL;
  }

  if (dfs->mersize > dfs->totallength)
  {
    gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed",
                 dfs->mersize,
                 dfs->totallength);
    failed = true;
  } else if (writeindex)
  {
    dfs->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX,
                                                 "wb",err);
    if (dfs->merindexfpout == NULL)
    {
      failed = true;
    } else if (dfs->storecounts)
    {
      dfs->countsfilefpout = gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,
                                                     "wb",err);
      if (dfs->countsfilefpout == NULL)
      {
        failed = true;
      }
    }
    dfs->processoccurrencecount = outputsortedstring2index;
  } else
  {
    dfs->processoccurrencecount = adddistpos2distribution;
  }

  if (!failed)
  {
    if (gt_depthfirstesa(ssar,
                         tyr_allocateDfsinfo,
                         tyr_freeDfsinfo,
                         tyr_processleafedge,
                         NULL,
                         tyr_processcompletenode,
                         tyr_assignleftmostleaf,
                         tyr_assignrightmostleaf,
                         (Dfsstate*) dfs,
                         logger,
                         err) != 0)
    {
      failed = true;
    }
    /* statistics are shown independent of the result of the traversal */
    if (!writeindex)
    {
      showfinalstatistics(dfs,inputindex,logger);
    }
  }

  if (!failed)
  {
    if (dfs->countsfilefpout != NULL)
    {
      gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU
                    " to file \"%s%s\"",
                    dfs->largecounts.nextfreeLargecount,
                    (GtUword) MAXSMALLMERCOUNT,
                    storeindex,
                    COUNTSSUFFIX);
      gt_xfwrite(dfs->largecounts.spaceLargecount, sizeof (Largecount),
                 (size_t) dfs->largecounts.nextfreeLargecount,
                 dfs->countsfilefpout);
    }
    gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"",
                  mersize,
                  dfs->countoutputmers);
    gt_logger_log(logger,"index size: %.2f megabytes\n",
                  GT_MEGABYTES(dfs->countoutputmers * dfs->sizeofbuffer +
                               sizeof (GtUword) * EXTRAINTEGERS));
    /* now out EXTRAINTEGERS integer values */
    if (dfs->merindexfpout != NULL)
    {
      outputbytewiseUlongvalue(dfs->merindexfpout,(GtUword) dfs->mersize);
      outputbytewiseUlongvalue(dfs->merindexfpout,(GtUword) numofchars);
    }
  }

  /* release files and memory, on success as well as on failure */
  gt_fa_xfclose(dfs->merindexfpout);
  gt_fa_xfclose(dfs->countsfilefpout);
  GT_FREEARRAY(&dfs->occdistribution,Countwithpositions);
  GT_FREEARRAY(&dfs->largecounts,Largecount);
  gt_free(dfs->currentmer);
  gt_free(dfs->bytebuffer);
  gt_encseq_reader_delete(dfs->esrspace);
  gt_free(dfs);
  return failed ? -1 : 0;
}
Exemple #19
0
/*
 * Extract the keys from the description table of the index <indexname>
 * (file with suffix GT_DESTABFILESUFFIX). One key is derived from each line
 * by desc2key(); all keys must be non-empty and of the same length.
 *
 * If <sortkeys> is false, the keys must appear in strictly increasing
 * lexicographic order; the key length (as one byte) followed by the
 * '\0'-terminated keys is written to the file with suffix
 * GT_KEYSTABFILESUFFIX.
 *
 * If <sortkeys> is true, the encoded sequence is loaded, the keys are
 * collected together with their line number, sorted by compareFixedkeys and
 * the sequences are written to stdout in that order using
 * giextract_encodedseq2fasta() with a line width of 60.
 *
 * Returns 0 on success; on failure -1 is returned and <err> is set.
 */
int gt_extractkeysfromdesfile(const char *indexname,
                              bool sortkeys,
                              GtLogger *logger,
                              GtError *err)
{
  FILE *fpin, *fpout = NULL;
  GtStr *line = NULL;
  const char *keyptr;
  unsigned long keylen, constantkeylen = 0, linenum;/* incorrectorder = 0;*/
  bool haserr = false, firstdesc = true;
  char *previouskey = NULL;
  Fixedsizekey *keytab = NULL, *keytabptr = NULL;
  GtEncseq *encseq = NULL;
  unsigned long numofentries = 0;
  const unsigned long linewidth = 60UL;

  fpin = gt_fa_fopen_with_suffix(indexname,GT_DESTABFILESUFFIX,"rb",err);
  if (fpin == NULL)
  {
    return -1;
  }
  if (!sortkeys)
  {
    fpout = gt_fa_fopen_with_suffix(indexname,GT_KEYSTABFILESUFFIX,"wb",err);
    if (fpout == NULL)
    {
      haserr = true;
    }
  }
  if (!haserr)
  {
    line = gt_str_new();
  }
  for (linenum = 0; !haserr && gt_str_read_next_line(line, fpin) != EOF;
       linenum++)
  {
    keyptr = desc2key(&keylen,gt_str_get(line),err);
    if (keyptr == NULL)
    {
      haserr = true;
      break;
    }
    if (keylen == 0)
    {
      gt_error_set(err,"key of length 0 in \"%s\" not expected",
                   gt_str_get(line));
      haserr = true;
      break;
    }
    if (firstdesc)
    {
      /* the first key determines the length all other keys must have */
      if (keylen > (unsigned long) CHAR_MAX)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                         "no key must be larger than %d",
                          (int) keylen,(int) keylen,keyptr,keylen,CHAR_MAX);
        haserr = true;
        break;
      }
      constantkeylen = keylen;
      previouskey = gt_malloc(sizeof (char) * (constantkeylen+1));
      firstdesc = false;
      if (!sortkeys)
      {
        gt_xfputc((char) constantkeylen,fpout);
      } else
      {
        GtEncseqLoader *el;
        if (constantkeylen > (unsigned long) MAXFIXEDKEYSIZE)
        {
          gt_error_set(err,"key \"%*.*s\" of length %lu not allowed; "
                           "no key must be larger than %d",
                            (int) keylen,(int) keylen,keyptr,keylen,
                            MAXFIXEDKEYSIZE);
          haserr = true;
          break;
        }
        el = gt_encseq_loader_new();
        gt_encseq_loader_set_logger(el, logger);
        encseq = gt_encseq_loader_load(el, indexname, err);
        gt_encseq_loader_delete(el);
        if (encseq == NULL)
        {
          haserr = true;
          break;
        }
        /* one table entry per sequence of the index */
        numofentries = gt_encseq_num_of_sequences(encseq);
        gt_assert(numofentries > 0);
        keytab = gt_malloc(sizeof (*keytab) * numofentries);
        keytabptr = keytab;
      }
    } else
    {
      if (constantkeylen != keylen)
      {
        gt_error_set(err,"key \"%*.*s\" of length %lu: all keys must be of "
                         "the same length which for all previously seen "
                         "headers is %lu",
                         (int) keylen,(int) keylen,keyptr,keylen,
                         constantkeylen);
        haserr = true;
        break;
      }
      gt_assert(previouskey != NULL);
      if (!sortkeys && strncmp(previouskey,keyptr,(size_t) constantkeylen) >= 0)
      {
        gt_error_set(err,"previous key \"%s\" is not lexicographically smaller "
                         "than current key \"%*.*s\"",
                         previouskey,(int) keylen,(int) keylen,keyptr);
        haserr = true;
        break;
      }
    }
    if (!sortkeys)
    {
      gt_xfwrite(keyptr,sizeof *keyptr,(size_t) keylen,fpout);
      gt_xfputc('\0',fpout);
    } else
    {
      gt_assert(keytabptr != NULL);
      /* keytab has numofentries slots and linenum keys are already stored:
         never write behind the end of the table */
      if (linenum >= numofentries)
      {
        gt_error_set(err,"description table of index \"%s\" contains more "
                         "keys than the index has sequences (%lu)",
                         indexname,numofentries);
        haserr = true;
        break;
      }
      strncpy(keytabptr->key,keyptr,(size_t) constantkeylen);
      keytabptr->key[constantkeylen] = '\0';
      keytabptr->seqnum = linenum;
      keytabptr++;
    }
    /* remember the key for the order check of the next line */
    strncpy(previouskey,keyptr,(size_t) constantkeylen);
    previouskey[constantkeylen] = '\0';
    gt_str_reset(line);
  }
  if (!haserr)
  {
    gt_logger_log(logger,"number of keys of length %lu = %lu",
                constantkeylen,linenum);
  }
  gt_str_delete(line);
  gt_fa_fclose(fpin);
  gt_fa_fclose(fpout);
  gt_free(previouskey);
  if (!haserr && sortkeys)
  {
    /* without error linenum is the number of keys stored in keytab; sorting
       and extraction require exactly one key per sequence */
    if (keytab == NULL)
    {
      gt_error_set(err,"no keys found in description table of index \"%s\"",
                   indexname);
      haserr = true;
    } else if (linenum != numofentries)
    {
      gt_error_set(err,"number of keys (%lu) in description table of index "
                       "\"%s\" differs from number of sequences (%lu)",
                       linenum,indexname,numofentries);
      haserr = true;
    }
  }
  if (!haserr && sortkeys)
  {
    gt_assert(keytabptr == keytab + numofentries);
    qsort(keytab,(size_t) numofentries,sizeof (*keytab),compareFixedkeys);
    for (keytabptr = keytab; !haserr && keytabptr < keytab + numofentries;
         keytabptr++)
    {
      if (giextract_encodedseq2fasta(stdout,
                                     encseq,
                                     keytabptr->seqnum,
                                     NULL,
                                     linewidth,
                                     err) != 0)
      {
        haserr = true;
        break;
      }
    }
  }
  if (encseq != NULL)
  {
    gt_encseq_delete(encseq);
    encseq = NULL;
  }
  gt_free(keytab);
  return haserr ? -1 : 0;
}
Exemple #20
0
/*
 * Load the encoded sequence of the index <indexname> into <suffixarray> and
 * make those tables available which are requested by the bits set in
 * <demand> (SARR_DESTAB, SARR_SDSTAB, SARR_SSPTAB, SARR_SUFTAB, SARR_LCPTAB,
 * SARR_BWTTAB, SARR_BCKTAB).
 *
 * If <map> is true, suftab, lcptab, llvtab and bwttab are memory mapped
 * (with a check of the file size); otherwise buffered streams are
 * initialised via INITBufferedfile. The bucket table is obtained by
 * gt_bcktab_map() in both cases.
 *
 * Returns 0 on success. On failure -1 is returned and gt_freesuffixarray()
 * is called for <suffixarray>.
 *
 * NOTE(review): INITBufferedfile is a macro defined elsewhere; the !haserr
 * tests following its uses suggest that it sets the local variable haserr
 * (and presumably uses err) on failure -- confirm before renaming locals.
 */
static int inputsuffixarray(bool map,
                            Suffixarray *suffixarray,
                            unsigned int demand,
                            const char *indexname,
                            GtLogger *logger,
                            GtError *err)
{
  bool haserr = false;
  GtEncseqLoader *el;
  GtUword totallength = 0;

  gt_error_check(err);
  initsuffixarray(suffixarray);
  /* configure the loader: each of the des, sds and ssp tables is required
     exactly if the corresponding bit is set in demand */
  el = gt_encseq_loader_new();
  if (!(demand & SARR_DESTAB))
    gt_encseq_loader_do_not_require_des_tab(el);
  else
    gt_encseq_loader_require_des_tab(el);
  if (!(demand & SARR_SDSTAB))
    gt_encseq_loader_do_not_require_sds_tab(el);
  else
    gt_encseq_loader_require_sds_tab(el);
  if (!(demand & SARR_SSPTAB))
    gt_encseq_loader_do_not_require_ssp_tab(el);
  else
    gt_encseq_loader_require_ssp_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  suffixarray->encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);
  if (suffixarray->encseq == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    /* a non-zero return value is stored as true in haserr */
    haserr = scanprjfileuintkeys(suffixarray,indexname,logger,err);
  }
  /* mirror the loaded sequence if the suffixarray is flagged as mirrored
     but the encseq is not mirrored yet */
  if (!haserr
        && suffixarray->mirroredencseq
        && !gt_encseq_is_mirrored(suffixarray->encseq))
  {
    if (gt_encseq_mirror(suffixarray->encseq, err) != 0)
      haserr = true;
  }
  if (!haserr)
  {
    totallength = gt_encseq_total_length(suffixarray->encseq);
  }
  /* suffix table */
  if (!haserr && (demand & SARR_SUFTAB))
  {
    if (map)
    {
      /* nothing is mapped if there are no sorted suffixes */
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->suftab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                       GT_SUFTABSUFFIX,
                                       suffixarray->numberofallsortedsuffixes,
                                       sizeof (*suffixarray->suftab),
                                       err);
        if (suffixarray->suftab == NULL)
        {
          haserr = true;
        }
      }
    } else
    {
#if defined (_LP64) || defined (_WIN64)
      off_t filesize = gt_file_size_with_suffix(indexname,GT_SUFTABSUFFIX);

      /* if the file size equals 4 bytes per sorted suffix, the table is
         read as uint32_t values, otherwise as GtUword values */
      if (filesize == (off_t) sizeof (uint32_t) *
                              suffixarray->numberofallsortedsuffixes)
      {
        gt_logger_log(logger,"read suftab in units of 4 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_uint32_t,uint32_t,
                         GT_SUFTABSUFFIX);
      } else
      {
        gt_logger_log(logger,"read suftab in units of 8 bytes");
        INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                         GT_SUFTABSUFFIX);
      }
#else
      /* NOTE(review): the message presumably relies on GtUword being 4 bytes
         wide in this configuration -- confirm */
      gt_logger_log(logger,"read suftab in units of 4 bytes");
      INITBufferedfile(indexname,&suffixarray->suftabstream_GtUword,GtUword,
                       GT_SUFTABSUFFIX);
#endif
    }
    /* the value longest must be defined whenever suftab is requested */
    if (!haserr && !suffixarray->longest.defined)
    {
      gt_error_set(err,"longest not defined");
      haserr = true;
    }
  }
  /* lcp table and, if there are large lcp values, the table storing them */
  if (!haserr && (demand & SARR_LCPTAB))
  {
    if (map)
    {
      if (suffixarray->numberofallsortedsuffixes > 0)
      {
        suffixarray->lcptab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                         GT_LCPTABSUFFIX,
                                         suffixarray->numberofallsortedsuffixes,
                                         sizeof (*suffixarray->lcptab),
                                         err);
        if (suffixarray->lcptab == NULL)
        {
          haserr = true;
        }
      }
    } else
    {
      INITBufferedfile(indexname,&suffixarray->lcptabstream,GtUchar,
                       GT_LCPTABSUFFIX);
      /* position the stream behind the first GtUchar of the file */
      if (!haserr &&
          fseek(suffixarray->lcptabstream.fp,
                (GtWord) sizeof (GtUchar),SEEK_SET))
      {
        gt_error_set(err,"fseek(esastream) failed: %s",strerror(errno));
        haserr = true;
      }
    }
    if (!haserr && !suffixarray->numoflargelcpvalues.defined)
    {
      gt_error_set(err,"numoflargelcpvalues not defined");
      haserr = true;
    }
    /* the table of large lcp values is only accessed if it is not empty */
    if (!haserr && suffixarray->numoflargelcpvalues.valueunsignedlong > 0)
    {
      if (map)
      {
        suffixarray->llvtab
          = gt_fa_mmap_check_size_with_suffix(indexname,
                                           GT_LARGELCPTABSUFFIX,
                                           (GtUword)
                                           suffixarray->numoflargelcpvalues.
                                           valueunsignedlong,
                                           sizeof (*suffixarray->llvtab),
                                           err);
        if (suffixarray->llvtab == NULL)
        {
          haserr = true;
        }
      } else
      {
        INITBufferedfile(indexname,&suffixarray->llvtabstream,Largelcpvalue,
                         GT_LARGELCPTABSUFFIX);
      }
    }
  }
  /* bwt table: totallength+1 entries are expected when mapping */
  if (!haserr && (demand & SARR_BWTTAB))
  {
    if (map)
    {
      suffixarray->bwttab
        = gt_fa_mmap_check_size_with_suffix(indexname,
                                         GT_BWTTABSUFFIX,
                                         totallength+1,
                                         sizeof (*suffixarray->bwttab),
                                         err);
      if (suffixarray->bwttab == NULL)
      {
        haserr = true;
      }
    } else
    {
      INITBufferedfile(indexname,&suffixarray->bwttabstream,GtUchar,
                       GT_BWTTABSUFFIX);
    }
  }
  /* bucket table: obtained via gt_bcktab_map() independent of map */
  if (!haserr && (demand & SARR_BCKTAB))
  {
    suffixarray->bcktab
      = gt_bcktab_map(indexname,
                      gt_encseq_alphabetnumofchars(suffixarray->encseq),
                      suffixarray->prefixlength,
                      totallength+1,
                      true,
                      err);
    if (suffixarray->bcktab == NULL)
    {
      haserr = true;
    }
  }
  /* on failure release everything acquired so far */
  if (haserr)
  {
    gt_freesuffixarray(suffixarray);
  }
  return haserr ? -1 : 0;
}
Exemple #21
0
/*
  Consistency test for the substring matching code.

  Loads the encoded sequence <indexname> (description, ssp and sds tables are
  not required) and performs <samples> rounds. In each round two substrings
  (database and query) are drawn with samplesubstring() into buffers of
  <substringlength> symbols; <substringlength> is capped at half of the total
  sequence length. Matches of length at least <minlength> are then computed
  twice, once with gt_sarrquerysubstringmatch() and once with
  sarrselfsubstringmatch(). Both result arrays are sorted and compared; if
  they differ, the matches and both sequences are printed to stdout and the
  process terminates with GT_EXIT_PROGRAMMING_ERROR.

  Returns 0 on success and -1 if loading the sequence or one of the matching
  runs failed. All per-round resources are released on every path, including
  the error paths.
*/
int gt_testmaxpairs(const char *indexname,
                    GtUword samples,
                    unsigned int minlength,
                    GtUword substringlength,
                    GtLogger *logger,
                    GtError *err)
{
  GtEncseq *encseq;
  GtUword totallength = 0, dblen, querylen;
  GtUchar *dbseq = NULL, *query = NULL;
  bool haserr = false;
  GtUword s;
  GtArray *tabmaxquerymatches;
  Maxmatchselfinfo maxmatchselfinfo;
  GtEncseqLoader *el;

  gt_logger_log(logger,"draw "GT_WU" samples",samples);

  el = gt_encseq_loader_new();
  gt_encseq_loader_do_not_require_des_tab(el);
  gt_encseq_loader_do_not_require_ssp_tab(el);
  gt_encseq_loader_do_not_require_sds_tab(el);
  gt_encseq_loader_set_logger(el, logger);
  encseq = gt_encseq_loader_load(el, indexname, err);
  gt_encseq_loader_delete(el);

  if (encseq == NULL)
  {
    haserr = true;
  } else
  {
    totallength = gt_encseq_total_length(encseq);
  }
  if (!haserr)
  {
    /* database and query are sampled from the same sequence, so neither may
       be longer than half of it */
    if (substringlength > totallength/2)
    {
      substringlength = totallength/2;
    }
    dbseq = gt_malloc(sizeof *dbseq * substringlength);
    query = gt_malloc(sizeof *query * substringlength);
  }
  /* the loop ends as soon as haserr is set; no break is used so that the
     cleanup at the end of each round is always reached */
  for (s=0; s<samples && !haserr; s++)
  {
    dblen = samplesubstring(dbseq,encseq,substringlength);
    querylen = samplesubstring(query,encseq,substringlength);
    gt_logger_log(logger,"run query match for dblen="GT_WU""
                         ",querylen= "GT_WU", minlength=%u",
                         dblen,
                         querylen,
                         minlength);
    tabmaxquerymatches = gt_array_new(sizeof (Substringmatch));
    if (gt_sarrquerysubstringmatch(dbseq,
                                   dblen,
                                   query,
                                   (GtUword) querylen,
                                   minlength,
                                   gt_encseq_alphabet(encseq),
                                   storemaxmatchquery,
                                   tabmaxquerymatches,
                                   logger,
                                   err) != 0)
    {
      haserr = true;
    } else
    {
      gt_logger_log(logger,"run self match for dblen="GT_WU""
                           ",querylen= "GT_WU", minlength=%u",
                           dblen,
                           querylen,
                           minlength);
      maxmatchselfinfo.results = gt_array_new(sizeof (Substringmatch));
      maxmatchselfinfo.dblen = dblen;
      maxmatchselfinfo.querylen = querylen;
      maxmatchselfinfo.querymarkpos
        = sequence2markpositions(&maxmatchselfinfo.numofquerysequences,
                                 query,querylen);
      if (sarrselfsubstringmatch(dbseq,
                                 dblen,
                                 query,
                                 (GtUword) querylen,
                                 minlength,
                                 gt_encseq_alphabet(encseq),
                                 storemaxmatchself,
                                 &maxmatchselfinfo,
                                 logger,
                                 err) != 0)
      {
        haserr = true;
      } else
      {
        /* both methods must report exactly the same set of matches */
        gt_array_sort(tabmaxquerymatches,orderSubstringmatch);
        gt_array_sort(maxmatchselfinfo.results,orderSubstringmatch);
        if (!gt_array_equal(tabmaxquerymatches,maxmatchselfinfo.results,
                            orderSubstringmatch))
        {
          const GtUword width = 60UL;
          printf("failure for query of length "GT_WU"\n",(GtUword) querylen);
          printf("querymatches\n");
          (void) gt_array_iterate(tabmaxquerymatches,showSubstringmatch,NULL,
                               err);
          printf("dbmatches\n");
          (void) gt_array_iterate(maxmatchselfinfo.results,showSubstringmatch,
                               NULL,err);
          gt_symbolstring2fasta(stdout,"dbseq",
                             gt_encseq_alphabet(encseq),
                             dbseq,
                             (GtUword) dblen,
                             width);
          gt_symbolstring2fasta(stdout,"queryseq",
                             gt_encseq_alphabet(encseq),
                             query,
                             (GtUword) querylen,
                             width);
          exit(GT_EXIT_PROGRAMMING_ERROR);
        }
        printf("# numberofmatches="GT_WU"\n",
               gt_array_size(tabmaxquerymatches));
      }
      /* released whether or not the self match succeeded */
      gt_free(maxmatchselfinfo.querymarkpos);
      gt_array_delete(maxmatchselfinfo.results);
    }
    /* released whether or not the query match succeeded */
    gt_array_delete(tabmaxquerymatches);
  }
  gt_free(dbseq);
  gt_free(query);
  gt_encseq_delete(encseq);
  encseq = NULL;
  return haserr ? -1 : 0;
}
/*
  Copies the sequence IDs (the prefix of each description of <orig_es> whose
  length is reported by condenseq_idlen()) into <condenseq>->orig_ids and
  decides how they are addressed:

  - constant length: every ID occupies <maxlen> bytes (the buffer is
    zero-initialised by gt_calloc(), shorter IDs are followed by zero bytes);
    <condenseq>->id_len is set to <maxlen>.
  - sdstab: IDs are stored back to back and the running end offset of each ID
    is added to <condenseq>->sdstab.

  Constant length is chosen when the padding it wastes is smaller than the
  size gt_intset_best_memory_size() reports for the offset table. The
  decision and both sizes are reported via <logger>.

  The padding and the packed size are computed in closed form from the sum
  and the maximum of the ID lengths, so no length histogram is required and
  an input without sequences does not lead to an out-of-bounds access.
*/
static void condenseq_process_descriptions(GtCondenseq *condenseq,
                                           const GtEncseq *orig_es,
                                           GtLogger *logger)
{
  const char *desc;
  char       *cur_id_startptr;
  GtUword     desclen,
              idlen,
              idx,
              maxendidx = 0,
              maxlen = 0,
              wastedmem,
              sdssize,
              cur_total_id_len = 0;
  bool        use_const_len;

  /* first pass: determine sum (maxendidx) and maximum of all ID lengths */
  for (idx = 0; idx < condenseq->orig_num_seq; ++idx) {
    desc = gt_encseq_description(orig_es, &desclen, idx);
    idlen = condenseq_idlen(desc, desclen);
    if (idlen > maxlen)
      maxlen = idlen;
    maxendidx += idlen;
  }

  /* memory we would waste if we assume equal length: each ID is padded to
     maxlen, i.e. sum over all IDs of (maxlen - idlen) */
  wastedmem = maxlen * condenseq->orig_num_seq - maxendidx;
  /* size if we store the actual IDs back to back */
  condenseq->ids_total_len = maxendidx;

  sdssize = (GtUword) gt_intset_best_memory_size(maxendidx,
                                                 condenseq->orig_num_seq);
  use_const_len = wastedmem < sdssize;

  if (use_const_len) {
    gt_logger_log(logger, "Condenseq descriptions will use const len, " GT_WU
                  ", \"wasting\" " GT_WU " bytes. SDS would use "
                  GT_WU " bytes",
                  maxlen, wastedmem, sdssize);
    condenseq->id_len = maxlen;
    condenseq->ids_total_len = maxlen * condenseq->orig_num_seq;
  }
  else {
    gt_logger_log(logger, "Condenseq descriptions will use sdstab with size "
                  GT_WU ". Const length would have wasted " GT_WU " bytes.",
                  sdssize, wastedmem);
    condenseq->sdstab = gt_intset_best_new(maxendidx, condenseq->orig_num_seq);
  }
  condenseq->orig_ids = gt_calloc((size_t) condenseq->ids_total_len,
                                  sizeof (*condenseq->orig_ids));

  /* second pass: copy the IDs into the buffer */
  cur_id_startptr = condenseq->orig_ids;
  for (idx = 0; idx < condenseq->orig_num_seq; ++idx) {
    desc = gt_encseq_description(orig_es, &desclen, idx);
    idlen = condenseq_idlen(desc, desclen);
    gt_assert(idlen <= maxlen);
    (void) memcpy(cur_id_startptr, desc, (size_t) idlen);
    if (use_const_len) {
      cur_id_startptr += maxlen;
      cur_total_id_len += maxlen;
    }
    else {
      cur_id_startptr += idlen;
      cur_total_id_len += idlen;
      /* the offset table stores the end position of each ID */
      gt_intset_add(condenseq->sdstab, cur_total_id_len);
    }
  }
  gt_assert(cur_total_id_len == condenseq->ids_total_len);
}
/*
  Tool runner of "gt readjoiner assembly".

  Unless <arguments>->paths2seq is set, the readset is loaded, the string
  graph is either built from the SPM files or loaded from disk
  (<arguments>->load), optionally transitively reduced (<arguments>->redtrans),
  optionally cleaned (<arguments>->errors), optionally saved
  (<arguments>->save) and finally traversed to write the contig paths.
  Afterwards, in every case, the contig paths are converted to sequences by
  gt_readjoiner_assembly_paths2seq().

  <tool_arguments> must point to a GtReadjoinerAssemblyArguments object.
  Returns 0 on success, a non-zero value otherwise.
*/
static int gt_readjoiner_assembly_runner(GT_UNUSED int argc,
    GT_UNUSED const char **argv, GT_UNUSED int parsed_args,
    void *tool_arguments, GtError *err)
{
  GtReadjoinerAssemblyArguments *arguments = tool_arguments;
  GtLogger *verbose_logger, *default_logger;
  GtEncseqLoader *el = NULL;
  GtEncseq *reads = NULL;
  GtTimer *timer = NULL;
  GtStrgraph *strgraph = NULL;
  GtBitsequence *contained = NULL;
  const char *readset;
  bool eqlen = true;
  GtUword nreads = 0, tlen, rlen = 0;
  int had_err = 0;

  gt_assert(arguments);
  gt_error_check(err);
  /* only dereference arguments after it has been checked */
  readset = gt_str_get(arguments->readset);
  default_logger = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX,
      stdout);
  gt_logger_log(default_logger,
      "gt readjoiner assembly (version "GT_READJOINER_VERSION")");
  verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX,
      stdout);
  gt_logger_log(verbose_logger, "verbose output activated");
  gt_logger_log(verbose_logger, "readset name = %s", readset);
  if (gt_showtime_enabled())
  {
    timer = gt_timer_new_with_progress_description(
        GT_READJOINER_ASSEMBLY_MSG_COUNTSPM);
    gt_timer_start(timer);
    gt_timer_show_cpu_time_by_progress(timer);
  }

  if (!arguments->paths2seq)
  {
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    reads = gt_encseq_loader_load(el, readset, err);
    if (reads == NULL)
    {
      had_err = -1;
    }
    if (had_err == 0)
    {
      eqlen = gt_encseq_accesstype_get(reads) == GT_ACCESS_TYPE_EQUALLENGTH;
      nreads = gt_encseq_num_of_sequences(reads);
      gt_logger_log(default_logger, "number of reads in filtered readset = "
                    GT_WU, nreads);
      tlen = gt_encseq_total_length(reads) - nreads + 1;
      gt_logger_log(verbose_logger, "total length of filtered readset = " GT_WU,
          tlen);

      if (eqlen)
      {
        /* with equal-length reads only the read length is needed from here
           on, so the encoded sequence is released immediately */
        rlen = gt_encseq_seqlength(reads, 0);
        gt_logger_log(verbose_logger, "read length = " GT_WU, rlen);
        gt_encseq_delete(reads);
        reads = NULL;
      }
      else
      {
        had_err = gt_readjoiner_assembly_build_contained_reads_list(
          arguments, &contained, err);
        rlen = 0;
        gt_logger_log(verbose_logger, "read length = variable");
        gt_assert(reads != NULL);
      }
    }

    if (had_err == 0)
    {
      if (!arguments->load)
      {
        had_err = gt_readjoiner_assembly_build_graph(arguments, &strgraph,
            reads, readset, eqlen, rlen, nreads, contained, default_logger,
            verbose_logger, timer, err);
      }
      else
      {
        gt_readjoiner_assembly_load_graph(&strgraph, reads, readset, rlen,
            default_logger, timer);
      }
    }

    /* variable-length reads are kept beyond this point only if the graph
       cleaning step is requested */
    if (!eqlen && reads != NULL && !arguments->errors)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
        gt_strgraph_set_encseq(strgraph, NULL);
    }

    if (had_err == 0 && arguments->redtrans)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_REDTRANS,
            stdout);
      gt_strgraph_sort_edges_by_len(strgraph, false);
      (void)gt_strgraph_redtrans(strgraph, false);
      (void)gt_strgraph_redself(strgraph, false);
      (void)gt_strgraph_redwithrc(strgraph, false);
      gt_strgraph_log_stats(strgraph, verbose_logger);
    }

    if (had_err == 0 && arguments->errors)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_CLEANSG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_CLEANSG);
      had_err = gt_readjoiner_assembly_error_correction(strgraph,
          arguments->bubble, arguments->deadend, arguments->deadend_depth,
          verbose_logger);
    }

    if (had_err == 0 && arguments->save)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_SAVESG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_SAVESG);
      gt_strgraph_show(strgraph, GT_STRGRAPH_BIN,
          gt_str_get(arguments->readset), GT_READJOINER_SUFFIX_SG, false);
    }

    /* release the reads if they were kept for the cleaning step */
    if (!eqlen && reads != NULL)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
        gt_strgraph_set_encseq(strgraph, NULL);
    }

    if (had_err == 0)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG);
      gt_readjoiner_assembly_show_current_space("(before traversal)");
      gt_strgraph_spell(strgraph, (GtUword)arguments->depthcutoff,
          (GtUword)arguments->lengthcutoff, arguments->vd, readset,
          GT_READJOINER_SUFFIX_CONTIG_PATHS, NULL, true,
          arguments->show_contigs_info, false, verbose_logger);
    }

    if (contained != NULL)
      gt_free(contained);
    gt_strgraph_delete(strgraph);
    strgraph = NULL;
    gt_assert(reads == NULL);
    gt_encseq_loader_delete(el);
  }

  if (had_err == 0)
  {
    gt_readjoiner_assembly_show_current_space("(before paths2seq)");
    had_err = gt_readjoiner_assembly_paths2seq(readset,
        (GtUword)arguments->lengthcutoff, arguments->vd,
        arguments->astat, arguments->coverage, arguments->copynum,
        arguments->buffersize, default_logger, &timer, err);
  }

  if (gt_showtime_enabled())
  {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(default_logger);
  gt_logger_delete(verbose_logger);
  return had_err;
}
/*
  Reads a condenseq data structure from disk.

  Loads the encoded sequence <indexname> holding the unique sequence parts,
  then reads the condenseq file <indexname> + GT_CONDENSEQ_FILE_SUFFIX via
  condenseq_io(). Afterwards the index of every link element is stored in the
  links array of the unique element it refers to. The way IDs are accessed
  (constant length or sdstab) is reported via <logger>.

  Returns the new object, or NULL on failure. The encseq loader and the
  opened file are released on all paths, including failures.
*/
GtCondenseq *gt_condenseq_new_from_file(const char *indexname,
                                        GtLogger *logger, GtError *err)
{
  int had_err = 0;
  FILE* fp;
  GtEncseqLoader *esl;
  GtEncseq *unique_es;
  GtCondenseq *condenseq = NULL;
  /*load unique_es*/
  esl = gt_encseq_loader_new();
  unique_es = gt_encseq_loader_load(esl, indexname, err);
  /* the loader is no longer needed, whether or not loading succeeded */
  gt_encseq_loader_delete(esl);
  if (!unique_es)
    had_err = -1;
  if (!had_err) {
    condenseq = condenseq_new_empty(gt_encseq_alphabet(unique_es));
    condenseq->filename = gt_cstr_dup(indexname);
    condenseq->unique_es = unique_es;
    fp = gt_fa_fopen_with_suffix(indexname, GT_CONDENSEQ_FILE_SUFFIX,
                                 "rb", err);
    if (fp == NULL) {
      had_err = -1;
    }
    else {
      had_err = condenseq_io(condenseq, fp, gt_io_error_fread, err);
      /* close the file even if reading failed */
      gt_fa_fclose(fp);
      if (!had_err) {
        GtUword i;
        gt_assert(condenseq->uniques);
        gt_assert(condenseq->links);
        /*create link array for each unique entry*/
        for (i = 0; i < condenseq->udb_nelems; i++) {
          GT_INITARRAY(&(condenseq->uniques[i].links),uint32_t);
        }
        /* link indices are stored as uint32_t below, so the number of links
           must not exceed the largest uint32_t value */
        if (condenseq->ldb_nelems > (GtUword) ((uint32_t) 0 - (uint32_t) 1)) {
          gt_error_set(err, "Overflow, to many link-elements. Can't be stored");
          had_err = -1;
        }
        /* iterate through link entries and store their indices in the
           corresponding unique entry array */
        for (i = 0; !had_err && (GtUword) i < condenseq->ldb_nelems; i++) {
          GtUword uid = condenseq->links[i].unique_id;
          gt_assert(uid < condenseq->udb_nelems);
          GT_STOREINARRAY(&(condenseq->uniques[uid].links),
                          uint32_t,
                          10,
                          (uint32_t) i);
        }
      }
    }
  }
  if (!had_err) {
    gt_assert(condenseq != NULL);
    if (condenseq->id_len != GT_UNDEF_UWORD)
      gt_logger_log(logger, "IDs const len: " GT_WU, condenseq->id_len);
    else
      gt_logger_log(logger, "using sdstab to access IDs");
  }
  if (had_err) {
    gt_condenseq_delete(condenseq);
    condenseq = NULL;
  }
  return (condenseq);
}
/*
  Tool runner of the condenseq compression tool.

  Loads the encoded sequence named by argv[parsed_args], derives every
  parameter the user left undefined, validates the parameter combination and
  runs a GtCondenseqCreator on the input. If <arguments>->indexname is empty
  it is set to the basename of the input.

  Derivation of undefined parameters:
  - minalignlength defaults to initsize / 3 (if initsize is defined),
  - windowsize defaults to minalignlength / 5 (if defined), minimum 4,
  - kmersize defaults to the rounded logarithm of 100000 to the base of the
    alphabet size,
  - whatever is still undefined afterwards is derived bottom-up:
    windowsize = 5 * kmersize, minalignlength = 3 * windowsize,
    initsize = 3 * minalignlength.

  If <arguments>->kdb is set, the kdb logger writes to the file "kmer_db.out";
  failing to open that file is reported as an error.

  Returns 0 on success, -1 on failure (with <err> set).
*/
static int gt_condenseq_compress_runner(GT_UNUSED int argc, const char **argv,
                                        int parsed_args, void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    if (kmer_fp == NULL)
      had_err = -1; /* do not install a NULL stream as logger target */
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  if (gt_str_length(arguments->indexname) == 0UL) {
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* top-down: derive smaller parameters from larger ones, if available */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                  arguments->initsize / (GtUword) 3UL :
                                  GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                              (unsigned int) (arguments->minalignlength / 5U) :
                              GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u",
                    size, arguments->kmersize);
    }

    /* bottom-up: derive whatever is still undefined from kmersize */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }
  if (!had_err &&
      arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c = NULL;

    ces_c = gt_condenseq_creator_new(arguments->initsize,
                                     arguments->minalignlength,
                                     arguments->xdrop,
                                     &(arguments->scores),
                                     arguments->kmersize,
                                     arguments->windowsize,
                                     logger,
                                     err);
    if (ces_c == NULL)
      had_err = -1;

    if (!had_err) {
      /* an undefined cutoff selects the mean cutoff, 0 disables it */
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      /* NOTE(review): a set prune flag disables pruning; presumably the flag
         stores a "disable" option -- confirm against the option parser */
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* only close the kmer file if it was actually opened */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}