static int gt_compreads_compress_runner(GT_UNUSED int argc,
                                    GT_UNUSED const char **argv,
                                    GT_UNUSED int parsed_args,
                                    void *tool_arguments, GtError *err)
{
  GtCsrHcrEncodeArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha = NULL;
  GtHcrEncoder *hcre = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);
  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (gt_str_length(arguments->smap) > 0) {
    alpha = gt_alphabet_new_from_file_no_suffix(gt_str_get(arguments->smap),
                                                err);
    if (!alpha)
      had_err = 1;
  }
  else {
    alpha = gt_alphabet_new_dna();
    if (!alpha)
      had_err = 1;
  }
  if (!had_err) {
    if (timer != NULL)
      gt_timer_show_progress(timer, "encoding", stdout);
    hcre = gt_hcr_encoder_new(arguments->files, alpha, arguments->descs,
                              arguments->qrng, timer, err);
    if (!hcre)
      had_err = 1;
    else {
      if (arguments->pagewise)
        gt_hcr_encoder_set_sampling_page(hcre);
      else if (arguments->regular)
        gt_hcr_encoder_set_sampling_regular(hcre);

      gt_hcr_encoder_set_sampling_rate(hcre, arguments->srate);

      if (gt_hcr_encoder_encode(hcre, gt_str_get(arguments->name),
                                timer, err) != 0)
        had_err = 1;
    }
    gt_hcr_encoder_delete(hcre);
  }
  gt_alphabet_delete(alpha);
  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  return had_err;
}
static int gt_compreads_decompress_benchmark(GtHcrDecoder *hcrd,
                                             unsigned long amount,
                                             GtTimer *timer,
                                             GtError *err) {
  char qual[BUFSIZ] = {0},
       seq[BUFSIZ] = {0};
  int had_err = 0;
  unsigned long rand,
                max_rand = gt_hcr_decoder_num_of_reads(hcrd) - 1,
                count;

  GtStr *timer_comment = gt_str_new_cstr("extracting ");
  GtStr *desc = gt_str_new();

  gt_str_append_ulong(timer_comment, amount);
  gt_str_append_cstr(timer_comment, " reads of ");
  gt_str_append_ulong(timer_comment, max_rand + 1);
  gt_str_append_cstr(timer_comment, "!");

  if (timer == NULL) {
    timer = gt_timer_new_with_progress_description("extract random reads");
    gt_timer_start(timer);
  }
  else {
    gt_timer_show_progress(timer, "extract random reads", stdout);
  }

  gt_log_log("%s",gt_str_get(timer_comment));
  for (count = 0; count < amount; count++) {
    if (!had_err) {
      rand = gt_rand_max(max_rand);
      gt_log_log("get read: %lu", rand);
      had_err = gt_hcr_decoder_decode(hcrd, rand, seq, qual, desc, err);
      gt_log_log("%s",gt_str_get(desc));
      gt_log_log("%s",seq);
      gt_log_log("%s",qual);
    }
  }
  gt_str_delete(timer_comment);
  gt_str_delete(desc);
  if (!gt_showtime_enabled())
    gt_timer_delete(timer);
  return had_err;
}
Пример #3
0
static int gt_seed_extend_runner(GT_UNUSED int argc,
                                 GT_UNUSED const char **argv,
                                 GT_UNUSED int parsed_args,
                                 void *tool_arguments,
                                 GtError *err)
{
  GtSeedExtendArguments *arguments = tool_arguments;
  GtEncseqLoader *encseq_loader = NULL;
  GtEncseq *aencseq = NULL, *bencseq = NULL;
  GtGreedyextendmatchinfo *grextinfo = NULL;
  GtXdropmatchinfo *xdropinfo = NULL;
  GtQuerymatchoutoptions *querymatchoutopt = NULL;
  GtTimer *seedextendtimer = NULL;
  GtExtendCharAccess cam = GT_EXTEND_CHAR_ACCESS_ANY;
  GtUword errorpercentage = 0UL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments != NULL);
  gt_assert(arguments->se_minidentity >= GT_EXTEND_MIN_IDENTITY_PERCENTAGE &&
            arguments->se_minidentity <= 100UL);

  /* Calculate error percentage from minidentity */
  errorpercentage = 100UL - arguments->se_minidentity;

  /* Measure whole running time */
  if (arguments->benchmark || arguments->verbose) {
    gt_showtime_enable();
  }
  if (gt_showtime_enabled())
  {
    seedextendtimer = gt_timer_new();
    gt_timer_start(seedextendtimer);
  }

  /* Load encseq A */
  encseq_loader = gt_encseq_loader_new();
  gt_encseq_loader_enable_autosupport(encseq_loader);
  aencseq = gt_encseq_loader_load(encseq_loader,
                                  gt_str_get(arguments->dbs_indexname),
                                  err);
  if (aencseq == NULL)
    had_err = -1;

  /* If there is a 2nd read set: Load encseq B */
  if (!had_err) {
    if (strcmp(gt_str_get(arguments->dbs_queryname), "") != 0) {
      bencseq = gt_encseq_loader_load(encseq_loader,
                                      gt_str_get(arguments->dbs_queryname),
                                      err);
    } else {
      bencseq = gt_encseq_ref(aencseq);
    }
    if (bencseq == NULL) {
      had_err = -1;
      gt_encseq_delete(aencseq);
    }
  }
  gt_encseq_loader_delete(encseq_loader);

  /* set character access method */
  if (!had_err && (gt_option_is_set(arguments->se_option_greedy) ||
                   gt_option_is_set(arguments->se_option_xdrop) ||
                   arguments->se_alignmentwidth > 0))
  {
    cam = gt_greedy_extend_char_access(gt_str_get
                                       (arguments->se_char_access_mode),
                                       err);
    if ((int) cam == -1) {
      had_err = -1;
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Use bias dependent parameters, adapted from E. Myers' DALIGNER */
  if (!had_err && arguments->bias_parameters) {
    const GtAlphabet *alpha = gt_encseq_alphabet(aencseq);
    const double bias_factor[10] = {.690, .690, .690, .690, .780,
                                    .850, .900, .933, .966, 1.000};

    if (gt_alphabet_is_dna(alpha)) {
      GtUword at, cg;
      at = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'a'));
      at += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 't'));
      cg = gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'c'));
      cg += gt_encseq_charcount(aencseq, gt_alphabet_encode(alpha, 'g'));
      if (at + cg > 0) {
        const double ratio = (double)MIN(at, cg) / (at + cg);
        int bias_index = (int)MAX(0.0, (ratio + 0.025) * 20.0 - 1.0);
        gt_assert(bias_index < 10);
        arguments->se_maxalilendiff = 30;
        arguments->se_perc_match_hist = (GtUword)(100.0 - errorpercentage *
                                                  bias_factor[bias_index]);
        if (arguments->verbose) {
          printf("# Base ratio = %4.2lf -> percmathistory = "GT_WU"\n",
                 ratio, arguments->se_perc_match_hist);
        }
      } else {
        had_err = -1;
      }
    } else {
      had_err = -1;
    }
    if (had_err) {
      gt_error_set(err, "option \"-bias-parameters\" can only be applied to "
                   "the DNA alphabet");
      gt_encseq_delete(aencseq);
      gt_encseq_delete(bencseq);
    }
  }

  /* Prepare options for greedy extension */
  if (!had_err && gt_option_is_set(arguments->se_option_greedy)) {
    grextinfo = gt_greedy_extend_matchinfo_new(errorpercentage,
                                               arguments->se_maxalilendiff,
                                               arguments->se_historysize,
                                               arguments->se_perc_match_hist,
                                               arguments->se_alignlength,
                                               cam,
                                               arguments->se_extendgreedy);
    if (arguments->benchmark) {
      gt_greedy_extend_matchinfo_silent_set(grextinfo);
    }
  }

  /* Prepare options for xdrop extension */
  if (!had_err && gt_option_is_set(arguments->se_option_xdrop)) {
    xdropinfo = gt_xdrop_matchinfo_new(arguments->se_alignlength,
                                       errorpercentage,
                                       arguments->se_xdropbelowscore,
                                       arguments->se_extendxdrop);
    if (arguments->benchmark) {
      gt_xdrop_matchinfo_silent_set(xdropinfo);
    }
  }

  /* Prepare output options */
  if (!had_err && (arguments->se_alignmentwidth > 0 ||
                   gt_option_is_set(arguments->se_option_xdrop)))
  {
    querymatchoutopt
      = gt_querymatchoutoptions_new(arguments->se_alignmentwidth);

    if (gt_option_is_set(arguments->se_option_xdrop) ||
        gt_option_is_set(arguments->se_option_greedy))
    {
      const GtUword sensitivity = gt_option_is_set(arguments->se_option_greedy)
                                    ? arguments->se_extendgreedy : 100;

      gt_querymatchoutoptions_extend(querymatchoutopt,
                                     errorpercentage,
                                     arguments->se_maxalilendiff,
                                     arguments->se_historysize,
                                     arguments->se_perc_match_hist,
                                     cam,
                                     sensitivity);
    }
  }

  /* Start algorithm */
  if (!had_err) {
    GtDiagbandseed dbsarguments;
    dbsarguments.errorpercentage = errorpercentage;
    dbsarguments.userdefinedleastlength = arguments->se_alignlength;
    dbsarguments.seedlength = arguments->dbs_seedlength;
    dbsarguments.logdiagbandwidth = arguments->dbs_logdiagbandwidth;
    dbsarguments.mincoverage = arguments->dbs_mincoverage;
    dbsarguments.maxfreq = arguments->dbs_maxfreq;
    dbsarguments.memlimit = arguments->dbs_memlimit;
    dbsarguments.mirror = arguments->mirror;
    dbsarguments.overlappingseeds = arguments->overlappingseeds;
    dbsarguments.verify = arguments->dbs_verify;
    dbsarguments.verbose = arguments->verbose;
    dbsarguments.debug_kmer = arguments->dbs_debug_kmer;
    dbsarguments.debug_seedpair = arguments->dbs_debug_seedpair;
    dbsarguments.seed_display = arguments->seed_display;
    dbsarguments.extendgreedyinfo = grextinfo;
    dbsarguments.extendxdropinfo = xdropinfo;
    dbsarguments.querymatchoutopt = querymatchoutopt;

    had_err = gt_diagbandseed_run(aencseq, bencseq, &dbsarguments, err);

    /* clean up */
    gt_encseq_delete(aencseq);
    gt_encseq_delete(bencseq);
    if (gt_option_is_set(arguments->se_option_greedy)) {
      gt_greedy_extend_matchinfo_delete(grextinfo);
    }
    if (gt_option_is_set(arguments->se_option_xdrop)) {
      gt_xdrop_matchinfo_delete(xdropinfo);
    }
    if (arguments->se_alignmentwidth > 0 ||
        gt_option_is_set(arguments->se_option_xdrop)) {
      gt_querymatchoutoptions_delete(querymatchoutopt);
    }
  }

  if (gt_showtime_enabled()) {
    if (!had_err) {
      char *keystring
        = gt_seed_extend_params_keystring(gt_option_is_set(arguments->
                                                           se_option_greedy),
                                          gt_option_is_set(arguments->
                                                           se_option_xdrop),
                                          arguments->dbs_seedlength,
                                          arguments->se_alignlength,
                                          arguments->se_minidentity,
                                          arguments->se_maxalilendiff,
                                          arguments->se_perc_match_hist,
                                          arguments->se_extendgreedy,
                                          arguments->se_extendxdrop,
                                          arguments->se_xdropbelowscore);
      printf("# TIME seedextend-%s", keystring);
      gt_free(keystring);
      gt_timer_show_formatted(seedextendtimer,
                              " overall " GT_WD ".%06ld\n",
                              stdout);
    }
    gt_timer_delete(seedextendtimer);
  }
  return had_err;
}
Пример #4
0
static int gt_genomediff_runner(int argc, const char **argv,
                                int parsed_args, void *tool_arguments,
                                GtError *err)
{
    bool mirrored = false;
    int had_err = 0,
        i;
    GtEncseq              *encseq = NULL;
    GtGenomediffArguments *arguments = tool_arguments;
    GtLogger              *logger;
    GtShuUnitFileInfo     *unit_info = NULL;
    GtTimer               *timer = NULL;

    gt_error_check(err);
    gt_assert(arguments);

    logger = gt_logger_new(arguments->verbose,
                           GT_LOGGER_DEFLT_PREFIX,
                           stdout);
    gt_assert(logger);

    for (i = parsed_args; i < argc; i++) {
        gt_str_array_add_cstr(arguments->filenames, argv[i]);
    }

    if (gt_showtime_enabled()) {
        timer = gt_timer_new_with_progress_description("start");
        gt_timer_start(timer);
        gt_assert(timer);
    }

    if (arguments->with_units) {
        gt_logger_log(logger, "unitfile option set, filename is %s\n",
                      gt_str_get(arguments->unitfile));
    }

    if (timer != NULL)
        gt_timer_show_progress(timer, "start shu search", stdout);

    if (gt_str_array_size(arguments->filenames) > 1UL) {
        GtEncseqEncoder *ee = gt_encseq_encoder_new();
        gt_encseq_encoder_set_timer(ee, timer);
        gt_encseq_encoder_set_logger(ee, logger);
        /* kr only makes sense for dna, so we can check this already with ee */
        gt_encseq_encoder_set_input_dna(ee);
        had_err = gt_encseq_encoder_encode(ee, arguments->filenames,
                                           gt_str_get(arguments->indexname), err);
        gt_encseq_encoder_delete(ee);
    }
    else {
        gt_str_append_str(arguments->indexname,
                          gt_str_array_get_str(arguments->filenames, 0));
        if (arguments->with_esa || arguments->with_pck) {
            GtStr *current_line = gt_str_new();
            FILE *prj_fp;
            const char *buffer;
            char **elements = NULL;

            prj_fp = gt_fa_fopen_with_suffix(gt_str_get(arguments->indexname),
                                             GT_PROJECTFILESUFFIX,"rb",err);
            if (prj_fp == NULL)
                had_err = -1;
            while (!had_err && gt_str_read_next_line(current_line, prj_fp) != EOF) {
                buffer = gt_str_get(current_line);
                if (elements != NULL) {
                    gt_free(elements[0]);
                    gt_free(elements[1]);
                }
                gt_free(elements);
                elements = gt_cstr_split(buffer, '=');
                gt_log_log("%s", elements[0]);
                if (strcmp("mirrored", elements[0]) == 0) {
                    gt_log_log("%s", elements[1]);
                    if (strcmp("1", elements[1]) == 0) {
                        mirrored = true;
                        gt_log_log("sequences are treated as mirrored");
                    }
                }
                gt_str_reset(current_line);
            }
            gt_str_delete(current_line);
            if (elements != NULL) {
                gt_free(elements[0]);
                gt_free(elements[1]);
            }
            gt_free(elements);
            gt_fa_xfclose(prj_fp);
        }
    }

    if (!had_err) {
        GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                             err);
        if (mirrored)
            gt_encseq_loader_mirror(el);
        encseq =
            gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
        gt_encseq_loader_delete(el);
    }
    if (encseq == NULL)
        had_err = -1;
    if (!had_err) {
        unit_info = gt_shu_unit_info_new(encseq);
        if (arguments->with_units)
            had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                                 logger, err);
    }

    if (!had_err) {
        uint64_t **shusums = NULL;
        if (arguments->with_esa || arguments->with_pck) {
            shusums = gt_genomediff_shulen_sum(arguments, unit_info,
                                               logger, timer, err);
            if (shusums == NULL)
                had_err = -1;
        }
        else {
            const bool doesa = true;
            GenomediffInfo gd_info;
            Suffixeratoroptions sopts;
            sopts.beverbose = arguments->verbose;
            sopts.indexname = arguments->indexname;
            sopts.db = NULL;
            sopts.encopts = NULL;
            sopts.genomediff = true;
            sopts.inputindex = arguments->indexname;
            sopts.loadopts = arguments->loadopts;
            sopts.showprogress = false;
            sopts.idxopts = arguments->idxopts;

            gt_assert(unit_info != NULL);
            gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                                unit_info->num_of_genomes);
            gd_info.shulensums = shusums;
            gd_info.unit_info = unit_info;
            had_err = gt_runsuffixerator(doesa, &sopts, &gd_info, logger, err);
        }
        if (!had_err && shusums != NULL) {
            had_err = gt_genomediff_kr_calc(shusums, arguments, unit_info,
                                            arguments->with_pck, logger, timer, err);
            gt_array2dim_delete(shusums);
        }
    }

    if (timer != NULL) {
        gt_timer_show_progress_final(timer, stdout);
        gt_timer_delete(timer);
    }
    gt_logger_delete(logger);
    gt_encseq_delete(encseq);
    gt_shu_unit_info_delete(unit_info);

    return had_err;
}
Пример #5
0
static int gt_speck_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  GtNodeStream *gff3_in_stream = NULL, *checker_stream = NULL,
               *a_in_stream = NULL, *a_out_stream = NULL,
               *feature_stream = NULL, *sort_stream = NULL,
               *last_stream = NULL;
  GtNodeVisitor *spec_visitor = NULL;
  GtSpecResults *res = NULL;
  GtFeatureIndex *fi = NULL;
  GtTimer *t = NULL;
  GtRegionMapping *rm = NULL;
  GtArray *arr = gt_array_new(sizeof (GtFeatureNode*));
  SpeccheckArguments *arguments = tool_arguments;

  int had_err = 0;
  gt_error_check(err);

  res = gt_spec_results_new();
  gt_assert(res);
  t = gt_timer_new();
  gt_assert(t);

  spec_visitor = gt_spec_visitor_new(gt_str_get(arguments->specfile), res,
                                     err);
  if (!spec_visitor)
    return -1;

  /* add region mapping if given */
  if (gt_seqid2file_option_used(arguments->s2fi)) {
    rm = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
    if (!rm)
      had_err = -1;
    if (!had_err)
      gt_spec_visitor_add_region_mapping((GtSpecVisitor*) spec_visitor, rm);
  }

  /* set runtime error behaviour */
  if (arguments->fail_hard)
    gt_spec_visitor_fail_on_runtime_error((GtSpecVisitor*) spec_visitor);
  else
    gt_spec_visitor_report_runtime_errors((GtSpecVisitor*) spec_visitor);

  /* redirect warnings */
  gt_warning_set_handler(gt_speck_record_warning, res);

  last_stream = gff3_in_stream = gt_gff3_in_stream_new_unsorted(
                                                            argc - parsed_args,
                                                            argv + parsed_args);
  gt_assert(gff3_in_stream);
  gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream*) gff3_in_stream);

  /* insert sort stream if requested */
  if (arguments->sort) {
    last_stream = sort_stream = gt_sort_stream_new(last_stream);
  }

  /* if -provideindex is given, collect input features and index them first */
  if (arguments->provideindex) {
    fi = gt_feature_index_memory_new();
    gt_assert(fi);

    last_stream = feature_stream = gt_feature_stream_new(last_stream, fi);
    gt_assert(feature_stream);

    last_stream = a_out_stream = gt_array_out_stream_all_new(last_stream, arr,
                                                             err);
    if (!a_out_stream)
      had_err = -1;

    gt_timer_start(t);

    if (!had_err)
      had_err = gt_node_stream_pull(last_stream, err);

    if (!had_err) {
      gt_spec_visitor_add_feature_index((GtSpecVisitor*) spec_visitor,
                                        gt_feature_index_ref(fi));
      last_stream = a_in_stream = gt_array_in_stream_new(arr, NULL, err);
      if (!a_in_stream)
        had_err = -1;
    }
  } else {
    gt_timer_start(t);
  }

  if (!had_err) {
    checker_stream = gt_visitor_stream_new(last_stream, spec_visitor);
    gt_assert(checker_stream);
  }

  /* perform checking  */
  if (!had_err)
    had_err = gt_node_stream_pull(checker_stream, err);

  gt_timer_stop(t);

  /* reset warnings output */
  gt_warning_set_handler(gt_warning_default_handler, NULL);

  /* output results */
  if (!had_err)
    gt_spec_results_report(res, arguments->outfp,
                           gt_str_get(arguments->specfile),
                           arguments->verbose, arguments->colored,
                           !arguments->allexpects);

  if (!had_err)
    gt_timer_show_formatted(t, "Finished in " GT_WD ".%06ld s.\n", stderr);

  /* free */
  gt_node_stream_delete(gff3_in_stream);
  gt_node_stream_delete(a_in_stream);
  gt_node_stream_delete(a_out_stream);
  gt_node_stream_delete(checker_stream);
  gt_node_stream_delete(feature_stream);
  gt_node_stream_delete(sort_stream);
  gt_spec_results_delete(res);
  gt_feature_index_delete(fi);
  gt_timer_delete(t);
  gt_array_delete(arr);

  return had_err;
}
static int gt_compreads_decompress_runner(GT_UNUSED int argc,
                                    GT_UNUSED const char **argv,
                                    GT_UNUSED int parsed_args,
                                    void *tool_arguments, GtError *err)
{
  GtCsrHcrDecodeArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlphabet *alpha = NULL;
  GtHcrDecoder *hcrd = NULL;
  GtTimer *timer = NULL;
  unsigned long start,
                end;

  gt_error_check(err);
  gt_assert(arguments);

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("start");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (gt_str_length(arguments->smap) > 0) {
    alpha = gt_alphabet_new_from_file_no_suffix(gt_str_get(arguments->smap),
                                                err);
    if (!alpha)
      had_err = -1;
  }
  else {
    alpha = gt_alphabet_new_dna();
    if (!alpha)
      had_err = -1;
  }

  if (!had_err) {
    if (timer != NULL)
      gt_timer_show_progress(timer, "decoding", stdout);

    if (gt_str_length(arguments->name) == 0) {
      char *basenameptr;
      basenameptr = gt_basename(gt_str_get(arguments->file));
      gt_str_set(arguments->name, basenameptr);
      gt_free(basenameptr);
    }
    hcrd = gt_hcr_decoder_new(gt_str_get(arguments->file), alpha,
                              arguments->descs, timer, err);
    if (hcrd == NULL)
      had_err = -1;
    else {
      if (arguments->bench != 0) {
        had_err = gt_compreads_decompress_benchmark(hcrd,
                                                    arguments->bench,
                                                    timer, err);
      }
      else {
        if (arguments->rng.start != GT_UNDEF_ULONG
            && arguments->rng.end != GT_UNDEF_ULONG) {
          if (arguments->rng.start >= gt_hcr_decoder_num_of_reads(hcrd)
                || arguments->rng.end >= gt_hcr_decoder_num_of_reads(hcrd)) {
            gt_error_set(err, "range %lu-%lu includes a read number exceeding "
                              "the total number of reads (%lu)",
                              arguments->rng.start,
                              arguments->rng.end,
                              gt_hcr_decoder_num_of_reads(hcrd));
            had_err = -1;
          }
          start = arguments->rng.start;
          end = arguments->rng.end;
        }
        else {
          start = 0;
          end = gt_hcr_decoder_num_of_reads(hcrd) - 1;
        }
        if (!had_err) {
          gt_log_log("filebasename: %s", gt_str_get(arguments->name));
          if (gt_hcr_decoder_decode_range(hcrd, gt_str_get(arguments->name),
                                          start, end, timer, err)
            != 0)
            had_err = -1;
        }
      }
    }
    gt_hcr_decoder_delete(hcrd);
  }
  gt_alphabet_delete(alpha);
  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  if (had_err)
    gt_assert(gt_error_is_set(err));
  return had_err;
}
static int gt_readjoiner_assembly_runner(GT_UNUSED int argc,
    GT_UNUSED const char **argv, GT_UNUSED int parsed_args,
    void *tool_arguments, GtError *err)
{
  GtReadjoinerAssemblyArguments *arguments = tool_arguments;
  GtLogger *verbose_logger, *default_logger;
  GtEncseqLoader *el;
  GtEncseq *reads;
  GtTimer *timer = NULL;
  GtStrgraph *strgraph = NULL;
  GtBitsequence *contained = NULL;
  const char *readset = gt_str_get(arguments->readset);
  bool eqlen = true;
  GtUword nreads, tlen, rlen;
  int had_err = 0;

  gt_assert(arguments);
  gt_error_check(err);
  default_logger = gt_logger_new(!arguments->quiet, GT_LOGGER_DEFLT_PREFIX,
      stdout);
  gt_logger_log(default_logger,
      "gt readjoiner assembly (version "GT_READJOINER_VERSION")");
  verbose_logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX,
      stdout);
  gt_logger_log(verbose_logger, "verbose output activated");
  gt_logger_log(verbose_logger, "readset name = %s", readset);
  if (gt_showtime_enabled())
  {
    timer = gt_timer_new_with_progress_description(
        GT_READJOINER_ASSEMBLY_MSG_COUNTSPM);
    gt_timer_start(timer);
    gt_timer_show_cpu_time_by_progress(timer);
  }

  if (!arguments->paths2seq)
  {
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    reads = gt_encseq_loader_load(el, readset, err);
    if (reads == NULL)
    {
      had_err = -1;
    }
    if (had_err == 0)
    {
      eqlen = gt_encseq_accesstype_get(reads) == GT_ACCESS_TYPE_EQUALLENGTH;
      nreads = gt_encseq_num_of_sequences(reads);
      gt_logger_log(default_logger, "number of reads in filtered readset = "
                    GT_WU, nreads);
      tlen = gt_encseq_total_length(reads) - nreads + 1;
      gt_logger_log(verbose_logger, "total length of filtered readset = " GT_WU,
          tlen);

      if (eqlen)
      {
        rlen = gt_encseq_seqlength(reads, 0);
        gt_logger_log(verbose_logger, "read length = " GT_WU, rlen);
        gt_encseq_delete(reads);
        reads = NULL;
      }
      else
      {
        had_err = gt_readjoiner_assembly_build_contained_reads_list(
          arguments, &contained, err);
        rlen = 0;
        gt_logger_log(verbose_logger, "read length = variable");
        gt_assert(reads != NULL);
      }
    }

    if (had_err == 0)
    {
      if (!arguments->load)
      {
        had_err = gt_readjoiner_assembly_build_graph(arguments, &strgraph,
            reads, readset, eqlen, rlen, nreads, contained, default_logger,
            verbose_logger, timer, err);
      }
      else
      {
        gt_readjoiner_assembly_load_graph(&strgraph, reads, readset, rlen,
            default_logger, timer);
      }
    }

    if (!eqlen && reads != NULL && !arguments->errors)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
        gt_strgraph_set_encseq(strgraph, NULL);
    }

    if (had_err == 0 && arguments->redtrans)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_REDTRANS,
            stdout);
      gt_strgraph_sort_edges_by_len(strgraph, false);
      (void)gt_strgraph_redtrans(strgraph, false);
      (void)gt_strgraph_redself(strgraph, false);
      (void)gt_strgraph_redwithrc(strgraph, false);
      gt_strgraph_log_stats(strgraph, verbose_logger);
    }

    if (had_err == 0 && arguments->errors)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_CLEANSG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_CLEANSG);
      had_err = gt_readjoiner_assembly_error_correction(strgraph,
          arguments->bubble, arguments->deadend, arguments->deadend_depth,
          verbose_logger);
    }

    if (had_err == 0 && arguments->save)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_SAVESG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_SAVESG);
      gt_strgraph_show(strgraph, GT_STRGRAPH_BIN,
          gt_str_get(arguments->readset), GT_READJOINER_SUFFIX_SG, false);
    }

    if (!eqlen && reads != NULL)
    {
      gt_encseq_delete(reads);
      reads = NULL;
      if (had_err == 0)
        gt_strgraph_set_encseq(strgraph, NULL);
    }

    if (had_err == 0)
    {
      if (gt_showtime_enabled())
        gt_timer_show_progress(timer, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG,
            stdout);
      gt_logger_log(default_logger, GT_READJOINER_ASSEMBLY_MSG_TRAVERSESG);
      gt_readjoiner_assembly_show_current_space("(before traversal)");
      gt_strgraph_spell(strgraph, (GtUword)arguments->depthcutoff,
          (GtUword)arguments->lengthcutoff, arguments->vd, readset,
          GT_READJOINER_SUFFIX_CONTIG_PATHS, NULL, true,
          arguments->show_contigs_info, false, verbose_logger);
    }

    if (contained != NULL)
      gt_free(contained);
    gt_strgraph_delete(strgraph);
    strgraph = NULL;
    gt_assert(reads == NULL);
    gt_encseq_loader_delete(el);
  }

  if (had_err == 0)
  {
    gt_readjoiner_assembly_show_current_space("(before paths2seq)");
    had_err = gt_readjoiner_assembly_paths2seq(readset,
        (GtUword)arguments->lengthcutoff, arguments->vd,
        arguments->astat, arguments->coverage, arguments->copynum,
        arguments->buffersize, default_logger, &timer, err);
  }

  if (gt_showtime_enabled())
  {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(default_logger);
  gt_logger_delete(verbose_logger);
  return had_err;
}
Пример #8
0
static int gt_linspace_align_runner(GT_UNUSED int argc,
                                    GT_UNUSED const char **argv,
                                    GT_UNUSED int parsed_args,
                                    void *tool_arguments,
                                    GtError *err)
{
  GtLinspaceArguments *arguments = tool_arguments;
  int had_err = 0;
  GtAlignment *align;
  GtWord left_dist = 0, right_dist = 0;
  GtSequenceTable *sequence_table1, *sequence_table2;
  GtLinspaceManagement *spacemanager;
  GtScoreHandler *scorehandler = NULL;
  GtTimer *linspacetimer = NULL;
  GtAlphabet *alphabet = NULL;

  gt_error_check(err);
  gt_assert(arguments);
  sequence_table1 = gt_sequence_table_new();
  sequence_table2 = gt_sequence_table_new();
  align = gt_alignment_new();
  spacemanager = gt_linspace_management_new();
  gt_linspace_management_set_TSfactor(spacemanager,arguments->timesquarefactor);

  /* get sequences */
  if (gt_str_array_size(arguments->strings) > 0)
  {
    get_onesequence(sequence_table1, arguments->strings, 0);
    sequence_table1->size++;
    get_onesequence(sequence_table2, arguments->strings, 1);
    sequence_table2->size++;
  }
  else if (gt_str_array_size(arguments->files) > 0)
  {
    had_err = get_fastasequences(sequence_table1,
                                 gt_str_array_get_str(arguments->files,0),err);
    if (!had_err)
    {
      had_err = get_fastasequences(sequence_table2,
                                  gt_str_array_get_str(arguments->files,1),err);
    }
  }
  if (arguments->dna)
  {
    alphabet = gt_alphabet_new_dna();
  } else
  {
    gt_assert(arguments->protein);
    alphabet = gt_alphabet_new_protein();
  }
  gt_encode_sequence_table(alphabet,sequence_table1);
  gt_encode_sequence_table(alphabet,sequence_table2);
  if (!had_err)
  {
    scorehandler = gt_arguments2scorehandler(arguments,err);
    if (scorehandler == NULL)
    {
      had_err = -1;
    } else
    {
      if (arguments->global && arguments->protein && !arguments->has_costmatrix)
      {
        GtScoreHandler *costhandler = gt_scorehandler2costhandler(scorehandler);
        gt_scorehandler_delete(scorehandler);
        scorehandler = costhandler;
      }
    }
  }
  /* get diagonal band */
  if (!had_err && arguments->diagonal)
  {
    if (gt_str_array_size(arguments->diagonalbonds) > 0)
    {
      had_err = gt_parse_score_value(__LINE__,&left_dist,
                                  gt_str_array_get(arguments->diagonalbonds,0),
                                  false, err);
      if (!had_err)
      {
        had_err = gt_parse_score_value(__LINE__,&right_dist,
                                  gt_str_array_get(arguments->diagonalbonds,1),
                                  false, err);
      }
    }
  }
  if (!had_err && arguments->spacetime)
  {
    linspacetimer = gt_timer_new();
  }

  /* alignment functions with linear gap costs */
  if (!had_err)
  {
    bool affine;

    if (gt_str_array_size(arguments->linearcosts) > 0)
    {
      affine = false;
    } else
    {
      gt_assert(gt_str_array_size(arguments->affinecosts) > 0);
      affine = true;
    }
    had_err = gt_all_against_all_alignment_check (
                            affine, align, arguments,
                            spacemanager,
                            scorehandler,
                            gt_alphabet_characters(alphabet),
                            gt_alphabet_wildcard_show(alphabet),
                            sequence_table1,
                            sequence_table2,
                            left_dist,
                            right_dist,
                            linspacetimer,err);
  }
  /*spacetime option*/
  if (!had_err && arguments->spacetime)
  {
    printf("# combined space peak in kilobytes: %f\n",
           GT_KILOBYTES(gt_linspace_management_get_spacepeak(spacemanager)));
    gt_timer_show_formatted(linspacetimer,"# TIME overall " GT_WD ".%02ld\n",
                            stdout);
  }
  gt_timer_delete(linspacetimer);
  gt_linspace_management_delete(spacemanager);
  gt_sequence_table_delete(sequence_table1);
  gt_sequence_table_delete(sequence_table2);
  gt_alignment_delete(align);
  gt_alphabet_delete(alphabet);
  gt_scorehandler_delete(scorehandler);
  return had_err;
}
Пример #9
0
static int gt_encseq2spm_runner(GT_UNUSED int argc,
                                GT_UNUSED const char **argv,
                                GT_UNUSED int parsed_args,
                                void *tool_arguments,
                                GtError *err)
{
    GtEncseq2spmArguments *arguments = tool_arguments;
    GtEncseqLoader *el = NULL;
    GtEncseq *encseq = NULL;
    bool haserr = false;

    gt_error_check(err);
    gt_assert(arguments);
    el = gt_encseq_loader_new();
    gt_encseq_loader_drop_description_support(el);
    gt_encseq_loader_disable_autosupport(el);
    encseq = gt_encseq_loader_load(el, gt_str_get(arguments->encseqinput),
                                   err);
    if (encseq == NULL)
    {
        haserr = true;
    }
    if (!haserr)
    {
        if (arguments->singlestrand)
        {
            gt_error_set(err,"option -singlestand is not implemented");
            haserr = true;
        } else
        {
            if (gt_encseq_mirror(encseq, err) != 0)
            {
                haserr = true;
            }
        }
    }

    if (!haserr && arguments->singlescan > 0)
    {
        GtTimer *timer = NULL;

        if (gt_showtime_enabled())
        {
            char *outmsg;

            switch (arguments->singlescan)
            {
            case 1:
                outmsg = "to run fast scanning";
                break;
            case 2:
                outmsg = "to run fast scanning with check";
                break;
            case 3:
                outmsg = "to run fast scanning with output";
                break;
            case 4:
                outmsg = "to run old scanning code";
                break;
            default:
                gt_error_set(err,"argument %u to option -singlescan not allowed",
                             arguments->singlescan);
                haserr = true;
            }
            if (!haserr)
            {
                timer = gt_timer_new_with_progress_description(outmsg);
                gt_timer_start(timer);
            }
        }
        if (!haserr)
        {
            unsigned int kmersize = 0;
            haserr = gt_encseq2spm_kmersize(arguments, &kmersize, err);
            if (!haserr)
            {
                if (arguments->singlescan == 4U)
                {
                    gt_rungetencseqkmers(encseq,kmersize);
                } else
                {
                    if (arguments->singlescan > 0)
                    {
                        gt_firstcode_runkmerscan(encseq,arguments->singlescan - 1,kmersize,
                                                 arguments->minmatchlength);
                    }
                }
            }
        }
        if (timer != NULL)
        {
            gt_timer_show_progress_final(timer, stdout);
            gt_timer_delete(timer);
        }
    }
    if (!haserr && arguments->singlescan == 0)
    {
        GtLogger *logger;
        const GtReadmode readmode = GT_READMODE_FORWARD;
        GtBUstate_spmsk **spmsk_states = NULL;
        unsigned int kmersize, threadcount;

#ifdef GT_THREADS_ENABLED
        const unsigned int threads = gt_jobs;
#else
        const unsigned int threads = 1U;
#endif

        if (arguments->countspms || arguments->outputspms)
        {
            spmsk_states = gt_malloc(sizeof (*spmsk_states) * threads);
            for (threadcount = 0; threadcount < threads; threadcount++)
            {
                spmsk_states[threadcount]
                    = gt_spmsk_inl_new(encseq,
                                       readmode,
                                       (unsigned long) arguments->minmatchlength,
                                       arguments->countspms,
                                       arguments->outputspms,
                                       gt_str_get(arguments->encseqinput));
            }
        }
        logger = gt_logger_new(arguments->verbose,GT_LOGGER_DEFLT_PREFIX, stdout);
        haserr = gt_encseq2spm_kmersize(arguments, &kmersize, err);
        if (!haserr)
        {
            if (storefirstcodes_getencseqkmers_twobitencoding(encseq,
                    kmersize,
                    arguments->numofparts,
                    arguments->maximumspace,
                    arguments->minmatchlength,
                    /* use false */  arguments->checksuftab,
                    /* use false */  arguments->onlyaccum,
                    /* use false */  arguments->
                    onlyallfirstcodes,
                    /* use 5U */     arguments->
                    addbscache_depth,
                    /* specify the extra space needed for
                       the function processing the interval */
                    arguments->phase2extra,
                    /* use true */   arguments->radixlarge ?
                    false : true,
                    /* use 2 without threads and
                       use 1 with threads */
                    arguments->radixparts,
                    spmsk_states != NULL
                    ? gt_spmsk_inl_process
                    : NULL,
                    gt_spmsk_inl_process_end,
                    spmsk_states,
                    logger,
                    err) != 0)
            {
                haserr = true;
            }
        }
        if (spmsk_states != NULL)
        {
            unsigned long countmatches = 0;

            for (threadcount = 0; threadcount < threads; threadcount++)
            {
                countmatches += gt_spmsk_inl_delete(spmsk_states[threadcount]);
            }
            if (arguments->countspms)
            {
                printf("number of suffix-prefix matches=%lu\n",countmatches);
            }
            gt_free(spmsk_states);
        }
        gt_logger_delete(logger);
    }
    gt_encseq_delete(encseq);
    gt_encseq_loader_delete(el);
    return haserr ? -1 : 0;
}
static int gt_kmer_database_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args, void *tool_arguments,
                                   GtError *err)
{
  GtKmerDatabaseArguments *arguments = tool_arguments;
  int had_err = 0;
  GtEncseq       *es;
  GtUword        es_length,
                 nu_kmer_codes = 0;
  GtKmerDatabase *compare_db = NULL,
                 *db = NULL;
  GtLogger *logger;
  FILE *fp = NULL;
  GtHashmap *kmer_hash = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->use_hash)
    kmer_hash = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                               (GtFree) gt_kmer_database_delete_hash_value);
  if (arguments->bench)
    timer = gt_timer_new_with_progress_description("loading encoded sequence");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  if (arguments->verbose && gt_str_length(arguments->print_filename) > 0UL) {
    fp = gt_fa_fopen(gt_str_get(arguments->print_filename), "w", err);
    gt_logger_set_target(logger, fp);
  }

  if (!had_err) {
    GtEncseqLoader *es_l;
    if (arguments->bench)
      gt_timer_start(timer);
    es_l = gt_encseq_loader_new();
    es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->bench)
      gt_timer_show_progress(timer, "saving kmers (+iterating over file)",
                             stdout);
    if (es == NULL) {
      had_err = -1;
    }
    gt_encseq_loader_delete(es_l);
  }
  if (!had_err) {
    es_length = gt_encseq_total_length(es);
    if (es_length < (GtUword) arguments->kmersize) {
      gt_error_set(err, "Input is too short for used kmersize. File length: "
                   GT_WU " kmersize: %u", es_length, arguments->kmersize);
      had_err = -1;
    }
  }
  if (!had_err) {
    GtAlphabet *alphabet;
    alphabet = gt_encseq_alphabet(es);
    if (arguments->bench)
    nu_kmer_codes = gt_power_for_small_exponents(
                                            gt_alphabet_num_of_chars(alphabet),
                                            arguments->kmersize);
    if (!arguments->merge_only && !arguments->use_hash && !arguments->bench) {
      compare_db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize, arguments->sb_size, es);
    }
    if (!arguments->use_hash) {
      db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize,
                                arguments->sb_size, es);
      if (arguments->cutoff) {
        if (arguments->mean_cutoff)
          gt_kmer_database_use_mean_cutoff(db, (GtUword) 2,
                                           arguments->cutoff_value);
        else
          gt_kmer_database_set_cutoff(db, arguments->cutoff_value);
        if (!arguments->prune)
          gt_kmer_database_set_prune(db);
      }
    }
  }

  if (!had_err) {
    GtUword startpos = 0,
            endpos;
    GtKmercodeiterator *iter;
    const GtKmercode *kmercode = NULL;
    iter = gt_kmercodeiterator_encseq_new(es, GT_READMODE_FORWARD,
                                          arguments->kmersize, 0);
    while (!had_err && startpos < es_length - (arguments->kmersize - 1)) {
      GtUword startpos_add_kmer = startpos;
      if (arguments->merge_only) {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max((arguments->sb_size - 1) * 2));
        if (endpos > es_length)
          endpos = es_length;
      }
      else {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max(arguments->sb_size - 1));
      }
      gt_kmercodeiterator_reset(iter, GT_READMODE_FORWARD, startpos);
      while ((kmercode = gt_kmercodeiterator_encseq_next(iter)) != NULL &&
             startpos_add_kmer <= endpos - (arguments->kmersize - 1)) {
        if (!arguments->merge_only && !arguments->use_hash &&
            !kmercode->definedspecialposition && !arguments->bench) {
          gt_kmer_database_add_kmer(compare_db, kmercode->code,
                                    startpos_add_kmer);
        }
        if (arguments->use_hash && !kmercode->definedspecialposition) {
          gt_kmer_database_add_to_hash(kmer_hash, kmercode->code,
                                       startpos_add_kmer);
        }
        startpos_add_kmer++;
      }
      if (!arguments->use_hash) {
        gt_kmer_database_add_interval(db, startpos, endpos);
        gt_kmer_database_print_buffer(db, logger);
        if (!arguments->bench)
          had_err = gt_kmer_database_check_consistency(db, err);
      }
      startpos = endpos + 1;
    }
    if (!arguments->use_hash) {
      gt_kmer_database_flush(db);
      gt_kmer_database_print_buffer(db, logger);
      if (!had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(db, err);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(compare_db, err);
      if (!arguments->merge_only && !arguments->bench)
        gt_kmer_database_print(compare_db, logger, true);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_compare(compare_db, db, err);
      gt_kmer_database_print(db, logger, true);
    }
    gt_kmercodeiterator_delete(iter);
  }

  if (arguments->bench) {
    GtKmerStartpos pos;
    GtArrayGtUword *pos_hash;
    GtUword rand_access = (GtUword) 50000000,
            rand_code,
            i,
            sum = 0;
    gt_timer_show_progress(timer, "random access", stdout);
    for (i = 0; i < rand_access; i++) {
      rand_code = gt_rand_max(nu_kmer_codes - 1);
      if (arguments->use_hash) {
        pos_hash = gt_hashmap_get(kmer_hash, (const void *) rand_code);
        if (pos_hash != NULL)
          sum += pos_hash->spaceGtUword[pos_hash->nextfreeGtUword - 1];
      }
      else {
        pos = gt_kmer_database_get_startpos(db, rand_code);
        if (pos.no_positions > 0)
          sum += pos.startpos[pos.no_positions - 1];
      }
    }
    printf("sum: " GT_WU "\n", sum);

    gt_timer_show_progress(timer, "", stdout);
    gt_timer_stop(timer);
    gt_timer_delete(timer);
  }
  if (arguments->use_hash)
    gt_hashmap_delete(kmer_hash);
  gt_encseq_delete(es);
  if (!arguments->use_hash)
    gt_kmer_database_delete(db);
  if (!arguments->merge_only && !arguments->bench)
    gt_kmer_database_delete(compare_db);
  gt_logger_delete(logger);
  gt_fa_fclose(fp);

  return had_err;
}
Пример #11
0
static int gt_speck_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  GtNodeStream *gff3_in_stream = NULL, *checker_stream = NULL,
               *a_in_stream = NULL, *a_out_stream = NULL,
               *feature_stream = NULL, *sort_stream = NULL,
               *last_stream = NULL;
  GtNodeVisitor *spec_visitor = NULL;
  GtSpecResults *res = NULL;
  GtFeatureIndex *fi = NULL;
  GtTypeChecker *type_checker = NULL;
  GtTimer *t = NULL;
  GtRegionMapping *rm = NULL;
  GtArray *arr = gt_array_new(sizeof (GtFeatureNode*));
  GtStr *prog, *speclib;
  SpeccheckArguments *arguments = tool_arguments;

  int had_err = 0;
  gt_error_check(err);

  res = gt_spec_results_new();
  gt_assert(res);

  if (gt_file_exists(gt_str_get(arguments->format))) {
    speclib = gt_str_ref(arguments->format);
  } else {
    prog = gt_str_new();
    gt_str_append_cstr_nt(prog, gt_error_get_progname(err),
                    gt_cstr_length_up_to_char(gt_error_get_progname(err), ' '));
    speclib = gt_get_gtdata_path(gt_str_get(prog), NULL);
    gt_str_delete(prog);
    gt_str_append_cstr(speclib, "/spec/output_drivers/");
    gt_str_append_str(speclib, arguments->format);

    if (!gt_file_exists(gt_str_get(speclib))) {
      gt_error_set(err, "output driver file \"%s\" does not exist",
                   gt_str_get(speclib));
      had_err = -1;
    }
  }

  if (!had_err) {
    spec_visitor = gt_spec_visitor_new(gt_str_get(arguments->specfile), res,
                                       err);
    if (!spec_visitor) {
      gt_spec_results_delete(res);
      return -1;
    }
  }

  t = gt_timer_new();
  gt_assert(t);

  /* add region mapping if given */
  if (!had_err && gt_seqid2file_option_used(arguments->s2fi)) {
    rm = gt_seqid2file_region_mapping_new(arguments->s2fi, err);
    if (!rm)
      had_err = -1;
    if (!had_err)
      gt_spec_visitor_add_region_mapping((GtSpecVisitor*) spec_visitor, rm);
  }

  /* set type checker if necessary */
  if (!had_err && gt_typecheck_info_option_used(arguments->tci)) {
    type_checker = gt_typecheck_info_create_type_checker(arguments->tci, err);
    if (!type_checker)
      had_err = -1;
    if (!had_err)
      gt_spec_visitor_add_type_checker((GtSpecVisitor*) spec_visitor,
                                       type_checker);
  }

  if (!had_err) {
    /* set runtime error behaviour */
    if (arguments->fail_hard)
      gt_spec_visitor_fail_on_runtime_error((GtSpecVisitor*) spec_visitor);
    else
      gt_spec_visitor_report_runtime_errors((GtSpecVisitor*) spec_visitor);

    /* redirect warnings */
    gt_warning_set_handler(gt_speck_record_warning, res);

    last_stream = gff3_in_stream = gt_gff3_in_stream_new_unsorted(
                                                            argc - parsed_args,
                                                            argv + parsed_args);
    gt_assert(gff3_in_stream);
    gt_gff3_in_stream_enable_tidy_mode((GtGFF3InStream*) gff3_in_stream);

    /* insert sort stream if requested */
    if (arguments->sort) {
      last_stream = sort_stream = gt_sort_stream_new(last_stream);
    }

    /* if -provideindex is given, collect input features and index them first */
    if (arguments->provideindex) {
      fi = gt_feature_index_memory_new();
      gt_assert(fi);

      last_stream = feature_stream = gt_feature_stream_new(last_stream, fi);
      gt_assert(feature_stream);

      last_stream = a_out_stream = gt_array_out_stream_all_new(last_stream, arr,
                                                               err);
      if (!a_out_stream)
        had_err = -1;

      gt_timer_start(t);

      if (!had_err)
        had_err = gt_node_stream_pull(last_stream, err);

      if (!had_err) {
        gt_spec_visitor_add_feature_index((GtSpecVisitor*) spec_visitor,
                                          gt_feature_index_ref(fi));
        last_stream = a_in_stream = gt_array_in_stream_new(arr, NULL, err);
        if (!a_in_stream)
          had_err = -1;
      }
    } else {
      gt_timer_start(t);
    }

    if (!had_err) {
      checker_stream = gt_visitor_stream_new(last_stream, spec_visitor);
      gt_assert(checker_stream);
    }

    /* perform checking  */
    if (!had_err)
      had_err = gt_node_stream_pull(checker_stream, err);

    gt_timer_stop(t);

    /* reset warnings output */
    gt_warning_set_handler(gt_warning_default_handler, NULL);

    /* output results */
    if (!had_err) {
      GtStr *runtime = gt_str_new();
      gt_timer_get_formatted(t, GT_WD ".%06ld", runtime);
      had_err = gt_spec_results_render_template(res, gt_str_get(speclib),
                                                arguments->outfp,
                                                gt_str_get(arguments->specfile),
                                                arguments->verbose,
                                                arguments->colored,
                                                gt_str_get(runtime), err);
      gt_str_delete(runtime);
    }
  }

  /* free */
  gt_node_stream_delete(gff3_in_stream);
  gt_node_stream_delete(a_in_stream);
  gt_node_stream_delete(a_out_stream);
  gt_node_stream_delete(checker_stream);
  gt_node_stream_delete(feature_stream);
  gt_node_stream_delete(sort_stream);
  gt_spec_results_delete(res);
  gt_feature_index_delete(fi);
  gt_type_checker_delete(type_checker);
  gt_timer_delete(t);
  gt_array_delete(arr);
  gt_str_delete(speclib);

  return had_err;
}
Пример #12
0
static int gt_gdiffcalc_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtGenomediffArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtUword lcounter = 0, zcounter = 0;
  double **shusums = NULL;
  GtEncseq              *encseq = NULL;
  GtLogger              *logger;
  GtShuUnitFileInfo     *unit_info = NULL;
  GtTimer               *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose,
                         GT_LOGGER_DEFLT_PREFIX,
                         stdout);
  gt_assert(logger);

  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(arguments->filenames, argv[i]);
  }

  if (gt_showtime_enabled()) {
    timer = gt_timer_new_with_progress_description("load encseq");
    gt_timer_start(timer);
    gt_assert(timer);
  }

  if (arguments->with_units) {
    gt_logger_log(logger, "unitfile option set, filename is %s\n",
                  gt_str_get(arguments->unitfile));
  }

  if (!had_err) {
    GtEncseqLoader *el = gt_encseq_loader_new_from_options(arguments->loadopts,
                                                           err);
    encseq =
      gt_encseq_loader_load(el, gt_str_get(arguments->indexname), err);
    gt_encseq_loader_delete(el);
  }
  if (encseq == NULL)
    had_err = -1;

  if (timer != NULL)
    gt_timer_show_progress(timer, "load units", stdout);

  if (!had_err) {
    unit_info = gt_shu_unit_info_new(encseq);
    if (arguments->with_units)
      had_err = gt_shu_unit_file_info_read(arguments->unitfile, unit_info,
                                           logger, err);
  }

  if (timer != NULL)
    gt_timer_show_progress(timer, "read table", stdout);

  if (!had_err) {
    GtIO *table_file = NULL;
    GtTokenizer *tokenizer = NULL;
    GtStr *line = NULL;

    gt_assert(unit_info != NULL);
    gt_array2dim_calloc(shusums, unit_info->num_of_genomes,
                        unit_info->num_of_genomes);

    table_file = gt_io_new(gt_str_array_get(arguments->filenames, 0), "r");
    tokenizer = gt_tokenizer_new(table_file);
    line = gt_tokenizer_get_token(tokenizer);
    while (line != NULL && !had_err) {
      char *cline = gt_str_get(line);
      char *elem = strtok(cline, ";");
      zcounter = 0;
      while (elem != NULL && !had_err) {
        if (*elem != '#') {
          if (1 != sscanf(elem, "%lf",
                          &shusums[lcounter][zcounter])) {
            had_err = 1;
            gt_error_set(err, "couldn't scan");
            break;
          }
          gt_logger_log(logger,"wert: %lf", shusums[lcounter][zcounter]);
          zcounter++;
        }
        else {
          gt_logger_log(logger, "name: %s", elem++);
        }
        elem = strtok(NULL, ";");
      }
      gt_tokenizer_next_token(tokenizer);
      gt_str_delete(line);
      line = gt_tokenizer_get_token(tokenizer);
      lcounter++;
      gt_logger_log(logger, "line "GT_WD"", lcounter);
    }
  }
  if (!had_err) {
    GtUword num_of_seq, file_idx, seq_idx, startpos;
    GT_UNUSED GtUword oldpos = 0;

    gt_assert(unit_info != NULL);
    gt_assert(lcounter == zcounter);
    gt_assert(lcounter == unit_info->num_of_genomes);

    num_of_seq = gt_encseq_num_of_sequences(unit_info->encseq);

    for (seq_idx = 0; seq_idx < num_of_seq; seq_idx++) {
      startpos = gt_encseq_seqstartpos(unit_info->encseq, seq_idx);
      file_idx = gt_encseq_filenum(unit_info->encseq, startpos);
      gt_log_log("seq: "GT_WU" starts at: "GT_WU"\n"
                 "belonges to file: "GT_WU" which is part of genome: %s",
                 seq_idx, startpos, file_idx,
                 gt_str_array_get(unit_info->genome_names,
                                  unit_info->map_files[file_idx]));
      gt_assert(oldpos <= startpos);
      oldpos = startpos;
    }
  }
  if (!had_err && shusums != NULL) {
    had_err = gt_genomediff_calculate_div_from_avg(shusums, arguments,
                                                   unit_info,
                                                   logger, timer, err);
    gt_array2dim_delete(shusums);
  }

  if (timer != NULL) {
    gt_timer_show_progress_final(timer, stdout);
    gt_timer_delete(timer);
  }
  gt_logger_delete(logger);
  gt_encseq_delete(encseq);
  gt_shu_unit_info_delete(unit_info);
  return had_err;
}
static int gt_condenser_search_runner(GT_UNUSED int argc,
                                      GT_UNUSED const char **argv,
                                      GT_UNUSED int parsed_args,
                                      void *tool_arguments,
                                      GtError *err)
{
  GtCondenserSearchArguments *arguments = tool_arguments;
  int i, had_err = 0;
  char *querypath = gt_str_get(arguments->querypath);
  GtStr* coarse_fname = gt_str_new_cstr("coarse_");
  char *db_basename = NULL;
  char *suffix_ptr = NULL;
  GtTimer *timer = NULL;
  GtLogger *logger = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  db_basename = gt_basename(gt_str_get(arguments->dbpath));
  /* if first char is '.' this might be a hidden file */
  if (strlen(db_basename) > (size_t) 1 &&
      (suffix_ptr = strrchr(db_basename + 1, '.')) != NULL) {
    /* remove suffix */
    *suffix_ptr = '\0';
  }
  gt_str_append_cstr(coarse_fname, db_basename);
  gt_str_append_cstr(coarse_fname, ".fas");
  gt_free(db_basename);
  db_basename = NULL;
  suffix_ptr = NULL;

  if (arguments->blastn || arguments->blastp) {
    GtMatch              *match;
    GtMatchIterator      *mp = NULL;
    GtNREncseq           *nrencseq = NULL;
    GtStr                *fastaname = gt_str_clone(arguments->dbpath);
    HitPosition          *hits;
    double                eval,
                          raw_eval = 0.0;
    GtUword               coarse_db_len = 0;
    GtMatchIteratorStatus status;
    int                   curr_hits = 0,
                          max_hits = 100;

    hits = gt_malloc(sizeof (*hits) * (size_t) max_hits);

    gt_str_append_cstr(fastaname, ".fas");

    for (i=0; i < max_hits; i++) {
      hits[i].range = gt_malloc(sizeof (*hits[i].range) * (size_t) 1);
    }

    if (gt_showtime_enabled()) {
      timer = gt_timer_new_with_progress_description("initialization");
      gt_timer_start(timer);
    }

    /*extract sequences from compressed database*/
    if (!had_err) {
      nrencseq = gt_n_r_encseq_new_from_file(gt_str_get(arguments->dbpath),
                                             logger, err);
      if (nrencseq == NULL)
        had_err = -1;
    }
    if (!had_err) {
      if (arguments->ceval == GT_UNDEF_DOUBLE ||
          arguments->feval == GT_UNDEF_DOUBLE) {
        /* from NCBI BLAST tutorial:
           E = Kmne^{-lambdaS}
           calculates E-value for score S with natural scale parameters K for
           search space size and lambda for the scoring system
           E = mn2^-S'
           m being the subject (total) length, n the length of ONE query
           calculates E-value for bit-score S'
         */
        GtFastaReader *reader;
        GtCondenserSearchAvg avg = {0,0};
        reader = gt_fasta_reader_rec_new(arguments->querypath);
        had_err = gt_fasta_reader_run(reader, NULL, NULL,
                                      gt_condenser_search_cum_moving_avg,
                                      &avg,
                                      err);
        if (!had_err) {
          GtUword S = arguments->bitscore;
          gt_log_log(GT_WU " queries, avg query size: " GT_WU,
                     avg.count, avg.avg);
          raw_eval = 1/pow(2.0, (double) S) * avg.avg;
          gt_logger_log(logger, "Raw E-value set to %.4e", raw_eval);
          gt_assert(avg.avg != 0);
        }
        gt_fasta_reader_delete(reader);
      }
    }

    /*create BLAST database from compressed database fasta file*/
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create coarse BLAST db", stderr);
      if (arguments->blastn)
        had_err = gt_condenser_search_create_nucl_blastdb(gt_str_get(fastaname),
                                                          err);
      else
        had_err = gt_condenser_search_create_prot_blastdb(gt_str_get(fastaname),
                                                          err);
    }

    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "coarse BLAST run", stderr);

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();
      gt_blast_process_call_set_db(call, gt_str_get(fastaname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, arguments->ceval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      while (!had_err &&
             (status = gt_match_iterator_next(mp, &match, err)) !=
             GT_MATCHER_STATUS_END)
      {
        if (status == GT_MATCHER_STATUS_OK) {
          GtUword hit_seq_id;
          char string[7];
          const char *dbseqid = gt_match_get_seqid2(match);
          if (sscanf(dbseqid,"%6s" GT_WU, string, &hit_seq_id) == 2) {
            gt_match_get_range_seq2(match, hits[curr_hits].range);
            hits[curr_hits].idx = hit_seq_id;
            gt_match_delete(match);
            curr_hits++;
            if (curr_hits == max_hits) {
              HitPosition *hit_extention;
              max_hits += 100;
              hits = gt_realloc(hits, sizeof (*hit_extention) * max_hits);
              for (i=max_hits - 100; i < max_hits; i++) {
                hits[i].range = gt_malloc(sizeof (*hits[i].range));
              }
            }
          } else {
            gt_error_set(err, "could not parse unique db header %s", dbseqid);
            had_err = -1;
          }
        } else if (status == GT_MATCHER_STATUS_ERROR) {
          had_err = -1;
        }
      }
      gt_match_iterator_delete(mp);
    }
    /*extract sequences*/
    if (!had_err) {
      GtNREncseqDecompressor *decomp;
      GtFile *coarse_hits;
      if (timer != NULL)
        gt_timer_show_progress(timer, "extract coarse search hits", stderr);
      decomp = gt_n_r_encseq_decompressor_new(nrencseq);
      coarse_hits = gt_file_new(gt_str_get(coarse_fname),"w", err);
      /* TODO DW do NOT extract complete uniques! these could be complete
         chromosomes!! just extract something around it? maybe +- max query
         length*/
      for (i = 0; i < curr_hits; i++) {
        gt_n_r_encseq_decompressor_add_unique_idx_to_extract(decomp,
                                                             hits[i].idx);
      }
      had_err =
        gt_n_r_encseq_decompressor_start_unique_extraction(coarse_hits,
                                                           decomp,
                                                           &coarse_db_len,
                                                           err);
      gt_assert(coarse_db_len != 0);
      gt_file_delete(coarse_hits);
      gt_n_r_encseq_decompressor_delete(decomp);
    }
    gt_n_r_encseq_delete(nrencseq);

    /* create BLAST database from decompressed database file */
    if (!had_err) {
      if (timer != NULL)
        gt_timer_show_progress(timer, "create fine BLAST db", stderr);
      if (arguments->blastn)
        had_err =
          gt_condenser_search_create_nucl_blastdb(gt_str_get(coarse_fname),
                                                  err);
      else
        had_err =
          gt_condenser_search_create_prot_blastdb(gt_str_get(coarse_fname),
                                                  err);
    }
    /* perform fine BLAST search */
    if (!had_err) {
      GtBlastProcessCall *call;

      if (timer != NULL)
        gt_timer_show_progress(timer, "fine BLAST run", stderr);

      if (arguments->feval == GT_UNDEF_DOUBLE) {
        eval = raw_eval * coarse_db_len;
      } else {
        eval = arguments->feval;
      }

      if (arguments->blastp)
        call = gt_blast_process_call_new_prot();
      else
        call = gt_blast_process_call_new_nucl();

      gt_blast_process_call_set_db(call, gt_str_get(coarse_fname));
      gt_blast_process_call_set_query(call, querypath);
      gt_blast_process_call_set_evalue(call, eval);
      gt_blast_process_call_set_num_threads(call, arguments->blthreads);

      gt_logger_log(logger, "Fine E-value set to: %.4e (len)" GT_WU, eval,
                    coarse_db_len);

      mp = gt_match_iterator_blast_process_new(call, err);
      if (!mp)
        had_err = -1;

      gt_blast_process_call_delete(call);

      if (!had_err) {
        GtUword numofhits = 0;
        while (!had_err &&
               (status = gt_match_iterator_next(mp, &match, err)) !=
               GT_MATCHER_STATUS_END) {
          if (status == GT_MATCHER_STATUS_OK) {
            GtMatchBlast *matchb = (GtMatchBlast*) match;
            char *dbseqid = gt_malloc(sizeof (*dbseqid) * 50);
            GtRange range_seq1;
            GtRange range_seq2;
            numofhits++;
            gt_match_get_range_seq1(match, &range_seq1);
            gt_match_get_range_seq2(match, &range_seq2);
            gt_file_xprintf(
                    arguments->outfp,
                    "%s\t%s\t%.2f\t" GT_WU "\t" GT_WU "\t" GT_WU "\t" GT_WU "\t"
                    GT_WU "\t%g\t%.3f\n",
                    gt_match_get_seqid1(match),
                    gt_match_get_seqid2(match),
                    gt_match_blast_get_similarity(matchb),
                    gt_match_blast_get_align_length(matchb),
                    range_seq1.start,
                    range_seq1.end,
                    range_seq2.start,
                    range_seq2.end,
                    gt_match_blast_get_evalue(matchb),
                    (double) gt_match_blast_get_bitscore(matchb));
            gt_match_delete(match);
            gt_free(dbseqid);
          } else if (status == GT_MATCHER_STATUS_ERROR) {
            had_err = -1;
          }
        }
        gt_log_log(GT_WU " hits found\n", numofhits);
      }
      gt_match_iterator_delete(mp);

    }
    if (!had_err)
      if (timer != NULL)
        gt_timer_show_progress_final(timer, stderr);
    gt_timer_delete(timer);

    /*cleanup*/
    for (i=0; i < max_hits; i++) {
      gt_free(hits[i].range);
    }
    gt_free(hits);
    gt_str_delete(fastaname);
  }
  gt_str_delete(coarse_fname);
  gt_logger_delete(logger);
  return had_err;
}