Пример #1
0
/* Open the file whose name is <indexname> followed by <suffix>, using the
   given <mode>. The stream delivered by gt_fa_fopen() is returned unchanged
   (the annotation marks it as possibly NULL); <err> is passed on to
   gt_fa_fopen(). */
/*@null@*/ FILE *opensfxfile(const GtStr *indexname,
                             const char *suffix,
                             const char *mode,
                             GtError *err)
{
  FILE *stream;
  GtStr *path;

  gt_error_check(err);
  /* build the file name in a private copy, <indexname> itself is const */
  path = gt_str_clone(indexname);
  gt_str_append_cstr(path, suffix);
  stream = gt_fa_fopen(gt_str_get(path), mode, err);
  /* the name is only needed for opening the file */
  gt_str_delete(path);
  return stream;
}
Пример #2
0
/* Parse the contained reads list stored in <filename>. The first byte of the
   file selects the parser (BIN, BIT or ASCII); for the ASCII format that byte
   is pushed back before the parser is called.
   Returns -1 if the file cannot be opened, 1 (with <err> set) if the file is
   empty or its first byte is not a known header, and otherwise the return
   value of the format specific parser. */
int gt_cntlist_parse(const char *filename, bool alloc_cntlist,
    GtBitsequence **cntlist, GtUword *nofreads, GtError *err)
{
  FILE *input;
  int first_byte, status;

  gt_log_log("parse contained reads list file: %s", filename);
  input = gt_fa_fopen(filename, "rb", err);
  if (input == NULL)
    return -1;

  first_byte = gt_xfgetc(input);
  if (first_byte == EOF)
  {
    gt_error_set(err, "%s: unexpected end of file", filename);
    status = 1;
  }
  else if (first_byte == GT_CNTLIST_BIN_HEADER)
  {
    gt_log_log("contained reads list format: BIN");
    status = gt_cntlist_parse_bin(input, alloc_cntlist, cntlist, nofreads,
        err);
  }
  else if (first_byte == GT_CNTLIST_BIT_HEADER)
  {
    gt_log_log("contained reads list format: BIT");
    status = gt_cntlist_parse_bit(input, alloc_cntlist, cntlist, nofreads,
        err);
  }
  else if (first_byte == GT_CNTLIST_ASCII_HEADER)
  {
    /* the ASCII parser gets to see the header byte again */
    gt_xungetc(first_byte, input);
    gt_log_log("contained reads list format: ASCII");
    status = gt_cntlist_parse_ascii(input, alloc_cntlist, cntlist, nofreads,
        err);
  }
  else
  {
    gt_error_set(err, "%s: unrecognized format", filename);
    status = 1;
  }

  gt_fa_fclose(input);
  return status;
}
Пример #3
0
/* Create a new GtFile object of kind <file_mode>.
   If <path> is not NULL, the file is opened with the opening function that
   matches <file_mode> (uncompressed, gzip or bzip2), using <mode>. If opening
   fails, the partially built object is released and NULL is returned; <err>
   has been passed to the opening function.
   If <path> is NULL, <file_mode> must be GT_FILE_MODE_UNCOMPRESSED and the
   returned object wraps stdin.
   The reference count of the new object is initialised to 0. */
GtFile* gt_file_open(GtFileMode file_mode, const char *path, const char *mode,
                     GtError *err)
{
  GtFile *file;
  gt_error_check(err);
  gt_assert(mode);
  file = gt_calloc(1, sizeof (GtFile));
  file->mode = file_mode;
  file->reference_count = 0;
  if (path) {
    switch (file_mode) {
      case GT_FILE_MODE_UNCOMPRESSED:
        file->fileptr.file = gt_fa_fopen(path, mode, err);
        if (!file->fileptr.file) {
          gt_file_delete_without_handle(file);
          return NULL;
        }
        break;
      case GT_FILE_MODE_GZIP:
        file->fileptr.gzfile = gt_fa_gzopen(path, mode, err);
        if (!file->fileptr.gzfile) {
          gt_file_delete_without_handle(file);
          return NULL;
        }
        break;
      case GT_FILE_MODE_BZIP2:
        file->fileptr.bzfile = gt_fa_bzopen(path, mode, err);
        if (!file->fileptr.bzfile) {
          gt_file_delete_without_handle(file);
          return NULL;
        }
        /* remember how the file was opened; presumably this is needed to
           reopen the bzip2 stream later -- confirm against the users of
           orig_path/orig_mode */
        file->orig_path = gt_cstr_dup(path);
        /* fixed: this used to store a second copy of <path> */
        file->orig_mode = gt_cstr_dup(mode);
        break;
      default: gt_assert(0);
    }
  }
  else {
    /* no path given: fall back to standard input */
    gt_assert(file_mode == GT_FILE_MODE_UNCOMPRESSED);
    file->fileptr.file = stdin;
    file->is_stdin = true;
  }
  return file;
}
Пример #4
0
/* Output the contained reads list <cntlist> covering <nofreads> reads.
   The list is written to the file <path>, or to stdout if <path> is NULL.
   If <binary> is true the file is opened in "wb" mode and written with
   gt_cntlist_show_bit(), otherwise it is opened in "w" mode and written with
   gt_cntlist_show_ascii().
   Returns -1 if <path> cannot be opened and 0 otherwise. */
int gt_cntlist_show(GtBitsequence *cntlist, GtUword nofreads,
    const char *path, bool binary, GtError *err)
{
  const bool to_stdout = (path == NULL);
  FILE *out = stdout;

  gt_assert(cntlist != NULL);
  if (!to_stdout)
  {
    out = gt_fa_fopen(path, binary ? "wb" : "w", err);
    if (out == NULL)
      return -1;
  }
  gt_assert(out != NULL);

  if (binary)
    gt_cntlist_show_bit(cntlist, nofreads, out);
  else
    gt_cntlist_show_ascii(cntlist, nofreads, out);

  /* stdout is not ours to close */
  if (!to_stdout)
    gt_fa_fclose(out);
  return 0;
}
Пример #5
0
/* Runner of the condenseq compress tool.
   <tool_arguments> points to a GtCondenseqCompressArguments structure and
   argv[parsed_args] names the encoded sequence to load.
   Steps: set up the loggers (the kmer database logger writes to the file
   "kmer_db.out" if the kdb option is set), derive the index name from the
   input name if none was given, load the encoded sequence, fill in undefined
   size parameters, validate them, and finally configure and run a
   GtCondenseqCreator.
   Returns 0 on success and a non-zero value on error. */
static int gt_condenseq_compress_runner(GT_UNUSED int argc, const char **argv,
                                        int parsed_args, void *tool_arguments,
                                        GtError *err)
{
  GtCondenseqCompressArguments *arguments = tool_arguments;
  GtLogger *logger,
           *kdb_logger;
  FILE *kmer_fp = NULL;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);
  kdb_logger = gt_logger_new(arguments->kdb, GT_LOGGER_DEFLT_PREFIX, stderr);
  if (arguments->kdb) {
    kmer_fp = gt_fa_fopen("kmer_db.out", "w", err);
    /* do not continue with a NULL log target and an already set <err> */
    if (kmer_fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(kdb_logger, kmer_fp);
  }

  if (!had_err && gt_str_length(arguments->indexname) == 0UL) {
    /* no index name given: use the basename of the input */
    char *basenameptr;
    basenameptr = gt_basename(argv[parsed_args]);
    gt_str_set(arguments->indexname, basenameptr);
    gt_free(basenameptr);
  }

  if (!had_err) {
    GtEncseqLoader *es_l = gt_encseq_loader_new();
    arguments->input_es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->input_es == NULL)
      had_err = -1;
    gt_encseq_loader_delete(es_l);
  }

  if (!had_err) {
    /* first pass: derive undefined values top-down
       (initsize -> minalignlength -> windowsize) */
    if (arguments->minalignlength == GT_UNDEF_UWORD)
      arguments->minalignlength = arguments->initsize != GT_UNDEF_UWORD ?
                                  arguments->initsize / (GtUword) 3UL :
                                  GT_UNDEF_UWORD;
    if (arguments->windowsize == GT_UNDEF_UINT)
      arguments->windowsize = arguments->minalignlength != GT_UNDEF_UWORD ?
                              (unsigned int) (arguments->minalignlength / 5U) :
                              GT_UNDEF_UINT;
    if (arguments->windowsize < 4U)
      arguments->windowsize = 4U;
    if (arguments->kmersize == GT_UNDEF_UINT) {
      unsigned int size =
        gt_alphabet_num_of_chars(gt_encseq_alphabet(arguments->input_es));
      /* size^k ~= 100000 */
      gt_safe_assign(arguments->kmersize,
                     gt_round_to_long(gt_log_base(100000.0, (double) size)));
      gt_logger_log(logger, "|A|: %u, k: %u",
                    size, arguments->kmersize);
    }

    /* second pass: derive what is still undefined bottom-up
       (kmersize -> windowsize -> minalignlength -> initsize) */
    if (arguments->windowsize == GT_UNDEF_UINT) {
      arguments->windowsize = 5U * arguments->kmersize;
    }
    if (arguments->minalignlength == GT_UNDEF_UWORD) {
      arguments->minalignlength = (GtUword) (3UL * arguments->windowsize);
    }
    if (arguments->initsize == GT_UNDEF_UWORD) {
      arguments->initsize = (GtUword) (3UL * arguments->minalignlength);
    }
  }
  /* validation: kmersize < windowsize <= minalignlength <= initsize */
  if (!had_err &&
      arguments->windowsize <= arguments->kmersize) {
    gt_error_set(err, "-windowsize (%u) must be larger -kmersize (%u)!",
                 arguments->windowsize, arguments->kmersize);
    had_err = -1;
  }
  if (!had_err &&
      arguments->minalignlength < (GtUword) arguments->windowsize) {
    gt_error_set(err, "-alignlength (" GT_WU ") must be at least "
                 "-windowsize (%u)!", arguments->minalignlength,
                 arguments->windowsize);
    had_err = -1;
  }
  if (!had_err && (arguments->initsize < arguments->minalignlength)) {
    gt_error_set(err, "-initsize (" GT_WU ") must be at least "
                 "-alignlength (" GT_WU ")!", arguments->initsize,
                 arguments->minalignlength);
    had_err = -1;
  }

  if (!had_err) {
    GtCondenseqCreator *ces_c;

    ces_c = gt_condenseq_creator_new(arguments->initsize,
                                     arguments->minalignlength,
                                     arguments->xdrop,
                                     &(arguments->scores),
                                     arguments->kmersize,
                                     arguments->windowsize,
                                     logger,
                                     err);
    if (ces_c == NULL)
      had_err = -1;

    if (!had_err) {
      /* cutoff: undefined -> mean cutoff, 0 -> no cutoff, else fixed value */
      if (arguments->cutoff_value == GT_UNDEF_UWORD)
        gt_condenseq_creator_use_mean_cutoff(ces_c);
      else if (arguments->cutoff_value == 0)
        gt_condenseq_creator_disable_cutoff(ces_c);
      else
        gt_condenseq_creator_set_cutoff(ces_c, arguments->cutoff_value);
      gt_condenseq_creator_set_mean_fraction(ces_c, arguments->fraction);
      /* NOTE(review): a set prune argument disables pruning here -- confirm
         that this matches the meaning of the command line option */
      if (arguments->prune)
        gt_condenseq_creator_disable_prune(ces_c);
      if (arguments->brute)
        gt_condenseq_creator_enable_brute_force(ces_c);
      if (!arguments->diags)
        gt_condenseq_creator_disable_diagonals(ces_c);
      if (arguments->full_diags)
        gt_condenseq_creator_enable_full_diagonals(ces_c);
      if (arguments->clean_percent != GT_UNDEF_UINT)
        gt_condenseq_creator_set_diags_clean_limit(ces_c,
                                                   arguments->clean_percent);

      had_err = gt_condenseq_creator_create(ces_c,
                                            arguments->indexname,
                                            arguments->input_es,
                                            logger, kdb_logger, err);

      gt_condenseq_creator_delete(ces_c);
    }
  }

  gt_logger_delete(logger);
  gt_logger_delete(kdb_logger);
  /* close the log file only if it was actually opened */
  if (kmer_fp != NULL)
    gt_fa_fclose(kmer_fp);
  return had_err;
}
Пример #6
0
/* Read the file <fileofkeystoextract> line by line. From each line a key is
   extracted and looked up in <keytab> (holding <numofkeys> keys of length
   <keysize>). If the lookup delivers a sequence number smaller than
   <numofkeys>, the corresponding sequence of <encseq> is written to stdout in
   FASTA format with <linewidth> as line width; otherwise the key is counted
   as missing. If there was no error and keys were missing, their number is
   reported on stdout.
   Returns 0 on success and -1 on error; <linewidth> 0 is an error. */
static int itersearchoverallkeys(const GtEncseq *encseq,
                                 const char *keytab,
                                 unsigned long numofkeys,
                                 unsigned long keysize,
                                 const GtStr *fileofkeystoextract,
                                 unsigned long linewidth,
                                 GtError *err)
{
  FILE *keyfile;
  GtStr *line;
  uint64_t lineno = 0;
  unsigned long hit, missing = 0;
  bool haserr = false;
  Fastakeyquery query;

  if (linewidth == 0)
  {
    gt_error_set(err,"use option width to specify line width for formatting");
    return -1;
  }
  keyfile = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (keyfile == NULL)
  {
    return -1;
  }
  line = gt_str_new();
  /* one key buffer of keysize+1 bytes is reused for all lines */
  query.fastakey = gt_malloc(sizeof (char) * (keysize+1));
  while (gt_str_read_next_line(line, keyfile) != EOF)
  {
    if (extractkeyfromcurrentline(&query,
                                  keysize,
                                  line,
                                  lineno,
                                  fileofkeystoextract,
                                  err) != 0)
    {
      haserr = true;
      break;
    }
    hit = searchfastaqueryindes(query.fastakey,keytab,numofkeys,keysize);
    if (hit >= numofkeys)
    {
      /* key is not in the table */
      missing++;
    } else if (giextract_encodedseq2fasta(stdout,
                                          encseq,
                                          hit,
                                          &query,
                                          linewidth,
                                          err) != 0)
    {
      haserr = true;
      break;
    }
    gt_str_reset(line);
    lineno++;
  }
  if (!haserr && missing > 0)
  {
    printf("# number of unsatified fastakey-queries: %lu\n",missing);
  }
  gt_str_delete(line);
  gt_fa_fclose(keyfile);
  gt_free(query.fastakey);
  if (haserr)
  {
    return -1;
  }
  return 0;
}
Пример #7
0
/* Read all keys from the file <fileofkeystoextract> (one per line) into a
   newly allocated array of Fastakeyquery values. The array is sorted with
   comparefastakeys and passed through remdupsfastakeyqueries(); the number of
   entries that function reports is stored in <*numofqueries>.
   If <verbose> is true, progress messages are written to stdout.
   Returns the array on success. Returns NULL if the file has no lines, cannot
   be opened, or a line cannot be parsed. */
static Fastakeyquery *readfileofkeystoextract(bool verbose,
                                              unsigned long *numofqueries,
                                              const GtStr *fileofkeystoextract,
                                              GtError *err)
{
  FILE *fp;
  GtStr *currentline;
  bool haserr = false;
  uint64_t linenum;
  Fastakeyquery *fastakeyqueries;
#undef SKDEBUG
#ifdef SKDEBUG
  unsigned long i;
#endif

  gt_error_check(err);
  /* the number of lines determines the size of the query array */
  *numofqueries = gt_file_number_of_lines(gt_str_get(fileofkeystoextract));
  if (*numofqueries == 0)
  {
    gt_error_set(err,"empty file \"%s\" not allowed",
                 gt_str_get(fileofkeystoextract));
    return NULL;
  }
  fp = gt_fa_fopen(gt_str_get(fileofkeystoextract),"r",err);
  if (fp == NULL)
  {
    return NULL;
  }
  if (verbose)
  {
    printf("# opened keyfile \"%s\"\n",gt_str_get(fileofkeystoextract));
  }
  /* Zero-initialise the array: if parsing stops early, all *numofqueries
     entries are handed to fastakeyqueries_delete() below, including those
     which were never filled in. With gt_malloc() these entries would be
     indeterminate. */
  fastakeyqueries = gt_calloc((size_t) *numofqueries,
                              sizeof (*fastakeyqueries));
  currentline = gt_str_new();
  /* NOTE(review): the loop relies on gt_str_read_next_line() delivering at
     most *numofqueries lines -- confirm that both functions count lines in
     the same way */
  for (linenum = 0; gt_str_read_next_line(currentline, fp) != EOF; linenum++)
  {
    if (extractkeyfromcurrentline(fastakeyqueries + linenum,
                                  0,
                                  currentline,
                                  linenum,
                                  fileofkeystoextract,
                                  err) != 0)
    {
      haserr = true;
      break;
    }
    gt_str_reset(currentline);
  }
  gt_str_delete(currentline);
  gt_fa_fclose(fp);
  if (haserr)
  {
    fastakeyqueries_delete(fastakeyqueries,*numofqueries);
    return NULL;
  }
  qsort(fastakeyqueries,(size_t) *numofqueries,sizeof (*fastakeyqueries),
        comparefastakeys);
  if (verbose)
  {
    printf("# %lu fastakey-queries successfully parsed and sorted\n",
            *numofqueries);
  }
  *numofqueries = remdupsfastakeyqueries(fastakeyqueries,*numofqueries,verbose);
#ifdef SKDEBUG
  for (i=0; i<*numofqueries; i++)
  {
    printf("%lu %s\n",i,fastakeyqueries[i].fastakey);
  }
#endif
  return fastakeyqueries;
}
/* Runner of the kmer database tool.
   <tool_arguments> points to a GtKmerDatabaseArguments structure and
   argv[parsed_args] names the encoded sequence to load.
   The sequence is processed in consecutive intervals of random length. For
   each interval the kmers are either added to a hash (use_hash) or the
   interval is added to a GtKmerDatabase. Unless merge_only, use_hash or bench
   is set, a second database is filled kmer by kmer and compared with the
   first one. With bench set, timings are reported and a random access
   benchmark is run afterwards.
   Returns 0 on success and a non-zero value on error. */
static int gt_kmer_database_runner(GT_UNUSED int argc, const char **argv,
                                   int parsed_args, void *tool_arguments,
                                   GtError *err)
{
  GtKmerDatabaseArguments *arguments = tool_arguments;
  int had_err = 0;
  /* initialised because the cleanup code is also reached if the sequence
     was never loaded */
  GtEncseq       *es = NULL;
  GtUword        es_length = 0,
                 nu_kmer_codes = 0;
  GtKmerDatabase *compare_db = NULL,
                 *db = NULL;
  GtLogger *logger;
  FILE *fp = NULL;
  GtHashmap *kmer_hash = NULL;
  GtTimer *timer = NULL;

  gt_error_check(err);
  gt_assert(arguments);

  if (arguments->use_hash)
    kmer_hash = gt_hashmap_new(GT_HASH_DIRECT, NULL,
                               (GtFree) gt_kmer_database_delete_hash_value);
  if (arguments->bench)
    timer = gt_timer_new_with_progress_description("loading encoded sequence");

  logger = gt_logger_new(arguments->verbose, GT_LOGGER_DEFLT_PREFIX, stderr);

  if (arguments->verbose && gt_str_length(arguments->print_filename) > 0UL) {
    fp = gt_fa_fopen(gt_str_get(arguments->print_filename), "w", err);
    /* do not continue with a NULL log target and an already set <err> */
    if (fp == NULL)
      had_err = -1;
    else
      gt_logger_set_target(logger, fp);
  }

  if (!had_err) {
    GtEncseqLoader *es_l;
    if (arguments->bench)
      gt_timer_start(timer);
    es_l = gt_encseq_loader_new();
    es = gt_encseq_loader_load(es_l, argv[parsed_args], err);
    if (arguments->bench)
      gt_timer_show_progress(timer, "saving kmers (+iterating over file)",
                             stdout);
    if (es == NULL) {
      had_err = -1;
    }
    gt_encseq_loader_delete(es_l);
  }
  if (!had_err) {
    es_length = gt_encseq_total_length(es);
    if (es_length < (GtUword) arguments->kmersize) {
      gt_error_set(err, "Input is too short for used kmersize. File length: "
                   GT_WU " kmersize: %u", es_length, arguments->kmersize);
      had_err = -1;
    }
  }
  if (!had_err) {
    GtAlphabet *alphabet;
    alphabet = gt_encseq_alphabet(es);
    /* the number of possible kmer codes is only needed for the benchmark */
    if (arguments->bench)
      nu_kmer_codes = gt_power_for_small_exponents(
                                            gt_alphabet_num_of_chars(alphabet),
                                            arguments->kmersize);
    if (!arguments->merge_only && !arguments->use_hash && !arguments->bench) {
      compare_db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize, arguments->sb_size, es);
    }
    if (!arguments->use_hash) {
      db = gt_kmer_database_new(gt_alphabet_num_of_chars(alphabet),
                                arguments->kmersize,
                                arguments->sb_size, es);
      if (arguments->cutoff) {
        if (arguments->mean_cutoff)
          gt_kmer_database_use_mean_cutoff(db, (GtUword) 2,
                                           arguments->cutoff_value);
        else
          gt_kmer_database_set_cutoff(db, arguments->cutoff_value);
        /* NOTE(review): pruning is switched on if the prune argument is NOT
           set -- confirm that this matches the command line option */
        if (!arguments->prune)
          gt_kmer_database_set_prune(db);
      }
    }
  }

  if (!had_err) {
    GtUword startpos = 0,
            endpos;
    GtKmercodeiterator *iter;
    const GtKmercode *kmercode = NULL;
    iter = gt_kmercodeiterator_encseq_new(es, GT_READMODE_FORWARD,
                                          arguments->kmersize, 0);
    /* es_length >= kmersize was checked above, so this does not wrap */
    while (!had_err && startpos < es_length - (arguments->kmersize - 1)) {
      GtUword startpos_add_kmer = startpos;
      /* choose a random interval end; only the merge_only variant clamps it
         to the sequence length */
      if (arguments->merge_only) {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max((arguments->sb_size - 1) * 2));
        if (endpos > es_length)
          endpos = es_length;
      }
      else {
        endpos = startpos + (arguments->kmersize - 1) +
                 (gt_rand_max(arguments->sb_size - 1));
      }
      gt_kmercodeiterator_reset(iter, GT_READMODE_FORWARD, startpos);
      while ((kmercode = gt_kmercodeiterator_encseq_next(iter)) != NULL &&
             startpos_add_kmer <= endpos - (arguments->kmersize - 1)) {
        if (!arguments->merge_only && !arguments->use_hash &&
            !kmercode->definedspecialposition && !arguments->bench) {
          gt_kmer_database_add_kmer(compare_db, kmercode->code,
                                    startpos_add_kmer);
        }
        if (arguments->use_hash && !kmercode->definedspecialposition) {
          gt_kmer_database_add_to_hash(kmer_hash, kmercode->code,
                                       startpos_add_kmer);
        }
        startpos_add_kmer++;
      }
      if (!arguments->use_hash) {
        gt_kmer_database_add_interval(db, startpos, endpos);
        gt_kmer_database_print_buffer(db, logger);
        if (!arguments->bench)
          had_err = gt_kmer_database_check_consistency(db, err);
      }
      startpos = endpos + 1;
    }
    if (!arguments->use_hash) {
      gt_kmer_database_flush(db);
      gt_kmer_database_print_buffer(db, logger);
      if (!had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(db, err);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_check_consistency(compare_db, err);
      if (!arguments->merge_only && !arguments->bench)
        gt_kmer_database_print(compare_db, logger, true);
      if (!arguments->merge_only && !had_err && !arguments->bench)
        had_err = gt_kmer_database_compare(compare_db, db, err);
      gt_kmer_database_print(db, logger, true);
    }
    gt_kmercodeiterator_delete(iter);
  }

  if (arguments->bench) {
    /* The benchmark needs the structures built above. After an error <db>
       may be NULL and <nu_kmer_codes> may be 0 (so nu_kmer_codes - 1 would
       wrap around), hence it is skipped in that case. */
    if (!had_err) {
      GtKmerStartpos pos;
      GtArrayGtUword *pos_hash;
      GtUword rand_access = (GtUword) 50000000,
              rand_code,
              i,
              sum = 0;
      gt_timer_show_progress(timer, "random access", stdout);
      for (i = 0; i < rand_access; i++) {
        rand_code = gt_rand_max(nu_kmer_codes - 1);
        if (arguments->use_hash) {
          pos_hash = gt_hashmap_get(kmer_hash, (const void *) rand_code);
          if (pos_hash != NULL)
            sum += pos_hash->spaceGtUword[pos_hash->nextfreeGtUword - 1];
        }
        else {
          pos = gt_kmer_database_get_startpos(db, rand_code);
          if (pos.no_positions > 0)
            sum += pos.startpos[pos.no_positions - 1];
        }
      }
      /* the sum is printed, so the accesses above have a visible result */
      printf("sum: " GT_WU "\n", sum);

      gt_timer_show_progress(timer, "", stdout);
      gt_timer_stop(timer);
    }
    gt_timer_delete(timer);
  }
  if (arguments->use_hash)
    gt_hashmap_delete(kmer_hash);
  gt_encseq_delete(es);
  if (!arguments->use_hash)
    gt_kmer_database_delete(db);
  if (!arguments->merge_only && !arguments->bench)
    gt_kmer_database_delete(compare_db);
  gt_logger_delete(logger);
  /* fp is NULL if no print file was requested or opening it failed */
  if (fp != NULL)
    gt_fa_fclose(fp);

  return had_err;
}