Пример #1
0
static int show_entry(GtStr *description, GtStr *sequence, bool translate,
                      GtUword width, GtFile *outfp)
{
  int had_err = 0;
  if (translate) {
    GtTranslatorStatus status;
    unsigned int frame;
    char translated;
    GtStr *protein = gt_str_new();

    GtCodonIterator *ci = gt_codon_iterator_simple_new(gt_str_get(sequence),
                                                       gt_str_length(sequence),
                                                       NULL);
    GtTranslator* tr = gt_translator_new(ci);
    status = gt_translator_next(tr, &translated, &frame, NULL);
    while (status == GT_TRANSLATOR_OK) {
      if (frame == 0)
        gt_str_append_char(protein, translated);
      status = gt_translator_next(tr, &translated, &frame, NULL);
    }
    if (status == GT_TRANSLATOR_ERROR)
      had_err = -1;
    gt_fasta_show_entry(gt_str_get(description), gt_str_get(protein),
                        gt_str_length(protein), width, outfp);
    gt_str_delete(protein);
    gt_translator_delete(tr);
    gt_codon_iterator_delete(ci);
  }
  else {
    gt_fasta_show_entry(gt_str_get(description), gt_str_get(sequence),
                        gt_str_length(sequence), width, outfp);
  }
  return had_err;
}
Пример #2
0
static int gt_seqmutate_runner(int argc, const char **argv, int parsed_args,
                            void *tool_arguments, GtError *err)
{
  MutateArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  unsigned long i;
  GtBioseq *bioseq;
  GtSeq *mutated_seq;
  int had_err;

  gt_error_check(err);
  gt_assert(arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) && bioseq) {
    for (i = 0; i < gt_bioseq_number_of_sequences(bioseq); i++) {
      mutated_seq = gt_mutate_seq(gt_bioseq_get_description(bioseq, i),
                                  gt_bioseq_get_sequence(bioseq, i),
                                  gt_bioseq_get_sequence_length(bioseq, i),
                                  gt_bioseq_get_alphabet(bioseq),
                                  arguments->rate);
      gt_fasta_show_entry(gt_seq_get_description(mutated_seq),
                          gt_seq_get_orig(mutated_seq),
                          gt_seq_length(mutated_seq),
                          arguments->width, arguments->outfp);
      gt_seq_delete(mutated_seq);
    }
    gt_bioseq_delete(bioseq);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
Пример #3
0
static int gff3_visitor_sequence_node(GtNodeVisitor *nv, GtSequenceNode *sn,
                                      GT_UNUSED GtError *err)
{
  GtGFF3Visitor *gff3_visitor;
  gt_error_check(err);
  gff3_visitor = gff3_visitor_cast(nv);
  gt_assert(nv && sn);
  gff3_version_string(nv);
  if (!gff3_visitor->fasta_directive_shown) {
    if (!gff3_visitor->outstr)
      gt_file_xprintf(gff3_visitor->outfp, "%s\n", GT_GFF_FASTA_DIRECTIVE);
    else {
      gt_str_append_cstr(gff3_visitor->outstr, GT_GFF_FASTA_DIRECTIVE);
      gt_str_append_char(gff3_visitor->outstr, '\n');
    }
    gff3_visitor->fasta_directive_shown = true;
  }
  if (!gff3_visitor->outstr) {
    gt_fasta_show_entry(gt_sequence_node_get_description(sn),
                        gt_sequence_node_get_sequence(sn),
                        gt_sequence_node_get_sequence_length(sn),
                        gff3_visitor->fasta_width, gff3_visitor->outfp);
  } else {
    gt_fasta_show_entry_str(gt_sequence_node_get_description(sn),
                            gt_sequence_node_get_sequence(sn),
                            gt_sequence_node_get_sequence_length(sn),
                            gff3_visitor->fasta_width, gff3_visitor->outstr);
  }
  return 0;
}
Пример #4
0
static bool show_target(GT_UNUSED unsigned long pos, void *data)
{
  TargetInfo *ti = data;
  gt_assert(ti);
  gt_fasta_show_entry(gt_bioseq_get_description(ti->bioseq, ti->seqnum),
                      gt_bioseq_get_sequence(ti->bioseq, ti->seqnum),
                      gt_bioseq_get_sequence_length(ti->bioseq, ti->seqnum), 0);
  return true;
}
Пример #5
0
void gt_bioseq_show_as_fasta(GtBioseq *bs, GtUword width, GtFile *outfp)
{
  GtUword i;

  gt_assert(bs);

  for (i = 0; i < gt_bioseq_number_of_sequences(bs); i++) {
    char *seq = gt_bioseq_get_sequence(bs, i);
    gt_fasta_show_entry(gt_bioseq_get_description(bs, i),
                        seq,
                        gt_bioseq_get_sequence_length(bs, i), width, outfp);
    gt_free(seq);
  }
}
Пример #6
0
void gt_bioseq_show_sequence_as_fasta(GtBioseq *bs, GtUword seqnum,
                                      GtUword width, GtFile *outfp)
{
  char *seq = NULL;
  gt_assert(bs);
  gt_assert(seqnum < gt_bioseq_number_of_sequences(bs));
  seq = gt_bioseq_get_sequence(bs, seqnum);

  gt_fasta_show_entry(gt_bioseq_get_description(bs, seqnum),
                      seq,
                      gt_bioseq_get_sequence_length(bs, seqnum), width, outfp);

  gt_free(seq);
}
Пример #7
0
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  unsigned long i;
  unsigned long long passed = 0, filtered = 0, num_of_sequences = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) && bioseq) {
    for (i = 0; i < gt_bioseq_number_of_sequences(bioseq); i++) {
      if ((arguments->minlength == GT_UNDEF_ULONG ||
           gt_bioseq_get_sequence_length(bioseq, i) >= arguments->minlength) &&
          (arguments->maxlength == GT_UNDEF_ULONG ||
           gt_bioseq_get_sequence_length(bioseq, i) <= arguments->maxlength) &&
          (arguments->maxseqnum == GT_UNDEF_ULONG ||
           passed + 1 <= arguments->maxseqnum)) {
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i),
                            gt_bioseq_get_sequence(bioseq, i),
                            gt_bioseq_get_sequence_length(bioseq, i),
                            arguments->width, arguments->outfp);
        passed++;
      }
      else
        filtered++;
      num_of_sequences++;
    }
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    gt_assert(passed + filtered == num_of_sequences);
    fprintf(stderr, "# %llu out of %llu sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences,
            ((double) filtered / num_of_sequences) * 100.0);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
Пример #8
0
static int gt_seqtranslate_do_translation(GtTranslateArguments *arguments,
                                       const char *sequence,
                                       GtUword length,
                                       const char *desc,
                                       GtStr **translations,
                                       bool rev,
                                       GtError *err)
{
  GtTranslator *tr;
  GT_UNUSED GtTranslatorStatus trst;
  GtCodonIterator *ci;
  char translated;
  int had_err = 0;
  GtStr *str;
  unsigned int frame,
               i;

  ci = gt_codon_iterator_simple_new(sequence, length, err);
  tr = gt_translator_new(ci);
  trst = gt_translator_next(tr, &translated, &frame, err);
  while (trst == GT_TRANSLATOR_OK) {
    gt_str_append_char(translations[frame], translated);
    trst = gt_translator_next(tr, &translated, &frame, err);
  }
  gt_codon_iterator_delete(ci);
  gt_translator_delete(tr);
  if (trst == GT_TRANSLATOR_ERROR)
    return -1;
  str = gt_str_new();
  for (i = 0; i < 3; i++) {
    if (gt_str_length(translations[i]) > 0) {
      gt_str_append_cstr(str, desc);
      gt_str_append_cstr(str, " (");
      gt_str_append_ulong(str, i+1);
      gt_str_append_cstr(str, rev ? "-" : "+");
      gt_str_append_cstr(str, ")");
      gt_fasta_show_entry(gt_str_get(str), gt_str_get(translations[i]),
                          gt_str_length(translations[i]),
                          arguments->fasta_width, arguments->outfp);
      gt_str_reset(translations[i]);
      gt_str_reset(str);
    }
  }
  gt_str_delete(str);
  return had_err;
}
Пример #9
0
static int gt_shredder_runner(GT_UNUSED int argc, const char **argv,
                              int parsed_args, void *tool_arguments,
                              GtError *err)
{
  GtShredderArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  unsigned long i;
  GtBioseq *bioseq;
  int had_err;
  GtStr *desc;

  gt_error_check(err);
  gt_assert(arguments);

  /* init */
  desc = gt_str_new();
  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  /* shredder */
  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) && bioseq) {
    for (i = 0; i < arguments->coverage; i++) {
      GtShredder *shredder;
      unsigned long fragment_length;
      const char *fragment;
      shredder = gt_shredder_new(bioseq, arguments->minlength,
                              arguments->maxlength);
      gt_shredder_set_overlap(shredder, arguments->overlap);
      gt_shredder_set_sample_probability(shredder,
                                         arguments->sample_probability);
      while ((fragment = gt_shredder_shred(shredder, &fragment_length, desc))) {
        gt_str_append_cstr(desc, " [shreddered fragment]");
        gt_fasta_show_entry(gt_str_get(desc), fragment, fragment_length, 0);
      }
      gt_shredder_delete(shredder);
    }
    gt_bioseq_delete(bioseq);
  }

  /* free */
  gt_bioseq_iterator_delete(bsi);
  gt_str_delete(desc);

  return had_err;
}
Пример #10
0
static int extractseq_pos(GtFile *outfp, GtBioseq *bs,
                          unsigned long frompos, unsigned long topos,
                          unsigned long width, GtError *err)
{
  int had_err = 0;
  gt_error_check(err);
  gt_assert(bs);
  if (topos > gt_bioseq_get_raw_sequence_length(bs)) {
    gt_error_set(err,
              "argument %lu to option '-%s' is larger than sequence length %lu",
              topos, TOPOS_OPTION_STR, gt_bioseq_get_raw_sequence_length(bs));
    had_err = -1;
  }
  if (!had_err) {
    gt_fasta_show_entry(NULL, gt_bioseq_get_raw_sequence(bs) + frompos - 1,
                        topos - frompos + 1, width, outfp);
  }
  return had_err;
}
Пример #11
0
static int split_description(const char *filename, GtStr *splitdesc,
                             unsigned long width, bool force, GtError *err)
{
  unsigned long i;
  GtBioseq *bioseq;
  GtStr *descname;
  int had_err = 0;
  gt_error_check(err);
  gt_assert(filename && splitdesc && gt_str_length(splitdesc));

  descname = gt_str_new();
  if (!(bioseq = gt_bioseq_new(filename, err)))
    had_err = -1;

  for (i = 0; !had_err && i < gt_bioseq_number_of_sequences(bioseq); i++) {
    GtFile *outfp;
    char *seq;
    gt_str_reset(descname);
    gt_str_append_str(descname, splitdesc);
    gt_str_append_char(descname, '/');
    gt_str_append_cstr(descname, gt_bioseq_get_description(bioseq, i));
    gt_str_append_cstr(descname, gt_file_suffix(filename));
    if (!(outfp = gt_output_file_xopen_forcecheck(gt_str_get(descname), "w",
                                                 force, err))) {
      had_err = -1;
      break;
    }
    seq = gt_bioseq_get_sequence(bioseq, i);
    gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i), seq,
                        gt_bioseq_get_sequence_length(bioseq, i), width,
                        outfp);
    gt_free(seq);
    gt_file_delete(outfp);
  }

  gt_bioseq_delete(bioseq);
  gt_str_delete(descname);

  return had_err;
}
Пример #12
0
static int extractseq_match(GtFile *outfp, GtBioseq *bs,
                            const char *pattern, unsigned long width,
                            GtError *err)
{
  const char *desc;
  unsigned long i;
  bool match;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(bs && pattern);

  for (i = 0; !had_err && i < gt_bioseq_number_of_sequences(bs); i++) {
    desc = gt_bioseq_get_description(bs, i);
    gt_assert(desc);
    had_err = gt_grep(&match, pattern, desc, err);
    if (!had_err && match) {
      gt_fasta_show_entry(desc, gt_bioseq_get_sequence(bs, i),
                          gt_bioseq_get_sequence_length(bs, i), width, outfp);
    }
  }

  return had_err;
}
static int write_pdom(GtLTRdigestFileOutStream *ls, GtArray *pdoms,
                      const char *pdomname, GT_UNUSED GtRegionMapping *rmap,
                      char *desc, GtError *err)
{
  int had_err = 0;
  GtFile *seqfile = NULL,
            *alifile = NULL,
            *aafile = NULL;
  GtUword i = 0,
                seq_length = 0;
  GtStr *pdom_seq,
        *pdom_aaseq;
  gt_error_check(err);

  pdom_seq = gt_str_new();
  pdom_aaseq = gt_str_new();

  /* get protein domain output file */
  seqfile = (GtFile*) gt_hashmap_get(ls->pdomout_files, pdomname);
  if (seqfile == NULL)
  {
    /* no file opened for this domain yet, do it */
    char buffer[GT_MAXFILENAMELEN];
    (void) snprintf(buffer, (size_t) (GT_MAXFILENAMELEN-1), "%s_pdom_%s.fas",
                    ls->fileprefix, pdomname);
    seqfile = gt_file_xopen(buffer, "w+");
    gt_hashmap_add(ls->pdomout_files, gt_cstr_dup(pdomname), seqfile);
  }

  /* get protein alignment output file */
  if (ls->write_pdom_alignments)
  {
    alifile = (GtFile*) gt_hashmap_get(ls->pdomali_files, pdomname);
    if (alifile == NULL)
    {
      /* no file opened for this domain yet, do it */
      char buffer[GT_MAXFILENAMELEN];
      (void) snprintf(buffer, (size_t) (GT_MAXFILENAMELEN-1), "%s_pdom_%s.ali",
                      ls->fileprefix, pdomname);
      alifile = gt_file_xopen(buffer, "w+");
      gt_hashmap_add(ls->pdomali_files, gt_cstr_dup(pdomname), alifile);
    }
  }

  /* get amino acid sequence output file */
  if (ls->write_pdom_aaseqs)
  {
    aafile = (GtFile*) gt_hashmap_get(ls->pdomaa_files, pdomname);
    if (aafile == NULL)
    {
      /* no file opened for this domain yet, do it */
      char buffer[GT_MAXFILENAMELEN];
      (void) snprintf(buffer, (size_t) (GT_MAXFILENAMELEN-1),
                      "%s_pdom_%s_aa.fas",
                      ls->fileprefix, pdomname);
      aafile = gt_file_xopen(buffer, "w+");
      gt_hashmap_add(ls->pdomaa_files, gt_cstr_dup(pdomname), aafile);
    }
  }

  if (gt_array_size(pdoms) > 1UL)
  {
    for (i=1UL; i<gt_array_size(pdoms); i++)
    {
      gt_assert(gt_genome_node_cmp(*(GtGenomeNode**)gt_array_get(pdoms, i),
                                *(GtGenomeNode**)gt_array_get(pdoms, i-1))
                >= 0);
    }
    if (gt_feature_node_get_strand(*(GtFeatureNode**) gt_array_get(pdoms, 0UL))
        == GT_STRAND_REVERSE)
    {
      gt_array_reverse(pdoms);
    }
  }

  /* output protein domain data */
  for (i=0;i<gt_array_size(pdoms);i++)
  {
    GtRange pdom_rng;
    GtStr *ali,
          *aaseq;
    GtFeatureNode *fn;
    int rval;

    fn = *(GtFeatureNode**) gt_array_get(pdoms, i);

    ali = gt_genome_node_get_user_data((GtGenomeNode*) fn, "pdom_alignment");
    aaseq = gt_genome_node_get_user_data((GtGenomeNode*) fn, "pdom_aaseq");
    pdom_rng = gt_genome_node_get_range((GtGenomeNode*) fn);

    rval = gt_extract_feature_sequence(pdom_seq, (GtGenomeNode*) fn,
                                       gt_symbol(gt_ft_protein_match), false,
                                       NULL, NULL, rmap, err);

    if (rval)
    {
      had_err = -1;
      break;
    }
    if (ls->write_pdom_alignments && ali)
    {
      char buf[BUFSIZ];

      /* write away alignment */
      (void) snprintf(buf, BUFSIZ-1, "Protein domain alignment in translated "
                                     "sequence for candidate\n'%s':\n\n",
                                     desc);
      gt_file_xwrite(alifile, buf, (size_t) strlen(buf) * sizeof (char));
      gt_file_xwrite(alifile, gt_str_get(ali),
                        (size_t) gt_str_length(ali) * sizeof (char));
      gt_file_xwrite(alifile, "---\n\n", 5 * sizeof (char));
    }
    if (ls->write_pdom_aaseqs && aaseq)
    {
      /* append amino acid sequence */
      gt_str_append_str(pdom_aaseq, aaseq);
    }
    gt_genome_node_release_user_data((GtGenomeNode*) fn, "pdom_alignment");
    gt_genome_node_release_user_data((GtGenomeNode*) fn, "pdom_aaseq");
    seq_length += gt_range_length(&pdom_rng);
  }

  if (!had_err)
  {
    gt_fasta_show_entry(desc,
                        gt_str_get(pdom_seq),
                        seq_length,
                        GT_FSWIDTH,
                        seqfile);
    if (ls->write_pdom_aaseqs)
    {
      gt_fasta_show_entry(desc,
                          gt_str_get(pdom_aaseq),
                          gt_str_length(pdom_aaseq),
                          GT_FSWIDTH,
                          aafile);
    }
  }
  gt_str_delete(pdom_seq);
  gt_str_delete(pdom_aaseq);
  return had_err;
}
Пример #14
0
int gt_extractkeysfromfastafile(bool verbose,
                                GtFile *outfp,
                                unsigned long width,
                                const GtStr *fileofkeystoextract,
                                GtStrArray *referencefiletab,
                                GtError *err)
{
  GtSeqIterator *seqit;
  const GtUchar *sequence;
  char *desc, *headerbufferspace = NULL, *keyspace = NULL;
  const char *keyptr;
  unsigned long allockeyspace = 0, len, keylen, numofqueries, keyposition,
                countmarkhit = 0;
  int had_err = 0;
  off_t totalsize;
  Fastakeyquery *fastakeyqueries;
  size_t headerbuffersize = 0, headerlength;

  gt_error_check(err);
  fastakeyqueries = readfileofkeystoextract(verbose,&numofqueries,
                                            fileofkeystoextract,err);
  if (fastakeyqueries == NULL)
  {
    return -1;
  }
  totalsize = gt_files_estimate_total_size(referencefiletab);
  if (verbose)
  {
    printf("# estimated total size is " Formatuint64_t "\n",
            PRINTuint64_tcast(totalsize));
  }
  seqit = gt_seqiterator_sequence_buffer_new(referencefiletab, err);
  if (!seqit)
  {
    had_err = -1;
  }
  if (!had_err && verbose)
  {
    gt_progressbar_start(gt_seqiterator_getcurrentcounter(seqit,
                                                          (unsigned long long)
                                                          totalsize),
                                                          (unsigned long long)
                                                          totalsize);
  }
  while (had_err != -1 && countmarkhit < numofqueries)
  {
    had_err = gt_seqiterator_next(seqit, &sequence, &len, &desc, err);
    if (had_err != 1)
    {
      break;
    }
    keyptr = desc2key(&keylen,desc,err);
    if (keyptr == NULL)
    {
      had_err = -1;
    } else
    {
      if (allockeyspace < keylen)
      {
        keyspace = gt_realloc(keyspace,sizeof (*keyspace) * (keylen+1));
        allockeyspace = keylen;
      }
      gt_assert(keyspace != NULL);
      strncpy(keyspace,keyptr,(size_t) keylen);
      keyspace[keylen] = '\0';
      keyposition = searchdesinfastakeyqueries(keyspace,fastakeyqueries,
                                               numofqueries);
      if (keyposition < numofqueries)
      {
        while (keyposition < numofqueries &&
               strcmp(fastakeyqueries[keyposition].fastakey,keyspace) == 0)
        {
#ifndef NDEBUG
          if (fastakeyqueries[keyposition].markhit)
          {
            fprintf(stderr,"key %s was already found before\n",
                     fastakeyqueries[keyposition].fastakey);
            exit(GT_EXIT_PROGRAMMING_ERROR);
          }
#endif
          headerlength = strlen(desc);
          if (headerbuffersize < headerlength + EXTRABUF + 1)
          {
            headerbuffersize = headerlength + EXTRABUF + 1;
            headerbufferspace = gt_realloc(headerbufferspace,
                                           sizeof (*headerbufferspace)
                                           * headerbuffersize);
          }
          if (COMPLETE(fastakeyqueries + keyposition))
          {
            /*
            (void) snprintf(headerbufferspace,headerbuffersize,
                            "%*.*s complete %s",
                            (int) keylen,(int) keylen,keyspace,
                            desc);
            */
            gt_fasta_show_entry(desc, (const char *) sequence, len, width,
                                outfp);
          } else
          {
            (void) snprintf(headerbufferspace,headerbuffersize,
                            "%*.*s %lu %lu %s",
                            (int) keylen,(int) keylen,keyspace,
                            fastakeyqueries[keyposition].frompos,
                            fastakeyqueries[keyposition].topos,
                            desc);
            gt_fasta_show_entry(headerbufferspace,
                                (const char *)
                                (sequence+fastakeyqueries[keyposition].
                                                          frompos - 1),
                                fastakeyqueries[keyposition].topos -
                                fastakeyqueries[keyposition].frompos+1,
                                width, outfp);
          }
          fastakeyqueries[keyposition].markhit = true;
          countmarkhit++;
          keyposition++;
        }
      }
#ifdef SKDEBUG
      printf("%s 1 %lu\n",keyspace, len);
#endif
    }
  }
  gt_free(headerbufferspace);
  gt_free(keyspace);
  if (verbose)
  {
    gt_progressbar_stop();
  }
  if (verbose)
  {
    outputnonmarked(fastakeyqueries,numofqueries);
  }
  fastakeyqueries_delete(fastakeyqueries,numofqueries);
  gt_seqiterator_delete(seqit);
  return had_err;
}
Пример #15
0
static int gt_seqfilter_runner(int argc, const char **argv, int parsed_args,
                               void *tool_arguments, GtError *err)
{
  SeqFilterArguments *arguments = tool_arguments;
  GtBioseqIterator *bsi;
  GtBioseq *bioseq;
  GtUint64 passed = 0, filtered = 0, num_of_sequences = 0, steps = 0;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(tool_arguments);

  bsi = gt_bioseq_iterator_new(argc - parsed_args, argv + parsed_args);

  while (!(had_err = gt_bioseq_iterator_next(bsi, &bioseq, err)) &&
         bioseq != NULL) {
    GtUword i;
    GtUint64 current_num = gt_bioseq_number_of_sequences(bioseq);
    for (i = 0;
         i < current_num &&
         (arguments->maxseqnum == GT_UNDEF_UWORD ||
          passed + 1 <= arguments->maxseqnum);
         i++) {
      char *seq;
      if ((arguments->step == 1 ||
           steps + 1 == arguments->step) &&
          (arguments->sample_prob == 1.0 ||
           gt_rand_0_to_1() <= arguments->sample_prob) &&
          (arguments->minlength == GT_UNDEF_UWORD ||
           gt_bioseq_get_sequence_length(bioseq, i) >= arguments->minlength) &&
          (arguments->maxlength == GT_UNDEF_UWORD ||
           gt_bioseq_get_sequence_length(bioseq, i) <= arguments->maxlength)) {
        seq = gt_bioseq_get_sequence(bioseq, i);
        gt_fasta_show_entry(gt_bioseq_get_description(bioseq, i),
                            seq,
                            gt_bioseq_get_sequence_length(bioseq, i),
                            arguments->width, arguments->outfp);
        gt_free(seq);
        passed++;
      }
      else {
        filtered++;
      }
      steps = (steps + 1 == arguments->step) ? 0 : steps + 1;
    }
    filtered += current_num - i;
    num_of_sequences += current_num;
    gt_bioseq_delete(bioseq);
  }

  /* show statistics */
  if (!had_err) {
    gt_assert(passed + filtered == num_of_sequences);
    fprintf(stderr, "# " GT_LLU " out of " GT_LLU
            " sequences have been removed (%.3f%%)\n",
            filtered, num_of_sequences,
            ((double) filtered / num_of_sequences) * 100.0);
  }

  gt_bioseq_iterator_delete(bsi);

  return had_err;
}
Пример #16
0
static int gt_sequniq_runner(int argc, const char **argv, int parsed_args,
                             void *tool_arguments, GtError *err)
{
  GtSequniqArguments *arguments = tool_arguments;
  GtUint64 duplicates = 0, num_of_sequences = 0;
  int i, had_err = 0;
  GtMD5Set *md5set;

  gt_error_check(err);
  gt_assert(arguments);
  md5set = gt_md5set_new(arguments->nofseqs);
  if (!arguments->seqit) {
    GtUword j;
    GtBioseq *bs;

    for (i = parsed_args; !had_err && i < argc; i++) {
      if (!(bs = gt_bioseq_new(argv[i], err)))
        had_err = -1;
      if (!had_err) {
        GtMD5SetStatus retval;
        for (j = 0; j < gt_bioseq_number_of_sequences(bs) && !had_err; j++) {
          char *seq = gt_bioseq_get_sequence(bs, j);
          retval = gt_md5set_add_sequence(md5set, seq,
                                          gt_bioseq_get_sequence_length(bs, j),
                                          arguments->rev, err);
          if (retval == GT_MD5SET_NOT_FOUND)
            gt_fasta_show_entry(gt_bioseq_get_description(bs, j), seq,
                                gt_bioseq_get_sequence_length(bs, j),
                                arguments->width, arguments->outfp);
          else if (retval != GT_MD5SET_ERROR)
            duplicates++;
          else
            had_err = -1;
          num_of_sequences++;
          gt_free(seq);
        }
        gt_bioseq_delete(bs);
      }
    }
  }
  else {
    GtSeqIterator *seqit;
    GtStrArray *files;
    off_t totalsize;
    const GtUchar *sequence;
    char *desc;
    GtUword len;

    files = gt_str_array_new();
    for (i = parsed_args; i < argc; i++)
      gt_str_array_add_cstr(files, argv[i]);
    totalsize = gt_files_estimate_total_size(files);
    seqit = gt_seq_iterator_sequence_buffer_new(files, err);
    if (!seqit)
      had_err = -1;
    if (!had_err) {
      if (arguments->verbose) {
        gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                          (GtUint64) totalsize),
                             (GtUint64) totalsize);
      }
      while (!had_err) {
        GtMD5SetStatus retval;
        if ((gt_seq_iterator_next(seqit, &sequence, &len, &desc, err)) != 1)
          break;

        retval = gt_md5set_add_sequence(md5set, (const char*) sequence, len,
                                        arguments->rev, err);
        if (retval == GT_MD5SET_NOT_FOUND)
          gt_fasta_show_entry(desc, (const char*) sequence, len,
                              arguments->width, arguments->outfp);
        else if (retval != GT_MD5SET_ERROR)
          duplicates++;
        else
          had_err = -1;
        num_of_sequences++;
      }
      if (arguments->verbose)
        gt_progressbar_stop();
      gt_seq_iterator_delete(seqit);
    }
    gt_str_array_delete(files);
  }

  /* show statistics */
  if (!had_err) {
    fprintf(stderr,
            "# "GT_WU" out of "GT_WU" sequences have been removed (%.3f%%)\n",
            (GtUword)duplicates, (GtUword)num_of_sequences,
            ((double) duplicates / (double)num_of_sequences) * 100.0);
  }

  gt_md5set_delete(md5set);
  return had_err;
}
int gt_ltrfileout_stream_next(GtNodeStream *ns, GtGenomeNode **gn, GtError *err)
{
  GtLTRdigestFileOutStream *ls;
  GtFeatureNode *fn;
  GtRange lltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          rltr_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          ppt_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD},
          pbs_rng = {GT_UNDEF_UWORD, GT_UNDEF_UWORD};
  int had_err;
  GtUword i=0;

  gt_error_check(err);
  ls = gt_ltrdigest_file_out_stream_cast(ns);

  /* initialize this element */
  memset(&ls->element, 0, sizeof (GtLTRElement));

  /* get annotations from parser */
  had_err = gt_node_stream_next(ls->in_stream, gn, err);
  if (!had_err && *gn)
  {
    GtFeatureNodeIterator* gni;
    GtFeatureNode *mygn;

    /* only process feature nodes */
    if (!(fn = gt_feature_node_try_cast(*gn)))
      return 0;

    ls->element.pdomorder = gt_array_new(sizeof (const char*));

    /* fill LTRElement structure from GFF3 subgraph */
    gni = gt_feature_node_iterator_new(fn);
    for (mygn = fn; mygn != NULL; mygn = gt_feature_node_iterator_next(gni))
      (void) gt_genome_node_accept((GtGenomeNode*) mygn,
                                   (GtNodeVisitor*) ls->lv,
                                   err);
    gt_feature_node_iterator_delete(gni);
  }

  if (!had_err && ls->element.mainnode != NULL)
  {
    char desc[GT_MAXFASTAHEADER];
    GtFeatureNode *ltr3, *ltr5;
    GtStr *sdesc, *sreg, *seq;

    /* find sequence in GtEncseq */
    sreg = gt_genome_node_get_seqid((GtGenomeNode*) ls->element.mainnode);

    sdesc = gt_str_new();
    had_err = gt_region_mapping_get_description(ls->rmap, sdesc, sreg, err);

    if (!had_err) {
      GtRange rng;
      ls->element.seqid = gt_calloc((size_t) ls->seqnamelen+1, sizeof (char));
      (void) snprintf(ls->element.seqid,
                      MIN((size_t) gt_str_length(sdesc),
                          (size_t) ls->seqnamelen)+1,
                      "%s", gt_str_get(sdesc));
      gt_cstr_rep(ls->element.seqid, ' ', '_');
      if (gt_str_length(sdesc) > (GtUword) ls->seqnamelen)
        ls->element.seqid[ls->seqnamelen] = '\0';

      (void) gt_ltrelement_format_description(&ls->element,
                                              ls->seqnamelen,
                                              desc,
                                              (size_t) (GT_MAXFASTAHEADER-1));
      gt_str_delete(sdesc);

      /* output basic retrotransposon data */
      lltr_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftLTR);
      rltr_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightLTR);
      rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.mainnode);
      gt_file_xprintf(ls->tabout_file,
                      GT_WU"\t"GT_WU"\t"GT_WU"\t%s\t"GT_WU"\t"GT_WU"\t"GT_WU"\t"
                      GT_WU"\t"GT_WU"\t"GT_WU"\t",
                      rng.start, rng.end, gt_ltrelement_length(&ls->element),
                      ls->element.seqid, lltr_rng.start, lltr_rng.end,
                      gt_ltrelement_leftltrlen(&ls->element), rltr_rng.start,
                      rltr_rng.end, gt_ltrelement_rightltrlen(&ls->element));
    }
    seq = gt_str_new();

    /* output TSDs */
    if (!had_err && ls->element.leftTSD != NULL)
    {
      GtRange tsd_rng;
      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.leftTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.leftTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                         ""GT_WU"\t"GT_WU"\t%s\t",
                         tsd_rng.start,
                         tsd_rng.end,
                         gt_str_get(seq));
      }
    gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    if (!had_err && ls->element.rightTSD != NULL)
    {
      GtRange tsd_rng;

      tsd_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.rightTSD);
      had_err = gt_extract_feature_sequence(seq,
                                       (GtGenomeNode*) ls->element.rightTSD,
                                       gt_symbol(gt_ft_target_site_duplication),
                                       false,
                                       NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_file_xprintf(ls->tabout_file,
                           ""GT_WU"\t"GT_WU"\t%s\t",
                           tsd_rng.start,
                           tsd_rng.end,
                           gt_str_get(seq));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t");

    /* output PPT */
    if (!had_err && ls->element.ppt != NULL)
    {
      GtStrand ppt_strand = gt_feature_node_get_strand(ls->element.ppt);

      ppt_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.ppt);
      had_err = gt_extract_feature_sequence(seq,
                                            (GtGenomeNode*) ls->element.ppt,
                                            gt_symbol(gt_ft_RR_tract), false,
                                            NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&ppt_rng),
                            GT_FSWIDTH, ls->pptout_file);
        gt_file_xprintf(ls->tabout_file,
                           ""GT_WU"\t"GT_WU"\t%s\t%c\t%d\t",
                           ppt_rng.start,
                           ppt_rng.end,
                           gt_str_get(seq),
                           GT_STRAND_CHARS[ppt_strand],
                           (ppt_strand == GT_STRAND_FORWARD ?
                               abs((int) (rltr_rng.start - ppt_rng.end)) :
                               abs((int) (lltr_rng.end - ppt_rng.start))));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t");

    /* output PBS */
    if (!had_err && ls->element.pbs != NULL)
    {
      GtStrand pbs_strand;

      pbs_strand = gt_feature_node_get_strand(ls->element.pbs);
      pbs_rng = gt_genome_node_get_range((GtGenomeNode*) ls->element.pbs);
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.pbs,
                                           gt_symbol(gt_ft_primer_binding_site),
                                           false, NULL, NULL, ls->rmap, err);
      if (!had_err) {
        gt_fasta_show_entry(desc, gt_str_get(seq), gt_range_length(&pbs_rng),
                            GT_FSWIDTH, ls->pbsout_file);
        gt_file_xprintf(ls->tabout_file,
                         ""GT_WU"\t"GT_WU"\t%c\t%s\t%s\t%s\t%s\t%s\t",
                         pbs_rng.start,
                         pbs_rng.end,
                         GT_STRAND_CHARS[pbs_strand],
                         gt_feature_node_get_attribute(ls->element.pbs, "trna"),
                         gt_str_get(seq),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "pbsoffset"),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "trnaoffset"),
                         gt_feature_node_get_attribute(ls->element.pbs,
                                                       "edist"));
      }
      gt_str_reset(seq);
    } else gt_file_xprintf(ls->tabout_file, "\t\t\t\t\t\t\t\t");

    /* output protein domains */
    if (!had_err && ls->element.pdoms != NULL)
    {
      GtStr *pdomorderstr = gt_str_new();
      for (i=0; !had_err && i<gt_array_size(ls->element.pdomorder); i++)
      {
        const char* key = *(const char**) gt_array_get(ls->element.pdomorder,
                                                       i);
        GtArray *entry = (GtArray*) gt_hashmap_get(ls->element.pdoms, key);
        had_err = write_pdom(ls, entry, key, ls->rmap, desc, err);
      }

      if (GT_STRAND_REVERSE == gt_feature_node_get_strand(ls->element.mainnode))
        gt_array_reverse(ls->element.pdomorder);

      for (i=0 ;!had_err && i<gt_array_size(ls->element.pdomorder); i++)
      {
        const char* name = *(const char**) gt_array_get(ls->element.pdomorder,
                                                        i);
        gt_str_append_cstr(pdomorderstr, name);
        if (i != gt_array_size(ls->element.pdomorder)-1)
          gt_str_append_cstr(pdomorderstr, "/");
      }
      gt_file_xprintf(ls->tabout_file, "%s", gt_str_get(pdomorderstr));
      gt_str_delete(pdomorderstr);
    }

    /* output LTRs (we just expect them to exist) */
    switch (gt_feature_node_get_strand(ls->element.mainnode))
    {
      case GT_STRAND_REVERSE:
        ltr5 = ls->element.rightLTR;
        ltr3 = ls->element.leftLTR;
        break;
      case GT_STRAND_FORWARD:
      default:
        ltr5 = ls->element.leftLTR;
        ltr3 = ls->element.rightLTR;
        break;
    }

    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr5,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr5out_file);
      gt_str_reset(seq);
    }
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq, (GtGenomeNode*) ltr3,
                                          gt_symbol(gt_ft_long_terminal_repeat),
                                          false,
                                          NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc, gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->ltr3out_file);
      gt_str_reset(seq);
    }

    /* output complete oriented element */
    if (!had_err) {
      had_err = gt_extract_feature_sequence(seq,
                                           (GtGenomeNode*) ls->element.mainnode,
                                           gt_symbol(gt_ft_LTR_retrotransposon),
                                           false,
                                           NULL, NULL, ls->rmap, err);
    }
    if (!had_err) {
      gt_fasta_show_entry(desc,gt_str_get(seq), gt_str_length(seq),
                          GT_FSWIDTH, ls->elemout_file);
      gt_str_reset(seq);
    }
    gt_file_xprintf(ls->tabout_file, "\n");
    gt_str_delete(seq);
  }
  gt_hashmap_delete(ls->element.pdoms);
  gt_array_delete(ls->element.pdomorder);
  gt_free(ls->element.seqid);
  return had_err;
}