コード例 #1
0
GtSeqIterator* gt_seq_iterator_sequence_buffer_new(const GtStrArray
                                                                   *filenametab,
                                                  GtError *err)
{
  GtSeqIterator *si;
  GtSequenceBuffer *sb = gt_sequence_buffer_new_guess_type(filenametab, err);
  if (!sb)
    return NULL;
  si = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
  gt_sequence_buffer_delete(sb); /* drop this reference */
  return si;
}
コード例 #2
0
ファイル: fileutils.c プロジェクト: kowsky/genometools
int gt_files_guess_if_protein_sequences(const GtStrArray *filenames,
                                        GtError *err)
{
  unsigned int countnonbases = 0,
               currentposition;
  GtUchar currentchar;
  GtSequenceBuffer *fb;
  int retval;

  gt_error_check(err);
  fb = gt_sequence_buffer_new_guess_type(filenames, err);
  if (!fb) return -1;

  for (currentposition = 0; currentposition < 1000U;
       currentposition++)
  {
    retval = gt_sequence_buffer_next(fb,&currentchar,err);
    if (retval < 0)
    {
      gt_sequence_buffer_delete(fb);
      return -1;
    }
    if (retval == 0)
    {
      break;
    }
    switch (currentchar)
    {
      case 'L':
      case 'I':
      case 'F':
      case 'E':
      case 'Q':
      case 'P':
      case 'X':
      case 'Z': countnonbases++;
                break;
      default:  break;
    }
    if (countnonbases > 0)
    {
      break;
    }
  }
  gt_sequence_buffer_delete(fb);
  if (countnonbases > 0)
  {
    return 1; /* guess it is a protein sequence */
  }
  return 0; /* guess it is a dna sequence */
}
コード例 #3
0
GtKmercodeiterator *gt_kmercodeiterator_filetab_new(
                                                const GtStrArray *filenametab,
                                                unsigned int numofchars,
                                                unsigned int kmersize,
                                                const GtUchar *symbolmap,
                                                bool plainformat,
                                                GtError *err)
{
  GtKmercodeiterator *kmercodeiterator;
  GtUchar charcode;
  bool haserr = false;
  int retval;

  gt_error_check(err);
  kmercodeiterator = gt_malloc(sizeof (*kmercodeiterator));
  kmercodeiterator->esr = NULL;
  kmercodeiterator->hasprocessedfirst = false;
  kmercodeiterator->inputexhausted = false;
  kmercodeiterator->spwp = kmerstream_new(numofchars,kmersize);
  kmercodeiterator->totallength = 0;
  if (plainformat)
  {
    kmercodeiterator->fb = gt_sequence_buffer_plain_new(filenametab);
  } else
  {
    kmercodeiterator->fb = gt_sequence_buffer_new_guess_type(filenametab, err);
  }
  if (kmercodeiterator->fb == NULL)
  {
    haserr = true;
  }
  if (!haserr)
  {
    gt_sequence_buffer_set_symbolmap(kmercodeiterator->fb, symbolmap);
    for (kmercodeiterator->currentposition = 0;
         kmercodeiterator->currentposition < (unsigned long) kmersize;
         kmercodeiterator->currentposition++)
    {
      retval = gt_sequence_buffer_next(kmercodeiterator->fb,&charcode,err);
      if (retval < 0)
      {
        haserr = true;
        break;
      }
      if (retval == 0)
      {
        kmercodeiterator->inputexhausted = true;
        break;
      }
      kmercodeiterator->spwp->windowwidth++;
      updatespecialpositions(kmercodeiterator->spwp,charcode,false,0);
      kmercodeiterator->spwp->cyclicwindow[kmercodeiterator->
                                           spwp->windowwidth-1] = charcode;
    }
  }
  if (haserr)
  {
    gt_kmercodeiterator_delete(kmercodeiterator);
    return NULL;
  }
  return kmercodeiterator;
}
コード例 #4
0
static int testfullscan(const GtStrArray *filenametab,
                        const Encodedsequence *encseq,
                        Readmode readmode,
                        GtError *err)
{
  Seqpos pos, totallength;
  GtUchar ccscan = 0, ccra, ccsr;
  GtSequenceBuffer *fb = NULL;
  int retval;
  bool haserr = false;
  Encodedsequencescanstate *esr;
  unsigned long long fullscanpbar = 0;

  gt_error_check(err);
  totallength = getencseqtotallength(encseq);
  gt_progressbar_start(&fullscanpbar,(unsigned long long) totallength);
  if (filenametab != NULL)
  {
    fb = gt_sequence_buffer_new_guess_type((GtStrArray*) filenametab, err);
    if (!fb)
      haserr = true;
    if (!haserr)
      gt_sequence_buffer_set_symbolmap(fb, getencseqAlphabetsymbolmap(encseq));
  }
  if (!haserr) {
    esr = newEncodedsequencescanstate();
    initEncodedsequencescanstate(esr,encseq,readmode,0);
    for (pos=0; /* Nothing */; pos++)
    {
      if (filenametab != NULL && readmode == Forwardmode)
      {
        retval = gt_sequence_buffer_next(fb,&ccscan,err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
      } else
      {
        if (pos >= totallength)
        {
          break;
        }
      }
      ccra = getencodedchar(encseq,pos,readmode); /* Random access */
      if (filenametab != NULL && readmode == Forwardmode)
      {
        if (ccscan != ccra)
        {
          gt_error_set(err,"access=%s, position=" FormatSeqpos
                            ": scan (readnextchar) = %u != "
                            "%u = random access",
                            encseqaccessname(encseq),
                            pos,
                            (unsigned int) ccscan,
                            (unsigned int) ccra);
          haserr = true;
          break;
        }
      }
      ccsr = sequentialgetencodedchar(encseq,esr,pos,readmode);
      if (ccra != ccsr)
      {
        gt_error_set(err,"access=%s, mode=%s: position=" FormatSeqpos
                          ": random access = %u != %u = sequential read",
                          encseqaccessname(encseq),
                          showreadmode(readmode),
                          pos,
                          (unsigned int) ccra,
                          (unsigned int) ccsr);
        haserr = true;
        break;
      }
      fullscanpbar++;
    }
    gt_progressbar_stop();
  }
  if (!haserr)
  {
    if (pos != totallength)
    {
      gt_error_set(err,"sequence length must be " FormatSeqpos " but is "
                         FormatSeqpos,totallength,pos);
      haserr = true;
    }
  }
  freeEncodedsequencescanstate(&esr);
  gt_sequence_buffer_delete(fb);
  return haserr ? -1 : 0;
}
コード例 #5
0
static int gt_seqtranslate_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GT_UNUSED GtError *err)
{
  GtTranslateArguments *arguments = tool_arguments;
  GtSeqIterator *si = NULL;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *infiles;
  int had_err = 0,
      rval,
      i;
  GtStr *translations[3];
  translations[0] = gt_str_new();
  translations[1] = gt_str_new();
  translations[2] = gt_str_new();

  gt_error_check(err);
  gt_assert(arguments);

  infiles = gt_str_array_new();
  for (i = parsed_args; i < argc; i++) {
    gt_str_array_add_cstr(infiles, argv[i]);
  }
  sb = gt_sequence_buffer_new_guess_type(infiles, err);
  if (!sb)
    had_err = -1;
  if (!had_err) {
    si = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (!si)
      had_err = -1;
  }
  if (!had_err) {
    char *desc;
    const GtUchar *sequence;
    GtUword len;
    while (!had_err && (rval = gt_seq_iterator_next(si,
                                                   &sequence,
                                                   &len, &desc, err))) {
      if (rval < 0) {
        had_err = -1;
        break;
      }
      if (len < GT_CODON_LENGTH) {
        gt_warning("sequence '%s' is shorter than codon length of %d, skipping",
                   desc, GT_CODON_LENGTH);
      } else {
        had_err = gt_seqtranslate_do_translation(arguments, (char*) sequence,
                                                 len, desc,
                                                 translations, false, err);
        if (!had_err && arguments->reverse) {
          char *revseq = gt_cstr_dup_nt((char*) sequence, len);
          had_err = gt_reverse_complement(revseq, len, err);
          if (!had_err) {
            had_err = gt_seqtranslate_do_translation(arguments, revseq, len,
                                                  desc, translations, true,
                                                  err);
          }
          gt_free(revseq);
        }
      }
    }
  }
  gt_str_delete(translations[0]);
  gt_str_delete(translations[1]);
  gt_str_delete(translations[2]);
  gt_str_array_delete(infiles);
  gt_seq_iterator_delete(si);
  gt_sequence_buffer_delete(sb);
  return had_err;
}
コード例 #6
0
ファイル: gt_convertseq.c プロジェクト: AnnSeidel/genometools
static int gt_convertseq_runner(int argc, const char **argv, int parsed_args,
                              void *tool_arguments, GtError *err)
{
  GtConvertseqArguments *arguments = tool_arguments;
  int had_err = 0, i;
  GtFilelengthvalues *flv;
  GtSeqIterator *seqit;
  GtSequenceBuffer *sb = NULL;
  GtStrArray *files;
  const GtUchar *sequence;
  char *desc;
  GtUword len, j;
  off_t totalsize;
  gt_error_check(err);
  gt_assert(arguments != NULL);

  files = gt_str_array_new();
  for (i = parsed_args; i < argc; i++)
  {
    gt_str_array_add_cstr(files, argv[i]);
  }
  totalsize = gt_files_estimate_total_size(files);

  flv = gt_calloc((size_t) gt_str_array_size(files),
                  sizeof (GtFilelengthvalues));

  sb = gt_sequence_buffer_new_guess_type(files, err);
  if (!sb) {
    had_err = -1;
  }
  if (!had_err) {
    gt_sequence_buffer_set_filelengthtab(sb, flv);
    /* read input using seqiterator */
    seqit = gt_seq_iterator_sequence_buffer_new_with_buffer(sb);
    if (arguments->verbose)
    {
      gt_progressbar_start(gt_seq_iterator_getcurrentcounter(seqit,
                                                           (GtUint64)
                                                           totalsize),
                           (GtUint64) totalsize);
    }
    while (true)
    {
      GtUchar *seq = NULL;
      desc = NULL;
      j = 0UL;
      had_err = gt_seq_iterator_next(seqit, &sequence, &len, &desc, err);
      if (had_err != 1)
        break;
      if (arguments->revcomp) {
        GtUchar *newseq = gt_calloc((size_t) len+1, sizeof (GtUchar));
        memcpy(newseq, sequence, (size_t) len*sizeof (GtUchar));
        had_err = gt_reverse_complement((char*) newseq, len, err);
        if (had_err)
          break;
        seq = newseq;
      } else seq = (GtUchar*) sequence;

      if (!arguments->showseq) {
        bool in_wildcard = false;
        gt_file_xprintf(arguments->outfp, ">%s\n", desc);
        for (i = 0; (GtUword) i < len; i++) {
          if (arguments->reduce_wc_dna) {
            switch (seq[i]) {
              case 'a':
              case 'A':
              case 'c':
              case 'C':
              case 'g':
              case 'G':
              case 't':
              case 'u':
              case 'T':
              case 'U':
                in_wildcard = false;
                gt_file_xfputc((int) seq[i], arguments->outfp);
                j++;
                break;
              default:
                if (!in_wildcard) {
                  in_wildcard = true;
                  if (isupper((int) seq[i]))
                    gt_file_xfputc((int) 'N', arguments->outfp);
                  else
                    gt_file_xfputc((int) 'n', arguments->outfp);
                  j++;
                }
            }
          }
          else if (arguments->reduce_wc_prot) {
            switch (seq[i]) {
              case 'X':
              case 'B':
              case 'Z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'N', arguments->outfp);
                  j++;
                }
                break;
              case 'x':
              case 'b':
              case 'z':
                if (!in_wildcard) {
                  in_wildcard = true;
                  gt_file_xfputc((int) 'n', arguments->outfp);
                  j++;
                }
                break;
              default:
                in_wildcard = false;
                gt_file_xfputc((int) seq[i], arguments->outfp);
                j++;
            }
          }
          else {
            gt_file_xfputc((int) seq[i], arguments->outfp);
            j++;
          }
          if (arguments->fastawidth > 0 && j % arguments->fastawidth == 0) {
            j = 0;
            gt_file_xprintf(arguments->outfp, "\n");
          }
        }
        if (arguments->fastawidth == 0 || len % arguments->fastawidth != 0)
            gt_file_xprintf(arguments->outfp, "\n");
      }
      if (arguments->revcomp) {
        gt_free(seq);
      }
    }
    if (arguments->showflv) {
      for (j=0;j<gt_str_array_size(files);j++) {
        fprintf(stderr, "file "GT_WU" (%s): "GT_WU"/"GT_WU"\n",
               j,
               gt_str_array_get(files, j),
               (GtUword) flv[j].length,
               (GtUword) flv[j].effectivelength);
      }
    }
    if (arguments->verbose)
    {
      gt_progressbar_stop();
    }
    gt_sequence_buffer_delete(sb);
    gt_seq_iterator_delete(seqit);
  }
  gt_str_array_delete(files);
  gt_free(flv);

  return had_err;
}