Beispiel #1
0
/*
 * Guess whether the sequences stored in <filenames> are protein sequences.
 *
 * At most the first 1000 characters delivered by the sequence buffer are
 * inspected. As soon as one of the letters L, I, F, E, Q, P, X or Z shows
 * up, the input is taken to be protein; if none of them is seen (or the
 * input ends first), it is taken to be DNA.
 *
 * Returns 1 for protein, 0 for DNA, and -1 if the sequence buffer could not
 * be created or reading from it failed.
 */
int gt_files_guess_if_protein_sequences(const GtStrArray *filenames,
                                        GtError *err)
{
  const unsigned int maxprobed = 1000U;
  unsigned int numprobed;
  int verdict = 0; /* 0 = dna, 1 = protein, -1 = error */
  GtUchar cc;
  GtSequenceBuffer *seqbuf;

  gt_error_check(err);
  seqbuf = gt_sequence_buffer_new_guess_type(filenames, err);
  if (!seqbuf)
  {
    return -1;
  }
  /* stop probing as soon as a verdict other than "dna so far" is reached */
  for (numprobed = 0; verdict == 0 && numprobed < maxprobed; numprobed++)
  {
    const int status = gt_sequence_buffer_next(seqbuf, &cc, err);

    if (status < 0)
    {
      verdict = -1;
      break;
    }
    if (status == 0)
    {
      /* input exhausted before any protein-only letter was seen */
      break;
    }
    if (cc == 'L' || cc == 'I' || cc == 'F' || cc == 'E' ||
        cc == 'Q' || cc == 'P' || cc == 'X' || cc == 'Z')
    {
      verdict = 1; /* guess it is a protein sequence */
    }
  }
  gt_sequence_buffer_delete(seqbuf);
  return verdict;
}
Beispiel #2
0
/*
 * Deliver the next k-mer code of a file-based k-mer code iterator.
 *
 * On the first call the code of the current window is computed without
 * reading a further character. On later calls one character is read from
 * the sequence buffer, shifted into the window, and the new code is
 * computed. Once the sequence buffer reports the end of the input, the
 * window is fed with WILDCARD symbols for as long as currentposition is
 * smaller than totallength + kmersize; after that *kmercodeptr is set to
 * NULL.
 *
 * Returns 0 on success (including the case where *kmercodeptr is NULL) and
 * -1 if reading from the sequence buffer failed; in the latter case
 * *kmercodeptr is NULL as well.
 */
int gt_kmercodeiterator_filetab_next(const GtKmercode **kmercodeptr,
                                     GtKmercodeiterator *kmercodeiterator,
                                     GtError *err)
{
  if (!kmercodeiterator->inputexhausted)
  {
    GtUchar cc;
    int status;

    if (!kmercodeiterator->hasprocessedfirst)
    {
      /* very first code: use the window as it currently is */
      kmerstream_newcode(&kmercodeiterator->kmercode, kmercodeiterator->spwp);
      kmercodeiterator->hasprocessedfirst = true;
      *kmercodeptr = &kmercodeiterator->kmercode;
      return 0;
    }
    status = gt_sequence_buffer_next(kmercodeiterator->fb, &cc, err);
    if (status < 0)
    {
      *kmercodeptr = NULL;
      return -1;
    }
    if (status != 0)
    {
      /* regular case: slide the window by the character just read */
      shiftrightwithchar(kmercodeiterator->spwp, cc);
      kmerstream_newcode(&kmercodeiterator->kmercode, kmercodeiterator->spwp);
      kmercodeiterator->currentposition++;
      *kmercodeptr = &kmercodeiterator->kmercode;
      return 0;
    }
    /* end of input: remember the length and continue with the padding */
    kmercodeiterator->inputexhausted = true;
    kmercodeiterator->totallength = kmercodeiterator->currentposition;
  }
  if (kmercodeiterator->currentposition >=
      kmercodeiterator->totallength + kmercodeiterator->spwp->kmersize)
  {
    /* padding is complete, nothing more to deliver */
    *kmercodeptr = NULL;
    return 0;
  }
  shiftrightwithchar(kmercodeiterator->spwp, (GtUchar) WILDCARD);
  kmerstream_newcode(&kmercodeiterator->kmercode, kmercodeiterator->spwp);
  kmercodeiterator->currentposition++;
  *kmercodeptr = &kmercodeiterator->kmercode;
  return 0;
}
Beispiel #3
0
/*
 * Create a k-mer code iterator reading from the files in <filenametab>.
 *
 * The sequence buffer is created in plain mode if <plainformat> is set,
 * otherwise the file type is guessed. <symbolmap> is installed on the
 * sequence buffer, and up to <kmersize> characters are read to prefill the
 * k-mer window. If the input ends before that, the iterator is marked as
 * having exhausted its input.
 *
 * Returns the new iterator, or NULL if the sequence buffer could not be
 * created or reading from it failed; in that case the partially built
 * iterator has been handed to gt_kmercodeiterator_delete().
 */
GtKmercodeiterator *gt_kmercodeiterator_filetab_new(
                                                const GtStrArray *filenametab,
                                                unsigned int numofchars,
                                                unsigned int kmersize,
                                                const GtUchar *symbolmap,
                                                bool plainformat,
                                                GtError *err)
{
  GtKmercodeiterator *kci;
  GtUchar cc;
  int status;

  gt_error_check(err);
  kci = gt_malloc(sizeof (*kci));
  kci->esr = NULL;
  kci->hasprocessedfirst = false;
  kci->inputexhausted = false;
  kci->spwp = kmerstream_new(numofchars,kmersize);
  kci->totallength = 0;
  if (!plainformat)
  {
    kci->fb = gt_sequence_buffer_new_guess_type(filenametab, err);
  } else
  {
    kci->fb = gt_sequence_buffer_plain_new(filenametab);
  }
  if (kci->fb == NULL)
  {
    gt_kmercodeiterator_delete(kci);
    return NULL;
  }
  gt_sequence_buffer_set_symbolmap(kci->fb, symbolmap);

  /* prefill the window with the first kmersize characters of the input */
  kci->currentposition = 0;
  while (kci->currentposition < (unsigned long) kmersize)
  {
    status = gt_sequence_buffer_next(kci->fb,&cc,err);
    if (status < 0)
    {
      gt_kmercodeiterator_delete(kci);
      return NULL;
    }
    if (status == 0)
    {
      /* fewer than kmersize characters available */
      kci->inputexhausted = true;
      break;
    }
    kci->spwp->windowwidth++;
    updatespecialpositions(kci->spwp,cc,false,0);
    kci->spwp->cyclicwindow[kci->spwp->windowwidth-1] = cc;
    kci->currentposition++;
  }
  return kci;
}
Beispiel #4
0
/*
 * Consistency check over all positions of <encseq> in the given <readmode>.
 *
 * For every position the character obtained by random access
 * (getencodedchar) is compared with the character obtained by a sequential
 * read (sequentialgetencodedchar). If <filenametab> is not NULL and
 * <readmode> is Forwardmode, the characters are additionally compared with
 * those delivered by a sequence buffer opened on <filenametab>; in that
 * case the scan ends when the sequence buffer is exhausted, otherwise it
 * ends at the total length of <encseq>. Finally the number of scanned
 * positions must equal the total length.
 *
 * Returns 0 if all checks pass and -1 otherwise; the checks performed here
 * report their failure via gt_error_set() on <err>.
 */
static int testfullscan(const GtStrArray *filenametab,
                        const Encodedsequence *encseq,
                        Readmode readmode,
                        GtError *err)
{
  Seqpos pos = 0, totallength;
  GtUchar ccscan = 0, ccra, ccsr;
  GtSequenceBuffer *fb = NULL;
  int retval;
  bool haserr = false;
  /* stays NULL if setup fails, so the cleanup below never sees an
     indeterminate pointer */
  Encodedsequencescanstate *esr = NULL;
  unsigned long long fullscanpbar = 0;

  gt_error_check(err);
  totallength = getencseqtotallength(encseq);
  if (filenametab != NULL)
  {
    fb = gt_sequence_buffer_new_guess_type((GtStrArray*) filenametab, err);
    if (!fb)
    {
      haserr = true;
    } else
    {
      gt_sequence_buffer_set_symbolmap(fb, getencseqAlphabetsymbolmap(encseq));
    }
  }
  if (!haserr)
  {
    /* the progress bar is started only after setup succeeded, so that
       every start is matched by the stop after the loop */
    gt_progressbar_start(&fullscanpbar,(unsigned long long) totallength);
    esr = newEncodedsequencescanstate();
    initEncodedsequencescanstate(esr,encseq,readmode,0);
    for (pos=0; /* Nothing */; pos++)
    {
      if (filenametab != NULL && readmode == Forwardmode)
      {
        /* the sequence buffer determines where the scan ends */
        retval = gt_sequence_buffer_next(fb,&ccscan,err);
        if (retval < 0)
        {
          haserr = true;
          break;
        }
        if (retval == 0)
        {
          break;
        }
      } else
      {
        if (pos >= totallength)
        {
          break;
        }
      }
      ccra = getencodedchar(encseq,pos,readmode); /* Random access */
      if (filenametab != NULL && readmode == Forwardmode)
      {
        if (ccscan != ccra)
        {
          gt_error_set(err,"access=%s, position=" FormatSeqpos
                            ": scan (readnextchar) = %u != "
                            "%u = random access",
                            encseqaccessname(encseq),
                            pos,
                            (unsigned int) ccscan,
                            (unsigned int) ccra);
          haserr = true;
          break;
        }
      }
      ccsr = sequentialgetencodedchar(encseq,esr,pos,readmode);
      if (ccra != ccsr)
      {
        gt_error_set(err,"access=%s, mode=%s: position=" FormatSeqpos
                          ": random access = %u != %u = sequential read",
                          encseqaccessname(encseq),
                          showreadmode(readmode),
                          pos,
                          (unsigned int) ccra,
                          (unsigned int) ccsr);
        haserr = true;
        break;
      }
      fullscanpbar++;
    }
    gt_progressbar_stop();
  }
  if (!haserr && pos != totallength)
  {
    gt_error_set(err,"sequence length must be " FormatSeqpos " but is "
                       FormatSeqpos,totallength,pos);
    haserr = true;
  }
  if (esr != NULL)
  {
    freeEncodedsequencescanstate(&esr);
  }
  gt_sequence_buffer_delete(fb);
  return haserr ? -1 : 0;
}
/*
 * Advance the sequence-buffer based iterator <si> to the next sequence.
 *
 * Characters are read from the underlying sequence buffer until a SEPARATOR
 * or the end of the input is reached. On success *len is set to the number
 * of characters of the sequence and *desc to the next description taken
 * from the description buffer. If the iterator was set up with sequence
 * output (withsequence), *sequence points to the '\0'-terminated sequence
 * held in the iterator's internal buffer; that buffer is reused by the next
 * call.
 *
 * Returns 1 if a sequence was delivered, 0 if there is no further sequence,
 * and -1 on error (read failure, or an empty sequence in front of a
 * SEPARATOR while sequence output is enabled).
 */
static int gt_seq_iterator_sequence_buffer_next(GtSeqIterator *si,
                                               const GtUchar **sequence,
                                               unsigned long *len,
                                               char **desc,
                                               GtError *err)
{
  GtSeqIteratorSequenceBuffer *seqit;
  GtUchar charcode;
  int retval;
  bool haserr = false, foundseq = false;
  gt_assert(si);
  gt_assert(len && desc);

  seqit = gt_seq_iterator_sequence_buffer_cast(si);
  gt_assert((sequence && seqit->withsequence) || !seqit->withsequence);

  if (seqit->exhausted)
  {
    return 0;
  }
  while (true)
  {
    retval = gt_sequence_buffer_next(seqit->fb,&charcode,err);
    if (retval < 0)
    {
      haserr = true;
      break;
    }
    if (retval == 0)
    {
      seqit->exhausted = true;
      break;
    }
    if (seqit->currentread < seqit->maxread)
    {
      seqit->currentread++;
    }
    if (charcode == (GtUchar) SEPARATOR)
    {
      if (seqit->sequencebuffer.nextfreeGtUchar == 0 && seqit->withsequence)
      {
        gt_error_set(err,"sequence %llu is empty", seqit->unitnum);
        haserr = true;
        break;
      }
      *desc = (char*) gt_desc_buffer_get_next(seqit->descptr);
      *len = seqit->sequencebuffer.nextfreeGtUchar;
      if (seqit->withsequence)
      {
        /* Make sure the outgoing sequence is '\0' terminated. The
           terminator is appended through GT_STOREINARRAY and the fill
           count is then taken back by one, so that a slot is reserved for
           it instead of writing at index nextfreeGtUchar unchecked.
           NOTE(review): assumes GT_STOREINARRAY enlarges a full array
           before appending - confirm against its definition. */
        GT_STOREINARRAY(&seqit->sequencebuffer, GtUchar,
                     MAX(1024UL, seqit->sequencebuffer.nextfreeGtUchar / 2),
                     (GtUchar) '\0');
        seqit->sequencebuffer.nextfreeGtUchar--;
        *sequence = seqit->sequencebuffer.spaceGtUchar;
      }
      seqit->sequencebuffer.nextfreeGtUchar = 0;
      foundseq = true;
      seqit->unitnum++;
      break;
    }
    if (seqit->withsequence)
    {
      /* grow by half of the current fill, but by at least 1024 elements */
      GT_STOREINARRAY(&seqit->sequencebuffer, GtUchar,
                   MAX(1024UL, seqit->sequencebuffer.nextfreeGtUchar / 2),
                   charcode);
    } else
    {
      /* only the length is of interest, nothing is stored */
      seqit->sequencebuffer.nextfreeGtUchar++;
    }
  }
  /* input ended without a trailing SEPARATOR: deliver what was collected */
  if (!haserr && seqit->sequencebuffer.nextfreeGtUchar > 0)
  {
    *desc = (char*) gt_desc_buffer_get_next(seqit->descptr);
    *len = seqit->sequencebuffer.nextfreeGtUchar;
    if (seqit->withsequence)
    {
      /* '\0' terminate with a reserved slot, as in the SEPARATOR case */
      GT_STOREINARRAY(&seqit->sequencebuffer, GtUchar,
                   MAX(1024UL, seqit->sequencebuffer.nextfreeGtUchar / 2),
                   (GtUchar) '\0');
      seqit->sequencebuffer.nextfreeGtUchar--;
      *sequence = seqit->sequencebuffer.spaceGtUchar;
    }
    foundseq = true;
    seqit->sequencebuffer.nextfreeGtUchar = 0;
  }
  if (haserr)
  {
    return -1;
  }
  if (foundseq)
  {
    return 1;
  }
  return 0;
}