예제 #1
0
/*
 * Allocate and initialise an Enumcodeatposition for <encseq>.
 *
 * encseq       encoded sequence the enumerator refers to (stored, not copied)
 * readmode     read direction; stored and used to derive the move direction
 * prefixlength prefix length handed to the multimappower/filltable helpers
 * numofchars   alphabet size handed to the same helpers
 *
 * Returns a newly allocated object. Its special range iterator is NULL when
 * the encoded sequence reports no special ranges.
 */
Enumcodeatposition *gt_Enumcodeatposition_new(const GtEncseq *encseq,
                                              GtReadmode readmode,
                                              unsigned int prefixlength,
                                              unsigned int numofchars)
{
  Enumcodeatposition *enumerator = gt_malloc(sizeof *enumerator);

  enumerator->encseq = encseq;
  enumerator->readmode = readmode;
  enumerator->multimappower = gt_initmultimappower(numofchars,prefixlength);
  enumerator->filltable = gt_initfilltable(numofchars,prefixlength);
  enumerator->prefixlength = prefixlength;
  /* the iterator moves forward exactly when the readmode is a reverse one */
  enumerator->moveforward = GT_ISDIRREVERSE(readmode) ? true : false;
  enumerator->totallength = gt_encseq_total_length(encseq);
  /* the initial (empty) previous range sits at position 0 when moving
     forward and at the total length otherwise */
  enumerator->previousrange.end = enumerator->moveforward
                                    ? 0
                                    : enumerator->totallength;
  enumerator->previousrange.start = enumerator->previousrange.end;
  enumerator->exhausted = false;
  enumerator->sri = gt_encseq_has_specialranges(encseq)
                      ? gt_specialrangeiterator_new(encseq,
                                                    enumerator->moveforward)
                      : NULL;
  return enumerator;
}
예제 #2
0
/*
 * Build a table with one entry per special range of <encseq>.
 *
 * Each entry stores the bounds of the range as delivered by the special range
 * iterator, together with the accumulated length of all ranges delivered
 * before it (its rank). The iterator runs forward unless <readmode> is a
 * reverse readmode.
 *
 * Returns the newly allocated table, or NULL if <encseq> has no special
 * ranges.
 */
Rankedbounds *gt_fillrankbounds(const GtEncseq *encseq,
                             GtReadmode readmode)
{
  GtSpecialrangeiterator *iter;
  GtRange specialrange;
  GtUword ranksofar = 0, numofranges, filled = 0;
  Rankedbounds *table;
  bool forward;

  if (!gt_encseq_has_specialranges(encseq))
  {
    return NULL;
  }
  numofranges = gt_encseq_realspecialranges(encseq);
  table = gt_malloc(sizeof (*table) * numofranges);
  forward = GT_ISDIRREVERSE(readmode) ? false : true;
  iter = gt_specialrangeiterator_new(encseq,forward);
  while (gt_specialrangeiterator_next(iter,&specialrange))
  {
    table[filled].lowerbound = specialrange.start;
    table[filled].upperbound = specialrange.end;
    table[filled].rank = ranksofar;
    /* the next entry's rank grows by the width of this range */
    ranksofar += table[filled].upperbound - table[filled].lowerbound;
    filled++;
  }
  /* the iterator must deliver exactly the announced number of ranges */
  gt_assert(table + filled == table + numofranges);
  gt_specialrangeiterator_delete(iter);
  return table;
}
예제 #3
0
/*
 * Create a k-mer code iterator over <encseq>, starting at <startpos> and
 * reading in direction <readmode>.
 *
 * encseq    encoded sequence to iterate over (stored, not copied)
 * readmode  read direction; reverse readmodes require startpos == 0
 * kmersize  length of the k-mers
 * startpos  first position to read; must be smaller than the total length
 *
 * If fewer than <kmersize> positions remain from <startpos>, the iterator is
 * created in the exhausted state without a reader and without a kmer stream.
 * Otherwise a reader and a kmer stream are created and the first <kmersize>
 * characters are pushed into the stream's window.
 *
 * Every field assigned by this function is set on both paths, so that no
 * member is left with an indeterminate value after the allocation
 * (gt_malloc() is presumably a plain malloc wrapper that does not zero the
 * memory - confirm).
 */
/*@notnull@*/ GtKmercodeiterator *gt_kmercodeiterator_encseq_new(
                                            const GtEncseq *encseq,
                                            GtReadmode readmode,
                                            unsigned int kmersize,
                                            unsigned long startpos)
{
  GtKmercodeiterator *kmercodeiterator;
  unsigned int numofchars;
  GtUchar charcode;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator = gt_malloc(sizeof (*kmercodeiterator));
  kmercodeiterator->totallength = gt_encseq_total_length(encseq);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);

  /* state shared by the exhausted and the regular case */
  kmercodeiterator->fb = NULL;
  kmercodeiterator->encseq = encseq;
  kmercodeiterator->readmode = readmode;
  kmercodeiterator->hasprocessedfirst = false;
  kmercodeiterator->currentposition = startpos;

  if (kmercodeiterator->totallength - startpos < (unsigned long) kmersize)
  {
    /* not enough characters left for a single k-mer */
    kmercodeiterator->inputexhausted = true;
    kmercodeiterator->esr = NULL;
    kmercodeiterator->spwp = NULL;
  } else
  {
    kmercodeiterator->inputexhausted = false;
    kmercodeiterator->esr = gt_encseq_create_reader_with_readmode(encseq,
                                                                  readmode,
                                                                  startpos);
    numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
    kmercodeiterator->spwp = kmerstream_new(numofchars,kmersize);
    /* fill the window with the first kmersize characters */
    for (kmercodeiterator->currentposition = startpos;
         kmercodeiterator->currentposition < startpos+(unsigned long) kmersize;
         kmercodeiterator->currentposition++)
    {
      charcode = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);
      kmercodeiterator->spwp->windowwidth++;
      updatespecialpositions(kmercodeiterator->spwp,charcode,false,0);
      kmercodeiterator->spwp->cyclicwindow[kmercodeiterator->
                                           spwp->windowwidth-1] = charcode;
    }
  }
  return kmercodeiterator;
}
예제 #4
0
/*
 * Re-position an existing k-mer code iterator at <startpos> with read
 * direction <readmode>, reusing its encoded sequence and k-mer size.
 *
 * Reverse readmodes require startpos == 0, and startpos must be smaller than
 * the total length of the sequence.
 *
 * If fewer than kmersize positions remain, the iterator is marked exhausted
 * and its reader and kmer stream are deleted and set to NULL. Otherwise the
 * reader is re-initialised, the kmer stream is reset and the first kmersize
 * characters are pushed into the stream's window.
 *
 * NOTE(review): the k-mer size is read from kmercodeiterator->spwp on entry,
 * but the exhausted branch below sets spwp to NULL. Calling this function
 * again on such an iterator looks like a NULL dereference - confirm that
 * callers never do so.
 */
void gt_kmercodeiterator_reset(GtKmercodeiterator *kmercodeiterator,
                               GtReadmode readmode,
                               GtUword startpos)
{
  const GtEncseq *sequence = kmercodeiterator->encseq;
  const GtUword windowsize = (GtUword) kmercodeiterator->spwp->kmersize;
  GtUword endpos;
  GtUchar encoded;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator->totallength = gt_encseq_total_length(sequence);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);
  kmercodeiterator->fb = NULL;

  if (kmercodeiterator->totallength - startpos >= windowsize)
  {
    /* regular case: at least one complete k-mer is available */
    kmercodeiterator->inputexhausted = false;
    kmercodeiterator->readmode = readmode;
    gt_encseq_reader_reinit_with_readmode(kmercodeiterator->esr,
                                          sequence,
                                          readmode,
                                          startpos);
    kmerstream_reset(kmercodeiterator->spwp);
    kmercodeiterator->hasprocessedfirst = false;
    endpos = startpos + windowsize;
    kmercodeiterator->currentposition = startpos;
    while (kmercodeiterator->currentposition < endpos)
    {
      encoded = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);
      kmercodeiterator->spwp->windowwidth++;
      kmerstream_updatespecialpositions(kmercodeiterator->spwp,encoded,
                                        false,0);
      kmercodeiterator->spwp->cyclicwindow[kmercodeiterator->
                                           spwp->windowwidth-1] = encoded;
      kmercodeiterator->currentposition++;
    }
    return;
  }

  /* too few characters left: release reader and stream */
  kmercodeiterator->inputexhausted = true;
  gt_encseq_reader_delete(kmercodeiterator->esr);
  kmercodeiterator->esr = NULL;
  kmerstream_delete(kmercodeiterator->spwp);
  kmercodeiterator->spwp = NULL;
}
예제 #5
0
/*
 * Build a list with one entry per special range of <encseq>, sorted with
 * compareSpecialrank.
 *
 * For each range delivered by the special range iterator (running forward
 * unless <readmode> is a reverse readmode) an entry is written whose
 * specialrank is the accumulated length of all ranges so far minus one, and
 * whose key is inversesuftab[range.end].
 *
 * Returns the newly allocated, sorted list, or NULL if <encseq> has no
 * special ranges.
 */
Specialrank *gt_fillspecialranklist(const GtEncseq *encseq,
                                 GtReadmode readmode,
                                 const GtUword *inversesuftab)
{
  GtSpecialrangeiterator *iter;
  GtRange specialrange;
  GtUword numofranges, accumulated = 0;
  GT_UNUSED GtUword totallength;
  Specialrank *ranklist, *slot;
  bool forward;

  if (!gt_encseq_has_specialranges(encseq))
  {
    return NULL;
  }
  /* totallength is only needed for the assertion inside the loop */
  totallength = gt_encseq_total_length(encseq);
  numofranges = gt_encseq_realspecialranges(encseq);
  ranklist = gt_malloc(sizeof (*ranklist) * numofranges);
  forward = GT_ISDIRREVERSE(readmode) ? false : true;
  iter = gt_specialrangeiterator_new(encseq,forward);
  for (slot = ranklist;
       gt_specialrangeiterator_next(iter,&specialrange);
       slot++)
  {
    gt_assert(slot < ranklist + numofranges);
    gt_assert(specialrange.end<=totallength);
    accumulated += specialrange.end - specialrange.start;
    slot->specialrank = accumulated - 1;
    slot->key = inversesuftab[specialrange.end];
  }
  /* the iterator must deliver exactly the announced number of ranges */
  gt_assert(slot == ranklist + numofranges);
  gt_specialrangeiterator_delete(iter);
  qsort(ranklist,(size_t) numofranges,
        sizeof (Specialrank),compareSpecialrank);
  return ranklist;
}
예제 #6
0
/*
 * Write the content of <encseq> to stdout, in the mode selected by
 * args->mode.
 *
 * "fasta":  writes one FASTA record per sequence. A single sequence
 *           (args->seq), an inclusive range of sequences (args->seqrng) or
 *           all sequences are written. For reverse readmodes the i-th record
 *           is taken from sequence (number of sequences - 1 - i) and its
 *           start position is mirrored at the total length.
 * "concat": writes the positions args->rng.start..args->rng.end (inclusive),
 *           or the whole encoded sequence if no range is given, on one line;
 *           separator symbols are replaced by the first character of
 *           args->sepchar.
 *
 * filename is only used in the warning about missing description support.
 * When descriptions are unavailable, "sequence <i>" is used as header.
 *
 * Returns 0 on success and -1 (with <err> set) if a requested sequence
 * number or position lies outside the encoded sequence.
 */
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword i, j, sfrom, sto;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;
  gt_assert(encseq);

  if (!(has_desc = gt_encseq_has_description_support(encseq)))
    gt_warning("Missing description support for file %s", filename);

  if (strcmp(gt_str_get(args->mode), "fasta") == 0) {
    const GtUword numofseqs = gt_encseq_num_of_sequences(encseq);

    /* specify a single sequence to extract */
    if (args->seq != GT_UNDEF_UWORD) {
      if (args->seq >= numofseqs) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq,
                     numofseqs);
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    } else if (args->seqrng.start != GT_UNDEF_UWORD
                 && args->seqrng.end != GT_UNDEF_UWORD) {
      /* specify a sequence range to extract */
      if (args->seqrng.start >= numofseqs
            || args->seqrng.end >= numofseqs) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start,
                     args->seqrng.end,
                     numofseqs);
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    } else {
      /* extract all sequences */
      sfrom = 0;
      sto = numofseqs;
    }
    for (i = sfrom; i < sto; i++) {
      GtUword desclen, startpos, len, seqnum;
      char buf[BUFSIZ];
      const char *desc = NULL;
      /* XXX: maybe make this distinction in the functions via readmode? */
      if (!GT_ISDIRREVERSE(args->rm)) {
        seqnum = i;
        startpos = gt_encseq_seqstartpos(encseq, seqnum);
        len = gt_encseq_seqlength(encseq, seqnum);
      } else {
        /* reverse direction: take the mirrored sequence and mirror its
           start position at the total length */
        seqnum = numofseqs - 1 - i;
        len = gt_encseq_seqlength(encseq, seqnum);
        startpos = gt_encseq_total_length(encseq)
                     - (gt_encseq_seqstartpos(encseq, seqnum) + len);
      }
      if (has_desc) {
        desc = gt_encseq_description(encseq, &desclen, seqnum);
      } else {
        /* the fallback header is numbered by output order, not by seqnum */
        (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
        desclen = strlen(buf);
        desc = buf;
      }
      gt_assert(desc);
      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);
      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_get_decoded_char(encseq,
                                                startpos + j,
                                                args->rm),
                     stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, startpos);
        for (j = 0; j < len; j++) {
           gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(gt_str_get(args->mode), "concat") == 0) {
    /* NOTE(review): for a total length of 0 this subtraction would wrap
       around - presumably an encoded sequence is never empty; confirm. */
    GtUword from = 0,
                  to = gt_encseq_total_length(encseq) - 1;
    if (args->rng.start != GT_UNDEF_UWORD && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      }
      if (!had_err) {
        from = args->rng.start;
        to = args->rng.end;
      }
    }
    if (!had_err) {
      if (args->singlechars) {
        /* random access to every single position */
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      } else {
        /* sequential access via a reader */
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
예제 #7
0
/*
 * Runner of the bit extraction tool: loads the encoded sequence named by
 * argv[parsed_args], optionally mirrors it, and then prints, depending on the
 * tool arguments,
 *  - the two bit encoding extracted at arguments->bitpos,
 *  - the next two bit encoding stop position from arguments->stoppos,
 *  - all special ranges as "start:end" lines.
 *
 * The readmode string of the arguments is parsed and validated; an
 * unparsable readmode is reported through <err> instead of being used
 * unchecked.
 *
 * Returns 0 on success and -1 if loading, mirroring, readmode parsing or a
 * position check failed (with <err> set).
 */
static int gt_encseq_bitextract_runner(GT_UNUSED int argc, const char **argv,
                                       GT_UNUSED int parsed_args,
                                       void *tool_arguments,
                                       GT_UNUSED GtError *err)
{
  GtEncseqBitextractArguments *arguments = tool_arguments;
  GtEncseqLoader *el;
  GtEncseq *encseq;
  int had_err = 0;
  bool fwd = true;
  char buffer[BUFSIZ];
  GtEndofTwobitencoding etbe;
  GtEncseqReader *esr;
  GtSpecialrangeiterator *sri;
  GtRange srng;
  /* placeholder value; rm is only read after a successful parse below */
  GtReadmode rm = (GtReadmode) 0;

  gt_error_check(err);
  gt_assert(arguments);

  el = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(el, argv[parsed_args], err);
  if (!encseq)
    had_err = -1;

  if (!had_err && arguments->mirror) {
    had_err = gt_encseq_mirror(encseq, err);
  }

  if (!had_err) {
    /* gt_readmode_parse() is expected to return a negative value and set
       err for an unknown readmode string */
    int parsed_rm = gt_readmode_parse(gt_str_get(arguments->readmode), err);
    if (parsed_rm < 0) {
      had_err = -1;
    } else {
      rm = (GtReadmode) parsed_rm;
      fwd = GT_ISDIRREVERSE(rm) ? false : true;
    }
  }

  if (!had_err && arguments->bitpos != GT_UNDEF_ULONG) {
    if (arguments->bitpos >= gt_encseq_total_length(encseq)) {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->bitpos, gt_encseq_total_length(encseq));
      had_err = -1;
    }

    if (!had_err) {
      unsigned long ret;
      esr = gt_encseq_create_reader_with_readmode(encseq, rm,
                                                  arguments->bitpos);
      ret = gt_encseq_extract2bitencwithtwobitencodingstoppos(&etbe, esr,
                                                        encseq,
                                                        rm, arguments->bitpos);
      gt_bitsequence_tostring(buffer, etbe.tbe);
      printf("Twobitencoding   %s\n"
             "unitsnotspecial  %u\n"
             "position         %lu\n"
             "returnvalue      %lu\n",
             buffer,
             etbe.unitsnotspecial,
             arguments->bitpos,
             ret);
      gt_encseq_reader_delete(esr);
    }
  }

  if (!had_err && arguments->stoppos != GT_UNDEF_ULONG) {
    if (arguments->stoppos >= gt_encseq_total_length(encseq)) {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->stoppos, gt_encseq_total_length(encseq));
      had_err = -1;
    }
    if (!had_err) {
      /* the reader is deliberately created at position 0 and then
         re-initialised at stoppos, so that the reinit path is exercised */
      esr = gt_encseq_create_reader_with_readmode(encseq, rm, 0);
      /* check stoppos stuff */
      gt_encseq_reader_reinit_with_readmode(esr, encseq, rm,
                                            arguments->stoppos);
      printf("%lu: %lu\n", arguments->stoppos,
                           gt_getnexttwobitencodingstoppos(fwd, esr));
      gt_encseq_reader_delete(esr);
    }
  }

  if (!had_err && arguments->specialranges) {
    /* check specialrangeiterator stuff */
    if (gt_encseq_has_specialranges(encseq)) {
      sri = gt_specialrangeiterator_new(encseq, fwd);
      while (gt_specialrangeiterator_next(sri, &srng)) {
        printf("%lu:%lu\n", srng.start, srng.end);
      }
      gt_specialrangeiterator_delete(sri);
    }
  }

  /* encseq may be NULL here if loading failed; the original code relies on
     the delete function accepting that */
  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(el);
  return had_err;
}
예제 #8
0
/*
 * Walk over all characters in the order given by bucketspec2->order and, for
 * each such source character, derive the sorting of the subbuckets
 * [idx][source] via forwardderive() and backwardderive().
 *
 * bucketspec2      bucket specification; although the pointer is const, the
 *                  'sorted' flags reachable through its subbuckettab and
 *                  superbuckettab members are updated
 * suffixsortspace  passed through to the derive helpers
 * logger           receives one line per subbucket counted as hard work and
 *                  a final summary line
 *
 * The total width of all subbuckets counted as hard work is accumulated and
 * logged, together with its ratio to the total length of the encoded
 * sequence.
 */
void gt_copysort_derivesorting(const GtBucketspec2 *bucketspec2,
                               GtSuffixsortspace *suffixsortspace,
                               GtLogger *logger)
{
  GtUword hardwork = 0,
                *targetoffset;
  unsigned int idx, idxsource, source, second;

#ifdef WITHSUFFIXES
  /* Debug output of all suffixes.
     NOTE(review): this block uses 'readmode' and 'encseq', which are not
     declared in this function as shown - it presumably does not compile when
     WITHSUFFIXES is defined; confirm. */
  {
    GtUword idx;
    for (idx = 0; idx < bucketspec2->partwidth; idx++)
    {
      gt_encseq_showatstartpos(
                            stdout,
                            GT_ISDIRREVERSE(readmode) ? false : true,
                            GT_ISDIRCOMPLEMENT(readmode) ? true : false,
                            encseq,
                            gt_suffixsortspace_getdirect(suffixsortspace,idx));
    }
  }
#endif
  /* one target offset per character, refilled before each derive call */
  targetoffset = gt_malloc(sizeof (*targetoffset) * bucketspec2->numofchars);
  for (idxsource = 0; idxsource<bucketspec2->numofchars; idxsource++)
  {
    source = bucketspec2->order[idxsource];
    /* account for all subbuckets [source][second] that are not yet marked
       as sorted (the diagonal subbucket is skipped) and mark them sorted */
    for (second = 0; second < bucketspec2->numofchars; second++)
    {
      if (!bucketspec2->subbuckettab[source][second].sorted && source != second)
      {
        gt_assert(bucketspec2->subbuckettab[source][second].hardworktodo);
        gt_logger_log(logger,"hard work for %u %u",source,second);
        hardwork += getendidx(bucketspec2,source,second) -
                    getstartidx(bucketspec2,source,second);
        bucketspec2->subbuckettab[source][second].sorted = true;
      } else
      {
        gt_assert(!bucketspec2->subbuckettab[source][second].hardworktodo);
      }
    }
    /* forward pass: only if there is something in front of the diagonal
       subbucket [source][source] */
    if (getstartidx(bucketspec2,source,0) <
        getstartidx(bucketspec2,source,source))
    {
      for (idx = 0; idx < bucketspec2->numofchars; idx++)
      {
        targetoffset[idx] = getstartidx(bucketspec2,idx,source);
      }
      forwardderive(bucketspec2,
                    suffixsortspace,
                    targetoffset,
                    source,
                    getstartidx(bucketspec2,source,0));
    }
    /* backward pass: only if there is something behind the diagonal
       subbucket [source][source] */
    if (getendidx(bucketspec2,source,source) <
        getendidx(bucketspec2,source,bucketspec2->numofchars))
    {
      for (idx = 0; idx < bucketspec2->numofchars; idx++)
      {
        /* do not need to assert that getendidx(idx,source)  > 0, as later the
           value stored in targetoffset is incremented */
        targetoffset[idx] = getendidx(bucketspec2,idx,source) - 1;
      }
      gt_assert(getendidx(bucketspec2,source,bucketspec2->numofchars) > 0);
      backwardderive(bucketspec2,
                     suffixsortspace,
                     targetoffset,
                     source,
                     getendidx(bucketspec2,source,bucketspec2->numofchars) - 1);
    }
    /* every subbucket [idx][source] and the superbucket of source are now
       marked as sorted */
    for (idx = 0; idx < bucketspec2->numofchars; idx++)
    {
      bucketspec2->subbuckettab[idx][source].sorted = true;
    }
    bucketspec2->superbuckettab[source].sorted = true;
  }
  gt_free(targetoffset);
  /* summary: absolute hard work and its ratio to the total sequence length */
  gt_logger_log(logger,"hardwork = "GT_WU" (%.2f)",
                hardwork,
                (double) hardwork/gt_encseq_total_length(bucketspec2->encseq));
}
예제 #9
0
/*
 * Count, per alphabet character, how often it occurs directly next to a
 * special range (or next to the sequence boundary) on the side that is
 * inspected for the given <readmode>.
 *
 * For non-reverse readmodes the character at range.start-1 of every special
 * range is counted, plus the last character of the sequence if the sequence
 * has no special suffix. For reverse readmodes the character at range.end is
 * counted (read with a converted readmode), plus the first character of the
 * sequence if the sequence has no special prefix. Special characters
 * themselves are never counted.
 *
 * Returns a newly allocated array of <numofchars> counters.
 */
static GtUword *leftcontextofspecialchardist(unsigned int numofchars,
                                                   const GtEncseq *encseq,
                                                   GtReadmode readmode)
{
  const GtUword seqlen = gt_encseq_total_length(encseq);
  /* readmode used for character access in the reverse case */
  const GtReadmode accessmode = (readmode == GT_READMODE_REVERSE)
                                  ? GT_READMODE_FORWARD
                                  : GT_READMODE_COMPL;
  const bool reversed = GT_ISDIRREVERSE(readmode) ? true : false;
  GtUword *counts;
  GtUchar symbol;
  unsigned int charidx;

  counts = gt_malloc(sizeof (*counts) * numofchars);
  for (charidx = 0; charidx < numofchars; charidx++)
  {
    counts[charidx] = 0;
  }

  if (gt_encseq_has_specialranges(encseq))
  {
    GtRange specialrange;
    GtSpecialrangeiterator *iter = gt_specialrangeiterator_new(encseq,true);

    while (gt_specialrangeiterator_next(iter,&specialrange))
    {
      if (reversed)
      {
        if (specialrange.end >= seqlen)
        {
          continue; /* nothing behind this range */
        }
        symbol = gt_encseq_get_encoded_char(encseq,specialrange.end,
                                            accessmode);
      } else
      {
        if (specialrange.start == 0)
        {
          continue; /* nothing in front of this range */
        }
        symbol = gt_encseq_get_encoded_char(encseq,specialrange.start-1,
                                            readmode);
      }
      if (ISNOTSPECIAL(symbol))
      {
        counts[symbol]++;
      }
    }
    gt_specialrangeiterator_delete(iter);
  }

  /* finally account for the character at the sequence boundary, unless the
     sequence begins (reverse case) or ends (other case) with specials */
  if (reversed)
  {
    if (gt_encseq_lengthofspecialprefix(encseq) == 0)
    {
      symbol = gt_encseq_get_encoded_char(encseq,0,accessmode);
      gt_assert(ISNOTSPECIAL(symbol));
      counts[symbol]++;
    }
  } else if (gt_encseq_lengthofspecialsuffix(encseq) == 0)
  {
    symbol = gt_encseq_get_encoded_char(encseq,seqlen-1,readmode);
    gt_assert(ISNOTSPECIAL(symbol));
    counts[symbol]++;
  }
  return counts;
}