Пример #1
0
/* Create a sequential suffix array reader for the index <indexname>.
   The tables selected by <demand> are loaded with streamsuffixarray() if
   <scanfile> is true and with gt_mapsuffixarray() otherwise; <logger> and
   <err> are handed through to the loader. If the loader reports a non-zero
   status, everything allocated here is released and NULL is returned. */
Sequentialsuffixarrayreader *gt_newSequentialsuffixarrayreaderfromfile(
                                        const char *indexname,
                                        unsigned int demand,
                                        bool scanfile,
                                        GtLogger *logger,
                                        GtError *err)
{
  Sequentialsuffixarrayreader *ssar = gt_malloc(sizeof *ssar);
  bool loadfailed;

  ssar->suffixarray = gt_malloc(sizeof *ssar->suffixarray);
  if (scanfile)
  {
    loadfailed = streamsuffixarray(ssar->suffixarray,
                                   demand,
                                   indexname,
                                   logger,
                                   err) != 0;
  } else
  {
    loadfailed = gt_mapsuffixarray(ssar->suffixarray,
                                   demand,
                                   indexname,
                                   logger,
                                   err) != 0;
  }
  if (loadfailed)
  {
    gt_free(ssar->suffixarray);
    gt_free(ssar);
    return NULL;
  }

  /* reading state: nothing consumed yet */
  ssar->scanfile = scanfile;
  ssar->suftab = NULL;
  ssar->extrainfo = NULL;
  ssar->nextsuftabindex = 0;
  ssar->nextlcptabindex = 1UL;
  ssar->largelcpindex = 0;

  /* values derived from the loaded suffix array */
  gt_assert(ssar->suffixarray != NULL);
  ssar->encseq = ssar->suffixarray->encseq;
  ssar->readmode = ssar->suffixarray->readmode;
  ssar->numberofsuffixes = gt_encseq_total_length(ssar->encseq) + 1;
  ssar->nonspecials = gt_encseq_total_length(ssar->encseq) -
                      gt_encseq_specialcharacters(ssar->encseq);
  return ssar;
}
Пример #2
0
/* Collect the kmer codes of <encseq> with collectkmercode() into a
   temporary array and compare that array against <codeliststream> using
   comparecodelists(). Returns 0 if the comparison reports 0, and -1
   otherwise. The temporary array is released in both cases. */
static int verifycodelists(const GtEncseq *encseq,
                           unsigned int kmersize,
                           unsigned int numofchars,
                           const GtArrayGtCodetype *codeliststream,
                           GtError *err)
{
  GtArrayGtCodetype fromstring;
  const GtUchar *alphachars;
  GtUword seqlength;
  int status;

  gt_error_check(err);
  seqlength = gt_encseq_total_length(encseq);
  alphachars = gt_alphabet_characters(gt_encseq_alphabet(encseq));
  GT_INITARRAY(&fromstring,GtCodetype);
  collectkmercode(&fromstring, encseq, kmersize, numofchars, seqlength);
  status = comparecodelists(codeliststream,
                            &fromstring,
                            kmersize,
                            numofchars,
                            (const char *) alphachars,
                            err) != 0 ? -1 : 0;
  GT_FREEARRAY(&fromstring,GtCodetype);
  return status;
}
Пример #3
0
static void onlinespacedseedsearch(const GtEncseq *encseq,
                                   const Spacedseed *spse,
                                   const GtUchar *qptr,qp)
{
  Windowiterator *wit;
  const GtUchar *buffer;
  GtUword currentpos, totallength;
  GtUword firstpos, windowschecked = 0;
  Bitsequence bitmask;
  bool matched;

  totallength = gt_encseq_total_length(encseq);
  wit = gt_windowiterator_new(encseq,spse->seedwidth,0,totallength);
  while (true)
  {
    buffer = gt_windowiterator_next(&currentpos,&firstpos,wit);
    if (buffer != NULL)
    {
      bitmask = FIRSTBIT;
      matched = true;
      for (idx=0; idx < spse->seedwidth; idx++)
      {
        if ((spse->seedbitvector & bitmask) && qptr[idx] != buffer[idx])
        {
          matched = false;
          break;
        }
        bitmask >>= 1;
      }
      if (matched)
      {
      }
    } else
    {
      break;
Пример #4
0
/* Lua binding: takes the encoded sequence at stack index 1 (validated by
   check_encseq()) and pushes its total length as a Lua number.
   Returns the number of results pushed (1). */
static int encseq_lua_total_length(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);

  lua_pushnumber(L, gt_encseq_total_length(*encseq_ud));
  return 1;
}
Пример #5
0
/* Sum up, over every position of <query> (length <querylen>), the value
   returned by gt_esa2shulengthatposition() for the query suffix starting at
   that position. Positions holding a special character contribute 0 and
   are not looked up. */
static unsigned long gt_esa2shulengthquery(const Suffixarray *suffixarray,
                                           const GtUchar *query,
                                           unsigned long querylen)
{
  const unsigned long totallength
    = gt_encseq_total_length(suffixarray->encseq);
  const GtUchar *suffixstart = query;
  unsigned long left = querylen, sum = 0;

  while (left > 0)
  {
    if (!ISSPECIAL(*suffixstart))
    {
      /* search the whole suffix array interval [0,totallength] */
      sum += gt_esa2shulengthatposition(suffixarray,
                                        totallength,
                                        0,
                                        0,
                                        totallength,
                                        suffixstart,
                                        query+querylen);
    }
    suffixstart++;
    left--;
  }
  return sum;
}
Пример #6
0
/* Check the order of the <numberofsuffixes> entries of <suffixsortspace>
   (read with offset <subbucketleft>): each pair of neighbouring entries is
   compared with gt_encseq_check_comparetwosuffixes(), provided the second
   entry of the pair is smaller than the total length of <encseq>. If a
   comparison yields a value > 0, the failure is reported via
   showcomparisonfailure() (using <filename> and <line>) and the program
   exits with GT_EXIT_PROGRAMMING_ERROR. */
void gt_checksortedsuffixes(const char *filename,
                            int line,
                            const GtEncseq *encseq,
                            GtReadmode readmode,
                            const GtSuffixsortspace *suffixsortspace,
                            GtUword subbucketleft,
                            GtUword numberofsuffixes,
                            bool specialsareequal,
                            bool specialsareequalatdepth0,
                            GtUword depth)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  GtUword idx, pos1, pos2, maxlcp;
  GtEncseqReader *esr1, *esr2;

  gt_assert(!specialsareequal || specialsareequalatdepth0);
  esr1 = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  esr2 = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  gt_assert(numberofsuffixes > 0);
  pos1 = gt_suffixsortspace_get(suffixsortspace,subbucketleft,0);
  gt_assert(pos1 < totallength);
  /* pos1 always trails pos2 by one entry, also for skipped entries */
  for (idx = 1UL; idx < numberofsuffixes; pos1 = pos2, idx++)
  {
    int cmp;

    pos2 = gt_suffixsortspace_get(suffixsortspace,subbucketleft,idx);
    if (pos2 >= totallength)
    {
      continue;
    }
    cmp = gt_encseq_check_comparetwosuffixes(encseq,
                                             readmode,
                                             &maxlcp,
                                             specialsareequal,
                                             specialsareequalatdepth0,
                                             depth,
                                             pos1,
                                             pos2,
                                             esr1,
                                             esr2);
    if (cmp > 0)
    {
      showcomparisonfailure(filename,
                            line,
                            "checksortedsuffixes",
                            encseq,
                            readmode,
                            suffixsortspace,
                            subbucketleft,
                            depth,
                            idx-1,
                            idx,
                            cmp,
                            maxlcp);
      exit(GT_EXIT_PROGRAMMING_ERROR);
    }
    gt_assert(depth == 0 || maxlcp <= depth);
  }
  gt_encseq_reader_delete(esr1);
  gt_encseq_reader_delete(esr2);
}
Пример #7
0
/* Allocate and initialise an Enumcodeatposition object for <encseq>.
   The moveforward flag is set exactly when GT_ISDIRREVERSE(readmode) holds.
   The previous range starts out empty at position 0 if moveforward is set
   and at the total length otherwise. A special range iterator is created
   only if <encseq> has special ranges. */
Enumcodeatposition *gt_Enumcodeatposition_new(const GtEncseq *encseq,
                                              GtReadmode readmode,
                                              unsigned int prefixlength,
                                              unsigned int numofchars)
{
  Enumcodeatposition *ecp = gt_malloc(sizeof *ecp);

  ecp->encseq = encseq;
  ecp->readmode = readmode;
  ecp->multimappower = gt_initmultimappower(numofchars,prefixlength);
  ecp->filltable = gt_initfilltable(numofchars,prefixlength);
  ecp->prefixlength = prefixlength;
  ecp->totallength = gt_encseq_total_length(encseq);
  ecp->exhausted = false;
  if (GT_ISDIRREVERSE(readmode))
  {
    ecp->moveforward = true;
    ecp->previousrange.end = 0;
    ecp->previousrange.start = 0;
  } else
  {
    ecp->moveforward = false;
    ecp->previousrange.end = ecp->totallength;
    ecp->previousrange.start = ecp->totallength;
  }
  ecp->sri = gt_encseq_has_specialranges(encseq)
               ? gt_specialrangeiterator_new(encseq,ecp->moveforward)
               : NULL;
  return ecp;
}
/* Initialise <seq> for accessing the substring of length <len> of <encseq>
   that begins at <startpos> in direction <readmode>.
   Exactly one of three access paths is selected, tried in this order:
   (1) the exported two-bit encoding, (2) the encseq reader <encseq_r>
   together with <sequence_cache>, (3) the encoded sequence itself.
   <extend_char_access_mode> restricts which of these paths may be chosen.
   <totallength> must equal the total length of <encseq>; it is only used
   for a <readmode> other than GT_READMODE_FORWARD. */
static void sequenceobject_init(Sequenceobject *seq,
                                GtExtendCharAccess extend_char_access_mode,
                                const GtEncseq *encseq,
                                GtReadmode readmode,
                                GtUword startpos,
                                GtUword len,
                                GtEncseqReader *encseq_r,
                                GtAllocatedMemory *sequence_cache,
                                GtUword totallength
                                )
{
  gt_assert(seq != NULL);
  /* start with no access path selected */
  seq->encseq = NULL;
  seq->encseqreader = NULL;
  seq->twobitencoding = NULL;
  seq->cache_ptr = NULL;
  seq->sequence_cache = NULL;
  /* path 1: two-bit encoding, only in mode ANY and only if the encseq has
     a two-bit encoding and contains no wildcards */
  if (extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ANY &&
      gt_encseq_has_twobitencoding(encseq) && gt_encseq_wildcards(encseq) == 0)
  {
    seq->twobitencoding = gt_encseq_twobitencoding_export(encseq);
  }
  /* path 2: reader plus cache, if path 1 was not taken (or the export
     yielded NULL) and the mode is ANY or ENCSEQ_READER */
  if (seq->twobitencoding == NULL &&
      (extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ANY ||
       extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ENCSEQ_READER))
  {
    gt_encseq_reader_reinit_with_readmode(encseq_r, encseq, readmode, startpos);
    seq->encseqreader = encseq_r;
    gt_assert(seq->encseqreader != NULL);
    seq->sequence_cache = sequence_cache;
    gt_assert(sequence_cache != NULL);
    seq->cache_ptr = sequence_cache->space;
    /* the cache starts out empty */
    seq->min_access_pos = GT_UWORD_MAX;
    seq->cache_num_positions = 0;
    seq->cache_offset = 0;
  }
  /* path 3: the encseq itself, if neither of the other paths was set up
     and the mode is ANY or ENCSEQ */
  if (seq->twobitencoding == NULL && seq->encseqreader == NULL &&
      (extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ANY ||
       extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ENCSEQ))
  {
    seq->encseq = encseq;
  }
  seq->substringlength = len;
  if (readmode == GT_READMODE_FORWARD)
  {
    seq->startpos = startpos;
    seq->forward = true;
  } else
  {
    /* only the plain reverse direction is supported besides forward;
       the start position is mirrored at the end of the sequence */
    gt_assert(readmode == GT_READMODE_REVERSE);
    gt_assert(gt_encseq_total_length(encseq) == totallength);
    gt_assert(startpos + 1 <= totallength);
    seq->startpos = totallength - 1 - startpos;
    seq->forward = false;
  }
  /* at least one access path must have been selected */
  gt_assert(seq->twobitencoding != NULL || seq->encseqreader != NULL ||
            seq->encseq != NULL);
}
Пример #9
0
/* Lua binding: for the encoded sequence at stack index 1 and the position
   at stack index 2, push the value of gt_encseq_filenum() for that
   position. Raises a Lua argument error if the position is not smaller
   than the total length. Returns the number of results pushed (1). */
static int encseq_lua_filenum(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);
  GtUword position = luaL_checknumber(L, 2);

  luaL_argcheck(L, position < gt_encseq_total_length(*encseq_ud), 2,
                "cannot exceed total length of encoded sequence");
  lua_pushnumber(L, gt_encseq_filenum(*encseq_ud, position));
  return 1;
}
Пример #10
0
/* Stream over <encseq> in direction <readmode> and call <processkmercode>
   (with <processkmercodeinfo> as first argument) once per window of
   <kmersize> characters. After the sequence is exhausted, <kmersize>
   further windows are reported in which WILDCARD characters are shifted
   in. Nothing is reported if the sequence is shorter than <kmersize>. */
void getencseqkmers(const GtEncseq *encseq,
                    GtReadmode readmode,
                    unsigned int kmersize,
                    void(*processkmercode)(void *,
                                           unsigned long,
                                           const GtKmercode *),
                    void *processkmercodeinfo)
{
  const unsigned long width = (unsigned long) kmersize,
                      totallength = gt_encseq_total_length(encseq);
  unsigned long currentposition;
  unsigned int numofchars, padding;
  Kmerstream *stream;
  GtEncseqReader *reader;
  GtUchar charcode;

  if (totallength < width)
  {
    return;
  }
  numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  stream = kmerstream_new(numofchars,kmersize);
  reader = gt_encseq_create_reader_with_readmode(encseq,readmode,0);

  /* phase 1: fill the first window and report it at position 0 */
  for (currentposition = 0; currentposition < width; currentposition++)
  {
    charcode = gt_encseq_reader_next_encoded_char(reader);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    stream->windowwidth++;
    updatespecialpositions(stream,charcode,false,0);
    stream->cyclicwindow[stream->windowwidth-1] = charcode;
  }
  kmerstream_newcode(&stream->currentkmercode,stream);
  processkmercode(processkmercodeinfo,0,&stream->currentkmercode);

  /* phase 2: slide the window over the remaining characters;
     currentposition continues at <kmersize> */
  while (currentposition < totallength)
  {
    charcode = gt_encseq_reader_next_encoded_char(reader);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    shiftrightwithchar(stream,charcode);
    kmerstream_newcode(&stream->currentkmercode,stream);
    processkmercode(processkmercodeinfo,
                    currentposition + 1 - stream->kmersize,
                    &stream->currentkmercode);
    currentposition++;
  }
  gt_encseq_reader_delete(reader);

  /* phase 3: shift in wildcards; currentposition stays at its final value */
  for (padding = 0; padding < kmersize; padding++)
  {
    shiftrightwithchar(stream,(GtUchar) WILDCARD);
    kmerstream_newcode(&stream->currentkmercode,stream);
    processkmercode(processkmercodeinfo,
                    padding + currentposition + 1 - stream->kmersize,
                    &stream->currentkmercode);
  }
  kmerstream_delete(stream);
}
Пример #11
0
/* Test driver for the merger trie: load the encoded sequence of the index
   <indexname> via streamsuffixarray(), build a trie over it with
   maketrie(), and then either check the trie (<onlyins> is true; only
   active if WITHTRIEIDENT is defined) or delete its elements with
   successivelydeletesmallest(). Returns -1 if the index could not be
   loaded and 0 otherwise. */
int gt_test_trieins(bool onlyins,const char *indexname,GtError *err)
{
  Suffixarray suffixarray;
  bool haserr = false;

  gt_error_check(err);
  if (streamsuffixarray(&suffixarray,
                        SARR_ESQTAB,
                        indexname,
                        NULL,
                        err) != 0)
  {
    haserr = true;
  } else
  {
    const unsigned long totallength
      = gt_encseq_total_length(suffixarray.encseq);
    const GtUchar *characters;
    Mergertrierep trierep;

    /* the trie is built over a single encoded sequence */
    trierep.encseqreadinfo = gt_malloc(sizeof *trierep.encseqreadinfo);
    trierep.encseqreadinfo->encseqptr = suffixarray.encseq;
    trierep.encseqreadinfo->readmode = suffixarray.readmode;
    characters
      = gt_alphabet_characters(gt_encseq_alphabet(suffixarray.encseq));
    gt_mergertrie_initnodetable(&trierep,totallength,1U);
    maketrie(&trierep,characters,totallength);
    if (onlyins)
    {
#ifdef WITHTRIEIDENT
#ifdef WITHTRIESHOW
      showtrie(&trierep,characters);
#endif
      checktrie(&trierep,totallength+1,totallength,err);
#endif
    } else
    {
#ifdef WITHTRIEIDENT
#ifdef WITHTRIESHOW
      showallnoderelations(trierep.root);
#endif
#endif
      successivelydeletesmallest(&trierep,totallength,characters,err);
    }
    gt_mergertrie_delete(&trierep);
  }
  gt_freesuffixarray(&suffixarray);
  return haserr ? -1 : 0;
}
Пример #12
0
/* Create a GtCondenseq for the original encoded sequence <orig_es>:
   start from an empty object over the alphabet of <orig_es>, record the
   number of sequences and the total length of the original, fill the
   ssptab and finally process the descriptions (using <logger>). */
GtCondenseq *gt_condenseq_new(const GtEncseq *orig_es, GtLogger *logger)
{
  GtCondenseq *ces = condenseq_new_empty(gt_encseq_alphabet(orig_es));

  ces->orig_num_seq = gt_encseq_num_of_sequences(orig_es);
  ces->ssptab = condenseq_fill_tab(ces, orig_es);
  ces->orig_length = gt_encseq_total_length(orig_es);
  condenseq_process_descriptions(ces, orig_es, logger);
  return ces;
}
Пример #13
0
/* Run the bottom-up traversal over the suffix array read through <ssar>,
   recording results in the bit table <contained>, and return the counter
   of the traversal state afterwards.
   <read_length> == 0 selects the variable-length traversal
   (gt_esa_bottomup_rdjcv), any other value the fixed-length traversal
   (gt_esa_bottomup_rdjce). A progress bar over the total length is shown
   if <show_progressbar> is true. */
unsigned long gt_contfind_bottomup(Sequentialsuffixarrayreader *ssar,
                     bool show_progressbar, GtBitsequence *contained,
                     unsigned long firstrevcompl,
                     unsigned long read_length /* 0 = variable */)
{
  ContfindBUstate state;
  unsigned long totallength;
  GT_UNUSED int retval;
  const bool variable_length = (read_length == 0);

  gt_assert(ssar != NULL);
  gt_assert(contained != NULL);

  state.contained = contained;
  state.encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  totallength = gt_encseq_total_length(state.encseq);
  state.nofsequences = gt_encseq_num_of_sequences(state.encseq);

  if (variable_length)
  {
    prepare_sspbittab_and_shortest(totallength, &state);
  } else
  {
    state.shortest = read_length;
    state.spacing = read_length + 1;
  }

  state.show_progressbar = show_progressbar;
  state.csize = 0;
  state.cmin = 0;
  state.firstrevcompl = firstrevcompl;
  state.counter = 0;

  if (show_progressbar)
  {
    state.progress = 0;
    gt_progressbar_start(&(state.progress),
        (unsigned long long)totallength);
  }

  if (variable_length)
  {
    retval = gt_esa_bottomup_rdjcv(ssar, &state, NULL);
  } else
  {
    retval = gt_esa_bottomup_rdjce(ssar, &state, NULL);
  }
  gt_assert(retval == 0);

  if (show_progressbar)
  {
    gt_progressbar_stop();
  }
  if (variable_length)
  {
    /* the table is only allocated in the variable-length case */
    gt_free(state.sspbittab);
  }
  return state.counter;
}
Пример #14
0
static void showmergertrie2(const Mergertrierep *trierep,
                            const GtUchar *characters,
                            unsigned int level,
                            const Mergertrienode *node)
{
  GtUchar cc = 0;
  GtUword pos, endpos;
  Mergertrienode *current;

  for (current = node->firstchild;
       current != NULL;
       current = current->rightsibling)
  {
    printf("%*.*s",(int) (6 * level),(int) (6 * level)," ");
    if (MTRIE_ISLEAF(current))
    {
      endpos = gt_encseq_total_length(
                                 trierep->encseqtable[current->suffixinfo.idx]);
    } else
    {
      endpos = current->suffixinfo.startpos + current->depth;
    }
    for (pos = current->suffixinfo.startpos + node->depth;
         pos < endpos; pos++)
    {
      cc = gt_encseq_get_encoded_char( /* just for testing */
              trierep->enseqreadinfo[current->suffixinfo.idx].encseqptr,
              pos,
              trierep->enseqreadinfo[current->suffixinfo.idx].readmode);
      if (ISSPECIAL(cc))
      {
        printf("#\n");
        break;
      }
      printf("%c",characters[(int) cc]);
    }
    if (MTRIE_ISLEAF(current))
    {
      if (!ISSPECIAL(cc))
      {
        printf("~\n");
      }
    } else
    {
      printf(" d="GT_WU",i=" Formatuint64_t "\n",
            current->depth,
            PRINTuint64_tcast(current->suffixinfo.ident));
      showmergertrie2(trierep,characters,level+1,current);
    }
  }
}
Пример #15
0
/* Lua binding: re-initialise the encseq reader at stack index 1 for the
   encoded sequence at index 2, using the read mode at index 3 and the start
   position at index 4. Raises a Lua argument error if the start position
   is not smaller than the total length. Pushes no results. */
static int encseq_reader_lua_reinit_with_readmode(lua_State *L)
{
  GtEncseqReader **reader_ud = check_encseq_reader(L, 1);
  GtEncseq **encseq_ud = check_encseq(L, 2);
  GtReadmode mode = luaL_checknumber(L, 3);
  GtUword start = luaL_checknumber(L, 4);

  luaL_argcheck(L, start < gt_encseq_total_length(*encseq_ud), 4,
                "cannot exceed total length of encoded sequence");
  gt_encseq_reader_reinit_with_readmode(*reader_ud, *encseq_ud, mode, start);
  return 0;
}
Пример #16
0
/* Lua binding: push, as a string of length one, the decoded character of
   the encoded sequence at stack index 1 found at the position given at
   index 2 when read in the mode given at index 3. Raises a Lua argument
   error if the position is not smaller than the total length.
   Returns the number of results pushed (1). */
static int encseq_lua_get_decoded_char(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);
  GtUword position = luaL_checknumber(L, 2);
  int mode = luaL_checknumber(L, 3);
  char decoded;

  luaL_argcheck(L, position < gt_encseq_total_length(*encseq_ud), 2,
                "cannot exceed total length of encoded sequence");
  decoded = gt_encseq_get_decoded_char(*encseq_ud, position, mode);
  lua_pushlstring(L, &decoded, sizeof (char));
  return 1;
}
Пример #17
0
/* Print one line per entry of <suffixsortspace> (read with offset
   <subbucketleft>) for the indices 0 up to and including the total length
   of <encseq>: the index, the stored position, and the output of
   gt_encseq_showatstartposwithdepth() for that position and <depth>. */
void gt_showentiresuftab(const GtEncseq *encseq,
                         GtReadmode readmode,
                         const GtSuffixsortspace *suffixsortspace,
                         GtUword subbucketleft,
                         GtUword depth)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  GtUword idx = 0;

  while (idx <= totallength)
  {
    const GtUword pos
      = gt_suffixsortspace_get(suffixsortspace,subbucketleft,idx);

    printf("suftab["GT_WU"]="GT_WU" ",idx,pos);
    gt_encseq_showatstartposwithdepth(stdout,encseq,readmode,pos,depth);
    printf("\n");
    idx++;
  }
}
Пример #18
0
/* Write statistics about <bs> to <outfp>: the name of the sequence file,
   the number of sequences, the total length (total length of the encoded
   sequence minus the number of sequences plus one) and the length of
   every single sequence. */
void gt_bioseq_show_stat(GtBioseq *bs, GtFile *outfp)
{
  GtUword seqnum, num_of_seqs, reported_length;

  gt_assert(bs);
  num_of_seqs = gt_bioseq_number_of_sequences(bs);
  gt_file_xprintf(outfp, "showing statistics for sequence file \"%s\"\n",
                  gt_str_get(bs->sequence_file));
  gt_file_xprintf(outfp, "number of sequences: "GT_WU"\n", num_of_seqs);
  reported_length = gt_encseq_total_length(bs->encseq)
                      - gt_encseq_num_of_sequences(bs->encseq) + 1;
  gt_file_xprintf(outfp, "total length: "GT_WU"\n", reported_length);
  seqnum = 0;
  while (seqnum < num_of_seqs)
  {
    gt_file_xprintf(outfp, "sequence #"GT_WU" length: "GT_WU"\n", seqnum+1,
                    gt_bioseq_get_sequence_length(bs, seqnum));
    seqnum++;
  }
}
Пример #19
0
/* Lua binding: extract the encoded characters of the sequence at stack
   index 1 in the inclusive range given by indices 2 (from) and 3 (to)
   into a newly allocated buffer and hand that buffer to
   encseq_lua_push_buffer(). Raises a Lua argument error if from > to or
   if to is not smaller than the total length.
   Returns the number of results pushed (1). */
static int encseq_lua_extract_encoded(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);
  GtUword from = luaL_checknumber(L, 2);
  GtUword to = luaL_checknumber(L, 3);
  unsigned char *buffer;

  luaL_argcheck(L, from <= to, 2, "must be <= range endposition");
  luaL_argcheck(L, to < gt_encseq_total_length(*encseq_ud), 3,
                "cannot exceed total length of encoded sequence");
  /* both bounds are inclusive, hence the + 1 */
  buffer = gt_malloc((to - from + 1) * sizeof (unsigned char));
  gt_encseq_extract_encoded(*encseq_ud, buffer, from, to);
  encseq_lua_push_buffer(L, buffer, (to - from + 1));
  return 1;
}
Пример #20
0
/* Lua binding: create an encseq reader for the encoded sequence at stack
   index 1 with the read mode at index 2 and the start position at index 3,
   and push it via gt_lua_encseq_reader_push(). Raises a Lua argument error
   if the start position is not smaller than the total length.
   Returns the number of results pushed (1). */
static int encseq_lua_create_reader_with_readmode(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);
  GtReadmode mode = luaL_checknumber(L, 2);
  GtUword start = luaL_checknumber(L, 3);
  GtEncseqReader *reader;

  luaL_argcheck(L, start < gt_encseq_total_length(*encseq_ud), 3,
                "cannot exceed total length of encoded sequence");
  reader = gt_encseq_create_reader_with_readmode(*encseq_ud, mode, start);
  gt_assert(reader);
  gt_lua_encseq_reader_push(L, reader);
  return 1;
}
Пример #21
0
/* Return the character at offset <prevdepth> of the suffix stored in
   <node>, read from the sequence the node refers to. For a leaf whose
   suffix does not reach that far, SEPARATOR is returned instead. */
static GtUchar getfirstedgechar(const Mergertrierep *trierep,
                              const Mergertrienode *node,
                              GtUword prevdepth)
{
  Encseqreadinfo *seqinfo = trierep->encseqreadinfo + node->suffixinfo.idx;

  /* only a leaf can run past the end of its sequence */
  if (!MTRIE_ISLEAF(node) ||
      node->suffixinfo.startpos + prevdepth <
      gt_encseq_total_length(seqinfo->encseqptr))
  {
    return gt_encseq_get_encoded_char(seqinfo->encseqptr, /* Random access */
                          node->suffixinfo.startpos + prevdepth,
                          seqinfo->readmode);
  }
  return (GtUchar) SEPARATOR;
}
Пример #22
0
/* Create a kmer code iterator over <encseq>, beginning at <startpos> and
   reading in direction <readmode>. If fewer than <kmersize> characters
   remain from <startpos>, the iterator is returned in the exhausted state
   without a reader and without a kmer stream. Otherwise the first
   <kmersize> characters are read into the window of the kmer stream.
   A reverse direction requires <startpos> to be 0. */
/*@notnull@*/ GtKmercodeiterator *gt_kmercodeiterator_encseq_new(
                                            const GtEncseq *encseq,
                                            GtReadmode readmode,
                                            unsigned int kmersize,
                                            unsigned long startpos)
{
  GtKmercodeiterator *kmercodeiterator;
  Kmerstream *stream;
  const unsigned long windowend = startpos + (unsigned long) kmersize;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator = gt_malloc(sizeof (*kmercodeiterator));
  kmercodeiterator->totallength = gt_encseq_total_length(encseq);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);

  /* set in both states */
  kmercodeiterator->fb = NULL;
  kmercodeiterator->encseq = encseq;

  if (kmercodeiterator->totallength - startpos < (unsigned long) kmersize)
  {
    kmercodeiterator->inputexhausted = true;
    kmercodeiterator->esr = NULL;
    kmercodeiterator->spwp = NULL;
    return kmercodeiterator;
  }

  kmercodeiterator->inputexhausted = false;
  kmercodeiterator->readmode = readmode;
  kmercodeiterator->esr = gt_encseq_create_reader_with_readmode(encseq,
                                                                readmode,
                                                                startpos);
  stream = kmerstream_new(gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq)),
                          kmersize);
  kmercodeiterator->spwp = stream;
  kmercodeiterator->hasprocessedfirst = false;

  /* read the first window */
  kmercodeiterator->currentposition = startpos;
  while (kmercodeiterator->currentposition < windowend)
  {
    const GtUchar charcode
      = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);

    stream->windowwidth++;
    updatespecialpositions(stream,charcode,false,0);
    stream->cyclicwindow[stream->windowwidth-1] = charcode;
    kmercodeiterator->currentposition++;
  }
  return kmercodeiterator;
}
Пример #23
0
/* Extract a substring of <encseq> that begins at a position chosen with
   random() and store its encoded characters in <seqspace>. The requested
   <substringlength> is shortened if the substring would extend beyond the
   end of the sequence. Returns the length actually extracted. */
static GtUword samplesubstring(GtUchar *seqspace,
                              const GtEncseq *encseq,
                              GtUword substringlength)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  const GtUword first = (GtUword) (random() % totallength);

  if (first + substringlength > totallength)
  {
    /* clip at the end of the sequence */
    substringlength = totallength - first;
  }
  gt_assert(substringlength > 0);
  gt_encseq_extract_encoded(encseq,seqspace,first,
                                       first+substringlength-1);
  return substringlength;
}
Пример #24
0
/* Write the project information of an index as key=value lines to
   <outprj>: the total length and number of sequences of <encseq>, the
   statistics passed in by the caller (<numberofallsortedsuffixes>,
   <prefixlength>, <numoflargelcpvalues>, <averagelcp>, <maxbranchdepth>,
   and <longest> if it is defined), the word size and endianness of the
   machine, the <readmode>, and whether <encseq> is mirrored. */
static void showprjinfo(FILE *outprj,
                        GtReadmode readmode,
                        const GtEncseq *encseq,
                        GtUword numberofallsortedsuffixes,
                        unsigned int prefixlength,
                        GtUword numoflargelcpvalues,
                        double averagelcp,
                        GtUword maxbranchdepth,
                        const Definedunsignedlong *longest)
{
  GtUword totallength;
  GtUword numofsequences;

  totallength = gt_encseq_total_length(encseq);
  fprintf(outprj,"totallength="GT_WU"\n",totallength);
  /* NOTE(review): PRJSPECIALOUT is a macro defined outside this block;
     presumably each use writes the named statistic of encseq to outprj and
     therefore relies on these local names - confirm before renaming. */
  PRJSPECIALOUT(specialcharacters);
  PRJSPECIALOUT(specialranges);
  PRJSPECIALOUT(realspecialranges);
  PRJSPECIALOUT(lengthofspecialprefix);
  PRJSPECIALOUT(lengthofspecialsuffix);
  PRJSPECIALOUT(wildcards);
  PRJSPECIALOUT(wildcardranges);
  PRJSPECIALOUT(realwildcardranges);
  PRJSPECIALOUT(lengthofwildcardprefix);
  PRJSPECIALOUT(lengthofwildcardsuffix);
  numofsequences = gt_encseq_num_of_sequences(encseq);
  /* the same count is written under two keys; the query count is always 0 */
  fprintf(outprj,"numofsequences="GT_WU"\n",numofsequences);
  fprintf(outprj,"numofdbsequences="GT_WU"\n",numofsequences);
  fprintf(outprj,"numofquerysequences=0\n");
  fprintf(outprj,"numberofallsortedsuffixes="GT_WU"\n",
          numberofallsortedsuffixes);
  /* the longest entry is optional */
  if (longest->defined)
  {
    fprintf(outprj,"longest="GT_WU"\n",longest->valueunsignedlong);
  }
  fprintf(outprj,"prefixlength=%u\n",prefixlength);
  fprintf(outprj,"largelcpvalues="GT_WU"\n",numoflargelcpvalues);
  fprintf(outprj,"averagelcp=%.2f\n",averagelcp);
  fprintf(outprj,"maxbranchdepth="GT_WU"\n",maxbranchdepth);
  /* width of GtUword in bits */
  fprintf(outprj,"integersize=%u\n",
                  (unsigned int) (sizeof (GtUword) * CHAR_BIT));
  fprintf(outprj,"littleendian=%c\n",gt_is_little_endian() ? '1' : '0');
  fprintf(outprj,"readmode=%u\n",(unsigned int) readmode);
  fprintf(outprj,"mirrored=%c\n", gt_encseq_is_mirrored(encseq) ? '1' : '0');
}
Пример #25
0
/* Read every unit of the exported two-bit encoding of <encseq> once.
   For a mirrored sequence only the units covering (totallength - 1)/2
   positions are read. The units are summed up and the sum is logged,
   so that the compiler cannot remove the accesses during optimization. */
static void gt_readjoiner_assembly_pump_encseq_through_cache(
    const GtEncseq *encseq)
{
  const GtTwobitencoding *units = gt_encseq_twobitencoding_export(encseq);
  const GtUword totallength = gt_encseq_total_length(encseq);
  GtUword numofunits, unit = 0;
  uint64_t sum = 0;

  if (gt_encseq_is_mirrored(encseq))
  {
    numofunits = gt_unitsoftwobitencoding((totallength - 1)/2);
  } else
  {
    numofunits = gt_unitsoftwobitencoding(totallength);
  }
  while (unit < numofunits)
  {
    sum += units[unit];
    unit++;
  }
  gt_assert(sum > 0);
#ifndef S_SPLINT_S
  gt_log_log("encseq codes-sum: %"PRIu64, sum);
#endif
}
Пример #26
0
/* Reposition <kmercodeiterator> on its encoded sequence so that it starts
   at <startpos> and reads in direction <readmode>. The kmer size is taken
   from the existing kmer stream of the iterator. If fewer than kmersize
   characters remain from <startpos>, the iterator is put into the exhausted
   state and its reader and kmer stream are deleted; otherwise both are
   reused and the first window is read. A reverse direction requires
   <startpos> to be 0.
   NOTE(review): the kmer size is read from kmercodeiterator->spwp without
   a NULL check, and the exhausted branch below sets spwp (and esr) to NULL.
   Calling this function on an iterator that is already in the exhausted
   state would therefore dereference NULL - confirm that callers never do
   this. */
void gt_kmercodeiterator_reset(GtKmercodeiterator *kmercodeiterator,
                               GtReadmode readmode,
                               GtUword startpos)
{
  GtUchar charcode;
  const GtEncseq *encseq = kmercodeiterator->encseq;
  GtUword kmersize = (GtUword) kmercodeiterator->spwp->kmersize;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator->totallength = gt_encseq_total_length(encseq);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);
  kmercodeiterator->fb = NULL;
  if (kmercodeiterator->totallength - startpos < kmersize)
  {
    /* not enough characters left for a single kmer: release reader and
       kmer stream and mark the iterator as exhausted */
    kmercodeiterator->inputexhausted = true;
    gt_encseq_reader_delete(kmercodeiterator->esr);
    kmercodeiterator->esr = NULL;
    kmerstream_delete(kmercodeiterator->spwp);
    kmercodeiterator->spwp = NULL;
  } else
  {
    kmercodeiterator->inputexhausted = false;
    kmercodeiterator->readmode = readmode;
    /* reuse the existing reader and kmer stream */
    gt_encseq_reader_reinit_with_readmode(kmercodeiterator->esr,
                                          encseq,
                                          readmode,
                                          startpos);
    kmerstream_reset(kmercodeiterator->spwp);
    kmercodeiterator->hasprocessedfirst = false;
    /* read the first kmersize characters into the window */
    for (kmercodeiterator->currentposition = startpos;
         kmercodeiterator->currentposition < startpos+(GtUword) kmersize;
         kmercodeiterator->currentposition++)
    {
      charcode = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);
      kmercodeiterator->spwp->windowwidth++;
      kmerstream_updatespecialpositions(kmercodeiterator->spwp,charcode,
                                        false,0);
      kmercodeiterator->spwp->cyclicwindow[kmercodeiterator->
                                           spwp->windowwidth-1] = charcode;
    }
  }
}
Пример #27
0
/* Create a codon iterator of the encseq class over the range of <encseq>
   that starts at <startpos> and spans <length> positions, read in
   direction <readmode>. The iterator holds its own reference to <encseq>.
   The last position of the range must be smaller than the total length
   of <encseq>. <err> is only checked, not set. */
GtCodonIterator* gt_codon_iterator_encseq_new_with_readmode(GtEncseq *encseq,
                                                         unsigned long startpos,
                                                         unsigned long length,
                                                         GtReadmode readmode,
                                                         GT_UNUSED GtError *err)
{
  GtCodonIterator *ci;
  GtCodonIteratorEncseq *cie;

  gt_assert(encseq && startpos + length - 1 < gt_encseq_total_length(encseq));
  gt_error_check(err);
  ci = gt_codon_iterator_create(gt_codon_iterator_encseq_class());

  /* members specific to the encseq implementation */
  cie = gt_codon_iterator_encseq_cast(ci);
  cie->encseq = gt_encseq_ref(encseq);
  cie->readmode = readmode;

  /* members shared by all codon iterators */
  ci->pvt->startpos = startpos;
  ci->pvt->curpos = 0;
  ci->pvt->length = length;
  return ci;
}
Пример #28
0
/* Assertion-based check of a reverse match of length <len> between
   position <pos1> and position <pos2> of sequence <seqnum2> in <encseq>.
   Only matches with <readmode> == GT_READMODE_REVERSE are checked: the
   characters at <pos1> read forwards must equal the characters of the
   second instance read backwards and must not be special, and the
   characters bounding the match (after the first, before the second
   instance) must differ or be special. */
static void verifymatch(const GtEncseq *encseq,
                        GtUword len,
                        GtUword pos1,
                        uint64_t seqnum2,
                        GtUword pos2,
                        GtReadmode readmode)
{
  GtUword offset, seqstartpos, totallength;
  GtUchar cc1, cc2;

  if (readmode != GT_READMODE_REVERSE)
  {
    return;
  }
  totallength = gt_encseq_total_length(encseq);
  seqstartpos = gt_encseq_seqstartpos(encseq, seqnum2);
  /* turn the sequence-relative position into an absolute one */
  pos2 += seqstartpos;
  for (offset = 0; offset < len; offset++)
  {
    gt_assert(pos1 + len - 1 < totallength);
    gt_assert(pos2 + len - 1 < totallength);
    cc1 = gt_encseq_get_encoded_char(encseq,pos1+offset,GT_READMODE_FORWARD);
    cc2 = gt_encseq_get_encoded_char(encseq,pos2+len-1-offset,
                                     GT_READMODE_FORWARD);
    gt_assert(cc1 == cc2 && ISNOTSPECIAL(cc1));
  }
  /* a sequence boundary counts as a separator */
  cc1 = (pos1 + len < totallength)
          ? gt_encseq_get_encoded_char(encseq,pos1+len,GT_READMODE_FORWARD)
          : SEPARATOR;
  cc2 = (pos2 > 0)
          ? gt_encseq_get_encoded_char(encseq,pos2-1,GT_READMODE_FORWARD)
          : SEPARATOR;
  gt_assert(cc1 != cc2 || ISSPECIAL(cc1));
}
Пример #29
0
/* Create a new branching node of depth <currentdepth> that takes the place
   of <oldnode> among its siblings and has two children: <oldnode> and a
   new leaf for <suffixinfo>. The children are ordered by comparing the
   characters both suffixes have at offset <currentdepth>; a suffix that
   ends before this offset is treated as having a SEPARATOR there.
   Returns the new branching node. */
static Mergertrienode *mtrie_makenewbranch(Mergertrierep *trierep,
                                     Suffixinfo *suffixinfo,
                                     GtUword currentdepth,
                                     Mergertrienode *oldnode)
{
  Encseqreadinfo *seqinfo = trierep->encseqreadinfo + suffixinfo->idx;
  Mergertrienode *branch, *leaf;
  GtUchar oldchar, newchar;

#ifdef WITHTRIEIDENT
#ifdef WITHTRIESHOW
  printf("makenewbranch(ident=" Formatuint64_t ")\n",
          PRINTuint64_tcast(suffixinfo->ident));
#endif
#endif
  branch = newMergertrienode(trierep);
  branch->suffixinfo = *suffixinfo;
  branch->rightsibling = oldnode->rightsibling;
  oldchar = getfirstedgechar(trierep,oldnode,currentdepth);
  newchar = (suffixinfo->startpos + currentdepth >=
             gt_encseq_total_length(seqinfo->encseqptr))
              ? (GtUchar) SEPARATOR
              : gt_encseq_get_encoded_char(seqinfo->encseqptr,
                                           suffixinfo->startpos + currentdepth,
                                           seqinfo->readmode);
  leaf = mtrie_makenewleaf(trierep,suffixinfo);
  if (mtrie_comparecharacters(oldchar,oldnode->suffixinfo.idx,
                              newchar,suffixinfo->idx) > 0)
  {
    /* the new suffix is smaller: the new leaf becomes the first child */
    makesuccs(branch,leaf,oldnode);
  } else
  {
    makesuccs(branch,oldnode,leaf);
  }
  branch->depth = currentdepth;
  return branch;
}
Пример #30
0
/* Create a wavelet tree over <encseq>. The tree holds its own references
   to <encseq> and to its alphabet. The bit vectors of all levels are first
   filled into one plain bit sequence, which is then converted into a
   compressed bit sequence; the plain version is freed afterwards. */
GtWtree* gt_wtree_encseq_new(GtEncseq *encseq)
{
  /* sample rate for the compressed bit sequence */
  const unsigned int samplerate = 32U;
  /* number of bits stored in one GtBitsequence element */
  const size_t bits_per_unit = sizeof (GtBitsequence) * CHAR_BIT;
  GtWtree *wtree = gt_wtree_create(gt_wtree_encseq_class());
  GtWtreeEncseq *wte = gt_wtree_encseq_cast(wtree);

  wte->encseq = gt_encseq_ref(encseq);
  wte->alpha = gt_alphabet_ref(gt_encseq_alphabet(encseq));
  /* gt_alphabet_size covers the encoded characters and the wildcard;
     UNDEFCHAR and SEPARATOR need a symbol each as well */
  wte->alpha_size = gt_alphabet_size(wte->alpha) + 2;
  wtree->members->num_of_symbols = (GtUword) wte->alpha_size;
  /* number of levels: \lceil log_2(\sigma)\rceil */
  wte->levels = gt_determinebitspervalue((GtUword) wte->alpha_size);
  wte->root_fo = gt_wtree_encseq_fill_offset_new();
  wte->current_fo = wte->root_fo;
  wtree->members->length = gt_encseq_total_length(encseq);
  /* every level stores one bit per symbol of the sequence */
  wte->num_of_bits = wte->levels * wtree->members->length;
  /* number of units needed, rounded up */
  wte->bits_size = wte->num_of_bits / bits_per_unit;
  if (wte->num_of_bits % bits_per_unit != 0)
  {
    wte->bits_size++;
  }
  wte->bits = gt_calloc((size_t) wte->bits_size, sizeof (GtBitsequence));
  wte->node_start = 0;
  gt_wtree_encseq_fill_bits(wte);
  wte->c_bits = gt_compressed_bitsequence_new(wte->bits,
                                              samplerate,
                                              wte->num_of_bits);
  gt_free(wte->bits);
  wte->bits = NULL;
  return wtree;
}