Example #1
0
/* Builds the integer code for the kmersize positions beginning at startpos.
   As soon as a position is special or lies at/behind totallength, this and
   all following positions are fed to ADDNEXTCHAR with the value
   numofchars-1 instead of the character read from encseq. */
static GtCodetype qgram2codefillspecial(unsigned int numofchars,
                                      unsigned int kmersize,
                                      const GtEncseq *encseq,
                                      GtReadmode readmode,
                                      GtUword startpos,
                                      GtUword totallength)
{
  /* start pessimistic: assume the first position is already a filler */
  GtCodetype integercode = (GtCodetype) (numofchars - 1);
  bool foundspecial = true;
  GtUword pos;
  GtUchar cc;

  if (startpos < totallength)
  {
    /* for testing */
    cc = gt_encseq_get_encoded_char(encseq,startpos,readmode);
    if (!ISSPECIAL(cc))
    {
      integercode = (GtCodetype) cc;
      foundspecial = false;
    }
  }
  for (pos = startpos + 1; pos < startpos + kmersize; pos++)
  {
    if (!foundspecial && pos < totallength)
    {
      /* for testing */
      cc = gt_encseq_get_encoded_char(encseq,pos,readmode);
      if (!ISSPECIAL(cc))
      {
        ADDNEXTCHAR(integercode,cc,numofchars);
        continue;
      }
    }
    /* special character, end of sequence, or filler mode already active */
    ADDNEXTCHAR(integercode,numofchars-1,numofchars);
    foundspecial = true;
  }
  return integercode;
}
Example #2
0
/* Walks from idx downwards while idx is larger than targetoffset[source].
   For every suffix stored at the visited index whose left neighbour is a
   non-special character with a not yet sorted superbucket, the position of
   that left neighbour is written to targetoffset[leftchar], which is then
   decremented. */
static void backwardderive(const GtBucketspec2 *bucketspec2,
                           GtSuffixsortspace *suffixsortspace,
                           GtUword *targetoffset,
                           unsigned int source,
                           GtUword idx)
{
  /* both sides are shifted by one so that the comparison also works when
     idx has been decremented below 0 and wrapped around */
  while (idx + 1 > targetoffset[source] + 1)
  {
    const GtUword suffixstart
      = gt_suffixsortspace_getdirect(suffixsortspace,idx);

    if (suffixstart > 0)
    {
      const GtUchar leftchar
        = gt_encseq_get_encoded_char(bucketspec2->encseq,
                                     suffixstart-1,
                                     bucketspec2->readmode);

      if (ISNOTSPECIAL(leftchar) &&
          !bucketspec2->superbuckettab[leftchar].sorted)
      {
        gt_suffixsortspace_setdirect(suffixsortspace,targetoffset[leftchar],
                                     suffixstart - 1);
        targetoffset[leftchar]--;
      }
    }
    idx--;
  }
}
/* Returns the character at relative position pos of the sequence object.
   Depending on which member of seq is set, the character is taken from
   (1) the two-bit encoding, (2) a cache filled sequentially from an
   encseq reader, or (3) the encseq via random access. */
static GtUchar sequenceobject_get_char(Sequenceobject *seq,GtUword pos)
{
  if (seq->twobitencoding != NULL)
  {
    /* direct access; pos is added to or subtracted from startpos
       depending on the direction flag */
    return gt_twobitencoding_char_at_pos(seq->twobitencoding,
                                         seq->forward ? seq->startpos + pos
                                                      : seq->startpos - pos);
  }
  if (seq->encseqreader != NULL)
  {
    /* number of positions by which the cache is extended/shifted */
    const GtUword addamount = 256UL;

    /* If a minimum access position is set and it is at least addamount
       behind the current cache offset, move the cached characters from
       min_access_pos onwards to the beginning of the buffer. */
    if (seq->min_access_pos != GT_UWORD_MAX &&
        seq->min_access_pos >= seq->cache_offset + addamount)
    {
      GtUword idx, end = MIN(seq->cache_num_positions,seq->substringlength);
      /* cs is the buffer start shifted by the new offset, so that cs[idx]
         and cache_ptr[idx] both refer to position idx */
      GtUchar *cs = ((GtUchar *) seq->sequence_cache->space)
                    - seq->min_access_pos;

      for (idx = seq->min_access_pos; idx < end; idx++)
      {
        cs[idx] = seq->cache_ptr[idx];
      }
      seq->cache_offset = seq->min_access_pos;
      /* cache_ptr is always the buffer start minus cache_offset, so it can
         be indexed with the position itself */
      seq->cache_ptr = ((GtUchar *) seq->sequence_cache->space)
                       - seq->cache_offset;
    }
    /* pos has not been read yet: fetch the next characters from the
       reader, at most addamount and not beyond substringlength */
    if (pos >= seq->cache_num_positions)
    {
      GtUword idx, tostore;

      tostore = MIN(seq->cache_num_positions + addamount,seq->substringlength);
      if (tostore > seq->cache_offset + seq->sequence_cache->allocated)
      {
        /* enlarge the buffer by addamount and recompute the shifted
           pointer, since gt_realloc may have moved the buffer */
        seq->sequence_cache->allocated += addamount;
        seq->sequence_cache->space
          = gt_realloc(seq->sequence_cache->space,
                       sizeof (GtUchar) * seq->sequence_cache->allocated);
        seq->cache_ptr = ((GtUchar *) seq->sequence_cache->space)
                         - seq->cache_offset;
      }
      gt_assert(pos >= seq->cache_offset);
      for (idx = seq->cache_num_positions; idx < tostore; idx++)
      {
        seq->cache_ptr[idx]
          = gt_encseq_reader_next_encoded_char(seq->encseqreader);
      }
      seq->cache_num_positions = tostore;
    }
    /* NOTE(review): the code relies on pos < tostore after the refill,
       i.e. on accesses not jumping ahead by more than addamount --
       confirm against the callers */
    gt_assert(pos < seq->cache_offset + seq->sequence_cache->allocated);
    gt_assert(seq->cache_ptr != NULL);
    return seq->cache_ptr[pos];
  }
  /* fallback: random access in the encoded sequence */
  gt_assert(seq->encseq != NULL);
  gt_assert(seq->forward || seq->startpos >= pos);
  return gt_encseq_get_encoded_char(seq->encseq,
                                    seq->forward ? seq->startpos + pos
                                                 : seq->startpos - pos,
                                    GT_READMODE_FORWARD);
}
Example #4
0
/* Starting with a mer of the given length at startpos, extends the mer
   character by character to the right. For every mer length between
   minmersize and maxmersize reached this way, a count of 1 is added to
   distribution. The extension stops at maxmersize, at totallength, or at
   the first special character. */
static void iteritvdistribution(GtArrayuint64_t *distribution,
                                const GtEncseq *encseq,
                                GtReadmode readmode,
                                unsigned long totallength,
                                unsigned long minmersize,
                                unsigned long maxmersize,
                                unsigned long length,
                                unsigned long startpos)
{
    unsigned long merlen = length,
                  lastpos = startpos + length - 1;

    if (length > maxmersize)
    {
        return;
    }
    while (merlen <= maxmersize &&
            lastpos < totallength &&
            ISNOTSPECIAL(gt_encseq_get_encoded_char(encseq,lastpos,readmode)))
    {
        if (merlen >= minmersize)
        {
            adddistributionuint64_t(distribution,merlen,1UL);
        }
        lastpos++;
        merlen++;
    }
}
/* Consistency check: copies the mer of length dfsstate->mersize starting
   at position into dfsstate->currentmer, counts its occurrences with an
   mmsearch iterator over the suffix table and terminates the program with
   GT_EXIT_PROGRAMMING_ERROR if that count differs from countocc. */
static void checknumberofoccurrences(const TyrDfsstate *dfsstate,
                                     GtUword countocc,
                                     GtUword position)
{
  GtMMsearchiterator *mmsi;
  GtUword offset, bfcount;

  offset = 0;
  while (offset < dfsstate->mersize)
  {
    dfsstate->currentmer[offset]
      = gt_encseq_get_encoded_char(dfsstate->encseq,position+offset,
                                   dfsstate->readmode);
    offset++;
  }
  mmsi = gt_mmsearchiterator_new_complete_plain(dfsstate->encseq,
                                                dfsstate->suftab,
                                                0,
                                                dfsstate->totallength,
                                                0,
                                                dfsstate->readmode,
                                                dfsstate->currentmer,
                                                dfsstate->mersize);
  bfcount = gt_mmsearchiterator_count(mmsi);
  if (bfcount == countocc)
  {
    gt_mmsearchiterator_delete(mmsi);
    return;
  }
  /* counts disagree: report and abort */
  fprintf(stderr,"bfcount = "GT_WU" != "GT_WU" = countocc\n",
          bfcount,countocc);
  exit(GT_EXIT_PROGRAMMING_ERROR);
}
Example #6
0
/* Returns the character at relative position pos of the query described
   by queryrep. For every readmode other than GT_READMODE_FORWARD the
   position is mirrored via GT_REVERSEPOS; if the readmode is a complement
   mode (GT_ISDIRCOMPLEMENT), non-special characters are complemented.
   The character is read from queryrep->sequence if set, otherwise from
   queryrep->encseq. */
static GtUchar gt_mmsearch_accessquery(const GtQueryrepresentation *queryrep,
                                       GtUword pos)
{
  GtUword abspos;
  GtUchar cc; /* holds exactly what is returned, hence GtUchar */

  gt_assert(queryrep != NULL);
  gt_assert(pos < queryrep->seqlen);
  abspos = queryrep->startpos + (queryrep->readmode == GT_READMODE_FORWARD
                                  ? pos
                                  : GT_REVERSEPOS(queryrep->seqlen,pos));
  if (queryrep->sequence != NULL)
  {
    cc = queryrep->sequence[abspos];
  } else
  {
    gt_assert(queryrep->encseq != NULL);
    /* always read in forward direction: reversal and complement are
       handled in this function */
    cc = gt_encseq_get_encoded_char(queryrep->encseq,abspos,
                                    GT_READMODE_FORWARD);
  }
  if (GT_ISDIRCOMPLEMENT(queryrep->readmode))
  {
    if (ISSPECIAL(cc))
    {
      /* special characters are returned unchanged */
      return cc;
    }
    return GT_COMPLEMENTBASE(cc);
  } else
  {
    return cc;
  }
}
Example #7
0
/* Returns the encoded character at offset position within the sequence
   number index of bs, read in forward direction. */
GtUchar gt_bioseq_get_encoded_char(const GtBioseq *bs, GtUword index,
                                   GtUword position)
{
  gt_assert(bs);
  gt_assert(index < gt_encseq_num_of_sequences(bs->encseq));
  /* translate the sequence-relative offset into an absolute position */
  return gt_encseq_get_encoded_char(bs->encseq,
                                    gt_encseq_seqstartpos(bs->encseq, index)
                                      + position,
                                    GT_READMODE_FORWARD);
}
Example #8
0
/* Computes the maximal local similarity score between useq[0..ulen-1] and
   the characters of vencseq in the range [startpos,endpos), using the
   single score column scol of ulen+1 entries. The end coordinates of a
   maximal scoring local alignment (1-based, relative to useq and to
   startpos) are stored in maxpair; both are 0 if no positive score
   occurs. Returns the maximal score. */
static Scoretype swlocalsimilarityscore(Scoretype *scol,
                                        Maxscorecoord *maxpair,
                                        const Scorevalues *scorevalues,
                                        const GtUchar *useq,
                                        GtUword ulen,
                                        const GtEncseq *vencseq,
                                        GtUword startpos,
                                        GtUword endpos)
{
  Scoretype best = 0, cell, candidate, west, northwest;
  GtUword row, col;

  maxpair->umax = maxpair->vmax = 0;
  for (row = 0; row <= ulen; row++)
  {
    scol[row] = 0;
  }
  for (col = startpos; col < endpos; col++)
  {
    const GtUchar vcurrent = gt_encseq_get_encoded_char(vencseq,col,
                                                        GT_READMODE_FORWARD);

    gt_assert(vcurrent != (GtUchar) SEPARATOR);
    northwest = 0;
    for (row = 1; row <= ulen; row++)
    {
      const GtUchar ucurrent = useq[row-1];

      gt_assert(ucurrent != (GtUchar) SEPARATOR);
      west = scol[row]; /* value of the previous column in this row */
      cell = scol[row-1] + scorevalues->gapextend;
      candidate = northwest + REPLACEMENTSCORE(scorevalues,ucurrent,vcurrent);
      if (candidate > cell)
      {
        cell = candidate;
      }
      candidate = west + scorevalues->gapextend;
      if (candidate > cell)
      {
        cell = candidate;
      }
      if (cell < 0)
      {
        /* local alignment: negative scores are cut off at 0 */
        cell = 0;
      } else if (cell > best)
      {
        best = cell;
        maxpair->umax = row;
        maxpair->vmax = (GtUword) (col - startpos + 1);
      }
      scol[row] = cell;
      northwest = west;
    }
  }
  return best;
}
Example #9
0
/* Assertion-only check of a match in GT_READMODE_REVERSE (all other
   readmodes are ignored): the len characters from pos1 must equal, read
   backwards, the len characters from pos2 (relative to the start of
   sequence seqnum2), none of them may be special, and the match must not
   be extendable (character behind the first instance vs. character before
   the second instance). */
static void verifymatch(const GtEncseq *encseq,
                        GtUword len,
                        GtUword pos1,
                        uint64_t seqnum2,
                        GtUword pos2,
                        GtReadmode readmode)
{
  GtUword offset, totallength;
  GtUchar cc1, cc2;

  if (readmode != GT_READMODE_REVERSE)
  {
    return;
  }
  totallength = gt_encseq_total_length(encseq);
  /* make pos2 an absolute position */
  pos2 += gt_encseq_seqstartpos(encseq, seqnum2);
  if (len > 0)
  {
    /* both instances must lie completely inside the sequence */
    gt_assert(pos1 + len - 1 < totallength);
    gt_assert(pos2 + len - 1 < totallength);
  }
  for (offset = 0; offset < len; offset++)
  {
    cc1 = gt_encseq_get_encoded_char(encseq,pos1+offset,GT_READMODE_FORWARD);
    cc2 = gt_encseq_get_encoded_char(encseq,pos2+len-1-offset,
                                     GT_READMODE_FORWARD);
    gt_assert(cc1 == cc2 && ISNOTSPECIAL(cc1));
  }
  cc1 = (pos1 + len < totallength)
          ? gt_encseq_get_encoded_char(encseq,pos1+len,GT_READMODE_FORWARD)
          : SEPARATOR;
  cc2 = (pos2 > 0)
          ? gt_encseq_get_encoded_char(encseq,pos2-1,GT_READMODE_FORWARD)
          : SEPARATOR;
  gt_assert(cc1 != cc2 || ISSPECIAL(cc1));
}
Example #10
0
/* Returns the length of the longest common prefix of the ranges
   [start1,end1] in encseq1 and [start2,end2] in encseq2 (both bounds
   inclusive). The comparison stops at the first mismatch or at the first
   special character in encseq1. */
static GtUword getlcp(const GtEncseq *encseq1,
                            GtReadmode readmode1,
                            GtUword start1, GtUword end1,
                            const GtEncseq *encseq2,
                            GtReadmode readmode2,
                            GtUword start2, GtUword end2)
{
  GtUword lcplen = 0;

  while (start1 + lcplen <= end1 && start2 + lcplen <= end2)
  {
    const GtUchar left
      = gt_encseq_get_encoded_char(encseq1,start1 + lcplen,readmode1);
    const GtUchar right
      = gt_encseq_get_encoded_char(encseq2,start2 + lcplen,readmode2);

    if (left != right || ISSPECIAL(left))
    {
      break;
    }
    lcplen++;
  }
  return lcplen;
}
Example #11
0
/* Recursively prints the subtree below node to stdout, one edge per line,
   indented by 6 blanks per level. The edge label is printed using the
   characters table; a special character is shown as '#' and ends the
   label. A leaf edge that ends without a special character is terminated
   by '~'; for a branching node the depth and the ident are printed and the
   function recurses into it. */
static void showmergertrie2(const Mergertrierep *trierep,
                            const GtUchar *characters,
                            unsigned int level,
                            const Mergertrienode *node)
{
  GtUchar cc;
  GtUword pos, endpos;
  Mergertrienode *current;

  for (current = node->firstchild;
       current != NULL;
       current = current->rightsibling)
  {
    /* reset for every child: if the label loop below does not execute,
       the decision whether to print '~' must not depend on the last
       character of the previous sibling */
    cc = 0;
    printf("%*.*s",(int) (6 * level),(int) (6 * level)," ");
    if (MTRIE_ISLEAF(current))
    {
      /* a leaf edge extends to the end of its sequence */
      endpos = gt_encseq_total_length(
                                 trierep->encseqtable[current->suffixinfo.idx]);
    } else
    {
      endpos = current->suffixinfo.startpos + current->depth;
    }
    for (pos = current->suffixinfo.startpos + node->depth;
         pos < endpos; pos++)
    {
      cc = gt_encseq_get_encoded_char( /* just for testing */
              trierep->enseqreadinfo[current->suffixinfo.idx].encseqptr,
              pos,
              trierep->enseqreadinfo[current->suffixinfo.idx].readmode);
      if (ISSPECIAL(cc))
      {
        printf("#\n");
        break;
      }
      printf("%c",characters[(int) cc]);
    }
    if (MTRIE_ISLEAF(current))
    {
      if (!ISSPECIAL(cc))
      {
        printf("~\n");
      }
    } else
    {
      printf(" d="GT_WU",i=" Formatuint64_t "\n",
            current->depth,
            PRINTuint64_tcast(current->suffixinfo.ident));
      showmergertrie2(trierep,characters,level+1,current);
    }
  }
}
Example #12
0
/* Lua binding: expects an encseq (argument 1), a position (argument 2) and
   a readmode (argument 3) and pushes the encoded character at that
   position as a number. Raises a Lua argument error if the position is
   negative, not a valid number, or not smaller than the total length of
   the encoded sequence. */
static int encseq_lua_get_encoded_char(lua_State *L)
{
  GtEncseq **encseq;
  GtUword pos, totallength;
  double posarg;
  int readmode;
  unsigned char cc;
  encseq = check_encseq(L, 1);
  posarg = luaL_checknumber(L, 2);
  /* NOTE(review): readmode is passed on without a range check -- confirm
     that the callee tolerates arbitrary integers */
  readmode = luaL_checknumber(L, 3);
  totallength = gt_encseq_total_length(*encseq);
  /* Validate in floating point first: converting a negative, NaN or
     too large number to an unsigned integer is undefined behaviour. */
  luaL_argcheck(L, posarg >= 0, 2, "must not be negative");
  luaL_argcheck(L, posarg < (double) totallength, 2,
                "cannot exceed total length of encoded sequence");
  pos = (GtUword) posarg;
  /* exact check after conversion; the comparison above is subject to
     rounding of totallength to double */
  luaL_argcheck(L, pos < totallength, 2,
                "cannot exceed total length of encoded sequence");
  cc = gt_encseq_get_encoded_char(*encseq, pos, readmode);
  lua_pushnumber(L, cc);
  return 1;
}
Example #13
0
/* Returns the character at offset prevdepth of the suffix stored in node.
   For a leaf whose suffix ends at or before that offset, SEPARATOR is
   returned instead. */
static GtUchar getfirstedgechar(const Mergertrierep *trierep,
                              const Mergertrienode *node,
                              GtUword prevdepth)
{
  Encseqreadinfo *eri = trierep->encseqreadinfo + node->suffixinfo.idx;
  const GtUword edgestart = node->suffixinfo.startpos + prevdepth;

  if (MTRIE_ISLEAF(node) &&
      edgestart >= gt_encseq_total_length(eri->encseqptr))
  {
    return (GtUchar) SEPARATOR;
  }
  /* Random access */
  return gt_encseq_get_encoded_char(eri->encseqptr,edgestart,eri->readmode);
}
Example #14
0
/* Returns the character at relative position pos of the query. A query
   given as plain sequence must be in forward readmode; a query given as
   encseq must be in a readmode different from forward and is read in that
   readmode. */
static GtUchar gt_mmsearch_accessquery(const GtQueryrep *queryrep,
                                       unsigned long pos)
{
  unsigned long abspos;

  gt_assert(queryrep != NULL && pos < queryrep->length);
  abspos = queryrep->startpos + pos;
  if (queryrep->sequence == NULL)
  {
    gt_assert(queryrep->readmode != GT_READMODE_FORWARD &&
              queryrep->encseq != NULL);
    return gt_encseq_get_encoded_char(queryrep->encseq,abspos,
                                      queryrep->readmode);
  }
  gt_assert(queryrep->readmode == GT_READMODE_FORWARD);
  return queryrep->sequence[abspos];
}
Example #15
0
void gt_fprintfencseq(FILE *fpout,
                      const GtEncseq *encseq,
                      unsigned long start,
                      unsigned long wlen)
{
    unsigned long idx;
    GtUchar currentchar;
    const GtAlphabet *alpha;

    alpha = gt_encseq_alphabet(encseq);
    for (idx = start; idx < start + wlen; idx++)
    {
        currentchar = gt_encseq_get_encoded_char(encseq,
                      idx,
                      GT_READMODE_FORWARD);
        gt_assert(ISNOTSPECIAL(currentchar));
        gt_alphabet_echo_pretty_symbol(alpha,fpout,currentchar);
    }
}
Example #16
0
/* Creates a new branching node of depth currentdepth that takes the place
   of oldnode (it inherits the right sibling of oldnode) and has two
   children: oldnode and a new leaf for suffixinfo. The order of the two
   children is determined by comparing their characters at offset
   currentdepth with mtrie_comparecharacters. Returns the new branching
   node. */
static Mergertrienode *mtrie_makenewbranch(Mergertrierep *trierep,
                                     Suffixinfo *suffixinfo,
                                     GtUword currentdepth,
                                     Mergertrienode *oldnode)
{
  Encseqreadinfo *eri = trierep->encseqreadinfo + suffixinfo->idx;
  const GtUword newcharpos = suffixinfo->startpos + currentdepth;
  Mergertrienode *branch, *leaf;
  GtUchar oldchar, newchar;

#ifdef WITHTRIEIDENT
#ifdef WITHTRIESHOW
  printf("makenewbranch(ident=" Formatuint64_t ")\n",
          PRINTuint64_tcast(suffixinfo->ident));
#endif
#endif
  branch = newMergertrienode(trierep);
  branch->suffixinfo = *suffixinfo;
  branch->rightsibling = oldnode->rightsibling;
  oldchar = getfirstedgechar(trierep,oldnode,currentdepth);
  /* the new suffix may end before currentdepth is reached */
  if (newcharpos < gt_encseq_total_length(eri->encseqptr))
  {
    newchar = gt_encseq_get_encoded_char(eri->encseqptr,
                                         newcharpos,
                                         eri->readmode);
  } else
  {
    newchar = (GtUchar) SEPARATOR;
  }
  leaf = mtrie_makenewleaf(trierep,suffixinfo);
  if (mtrie_comparecharacters(oldchar,oldnode->suffixinfo.idx,
                              newchar,suffixinfo->idx) > 0)
  {
    makesuccs(branch,leaf,oldnode);
  } else
  {
    makesuccs(branch,oldnode,leaf);
  }
  branch->depth = currentdepth;
  return branch;
}
Example #17
0
/* Traces back through the edge matrix, starting at its last entry, and
   adds the corresponding edit operations to alignment until an entry
   without any edge bit is reached. Deletions have priority over
   replacements, which have priority over insertions. For every
   replacement and insertion, vlen is decremented and the character of
   encseq at startpos + vlen is stored in dbsubstring[vlen]. */
static void swtracebackDPedges(GtAlignment *alignment,
                               GtUword ulen,
                               const GtEncseq *encseq,
                               GtUword vlen,
                               GtUchar *dbsubstring,
                               GtUword startpos,
                               const Retracebits *edges)
{
  const Retracebits *eptr = edges + (ulen+1) * (vlen+1) - 1;

  for (;;)
  {
    if (*eptr & DELETIONBIT)
    {
      /* step to the previous entry; no database character involved */
      gt_alignment_add_deletion(alignment);
      eptr--;
      continue;
    }
    if (*eptr & REPLACEMENTBIT)
    {
      gt_alignment_add_replacement(alignment);
      eptr -= (ulen+2);
    } else if (*eptr & INSERTIONBIT)
    {
      gt_alignment_add_insertion(alignment);
      eptr -= (ulen+1);
    } else
    {
      break;
    }
    /* replacement or insertion: one database character was consumed */
    gt_assert(vlen > 0);
    vlen--;
    dbsubstring[vlen] = gt_encseq_get_encoded_char(encseq,
                                                   startpos + vlen,
                                                   GT_READMODE_FORWARD);
  }
}
Example #18
0
/* Returns true iff the match starting at dbstart in dbencseq and at
   queryoffset in query cannot be extended to the left: one of the two
   start positions is 0, or the character left of dbstart is special or
   differs from the character left of queryoffset. */
static bool gt_mum_isleftmaximal(const GtEncseq *dbencseq,
                                 GtReadmode readmode,
                                 GtUword dbstart,
                                 GtUword queryoffset,
                                 const GtUchar *query)
{
  GtUchar dbleftchar;

  if (dbstart == 0 || queryoffset == 0)
  {
    return true;
  }
  dbleftchar = gt_encseq_get_encoded_char(dbencseq, /* Random access */
                                          dbstart-1,
                                          readmode);
  return (ISSPECIAL(dbleftchar) || dbleftchar != query[queryoffset-1])
           ? true
           : false;
}
Example #19
0
/* Returns the character at relative position pos of the query described
   by queryrep. For every readmode other than GT_READMODE_FORWARD the
   position is mirrored via GT_REVERSEPOS. Complement readmodes are not
   implemented for queries given as plain sequence. */
static GtUchar gt_mmsearch_accessquery(const GtQueryrepresentation *queryrep,
                                       GtUword pos)
{
    GtUword abspos;

    gt_assert(queryrep != NULL);
    gt_assert(pos < queryrep->seqlen);
    if (queryrep->readmode == GT_READMODE_FORWARD)
    {
        abspos = queryrep->startpos + pos;
    } else
    {
        abspos = queryrep->startpos + GT_REVERSEPOS(queryrep->seqlen,pos);
    }
    if (queryrep->sequence == NULL)
    {
        gt_assert(queryrep->encseq != NULL);
        return gt_encseq_get_encoded_char(queryrep->encseq,abspos,
                                          GT_READMODE_FORWARD);
    }
    gt_assert(!GT_ISDIRCOMPLEMENT(queryrep->readmode)); /* not implemented */
    return queryrep->sequence[abspos];
}
Example #20
0
/* Returns true iff the match starting at dbstart in dbencseq and at the
   current offset of querysubstring cannot be extended to the left: one of
   the two start positions is 0, or the character left of dbstart is
   special or differs from the query character left of the current
   offset. */
static bool gt_mmsearch_isleftmaximal(const GtEncseq *dbencseq,
                                      GtReadmode readmode,
                                      GtUword dbstart,
                                      const GtQuerysubstring *querysubstring)
{
  GtUchar dbleftchar;

  if (dbstart == 0 || querysubstring->currentoffset == 0)
  {
    return true;
  }
  dbleftchar = gt_encseq_get_encoded_char(dbencseq, /* Random access */
                                          dbstart-1,
                                          readmode);
  if (ISSPECIAL(dbleftchar))
  {
    return true;
  }
  /* the query is only accessed if the database character is not special */
  if (dbleftchar != gt_mmsearch_accessquery(querysubstring->queryrep,
                                            querysubstring->currentoffset-1))
  {
    return true;
  }
  return false;
}
Example #21
0
/* Fills the edge matrix edges (stored column by column, ulen+1 entries per
   column) for the alignment of useq[0..ulen-1] against the characters of
   vencseq in [startpos,endpos). scol holds the scores of the current
   column. Every entry receives the bits of all edge types (deletion,
   replacement, insertion) that yield the maximal score of its cell. */
static void swmaximalDPedges(Retracebits *edges,
                             Scoretype *scol,
                             const Scorevalues *scorevalues,
                             const GtUchar *useq,
                             GtUword ulen,
                             const GtEncseq *vencseq,
                             GtUword startpos,
                             GtUword endpos)
{
  Scoretype candidate, west, northwest;
  GtUword row, col;
  Retracebits *eptr = edges;

  /* first column: origin without edge, then deletions only */
  *eptr++ = 0;
  scol[0] = 0;
  for (row = 1; row <= ulen; row++)
  {
    scol[row] = scol[row-1] + scorevalues->gapextend;
    *eptr++ = DELETIONBIT;
  }
  for (col = startpos; col < endpos; col++)
  {
    const GtUchar vcurrent = gt_encseq_get_encoded_char(vencseq,col,
                                                        GT_READMODE_FORWARD);

    gt_assert(vcurrent != (GtUchar) SEPARATOR);
    /* first row of the column: insertion only */
    northwest = scol[0];
    scol[0] = northwest + scorevalues->gapextend;
    *eptr++ = INSERTIONBIT;
    for (row = 1; row <= ulen; row++, eptr++)
    {
      gt_assert(useq[row-1] != (GtUchar) SEPARATOR);
      west = scol[row]; /* value of the previous column in this row */
      scol[row] = scol[row-1] + scorevalues->gapextend;
      *eptr = DELETIONBIT;
      candidate = northwest
                  + REPLACEMENTSCORE(scorevalues,useq[row-1],vcurrent);
      if (candidate > scol[row])
      {
        /* strictly better: replaces all edges recorded so far */
        *eptr = REPLACEMENTBIT;
        scol[row] = candidate;
      } else if (candidate == scol[row])
      {
        /* equally good: keep the previous edges as well */
        *eptr = *eptr | REPLACEMENTBIT;
      }
      candidate = west + scorevalues->gapextend;
      if (candidate > scol[row])
      {
        *eptr = INSERTIONBIT;
        scol[row] = candidate;
      } else if (candidate == scol[row])
      {
        *eptr = *eptr | INSERTIONBIT;
      }
      northwest = west;
    }
  }
}
Example #22
0
/* Verifies the BWT sequence index bwtSeq against the suffix array project
   projectName, which is mapped with its suffix table and encoded sequence.
   checkFlags selects the tests:
   - VERIFY_BWTSEQ_SUFVAL: compare all locate values of the index with the
     suffix table (requires locate information in the index),
   - VERIFY_BWTSEQ_LFMAPWALK: regenerate the whole sequence backwards by
     LF-mapping and compare it with the encoded sequence (requires a
     reversibly sorted index),
   - VERIFY_BWTSEQ_CONTEXT: compare randomly chosen substrings delivered by
     the context retriever with the encoded sequence.
   If tickPrint is not 0, a '.' is written to fp after every tickPrint
   positions of the suffix value check, so fp must be a valid stream then.
   Returns VERIFY_BWTSEQ_NO_ERROR on success; otherwise the code of the
   first failure is returned and err is set. */
enum verifyBWTSeqErrCode
gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq, const char *projectName,
                      int checkFlags,
                      GtUword tickPrint, FILE *fp,
                      GtLogger *verbosity, GtError *err)
{
  Suffixarray suffixArray;
  struct extBitsRetrieval extBits;
  bool suffixArrayIsInitialized = false, extBitsAreInitialized = false;
  enum verifyBWTSeqErrCode retval = VERIFY_BWTSEQ_NO_ERROR;
  /* do { ... } while (0) so that a break jumps to the common cleanup */
  do
  {
    GtUword seqLen;
    gt_assert(bwtSeq && projectName && err);
    gt_error_check(err);

    initExtBitsRetrieval(&extBits);
    extBitsAreInitialized = true;

    if (gt_mapsuffixarray(&suffixArray,
                       SARR_SUFTAB | SARR_ESQTAB, projectName, verbosity, err))
    {
      gt_error_set(err, "Cannot load reference suffix array project with"
                    " demand for suffix table file and encoded sequence"
                    " for project: %s", projectName);
      retval = VERIFY_BWTSEQ_REFLOAD_ERROR;
      break;
    }
    suffixArrayIsInitialized = true;
    /* the BWT sequence has one more symbol than the encoded sequence */
    seqLen = gt_encseq_total_length(suffixArray.encseq) + 1;
    if (BWTSeqLength(bwtSeq) != seqLen)
    {
      gt_error_set(err, "length mismatch for suffix array project %s and "
                "bwt sequence index", projectName);
      retval = VERIFY_BWTSEQ_LENCOMPARE_ERROR;
      break;
    }

    if (checkFlags & VERIFY_BWTSEQ_SUFVAL
        && BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword i;
      for (i = 0; i < seqLen && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
      {
        if (gt_BWTSeqPosHasLocateInfo(bwtSeq, i, &extBits))
        {
          GtUword sfxArrayValue = gt_BWTSeqLocateMatch(bwtSeq, i,
                                                             &extBits);
          if (sfxArrayValue != ESASUFFIXPTRGET(suffixArray.suftab,i))
          {
            gt_error_set(err, "Failed suffix array value comparison"
                          " at position "GT_WU": "GT_WU" != "GT_WU"",
                          i, sfxArrayValue,
                          ESASUFFIXPTRGET(suffixArray.suftab,i));
            retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
            break;
          }
        }
        if (tickPrint && !((i + 1) % tickPrint))
          putc('.', fp);
      }
      if (tickPrint)
        putc('\n', fp);
      if (retval != VERIFY_BWTSEQ_NO_ERROR)
        break;
    }
    else if (checkFlags & VERIFY_BWTSEQ_SUFVAL)
    {
      gt_error_set(err, "check of suffix array values was requested,"
                " but index contains no  locate information!");
      retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
      break;
    }
    else if (!(checkFlags & VERIFY_BWTSEQ_SUFVAL)
             && BWTSeqHasLocateInformation(bwtSeq))
    {
      fputs("Not checking suftab values.\n", stderr);
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword nextLocate = BWTSeqTerminatorPos(bwtSeq);
      if (suffixArray.longest.defined &&
          suffixArray.longest.valueunsignedlong != nextLocate)
      {
        gt_error_set(err, "terminator/0-rotation position mismatch "GT_WU""
                  " vs. "GT_WU"", suffixArray.longest.valueunsignedlong,
                  nextLocate);
        retval = VERIFY_BWTSEQ_TERMPOS_ERROR;
        break;
      }
      if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
          && (bwtSeq->featureToggles & BWTReversiblySorted))
      {
        GtUword i = seqLen;
        /* handle first symbol specially because the encseq
         * will not return the terminator symbol */
        {
          Symbol sym = BWTSeqGetSym(bwtSeq, nextLocate);
          if (sym != UNDEFBWTCHAR)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                      "%d vs. reference symbol %d", i - 1, (int)sym,
                      (int)UNDEFBWTCHAR);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          --i;
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        /* walk backwards through the sequence via LF-mapping */
        while (i > 0)
        {
          Symbol symRef =
                         gt_encseq_get_encoded_char(suffixArray.encseq,
                                                          --i,
                                                          suffixArray.readmode);
          Symbol symCmp = BWTSeqGetSym(bwtSeq, nextLocate);
          if (symCmp != symRef)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                      "%d vs. reference symbol %d", i, symCmp, symRef);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        if (retval != VERIFY_BWTSEQ_NO_ERROR)
          break;
      }
      else if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
               && !(bwtSeq->featureToggles & BWTReversiblySorted))
      {
        gt_error_set(err, "requested complete backwards regeneration in index"
                  " without regeneration capability");
        retval = VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR;
        break;
      }
    }
    if (checkFlags & VERIFY_BWTSEQ_CONTEXT)
    {
      BWTSeqContextRetriever *bwtSeqCR =
        gt_BWTSeqCRLoad(bwtSeq, projectName, CTX_MAP_ILOG_AUTOSIZE);
      if (!bwtSeqCR)
      {
        gt_error_set(err, "cannot load BWT sequence context access table"
                  " for project %s", projectName);
        retval = VERIFY_BWTSEQ_CONTEXT_LOADFAIL;
        break;
      }
      fputs("Checking context regeneration.\n", stderr);
      {
        GtUword i, start, subSeqLen,
          maxSubSeqLen = MIN(MAX(MIN_CONTEXT_LEN, seqLen/CONTEXT_FRACTION),
                             MAX_CONTEXT_LEN),
          numTries = MIN(MAX_NUM_CONTEXT_CHECKS,
                         MAX(2, seqLen/CONTEXT_INTERVAL));
        Symbol *contextBuf;
        GtEncseqReader *esr;

        /* A substring can never be longer than the sequence itself. Without
           this bound, seqLen - subSeqLen + 1 below wraps around or becomes
           0 for sequences shorter than the minimum context length. */
        if (maxSubSeqLen > seqLen)
          maxSubSeqLen = seqLen;
        contextBuf = gt_malloc(sizeof (Symbol) * MAX_CONTEXT_LEN);
        esr = gt_encseq_create_reader_with_readmode(suffixArray.encseq,
                                                    suffixArray.readmode,
                                                    0);
        for (i = 0; i < numTries && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
        {
          GtUword j, end, inSubSeqLen;
          subSeqLen = random()%maxSubSeqLen + 1;
          start = random()%(seqLen - subSeqLen + 1);
          end = start + subSeqLen;
          /* the last position of the BWT sequence is the terminator, which
             is not contained in the encoded sequence */
          inSubSeqLen = subSeqLen - ((end==seqLen)?1:0);
          gt_BWTSeqCRAccessSubseq(bwtSeqCR, start, subSeqLen, contextBuf);
          gt_encseq_reader_reinit_with_readmode(esr, suffixArray.encseq,
                                                suffixArray.readmode, start);
          for (j = 0; j < inSubSeqLen; ++j)
          {
            Symbol symRef = gt_encseq_reader_next_encoded_char(esr);
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                        "%d vs. reference symbol %d", start + j, (int)symCmp,
                        (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
          }
          /* Check the terminator position, but only if the comparison above
             succeeded: after a mismatch j still points into the regular
             part and a second, misleading error would be reported. */
          while (retval == VERIFY_BWTSEQ_NO_ERROR && j < subSeqLen)
          {
            Symbol symRef = UNDEFBWTCHAR;
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                        "%d vs. reference symbol %d", start + j, (int)symCmp,
                        (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
            ++j;
          }
        }
        if (retval == VERIFY_BWTSEQ_NO_ERROR)
          fputs("Context regeneration completed successfully.\n", stderr);
        gt_encseq_reader_delete(esr);
        gt_free(contextBuf);
      }
      gt_deleteBWTSeqCR(bwtSeqCR);
    }
  } while (0);
  /* common cleanup: release only what has been set up */
  if (suffixArrayIsInitialized) gt_freesuffixarray(&suffixArray);
  if (extBitsAreInitialized) destructExtBitsRetrieval(&extBits);
  return retval;
}
Example #23
0
/*
  Insert the suffix described by suffixinfo into the trie trierep,
  descending from the non-leaf node given by node.

  If the trie has no root yet, the suffix becomes the root (via
  mtrie_makeroot) and node is not inspected. Otherwise the function
  walks down from node. In each round it determines the character of the
  new suffix at the current depth (SEPARATOR if that position lies at or
  beyond the total length of the suffix's sequence) and asks hassuccessor
  for a matching child:
    - no matching child: a new leaf is created and linked in front of
      np.current;
    - matching child is a leaf: a new branching node is created above it;
    - matching child is an inner node: a new branching node is created
      only if the common prefix ends before the child's depth, otherwise
      the descent continues in that child.
  The newly created node is finally linked either as first child of the
  current node or as right sibling of np.previous.
*/
void gt_mergertrie_insertsuffix(Mergertrierep *trierep,
                             Mergertrienode *node,
                             Suffixinfo *suffixinfo)
{
  GtUword currentdepth, lcpvalue, totallength, succend;
  Mergertrienode *currentnode, *newleaf, *succ, *inserted = NULL;
  Encseqreadinfo *eri, *succeri;
  Nodepair np;
  GtUchar cc;
  bool succisleaf;

  /* empty trie: the suffix itself becomes the root */
  if (trierep->root == NULL)
  {
    trierep->root = mtrie_makeroot(trierep,suffixinfo);
    return;
  }
  eri = trierep->encseqreadinfo + suffixinfo->idx;
  gt_assert(!MTRIE_ISLEAF(node));
  currentnode = node;
  currentdepth = node->depth;
  totallength = gt_encseq_total_length(eri->encseqptr);
  for (;;)
  {
    /* position of the next character of the new suffix to be matched */
    const GtUword readpos = suffixinfo->startpos + currentdepth;

    if (readpos < totallength)
    {
      /* Random access */
      cc = gt_encseq_get_encoded_char(eri->encseqptr,readpos,eri->readmode);
    } else
    {
      cc = (GtUchar) SEPARATOR;
    }
    gt_assert(currentnode != NULL);
    gt_assert(!MTRIE_ISLEAF(currentnode));
    if (!hassuccessor(trierep,&np,currentdepth,currentnode,cc,
                      suffixinfo->idx))
    {
      /* no child for cc: hang a new leaf in front of np.current */
      newleaf = mtrie_makenewleaf(trierep,suffixinfo);
      newleaf->rightsibling = np.current;
      SHOWNODERELATIONS(newleaf);
      inserted = newleaf;
      break;
    }
    succ = np.current;
    succeri = trierep->encseqreadinfo + succ->suffixinfo.idx;
    succisleaf = MTRIE_ISLEAF(succ) ? true : false;
    /* last position of the successor's suffix taking part in the
       comparison: end of its sequence for a leaf, end of its edge label
       for an inner node */
    if (succisleaf)
    {
      succend = gt_encseq_total_length(succeri->encseqptr) - 1;
    } else
    {
      succend = succ->suffixinfo.startpos + succ->depth - 1;
    }
    /* comparison starts one position behind currentdepth; presumably
       hassuccessor already matched cc at currentdepth -- TODO confirm */
    lcpvalue = getlcp(eri->encseqptr,
                      eri->readmode,
                      readpos + 1,
                      totallength - 1,
                      succeri->encseqptr,
                      succeri->readmode,
                      succ->suffixinfo.startpos + currentdepth + 1,
                      succend);
    if (succisleaf || currentdepth + lcpvalue + 1 < succ->depth)
    {
      /* split above succ at the depth where the two suffixes diverge */
      inserted = mtrie_makenewbranch(trierep,
                                     suffixinfo,
                                     currentdepth + lcpvalue + 1,
                                     succ);
      break;
    }
    /* whole edge matched: continue the descent in the inner node */
    currentnode = succ;
    currentdepth = currentnode->depth;
  }
  /* link the new node into the child list of currentnode */
  if (np.previous == NULL)
  {
    SETFIRSTCHILD(currentnode,inserted);
    SHOWNODERELATIONS(currentnode);
  } else
  {
    np.previous->rightsibling = inserted;
    SHOWNODERELATIONS(np.previous);
  }
}
Example #24
0
/*
  Local-similarity dynamic programming between the sequence useq of
  length ulen and the positions startpos..endpos-1 of vencseq (read in
  forward direction), using a single column scol of ulen+1 cells.

  Every cell stores a similarity score together with the lengths lu and lv
  of the aligned parts of u and v that produced it. Scores dropping below
  zero are reset to zero with both lengths cleared. Whenever a cell
  exceeds the best score seen so far, maxentry receives the score, both
  lengths, and the start offsets derived from the current position minus
  the respective length (start2 is relative to startpos).

  scol must provide room for ulen+1 entries; it is completely
  overwritten. Gaps are scored with scorevalues->gapextend only.
*/
static void swlocalsimilarityregion(DPpoint *scol,
                                    DPregion *maxentry,
                                    const Scorevalues *scorevalues,
                                    const GtUchar *useq,
                                    GtUword ulen,
                                    const GtEncseq *vencseq,
                                    GtUword startpos,
                                    GtUword endpos)
{
  GtUword row, vpos;

  maxentry->similarity = 0;
  maxentry->len1 = 0;
  maxentry->len2 = 0;
  maxentry->start1 = 0;
  maxentry->start2 = 0;
  /* initial column: all cells empty */
  for (row = 0; row <= ulen; row++)
  {
    scol[row].similarity = 0;
    scol[row].lu = 0;
    scol[row].lv = 0;
  }
  for (vpos = startpos; vpos < endpos; vpos++)
  {
    const GtUchar vchar = gt_encseq_get_encoded_char(vencseq,vpos,
                                                     GT_READMODE_FORWARD);
    /* value of the previous column one row up (diagonal predecessor) */
    DPpoint diag = scol[0];

    gt_assert(vchar != (GtUchar) SEPARATOR);
    for (row = 1; row <= ulen; row++)
    {
      DPpoint *cell = scol + row;
      /* already holds the value of the current column */
      const DPpoint *above = cell - 1;
      /* value of the previous column in the same row */
      const DPpoint left = *cell;
      const GtUchar uchar = useq[row - 1];
      Scoretype candidate;

      gt_assert(uchar != (GtUchar) SEPARATOR);
      /* start with the gap that consumes a character of u only */
      cell->similarity = above->similarity + scorevalues->gapextend;
      cell->lu = above->lu + 1;
      cell->lv = above->lv;
      /* replacement of uchar by vchar */
      candidate = diag.similarity + REPLACEMENTSCORE(scorevalues,uchar,vchar);
      if (candidate > cell->similarity)
      {
        cell->similarity = candidate;
        cell->lu = diag.lu + 1;
        cell->lv = diag.lv + 1;
      }
      /* gap that consumes a character of v only */
      candidate = left.similarity + scorevalues->gapextend;
      if (candidate > cell->similarity)
      {
        cell->similarity = candidate;
        cell->lu = left.lu;
        cell->lv = left.lv + 1;
      }
      if (cell->similarity < 0)
      {
        /* local alignment: never continue from a negative score */
        cell->similarity = 0;
        cell->lu = 0;
        cell->lv = 0;
      } else if (cell->similarity > maxentry->similarity)
      {
        maxentry->similarity = cell->similarity;
        maxentry->len1 = cell->lu;
        maxentry->len2 = cell->lv;
        maxentry->start1 = (row - 1) - cell->lu + 1;
        maxentry->start2 = (vpos - startpos) - cell->lv + 1;
      }
      diag = left;
    }
  }
}
Example #25
0
/*
  Return a newly allocated (gt_malloc) array of numofchars counters,
  indexed by encoded character.

  For non-reversed read modes the function counts, for every special
  range not starting at position 0, the non-special character stored at
  range.start-1, plus the last character of the sequence if the sequence
  has no special suffix. For reversed read modes it counts instead the
  non-special character stored at range.end (if range.end lies inside the
  sequence), plus the character at position 0 if the sequence has no
  special prefix; these characters are read with the converted read mode
  (FORWARD if readmode is GT_READMODE_REVERSE, COMPL otherwise).

  The caller presumably owns the returned array and has to release it --
  TODO confirm against the callers.
*/
static GtUword *leftcontextofspecialchardist(unsigned int numofchars,
                                                   const GtEncseq *encseq,
                                                   GtReadmode readmode)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  const GtReadmode convertedreadmode = (readmode == GT_READMODE_REVERSE)
                                            ? GT_READMODE_FORWARD
                                            : GT_READMODE_COMPL;
  const bool reversed = GT_ISDIRREVERSE(readmode) ? true : false;
  GtUword *dist;
  unsigned int charidx = 0;
  GtUchar cc;

  dist = gt_malloc(sizeof (*dist) * numofchars);
  while (charidx < numofchars)
  {
    dist[charidx++] = 0;
  }
  if (gt_encseq_has_specialranges(encseq))
  {
    GtRange range;
    GtSpecialrangeiterator *sri = gt_specialrangeiterator_new(encseq,true);

    while (gt_specialrangeiterator_next(sri,&range))
    {
      if (reversed)
      {
        /* range at the very end of the sequence has no neighbour */
        if (range.end >= totallength)
        {
          continue;
        }
        cc = gt_encseq_get_encoded_char(encseq,range.end,convertedreadmode);
      } else
      {
        /* range at the very start of the sequence has no neighbour */
        if (range.start == 0)
        {
          continue;
        }
        cc = gt_encseq_get_encoded_char(encseq,range.start-1,readmode);
      }
      if (ISNOTSPECIAL(cc))
      {
        dist[cc]++;
      }
    }
    gt_specialrangeiterator_delete(sri);
  }
  /* the sequence boundary contributes one more character, unless the
     sequence begins (reversed) or ends (otherwise) with special
     characters */
  if (reversed && gt_encseq_lengthofspecialprefix(encseq) == 0)
  {
    cc = gt_encseq_get_encoded_char(encseq,0,convertedreadmode);
    gt_assert(ISNOTSPECIAL(cc));
    dist[cc]++;
  } else if (!reversed && gt_encseq_lengthofspecialsuffix(encseq) == 0)
  {
    cc = gt_encseq_get_encoded_char(encseq,totallength-1,readmode);
    gt_assert(ISNOTSPECIAL(cc));
    dist[cc]++;
  }
  return dist;
}