Ejemplo n.º 1
0
int gt_nextSequentialsuftabvalue(GtUword *currentsuffix,
                                 Sequentialsuffixarrayreader *ssar)
{
  gt_assert(ssar != NULL);
  if (ssar->scanfile)
  {
#if defined (_LP64) || defined (_WIN64)
    if (ssar->suffixarray->suftabstream_GtUword.fp != NULL)
    {
      return gt_readnextfromstream_GtUword(currentsuffix,
                                 &ssar->suffixarray->suftabstream_GtUword);
    } else
    {
      uint32_t readvalue = 0;
      int ret = gt_readnextfromstream_uint32_t(
                                 &readvalue,
                                 &ssar->suffixarray->suftabstream_uint32_t);
      *currentsuffix = (GtUword) readvalue;
      return ret;
    }
#else
     return gt_readnextfromstream_GtUword(currentsuffix,
                                      &ssar->suffixarray->suftabstream_GtUword);
#endif
  }
  if (ssar->scanfile)
  {
    *currentsuffix = ESASUFFIXPTRGET(ssar->suftab,ssar->nextsuftabindex++);
  } else
  {
    *currentsuffix = ESASUFFIXPTRGET(ssar->suffixarray->suftab,
                                     ssar->nextsuftabindex++);
  }
  return 1;
}
bool gt_lcpintervalfindcharchildintv_simple(const GtEncseq *encseq,
                                     GtReadmode readmode,
                                     GtUword totallength,
                                     const ESASuffixptr *suftab,
                                     Simplelcpinterval *itv,
                                     GtUchar cc,
                                     GtUword offset,
                                     GtUword left,
                                     GtUword right)
{
  GtUword pos;

  pos = ESASUFFIXPTRGET(suftab,left) + offset;
  if (cc < SEQUENCE(encseq,pos))
  {
    return false;
  }
  pos = ESASUFFIXPTRGET(suftab,right) + offset;
  if (cc > SEQUENCE(encseq,pos))
  {
    return false;
  }
  itv->left = lcpintervalfindfirst(encseq,
                                   readmode,
                                   totallength,
                                   suftab,
                                   cc,
                                   offset,
                                   left,
                                   right);
  if (itv->left == ULONG_MAX)
  {
    return false;
  }
  itv->right = lcpintervalfindlast(encseq,
                                   readmode,
                                   totallength,
                                   suftab,
                                   cc,
                                   offset,
                                   itv->left + 1,
                                   right);
  if (itv->right == ULONG_MAX)
  {
    itv->right = itv->left;
  }
  return true;
}
Ejemplo n.º 3
0
static unsigned long lcpintervalfindrightbound(const GtEncseq *encseq,
                                        GtReadmode readmode,
                                        unsigned long totallength,
                                        const ESASuffixptr *suftab,
                                        GtUchar cc,
                                        unsigned long offset,
                                        unsigned long left,
                                        unsigned long right)
{
  unsigned long pos, mid;
  GtUchar midcc;

  while (right > left+1)
  {
    mid = GT_DIV2(left+right);
    pos = ESASUFFIXPTRGET(suftab,mid) + offset;
    midcc = SEQUENCE(encseq,pos);
    if (cc < midcc)
    {
      right = mid;
    } else
    {
      left = mid;
    }
  }
  return left;
}
static GtUword lcpintervalfindlast(const GtEncseq *encseq,
                                         GtReadmode readmode,
                                         GtUword totallength,
                                         const ESASuffixptr *suftab,
                                         GtUchar cc,
                                         GtUword offset,
                                         GtUword left,
                                         GtUword right)
{
  GtUword found = ULONG_MAX;

  while (left <= right)
  {
    GtUword mid = left + GT_DIV2(right - left + 1);
    GtUword pos = ESASUFFIXPTRGET(suftab,mid) + offset;
    GtUchar midcc = SEQUENCE(encseq,pos);
    if (cc < midcc)
    {
      if (mid == 0)
      {
        break;
      }
      right = mid - 1;
    } else
    {
      if (cc == midcc)
      {
        found = mid;
      }
      left = mid + 1;
    }
  }
  return found;
}
Ejemplo n.º 5
0
bool gt_lcpintervalfindcharchildintv(const GtEncseq *encseq,
                                     GtReadmode readmode,
                                     unsigned long totallength,
                                     const ESASuffixptr *suftab,
                                     Simplelcpinterval *itv,
                                     GtUchar cc,
                                     unsigned long offset,
                                     unsigned long left,
                                     unsigned long right)
{
  GtUchar leftcc, rightcc;
  unsigned long pos, rightbound, leftbound = left;

  pos = ESASUFFIXPTRGET(suftab,right) + offset;
  rightcc = SEQUENCE(encseq,pos);
  while (true)
  {
    pos = ESASUFFIXPTRGET(suftab,leftbound) + offset;
    leftcc = SEQUENCE(encseq,pos);
    if (leftcc == rightcc)
    {
      break;
    }
    rightbound = lcpintervalfindrightbound(encseq,readmode,
                                           totallength,suftab,leftcc,
                                           offset,leftbound,right);
    if (leftcc == cc)
    {
      itv->left = leftbound;
      itv->right = rightbound;
      return true;
    }
    if (leftcc > cc)
    {
      return false;
    }
    leftbound = rightbound+1;
  }
  if (leftcc == cc)
  {
    itv->left = leftbound;
    itv->right = right;
    return true;
  }
  return false;
}
Ejemplo n.º 6
0
bool gt_mmsearchiterator_next(unsigned long *dbstart,GtMMsearchiterator *mmsi)
{
  if (mmsi->sufindex <= mmsi->lcpitv.right)
  {
    *dbstart = ESASUFFIXPTRGET(mmsi->suftab,mmsi->sufindex++);
    return true;
  }
  return false;
}
Ejemplo n.º 7
0
bool gt_mmsearchiterator_next(GtUword *dbstart,GtMMsearchiterator *mmsi)
{
  gt_assert(mmsi != NULL);
  if (mmsi->sufindex <= mmsi->lcpitv.right)
  {
    *dbstart = ESASUFFIXPTRGET(mmsi->suftab,mmsi->sufindex++);
    return true;
  }
  return false;
}
Ejemplo n.º 8
0
void gt_lcpintervalsplitwithoutspecial(GtArrayBoundswithchar *bwci,
                                       const GtEncseq *encseq,
                                       GtReadmode readmode,
                                       unsigned long totallength,
                                       const ESASuffixptr *suftab,
                                       unsigned long parentoffset,
                                       unsigned long parentleft,
                                       unsigned long parentright)
{
  GtUchar leftcc, rightcc;
  unsigned long rightbound = 0, leftbound = parentleft;

  /* call gt_lcpintervalextendlcp and verify if interval can be extended by
     some character */
  bwci->nextfreeBoundswithchar = 0;
  rightcc = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,parentright) + parentoffset);
  while (true)
  {
    leftcc = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,leftbound) + parentoffset);
    gt_assert(bwci->nextfreeBoundswithchar < bwci->allocatedBoundswithchar);
    if (ISSPECIAL(leftcc))
    {
      ADDPREVIOUSRBOUND(rightbound);
      ADDCURRENTLBOUND(rightbound+1);
      return;
    }
    ADDPREVIOUSRBOUND(leftbound-1);
    ADDCURRENTLBOUND(leftbound);
    ADDCURRENTINCHAR(leftcc);
    if (leftcc == rightcc)
    {
      break;
    }
    rightbound = lcpintervalfindrightbound(encseq,readmode,totallength,suftab,
                                           leftcc,parentoffset,
                                           leftbound,parentright);
    leftbound = rightbound+1;
  }
  gt_assert(bwci->nextfreeBoundswithchar < bwci->allocatedBoundswithchar);
  ADDPREVIOUSRBOUND(parentright);
  ADDCURRENTLBOUND(parentright+1);
}
Ejemplo n.º 9
0
GtUchar gt_lcpintervalextendlcp(const GtEncseq *encseq,
                                GtReadmode readmode,
                                const ESASuffixptr *suftab,
                                unsigned long totallength,
                                GtUchar alphasize,
                                unsigned long parentoffset,
                                unsigned long parentleft,
                                unsigned long parentright)
{
  GtUchar ccl, ccr;

  ccl = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,parentleft) + parentoffset);
  ccr = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,parentright) + parentoffset);
  if (ccl != ccr || ISSPECIAL(ccl))
  {
    return alphasize;
  }
  gt_assert(ccl < alphasize);
  return ccl;
}
Ejemplo n.º 10
0
static bool gt_mmsearch(const GtEncseq *dbencseq,
                        GtEncseqReader *esr,
                        const ESASuffixptr *suftab,
                        GtReadmode readmode,
                        Lcpinterval *lcpitv,
                        const GtQuerysubstring *querysubstring,
                        GtUword minmatchlength)
{
  GtUword left, leftsave, mid, right, lpref, rpref,
                totallength, lcplen, sidx;
  int retcode = 0;
  GtUchar currentdbchar, currentquerychar;

  totallength = gt_encseq_total_length(dbencseq);
  leftsave = left = lcpitv->left;
  right = lcpitv->right;
  lcplen = lcpitv->offset;
  GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,left),lcplen);
  if (retcode > 0)
  {
    lpref = lcplen;
    lcplen = lcpitv->offset;
    GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,right),lcplen);
    if (retcode > 0)
    {
      return false;
    } else
    {
      rpref = lcplen;
      while (right > left + 1)
      {
        mid = GT_DIV2(left+right);
        lcplen = MIN(lpref,rpref);
        GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,mid),lcplen);
        if (retcode <= 0)
        {
          right = mid;
          rpref = lcplen;
        } else
        {
          left = mid;
          lpref = lcplen;
        }
      }
      lcpitv->left = right;
    }
  }

  left = leftsave;
  right = lcpitv->right;
  lcplen = lcpitv->offset;
  GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,left),lcplen);
  if (retcode < 0)
  {
    return false;
  } else
  {
    lpref = lcplen;
    lcplen = lcpitv->offset;
    GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,right),lcplen);
    if (retcode >= 0)
    {
      lcpitv->right = right;
    } else
    {
      rpref = lcplen;
      while (right > left + 1)
      {
        mid = GT_DIV2(left+right);
        lcplen = MIN(lpref,rpref);
        GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,mid),lcplen);
        if (retcode >= 0)
        {
          left = mid;
          lpref = lcplen;
        } else
        {
          right = mid;
          rpref = lcplen;
        }
      }
      lcpitv->right = left;
    }
  }
  return true;
}
Ejemplo n.º 11
0
enum verifyBWTSeqErrCode
gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq, const char *projectName,
                      int checkFlags,
                      GtUword tickPrint, FILE *fp,
                      GtLogger *verbosity, GtError *err)
{
  Suffixarray suffixArray;
  struct extBitsRetrieval extBits;
  bool suffixArrayIsInitialized = false, extBitsAreInitialized = false;
  enum verifyBWTSeqErrCode retval = VERIFY_BWTSEQ_NO_ERROR;
  do
  {
    GtUword seqLen;
    gt_assert(bwtSeq && projectName && err);
    gt_error_check(err);

    initExtBitsRetrieval(&extBits);
    extBitsAreInitialized = true;

    if (gt_mapsuffixarray(&suffixArray,
                       SARR_SUFTAB | SARR_ESQTAB, projectName, verbosity, err))
    {
      gt_error_set(err, "Cannot load reference suffix array project with"
                    " demand for suffix table file and encoded sequence"
                    " for project: %s", projectName);
      retval = VERIFY_BWTSEQ_REFLOAD_ERROR;
      break;
    }
    suffixArrayIsInitialized = true;
    seqLen = gt_encseq_total_length(suffixArray.encseq) + 1;
    if (BWTSeqLength(bwtSeq) != seqLen)
    {
      gt_error_set(err, "length mismatch for suffix array project %s and "
                "bwt sequence index", projectName);
      retval = VERIFY_BWTSEQ_LENCOMPARE_ERROR;
      break;
    }

    if (checkFlags & VERIFY_BWTSEQ_SUFVAL
        && BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword i;
      for (i = 0; i < seqLen && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
      {
        if (gt_BWTSeqPosHasLocateInfo(bwtSeq, i, &extBits))
        {
          GtUword sfxArrayValue = gt_BWTSeqLocateMatch(bwtSeq, i,
                                                             &extBits);
          if (sfxArrayValue != ESASUFFIXPTRGET(suffixArray.suftab,i))
          {
            gt_error_set(err, "Failed suffix array value comparison"
                          " at position "GT_WU": "GT_WU" != "GT_WU"",
                          i, sfxArrayValue,
                          ESASUFFIXPTRGET(suffixArray.suftab,i));
            retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
            break;
          }
        }
        if (tickPrint && !((i + 1) % tickPrint))
          putc('.', fp);
      }
      if (tickPrint)
        putc('\n', fp);
      if (retval != VERIFY_BWTSEQ_NO_ERROR)
        break;
    }
    else if (checkFlags & VERIFY_BWTSEQ_SUFVAL)
    {
      gt_error_set(err, "check of suffix array values was requested,"
                " but index contains no  locate information!");
      retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
      break;
    }
    else if (!(checkFlags & VERIFY_BWTSEQ_SUFVAL)
             && BWTSeqHasLocateInformation(bwtSeq))
    {
      fputs("Not checking suftab values.\n", stderr);
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword nextLocate = BWTSeqTerminatorPos(bwtSeq);
      if (suffixArray.longest.defined &&
          suffixArray.longest.valueunsignedlong != nextLocate)
      {
        gt_error_set(err, "terminator/0-rotation position mismatch "GT_WU""
                  " vs. "GT_WU"", suffixArray.longest.valueunsignedlong,
                  nextLocate);
        retval = VERIFY_BWTSEQ_TERMPOS_ERROR;
        break;
      }
      if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
          && (bwtSeq->featureToggles & BWTReversiblySorted))
      {
        GtUword i = seqLen;
        /* handle first symbol specially because the encseq
         * will not return the terminator symbol */
        {
          Symbol sym = BWTSeqGetSym(bwtSeq, nextLocate);
          if (sym != UNDEFBWTCHAR)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                      "%d vs. reference symbol %d", i - 1, (int)sym,
                      (int)UNDEFBWTCHAR);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          --i;
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        while (i > 0)
        {
          Symbol symRef =
                         gt_encseq_get_encoded_char(suffixArray.encseq,
                                                          --i,
                                                          suffixArray.readmode);
          Symbol symCmp = BWTSeqGetSym(bwtSeq, nextLocate);
          if (symCmp != symRef)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                      "%d vs. reference symbol %d", i, symCmp, symRef);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        if (retval != VERIFY_BWTSEQ_NO_ERROR)
          break;
      }
      else if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
               && !(bwtSeq->featureToggles & BWTReversiblySorted))
      {
        gt_error_set(err, "requested complete backwards regeneration in index"
                  " without regeneration capability");
        retval = VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR;
        break;
      }
    }
    if (checkFlags & VERIFY_BWTSEQ_CONTEXT)
    {
      BWTSeqContextRetriever *bwtSeqCR =
        gt_BWTSeqCRLoad(bwtSeq, projectName, CTX_MAP_ILOG_AUTOSIZE);
      if (!bwtSeqCR)
      {
        gt_error_set(err, "cannot load BWT sequence context access table"
                  " for project %s", projectName);
        retval = VERIFY_BWTSEQ_CONTEXT_LOADFAIL;
        break;
      }
      fputs("Checking context regeneration.\n", stderr);
      {
        GtUword i, start, subSeqLen,
          maxSubSeqLen = MIN(MAX(MIN_CONTEXT_LEN, seqLen/CONTEXT_FRACTION),
                             MAX_CONTEXT_LEN),
          numTries = MIN(MAX_NUM_CONTEXT_CHECKS,
                         MAX(2, seqLen/CONTEXT_INTERVAL));
        Symbol *contextBuf = gt_malloc(sizeof (Symbol) * MAX_CONTEXT_LEN);
        GtEncseqReader *esr =
           gt_encseq_create_reader_with_readmode(suffixArray.encseq,
                                                 suffixArray.readmode,
                                                 0);
        for (i = 0; i < numTries && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
        {
          GtUword j, end, inSubSeqLen;
          subSeqLen = random()%maxSubSeqLen + 1;
          start = random()%(seqLen - subSeqLen + 1);
          end = start + subSeqLen;
          inSubSeqLen = subSeqLen - ((end==seqLen)?1:0);
          gt_BWTSeqCRAccessSubseq(bwtSeqCR, start, subSeqLen, contextBuf);
          gt_encseq_reader_reinit_with_readmode(esr, suffixArray.encseq,
                                                suffixArray.readmode, start);
          for (j = 0; j < inSubSeqLen; ++j)
          {
            Symbol symRef = gt_encseq_reader_next_encoded_char(esr);
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                        "%d vs. reference symbol %d", start + j, (int)symCmp,
                        (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
          }
          while (j < subSeqLen)
          {
            Symbol symRef = UNDEFBWTCHAR;
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                        "%d vs. reference symbol %d", start + j, (int)symCmp,
                        (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
            ++j;
          }
        }
        if (retval == VERIFY_BWTSEQ_NO_ERROR)
          fputs("Context regeneration completed successfully.\n", stderr);
        gt_encseq_reader_delete(esr);
        gt_free(contextBuf);
      }
      gt_deleteBWTSeqCR(bwtSeqCR);
    }
  } while (0);
  if (suffixArrayIsInitialized) gt_freesuffixarray(&suffixArray);
  if (extBitsAreInitialized) destructExtBitsRetrieval(&extBits);
  return retval;
}