/*
  Store the next suffix array value delivered by <ssar> in <*currentsuffix>.

  If <ssar->scanfile> is set, the value is read from the suftab stream of
  <ssar->suffixarray> and the return value of the stream reader is passed
  through unchanged. On 64-bit builds (_LP64 or _WIN64) the GtUword stream
  is used if its file pointer is set; otherwise a 32-bit value is read from
  the uint32_t stream and widened to GtUword.

  If <ssar->scanfile> is not set, the value is taken from the suftab of
  <ssar->suffixarray> at index <ssar->nextsuftabindex>, the index is
  advanced by one and 1 is returned.

  The previous version carried a second test of <ssar->scanfile> after the
  scanning branch. That test could never succeed, because the scanning branch
  returns on every path, so the dead alternative has been removed.
*/
int gt_nextSequentialsuftabvalue(GtUword *currentsuffix,
                                 Sequentialsuffixarrayreader *ssar)
{
  gt_assert(ssar != NULL);
  if (ssar->scanfile)
  {
#if defined (_LP64) || defined (_WIN64)
    if (ssar->suffixarray->suftabstream_GtUword.fp != NULL)
    {
      return gt_readnextfromstream_GtUword(currentsuffix,
                               &ssar->suffixarray->suftabstream_GtUword);
    } else
    {
      /* no GtUword stream available: read a 32-bit value and widen it */
      uint32_t readvalue = 0;
      int ret = gt_readnextfromstream_uint32_t(
                               &readvalue,
                               &ssar->suffixarray->suftabstream_uint32_t);

      *currentsuffix = (GtUword) readvalue;
      return ret;
    }
#else
    return gt_readnextfromstream_GtUword(currentsuffix,
                               &ssar->suffixarray->suftabstream_GtUword);
#endif
  }
  /* table is accessible directly: fetch the entry and advance the index */
  *currentsuffix = ESASUFFIXPTRGET(ssar->suffixarray->suftab,
                                   ssar->nextsuftabindex++);
  return 1;
}
/*
  Determine the part of the suffix table range [left,right] in which the
  character at distance <offset> from the suffix start equals <cc>, and
  store its boundaries in <itv>.

  Returns false without touching <itv> if <cc> is smaller than the character
  found for the suffix at <left> or larger than the one found for the suffix
  at <right>. Returns false as well if lcpintervalfindfirst() delivers
  ULONG_MAX; in that case <itv->left> has already been overwritten with that
  value. Otherwise <itv->left> and <itv->right> are set and true is returned.
  If lcpintervalfindlast() finds nothing in [itv->left + 1,right], the
  interval consists of the single index <itv->left>.

  NOTE(review): <readmode> and <totallength> are not referenced directly
  here; presumably the SEQUENCE macro picks them up by name - confirm against
  the macro definition.
*/
bool gt_lcpintervalfindcharchildintv_simple(const GtEncseq *encseq,
                                            GtReadmode readmode,
                                            GtUword totallength,
                                            const ESASuffixptr *suftab,
                                            Simplelcpinterval *itv,
                                            GtUchar cc,
                                            GtUword offset,
                                            GtUword left,
                                            GtUword right)
{
  GtUword seqpos;

  /* quick rejection: cc lies outside the range spanned by both borders */
  seqpos = ESASUFFIXPTRGET(suftab,left) + offset;
  if (cc < SEQUENCE(encseq,seqpos))
  {
    return false;
  }
  seqpos = ESASUFFIXPTRGET(suftab,right) + offset;
  if (cc > SEQUENCE(encseq,seqpos))
  {
    return false;
  }

  itv->left = lcpintervalfindfirst(encseq, readmode, totallength, suftab,
                                   cc, offset, left, right);
  if (itv->left == ULONG_MAX)
  {
    return false;
  }

  itv->right = lcpintervalfindlast(encseq, readmode, totallength, suftab,
                                   cc, offset, itv->left + 1, right);
  if (itv->right == ULONG_MAX)
  {
    /* no further occurrence behind the first one */
    itv->right = itv->left;
  }
  return true;
}
/*
  Binary search in the suffix table range [left,right]: repeatedly halve the
  range, moving <right> down whenever <cc> is smaller than the character at
  distance <offset> from the start of the probed suffix and moving <left> up
  otherwise. The search ends as soon as <right> <= <left> + 1 and the
  resulting <left> is returned.

  NOTE(review): <readmode> and <totallength> are only consumed indirectly,
  presumably by the SEQUENCE macro - confirm against the macro definition.
*/
static unsigned long lcpintervalfindrightbound(const GtEncseq *encseq,
                                               GtReadmode readmode,
                                               unsigned long totallength,
                                               const ESASuffixptr *suftab,
                                               GtUchar cc,
                                               unsigned long offset,
                                               unsigned long left,
                                               unsigned long right)
{
  while (right > left+1)
  {
    const unsigned long probe = GT_DIV2(left+right);
    const unsigned long seqpos = ESASUFFIXPTRGET(suftab,probe) + offset;
    const GtUchar probecc = SEQUENCE(encseq,seqpos);

    if (cc < probecc)
    {
      right = probe;
    } else
    {
      left = probe;
    }
  }
  return left;
}
/*
  Binary search in the suffix table range [left,right] for the largest index
  at which the character at distance <offset> from the suffix start equals
  <cc>. Returns that index, or ULONG_MAX if no probed position matched
  (in particular if <left> > <right> on entry).

  NOTE(review): <readmode> and <totallength> are only consumed indirectly,
  presumably by the SEQUENCE macro - confirm against the macro definition.
*/
static GtUword lcpintervalfindlast(const GtEncseq *encseq,
                                   GtReadmode readmode,
                                   GtUword totallength,
                                   const ESASuffixptr *suftab,
                                   GtUchar cc,
                                   GtUword offset,
                                   GtUword left,
                                   GtUword right)
{
  GtUword lastmatch = ULONG_MAX;

  while (left <= right)
  {
    /* midpoint is rounded up and computed without adding left and right */
    const GtUword probe = left + GT_DIV2(right - left + 1);
    const GtUword seqpos = ESASUFFIXPTRGET(suftab,probe) + offset;
    const GtUchar probecc = SEQUENCE(encseq,seqpos);

    if (cc >= probecc)
    {
      if (cc == probecc)
      {
        /* remember the hit and keep looking further to the right */
        lastmatch = probe;
      }
      left = probe + 1;
      continue;
    }
    if (probe == 0)
    {
      /* probe - 1 would wrap around */
      break;
    }
    right = probe - 1;
  }
  return lastmatch;
}
/*
  Walk over the suffix table range [left,right] group by group, where a group
  is delimited by lcpintervalfindrightbound() for the character found at
  distance <offset> from the start of the group's first suffix. If a group
  for character <cc> is found, its boundaries are stored in <itv> and true is
  returned. The walk stops with false as soon as a group with a character
  larger than <cc> is reached, or when the last group (the one whose
  character equals the character of the suffix at <right>) does not
  belong to <cc>.

  NOTE(review): <readmode> and <totallength> are presumably also consumed by
  the SEQUENCE macro - confirm against the macro definition.
*/
bool gt_lcpintervalfindcharchildintv(const GtEncseq *encseq,
                                     GtReadmode readmode,
                                     unsigned long totallength,
                                     const ESASuffixptr *suftab,
                                     Simplelcpinterval *itv,
                                     GtUchar cc,
                                     unsigned long offset,
                                     unsigned long left,
                                     unsigned long right)
{
  unsigned long seqpos, groupend, groupstart = left;
  GtUchar firstcc, lastcc;

  seqpos = ESASUFFIXPTRGET(suftab,right) + offset;
  lastcc = SEQUENCE(encseq,seqpos);
  for (;;)
  {
    seqpos = ESASUFFIXPTRGET(suftab,groupstart) + offset;
    firstcc = SEQUENCE(encseq,seqpos);
    if (firstcc == lastcc)
    {
      /* last group: it extends up to the right border of the range */
      if (firstcc != cc)
      {
        return false;
      }
      itv->left = groupstart;
      itv->right = right;
      return true;
    }
    groupend = lcpintervalfindrightbound(encseq,readmode,
                                         totallength,suftab,firstcc,
                                         offset,groupstart,right);
    if (firstcc == cc)
    {
      itv->left = groupstart;
      itv->right = groupend;
      return true;
    }
    if (firstcc > cc)
    {
      /* already beyond cc, so no group for cc was encountered */
      return false;
    }
    groupstart = groupend+1;
  }
}
/*
  Deliver the next suffix table entry of the iterator <mmsi>.

  As long as <mmsi->sufindex> has not passed <mmsi->lcpitv.right>, the
  suffix table entry at <mmsi->sufindex> is stored in <*dbstart>, the index
  is advanced by one and true is returned. Otherwise false is returned and
  <*dbstart> is left untouched.

  <mmsi> must not be NULL; this is now asserted, in line with the GtUword
  variant of this function.
*/
bool gt_mmsearchiterator_next(unsigned long *dbstart,GtMMsearchiterator *mmsi)
{
  gt_assert(mmsi != NULL);
  if (mmsi->sufindex <= mmsi->lcpitv.right)
  {
    *dbstart = ESASUFFIXPTRGET(mmsi->suftab,mmsi->sufindex++);
    return true;
  }
  return false;
}
/*
  Step the iterator <mmsi> (which must not be NULL). Returns false once
  <mmsi->sufindex> has moved past <mmsi->lcpitv.right>; <*dbstart> is then
  left untouched. Otherwise the suffix table entry at <mmsi->sufindex> is
  written to <*dbstart>, the index is incremented and true is returned.
*/
bool gt_mmsearchiterator_next(GtUword *dbstart,GtMMsearchiterator *mmsi)
{
  gt_assert(mmsi != NULL);
  if (mmsi->sufindex > mmsi->lcpitv.right)
  {
    /* interval exhausted */
    return false;
  }
  *dbstart = ESASUFFIXPTRGET(mmsi->suftab,mmsi->sufindex++);
  return true;
}
/*
  Split the suffix table range [parentleft,parentright] into groups
  according to the character found at distance <parentoffset> from the
  suffix start, and record the groups in <bwci> via the ADDPREVIOUSRBOUND,
  ADDCURRENTLBOUND and ADDCURRENTINCHAR macros. <bwci->nextfreeBoundswithchar>
  is reset to 0 first. The end of each group is determined with
  lcpintervalfindrightbound().

  Splitting stops when the first character of a group satisfies ISSPECIAL
  (the function then returns immediately after recording the bounds derived
  from the previous right bound), or after the group whose character equals
  the character of the suffix at <parentright> has been recorded; in the
  latter case a closing bound pair based on <parentright> is written.

  Before each group is recorded, and before the closing pair is written, it
  is asserted that <bwci> still has an allocated slot available.

  One should call gt_lcpintervalextendlcp and verify if the interval can be
  extended by some character.

  NOTE(review): the exact effect of the ADD* macros on <bwci> is defined
  outside this view; <readmode> and <totallength> are presumably also
  consumed by the SEQUENCE macro - confirm against the macro definitions.
*/
void gt_lcpintervalsplitwithoutspecial(GtArrayBoundswithchar *bwci,
                                       const GtEncseq *encseq,
                                       GtReadmode readmode,
                                       unsigned long totallength,
                                       const ESASuffixptr *suftab,
                                       unsigned long parentoffset,
                                       unsigned long parentleft,
                                       unsigned long parentright)
{
  unsigned long leftbound = parentleft,
                rightbound = 0;
  GtUchar leftcc,
          rightcc;
  bool lastgroup;

  bwci->nextfreeBoundswithchar = 0;
  rightcc = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,parentright) + parentoffset);
  do
  {
    leftcc = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,leftbound) + parentoffset);
    gt_assert(bwci->nextfreeBoundswithchar < bwci->allocatedBoundswithchar);
    if (ISSPECIAL(leftcc))
    {
      ADDPREVIOUSRBOUND(rightbound);
      ADDCURRENTLBOUND(rightbound+1);
      return;
    }
    ADDPREVIOUSRBOUND(leftbound-1);
    ADDCURRENTLBOUND(leftbound);
    ADDCURRENTINCHAR(leftcc);
    lastgroup = (leftcc == rightcc) ? true : false;
    if (!lastgroup)
    {
      /* skip to the first suffix behind the group just recorded */
      rightbound = lcpintervalfindrightbound(encseq,readmode,totallength,suftab,
                                             leftcc,parentoffset,
                                             leftbound,parentright);
      leftbound = rightbound+1;
    }
  } while (!lastgroup);
  gt_assert(bwci->nextfreeBoundswithchar < bwci->allocatedBoundswithchar);
  ADDPREVIOUSRBOUND(parentright);
  ADDCURRENTLBOUND(parentright+1);
}
/*
  Compare the characters found at distance <parentoffset> from the start of
  the suffixes at <parentleft> and <parentright>. If both are identical and
  do not satisfy ISSPECIAL, that character is returned (it is asserted to be
  smaller than <alphasize>). In all other cases <alphasize> is returned.

  NOTE(review): <readmode> and <totallength> are presumably consumed by the
  SEQUENCE macro - confirm against the macro definition.
*/
GtUchar gt_lcpintervalextendlcp(const GtEncseq *encseq,
                                GtReadmode readmode,
                                const ESASuffixptr *suftab,
                                unsigned long totallength,
                                GtUchar alphasize,
                                unsigned long parentoffset,
                                unsigned long parentleft,
                                unsigned long parentright)
{
  const GtUchar firstcc
    = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,parentleft) + parentoffset);
  const GtUchar lastcc
    = SEQUENCE(encseq,ESASUFFIXPTRGET(suftab,parentright) + parentoffset);

  if (firstcc == lastcc && !ISSPECIAL(firstcc))
  {
    gt_assert(firstcc < alphasize);
    return firstcc;
  }
  return alphasize;
}
/*
  Narrow the suffix table interval <lcpitv> by two binary searches that are
  both driven by the result <retcode> of GT_MMSEARCH_COMPARE. Returns true
  if the interval could be determined (with <lcpitv->left> and
  <lcpitv->right> updated in place) and false otherwise.

  Phase 1 (left boundary): the suffix at <lcpitv->left> is compared, starting
  with a common prefix length of <lcpitv->offset>. Only if this yields
  retcode > 0 the left boundary has to move: the suffix at <lcpitv->right> is
  compared, and if that also yields retcode > 0 the function returns false.
  Otherwise a binary search runs until right <= left + 1, moving <right> to
  the midpoint for retcode <= 0 and <left> otherwise; <lcpitv->left> is then
  set to the final <right>.

  Phase 2 (right boundary): the search restarts from the saved original left
  boundary <leftsave> and from <lcpitv->right>. If the comparison at the left
  border yields retcode < 0, false is returned. If the comparison at the
  right border yields retcode >= 0, <lcpitv->right> keeps its value.
  Otherwise a binary search runs until right <= left + 1, moving <left> to
  the midpoint for retcode >= 0 and <right> otherwise; <lcpitv->right> is
  then set to the final <left>.

  In both searches <lpref> and <rpref> hold the <lcplen> values obtained at
  the current left and right border, and every midpoint comparison starts
  from MIN(lpref,rpref).

  NOTE(review): GT_MMSEARCH_COMPARE is defined outside this view. It
  evidently assigns <retcode> and updates <lcplen>, and presumably also
  uses <dbencseq>, <esr>, <readmode>, <querysubstring>, <minmatchlength>,
  <totallength>, <sidx>, <currentdbchar> and <currentquerychar> by name,
  since none of these is referenced anywhere else in this function. Do not
  rename or remove them without checking the macro. The sign convention of
  <retcode> should be confirmed there as well.
*/
static bool gt_mmsearch(const GtEncseq *dbencseq, GtEncseqReader *esr, const ESASuffixptr *suftab, GtReadmode readmode, Lcpinterval *lcpitv, const GtQuerysubstring *querysubstring, GtUword minmatchlength) { GtUword left, leftsave, mid, right, lpref, rpref, totallength, lcplen, sidx; int retcode = 0; GtUchar currentdbchar, currentquerychar; totallength = gt_encseq_total_length(dbencseq); leftsave = left = lcpitv->left; right = lcpitv->right; lcplen = lcpitv->offset; GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,left),lcplen); if (retcode > 0) { lpref = lcplen; lcplen = lcpitv->offset; GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,right),lcplen); if (retcode > 0) { return false; } else { rpref = lcplen; while (right > left + 1) { mid = GT_DIV2(left+right); lcplen = MIN(lpref,rpref); GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,mid),lcplen); if (retcode <= 0) { right = mid; rpref = lcplen; } else { left = mid; lpref = lcplen; } } lcpitv->left = right; } } left = leftsave; right = lcpitv->right; lcplen = lcpitv->offset; GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,left),lcplen); if (retcode < 0) { return false; } else { lpref = lcplen; lcplen = lcpitv->offset; GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,right),lcplen); if (retcode >= 0) { lcpitv->right = right; } else { rpref = lcplen; while (right > left + 1) { mid = GT_DIV2(left+right); lcplen = MIN(lpref,rpref); GT_MMSEARCH_COMPARE(ESASUFFIXPTRGET(suftab,mid),lcplen); if (retcode >= 0) { left = mid; lpref = lcplen; } else { right = mid; rpref = lcplen; } } lcpitv->right = left; } } return true; }
/*
  Check the BWT sequence index <bwtSeq> against the reference suffix array
  project <projectName>. Returns VERIFY_BWTSEQ_NO_ERROR if all requested
  checks pass; otherwise the first failing check sets <err> and determines
  the returned error code. <bwtSeq>, <projectName> and <err> must not be
  NULL.

  Steps, in order:
  - The suffix array project is mapped with SARR_SUFTAB | SARR_ESQTAB;
    failure gives VERIFY_BWTSEQ_REFLOAD_ERROR.
  - The length of <bwtSeq> must equal the total length of the encoded
    sequence plus 1; otherwise VERIFY_BWTSEQ_LENCOMPARE_ERROR.
  - If VERIFY_BWTSEQ_SUFVAL is set in <checkFlags> and the index has locate
    information, every position carrying locate information is located and
    compared with the corresponding suffix table entry
    (VERIFY_BWTSEQ_SUFVAL_ERROR on mismatch). If <tickPrint> is non-zero, a
    '.' is written to <fp> every <tickPrint> positions, followed by a
    newline. If the flag is set but there is no locate information,
    VERIFY_BWTSEQ_SUFVAL_ERROR is returned. If the flag is not set but
    locate information exists, a note is written to stderr.
  - If the index has locate information, the terminator position must agree
    with <suffixArray.longest> when that is defined
    (VERIFY_BWTSEQ_TERMPOS_ERROR otherwise). If VERIFY_BWTSEQ_LFMAPWALK is
    set, the sequence is regenerated backwards via BWTSeqLFMap and compared
    symbol by symbol with the encoded sequence
    (VERIFY_BWTSEQ_LFMAPWALK_ERROR on mismatch); this requires the
    BWTReversiblySorted feature, otherwise
    VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR is returned.
  - If VERIFY_BWTSEQ_CONTEXT is set, the context retriever is loaded
    (VERIFY_BWTSEQ_CONTEXT_LOADFAIL on failure) and <numTries> substrings
    with start and length drawn from random() are regenerated and compared
    with the encoded sequence; positions at the end of the sequence are
    expected to hold UNDEFBWTCHAR (VERIFY_BWTSEQ_CONTEXT_SYMFAIL on
    mismatch). Progress messages of this step go to stderr.

  The do { ... } while (0) construct only serves as a block that can be left
  with break; the suffix array and the extBits structure are released after
  it if they were initialized.

  NOTE(review): in the context check, the break after a mismatch inside the
  for-loop over <inSubSeqLen> leaves only that loop, so the following
  while-loop still runs from the mismatching <j> and may call gt_error_set()
  a second time - confirm that this is intended.
  NOTE(review): the substrings depend on the state of random(); whether it
  is seeded elsewhere cannot be seen from here.
*/
enum verifyBWTSeqErrCode gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq, const char *projectName, int checkFlags, GtUword tickPrint, FILE *fp, GtLogger *verbosity, GtError *err) { Suffixarray suffixArray; struct extBitsRetrieval extBits; bool suffixArrayIsInitialized = false, extBitsAreInitialized = false; enum verifyBWTSeqErrCode retval = VERIFY_BWTSEQ_NO_ERROR; do { GtUword seqLen; gt_assert(bwtSeq && projectName && err); gt_error_check(err); initExtBitsRetrieval(&extBits); extBitsAreInitialized = true; if (gt_mapsuffixarray(&suffixArray, SARR_SUFTAB | SARR_ESQTAB, projectName, verbosity, err)) { gt_error_set(err, "Cannot load reference suffix array project with" " demand for suffix table file and encoded sequence" " for project: %s", projectName); retval = VERIFY_BWTSEQ_REFLOAD_ERROR; break; } suffixArrayIsInitialized = true; seqLen = gt_encseq_total_length(suffixArray.encseq) + 1; if (BWTSeqLength(bwtSeq) != seqLen) { gt_error_set(err, "length mismatch for suffix array project %s and " "bwt sequence index", projectName); retval = VERIFY_BWTSEQ_LENCOMPARE_ERROR; break; } if (checkFlags & VERIFY_BWTSEQ_SUFVAL && BWTSeqHasLocateInformation(bwtSeq)) { GtUword i; for (i = 0; i < seqLen && retval == VERIFY_BWTSEQ_NO_ERROR; ++i) { if (gt_BWTSeqPosHasLocateInfo(bwtSeq, i, &extBits)) { GtUword sfxArrayValue = gt_BWTSeqLocateMatch(bwtSeq, i, &extBits); if (sfxArrayValue != ESASUFFIXPTRGET(suffixArray.suftab,i)) { gt_error_set(err, "Failed suffix array value comparison" " at position "GT_WU": "GT_WU" != "GT_WU"", i, sfxArrayValue, ESASUFFIXPTRGET(suffixArray.suftab,i)); retval = VERIFY_BWTSEQ_SUFVAL_ERROR; break; } } if (tickPrint && !((i + 1) % tickPrint)) putc('.', fp); } if (tickPrint) putc('\n', fp); if (retval != VERIFY_BWTSEQ_NO_ERROR) break; } else if (checkFlags & VERIFY_BWTSEQ_SUFVAL) { gt_error_set(err, "check of suffix array values was requested," " but index contains no locate information!"); retval = VERIFY_BWTSEQ_SUFVAL_ERROR; break; } else if (!(checkFlags & 
VERIFY_BWTSEQ_SUFVAL) && BWTSeqHasLocateInformation(bwtSeq)) { fputs("Not checking suftab values.\n", stderr); } if (BWTSeqHasLocateInformation(bwtSeq)) { GtUword nextLocate = BWTSeqTerminatorPos(bwtSeq); if (suffixArray.longest.defined && suffixArray.longest.valueunsignedlong != nextLocate) { gt_error_set(err, "terminator/0-rotation position mismatch "GT_WU"" " vs. "GT_WU"", suffixArray.longest.valueunsignedlong, nextLocate); retval = VERIFY_BWTSEQ_TERMPOS_ERROR; break; } if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK) && (bwtSeq->featureToggles & BWTReversiblySorted)) { GtUword i = seqLen; /* handle first symbol specially because the encseq * will not return the terminator symbol */ { Symbol sym = BWTSeqGetSym(bwtSeq, nextLocate); if (sym != UNDEFBWTCHAR) { gt_error_set(err, "symbol mismatch at position "GT_WU": " "%d vs. reference symbol %d", i - 1, (int)sym, (int)UNDEFBWTCHAR); retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR; break; } --i; nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits); } while (i > 0) { Symbol symRef = gt_encseq_get_encoded_char(suffixArray.encseq, --i, suffixArray.readmode); Symbol symCmp = BWTSeqGetSym(bwtSeq, nextLocate); if (symCmp != symRef) { gt_error_set(err, "symbol mismatch at position "GT_WU": " "%d vs. 
reference symbol %d", i, symCmp, symRef); retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR; break; } nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits); } if (retval != VERIFY_BWTSEQ_NO_ERROR) break; } else if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK) && !(bwtSeq->featureToggles & BWTReversiblySorted)) { gt_error_set(err, "requested complete backwards regeneration in index" " without regeneration capability"); retval = VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR; break; } } if (checkFlags & VERIFY_BWTSEQ_CONTEXT) { BWTSeqContextRetriever *bwtSeqCR = gt_BWTSeqCRLoad(bwtSeq, projectName, CTX_MAP_ILOG_AUTOSIZE); if (!bwtSeqCR) { gt_error_set(err, "cannot load BWT sequence context access table" " for project %s", projectName); retval = VERIFY_BWTSEQ_CONTEXT_LOADFAIL; break; } fputs("Checking context regeneration.\n", stderr); { GtUword i, start, subSeqLen, maxSubSeqLen = MIN(MAX(MIN_CONTEXT_LEN, seqLen/CONTEXT_FRACTION), MAX_CONTEXT_LEN), numTries = MIN(MAX_NUM_CONTEXT_CHECKS, MAX(2, seqLen/CONTEXT_INTERVAL)); Symbol *contextBuf = gt_malloc(sizeof (Symbol) * MAX_CONTEXT_LEN); GtEncseqReader *esr = gt_encseq_create_reader_with_readmode(suffixArray.encseq, suffixArray.readmode, 0); for (i = 0; i < numTries && retval == VERIFY_BWTSEQ_NO_ERROR; ++i) { GtUword j, end, inSubSeqLen; subSeqLen = random()%maxSubSeqLen + 1; start = random()%(seqLen - subSeqLen + 1); end = start + subSeqLen; inSubSeqLen = subSeqLen - ((end==seqLen)?1:0); gt_BWTSeqCRAccessSubseq(bwtSeqCR, start, subSeqLen, contextBuf); gt_encseq_reader_reinit_with_readmode(esr, suffixArray.encseq, suffixArray.readmode, start); for (j = 0; j < inSubSeqLen; ++j) { Symbol symRef = gt_encseq_reader_next_encoded_char(esr); Symbol symCmp = contextBuf[j]; if (symCmp != symRef) { gt_error_set(err, "symbol mismatch at position "GT_WU": " "%d vs. 
reference symbol %d", start + j, (int)symCmp, (int)symRef); retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL; break; } } while (j < subSeqLen) { Symbol symRef = UNDEFBWTCHAR; Symbol symCmp = contextBuf[j]; if (symCmp != symRef) { gt_error_set(err, "symbol mismatch at position "GT_WU": " "%d vs. reference symbol %d", start + j, (int)symCmp, (int)symRef); retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL; break; } ++j; } } if (retval == VERIFY_BWTSEQ_NO_ERROR) fputs("Context regeneration completed successfully.\n", stderr); gt_encseq_reader_delete(esr); gt_free(contextBuf); } gt_deleteBWTSeqCR(bwtSeqCR); } } while (0); if (suffixArrayIsInitialized) gt_freesuffixarray(&suffixArray); if (extBitsAreInitialized) destructExtBitsRetrieval(&extBits); return retval; }