/*
 * Yield the symbol at the iterator's current BWT position and advance the
 * iterator by one LF-mapping step.
 *
 * bound: output; receives the iterator position after the step.
 * bsci:  iterator state; its bound member is updated in place.
 *
 * Returns SEPARATOR when the current position equals the terminator
 * position of the underlying BWT sequence, otherwise the symbol obtained
 * from BWTSeqGetSym() for the current position.
 */
GtUchar gt_Bwtseqcontextiterator_next(GtUword *bound,
                                      Bwtseqcontextiterator *bsci)
{
  /* default covers the terminator position, which has no regular symbol */
  GtUchar cc = SEPARATOR;

  if (bsci->bound != BWTSeqTerminatorPos(bsci->bwtseq))
  {
    cc = BWTSeqGetSym(bsci->bwtseq, bsci->bound);
  }
  /* step the iterator, then report the new position to the caller */
  bsci->bound = BWTSeqLFMap(bsci->bwtseq, bsci->bound, &bsci->extBits);
  *bound = bsci->bound;
  return cc;
}
/*
 * Compute the located match position for BWT position pos.
 *
 * The function follows the LF mapping from pos until it reaches a position
 * for which a sampled value is stored, reads that sample and adds the
 * number of LF steps that were needed to get there.  If the index has the
 * BWTReversiblySorted feature, the stored sample is multiplied by
 * bwtSeq->locateSampleInterval before the step count is added.
 *
 * Which storage layout is read depends on bwtSeq->featureToggles:
 *  - BWTLocateBitmap: sampled positions are detected with
 *    BWTSeqPosHasLocateInfo(); the record index inside the block is the
 *    number of set bits in the constant-width part before the position.
 *  - BWTLocateCount (without bitmap): sampled positions are detected with
 *    searchLocateCountMark(), which yields the bit offset of the sample.
 * If neither flag is set the function calls abort().
 *
 * bwtSeq:  index to query
 * pos:     BWT position to locate
 * extBits: retrieval buffer handed to the lookup helpers
 */
extern Seqpos
BWTSeqLocateMatch(const BWTSeq *bwtSeq, Seqpos pos,
                  struct extBitsRetrieval *extBits)
{
  if (bwtSeq->featureToggles & BWTLocateBitmap)
  {
    Seqpos nextLocate = pos, maxPosVal, matchPos;
    unsigned stepsTaken = 0, bitsPerCount, bitsPerBWTPos, bitsPerOrigPos,
      bwtPosFieldBits, countFieldBits;
    BitOffset locateRecordIndex, locateRecordOffset;

    /* walk backwards until a sampled position is hit */
    for (; !BWTSeqPosHasLocateInfo(bwtSeq, nextLocate, extBits); ++stepsTaken)
      nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, extBits);

    EISRetrieveExtraBits(bwtSeq->seqIdx, nextLocate,
                         EBRF_RETRIEVE_CWBITS | EBRF_RETRIEVE_VARBITS,
                         extBits, bwtSeq->hint);

    /* largest value a stored sample can take determines its bit width */
    maxPosVal = (bwtSeq->featureToggles & BWTReversiblySorted)
      ? (BWTSeqLength(bwtSeq) - 1) / bwtSeq->locateSampleInterval
      : BWTSeqLength(bwtSeq) - 1;
    bitsPerCount = requiredSeqposBits(extBits->len);
    bitsPerBWTPos = requiredSeqposBits(extBits->len - 1);
    bitsPerOrigPos = requiredSeqposBits(maxPosVal);

    /* the count header and the per-record BWT position field are only
     * present when BWTLocateCount is enabled as well */
    if (bwtSeq->featureToggles & BWTLocateCount)
    {
      bwtPosFieldBits = bitsPerBWTPos;
      countFieldBits = bitsPerCount;
    }
    else
    {
      bwtPosFieldBits = 0;
      countFieldBits = 0;
    }

    locateRecordIndex = gt_bs1BitsCount(extBits->cwPart, extBits->cwOffset,
                                        nextLocate - extBits->start);
    locateRecordOffset =
      (bwtPosFieldBits + bitsPerOrigPos) * locateRecordIndex + countFieldBits;

    matchPos = gt_bsGetSeqpos(extBits->varPart,
                              extBits->varOffset + locateRecordOffset
                              + bwtPosFieldBits,
                              bitsPerOrigPos);
    if (bwtSeq->featureToggles & BWTReversiblySorted)
      matchPos *= bwtSeq->locateSampleInterval;
    matchPos += stepsTaken;

    /* with BWTLocateCount the record also stores the in-block BWT
     * position, which must agree with where the walk ended */
    gt_assert(!(bwtSeq->featureToggles & BWTLocateCount)
              || gt_bsGetSeqpos(extBits->varPart,
                                extBits->varOffset + locateRecordOffset,
                                bitsPerBWTPos) == nextLocate - extBits->start);
    return matchPos;
  }

  if (bwtSeq->featureToggles & BWTLocateCount)
  {
    BitOffset markOffset;
    Seqpos cursor = pos, matchPos;
    unsigned stepsTaken = 0;
    /* mark is at most locateInterval positions away */
    unsigned bitsPerOrigPos = requiredSeqposBits(
      (bwtSeq->featureToggles & BWTReversiblySorted)
      ? (BWTSeqLength(bwtSeq) - 1) / bwtSeq->locateSampleInterval
      : BWTSeqLength(bwtSeq) - 1);

    /* a zero offset means: no mark stored for this position, keep walking */
    for (;;)
    {
      markOffset = searchLocateCountMark(bwtSeq, cursor, extBits);
      if (markOffset != 0)
        break;
      cursor = BWTSeqLFMap(bwtSeq, cursor, extBits);
      ++stepsTaken;
    }

    matchPos = gt_bsGetSeqpos(extBits->varPart, markOffset, bitsPerOrigPos);
    if (bwtSeq->featureToggles & BWTReversiblySorted)
      matchPos *= bwtSeq->locateSampleInterval;
    matchPos += stepsTaken;
    return matchPos;
  }

  /* Internal error: Trying to locate in BWT sequence index without locate
   * information. */
  abort();
  return 0; /* shut up compiler */
}
/*
 * Verify a BWT sequence index against the suffix array project it is
 * supposed to represent.
 *
 * The reference project is mapped with its suffix table and encoded
 * sequence, then the following checks are run (first failure wins):
 *  - the index length must equal the encoded sequence length + 1
 *    (VERIFY_BWTSEQ_LENCOMPARE_ERROR);
 *  - if checkFlags has VERIFY_BWTSEQ_SUFVAL: every position that carries
 *    locate information must locate to the suffix table value
 *    (VERIFY_BWTSEQ_SUFVAL_ERROR, also returned if the index has no locate
 *    information at all);
 *  - if the index has locate information: the terminator position must
 *    match suffixArray.longest when that is defined
 *    (VERIFY_BWTSEQ_TERMPOS_ERROR);
 *  - if checkFlags has VERIFY_BWTSEQ_LFMAPWALK: the sequence is regenerated
 *    backwards via LF mapping and compared symbol by symbol
 *    (VERIFY_BWTSEQ_LFMAPWALK_ERROR; VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR if
 *    the index lacks the BWTReversiblySorted feature);
 *  - if checkFlags has VERIFY_BWTSEQ_CONTEXT: randomly chosen substrings
 *    are regenerated through the context retriever and compared
 *    (VERIFY_BWTSEQ_CONTEXT_LOADFAIL, VERIFY_BWTSEQ_CONTEXT_SYMFAIL).
 *
 * bwtSeq:      index under test
 * projectName: base name of the reference suffix array project
 * checkFlags:  bitwise OR of VERIFY_BWTSEQ_* check selectors
 * tickPrint:   if non-zero, a '.' is written to fp every tickPrint
 *              positions of the suffix value check, followed by a newline
 * fp:          stream for the progress ticks (used only if tickPrint != 0)
 * verbosity:   logger passed on to gt_mapsuffixarray()
 * err:         receives a description of the first failure
 *
 * Returns VERIFY_BWTSEQ_NO_ERROR if all requested checks passed, otherwise
 * the code of the failed check; err is set in that case.
 */
enum verifyBWTSeqErrCode
gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq, const char *projectName,
                         int checkFlags, GtUword tickPrint, FILE *fp,
                         GtLogger *verbosity, GtError *err)
{
  Suffixarray suffixArray;
  struct extBitsRetrieval extBits;
  bool suffixArrayIsInitialized = false, extBitsAreInitialized = false;
  enum verifyBWTSeqErrCode retval = VERIFY_BWTSEQ_NO_ERROR;

  /* single-pass loop: `break` jumps to the common cleanup below */
  do
  {
    GtUword seqLen;
    gt_assert(bwtSeq && projectName && err);
    gt_error_check(err);

    initExtBitsRetrieval(&extBits);
    extBitsAreInitialized = true;

    if (gt_mapsuffixarray(&suffixArray, SARR_SUFTAB | SARR_ESQTAB,
                          projectName, verbosity, err))
    {
      gt_error_set(err, "Cannot load reference suffix array project with"
                   " demand for suffix table file and encoded sequence"
                   " for project: %s", projectName);
      retval = VERIFY_BWTSEQ_REFLOAD_ERROR;
      break;
    }
    suffixArrayIsInitialized = true;

    /* +1 accounts for the terminator position of the index */
    seqLen = gt_encseq_total_length(suffixArray.encseq) + 1;
    if (BWTSeqLength(bwtSeq) != seqLen)
    {
      gt_error_set(err, "length mismatch for suffix array project %s and "
                   "bwt sequence index", projectName);
      retval = VERIFY_BWTSEQ_LENCOMPARE_ERROR;
      break;
    }

    /* ---- suffix array value check ---- */
    if (checkFlags & VERIFY_BWTSEQ_SUFVAL
        && BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword i;
      for (i = 0; i < seqLen && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
      {
        if (gt_BWTSeqPosHasLocateInfo(bwtSeq, i, &extBits))
        {
          GtUword sfxArrayValue = gt_BWTSeqLocateMatch(bwtSeq, i, &extBits);
          if (sfxArrayValue != ESASUFFIXPTRGET(suffixArray.suftab, i))
          {
            gt_error_set(err, "Failed suffix array value comparison"
                         " at position " GT_WU ": " GT_WU " != " GT_WU,
                         i, sfxArrayValue,
                         ESASUFFIXPTRGET(suffixArray.suftab, i));
            retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
            break;
          }
        }
        if (tickPrint && !((i + 1) % tickPrint))
          putc('.', fp);
      }
      if (tickPrint)
        putc('\n', fp);
      if (retval != VERIFY_BWTSEQ_NO_ERROR)
        break;
    }
    else if (checkFlags & VERIFY_BWTSEQ_SUFVAL)
    {
      gt_error_set(err, "check of suffix array values was requested,"
                   " but index contains no locate information!");
      retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
      break;
    }
    else if (BWTSeqHasLocateInformation(bwtSeq))
    {
      /* reaching this branch implies the SUFVAL flag is not set */
      fputs("Not checking suftab values.\n", stderr);
    }

    /* ---- terminator position and backward regeneration ---- */
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword nextLocate = BWTSeqTerminatorPos(bwtSeq);
      if (suffixArray.longest.defined
          && suffixArray.longest.valueunsignedlong != nextLocate)
      {
        gt_error_set(err, "terminator/0-rotation position mismatch " GT_WU
                     " vs. " GT_WU,
                     suffixArray.longest.valueunsignedlong, nextLocate);
        retval = VERIFY_BWTSEQ_TERMPOS_ERROR;
        break;
      }
      if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
          && (bwtSeq->featureToggles & BWTReversiblySorted))
      {
        GtUword i = seqLen;
        /* handle first symbol specially because the encseq
         * will not return the terminator symbol */
        {
          Symbol sym = BWTSeqGetSym(bwtSeq, nextLocate);
          if (sym != UNDEFBWTCHAR)
          {
            gt_error_set(err, "symbol mismatch at position " GT_WU ": "
                         "%d vs. reference symbol %d", i - 1, (int)sym,
                         (int)UNDEFBWTCHAR);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          --i;
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        /* remaining symbols are compared from the end of the reference
         * sequence towards its start */
        while (i > 0)
        {
          Symbol symRef = gt_encseq_get_encoded_char(suffixArray.encseq,
                                                     --i,
                                                     suffixArray.readmode);
          Symbol symCmp = BWTSeqGetSym(bwtSeq, nextLocate);
          if (symCmp != symRef)
          {
            /* explicit int casts keep the arguments in line with %d, as
             * in the other mismatch reports of this function */
            gt_error_set(err, "symbol mismatch at position " GT_WU ": "
                         "%d vs. reference symbol %d", i, (int)symCmp,
                         (int)symRef);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        if (retval != VERIFY_BWTSEQ_NO_ERROR)
          break;
      }
      else if (checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
      {
        /* reaching this branch implies BWTReversiblySorted is not set */
        gt_error_set(err, "requested complete backwards regeneration in index"
                     " without regeneration capability");
        retval = VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR;
        break;
      }
    }

    /* ---- context (substring) regeneration ---- */
    if (checkFlags & VERIFY_BWTSEQ_CONTEXT)
    {
      BWTSeqContextRetriever *bwtSeqCR =
        gt_BWTSeqCRLoad(bwtSeq, projectName, CTX_MAP_ILOG_AUTOSIZE);
      if (!bwtSeqCR)
      {
        gt_error_set(err, "cannot load BWT sequence context access table"
                     " for project %s", projectName);
        retval = VERIFY_BWTSEQ_CONTEXT_LOADFAIL;
        break;
      }
      fputs("Checking context regeneration.\n", stderr);
      {
        GtUword i, start, subSeqLen,
          maxSubSeqLen = MIN(MAX(MIN_CONTEXT_LEN, seqLen/CONTEXT_FRACTION),
                             MAX_CONTEXT_LEN),
          numTries = MIN(MAX_NUM_CONTEXT_CHECKS,
                         MAX(2, seqLen/CONTEXT_INTERVAL));
        Symbol *contextBuf = gt_malloc(sizeof (Symbol) * MAX_CONTEXT_LEN);
        GtEncseqReader *esr =
          gt_encseq_create_reader_with_readmode(suffixArray.encseq,
                                                suffixArray.readmode, 0);

        /* never ask for a substring longer than the sequence: otherwise
         * seqLen - subSeqLen + 1 below would wrap around (unsigned) and
         * yield an out-of-range start position */
        if (maxSubSeqLen > seqLen)
          maxSubSeqLen = seqLen;

        for (i = 0; i < numTries && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
        {
          GtUword j, end, inSubSeqLen;
          subSeqLen = random() % maxSubSeqLen + 1;
          start = random() % (seqLen - subSeqLen + 1);
          end = start + subSeqLen;
          /* the last position of the index holds the terminator, which
           * the encoded sequence reader does not deliver */
          inSubSeqLen = subSeqLen - ((end == seqLen) ? 1 : 0);
          gt_BWTSeqCRAccessSubseq(bwtSeqCR, start, subSeqLen, contextBuf);
          gt_encseq_reader_reinit_with_readmode(esr, suffixArray.encseq,
                                                suffixArray.readmode, start);
          for (j = 0; j < inSubSeqLen; ++j)
          {
            Symbol symRef = gt_encseq_reader_next_encoded_char(esr);
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position " GT_WU ": "
                           "%d vs. reference symbol %d", start + j,
                           (int)symCmp, (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
          }
          /* Terminator check for the part beyond the encoded sequence.
           * Skipped after a mismatch above: j would still point at the
           * failing regular symbol, and comparing that against
           * UNDEFBWTCHAR would replace the correct error message with a
           * misleading one. */
          while (retval == VERIFY_BWTSEQ_NO_ERROR && j < subSeqLen)
          {
            Symbol symRef = UNDEFBWTCHAR;
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position " GT_WU ": "
                           "%d vs. reference symbol %d", start + j,
                           (int)symCmp, (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
            ++j;
          }
        }
        if (retval == VERIFY_BWTSEQ_NO_ERROR)
          fputs("Context regeneration completed successfully.\n", stderr);
        gt_encseq_reader_delete(esr);
        gt_free(contextBuf);
      }
      gt_deleteBWTSeqCR(bwtSeqCR);
    }
  } while (0);

  /* common cleanup: release only what was actually set up */
  if (suffixArrayIsInitialized)
    gt_freesuffixarray(&suffixArray);
  if (extBitsAreInitialized)
    destructExtBitsRetrieval(&extBits);
  return retval;
}