/*
 * Build the integer code of the q-gram of length kmersize that starts at
 * startpos in encseq (read in readmode).
 *
 * As soon as a position is at or beyond totallength, or holds a special
 * character, that position and every later one is filled with the value
 * numofchars-1; no further characters are fetched from encseq after that.
 * The first character initialises the code directly, every following one is
 * appended with the ADDNEXTCHAR macro.
 */
static GtCodetype qgram2codefillspecial(unsigned int numofchars,
                                        unsigned int kmersize,
                                        const GtEncseq *encseq,
                                        GtReadmode readmode,
                                        GtUword startpos,
                                        GtUword totallength)
{
  GtCodetype code = (GtCodetype) (numofchars - 1);
  const GtUword endpos = startpos + kmersize;
  GtUword idx;
  bool padding = true;
  GtUchar cc;

  if (startpos < totallength)
  {
    /* for testing */
    cc = gt_encseq_get_encoded_char(encseq,startpos,readmode);
    if (!ISSPECIAL(cc))
    {
      code = (GtCodetype) cc;
      padding = false;
    }
  }
  for (idx = startpos + 1; idx < endpos; idx++)
  {
    if (!padding)
    {
      if (idx < totallength)
      {
        /* for testing */
        cc = gt_encseq_get_encoded_char(encseq,idx,readmode);
        if (!ISSPECIAL(cc))
        {
          ADDNEXTCHAR(code,cc,numofchars);
          continue;
        }
      }
      /* end of sequence or special character: pad from here on */
      padding = true;
    }
    ADDNEXTCHAR(code,numofchars-1,numofchars);
  }
  return code;
}
/*
 * Walk the suffix sort space backwards, starting at idx and stopping once
 * idx has reached targetoffset[source].
 *
 * For every stored suffix start position greater than 0, the character to
 * its left is read. If that character is not special and its superbucket is
 * not yet marked as sorted, the position of the left neighbour is written to
 * slot targetoffset[cc] and that target offset is decremented.
 *
 * Note that targetoffset[source] itself may be decremented inside the loop
 * (when the left character equals source), so the loop bound is re-read on
 * every iteration. The "+ 1" on both sides of the comparison presumably
 * keeps the test valid when a value has wrapped around to the maximum
 * GtUword -- TODO confirm.
 */
static void backwardderive(const GtBucketspec2 *bucketspec2,
                           GtSuffixsortspace *suffixsortspace,
                           GtUword *targetoffset,
                           unsigned int source,
                           GtUword idx)
{
  while (idx + 1 > targetoffset[source] + 1)
  {
    const GtUword suffixstart
      = gt_suffixsortspace_getdirect(suffixsortspace,idx);

    if (suffixstart > 0)
    {
      const GtUchar leftchar
        = gt_encseq_get_encoded_char(bucketspec2->encseq,
                                     suffixstart-1,
                                     bucketspec2->readmode);

      if (ISNOTSPECIAL(leftchar) &&
          !bucketspec2->superbuckettab[leftchar].sorted)
      {
        gt_suffixsortspace_setdirect(suffixsortspace,targetoffset[leftchar],
                                     suffixstart - 1);
        targetoffset[leftchar]--;
      }
    }
    idx--;
  }
}
/*
 * Return the character at relative position pos of the sequence object.
 *
 * Three access paths are tried in this order:
 *  1. a two-bit encoding, addressed at startpos + pos (forward) or
 *     startpos - pos (otherwise);
 *  2. a sequential encseq reader backed by a cache that is extended in
 *     steps of 256 characters;
 *  3. direct random access into the encseq.
 *
 * Cache handling (path 2): cache_ptr is the cache buffer shifted by
 * cache_offset, so that it can be indexed directly with pos. When
 * min_access_pos is set (not GT_UWORD_MAX) and lies at least 256 positions
 * beyond cache_offset, the still needed part of the cache is moved to the
 * front of the buffer and cache_offset becomes min_access_pos. If pos has
 * not been read yet, up to 256 further characters (bounded by
 * substringlength) are pulled from the reader, growing the buffer by 256
 * bytes when necessary.
 */
static GtUchar sequenceobject_get_char(Sequenceobject *seq,GtUword pos)
{
  const GtUword addamount = 256UL;
  GtUword idx;

  if (seq->twobitencoding != NULL)
  {
    return gt_twobitencoding_char_at_pos(seq->twobitencoding,
                                         seq->forward ? seq->startpos + pos
                                                      : seq->startpos - pos);
  }
  if (seq->encseqreader == NULL)
  {
    /* no reader available: plain random access */
    gt_assert(seq->encseq != NULL);
    gt_assert(seq->forward || seq->startpos >= pos);
    return gt_encseq_get_encoded_char(seq->encseq,
                                      seq->forward ? seq->startpos + pos
                                                   : seq->startpos - pos,
                                      GT_READMODE_FORWARD);
  }
  if (seq->min_access_pos != GT_UWORD_MAX &&
      seq->min_access_pos >= seq->cache_offset + addamount)
  {
    /* drop the cache prefix in front of min_access_pos */
    const GtUword end = MIN(seq->cache_num_positions,seq->substringlength);
    GtUchar *base = (GtUchar *) seq->sequence_cache->space;

    for (idx = seq->min_access_pos; idx < end; idx++)
    {
      base[idx - seq->min_access_pos] = seq->cache_ptr[idx];
    }
    seq->cache_offset = seq->min_access_pos;
    seq->cache_ptr = ((GtUchar *) seq->sequence_cache->space)
                     - seq->cache_offset;
  }
  if (pos >= seq->cache_num_positions)
  {
    /* position not cached yet: read the next chunk from the reader */
    const GtUword tostore = MIN(seq->cache_num_positions + addamount,
                                seq->substringlength);

    if (tostore > seq->cache_offset + seq->sequence_cache->allocated)
    {
      seq->sequence_cache->allocated += addamount;
      seq->sequence_cache->space
        = gt_realloc(seq->sequence_cache->space,
                     sizeof (GtUchar) * seq->sequence_cache->allocated);
      /* the buffer may have moved, so rebase the shifted pointer */
      seq->cache_ptr = ((GtUchar *) seq->sequence_cache->space)
                       - seq->cache_offset;
    }
    gt_assert(pos >= seq->cache_offset);
    for (idx = seq->cache_num_positions; idx < tostore; idx++)
    {
      seq->cache_ptr[idx]
        = gt_encseq_reader_next_encoded_char(seq->encseqreader);
    }
    seq->cache_num_positions = tostore;
  }
  gt_assert(pos < seq->cache_offset + seq->sequence_cache->allocated);
  gt_assert(seq->cache_ptr != NULL);
  return seq->cache_ptr[pos];
}
/*
 * Starting with a match of the given length at startpos, extend it one
 * character at a time and record every reached length in distribution.
 *
 * The extension stops when the length exceeds maxmersize, when the last
 * position of the current window is at or beyond totallength, or when the
 * character at that position is special. Lengths below minmersize are
 * stepped over without being recorded. Nothing happens if length is already
 * larger than maxmersize.
 */
static void iteritvdistribution(GtArrayuint64_t *distribution,
                                const GtEncseq *encseq,
                                GtReadmode readmode,
                                unsigned long totallength,
                                unsigned long minmersize,
                                unsigned long maxmersize,
                                unsigned long length,
                                unsigned long startpos)
{
  unsigned long merlen, lastpos;

  if (length > (unsigned long) maxmersize)
  {
    return;
  }
  merlen = length;
  lastpos = startpos + length - 1;
  while (merlen <= (unsigned long) maxmersize &&
         lastpos < totallength &&
         ISNOTSPECIAL(gt_encseq_get_encoded_char(encseq,lastpos,readmode)))
  {
    if (merlen >= (unsigned long) minmersize)
    {
      adddistributionuint64_t(distribution,(unsigned long) merlen,1UL);
    }
    lastpos++;
    merlen++;
  }
}
/*
 * Consistency check: count the occurrences of the mer starting at position
 * with an mmsearch iterator over the suffix table and compare the result
 * with countocc.
 *
 * The mer of length dfsstate->mersize is first copied into
 * dfsstate->currentmer. On a mismatch, both counts are written to stderr
 * and the program exits with GT_EXIT_PROGRAMMING_ERROR.
 */
static void checknumberofoccurrences(const TyrDfsstate *dfsstate,
                                     GtUword countocc,
                                     GtUword position)
{
  GtMMsearchiterator *iterator;
  GtUword offset = 0, bruteforcecount;

  while (offset < dfsstate->mersize)
  {
    dfsstate->currentmer[offset]
      = gt_encseq_get_encoded_char(dfsstate->encseq,position+offset,
                                   dfsstate->readmode);
    offset++;
  }
  iterator = gt_mmsearchiterator_new_complete_plain(dfsstate->encseq,
                                                    dfsstate->suftab,
                                                    0,
                                                    dfsstate->totallength,
                                                    0,
                                                    dfsstate->readmode,
                                                    dfsstate->currentmer,
                                                    dfsstate->mersize);
  bruteforcecount = gt_mmsearchiterator_count(iterator);
  if (bruteforcecount != countocc)
  {
    fprintf(stderr,"bfcount = "GT_WU" != "GT_WU" = countocc\n",
            bruteforcecount,countocc);
    exit(GT_EXIT_PROGRAMMING_ERROR);
  }
  gt_mmsearchiterator_delete(iterator);
}
/*
 * Return the query character at relative position pos.
 *
 * For GT_READMODE_FORWARD the absolute position is startpos + pos; for
 * every other read mode it is startpos + GT_REVERSEPOS(seqlen,pos).
 * The character is taken from the plain sequence array if one is present,
 * otherwise from the encoded sequence (read forward). If the read mode is a
 * complement mode, non-special characters are complemented; special
 * characters are returned unchanged.
 *
 * NOTE(review): every non-forward read mode uses the reversed position,
 * which also covers a forward-complement mode -- confirm this is intended.
 */
static GtUchar gt_mmsearch_accessquery(const GtQueryrepresentation *queryrep,
                                       GtUword pos)
{
  GtUword abspos, cc;

  gt_assert(queryrep != NULL);
  gt_assert(pos < queryrep->seqlen);
  if (queryrep->readmode == GT_READMODE_FORWARD)
  {
    abspos = queryrep->startpos + pos;
  } else
  {
    abspos = queryrep->startpos + GT_REVERSEPOS(queryrep->seqlen,pos);
  }
  if (queryrep->sequence == NULL)
  {
    gt_assert(queryrep->encseq != NULL);
    cc = gt_encseq_get_encoded_char(queryrep->encseq,abspos,
                                    GT_READMODE_FORWARD);
  } else
  {
    cc = queryrep->sequence[abspos];
  }
  if (GT_ISDIRCOMPLEMENT(queryrep->readmode) && !ISSPECIAL(cc))
  {
    return GT_COMPLEMENTBASE(cc);
  }
  return cc;
}
/*
 * Return the encoded character at offset position within sequence number
 * index of bs, read in forward direction.
 *
 * index must be smaller than the number of sequences in the underlying
 * encoded sequence (asserted). position is added to the start position of
 * that sequence without a bounds check of its own.
 */
GtUchar gt_bioseq_get_encoded_char(const GtBioseq *bs, GtUword index,
                                   GtUword position)
{
  GtUword abspos;

  gt_assert(bs);
  gt_assert(index < gt_encseq_num_of_sequences(bs->encseq));
  abspos = gt_encseq_seqstartpos(bs->encseq, index) + position;
  return gt_encseq_get_encoded_char(bs->encseq, abspos,
                                    GT_READMODE_FORWARD);
}
/*
 * Compute the maximal local similarity score between useq[0..ulen-1] and
 * the part [startpos,endpos) of vencseq, using a single DP column.
 *
 * scol must provide ulen+1 entries; it is reset to 0 and then updated once
 * per character of the v-sequence. Cell values never drop below 0. Whenever
 * a cell exceeds the best score seen so far, maxpair receives the 1-based
 * end coordinates of that cell (umax in useq, vmax relative to startpos).
 * Gaps are scored with scorevalues->gapextend, aligned character pairs with
 * REPLACEMENTSCORE. The maximal score is returned.
 */
static Scoretype swlocalsimilarityscore(Scoretype *scol,
                                        Maxscorecoord *maxpair,
                                        const Scorevalues *scorevalues,
                                        const GtUchar *useq,
                                        GtUword ulen,
                                        const GtEncseq *vencseq,
                                        GtUword startpos,
                                        GtUword endpos)
{
  Scoretype val, we, nw, best, maximalscore = 0;
  GtUchar vcurrent;
  GtUword i, j;

  maxpair->umax = maxpair->vmax = 0;
  for (i = 0; i <= ulen; i++)
  {
    scol[i] = 0;
  }
  for (j = startpos; j < endpos; j++)
  {
    nw = 0;
    vcurrent = gt_encseq_get_encoded_char(vencseq,j,GT_READMODE_FORWARD);
    gt_assert(vcurrent != (GtUchar) SEPARATOR);
    for (i = 1; i <= ulen; i++)
    {
      gt_assert(useq[i-1] != (GtUchar) SEPARATOR);
      we = scol[i];                               /* value of previous column */
      best = scol[i-1] + scorevalues->gapextend;  /* gap coming from above */
      val = nw + REPLACEMENTSCORE(scorevalues,useq[i-1],vcurrent);
      if (val > best)
      {
        best = val;
      }
      val = we + scorevalues->gapextend;          /* gap coming from the left */
      if (val > best)
      {
        best = val;
      }
      if (best < 0)
      {
        best = 0;
      } else
      {
        if (best > maximalscore)
        {
          maximalscore = best;
          maxpair->umax = (GtUword) i;
          maxpair->vmax = (GtUword) (j - startpos + 1);
        }
      }
      scol[i] = best;
      nw = we;
    }
  }
  return maximalscore;
}
/*
 * Assertion-based check of a reverse match; does nothing for any read mode
 * other than GT_READMODE_REVERSE.
 *
 * pos2 is relative to sequence seqnum2 and is turned into an absolute
 * position first. The check then asserts that the len characters starting
 * at pos1 equal, read backwards, the len characters starting at pos2, and
 * that none of them is special. Finally it asserts that the match cannot be
 * extended: the character following the first instance and the character
 * preceding the second instance differ or are special (a missing neighbour
 * counts as SEPARATOR).
 */
static void verifymatch(const GtEncseq *encseq,
                        GtUword len,
                        GtUword pos1,
                        uint64_t seqnum2,
                        GtUword pos2,
                        GtReadmode readmode)
{
  GtUword k, totallength;
  GtUchar fwdchar, bwdchar;

  if (readmode != GT_READMODE_REVERSE)
  {
    return;
  }
  totallength = gt_encseq_total_length(encseq);
  pos2 += gt_encseq_seqstartpos(encseq, seqnum2);
  for (k = 0; k < len; k++)
  {
    gt_assert(pos1 + len - 1 < totallength);
    gt_assert(pos2 + len - 1 < totallength);
    fwdchar = gt_encseq_get_encoded_char(encseq,pos1+k,GT_READMODE_FORWARD);
    bwdchar = gt_encseq_get_encoded_char(encseq,pos2+len-1-k,
                                         GT_READMODE_FORWARD);
    gt_assert(fwdchar == bwdchar && ISNOTSPECIAL(fwdchar));
  }
  if (pos1 + len >= totallength)
  {
    fwdchar = SEPARATOR;
  } else
  {
    fwdchar = gt_encseq_get_encoded_char(encseq,pos1+len,
                                         GT_READMODE_FORWARD);
  }
  if (pos2 == 0)
  {
    bwdchar = SEPARATOR;
  } else
  {
    bwdchar = gt_encseq_get_encoded_char(encseq,pos2-1,GT_READMODE_FORWARD);
  }
  gt_assert(fwdchar != bwdchar || ISSPECIAL(fwdchar));
}
/*
 * Return the length of the longest common prefix of the two ranges
 * [start1,end1] of encseq1 and [start2,end2] of encseq2 (both bounds
 * inclusive, each sequence read in its own read mode).
 *
 * The comparison stops at the first position where the characters differ,
 * where the character of the first sequence is special, or where either
 * range is exhausted.
 */
static GtUword getlcp(const GtEncseq *encseq1,
                      GtReadmode readmode1,
                      GtUword start1,
                      GtUword end1,
                      const GtEncseq *encseq2,
                      GtReadmode readmode2,
                      GtUword start2,
                      GtUword end2)
{
  GtUword common = 0;

  while (start1 + common <= end1 && start2 + common <= end2)
  {
    const GtUchar first
      = gt_encseq_get_encoded_char(/*XXX*/ encseq1,start1 + common,
                                   readmode1);
    const GtUchar second
      = gt_encseq_get_encoded_char(/*XXX*/ encseq2,start2 + common,
                                   readmode2);

    if (first != second || ISSPECIAL(first))
    {
      break;
    }
    common++;
  }
  return common;
}
static void showmergertrie2(const Mergertrierep *trierep, const GtUchar *characters, unsigned int level, const Mergertrienode *node) { GtUchar cc = 0; GtUword pos, endpos; Mergertrienode *current; for (current = node->firstchild; current != NULL; current = current->rightsibling) { printf("%*.*s",(int) (6 * level),(int) (6 * level)," "); if (MTRIE_ISLEAF(current)) { endpos = gt_encseq_total_length( trierep->encseqtable[current->suffixinfo.idx]); } else { endpos = current->suffixinfo.startpos + current->depth; } for (pos = current->suffixinfo.startpos + node->depth; pos < endpos; pos++) { cc = gt_encseq_get_encoded_char( /* just for testing */ trierep->enseqreadinfo[current->suffixinfo.idx].encseqptr, pos, trierep->enseqreadinfo[current->suffixinfo.idx].readmode); if (ISSPECIAL(cc)) { printf("#\n"); break; } printf("%c",characters[(int) cc]); } if (MTRIE_ISLEAF(current)) { if (!ISSPECIAL(cc)) { printf("~\n"); } } else { printf(" d="GT_WU",i=" Formatuint64_t "\n", current->depth, PRINTuint64_tcast(current->suffixinfo.ident)); showmergertrie2(trierep,characters,level+1,current); } } }
/*
 * Lua binding: encseq:get_encoded_char(pos, readmode).
 *
 * Argument 1 is the encoded sequence, argument 2 the position and
 * argument 3 the read mode, both given as Lua numbers. Pushes the encoded
 * character at that position as a number and returns 1 (one result).
 *
 * Both numeric arguments come from a script and are validated before use:
 * the position must be non-negative and smaller than the total length, and
 * the read mode must lie in 0..3 (assumed to be the range of the GtReadmode
 * enumeration -- TODO confirm against its definition). The range checks are
 * done on the floating point values, because converting a negative or
 * out-of-range floating point value to an integer type is undefined.
 * A failed check raises a Lua argument error and does not return.
 */
static int encseq_lua_get_encoded_char(lua_State *L)
{
  GtEncseq **encseq;
  GtUword pos, totallength;
  double posarg, readmodearg;
  int readmode;
  unsigned char cc;

  encseq = check_encseq(L, 1);
  posarg = luaL_checknumber(L, 2);
  readmodearg = luaL_checknumber(L, 3);
  totallength = gt_encseq_total_length(*encseq);
  luaL_argcheck(L, posarg >= 0, 2, "must not be negative");
  luaL_argcheck(L, posarg < (double) totallength, 2,
                "cannot exceed total length of encoded sequence");
  pos = (GtUword) posarg;
  /* re-check after truncation to an integer position */
  luaL_argcheck(L, pos < totallength, 2,
                "cannot exceed total length of encoded sequence");
  luaL_argcheck(L, readmodearg >= 0 && readmodearg <= 3, 3,
                "invalid readmode value, must be between 0 and 3");
  readmode = (int) readmodearg;
  cc = gt_encseq_get_encoded_char(*encseq, pos, readmode);
  lua_pushnumber(L, cc);
  return 1;
}
/*
 * Return the character of node's suffix at offset prevdepth, i.e. the first
 * character on the edge entering node when its parent has depth prevdepth.
 *
 * For a leaf whose suffix ends before that offset, SEPARATOR is returned
 * instead of reading beyond the end of the sequence. The end-of-sequence
 * test is only made for leaves.
 */
static GtUchar getfirstedgechar(const Mergertrierep *trierep,
                                const Mergertrienode *node,
                                GtUword prevdepth)
{
  Encseqreadinfo *eri = trierep->encseqreadinfo + node->suffixinfo.idx;
  const GtUword charpos = node->suffixinfo.startpos + prevdepth;

  if (MTRIE_ISLEAF(node) &&
      charpos >= gt_encseq_total_length(eri->encseqptr))
  {
    return (GtUchar) SEPARATOR;
  }
  /* Random access */
  return gt_encseq_get_encoded_char(eri->encseqptr, charpos, eri->readmode);
}
static GtUchar gt_mmsearch_accessquery(const GtQueryrep *queryrep, unsigned long pos) { unsigned long abspos; gt_assert(queryrep != NULL && pos < queryrep->length); abspos = queryrep->startpos + pos; if (queryrep->sequence != NULL) { gt_assert(queryrep->readmode == GT_READMODE_FORWARD); return queryrep->sequence[abspos]; } else { gt_assert(queryrep->readmode != GT_READMODE_FORWARD && queryrep->encseq != NULL); return gt_encseq_get_encoded_char(queryrep->encseq,abspos, queryrep->readmode); } }
void gt_fprintfencseq(FILE *fpout, const GtEncseq *encseq, unsigned long start, unsigned long wlen) { unsigned long idx; GtUchar currentchar; const GtAlphabet *alpha; alpha = gt_encseq_alphabet(encseq); for (idx = start; idx < start + wlen; idx++) { currentchar = gt_encseq_get_encoded_char(encseq, idx, GT_READMODE_FORWARD); gt_assert(ISNOTSPECIAL(currentchar)); gt_alphabet_echo_pretty_symbol(alpha,fpout,currentchar); } }
/*
 * Create a new branching node of depth currentdepth that takes the place of
 * oldnode and has two children: oldnode and a new leaf for suffixinfo.
 *
 * The new branch inherits the right sibling of oldnode. The order of the
 * two children is decided by comparing the characters of both suffixes at
 * offset currentdepth (SEPARATOR if the new suffix ends before that
 * offset) via mtrie_comparecharacters. The caller is responsible for
 * linking the returned node into the trie.
 */
static Mergertrienode *mtrie_makenewbranch(Mergertrierep *trierep,
                                           Suffixinfo *suffixinfo,
                                           GtUword currentdepth,
                                           Mergertrienode *oldnode)
{
  Mergertrienode *branch, *leaf;
  GtUchar oldchar, newchar;
  Encseqreadinfo *eri = trierep->encseqreadinfo + suffixinfo->idx;

#ifdef WITHTRIEIDENT
#ifdef WITHTRIESHOW
  printf("makenewbranch(ident=" Formatuint64_t ")\n",
         PRINTuint64_tcast(suffixinfo->ident));
#endif
#endif
  branch = newMergertrienode(trierep);
  branch->suffixinfo = *suffixinfo;
  branch->rightsibling = oldnode->rightsibling;
  oldchar = getfirstedgechar(trierep,oldnode,currentdepth);
  if (suffixinfo->startpos + currentdepth <
      gt_encseq_total_length(eri->encseqptr))
  {
    newchar = gt_encseq_get_encoded_char(eri->encseqptr,
                                         suffixinfo->startpos + currentdepth,
                                         eri->readmode);
  } else
  {
    newchar = (GtUchar) SEPARATOR;
  }
  leaf = mtrie_makenewleaf(trierep,suffixinfo);
  if (mtrie_comparecharacters(oldchar,oldnode->suffixinfo.idx,
                              newchar,suffixinfo->idx) > 0)
  {
    makesuccs(branch,leaf,oldnode);
  } else
  {
    makesuccs(branch,oldnode,leaf);
  }
  branch->depth = currentdepth;
  return branch;
}
/*
 * Trace back through the edge matrix, starting at its last entry, and add
 * the corresponding edit operations to alignment.
 *
 * At each entry the bits are tested in the order deletion, replacement,
 * insertion; the traceback ends at the first entry with none of them set.
 * A deletion moves back by one entry, an insertion by ulen+1 entries and a
 * replacement by ulen+2 entries. For every replacement and insertion one
 * character of the database sequence is consumed: vlen is decremented and
 * the character at startpos + vlen is stored in dbsubstring[vlen].
 */
static void swtracebackDPedges(GtAlignment *alignment,
                               GtUword ulen,
                               const GtEncseq *encseq,
                               GtUword vlen,
                               GtUchar *dbsubstring,
                               GtUword startpos,
                               const Retracebits *edges)
{
  const Retracebits *eptr = edges + (ulen+1) * (vlen+1) - 1;

  for (;;)
  {
    if (*eptr & DELETIONBIT)
    {
      gt_alignment_add_deletion(alignment);
      eptr--;
      continue;
    }
    if (*eptr & REPLACEMENTBIT)
    {
      gt_alignment_add_replacement(alignment);
      eptr -= (ulen+2);
    } else if (*eptr & INSERTIONBIT)
    {
      gt_alignment_add_insertion(alignment);
      eptr -= (ulen+1);
    } else
    {
      break;
    }
    /* replacement or insertion: one database character was consumed */
    gt_assert(vlen > 0);
    vlen--;
    dbsubstring[vlen] = gt_encseq_get_encoded_char(encseq,
                                                   startpos + vlen,
                                                   GT_READMODE_FORWARD);
  }
}
/*
 * Decide whether a match starting at dbstart in the database and at
 * queryoffset in the query is left maximal.
 *
 * This is the case if either start is 0 (no left neighbour exists), if the
 * database character to the left is special, or if it differs from the
 * query character to the left.
 */
static bool gt_mum_isleftmaximal(const GtEncseq *dbencseq,
                                 GtReadmode readmode,
                                 GtUword dbstart,
                                 GtUword queryoffset,
                                 const GtUchar *query)
{
  if (dbstart > 0 && queryoffset > 0)
  {
    /* Random access */
    const GtUchar dbleftchar
      = gt_encseq_get_encoded_char(dbencseq, dbstart-1, readmode);

    if (!ISSPECIAL(dbleftchar) && dbleftchar == query[queryoffset-1])
    {
      return false;
    }
  }
  return true;
}
/*
 * Return the query character at relative position pos.
 *
 * For GT_READMODE_FORWARD the absolute position is startpos + pos; for
 * every other read mode it is startpos + GT_REVERSEPOS(seqlen,pos).
 * A query given as a plain sequence array is indexed directly; complement
 * read modes are not implemented for it (asserted). Otherwise the character
 * is read from the encoded sequence in forward direction.
 *
 * NOTE(review): the encseq branch returns the character as stored, without
 * complementing it, even if the read mode is a complement mode -- confirm
 * that callers do not rely on complemented characters here.
 */
static GtUchar gt_mmsearch_accessquery(const GtQueryrepresentation *queryrep,
                                       GtUword pos)
{
  GtUword abspos;

  gt_assert(queryrep != NULL);
  gt_assert(pos < queryrep->seqlen);
  abspos = queryrep->startpos;
  if (queryrep->readmode == GT_READMODE_FORWARD)
  {
    abspos += pos;
  } else
  {
    abspos += GT_REVERSEPOS(queryrep->seqlen,pos);
  }
  if (queryrep->sequence == NULL)
  {
    gt_assert(queryrep->encseq != NULL);
    return gt_encseq_get_encoded_char(queryrep->encseq,abspos,
                                      GT_READMODE_FORWARD);
  }
  gt_assert(!GT_ISDIRCOMPLEMENT(queryrep->readmode)); /* not implemented */
  return queryrep->sequence[abspos];
}
/*
 * Decide whether a match starting at dbstart in the database and at the
 * current offset of querysubstring is left maximal.
 *
 * This is the case if either start is 0 (no left neighbour exists), if the
 * database character to the left is special, or if it differs from the
 * query character to the left (obtained via gt_mmsearch_accessquery).
 */
static bool gt_mmsearch_isleftmaximal(const GtEncseq *dbencseq,
                                      GtReadmode readmode,
                                      GtUword dbstart,
                                      const GtQuerysubstring *querysubstring)
{
  if (dbstart > 0 && querysubstring->currentoffset > 0)
  {
    /* Random access */
    const GtUchar dbleftchar
      = gt_encseq_get_encoded_char(dbencseq, dbstart-1, readmode);

    if (!ISSPECIAL(dbleftchar) &&
        dbleftchar == gt_mmsearch_accessquery(querysubstring->queryrep,
                                              querysubstring->currentoffset-1))
    {
      return false;
    }
  }
  return true;
}
/*
 * Fill the edge matrix used for the traceback of an alignment between
 * useq[0..ulen-1] and the part [startpos,endpos) of vencseq.
 *
 * edges is stored column by column with ulen+1 entries per column: one
 * initial column followed by one column per character of the v-sequence.
 * scol (ulen+1 entries) holds the scores of the current column. For each
 * cell the best of the three predecessors is chosen and the matching bit is
 * stored; if several predecessors yield the same best score, their bits are
 * combined. Unlike the local variant, scores are not clipped at 0 here.
 */
static void swmaximalDPedges(Retracebits *edges,
                             Scoretype *scol,
                             const Scorevalues *scorevalues,
                             const GtUchar *useq,
                             GtUword ulen,
                             const GtEncseq *vencseq,
                             GtUword startpos,
                             GtUword endpos)
{
  Scoretype val, we, nw, score;
  Retracebits bits, *column;
  GtUchar vcurrent;
  GtUword i, j;

  /* initial column: only deletions are possible */
  scol[0] = 0;
  edges[0] = 0;
  for (i = 1; i <= ulen; i++)
  {
    scol[i] = scol[i-1] + scorevalues->gapextend;
    edges[i] = DELETIONBIT;
  }
  column = edges + ulen + 1;
  for (j = startpos; j < endpos; j++, column += ulen + 1)
  {
    vcurrent = gt_encseq_get_encoded_char(vencseq,j,GT_READMODE_FORWARD);
    gt_assert(vcurrent != (GtUchar) SEPARATOR);
    /* first row: only insertions are possible */
    nw = scol[0];
    scol[0] = nw + scorevalues->gapextend;
    column[0] = INSERTIONBIT;
    for (i = 1; i <= ulen; i++)
    {
      gt_assert(useq[i-1] != (GtUchar) SEPARATOR);
      we = scol[i];
      score = scol[i-1] + scorevalues->gapextend;
      bits = DELETIONBIT;
      val = nw + REPLACEMENTSCORE(scorevalues,useq[i-1],vcurrent);
      if (val >= score)
      {
        if (val == score)
        {
          bits = bits | REPLACEMENTBIT;
        } else
        {
          bits = REPLACEMENTBIT;
        }
        score = val;
      }
      val = we + scorevalues->gapextend;
      if (val >= score)
      {
        if (val == score)
        {
          bits = bits | INSERTIONBIT;
        } else
        {
          bits = INSERTIONBIT;
        }
        score = val;
      }
      scol[i] = score;
      column[i] = bits;
      nw = we;
    }
  }
}
/*
 * Verify a BWT sequence index against the suffix array project it was
 * built from.
 *
 * The reference project projectName is mapped with its suffix table and
 * encoded sequence, and the total lengths are compared. Depending on
 * checkFlags the following checks are performed:
 *  - VERIFY_BWTSEQ_SUFVAL:    every position with locate information must
 *                             yield the value stored in the suffix table;
 *  - VERIFY_BWTSEQ_LFMAPWALK: the sequence regenerated backwards via LF
 *                             mapping from the terminator position must
 *                             equal the encoded sequence (requires a
 *                             reversibly sorted index);
 *  - VERIFY_BWTSEQ_CONTEXT:   randomly chosen substrings retrieved through
 *                             the context retriever must equal the encoded
 *                             sequence.
 * If the index has locate information, the terminator position is compared
 * with the position of the longest suffix whenever the latter is defined.
 *
 * bwtSeq, projectName and err must not be NULL. tickPrint, if non-zero,
 * makes the suffix value check write a '.' to fp every tickPrint positions
 * followed by a newline. verbosity is passed on to the suffix array mapping.
 *
 * Returns VERIFY_BWTSEQ_NO_ERROR on success; otherwise the code of the
 * first failed check, with a description stored in err. All resources
 * acquired here are released before returning.
 */
enum verifyBWTSeqErrCode
gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq, const char *projectName,
                         int checkFlags, GtUword tickPrint, FILE *fp,
                         GtLogger *verbosity, GtError *err)
{
  Suffixarray suffixArray;
  struct extBitsRetrieval extBits;
  bool suffixArrayIsInitialized = false, extBitsAreInitialized = false;
  enum verifyBWTSeqErrCode retval = VERIFY_BWTSEQ_NO_ERROR;
  /* single-pass loop: "break" jumps to the common cleanup code below */
  do
  {
    GtUword seqLen;
    gt_assert(bwtSeq && projectName && err);
    gt_error_check(err);

    initExtBitsRetrieval(&extBits);
    extBitsAreInitialized = true;

    if (gt_mapsuffixarray(&suffixArray,
                          SARR_SUFTAB | SARR_ESQTAB,
                          projectName,
                          verbosity,
                          err))
    {
      gt_error_set(err, "Cannot load reference suffix array project with"
                   " demand for suffix table file and encoded sequence"
                   " for project: %s", projectName);
      retval = VERIFY_BWTSEQ_REFLOAD_ERROR;
      break;
    }
    suffixArrayIsInitialized = true;
    /* the BWT sequence contains one additional terminator symbol */
    seqLen = gt_encseq_total_length(suffixArray.encseq) + 1;
    if (BWTSeqLength(bwtSeq) != seqLen)
    {
      gt_error_set(err, "length mismatch for suffix array project %s and "
                   "bwt sequence index", projectName);
      retval = VERIFY_BWTSEQ_LENCOMPARE_ERROR;
      break;
    }

    if (checkFlags & VERIFY_BWTSEQ_SUFVAL
        && BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword i;
      for (i = 0; i < seqLen && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
      {
        if (gt_BWTSeqPosHasLocateInfo(bwtSeq, i, &extBits))
        {
          GtUword sfxArrayValue = gt_BWTSeqLocateMatch(bwtSeq, i, &extBits);
          if (sfxArrayValue != ESASUFFIXPTRGET(suffixArray.suftab,i))
          {
            gt_error_set(err, "Failed suffix array value comparison"
                         " at position "GT_WU": "GT_WU" != "GT_WU"",
                         i, sfxArrayValue,
                         ESASUFFIXPTRGET(suffixArray.suftab,i));
            retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
            break;
          }
        }
        if (tickPrint && !((i + 1) % tickPrint))
          putc('.', fp);
      }
      if (tickPrint)
        putc('\n', fp);
      if (retval != VERIFY_BWTSEQ_NO_ERROR)
        break;
    }
    else if (checkFlags & VERIFY_BWTSEQ_SUFVAL)
    {
      gt_error_set(err, "check of suffix array values was requested,"
                   " but index contains no locate information!");
      retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
      break;
    }
    else if (!(checkFlags & VERIFY_BWTSEQ_SUFVAL)
             && BWTSeqHasLocateInformation(bwtSeq))
    {
      fputs("Not checking suftab values.\n", stderr);
    }
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword nextLocate = BWTSeqTerminatorPos(bwtSeq);
      if (suffixArray.longest.defined &&
          suffixArray.longest.valueunsignedlong != nextLocate)
      {
        gt_error_set(err, "terminator/0-rotation position mismatch "GT_WU""
                     " vs. "GT_WU"", suffixArray.longest.valueunsignedlong,
                     nextLocate);
        retval = VERIFY_BWTSEQ_TERMPOS_ERROR;
        break;
      }
      if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
          && (bwtSeq->featureToggles & BWTReversiblySorted))
      {
        GtUword i = seqLen;
        /* handle first symbol specially because the encseq
         * will not return the terminator symbol */
        {
          Symbol sym = BWTSeqGetSym(bwtSeq, nextLocate);
          if (sym != UNDEFBWTCHAR)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                         "%d vs. reference symbol %d", i - 1, (int)sym,
                         (int)UNDEFBWTCHAR);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          --i;
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        while (i > 0)
        {
          Symbol symRef = gt_encseq_get_encoded_char(suffixArray.encseq,
                                                     --i,
                                                     suffixArray.readmode);
          Symbol symCmp = BWTSeqGetSym(bwtSeq, nextLocate);
          if (symCmp != symRef)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                         "%d vs. reference symbol %d", i, (int)symCmp,
                         (int)symRef);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        if (retval != VERIFY_BWTSEQ_NO_ERROR)
          break;
      }
      else if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
               && !(bwtSeq->featureToggles & BWTReversiblySorted))
      {
        gt_error_set(err, "requested complete backwards regeneration in index"
                     " without regeneration capability");
        retval = VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR;
        break;
      }
    }
    if (checkFlags & VERIFY_BWTSEQ_CONTEXT)
    {
      BWTSeqContextRetriever *bwtSeqCR =
        gt_BWTSeqCRLoad(bwtSeq, projectName, CTX_MAP_ILOG_AUTOSIZE);
      if (!bwtSeqCR)
      {
        gt_error_set(err, "cannot load BWT sequence context access table"
                     " for project %s", projectName);
        retval = VERIFY_BWTSEQ_CONTEXT_LOADFAIL;
        break;
      }
      fputs("Checking context regeneration.\n", stderr);
      {
        GtUword i, start, subSeqLen,
          maxSubSeqLen = MIN(MAX(MIN_CONTEXT_LEN, seqLen/CONTEXT_FRACTION),
                             MAX_CONTEXT_LEN),
          numTries = MIN(MAX_NUM_CONTEXT_CHECKS,
                         MAX(2, seqLen/CONTEXT_INTERVAL));
        Symbol *contextBuf = gt_malloc(sizeof (Symbol) * MAX_CONTEXT_LEN);
        GtEncseqReader *esr =
          gt_encseq_create_reader_with_readmode(suffixArray.encseq,
                                                suffixArray.readmode,
                                                0);
        /* a substring can never be longer than the sequence itself;
           without this bound seqLen - subSeqLen + 1 below could wrap
           around or become 0 for short sequences */
        if (maxSubSeqLen > seqLen)
          maxSubSeqLen = seqLen;
        for (i = 0; i < numTries && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
        {
          GtUword j, end, inSubSeqLen;
          subSeqLen = random()%maxSubSeqLen + 1;
          start = random()%(seqLen - subSeqLen + 1);
          end = start + subSeqLen;
          /* the terminator at the very end is not part of the encseq */
          inSubSeqLen = subSeqLen - ((end==seqLen)?1:0);
          gt_BWTSeqCRAccessSubseq(bwtSeqCR, start, subSeqLen, contextBuf);
          gt_encseq_reader_reinit_with_readmode(esr, suffixArray.encseq,
                                                suffixArray.readmode, start);
          for (j = 0; j < inSubSeqLen; ++j)
          {
            Symbol symRef = gt_encseq_reader_next_encoded_char(esr);
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                           "%d vs. reference symbol %d", start + j,
                           (int)symCmp, (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
          }
          /* only check the terminator if the symbols before it matched;
             otherwise the error reported above would be overwritten */
          while (retval == VERIFY_BWTSEQ_NO_ERROR && j < subSeqLen)
          {
            Symbol symRef = UNDEFBWTCHAR;
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                           "%d vs. reference symbol %d", start + j,
                           (int)symCmp, (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
            ++j;
          }
        }
        if (retval == VERIFY_BWTSEQ_NO_ERROR)
          fputs("Context regeneration completed successfully.\n", stderr);
        gt_encseq_reader_delete(esr);
        gt_free(contextBuf);
      }
      gt_deleteBWTSeqCR(bwtSeqCR);
    }
  } while (0);
  if (suffixArrayIsInitialized)
    gt_freesuffixarray(&suffixArray);
  if (extBitsAreInitialized)
    destructExtBitsRetrieval(&extBits);
  return retval;
}
/*
 * Insert the suffix described by suffixinfo into the trie, starting the
 * descent at the inner node node.
 *
 * If the trie is still empty, the suffix becomes the root. Otherwise the
 * trie is descended along the characters of the suffix:
 *  - if the current node has no successor for the next character, a new
 *    leaf is linked in at the position reported by hassuccessor;
 *  - if the successor is a leaf, or the suffix diverges from the
 *    successor's edge label before its end, the successor is replaced by a
 *    new branching node created with mtrie_makenewbranch;
 *  - otherwise the descent continues at the successor.
 * A position at or beyond the end of the sequence is treated as SEPARATOR.
 */
void gt_mergertrie_insertsuffix(Mergertrierep *trierep,
                                Mergertrienode *node,
                                Suffixinfo *suffixinfo)
{
  GtUword currentdepth, lcpvalue, totallength, succend;
  Mergertrienode *currentnode, *newleaf, *newbranch, *succ;
  Nodepair np;
  GtUchar cc;
  Encseqreadinfo *eri, *succeri;
  bool succisleaf;

  if (trierep->root == NULL)
  {
    trierep->root = mtrie_makeroot(trierep,suffixinfo);
    return;
  }
  eri = trierep->encseqreadinfo + suffixinfo->idx;
  gt_assert(!MTRIE_ISLEAF(node));
  currentnode = node;
  currentdepth = node->depth;
  totallength = gt_encseq_total_length(eri->encseqptr);
  for (;;)
  {
    if (suffixinfo->startpos + currentdepth < totallength)
    {
      /* Random access */
      cc = gt_encseq_get_encoded_char(eri->encseqptr,
                                      suffixinfo->startpos + currentdepth,
                                      eri->readmode);
    } else
    {
      cc = (GtUchar) SEPARATOR;
    }
    gt_assert(currentnode != NULL);
    gt_assert(!MTRIE_ISLEAF(currentnode));
    if (!hassuccessor(trierep,&np,currentdepth,currentnode,cc,
                      suffixinfo->idx))
    {
      /* no edge for cc: add a new leaf between np.previous and np.current */
      newleaf = mtrie_makenewleaf(trierep,suffixinfo);
      newleaf->rightsibling = np.current;
      SHOWNODERELATIONS(newleaf);
      if (np.previous == NULL)
      {
        SETFIRSTCHILD(currentnode,newleaf);
        SHOWNODERELATIONS(currentnode);
      } else
      {
        np.previous->rightsibling = newleaf;
        SHOWNODERELATIONS(np.previous);
      }
      return;
    }
    succ = np.current;
    succeri = trierep->encseqreadinfo + succ->suffixinfo.idx;
    succisleaf = MTRIE_ISLEAF(succ) ? true : false;
    if (succisleaf)
    {
      /* a leaf edge runs to the end of its sequence */
      succend = gt_encseq_total_length(succeri->encseqptr) - 1;
    } else
    {
      succend = succ->suffixinfo.startpos + succ->depth - 1;
    }
    /* the first edge character is known to match, compare the rest */
    lcpvalue = getlcp(eri->encseqptr,
                      eri->readmode,
                      suffixinfo->startpos + currentdepth + 1,
                      totallength - 1,
                      succeri->encseqptr,
                      succeri->readmode,
                      succ->suffixinfo.startpos + currentdepth + 1,
                      succend);
    if (succisleaf || currentdepth + lcpvalue + 1 < succ->depth)
    {
      /* split the edge to succ at the point of divergence */
      newbranch = mtrie_makenewbranch(trierep,
                                      suffixinfo,
                                      currentdepth + lcpvalue + 1,
                                      succ);
      if (np.previous == NULL)
      {
        SETFIRSTCHILD(currentnode,newbranch);
        SHOWNODERELATIONS(currentnode);
      } else
      {
        np.previous->rightsibling = newbranch;
        SHOWNODERELATIONS(np.previous);
      }
      return;
    }
    currentnode = succ;
    currentdepth = currentnode->depth;
  }
}
static void swlocalsimilarityregion(DPpoint *scol, DPregion *maxentry, const Scorevalues *scorevalues, const GtUchar *useq, GtUword ulen, const GtEncseq *vencseq, GtUword startpos, GtUword endpos) { Scoretype val; DPpoint *scolptr, we, nw; const GtUchar *uptr; GtUchar vcurrent; GtUword j; maxentry->similarity = 0; maxentry->len1 = 0; maxentry->len2 = 0; maxentry->start1 = 0; maxentry->start2 = 0; for (scolptr = scol; scolptr <= scol + ulen; scolptr++) { scolptr->similarity = 0; scolptr->lu = 0; scolptr->lv = 0; } for (j = startpos; j < endpos; j++) { vcurrent = gt_encseq_get_encoded_char(vencseq,j, GT_READMODE_FORWARD); gt_assert(vcurrent != (GtUchar) SEPARATOR); nw = *scol; for (scolptr = scol+1, uptr = useq; uptr < useq + ulen; scolptr++, uptr++) { gt_assert(*uptr != (GtUchar) SEPARATOR); we = *scolptr; scolptr->similarity = (scolptr-1)->similarity + scorevalues->gapextend; scolptr->lu = (scolptr-1)->lu + 1; scolptr->lv = (scolptr-1)->lv; if ((val = nw.similarity + REPLACEMENTSCORE(scorevalues,*uptr,vcurrent)) > scolptr->similarity) { scolptr->similarity = val; scolptr->lu = nw.lu + 1; scolptr->lv = nw.lv + 1; } if ((val = we.similarity + scorevalues->gapextend) > scolptr->similarity) { scolptr->similarity = val; scolptr->lu = we.lu; scolptr->lv = we.lv + 1; } if (scolptr->similarity < 0) { scolptr->similarity = 0; scolptr->lu = 0; scolptr->lv = 0; } else { if (scolptr->similarity > maxentry->similarity) { maxentry->similarity = scolptr->similarity; maxentry->len1 = scolptr->lu; maxentry->len2 = scolptr->lv; maxentry->start1 = (GtUword) (uptr - useq) - scolptr->lu + 1; maxentry->start2 = (j - startpos) - scolptr->lv + 1; } } nw = we; } } }
/*
 * Count, for each of the numofchars regular characters, how often it
 * occurs as the left context of a range of special characters or of the
 * end of the sequence, with respect to the given read mode.
 *
 * For the non-reversed read modes the character directly in front of each
 * special range is counted, plus the last character of the sequence if the
 * sequence does not end with a special character. For the reversed read
 * modes the character directly behind each special range is counted, plus
 * the first character of the sequence if it does not start with a special
 * character; these characters are read with GT_READMODE_FORWARD for
 * GT_READMODE_REVERSE and with GT_READMODE_COMPL otherwise.
 *
 * Returns a newly allocated table of numofchars counters; the caller is
 * responsible for freeing it.
 */
static GtUword *leftcontextofspecialchardist(unsigned int numofchars,
                                             const GtEncseq *encseq,
                                             GtReadmode readmode)
{
  GtUchar cc;
  unsigned int charidx;
  GtUword *dist, totallength = gt_encseq_total_length(encseq);
  GtReadmode convertedreadmode;
  const bool reversed = GT_ISDIRREVERSE(readmode) ? true : false;

  if (readmode == GT_READMODE_REVERSE)
  {
    convertedreadmode = GT_READMODE_FORWARD;
  } else
  {
    convertedreadmode = GT_READMODE_COMPL;
  }
  dist = gt_malloc(sizeof (*dist) * numofchars);
  for (charidx = 0; charidx < numofchars; charidx++)
  {
    dist[charidx] = 0;
  }
  if (gt_encseq_has_specialranges(encseq))
  {
    GtRange range;
    GtSpecialrangeiterator *sri = gt_specialrangeiterator_new(encseq,true);

    while (gt_specialrangeiterator_next(sri,&range))
    {
      if (reversed)
      {
        if (range.end < totallength)
        {
          cc = gt_encseq_get_encoded_char(encseq,range.end,
                                          convertedreadmode);
          if (ISNOTSPECIAL(cc))
          {
            dist[cc]++;
          }
        }
      } else
      {
        if (range.start > 0)
        {
          cc = gt_encseq_get_encoded_char(encseq,range.start-1,readmode);
          if (ISNOTSPECIAL(cc))
          {
            dist[cc]++;
          }
        }
      }
    }
    gt_specialrangeiterator_delete(sri);
  }
  if (reversed)
  {
    if (gt_encseq_lengthofspecialprefix(encseq) == 0)
    {
      cc = gt_encseq_get_encoded_char(encseq,0,convertedreadmode);
      gt_assert(ISNOTSPECIAL(cc));
      dist[cc]++;
    }
  } else
  {
    if (gt_encseq_lengthofspecialsuffix(encseq) == 0)
    {
      cc = gt_encseq_get_encoded_char(encseq,totallength-1,readmode);
      gt_assert(ISNOTSPECIAL(cc));
      dist[cc]++;
    }
  }
  return dist;
}