/*
  Allocate a sequential suffix array reader and load the suffix array for
  <indexname> into it. If <scanfile> is true the tables are loaded with
  streamsuffixarray(), otherwise with gt_mapsuffixarray(); <demand>, <logger>
  and <err> are passed through to the loader unchanged.
  Returns the new reader, or NULL if the loader reported a non-zero status
  (both allocations are released in that case).
*/
Sequentialsuffixarrayreader *gt_newSequentialsuffixarrayreaderfromfile(
                                        const char *indexname,
                                        unsigned int demand,
                                        bool scanfile,
                                        GtLogger *logger,
                                        GtError *err)
{
  Sequentialsuffixarrayreader *ssar = gt_malloc(sizeof *ssar);
  int loadstatus;

  ssar->suffixarray = gt_malloc(sizeof *ssar->suffixarray);
  if (scanfile)
  {
    loadstatus = streamsuffixarray(ssar->suffixarray, demand, indexname,
                                   logger, err);
  } else
  {
    loadstatus = gt_mapsuffixarray(ssar->suffixarray, demand, indexname,
                                   logger, err);
  }
  if (loadstatus != 0)
  {
    gt_free(ssar->suffixarray);
    gt_free(ssar);
    return NULL;
  }

  /* reading positions and bookkeeping */
  ssar->nextsuftabindex = 0;
  ssar->nextlcptabindex = 1UL;
  ssar->largelcpindex = 0;
  ssar->scanfile = scanfile;
  ssar->suftab = NULL;
  ssar->extrainfo = NULL;

  /* values derived from the loaded suffix array */
  gt_assert(ssar->suffixarray != NULL);
  ssar->encseq = ssar->suffixarray->encseq;
  ssar->readmode = ssar->suffixarray->readmode;
  ssar->numberofsuffixes = gt_encseq_total_length(ssar->encseq) + 1;
  ssar->nonspecials = gt_encseq_total_length(ssar->encseq) -
                      gt_encseq_specialcharacters(ssar->encseq);
  return ssar;
}
/*
  Collect the k-mer codes of <encseq> with collectkmercode() into a temporary
  list and compare that list against <codeliststream> using
  comparecodelists(). Returns 0 if the comparison returns 0, and -1
  otherwise. The temporary list is released in both cases.
*/
static int verifycodelists(const GtEncseq *encseq,
                           unsigned int kmersize,
                           unsigned int numofchars,
                           const GtArrayGtCodetype *codeliststream,
                           GtError *err)
{
  GtArrayGtCodetype codeliststring;
  const GtUchar *characters;
  GtUword stringtotallength;
  int cmpstatus;

  gt_error_check(err);
  stringtotallength = gt_encseq_total_length(encseq);
  characters = gt_alphabet_characters(gt_encseq_alphabet(encseq));
  GT_INITARRAY(&codeliststring,GtCodetype);
  collectkmercode(&codeliststring,
                  encseq,
                  kmersize,
                  numofchars,
                  stringtotallength);
  cmpstatus = comparecodelists(codeliststream,
                               &codeliststring,
                               kmersize,
                               numofchars,
                               (const char *) characters,
                               err);
  GT_FREEARRAY(&codeliststring,GtCodetype);
  return cmpstatus != 0 ? -1 : 0;
}
static void onlinespacedseedsearch(const GtEncseq *encseq, const Spacedseed *spse, const GtUchar *qptr,qp) { Windowiterator *wit; const GtUchar *buffer; GtUword currentpos, totallength; GtUword firstpos, windowschecked = 0; Bitsequence bitmask; bool matched; totallength = gt_encseq_total_length(encseq); wit = gt_windowiterator_new(encseq,spse->seedwidth,0,totallength); while (true) { buffer = gt_windowiterator_next(¤tpos,&firstpos,wit); if (buffer != NULL) { bitmask = FIRSTBIT; matched = true; for (idx=0; idx < spse->seedwidth; idx++) { if ((spse->seedbitvector & bitmask) && qptr[idx] != buffer[idx]) { matched = false; break; } bitmask >>= 1; } if (matched) { } } else { break;
/*
  Lua binding: takes an encoded sequence as argument 1 and pushes its total
  length as a Lua number. Returns 1 (one result on the Lua stack).
*/
static int encseq_lua_total_length(lua_State *L)
{
  GtEncseq **encseq_ud = check_encseq(L, 1);

  lua_pushnumber(L, gt_encseq_total_length(*encseq_ud));
  return 1;
}
/*
  Sum, over all <querylen> start positions of <query>, the value delivered by
  gt_esa2shulengthatposition() for the query suffix beginning there.
  Positions holding a special character contribute 0 and do not trigger a
  lookup. Returns the accumulated sum.
*/
static unsigned long gt_esa2shulengthquery(const Suffixarray *suffixarray,
                                           const GtUchar *query,
                                           unsigned long querylen)
{
  const unsigned long totallength
    = gt_encseq_total_length(suffixarray->encseq);
  unsigned long offset, lengthsum = 0;

  for (offset = 0; offset < querylen; offset++)
  {
    const GtUchar *suffixstart = query + offset;

    if (!ISSPECIAL(*suffixstart))
    {
      lengthsum += gt_esa2shulengthatposition(suffixarray,
                                              totallength,
                                              0,
                                              0,
                                              totallength,
                                              suffixstart,
                                              query+querylen);
    }
  }
  return lengthsum;
}
/*
  Check that the <numberofsuffixes> entries of <suffixsortspace> (addressed
  relative to <subbucketleft>) are in order. Each entry is compared with its
  predecessor via gt_encseq_check_comparetwosuffixes(); entries whose
  position is not smaller than the total length are not compared, but still
  become the predecessor of the next entry.
  If a comparison yields a value > 0, the failure is reported through
  showcomparisonfailure() using <filename> and <line>, and the program exits
  with GT_EXIT_PROGRAMMING_ERROR.
*/
void gt_checksortedsuffixes(const char *filename,
                            int line,
                            const GtEncseq *encseq,
                            GtReadmode readmode,
                            const GtSuffixsortspace *suffixsortspace,
                            GtUword subbucketleft,
                            GtUword numberofsuffixes,
                            bool specialsareequal,
                            bool specialsareequalatdepth0,
                            GtUword depth)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  GtEncseqReader *esr1, *esr2;
  GtUword rank, pos1, pos2, maxlcp;

  gt_assert(!specialsareequal || specialsareequalatdepth0);
  esr1 = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  esr2 = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  gt_assert(numberofsuffixes > 0);
  pos1 = gt_suffixsortspace_get(suffixsortspace,subbucketleft,0);
  gt_assert(pos1 < totallength);
  /* the loop step makes the current entry the predecessor of the next one,
     also when the comparison was skipped */
  for (rank = 1UL; rank < numberofsuffixes; rank++, pos1 = pos2)
  {
    int cmp;

    pos2 = gt_suffixsortspace_get(suffixsortspace,subbucketleft,rank);
    if (pos2 >= totallength)
    {
      continue;
    }
    cmp = gt_encseq_check_comparetwosuffixes(encseq,
                                             readmode,
                                             &maxlcp,
                                             specialsareequal,
                                             specialsareequalatdepth0,
                                             depth,
                                             pos1,
                                             pos2,
                                             esr1,
                                             esr2);
    if (cmp > 0)
    {
      showcomparisonfailure(filename,
                            line,
                            "checksortedsuffixes",
                            encseq,
                            readmode,
                            suffixsortspace,
                            subbucketleft,
                            depth,
                            rank-1,
                            rank,
                            cmp,
                            maxlcp);
      exit(GT_EXIT_PROGRAMMING_ERROR);
    }
    gt_assert(depth == 0 || maxlcp <= depth);
  }
  gt_encseq_reader_delete(esr1);
  gt_encseq_reader_delete(esr2);
}
/*
  Allocate and initialise an Enumcodeatposition object for <encseq> read in
  <readmode>, with tables built for <numofchars> and <prefixlength>.
  The moveforward flag is set exactly when GT_ISDIRREVERSE(readmode) holds;
  the previous range then starts out as [0,0], and otherwise as
  [totallength,totallength]. A special range iterator is created only if the
  sequence has special ranges, else the sri member is NULL.
*/
Enumcodeatposition *gt_Enumcodeatposition_new(const GtEncseq *encseq,
                                              GtReadmode readmode,
                                              unsigned int prefixlength,
                                              unsigned int numofchars)
{
  Enumcodeatposition *ecp = gt_malloc(sizeof *ecp);

  ecp->encseq = encseq;
  ecp->readmode = readmode;
  ecp->prefixlength = prefixlength;
  ecp->multimappower = gt_initmultimappower(numofchars,prefixlength);
  ecp->filltable = gt_initfilltable(numofchars,prefixlength);
  ecp->totallength = gt_encseq_total_length(encseq);
  ecp->exhausted = false;
  if (GT_ISDIRREVERSE(readmode))
  {
    ecp->moveforward = true;
    ecp->previousrange.start = ecp->previousrange.end = 0;
  } else
  {
    ecp->moveforward = false;
    ecp->previousrange.start = ecp->previousrange.end = ecp->totallength;
  }
  ecp->sri = gt_encseq_has_specialranges(encseq)
               ? gt_specialrangeiterator_new(encseq,ecp->moveforward)
               : NULL;
  return ecp;
}
static void sequenceobject_init(Sequenceobject *seq, GtExtendCharAccess extend_char_access_mode, const GtEncseq *encseq, GtReadmode readmode, GtUword startpos, GtUword len, GtEncseqReader *encseq_r, GtAllocatedMemory *sequence_cache, GtUword totallength ) { gt_assert(seq != NULL); seq->encseq = NULL; seq->encseqreader = NULL; seq->twobitencoding = NULL; seq->cache_ptr = NULL; seq->sequence_cache = NULL; if (extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ANY && gt_encseq_has_twobitencoding(encseq) && gt_encseq_wildcards(encseq) == 0) { seq->twobitencoding = gt_encseq_twobitencoding_export(encseq); } if (seq->twobitencoding == NULL && (extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ANY || extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ENCSEQ_READER)) { gt_encseq_reader_reinit_with_readmode(encseq_r, encseq, readmode, startpos); seq->encseqreader = encseq_r; gt_assert(seq->encseqreader != NULL); seq->sequence_cache = sequence_cache; gt_assert(sequence_cache != NULL); seq->cache_ptr = sequence_cache->space; seq->min_access_pos = GT_UWORD_MAX; seq->cache_num_positions = 0; seq->cache_offset = 0; } if (seq->twobitencoding == NULL && seq->encseqreader == NULL && (extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ANY || extend_char_access_mode == GT_EXTEND_CHAR_ACCESS_ENCSEQ)) { seq->encseq = encseq; } seq->substringlength = len; if (readmode == GT_READMODE_FORWARD) { seq->startpos = startpos; seq->forward = true; } else { gt_assert(readmode == GT_READMODE_REVERSE); gt_assert(gt_encseq_total_length(encseq) == totallength); gt_assert(startpos + 1 <= totallength); seq->startpos = totallength - 1 - startpos; seq->forward = false; } gt_assert(seq->twobitencoding != NULL || seq->encseqreader != NULL || seq->encseq != NULL); }
/*
  Lua binding: takes an encoded sequence (argument 1) and a position
  (argument 2) and pushes the value of gt_encseq_filenum() for that position.
  Returns 1 (one result on the Lua stack).

  The position is validated as a floating point number before it is
  converted to GtUword: converting a negative or too large number to an
  unsigned integer is undefined in C, and on some platforms turns a negative
  number into 0, which would pass the upper-bound check. Negative numbers and
  NaN now raise a Lua argument error. Fractional positions are truncated
  towards zero.
*/
static int encseq_lua_filenum(lua_State *L)
{
  GtEncseq **encseq;
  GtUword pos;
  double posarg;

  encseq = check_encseq(L, 1);
  posarg = luaL_checknumber(L, 2);
  luaL_argcheck(L, posarg >= 0, 2, "must be non-negative");
  luaL_argcheck(L, posarg < (double) gt_encseq_total_length(*encseq), 2,
                "cannot exceed total length of encoded sequence");
  /* 0 <= posarg < total length: the conversion is well defined */
  pos = (GtUword) posarg;
  lua_pushnumber(L, gt_encseq_filenum(*encseq, pos));
  return 1;
}
/*
  Enumerate the k-mer codes of <encseq>, read in <readmode>, and hand each
  one to <processkmercode> together with <processkmercodeinfo> and the
  position the code is reported for.

  Nothing is reported if the sequence is shorter than <kmersize>. Otherwise
  the first window is filled character by character and reported for
  position 0, then the window is shifted once per remaining character. After
  the last character, <kmersize> further shifts with WILDCARD are performed
  and reported as well.
*/
void getencseqkmers(const GtEncseq *encseq,
                    GtReadmode readmode,
                    unsigned int kmersize,
                    void(*processkmercode)(void *,
                                           unsigned long,
                                           const GtKmercode *),
                    void *processkmercodeinfo)
{
  const unsigned long totallength = gt_encseq_total_length(encseq);
  const unsigned long firstwindowend = (unsigned long) kmersize;
  unsigned long currentposition = 0;
  unsigned int alphasize, extra;
  Kmerstream *spwp;
  GtEncseqReader *esr;
  GtUchar charcode;

  if (totallength < firstwindowend)
  {
    return;
  }
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  spwp = kmerstream_new(alphasize,kmersize);
  esr = gt_encseq_create_reader_with_readmode(encseq,readmode,0);

  /* fill the first window */
  while (currentposition < firstwindowend)
  {
    charcode = gt_encseq_reader_next_encoded_char(esr);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    spwp->windowwidth++;
    updatespecialpositions(spwp,charcode,false,0);
    spwp->cyclicwindow[spwp->windowwidth-1] = charcode;
    currentposition++;
  }
  kmerstream_newcode(&spwp->currentkmercode,spwp);
  processkmercode(processkmercodeinfo,0,&spwp->currentkmercode);

  /* slide the window over the remaining characters */
  while (currentposition < totallength)
  {
    charcode = gt_encseq_reader_next_encoded_char(esr);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    shiftrightwithchar(spwp,charcode);
    kmerstream_newcode(&spwp->currentkmercode,spwp);
    processkmercode(processkmercodeinfo,
                    currentposition + 1 - spwp->kmersize,
                    &spwp->currentkmercode);
    currentposition++;
  }
  gt_encseq_reader_delete(esr);

  /* currentposition equals totallength here; shift in wildcards */
  for (extra = 0; extra < kmersize; extra++)
  {
    shiftrightwithchar(spwp,(GtUchar) WILDCARD);
    kmerstream_newcode(&spwp->currentkmercode,spwp);
    processkmercode(processkmercodeinfo,
                    extra + currentposition + 1 - spwp->kmersize,
                    &spwp->currentkmercode);
  }
  kmerstream_delete(spwp);
}
/*
  Test driver for the merger trie: load the encoded sequence of index
  <indexname> with streamsuffixarray(), build a trie over it with maketrie()
  and then either only check it (<onlyins> true; the check is compiled in
  only if WITHTRIEIDENT is defined) or delete its elements via
  successivelydeletesmallest().
  Returns -1 if loading the index failed and 0 otherwise.
  gt_freesuffixarray() is called on the suffix array in both cases.
*/
int gt_test_trieins(bool onlyins,const char *indexname,GtError *err)
{
  Suffixarray suffixarray;
  int rval = 0;

  gt_error_check(err);
  if (streamsuffixarray(&suffixarray,
                        SARR_ESQTAB,
                        indexname,
                        NULL,
                        err) != 0)
  {
    rval = -1;
  } else
  {
    Mergertrierep trierep;
    const GtUchar *characters;
    unsigned long totallength = gt_encseq_total_length(suffixarray.encseq);

    trierep.encseqreadinfo = gt_malloc(sizeof *trierep.encseqreadinfo);
    trierep.encseqreadinfo->encseqptr = suffixarray.encseq;
    trierep.encseqreadinfo->readmode = suffixarray.readmode;
    characters
      = gt_alphabet_characters(gt_encseq_alphabet(suffixarray.encseq));
    gt_mergertrie_initnodetable(&trierep,totallength,1U);
    maketrie(&trierep,characters,totallength);
    if (!onlyins)
    {
#if defined(WITHTRIEIDENT) && defined(WITHTRIESHOW)
      showallnoderelations(trierep.root);
#endif
      successivelydeletesmallest(&trierep,totallength,characters,err);
    } else
    {
#if defined(WITHTRIEIDENT) && defined(WITHTRIESHOW)
      showtrie(&trierep,characters);
#endif
#ifdef WITHTRIEIDENT
      checktrie(&trierep,totallength+1,totallength,err);
#endif
    }
    gt_mergertrie_delete(&trierep);
  }
  gt_freesuffixarray(&suffixarray);
  return rval;
}
/*
  Create a GtCondenseq for the original encoded sequence <orig_es>: start
  from an empty object over the alphabet of <orig_es>, record the number of
  sequences and the total length of <orig_es>, fill the ssptab member via
  condenseq_fill_tab() and process the descriptions (with <logger>).
  Returns the new object.
*/
GtCondenseq *gt_condenseq_new(const GtEncseq *orig_es, GtLogger *logger)
{
  GtCondenseq *ces = condenseq_new_empty(gt_encseq_alphabet(orig_es));

  ces->orig_num_seq = gt_encseq_num_of_sequences(orig_es);
  ces->ssptab = condenseq_fill_tab(ces, orig_es);
  ces->orig_length = gt_encseq_total_length(orig_es);
  condenseq_process_descriptions(ces, orig_es, logger);
  return ces;
}
/*
  Run the bottom-up traversal used for containment detection over the suffix
  array delivered by <ssar> and return the counter accumulated in the
  traversal state.

  <contained> is stored in the state for use by the traversal.
  <read_length> == 0 means variable read length: the state is prepared with
  prepare_sspbittab_and_shortest() and gt_esa_bottomup_rdjcv() is run.
  Otherwise shortest is <read_length>, spacing is <read_length> + 1 and
  gt_esa_bottomup_rdjce() is run. If <show_progressbar> is true, a progress
  bar over the total length is started before and stopped after the
  traversal.
*/
unsigned long gt_contfind_bottomup(Sequentialsuffixarrayreader *ssar,
    bool show_progressbar, GtBitsequence *contained,
    unsigned long firstrevcompl,
    unsigned long read_length /* 0 = variable */)
{
  ContfindBUstate state;
  unsigned long totallength;
  const bool variable_length = (read_length == 0);
  GT_UNUSED int retval;

  gt_assert(ssar != NULL);
  gt_assert(contained != NULL);

  state.contained = contained;
  state.encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  totallength = gt_encseq_total_length(state.encseq);
  state.nofsequences = gt_encseq_num_of_sequences(state.encseq);
  if (variable_length)
  {
    prepare_sspbittab_and_shortest(totallength, &state);
  } else
  {
    state.shortest = read_length;
    state.spacing = read_length + 1;
  }
  state.show_progressbar = show_progressbar;
  state.csize = 0;
  state.cmin = 0;
  state.firstrevcompl = firstrevcompl;
  state.counter = 0;

  if (show_progressbar)
  {
    state.progress = 0;
    gt_progressbar_start(&(state.progress),
                         (unsigned long long)totallength);
  }
  if (variable_length)
  {
    retval = gt_esa_bottomup_rdjcv(ssar, &state, NULL);
  } else
  {
    retval = gt_esa_bottomup_rdjce(ssar, &state, NULL);
  }
  gt_assert(retval == 0);
  if (show_progressbar)
  {
    gt_progressbar_stop();
  }
  if (variable_length)
  {
    gt_free(state.sspbittab);
  }
  return state.counter;
}
static void showmergertrie2(const Mergertrierep *trierep, const GtUchar *characters, unsigned int level, const Mergertrienode *node) { GtUchar cc = 0; GtUword pos, endpos; Mergertrienode *current; for (current = node->firstchild; current != NULL; current = current->rightsibling) { printf("%*.*s",(int) (6 * level),(int) (6 * level)," "); if (MTRIE_ISLEAF(current)) { endpos = gt_encseq_total_length( trierep->encseqtable[current->suffixinfo.idx]); } else { endpos = current->suffixinfo.startpos + current->depth; } for (pos = current->suffixinfo.startpos + node->depth; pos < endpos; pos++) { cc = gt_encseq_get_encoded_char( /* just for testing */ trierep->enseqreadinfo[current->suffixinfo.idx].encseqptr, pos, trierep->enseqreadinfo[current->suffixinfo.idx].readmode); if (ISSPECIAL(cc)) { printf("#\n"); break; } printf("%c",characters[(int) cc]); } if (MTRIE_ISLEAF(current)) { if (!ISSPECIAL(cc)) { printf("~\n"); } } else { printf(" d="GT_WU",i=" Formatuint64_t "\n", current->depth, PRINTuint64_tcast(current->suffixinfo.ident)); showmergertrie2(trierep,characters,level+1,current); } } }
/*
  Lua binding: reinitialise an encoded sequence reader (argument 1) for the
  encoded sequence (argument 2) with read mode (argument 3) and start
  position (argument 4). Returns 0 (no results on the Lua stack).

  The start position is validated as a floating point number before it is
  converted to GtUword: converting a negative or too large number to an
  unsigned integer is undefined in C, and on some platforms turns a negative
  number into 0, which would pass the upper-bound check. Negative numbers and
  NaN now raise a Lua argument error. Fractional positions are truncated
  towards zero.
  NOTE(review): the read mode is not range-checked here.
*/
static int encseq_reader_lua_reinit_with_readmode(lua_State *L)
{
  GtEncseq **encseq;
  GtEncseqReader **reader;
  GtUword startpos;
  GtReadmode readmode;
  double startarg;

  reader = check_encseq_reader(L, 1);
  encseq = check_encseq(L, 2);
  readmode = luaL_checknumber(L, 3);
  startarg = luaL_checknumber(L, 4);
  luaL_argcheck(L, startarg >= 0, 4, "must be non-negative");
  luaL_argcheck(L, startarg < (double) gt_encseq_total_length(*encseq), 4,
                "cannot exceed total length of encoded sequence");
  /* 0 <= startarg < total length: the conversion is well defined */
  startpos = (GtUword) startarg;
  gt_encseq_reader_reinit_with_readmode(*reader, *encseq, readmode, startpos);
  return 0;
}
/*
  Lua binding: takes an encoded sequence (argument 1), a position
  (argument 2) and a read mode (argument 3) and pushes the decoded character
  at that position as a Lua string of length 1. Returns 1 (one result on the
  Lua stack).

  The position is validated as a floating point number before it is
  converted to GtUword: converting a negative or too large number to an
  unsigned integer is undefined in C, and on some platforms turns a negative
  number into 0, which would pass the upper-bound check. Negative numbers and
  NaN now raise a Lua argument error. Fractional positions are truncated
  towards zero.
  NOTE(review): the read mode is not range-checked here.
*/
static int encseq_lua_get_decoded_char(lua_State *L)
{
  GtEncseq **encseq;
  GtUword pos;
  int readmode;
  double posarg;
  char cc;

  encseq = check_encseq(L, 1);
  posarg = luaL_checknumber(L, 2);
  readmode = luaL_checknumber(L, 3);
  luaL_argcheck(L, posarg >= 0, 2, "must be non-negative");
  luaL_argcheck(L, posarg < (double) gt_encseq_total_length(*encseq), 2,
                "cannot exceed total length of encoded sequence");
  /* 0 <= posarg < total length: the conversion is well defined */
  pos = (GtUword) posarg;
  cc = gt_encseq_get_decoded_char(*encseq, pos, readmode);
  lua_pushlstring(L, &cc, sizeof (char));
  return 1;
}
/*
  Print the entries 0..totallength (inclusive) of <suffixsortspace>,
  addressed relative to <subbucketleft>, to stdout. Each line shows the index
  and the stored position, followed by the output of
  gt_encseq_showatstartposwithdepth() for that position with <depth>.
*/
void gt_showentiresuftab(const GtEncseq *encseq,
                         GtReadmode readmode,
                         const GtSuffixsortspace *suffixsortspace,
                         GtUword subbucketleft,
                         GtUword depth)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  GtUword rank = 0;

  while (rank <= totallength)
  {
    GtUword suffixstart = gt_suffixsortspace_get(suffixsortspace,
                                                 subbucketleft,
                                                 rank);

    printf("suftab["GT_WU"]="GT_WU" ",rank,suffixstart);
    gt_encseq_showatstartposwithdepth(stdout,encseq,readmode,suffixstart,
                                      depth);
    printf("\n");
    rank++;
  }
}
/*
  Write statistics about <bs> to <outfp>: the name of the sequence file, the
  number of sequences, a total length, and the length of every sequence.
  The total length shown is the total length of the encoded sequence minus
  the number of sequences plus 1.
*/
void gt_bioseq_show_stat(GtBioseq *bs, GtFile *outfp)
{
  GtUword seqidx, seqcount, shownlength;

  gt_assert(bs);
  seqcount = gt_bioseq_number_of_sequences(bs);
  gt_file_xprintf(outfp, "showing statistics for sequence file \"%s\"\n",
                  gt_str_get(bs->sequence_file));
  gt_file_xprintf(outfp, "number of sequences: "GT_WU"\n", seqcount);
  shownlength = gt_encseq_total_length(bs->encseq)
                  - gt_encseq_num_of_sequences(bs->encseq) + 1;
  gt_file_xprintf(outfp, "total length: "GT_WU"\n", shownlength);
  for (seqidx = 0; seqidx < seqcount; seqidx++)
  {
    gt_file_xprintf(outfp, "sequence #"GT_WU" length: "GT_WU"\n", seqidx+1,
                    gt_bioseq_get_sequence_length(bs, seqidx));
  }
}
/*
  Lua binding: takes an encoded sequence (argument 1) and an inclusive range
  from (argument 2) .. to (argument 3), extracts the encoded characters of
  that range into a newly allocated buffer and pushes it with
  encseq_lua_push_buffer(). Returns 1 (one result on the Lua stack).

  Both range ends are validated as floating point numbers before they are
  converted to GtUword: converting a negative or too large number to an
  unsigned integer is undefined in C, and on some platforms turns a negative
  number into 0, which would pass the bound checks. Negative numbers and NaN
  now raise a Lua argument error. Every range accepted before is still
  accepted with the same result; for invalid input only the choice of the
  reported message may differ. Fractional values are truncated towards zero.
  NOTE(review): the buffer is handed to encseq_lua_push_buffer() and not
  freed here - confirm that the callee takes ownership.
*/
static int encseq_lua_extract_encoded(lua_State *L)
{
  GtEncseq **encseq;
  GtUword from, to;
  double fromarg, toarg, totallength;
  unsigned char *string;

  encseq = check_encseq(L, 1);
  fromarg = luaL_checknumber(L, 2);
  toarg = luaL_checknumber(L, 3);
  totallength = (double) gt_encseq_total_length(*encseq);
  luaL_argcheck(L, fromarg >= 0, 2, "must be non-negative");
  luaL_argcheck(L, toarg >= 0, 3, "must be non-negative");
  /* a start at or beyond the total length can never belong to a valid
     range; rejecting it here keeps the conversion below well defined */
  luaL_argcheck(L, fromarg < totallength, 2,
                "cannot exceed total length of encoded sequence");
  luaL_argcheck(L, toarg < totallength, 3,
                "cannot exceed total length of encoded sequence");
  from = (GtUword) fromarg;
  to = (GtUword) toarg;
  luaL_argcheck(L, from <= to, 2, "must be <= range endposition");
  string = gt_malloc((to - from + 1) * sizeof (unsigned char));
  gt_encseq_extract_encoded(*encseq, string, from, to);
  encseq_lua_push_buffer(L, string, (to - from + 1));
  return 1;
}
/*
  Lua binding: takes an encoded sequence (argument 1), a read mode
  (argument 2) and a start position (argument 3), creates an encoded
  sequence reader for them and pushes it with gt_lua_encseq_reader_push().
  Returns 1 (one result on the Lua stack).

  The start position is validated as a floating point number before it is
  converted to GtUword: converting a negative or too large number to an
  unsigned integer is undefined in C, and on some platforms turns a negative
  number into 0, which would pass the upper-bound check. Negative numbers and
  NaN now raise a Lua argument error. Fractional positions are truncated
  towards zero.
  NOTE(review): the read mode is not range-checked here.
*/
static int encseq_lua_create_reader_with_readmode(lua_State *L)
{
  GtEncseq **encseq;
  GtEncseqReader *reader;
  GtUword startpos;
  GtReadmode readmode;
  double startarg;

  encseq = check_encseq(L, 1);
  readmode = luaL_checknumber(L, 2);
  startarg = luaL_checknumber(L, 3);
  luaL_argcheck(L, startarg >= 0, 3, "must be non-negative");
  luaL_argcheck(L, startarg < (double) gt_encseq_total_length(*encseq), 3,
                "cannot exceed total length of encoded sequence");
  /* 0 <= startarg < total length: the conversion is well defined */
  startpos = (GtUword) startarg;
  reader = gt_encseq_create_reader_with_readmode(*encseq, readmode, startpos);
  gt_assert(reader);
  gt_lua_encseq_reader_push(L, reader);
  return 1;
}
/*
  Return the character at offset <prevdepth> of the suffix stored in <node>,
  read from the sequence and in the read mode registered for the node's
  suffix in trierep->encseqreadinfo. If <node> is a leaf and that position is
  not smaller than the total length of the sequence, SEPARATOR is returned
  instead.
*/
static GtUchar getfirstedgechar(const Mergertrierep *trierep,
                                const Mergertrienode *node,
                                GtUword prevdepth)
{
  Encseqreadinfo *eri = trierep->encseqreadinfo + node->suffixinfo.idx;
  const GtUword charpos = node->suffixinfo.startpos + prevdepth;

  if (!MTRIE_ISLEAF(node) ||
      charpos < gt_encseq_total_length(eri->encseqptr))
  {
    return gt_encseq_get_encoded_char(eri->encseqptr, /* Random access */
                                      charpos,
                                      eri->readmode);
  }
  return (GtUchar) SEPARATOR;
}
/*
  Create an iterator over the k-mer codes of <encseq>, read in <readmode>
  and beginning at <startpos>. <startpos> must be smaller than the total
  length, and must be 0 for reverse read modes.

  If fewer than <kmersize> characters remain from <startpos>, the iterator is
  created in the exhausted state without reader and k-mer stream (both
  NULL). Otherwise a reader and a k-mer stream are created and the first
  window of <kmersize> characters is read.

  All members assigned by this function are set in both cases, so that no
  member of the freshly allocated object is left uninitialised when the
  iterator starts out exhausted.
*/
/*@notnull@*/ GtKmercodeiterator *gt_kmercodeiterator_encseq_new(
                                            const GtEncseq *encseq,
                                            GtReadmode readmode,
                                            unsigned int kmersize,
                                            unsigned long startpos)
{
  GtKmercodeiterator *kmercodeiterator;
  unsigned int numofchars;
  GtUchar charcode;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator = gt_malloc(sizeof (*kmercodeiterator));
  kmercodeiterator->totallength = gt_encseq_total_length(encseq);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);

  /* members that do not depend on whether input is available */
  kmercodeiterator->fb = NULL;
  kmercodeiterator->encseq = encseq;
  kmercodeiterator->readmode = readmode;
  kmercodeiterator->hasprocessedfirst = false;
  kmercodeiterator->currentposition = startpos;

  if (kmercodeiterator->totallength - startpos < (unsigned long) kmersize)
  {
    kmercodeiterator->inputexhausted = true;
    kmercodeiterator->esr = NULL;
    kmercodeiterator->spwp = NULL;
  } else
  {
    kmercodeiterator->inputexhausted = false;
    kmercodeiterator->esr = gt_encseq_create_reader_with_readmode(encseq,
                                                                  readmode,
                                                                  startpos);
    numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
    kmercodeiterator->spwp = kmerstream_new(numofchars,kmersize);
    for (kmercodeiterator->currentposition = startpos;
         kmercodeiterator->currentposition < startpos+(unsigned long) kmersize;
         kmercodeiterator->currentposition++)
    {
      charcode = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);
      kmercodeiterator->spwp->windowwidth++;
      updatespecialpositions(kmercodeiterator->spwp,charcode,false,0);
      kmercodeiterator->spwp->cyclicwindow[kmercodeiterator->
                                           spwp->windowwidth-1] = charcode;
    }
  }
  return kmercodeiterator;
}
/*
  Copy a substring of <encseq>, starting at a position chosen with random(),
  into <seqspace>. At most <substringlength> characters are extracted; the
  length is reduced if the substring would reach beyond the end of the
  sequence. Returns the length actually extracted.
  The start position is computed modulo the total length, so the sequence
  must not be empty.
*/
static GtUword samplesubstring(GtUchar *seqspace,
                               const GtEncseq *encseq,
                               GtUword substringlength)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  const GtUword start = (GtUword) (random() % totallength);

  if (start + substringlength > totallength)
  {
    /* clip to the end of the sequence */
    substringlength = totallength - start;
  }
  gt_assert(substringlength > 0);
  gt_encseq_extract_encoded(encseq,seqspace,start,
                            start+substringlength-1);
  return substringlength;
}
/*
  Write the project information as "key=value" lines to <outprj>.
  In this order the output consists of: totallength, the lines produced by
  the PRJSPECIALOUT macro for the special/wildcard counters, numofsequences,
  numofdbsequences (same value as numofsequences), numofquerysequences
  (always 0), numberofallsortedsuffixes, longest (only if longest->defined),
  prefixlength, largelcpvalues, averagelcp (two decimals), maxbranchdepth,
  integersize (bits of GtUword), littleendian, readmode and mirrored.
  NOTE(review): PRJSPECIALOUT is defined elsewhere; presumably it prints the
  value of the corresponding gt_encseq_ accessor for encseq to outprj -
  confirm against the macro definition before renaming any local or
  parameter used here.
*/
static void showprjinfo(FILE *outprj, GtReadmode readmode, const GtEncseq *encseq, GtUword numberofallsortedsuffixes, unsigned int prefixlength, GtUword numoflargelcpvalues, double averagelcp, GtUword maxbranchdepth, const Definedunsignedlong *longest) { GtUword totallength; GtUword numofsequences; totallength = gt_encseq_total_length(encseq); fprintf(outprj,"totallength="GT_WU"\n",totallength); PRJSPECIALOUT(specialcharacters); PRJSPECIALOUT(specialranges); PRJSPECIALOUT(realspecialranges); PRJSPECIALOUT(lengthofspecialprefix); PRJSPECIALOUT(lengthofspecialsuffix); PRJSPECIALOUT(wildcards); PRJSPECIALOUT(wildcardranges); PRJSPECIALOUT(realwildcardranges); PRJSPECIALOUT(lengthofwildcardprefix); PRJSPECIALOUT(lengthofwildcardsuffix); numofsequences = gt_encseq_num_of_sequences(encseq); fprintf(outprj,"numofsequences="GT_WU"\n",numofsequences); fprintf(outprj,"numofdbsequences="GT_WU"\n",numofsequences); fprintf(outprj,"numofquerysequences=0\n"); fprintf(outprj,"numberofallsortedsuffixes="GT_WU"\n", numberofallsortedsuffixes); if (longest->defined) { fprintf(outprj,"longest="GT_WU"\n",longest->valueunsignedlong); } fprintf(outprj,"prefixlength=%u\n",prefixlength); fprintf(outprj,"largelcpvalues="GT_WU"\n",numoflargelcpvalues); fprintf(outprj,"averagelcp=%.2f\n",averagelcp); fprintf(outprj,"maxbranchdepth="GT_WU"\n",maxbranchdepth); fprintf(outprj,"integersize=%u\n", (unsigned int) (sizeof (GtUword) * CHAR_BIT)); fprintf(outprj,"littleendian=%c\n",gt_is_little_endian() ? '1' : '0'); fprintf(outprj,"readmode=%u\n",(unsigned int) readmode); fprintf(outprj,"mirrored=%c\n", gt_encseq_is_mirrored(encseq) ? '1' : '0'); }
/*
  Read every unit of the exported two-bit encoding of <encseq> once and add
  the units up. For a mirrored sequence only the units covering
  (totallength - 1) / 2 positions are read, otherwise those covering the
  total length. The sum is asserted to be positive and written to the log.
*/
static void gt_readjoiner_assembly_pump_encseq_through_cache(
    const GtEncseq *encseq)
{
  const GtTwobitencoding *twobitencoding
    = gt_encseq_twobitencoding_export(encseq);
  const GtUword totallength = gt_encseq_total_length(encseq);
  GtUword numofunits, unit;
  /* the sum is computed (and used below) so that the compiler cannot remove
     the code accessing twobitencoding during optimization */
  uint64_t sum = 0;

  if (gt_encseq_is_mirrored(encseq))
  {
    numofunits = gt_unitsoftwobitencoding((totallength - 1)/2);
  } else
  {
    numofunits = gt_unitsoftwobitencoding(totallength);
  }
  unit = 0;
  while (unit < numofunits)
  {
    sum += twobitencoding[unit];
    unit++;
  }
  gt_assert(sum > 0);
#ifndef S_SPLINT_S
  gt_log_log("encseq codes-sum: %"PRIu64, sum);
#endif
}
/*
  Reposition <kmercodeiterator> to <startpos> of its encoded sequence, read
  in <readmode>. <startpos> must be smaller than the total length, and must
  be 0 for reverse read modes.

  If fewer than kmersize characters remain from <startpos>, the iterator
  becomes exhausted and its reader and k-mer stream are deleted and set to
  NULL. Otherwise the reader is reinitialised, the k-mer stream is reset and
  the first window is read again.

  Precondition: the iterator still owns a k-mer stream, because the k-mer
  size is read from it. An iterator that is already exhausted (its k-mer
  stream is NULL) cannot be reset; this used to be an unchecked NULL
  dereference and is now asserted.
*/
void gt_kmercodeiterator_reset(GtKmercodeiterator *kmercodeiterator,
                               GtReadmode readmode,
                               GtUword startpos)
{
  GtUchar charcode;
  const GtEncseq *encseq;
  GtUword kmersize;

  gt_assert(kmercodeiterator != NULL);
  gt_assert(kmercodeiterator->spwp != NULL);
  encseq = kmercodeiterator->encseq;
  kmersize = (GtUword) kmercodeiterator->spwp->kmersize;
  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator->totallength = gt_encseq_total_length(encseq);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);
  kmercodeiterator->fb = NULL;
  if (kmercodeiterator->totallength - startpos < kmersize)
  {
    kmercodeiterator->inputexhausted = true;
    gt_encseq_reader_delete(kmercodeiterator->esr);
    kmercodeiterator->esr = NULL;
    kmerstream_delete(kmercodeiterator->spwp);
    kmercodeiterator->spwp = NULL;
  } else
  {
    /* the reader is reused, so it must exist */
    gt_assert(kmercodeiterator->esr != NULL);
    kmercodeiterator->inputexhausted = false;
    kmercodeiterator->readmode = readmode;
    gt_encseq_reader_reinit_with_readmode(kmercodeiterator->esr,
                                          encseq,
                                          readmode,
                                          startpos);
    kmerstream_reset(kmercodeiterator->spwp);
    kmercodeiterator->hasprocessedfirst = false;
    for (kmercodeiterator->currentposition = startpos;
         kmercodeiterator->currentposition < startpos + kmersize;
         kmercodeiterator->currentposition++)
    {
      charcode = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);
      kmercodeiterator->spwp->windowwidth++;
      kmerstream_updatespecialpositions(kmercodeiterator->spwp,charcode,
                                        false,0);
      kmercodeiterator->spwp->cyclicwindow[kmercodeiterator->
                                           spwp->windowwidth-1] = charcode;
    }
  }
}
/*
  Create a codon iterator over the <length> positions of <encseq> beginning
  at <startpos>, to be read in <readmode>. The range must lie within the
  encoded sequence. The iterator keeps its own reference to <encseq> and
  starts at current position 0. <err> is only checked, not set.
*/
GtCodonIterator* gt_codon_iterator_encseq_new_with_readmode(GtEncseq *encseq,
                                                      unsigned long startpos,
                                                      unsigned long length,
                                                      GtReadmode readmode,
                                                      GT_UNUSED GtError *err)
{
  GtCodonIterator *iter;
  GtCodonIteratorEncseq *iter_encseq;

  gt_assert(encseq && startpos + length - 1 < gt_encseq_total_length(encseq));
  gt_error_check(err);

  iter = gt_codon_iterator_create(gt_codon_iterator_encseq_class());
  iter_encseq = gt_codon_iterator_encseq_cast(iter);

  /* members of the encseq-specific part */
  iter_encseq->encseq = gt_encseq_ref(encseq);
  iter_encseq->readmode = readmode;

  /* members shared by all codon iterators */
  iter->pvt->length = length;
  iter->pvt->curpos = 0;
  iter->pvt->startpos = startpos;
  return iter;
}
/*
  Assertion-based check of a reverse match of length <len> between position
  <pos1> and position <pos2> relative to the start of sequence <seqnum2>.
  Only GT_READMODE_REVERSE is checked; for every other read mode the function
  does nothing.

  Checked are: both substrings lie within the sequence, the character at
  pos1 + offset equals the one at pos2 + len - 1 - offset and is not special
  (for all offsets below <len>), and the match cannot be extended, i.e. the
  character following the first substring differs from the one preceding
  the second, or is special. A missing neighbour counts as SEPARATOR.
*/
static void verifymatch(const GtEncseq *encseq,
                        GtUword len,
                        GtUword pos1,
                        uint64_t seqnum2,
                        GtUword pos2,
                        GtReadmode readmode)
{
  GtUword offset, totallength;
  GtUchar cc1, cc2;

  if (readmode != GT_READMODE_REVERSE)
  {
    return;
  }
  totallength = gt_encseq_total_length(encseq);
  /* make pos2 an absolute position */
  pos2 += gt_encseq_seqstartpos(encseq, seqnum2);
  offset = 0;
  while (offset < len)
  {
    gt_assert(pos1 + len - 1 < totallength);
    gt_assert(pos2 + len - 1 < totallength);
    cc1 = gt_encseq_get_encoded_char(encseq,pos1+offset,GT_READMODE_FORWARD);
    cc2 = gt_encseq_get_encoded_char(encseq,pos2+len-1-offset,
                                     GT_READMODE_FORWARD);
    gt_assert(cc1 == cc2 && ISNOTSPECIAL(cc1));
    offset++;
  }
  cc1 = (pos1 + len < totallength)
          ? gt_encseq_get_encoded_char(encseq,pos1+len,GT_READMODE_FORWARD)
          : SEPARATOR;
  cc2 = (pos2 > 0)
          ? gt_encseq_get_encoded_char(encseq,pos2-1,GT_READMODE_FORWARD)
          : SEPARATOR;
  gt_assert(cc1 != cc2 || ISSPECIAL(cc1));
}
/*
  Create a new branching node of depth <currentdepth> that takes the place of
  <oldnode> (it inherits the right sibling of <oldnode>) and has two
  children: <oldnode> and a new leaf for <suffixinfo>. The order of the two
  children is determined by mtrie_comparecharacters() applied to the
  characters of both suffixes at offset <currentdepth>; a suffix that ends
  there is represented by SEPARATOR.
  Returns the new branching node.
*/
static Mergertrienode *mtrie_makenewbranch(Mergertrierep *trierep,
                                           Suffixinfo *suffixinfo,
                                           GtUword currentdepth,
                                           Mergertrienode *oldnode)
{
  Mergertrienode *branch, *leaf;
  GtUchar oldchar, newchar;
  Encseqreadinfo *eri = trierep->encseqreadinfo + suffixinfo->idx;

#if defined(WITHTRIEIDENT) && defined(WITHTRIESHOW)
  printf("makenewbranch(ident=" Formatuint64_t ")\n",
          PRINTuint64_tcast(suffixinfo->ident));
#endif
  branch = newMergertrienode(trierep);
  branch->suffixinfo = *suffixinfo;
  branch->rightsibling = oldnode->rightsibling;
  oldchar = getfirstedgechar(trierep,oldnode,currentdepth);
  if (suffixinfo->startpos + currentdepth <
      gt_encseq_total_length(eri->encseqptr))
  {
    newchar = gt_encseq_get_encoded_char(eri->encseqptr,
                                         suffixinfo->startpos + currentdepth,
                                         eri->readmode);
  } else
  {
    newchar = (GtUchar) SEPARATOR;
  }
  leaf = mtrie_makenewleaf(trierep,suffixinfo);
  if (mtrie_comparecharacters(oldchar,oldnode->suffixinfo.idx,
                              newchar,suffixinfo->idx) > 0)
  {
    makesuccs(branch,leaf,oldnode);
  } else
  {
    makesuccs(branch,oldnode,leaf);
  }
  branch->depth = currentdepth;
  return branch;
}
/*
  Create a wavelet tree over <encseq>. The tree keeps its own references to
  <encseq> and its alphabet. The bits of all levels are first written to a
  plain, zero-initialised bit array by gt_wtree_encseq_fill_bits(), then
  turned into a compressed bit sequence; the plain array is freed afterwards.
  Returns the new tree.
*/
GtWtree* gt_wtree_encseq_new(GtEncseq *encseq)
{
  /* sample rate for compressed bitseq */
  const unsigned int samplerate = 32U;
  const size_t bits_per_word = sizeof (GtBitsequence) * CHAR_BIT;
  GtWtree *wtree = gt_wtree_create(gt_wtree_encseq_class());
  GtWtreeEncseq *wtree_encseq = gt_wtree_encseq_cast(wtree);

  wtree_encseq->encseq = gt_encseq_ref(encseq);
  wtree_encseq->alpha = gt_alphabet_ref(gt_encseq_alphabet(encseq));
  /* encoded chars + WC given by gt_alphabet_size,
     we have to encode UNDEFCHAR and SEPARATOR too */
  wtree_encseq->alpha_size = gt_alphabet_size(wtree_encseq->alpha) + 2;
  wtree->members->num_of_symbols = (GtUword) wtree_encseq->alpha_size;
  /* levels in tree: \lceil log_2(\sigma)\rceil */
  wtree_encseq->levels =
    gt_determinebitspervalue((GtUword) wtree_encseq->alpha_size);
  wtree_encseq->root_fo = gt_wtree_encseq_fill_offset_new();
  wtree_encseq->current_fo = wtree_encseq->root_fo;
  wtree->members->length = gt_encseq_total_length(encseq);

  /* each level has one bit per symbol of the sequence */
  wtree_encseq->num_of_bits =
    wtree_encseq->levels * wtree->members->length;
  /* number of words needed, rounded up */
  wtree_encseq->bits_size = wtree_encseq->num_of_bits / bits_per_word;
  if (wtree_encseq->num_of_bits % bits_per_word != 0)
  {
    wtree_encseq->bits_size++;
  }
  wtree_encseq->bits = gt_calloc((size_t) wtree_encseq->bits_size,
                                 sizeof (GtBitsequence));
  wtree_encseq->node_start = 0;
  gt_wtree_encseq_fill_bits(wtree_encseq);

  wtree_encseq->c_bits = gt_compressed_bitsequence_new(
                                                  wtree_encseq->bits,
                                                  samplerate,
                                                  wtree_encseq->num_of_bits);
  gt_free(wtree_encseq->bits);
  wtree_encseq->bits = NULL;
  return wtree;
}