/*
 * Check that <numberofsuffixes> consecutive entries of <suffixsortspace>
 * (addressed relative to <subbucketleft>) are in non-descending order.
 *
 * Each entry is compared with its predecessor by
 * gt_encseq_check_comparetwosuffixes(). An entry whose position is not
 * smaller than the total length of <encseq> is not compared, but it still
 * becomes the predecessor of the following entry. If a predecessor compares
 * greater than its successor, the failure is reported through
 * showcomparisonfailure() (tagged with <filename> and <line>) and the
 * process exits with GT_EXIT_PROGRAMMING_ERROR.
 *
 * Preconditions (asserted): <numberofsuffixes> > 0, the first entry lies
 * inside the sequence, and <specialsareequal> implies
 * <specialsareequalatdepth0>. When <depth> is non-zero the value delivered
 * in maxlcp by the comparison must not exceed <depth>.
 */
void gt_checksortedsuffixes(const char *filename,
                            int line,
                            const GtEncseq *encseq,
                            GtReadmode readmode,
                            const GtSuffixsortspace *suffixsortspace,
                            GtUword subbucketleft,
                            GtUword numberofsuffixes,
                            bool specialsareequal,
                            bool specialsareequalatdepth0,
                            GtUword depth)
{
  const GtUword totallength = gt_encseq_total_length(encseq);
  GtUword rank, previous, current, maxlcp;
  GtEncseqReader *left_reader, *right_reader;

  gt_assert(!specialsareequal || specialsareequalatdepth0);
  left_reader = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  right_reader = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  gt_assert(numberofsuffixes > 0);
  previous = gt_suffixsortspace_get(suffixsortspace, subbucketleft, 0);
  gt_assert(previous < totallength);

  /* the increment expression also runs on 'continue', so the current entry
     always becomes the predecessor of the next one */
  for (rank = 1UL; rank < numberofsuffixes; rank++, previous = current)
  {
    int cmp;

    current = gt_suffixsortspace_get(suffixsortspace, subbucketleft, rank);
    if (current >= totallength)
    {
      continue;
    }
    cmp = gt_encseq_check_comparetwosuffixes(encseq,
                                             readmode,
                                             &maxlcp,
                                             specialsareequal,
                                             specialsareequalatdepth0,
                                             depth,
                                             previous,
                                             current,
                                             left_reader,
                                             right_reader);
    if (cmp > 0)
    {
      showcomparisonfailure(filename,
                            line,
                            "checksortedsuffixes",
                            encseq,
                            readmode,
                            suffixsortspace,
                            subbucketleft,
                            depth,
                            rank - 1,
                            rank,
                            cmp,
                            maxlcp);
      exit(GT_EXIT_PROGRAMMING_ERROR);
    }
    gt_assert(depth == 0 || maxlcp <= depth);
  }
  gt_encseq_reader_delete(left_reader);
  gt_encseq_reader_delete(right_reader);
}
/*
 * Write sequence number <seqnum> of <encseq> to stdout as one FASTA record:
 * the separator character, the description (only if <encseq> has
 * description support; otherwise the header line is empty), a newline, all
 * decoded characters of the sequence on a single line, and a final newline.
 */
static void gt_seqorder_output(unsigned long seqnum, GtEncseq *encseq)
{
  const unsigned long seqstart = gt_encseq_seqstartpos(encseq, seqnum),
                      seqlen = gt_encseq_seqlength(encseq, seqnum);
  unsigned long remaining;
  GtEncseqReader *reader;

  /* header line */
  gt_xfputc(GT_FASTA_SEPARATOR, stdout);
  if (gt_encseq_has_description_support(encseq))
  {
    unsigned long desclen = 0;
    const char *desc = gt_encseq_description(encseq, &desclen, seqnum);

    gt_xfwrite(desc, (size_t) 1, (size_t) desclen, stdout);
  }
  gt_xfputc('\n', stdout);

  /* sequence line */
  reader = gt_encseq_create_reader_with_readmode(encseq,
                                                 GT_READMODE_FORWARD,
                                                 seqstart);
  for (remaining = seqlen; remaining > 0; remaining--)
  {
    gt_xfputc(gt_encseq_reader_next_decoded_char(reader), stdout);
  }
  gt_encseq_reader_delete(reader);
  gt_xfputc('\n', stdout);
}
/*
 * Lua-callable function: fetch the encseq reader handle passed as argument 1
 * (validated by check_encseq_reader()) and delete the reader it points to.
 * Always returns 0.
 */
static int encseq_reader_lua_delete(lua_State *L)
{
  GtEncseqReader **handle = check_encseq_reader(L, 1);

  gt_encseq_reader_delete(*handle);
  return 0;
}
/*
 * Check that <width> consecutive entries of <suffixsortspace> (addressed
 * relative to <subbucketleft>) all agree on their first <depth> symbols.
 *
 * Every pair of neighbouring entries is compared up to <depth> with
 * gt_encseq_check_comparetwosuffixes(). A pair passes only if the comparison
 * result is 0 and the reported maxlcp equals <depth>. Otherwise the failure
 * is reported through showcomparisonfailure() (tagged with <filename> and
 * <line>) and the process exits with GT_EXIT_PROGRAMMING_ERROR.
 *
 * <depth> must be positive (asserted). With fewer than two entries there is
 * nothing to compare and the function simply returns.
 */
void gt_checkifprefixesareidentical(const char *filename,
                                    int line,
                                    const GtEncseq *encseq,
                                    GtReadmode readmode,
                                    const GtSuffixsortspace *suffixsortspace,
                                    GtUword subbucketleft,
                                    GtUword width,
                                    GtUword depth)
{
  GtUword idx, maxlcp, pos1, pos2;
  int cmp;
  GtEncseqReader *esr1, *esr2;

  /* check the precondition before any resource is acquired */
  gt_assert(depth > 0);
  esr1 = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  esr2 = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);
  /* 'idx + 1 < width' instead of 'idx < width - 1': the latter wraps around
     for width == 0 and would read far beyond the bucket */
  for (idx = 0; idx + 1 < width; idx++)
  {
    pos1 = gt_suffixsortspace_get(suffixsortspace, subbucketleft, idx);
    pos2 = gt_suffixsortspace_get(suffixsortspace, subbucketleft, idx + 1);
    cmp = gt_encseq_check_comparetwosuffixes(encseq,
                                             readmode,
                                             &maxlcp,
                                             false, /* specialsareequal */
                                             true,  /* specialsareequalatdepth0
                                                    */
                                             depth,
                                             pos1,
                                             pos2,
                                             esr1,
                                             esr2);
    gt_assert(maxlcp <= depth);
    if (cmp != 0 || maxlcp < depth)
    {
      showcomparisonfailure(filename,
                            line,
                            "checkifprefixesareidentical",
                            encseq,
                            readmode,
                            suffixsortspace,
                            subbucketleft,
                            depth,
                            idx, idx + 1, cmp, maxlcp);
      exit(GT_EXIT_PROGRAMMING_ERROR);
    }
  }
  gt_encseq_reader_delete(esr1);
  gt_encseq_reader_delete(esr2);
}
/*
 * Release <mmsi>: delete the encseq reader it holds, then free the iterator
 * itself. Passing NULL is allowed and does nothing.
 */
void gt_mmsearchiterator_delete(GtMMsearchiterator *mmsi)
{
  if (mmsi == NULL)
  {
    return;
  }
  gt_encseq_reader_delete(mmsi->esr);
  gt_free(mmsi);
}
/*
 * Release <kmercodeiterator> together with the resources it holds: its
 * encseq reader, its k-mer stream and its sequence buffer (in that order),
 * then the iterator itself. Passing NULL is allowed and does nothing.
 */
void gt_kmercodeiterator_delete(GtKmercodeiterator *kmercodeiterator)
{
  if (kmercodeiterator == NULL)
  {
    return;
  }
  gt_encseq_reader_delete(kmercodeiterator->esr);
  kmerstream_delete(kmercodeiterator->spwp);
  gt_sequence_buffer_delete(kmercodeiterator->fb);
  gt_free(kmercodeiterator);
}
/*
 * Release a Myersonlineresources object: free both eqs vectors (reverse one
 * first), delete the encseq reader, then free the object itself.
 * Passing NULL is allowed and does nothing.
 */
void gt_freeMyersonlineresources(Myersonlineresources *ptrmyersonlineresources)
{
  if (ptrmyersonlineresources == NULL)
  {
    return;
  }
  gt_free(ptrmyersonlineresources->eqsvectorrev);
  gt_free(ptrmyersonlineresources->eqsvector);
  gt_encseq_reader_delete(ptrmyersonlineresources->esr);
  gt_free(ptrmyersonlineresources);
}
/*
 * Enumerate the k-mer codes of <encseq> (read in <readmode>) with a sliding
 * window of width <kmersize> and hand each code to <processkmercode>.
 *
 * The callback receives <processkmercodeinfo>, the start position of the
 * window and a pointer to the current code. It is called
 *   - once for position 0 after the first <kmersize> symbols were read,
 *   - once for every further symbol of the sequence, and
 *   - <kmersize> more times while the window is shifted over WILDCARD
 *     symbols past the end of the sequence.
 * If the sequence is shorter than <kmersize> nothing is done.
 */
void getencseqkmers(const GtEncseq *encseq,
                    GtReadmode readmode,
                    unsigned int kmersize,
                    void(*processkmercode)(void *,
                                           unsigned long,
                                           const GtKmercode *),
                    void *processkmercodeinfo)
{
  const unsigned long totallength = gt_encseq_total_length(encseq);
  unsigned long currentposition;
  unsigned int numofchars, overshoot;
  Kmerstream *spwp;
  GtEncseqReader *esr;
  GtUchar charcode;

  if (totallength < (unsigned long) kmersize)
  {
    return;
  }
  numofchars = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  spwp = kmerstream_new(numofchars, kmersize);
  esr = gt_encseq_create_reader_with_readmode(encseq, readmode, 0);

  /* phase 1: fill the window with the first kmersize symbols */
  currentposition = 0;
  while (currentposition < (unsigned long) kmersize)
  {
    charcode = gt_encseq_reader_next_encoded_char(esr);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    spwp->windowwidth++;
    updatespecialpositions(spwp, charcode, false, 0);
    spwp->cyclicwindow[spwp->windowwidth-1] = charcode;
    currentposition++;
  }
  kmerstream_newcode(&spwp->currentkmercode, spwp);
  processkmercode(processkmercodeinfo, 0, &spwp->currentkmercode);

  /* phase 2: slide the window over the remaining symbols;
     currentposition equals kmersize at this point */
  for (/* Nothing */; currentposition < totallength; currentposition++)
  {
    charcode = gt_encseq_reader_next_encoded_char(esr);
    GT_CHECKENCCHAR(charcode,encseq,currentposition,readmode);
    shiftrightwithchar(spwp, charcode);
    kmerstream_newcode(&spwp->currentkmercode, spwp);
    processkmercode(processkmercodeinfo,
                    currentposition + 1 - spwp->kmersize,
                    &spwp->currentkmercode);
  }
  gt_encseq_reader_delete(esr);

  /* phase 3: shift wildcards in so that the windows overlapping the end of
     the sequence are reported as well */
  for (overshoot = 0; overshoot < kmersize; overshoot++)
  {
    shiftrightwithchar(spwp, (GtUchar) WILDCARD);
    kmerstream_newcode(&spwp->currentkmercode, spwp);
    processkmercode(processkmercodeinfo,
                    overshoot + currentposition + 1 - spwp->kmersize,
                    &spwp->currentkmercode);
  }
  kmerstream_delete(spwp);
}
/*
 * Print <wlen> symbols of <encseq>, starting at position <start> and read in
 * <readmode>, to <fpout>.
 *
 * Ordinary symbols are echoed with gt_alphabet_echo_pretty_symbol() and a
 * newline is inserted after every <width> of them. A SEPARATOR symbol is
 * printed as "\n>\n" and restarts the column count. After the last requested
 * symbol a newline is written (no additional line break is emitted if the
 * line happened to be full at that point).
 *
 * <width> must be positive (asserted). For <wlen> == 0 nothing is written.
 */
void gt_encseq2symbolstring(FILE *fpout,
                            const GtEncseq *encseq,
                            GtReadmode readmode,
                            unsigned long start,
                            unsigned long wlen,
                            unsigned long width)
{
  unsigned long j, idx, lastpos;
  GtUchar currentchar;
  GtEncseqReader *esr;
  const GtAlphabet *alpha;

  gt_assert(width > 0);
  if (wlen == 0)
  {
    /* empty range: 'start + wlen - 1' below would wrap around and the loop
       would run past the requested range */
    return;
  }
  lastpos = start + wlen - 1;
  alpha = gt_encseq_alphabet(encseq);
  esr = gt_encseq_create_reader_with_readmode(encseq, readmode, start);
  for (idx = start, j = 0; /* Nothing */ ; idx++)
  {
    currentchar = gt_encseq_reader_next_encoded_char(esr);
    if (currentchar == (GtUchar) SEPARATOR)
    {
      fprintf(fpout,"\n>\n");
      j = 0;
    } else
    {
      gt_alphabet_echo_pretty_symbol(alpha, fpout, currentchar);
    }
    if (idx == lastpos)
    {
      fprintf(fpout,"\n");
      break;
    }
    if (currentchar != (GtUchar) SEPARATOR)
    {
      j++;
      if (j >= width)
      {
        /* line is full */
        fprintf(fpout,"\n");
        j = 0;
      }
    }
  }
  gt_encseq_reader_delete(esr);
}
/*
 * Re-position <kmercodeiterator> so that it starts at <startpos> of its
 * encoded sequence, reading in <readmode>.
 *
 * Asserted preconditions: a reverse <readmode> is only allowed together with
 * <startpos> == 0, and <startpos> must lie inside the sequence.
 *
 * If fewer than kmersize symbols remain from <startpos>, the iterator is
 * marked as exhausted and its reader and k-mer stream are deleted and set to
 * NULL. Otherwise the reader is re-initialised, the k-mer stream is reset
 * and the first kmersize symbols are loaded into the window; afterwards
 * currentposition equals <startpos> + kmersize.
 * In both cases the sequence buffer pointer fb is set to NULL.
 */
void gt_kmercodeiterator_reset(GtKmercodeiterator *kmercodeiterator,
                               GtReadmode readmode,
                               GtUword startpos)
{
  const GtEncseq *encseq = kmercodeiterator->encseq;
  const GtUword kmersize = (GtUword) kmercodeiterator->spwp->kmersize;
  GtUword windowend;
  GtUchar charcode;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator->totallength = gt_encseq_total_length(encseq);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);
  kmercodeiterator->fb = NULL;

  if (kmercodeiterator->totallength - startpos < kmersize)
  {
    /* not even one complete k-mer left */
    kmercodeiterator->inputexhausted = true;
    gt_encseq_reader_delete(kmercodeiterator->esr);
    kmercodeiterator->esr = NULL;
    kmerstream_delete(kmercodeiterator->spwp);
    kmercodeiterator->spwp = NULL;
    return;
  }

  kmercodeiterator->inputexhausted = false;
  kmercodeiterator->readmode = readmode;
  gt_encseq_reader_reinit_with_readmode(kmercodeiterator->esr,
                                        encseq,
                                        readmode,
                                        startpos);
  kmerstream_reset(kmercodeiterator->spwp);
  kmercodeiterator->hasprocessedfirst = false;

  /* preload the window with the first kmersize symbols */
  windowend = startpos + kmersize;
  kmercodeiterator->currentposition = startpos;
  while (kmercodeiterator->currentposition < windowend)
  {
    charcode = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);
    kmercodeiterator->spwp->windowwidth++;
    kmerstream_updatespecialpositions(kmercodeiterator->spwp,
                                      charcode,
                                      false,
                                      0);
    kmercodeiterator->spwp->cyclicwindow[
                        kmercodeiterator->spwp->windowwidth-1] = charcode;
    kmercodeiterator->currentposition++;
  }
}
/*
 * Fill the bit vector of <we> by scanning its encoded sequence once per
 * level, in forward direction from position 0.
 *
 * For every symbol gt_wtree_encseq_set_nodestart_and_current_fo() selects
 * the node start and the current fill offset for the level. If it returns
 * true no bit is set and the left_size counter of the current fill offset is
 * incremented; otherwise the bit at node_start + offset is set. In both
 * cases the offset advances by one. Symbols for which no current fill offset
 * is available are skipped.
 *
 * When all levels are done the reader and the fill-offset structure are
 * deleted, and the encoded sequence held by <we> is deleted and set to NULL.
 */
static void gt_wtree_encseq_fill_bits(GtWtreeEncseq *we)
{
  unsigned int level_idx;
  GtUword sym_idx;
  GtEncseqReader *er;

  /* validate the argument before its first dereference */
  gt_assert(we != NULL);
  er = gt_encseq_create_reader_with_readmode(we->encseq,
                                             GT_READMODE_FORWARD,
                                             0);
  for (level_idx = 0; level_idx < we->levels; level_idx++)
  {
    for (sym_idx = 0;
         sym_idx < we->parent_instance.members->length;
         sym_idx++)
    {
      GtWtreeSymbol c_sym =
        gt_wtree_encseq_map(we, gt_encseq_reader_next_encoded_char(er));

      if (gt_wtree_encseq_set_nodestart_and_current_fo(we, level_idx, c_sym))
      {
        /* 0-bit: nothing to set, only account for the symbol */
        if (we->current_fo != NULL)
        {
          gt_assert(we->node_start + we->current_fo->offset <
                    we->num_of_bits);
          we->current_fo->offset++;
          we->current_fo->left_size++;
        }
      } else
      {
        /* 1-bit */
        if (we->current_fo != NULL)
        {
          gt_assert(we->node_start + we->current_fo->offset <
                    we->num_of_bits);
          GT_SETIBIT(we->bits, we->node_start + we->current_fo->offset);
          we->current_fo->offset++;
        }
      }
    }
    /* rewind to the start of the sequence for the next level */
    gt_encseq_reader_reinit_with_readmode(er,
                                          we->encseq,
                                          GT_READMODE_FORWARD,
                                          0);
  }
  gt_encseq_reader_delete(er);
  gt_wtree_encseq_fill_offset_delete(we->root_fo);
  we->root_fo = we->current_fo = NULL;
  /* the sequence is no longer needed once the bits are filled */
  gt_encseq_delete(we->encseq);
  we->encseq = NULL;
}
void gt_lookaheadsearchPSSM(const GtEncseq *encseq, const Profilematrix *prof) { unsigned long firstpos, bufsize; GtUchar currentchar; unsigned long pos; GtEncseqReader *esr; unsigned long totallength = gt_encseq_total_length(encseq); GtUchar *buffer; esr = gt_encseq_create_reader_with_readmode(encseq,GT_READMODE_FORWARD,0); buffer = gt_malloc(sizeof *buffer * prof->dimension); firstpos = bufsize = 0; for (pos=0; pos < totallength; pos++) { currentchar = gt_encseq_reader_next_encoded_char(esr); if (ISSPECIAL(currentchar)) { bufsize = firstpos = 0; } else { if (bufsize < prof->dimension) { buffer[bufsize++] = currentchar; } else { buffer[firstpos++] = currentchar; if (firstpos == prof->dimension) { firstpos = 0; } } } } gt_encseq_reader_delete(esr); gt_free(buffer); }
/*
 * Decode <encseq> to stdout according to <args>.
 *
 * Mode "fasta": write one FASTA record per selected sequence. The selection
 * is a single sequence (args->seq), a closed range of sequence numbers
 * (args->seqrng) or, if neither is defined, all sequences. For a reverse
 * read mode (args->rm) record number i is taken from sequence
 * (number of sequences - 1 - i) and its start position is mirrored at the
 * end of the encoded sequence. If <encseq> has no description support a
 * warning naming <filename> is issued once and every record is labelled
 * "sequence <i>".
 *
 * Mode "concat": write the positions args->rng.start..args->rng.end (or the
 * complete encoded sequence if no range is defined) on one line, replacing
 * every SEPARATOR by the first character of args->sepchar.
 *
 * In both modes args->singlechars selects random access per character
 * instead of a sequential reader.
 *
 * Returns 0 on success and -1 with <err> set if a requested sequence number
 * or range end is out of bounds.
 */
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword i, j, sfrom, sto;
  int had_err = 0;
  bool has_desc;
  GtEncseqReader *esr;

  gt_assert(encseq);
  if (!(has_desc = gt_encseq_has_description_support(encseq)))
    gt_warning("Missing description support for file %s", filename);

  if (strcmp(gt_str_get(args->mode), "fasta") == 0) {
    const GtUword numofseqs = gt_encseq_num_of_sequences(encseq);

    if (args->seq != GT_UNDEF_UWORD) {
      /* specify a single sequence to extract */
      if (args->seq >= numofseqs) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq, numofseqs);
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    } else if (args->seqrng.start != GT_UNDEF_UWORD
                 && args->seqrng.end != GT_UNDEF_UWORD) {
      /* specify a sequence range to extract */
      if (args->seqrng.start >= numofseqs || args->seqrng.end >= numofseqs) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start, args->seqrng.end, numofseqs);
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    } else {
      /* extract all sequences */
      sfrom = 0;
      sto = numofseqs;
    }

    for (i = sfrom; i < sto; i++) {
      GtUword desclen, startpos, len, srcseq;
      char buf[BUFSIZ];
      const char *desc = NULL;

      /* XXX: maybe make this distinction in the functions via readmode? */
      if (!GT_ISDIRREVERSE(args->rm)) {
        srcseq = i;
        startpos = gt_encseq_seqstartpos(encseq, srcseq);
        len = gt_encseq_seqlength(encseq, srcseq);
      } else {
        /* in reverse direction record i is the i-th sequence counted from
           the end; its start is mirrored at the end of the encseq */
        srcseq = numofseqs - 1 - i;
        len = gt_encseq_seqlength(encseq, srcseq);
        startpos = gt_encseq_total_length(encseq)
                     - (gt_encseq_seqstartpos(encseq, srcseq) + len);
      }
      if (has_desc) {
        desc = gt_encseq_description(encseq, &desclen, srcseq);
      } else {
        /* the fallback label uses the record number, not srcseq */
        (void) snprintf(buf, sizeof buf, "sequence "GT_WU"", i);
        desclen = strlen(buf);
        desc = buf;
      }
      gt_assert(desc);

      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);

      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_get_decoded_char(encseq, startpos + j,
                                               args->rm),
                    stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm,
                                                    startpos);
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(gt_str_get(args->mode), "concat") == 0) {
    GtUword from = 0,
            to = gt_encseq_total_length(encseq) - 1;

    if (args->rng.start != GT_UNDEF_UWORD
          && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      }
      if (!had_err) {
        from = args->rng.start;
        to = args->rng.end;
      }
    }
    if (!had_err) {
      if (args->singlechars) {
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
/*
 * Run a depth-first traversal (gt_depthfirstesa) over the suffix array read
 * by <ssar> with the tyr_* callbacks and a freshly allocated TyrDfsstate.
 *
 * If <storeindex> is the empty string, occurrence counts are collected via
 * adddistpos2distribution and showfinalstatistics() is called for
 * <inputindex>. Otherwise the file <storeindex>MERSUFFIX is opened for
 * binary writing (and, if <storecounts> is set, <storeindex>COUNTSSUFFIX as
 * well) and outputsortedstring2index is installed as count processor.
 *
 * <mersize> must not exceed the total length of the encoded sequence;
 * otherwise <err> is set. <minocc>, <maxocc> and <performtest> are only
 * stored in the state here; they are evaluated by code outside this
 * function.
 *
 * Returns 0 on success and -1 if any step failed. All resources allocated
 * here are released on every path.
 */
static int enumeratelcpintervals(const char *inputindex,
                                 Sequentialsuffixarrayreader *ssar,
                                 const char *storeindex,
                                 bool storecounts,
                                 GtUword mersize,
                                 GtUword minocc,
                                 GtUword maxocc,
                                 bool performtest,
                                 GtLogger *logger,
                                 GtError *err)
{
  TyrDfsstate *state;
  bool haserr = false;
  unsigned int alphasize;

  gt_error_check(err);
  /* set up the traversal state */
  state = gt_malloc(sizeof (*state));
  GT_INITARRAY(&state->occdistribution,Countwithpositions);
  state->esrspace = gt_encseq_create_reader_with_readmode(
                                   gt_encseqSequentialsuffixarrayreader(ssar),
                                   gt_readmodeSequentialsuffixarrayreader(ssar),
                                   0);
  state->mersize = (GtUword) mersize;
  state->encseq = gt_encseqSequentialsuffixarrayreader(ssar);
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(state->encseq));
  state->readmode = gt_readmodeSequentialsuffixarrayreader(ssar);
  state->storecounts = storecounts;
  state->minocc = minocc;
  state->maxocc = maxocc;
  state->totallength = gt_encseq_total_length(state->encseq);
  state->performtest = performtest;
  state->countoutputmers = 0;
  state->merindexfpout = NULL;
  state->countsfilefpout = NULL;
  GT_INITARRAY(&state->largecounts,Largecount);
  /* the byte buffer is only needed when an index is written */
  if (strlen(storeindex) == 0)
  {
    state->sizeofbuffer = 0;
    state->bytebuffer = NULL;
  } else
  {
    state->sizeofbuffer = MERBYTES(mersize);
    state->bytebuffer = gt_malloc(sizeof *state->bytebuffer
                                  * state->sizeofbuffer);
  }
  /* test mode additionally needs a buffer for one mer and the suffix table */
  if (performtest)
  {
    state->currentmer = gt_malloc(sizeof *state->currentmer
                                  * state->mersize);
    state->suftab = gt_suftabSequentialsuffixarrayreader(ssar);
  } else
  {
    state->currentmer = NULL;
    state->suftab = NULL;
  }
  if (state->mersize > state->totallength)
  {
    gt_error_set(err,"mersize "GT_WU" > "GT_WU" = totallength not allowed",
                 state->mersize,
                 state->totallength);
    haserr = true;
  } else
  {
    /* choose the count processor and open the output files if requested */
    if (strlen(storeindex) == 0)
    {
      state->processoccurrencecount = adddistpos2distribution;
    } else
    {
      state->merindexfpout = gt_fa_fopen_with_suffix(storeindex,MERSUFFIX,
                                                     "wb",err);
      if (state->merindexfpout == NULL)
      {
        haserr = true;
      } else
      {
        if (state->storecounts)
        {
          state->countsfilefpout
            = gt_fa_fopen_with_suffix(storeindex,COUNTSSUFFIX,"wb",err);
          if (state->countsfilefpout == NULL)
          {
            haserr = true;
          }
        }
      }
      state->processoccurrencecount = outputsortedstring2index;
    }
    if (!haserr)
    {
      if (gt_depthfirstesa(ssar,
                           tyr_allocateDfsinfo,
                           tyr_freeDfsinfo,
                           tyr_processleafedge,
                           NULL,
                           tyr_processcompletenode,
                           tyr_assignleftmostleaf,
                           tyr_assignrightmostleaf,
                           (Dfsstate*) state,
                           logger,
                           err) != 0)
      {
        haserr = true;
      }
      /* NOTE(review): the statistics are shown even if the traversal above
         just failed - confirm that this is intended */
      if (strlen(storeindex) == 0)
      {
        showfinalstatistics(state,inputindex,logger);
      }
    }
    if (!haserr)
    {
      /* write the collected large counts to the counts file, if any */
      if (state->countsfilefpout != NULL)
      {
        gt_logger_log(logger,"write "GT_WU" mercounts > "GT_WU
                      " to file \"%s%s\"",
                      state->largecounts.nextfreeLargecount,
                      (GtUword) MAXSMALLMERCOUNT,
                      storeindex,
                      COUNTSSUFFIX);
        gt_xfwrite(state->largecounts.spaceLargecount, sizeof (Largecount),
                   (size_t) state->largecounts.nextfreeLargecount,
                   state->countsfilefpout);
      }
    }
    if (!haserr)
    {
      gt_logger_log(logger,"number of "GT_WU"-mers in index: "GT_WU"",
                    mersize,
                    state->countoutputmers);
      gt_logger_log(logger,"index size: %.2f megabytes\n",
                    GT_MEGABYTES(state->countoutputmers * state->sizeofbuffer
                                 + sizeof (GtUword) * EXTRAINTEGERS));
    }
  }
  /* append the trailing integer values (mersize and alphabet size) to the
     mer index file */
  if (!haserr && state->merindexfpout != NULL)
  {
    outputbytewiseUlongvalue(state->merindexfpout,
                             (GtUword) state->mersize);
    outputbytewiseUlongvalue(state->merindexfpout,(GtUword) alphasize);
  }
  /* cleanup, executed on success and on failure */
  gt_fa_xfclose(state->merindexfpout);
  gt_fa_xfclose(state->countsfilefpout);
  GT_FREEARRAY(&state->occdistribution,Countwithpositions);
  gt_free(state->currentmer);
  gt_free(state->bytebuffer);
  GT_FREEARRAY(&state->largecounts,Largecount);
  gt_encseq_reader_delete(state->esrspace);
  gt_free(state);
  return haserr ? -1 : 0;
}
/*
 * Verify <bwtSeq> against the reference suffix array project <projectName>
 * (suffix table and encoded sequence are mapped for this purpose).
 *
 * The following checks are performed; the first failing one sets <err> and
 * determines the return value:
 *  - the BWT sequence length must equal the encoded sequence length + 1;
 *  - VERIFY_BWTSEQ_SUFVAL in <checkFlags>: every position carrying locate
 *    information must yield the value stored in the reference suffix table.
 *    If <tickPrint> is non-zero a '.' is written to <fp> after every
 *    <tickPrint> positions, followed by a newline at the end;
 *  - if locate information is present: the terminator position must match
 *    the "longest" value of the suffix array (if that is defined);
 *  - VERIFY_BWTSEQ_LFMAPWALK: starting at the terminator position the
 *    sequence is regenerated backwards via LF-mapping and compared symbol by
 *    symbol with the encoded sequence (requires BWTReversiblySorted);
 *  - VERIFY_BWTSEQ_CONTEXT: randomly chosen substrings obtained from the
 *    context retriever are compared with the encoded sequence; a position
 *    past the end of the encoded sequence must hold UNDEFBWTCHAR.
 *
 * <bwtSeq>, <projectName> and <err> must not be NULL (asserted).
 * Progress notes are written to stderr.
 *
 * Returns VERIFY_BWTSEQ_NO_ERROR if all requested checks pass, otherwise the
 * code of the first failing check.
 */
enum verifyBWTSeqErrCode gt_BWTSeqVerifyIntegrity(BWTSeq *bwtSeq,
                                                  const char *projectName,
                                                  int checkFlags,
                                                  GtUword tickPrint,
                                                  FILE *fp,
                                                  GtLogger *verbosity,
                                                  GtError *err)
{
  Suffixarray suffixArray;
  struct extBitsRetrieval extBits;
  bool suffixArrayIsInitialized = false, extBitsAreInitialized = false;
  enum verifyBWTSeqErrCode retval = VERIFY_BWTSEQ_NO_ERROR;

  /* single-pass loop: 'break' jumps to the common cleanup below */
  do
  {
    GtUword seqLen;

    gt_assert(bwtSeq && projectName && err);
    gt_error_check(err);
    initExtBitsRetrieval(&extBits);
    extBitsAreInitialized = true;
    if (gt_mapsuffixarray(&suffixArray,
                          SARR_SUFTAB | SARR_ESQTAB,
                          projectName, verbosity, err))
    {
      gt_error_set(err, "Cannot load reference suffix array project with"
                   " demand for suffix table file and encoded sequence"
                   " for project: %s", projectName);
      retval = VERIFY_BWTSEQ_REFLOAD_ERROR;
      break;
    }
    suffixArrayIsInitialized = true;
    /* the BWT sequence has one symbol more than the encoded sequence */
    seqLen = gt_encseq_total_length(suffixArray.encseq) + 1;
    if (BWTSeqLength(bwtSeq) != seqLen)
    {
      gt_error_set(err, "length mismatch for suffix array project %s and "
                   "bwt sequence index", projectName);
      retval = VERIFY_BWTSEQ_LENCOMPARE_ERROR;
      break;
    }

    /* check 1: suffix array values */
    if (checkFlags & VERIFY_BWTSEQ_SUFVAL
        && BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword i;
      for (i = 0; i < seqLen && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
      {
        if (gt_BWTSeqPosHasLocateInfo(bwtSeq, i, &extBits))
        {
          GtUword sfxArrayValue = gt_BWTSeqLocateMatch(bwtSeq, i, &extBits);
          if (sfxArrayValue != ESASUFFIXPTRGET(suffixArray.suftab,i))
          {
            gt_error_set(err, "Failed suffix array value comparison"
                         " at position "GT_WU": "GT_WU" != "GT_WU"",
                         i, sfxArrayValue,
                         ESASUFFIXPTRGET(suffixArray.suftab,i));
            retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
            break;
          }
        }
        if (tickPrint && !((i + 1) % tickPrint))
          putc('.', fp);
      }
      if (tickPrint)
        putc('\n', fp);
      if (retval != VERIFY_BWTSEQ_NO_ERROR)
        break;
    }
    else if (checkFlags & VERIFY_BWTSEQ_SUFVAL)
    {
      gt_error_set(err, "check of suffix array values was requested,"
                   " but index contains no locate information!");
      retval = VERIFY_BWTSEQ_SUFVAL_ERROR;
      break;
    }
    else if (!(checkFlags & VERIFY_BWTSEQ_SUFVAL)
             && BWTSeqHasLocateInformation(bwtSeq))
    {
      fputs("Not checking suftab values.\n", stderr);
    }

    /* check 2: terminator position and backwards regeneration */
    if (BWTSeqHasLocateInformation(bwtSeq))
    {
      GtUword nextLocate = BWTSeqTerminatorPos(bwtSeq);
      if (suffixArray.longest.defined &&
          suffixArray.longest.valueunsignedlong != nextLocate)
      {
        gt_error_set(err, "terminator/0-rotation position mismatch "GT_WU""
                     " vs. "GT_WU"", suffixArray.longest.valueunsignedlong,
                     nextLocate);
        retval = VERIFY_BWTSEQ_TERMPOS_ERROR;
        break;
      }
      if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
          && (bwtSeq->featureToggles & BWTReversiblySorted))
      {
        GtUword i = seqLen;
        /* handle first symbol specially because the encseq
         * will not return the terminator symbol */
        {
          Symbol sym = BWTSeqGetSym(bwtSeq, nextLocate);
          if (sym != UNDEFBWTCHAR)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                         "%d vs. reference symbol %d", i - 1, (int)sym,
                         (int)UNDEFBWTCHAR);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          --i;
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        /* walk the remaining symbols from the end to the start */
        while (i > 0)
        {
          Symbol symRef = gt_encseq_get_encoded_char(suffixArray.encseq,
                                                     --i,
                                                     suffixArray.readmode);
          Symbol symCmp = BWTSeqGetSym(bwtSeq, nextLocate);
          if (symCmp != symRef)
          {
            gt_error_set(err, "symbol mismatch at position "GT_WU": "
                         "%d vs. reference symbol %d", i, (int)symCmp,
                         (int)symRef);
            retval = VERIFY_BWTSEQ_LFMAPWALK_ERROR;
            break;
          }
          nextLocate = BWTSeqLFMap(bwtSeq, nextLocate, &extBits);
        }
        if (retval != VERIFY_BWTSEQ_NO_ERROR)
          break;
      }
      else if ((checkFlags & VERIFY_BWTSEQ_LFMAPWALK)
               && !(bwtSeq->featureToggles & BWTReversiblySorted))
      {
        gt_error_set(err, "requested complete backwards regeneration in index"
                     " without regeneration capability");
        retval = VERIFY_BWTSEQ_LFMAPWALK_IMP_ERROR;
        break;
      }
    }

    /* check 3: context regeneration on random substrings */
    if (checkFlags & VERIFY_BWTSEQ_CONTEXT)
    {
      BWTSeqContextRetriever *bwtSeqCR =
        gt_BWTSeqCRLoad(bwtSeq, projectName, CTX_MAP_ILOG_AUTOSIZE);
      if (!bwtSeqCR)
      {
        gt_error_set(err, "cannot load BWT sequence context access table"
                     " for project %s", projectName);
        retval = VERIFY_BWTSEQ_CONTEXT_LOADFAIL;
        break;
      }
      fputs("Checking context regeneration.\n", stderr);
      {
        /* NOTE(review): if seqLen can be smaller than maxSubSeqLen, the
           expression seqLen - subSeqLen + 1 below wraps around - confirm
           against the values of the *_CONTEXT_* constants */
        GtUword i, start, subSeqLen,
          maxSubSeqLen = MIN(MAX(MIN_CONTEXT_LEN, seqLen/CONTEXT_FRACTION),
                             MAX_CONTEXT_LEN),
          numTries = MIN(MAX_NUM_CONTEXT_CHECKS,
                         MAX(2, seqLen/CONTEXT_INTERVAL));
        Symbol *contextBuf = gt_malloc(sizeof (Symbol) * MAX_CONTEXT_LEN);
        GtEncseqReader *esr =
          gt_encseq_create_reader_with_readmode(suffixArray.encseq,
                                                suffixArray.readmode,
                                                0);
        for (i = 0; i < numTries && retval == VERIFY_BWTSEQ_NO_ERROR; ++i)
        {
          GtUword j, end, inSubSeqLen;
          subSeqLen = random()%maxSubSeqLen + 1;
          start = random()%(seqLen - subSeqLen + 1);
          end = start + subSeqLen;
          /* the last position of the BWT sequence has no counterpart in the
             encoded sequence */
          inSubSeqLen = subSeqLen - ((end==seqLen)?1:0);
          gt_BWTSeqCRAccessSubseq(bwtSeqCR, start, subSeqLen, contextBuf);
          gt_encseq_reader_reinit_with_readmode(esr, suffixArray.encseq,
                                                suffixArray.readmode, start);
          for (j = 0; j < inSubSeqLen; ++j)
          {
            Symbol symRef = gt_encseq_reader_next_encoded_char(esr);
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                           "%d vs. reference symbol %d", start + j,
                           (int)symCmp, (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
          }
          /* only check the terminator slot if the comparison above passed;
             otherwise j still points into the compared part and a second,
             misleading error message would replace the first one */
          while (retval == VERIFY_BWTSEQ_NO_ERROR && j < subSeqLen)
          {
            Symbol symRef = UNDEFBWTCHAR;
            Symbol symCmp = contextBuf[j];
            if (symCmp != symRef)
            {
              gt_error_set(err, "symbol mismatch at position "GT_WU": "
                           "%d vs. reference symbol %d", start + j,
                           (int)symCmp, (int)symRef);
              retval = VERIFY_BWTSEQ_CONTEXT_SYMFAIL;
              break;
            }
            ++j;
          }
        }
        if (retval == VERIFY_BWTSEQ_NO_ERROR)
          fputs("Context regeneration completed successfully.\n", stderr);
        gt_encseq_reader_delete(esr);
        gt_free(contextBuf);
      }
      gt_deleteBWTSeqCR(bwtSeqCR);
    }
  } while (0);

  /* common cleanup: release only what was actually initialized */
  if (suffixArrayIsInitialized)
    gt_freesuffixarray(&suffixArray);
  if (extBitsAreInitialized)
    destructExtBitsRetrieval(&extBits);
  return retval;
}
/*
 * Compute one GC value per sequence of the DNA encoded sequence <encseq>
 * (the alphabet must be DNA; asserted).
 *
 * The sequence is scanned once in forward direction. Symbols encoding
 * a/A/t/T count as AT, symbols encoding c/C/g/G count as GC, everything else
 * is ignored. At every sequence boundary the counters are evaluated and
 * reset:
 *  - <calculate> true:  calculate_gc() stores the value for the sequence,
 *    receiving <with_special> and both counters;
 *  - <calculate> false: the plain GC count is stored.
 *
 * For a mirrored encseq the array has 2 * (number of sequences / 2) entries
 * and the second half is finally overwritten with the first half in reverse
 * order. Otherwise the array has one entry per sequence.
 *
 * Returns the array, allocated with gt_calloc(); presumably the caller is
 * expected to release it with gt_free(). <err> is unused.
 */
double *gt_encseq_get_gc(const GtEncseq *encseq,
                         bool with_special,
                         bool calculate,
                         GT_UNUSED GtError *err)
{
  GtEncseqReader *reader;
  GtAlphabet *alphabet;
  double *gc_content;
  unsigned long char_idx, totallength, max_unit,
                seq_idx = 0,
                nextsep = 0,
                at_count = 0,
                gc_count = 0;
  bool is_mirrored_encseq;
  GtUchar acgt[8], current_c;

  alphabet = gt_encseq_alphabet(encseq);
  gt_assert(gt_alphabet_is_dna(alphabet));
  /* acgt[0..3] hold the codes of a/A/t/T, acgt[4..7] those of c/C/g/G */
  gt_alphabet_encode_seq(alphabet, acgt, "aAtTcCgG", 8UL);
  totallength = gt_encseq_total_length(encseq);
  reader = gt_encseq_create_reader_with_readmode(encseq,
                                                 GT_READMODE_FORWARD,
                                                 0);
  is_mirrored_encseq = gt_encseq_is_mirrored(encseq);
  if (is_mirrored_encseq)
  {
    max_unit = GT_DIV2(gt_encseq_num_of_sequences(encseq));
    gc_content = gt_calloc((size_t) GT_MULT2(max_unit), sizeof (double));
  }
  else
  {
    max_unit = gt_encseq_num_of_sequences(encseq);
    gc_content = gt_calloc((size_t) max_unit, sizeof (double));
  }

  /* position of the separator following the first sequence */
  nextsep = gt_encseq_seqstartpos(encseq, seq_idx) +
            gt_encseq_seqlength(encseq, seq_idx);

  for (char_idx = 0; char_idx < totallength; char_idx++)
  {
    if (nextsep == char_idx)
    {
      /* sequence boundary: store the result and start the next sequence */
      if (calculate)
      {
        calculate_gc(encseq, gc_content, with_special, seq_idx,
                     gc_count, at_count);
      }
      else
      {
        gc_content[seq_idx] = (double) gc_count;
      }
      seq_idx++;
      nextsep = gt_encseq_seqstartpos(encseq, seq_idx) +
                gt_encseq_seqlength(encseq, seq_idx);
      /* skip the separator itself */
      gt_encseq_reader_reinit_with_readmode(reader, encseq,
                                            GT_READMODE_FORWARD,
                                            char_idx + 1UL);
      gc_count = at_count = 0UL;
      continue;
    }
    current_c = gt_encseq_reader_next_encoded_char(reader);
    if (current_c == acgt[0] || current_c == acgt[1] ||
        current_c == acgt[2] || current_c == acgt[3])
    {
      at_count++;
    }
    else if (current_c == acgt[4] || current_c == acgt[5] ||
             current_c == acgt[6] || current_c == acgt[7])
    {
      gc_count++;
    }
    /* all other symbols do not contribute to either counter */
  }

  /* the last sequence is not followed by a separator */
  if (calculate)
  {
    calculate_gc(encseq, gc_content, with_special, seq_idx,
                 gc_count, at_count);
  }
  else
  {
    gc_content[seq_idx] = (double) gc_count;
  }
  gt_encseq_reader_delete(reader);

  if (is_mirrored_encseq)
  {
    /* the mirrored half gets the values of the original half */
    unsigned long double_max_unit = GT_MULT2(max_unit);
    for (seq_idx = 0; seq_idx < max_unit; seq_idx++)
    {
      gc_content[double_max_unit - seq_idx - 1] = gc_content[seq_idx];
    }
  }
  return gc_content;
}
/*
 * Tool runner: load the encoded sequence named by argv[parsed_args],
 * optionally mirror it (arguments->mirror), and print the diagnostics
 * selected in <tool_arguments> to stdout:
 *  - arguments->bitpos defined: the two-bit encoding extracted at that
 *    position, its number of non-special units, the position and the value
 *    returned by the extraction;
 *  - arguments->stoppos defined: "<stoppos>: <value>", where the value is
 *    the result of gt_getnexttwobitencodingstoppos() for a reader placed at
 *    that position;
 *  - arguments->specialranges set: every special range as "<start>:<end>"
 *    (only if the sequence has special ranges).
 * The read mode is parsed from arguments->readmode.
 *
 * Returns 0 on success and -1 if loading or mirroring failed or a requested
 * position is not smaller than the total length (then <err> is set).
 */
static int gt_encseq_bitextract_runner(GT_UNUSED int argc,
                                       const char **argv,
                                       GT_UNUSED int parsed_args,
                                       void *tool_arguments,
                                       GT_UNUSED GtError *err)
{
  GtEncseqBitextractArguments *arguments = tool_arguments;
  GtEncseqLoader *el;
  GtEncseq *encseq;
  int had_err = 0;
  bool fwd, it1, GT_UNUSED it2;
  char buffer[BUFSIZ];
  GtEndofTwobitencoding etbe;
  GtEncseqReader *esr;
  GtSpecialrangeiterator *sri;
  GtRange srng;
  GtReadmode rm;

  gt_error_check(err);
  gt_assert(arguments);

  el = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(el, argv[parsed_args], err);
  if (!encseq)
  {
    had_err = -1;
  }
  if (!had_err && arguments->mirror)
  {
    had_err = gt_encseq_mirror(encseq, err);
  }
  if (!had_err)
  {
    rm = gt_readmode_parse(gt_str_get(arguments->readmode), NULL);
    fwd = !GT_ISDIRREVERSE(rm);
  }

  /* two-bit encoding at a given position */
  if (!had_err && arguments->bitpos != GT_UNDEF_ULONG)
  {
    if (arguments->bitpos >= gt_encseq_total_length(encseq))
    {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->bitpos, gt_encseq_total_length(encseq));
      had_err = -1;
    }
    else
    {
      unsigned long ret;

      esr = gt_encseq_create_reader_with_readmode(encseq, rm,
                                                  arguments->bitpos);
      ret = gt_encseq_extract2bitencwithtwobitencodingstoppos(
                                                         &etbe, esr, encseq,
                                                         rm,
                                                         arguments->bitpos);
      gt_bitsequence_tostring(buffer, etbe.tbe);
      printf("Twobitencoding      %s\n"
             "unitsnotspecial     %u\n"
             "position            %lu\n"
             "returnvalue         %lu\n",
             buffer,
             etbe.unitsnotspecial,
             arguments->bitpos,
             ret);
      gt_encseq_reader_delete(esr);
    }
  }

  /* check stoppos stuff */
  if (!had_err && arguments->stoppos != GT_UNDEF_ULONG)
  {
    if (arguments->stoppos >= gt_encseq_total_length(encseq))
    {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->stoppos, gt_encseq_total_length(encseq));
      had_err = -1;
    }
    else
    {
      /* the reader is created at position 0 and then moved to stoppos */
      esr = gt_encseq_create_reader_with_readmode(encseq, rm, 0);
      gt_encseq_reader_reinit_with_readmode(esr, encseq, rm,
                                            arguments->stoppos);
      printf("%lu: %lu\n", arguments->stoppos,
             gt_getnexttwobitencodingstoppos(fwd, esr));
      gt_encseq_reader_delete(esr);
    }
  }

  /* check specialrangeiterator stuff */
  if (!had_err && arguments->specialranges
        && gt_encseq_has_specialranges(encseq))
  {
    sri = gt_specialrangeiterator_new(encseq, fwd);
    for (it1 = gt_specialrangeiterator_next(sri, &srng);
         it1;
         it1 = gt_specialrangeiterator_next(sri, &srng))
    {
      printf("%lu:%lu\n", srng.start, srng.end);
    }
    gt_specialrangeiterator_delete(sri);
  }

  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(el);
  return had_err;
}