/*
 * Allocate and initialise an Enumcodeatposition for <encseq>.
 *
 * The multimappower and filltable members are built from <numofchars> and
 * <prefixlength>. The moveforward flag is set exactly when <readmode> has the
 * reverse direction; in that case previousrange starts as the empty range at
 * position 0, otherwise as the empty range at the total length of <encseq>.
 * A special range iterator is only created if <encseq> contains special
 * ranges; otherwise the sri member is NULL.
 *
 * The returned object is allocated with gt_malloc().
 */
Enumcodeatposition *gt_Enumcodeatposition_new(const GtEncseq *encseq,
                                              GtReadmode readmode,
                                              unsigned int prefixlength,
                                              unsigned int numofchars)
{
  Enumcodeatposition *enumerator = gt_malloc(sizeof *enumerator);

  enumerator->encseq = encseq;
  enumerator->readmode = readmode;
  enumerator->multimappower = gt_initmultimappower(numofchars, prefixlength);
  enumerator->filltable = gt_initfilltable(numofchars, prefixlength);
  enumerator->prefixlength = prefixlength;
  enumerator->totallength = gt_encseq_total_length(encseq);
  enumerator->exhausted = false;

  if (GT_ISDIRREVERSE(readmode))
  {
    enumerator->moveforward = true;
    enumerator->previousrange.end = 0;
    enumerator->previousrange.start = enumerator->previousrange.end;
  } else
  {
    enumerator->moveforward = false;
    enumerator->previousrange.end = enumerator->totallength;
    enumerator->previousrange.start = enumerator->previousrange.end;
  }

  /* the iterator runs in the same direction as recorded in moveforward */
  enumerator->sri = gt_encseq_has_specialranges(encseq)
                      ? gt_specialrangeiterator_new(encseq,
                                                    enumerator->moveforward)
                      : NULL;
  return enumerator;
}
/*
 * Build a table with one Rankedbounds entry per special range of <encseq>.
 *
 * The special ranges are enumerated forward unless <readmode> has the reverse
 * direction. For each range the entry stores its start (lowerbound), its end
 * (upperbound) and its rank, i.e. the accumulated width of all ranges
 * enumerated before it.
 *
 * Returns a table of gt_encseq_realspecialranges(encseq) entries allocated
 * with gt_malloc(), or NULL if <encseq> has no special ranges.
 */
Rankedbounds *gt_fillrankbounds(const GtEncseq *encseq, GtReadmode readmode)
{
  GtSpecialrangeiterator *sri;
  GtRange range;
  GtUword currentrank = 0, realspecialranges;
  Rankedbounds *rankedbounds, *rbptr;

  if (!gt_encseq_has_specialranges(encseq))
  {
    return NULL;
  }
  realspecialranges = gt_encseq_realspecialranges(encseq);
  rankedbounds = gt_malloc(sizeof (*rankedbounds) * realspecialranges);
  sri = gt_specialrangeiterator_new(encseq,
                                    GT_ISDIRREVERSE(readmode) ? false : true);
  for (rbptr = rankedbounds; gt_specialrangeiterator_next(sri,&range); rbptr++)
  {
    /* check the slot before writing to it, so that a mismatch between the
       iterator and realspecialranges is caught before the table overruns */
    gt_assert(rbptr < rankedbounds + realspecialranges);
    rbptr->lowerbound = range.start;
    rbptr->upperbound = range.end;
    rbptr->rank = currentrank;
    currentrank += rbptr->upperbound - rbptr->lowerbound;
  }
  gt_assert(rbptr == rankedbounds + realspecialranges);
  gt_specialrangeiterator_delete(sri);
  return rankedbounds;
}
/*
 * Create a k-mer code iterator over <encseq> which starts at <startpos> and
 * reads in <readmode>. A reverse <readmode> requires <startpos> to be 0, and
 * <startpos> must be smaller than the total length of <encseq> (both are
 * asserted).
 *
 * If fewer than <kmersize> positions remain from <startpos>, the iterator is
 * returned with inputexhausted set and with esr and spwp both NULL.
 * NOTE(review): in that case readmode, hasprocessedfirst and currentposition
 * are left unset by this function - confirm no caller reads them.
 *
 * Otherwise a reader and a k-mer stream are created and the first <kmersize>
 * encoded characters are pushed into the window of the stream.
 */
/*@notnull@*/ GtKmercodeiterator *gt_kmercodeiterator_encseq_new(
                                            const GtEncseq *encseq,
                                            GtReadmode readmode,
                                            unsigned int kmersize,
                                            unsigned long startpos)
{
  GtKmercodeiterator *iter;
  unsigned int alphasize;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  iter = gt_malloc(sizeof (*iter));
  iter->totallength = gt_encseq_total_length(encseq);
  iter->startpos = startpos;
  gt_assert(startpos < iter->totallength);

  /* members which are set identically in both cases */
  iter->fb = NULL;
  iter->encseq = encseq;

  if (iter->totallength - startpos < (unsigned long) kmersize)
  {
    /* not enough characters left for a single k-mer */
    iter->inputexhausted = true;
    iter->esr = NULL;
    iter->spwp = NULL;
    return iter;
  }

  iter->inputexhausted = false;
  iter->readmode = readmode;
  iter->esr = gt_encseq_create_reader_with_readmode(encseq, readmode,
                                                    startpos);
  alphasize = gt_alphabet_num_of_chars(gt_encseq_alphabet(encseq));
  iter->spwp = kmerstream_new(alphasize, kmersize);
  iter->hasprocessedfirst = false;

  /* fill the window with the first kmersize characters */
  iter->currentposition = startpos;
  while (iter->currentposition < startpos + (unsigned long) kmersize)
  {
    const GtUchar charcode = gt_encseq_reader_next_encoded_char(iter->esr);

    iter->spwp->windowwidth++;
    updatespecialpositions(iter->spwp, charcode, false, 0);
    iter->spwp->cyclicwindow[iter->spwp->windowwidth - 1] = charcode;
    iter->currentposition++;
  }
  return iter;
}
/*
 * Re-position <kmercodeiterator> at <startpos> of its encoded sequence and
 * switch it to <readmode>. A reverse <readmode> requires <startpos> to be 0,
 * and <startpos> must be smaller than the total length of the sequence (both
 * are asserted).
 *
 * The k-mer size is taken from the k-mer stream (spwp) of the iterator, so
 * the iterator must still own such a stream. This is asserted up front:
 * when fewer than kmersize positions remain from <startpos>, this function
 * deletes the reader and the stream and sets both members to NULL, which
 * means an iterator that has been reset into the exhausted state cannot be
 * reset again.
 *
 * In the non-exhausted case the reader is re-initialised, the stream is
 * reset and the first kmersize encoded characters are pushed into the window.
 */
void gt_kmercodeiterator_reset(GtKmercodeiterator *kmercodeiterator,
                               GtReadmode readmode,
                               GtUword startpos)
{
  GtUchar charcode;
  const GtEncseq *encseq;
  GtUword kmersize;

  /* fail with a diagnosable assertion instead of dereferencing NULL below */
  gt_assert(kmercodeiterator != NULL);
  gt_assert(kmercodeiterator->spwp != NULL);
  encseq = kmercodeiterator->encseq;
  kmersize = (GtUword) kmercodeiterator->spwp->kmersize;

  gt_assert(!GT_ISDIRREVERSE(readmode) || startpos == 0);
  kmercodeiterator->totallength = gt_encseq_total_length(encseq);
  kmercodeiterator->startpos = startpos;
  gt_assert(startpos < kmercodeiterator->totallength);
  kmercodeiterator->fb = NULL;
  if (kmercodeiterator->totallength - startpos < kmersize)
  {
    /* not enough characters left for a single k-mer */
    kmercodeiterator->inputexhausted = true;
    gt_encseq_reader_delete(kmercodeiterator->esr);
    kmercodeiterator->esr = NULL;
    kmerstream_delete(kmercodeiterator->spwp);
    kmercodeiterator->spwp = NULL;
  } else
  {
    kmercodeiterator->inputexhausted = false;
    kmercodeiterator->readmode = readmode;
    gt_encseq_reader_reinit_with_readmode(kmercodeiterator->esr,
                                          encseq,
                                          readmode,
                                          startpos);
    kmerstream_reset(kmercodeiterator->spwp);
    kmercodeiterator->hasprocessedfirst = false;
    /* fill the window with the first kmersize characters */
    for (kmercodeiterator->currentposition = startpos;
         kmercodeiterator->currentposition < startpos + kmersize;
         kmercodeiterator->currentposition++)
    {
      charcode = gt_encseq_reader_next_encoded_char(kmercodeiterator->esr);
      kmercodeiterator->spwp->windowwidth++;
      kmerstream_updatespecialpositions(kmercodeiterator->spwp,charcode,
                                        false,0);
      kmercodeiterator->spwp->cyclicwindow[kmercodeiterator->
                                           spwp->windowwidth-1] = charcode;
    }
  }
}
/*
 * Build a list with one Specialrank entry per special range of <encseq>.
 *
 * The special ranges are enumerated forward unless <readmode> has the reverse
 * direction. For each range the entry stores as specialrank the accumulated
 * width of all ranges seen so far (including the current one) minus 1, and
 * as key the value of <inversesuftab> at the end position of the range.
 * The list is finally sorted with qsort() using compareSpecialrank.
 *
 * Returns a list of gt_encseq_realspecialranges(encseq) entries allocated
 * with gt_malloc(), or NULL if <encseq> has no special ranges.
 */
Specialrank *gt_fillspecialranklist(const GtEncseq *encseq,
                                    GtReadmode readmode,
                                    const GtUword *inversesuftab)
{
  GtSpecialrangeiterator *iter;
  GtRange special;
  GtUword numofranges, widthsum = 0;
  GT_UNUSED GtUword seqlength; /* only referenced in an assertion */
  Specialrank *list, *slot;

  if (!gt_encseq_has_specialranges(encseq))
  {
    return NULL;
  }
  seqlength = gt_encseq_total_length(encseq);
  numofranges = gt_encseq_realspecialranges(encseq);
  list = gt_malloc(sizeof (Specialrank) * numofranges);
  iter = gt_specialrangeiterator_new(encseq,
                                     GT_ISDIRREVERSE(readmode) ? false : true);
  for (slot = list; gt_specialrangeiterator_next(iter, &special); slot++)
  {
    gt_assert(slot < list + numofranges);
    gt_assert(special.end<=seqlength);
    widthsum += special.end - special.start;
    slot->specialrank = widthsum - 1;
    slot->key = inversesuftab[special.end];
  }
  gt_assert(slot == list + numofranges);
  gt_specialrangeiterator_delete(iter);
  qsort(list, (size_t) numofranges, sizeof (Specialrank), compareSpecialrank);
  return list;
}
/*
 * Decode <encseq> to stdout according to <args>.
 *
 * If args->mode is "fasta", the selected sequences are written as FASTA
 * entries: the single sequence args->seq if it is defined, otherwise the
 * inclusive sequence range args->seqrng if both of its bounds are defined,
 * otherwise all sequences. If <encseq> has no description support, a warning
 * mentioning <filename> is issued and "sequence <i>" is used as description.
 * For a reverse readmode, output sequence i is taken from sequence
 * (number of sequences - 1 - i) of <encseq>.
 *
 * If args->mode is "concat", the positions args->rng (inclusive) or, if that
 * range is not fully defined, all positions of <encseq> are written as one
 * line; separator characters are replaced by the first character of
 * args->sepchar.
 *
 * With args->singlechars every character is fetched by position, otherwise
 * a sequential reader is used.
 *
 * Returns 0 on success. Returns -1 and sets <err> if a requested sequence
 * number or the end of the requested range is out of bounds.
 */
static int output_sequence(GtEncseq *encseq, GtEncseqDecodeArguments *args,
                           const char *filename, GtError *err)
{
  GtUword i, j;
  int had_err = 0;
  bool has_desc;
  const char *mode;
  GtEncseqReader *esr;
  gt_assert(encseq);

  has_desc = gt_encseq_has_description_support(encseq);
  if (!has_desc)
    gt_warning("Missing description support for file %s", filename);
  mode = gt_str_get(args->mode);

  if (strcmp(mode, "fasta") == 0) {
    const GtUword numofseqs = gt_encseq_num_of_sequences(encseq);
    const bool reversed = GT_ISDIRREVERSE(args->rm) ? true : false;
    GtUword sfrom, sto;

    if (args->seq != GT_UNDEF_UWORD) {
      /* specify a single sequence to extract */
      if (args->seq >= numofseqs) {
        gt_error_set(err,
                     "requested sequence "GT_WU" exceeds number of sequences "
                     "("GT_WU")", args->seq, numofseqs);
        return -1;
      }
      sfrom = args->seq;
      sto = args->seq + 1;
    } else if (args->seqrng.start != GT_UNDEF_UWORD
                 && args->seqrng.end != GT_UNDEF_UWORD) {
      /* specify a sequence range to extract */
      if (args->seqrng.start >= numofseqs || args->seqrng.end >= numofseqs) {
        gt_error_set(err,
                     "range "GT_WU"-"GT_WU" includes a sequence number "
                     "exceeding the total number of sequences ("GT_WU")",
                     args->seqrng.start, args->seqrng.end, numofseqs);
        return -1;
      }
      sfrom = args->seqrng.start;
      sto = args->seqrng.end + 1;
    } else {
      /* extract all sequences */
      sfrom = 0;
      sto = numofseqs;
    }

    for (i = sfrom; i < sto; i++) {
      GtUword desclen, startpos, len, seqnum;
      char buf[BUFSIZ];
      const char *desc = NULL;

      /* XXX: maybe make this distinction in the functions via readmode? */
      if (!reversed) {
        seqnum = i;
        startpos = gt_encseq_seqstartpos(encseq, seqnum);
        len = gt_encseq_seqlength(encseq, seqnum);
      } else {
        /* output sequence i is sequence seqnum of the stored order; its
           start is counted from the end of the encoded sequence */
        seqnum = numofseqs - 1 - i;
        len = gt_encseq_seqlength(encseq, seqnum);
        startpos = gt_encseq_total_length(encseq)
                     - (gt_encseq_seqstartpos(encseq, seqnum) + len);
      }
      if (has_desc) {
        desc = gt_encseq_description(encseq, &desclen, seqnum);
      } else {
        /* the fallback description uses the output index i */
        (void) snprintf(buf, BUFSIZ, "sequence "GT_WU"", i);
        desclen = strlen(buf);
        desc = buf;
      }
      gt_assert(desc);

      /* output description */
      gt_xfputc(GT_FASTA_SEPARATOR, stdout);
      gt_xfwrite(desc, 1, desclen, stdout);
      gt_xfputc('\n', stdout);

      /* XXX: make this more efficient by writing in a buffer first and then
         showing the result */
      if (args->singlechars) {
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_get_decoded_char(encseq, startpos + j,
                                               args->rm),
                    stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm,
                                                    startpos);
        for (j = 0; j < len; j++) {
          gt_xfputc(gt_encseq_reader_next_decoded_char(esr), stdout);
        }
        gt_encseq_reader_delete(esr);
      }
      gt_xfputc('\n', stdout);
    }
  }

  if (strcmp(mode, "concat") == 0) {
    GtUword from = 0,
            to = gt_encseq_total_length(encseq) - 1;

    if (args->rng.start != GT_UNDEF_UWORD && args->rng.end != GT_UNDEF_UWORD) {
      if (args->rng.end > to) {
        had_err = -1;
        gt_error_set(err,
                     "end of range ("GT_WU") exceeds encoded sequence length "
                     "("GT_WU")", args->rng.end, to);
      }
      if (!had_err) {
        from = args->rng.start;
        to = args->rng.end;
      }
    }
    if (!had_err) {
      if (args->singlechars) {
        for (j = from; j <= to; j++) {
          char cc = gt_encseq_get_decoded_char(encseq, j, args->rm);
          if (cc == (char) SEPARATOR)
            cc = gt_str_get(args->sepchar)[0];
          gt_xfputc(cc, stdout);
        }
      } else {
        esr = gt_encseq_create_reader_with_readmode(encseq, args->rm, from);
        if (esr) {
          for (j = from; j <= to; j++) {
            char cc = gt_encseq_reader_next_decoded_char(esr);
            if (cc == (char) SEPARATOR)
              cc = gt_str_get(args->sepchar)[0];
            gt_xfputc(cc, stdout);
          }
          gt_encseq_reader_delete(esr);
        }
      }
      gt_xfputc('\n', stdout);
    }
  }
  return had_err;
}
/*
 * Tool runner: load the encoded sequence named by argv[parsed_args],
 * optionally mirror it (arguments->mirror), and print diagnostic
 * information to stdout depending on the given arguments:
 *
 *  - arguments->bitpos defined: the two-bit encoding extracted at that
 *    position, the number of non-special units, the position and the return
 *    value of the extraction function;
 *  - arguments->stoppos defined: the next two-bit encoding stop position
 *    for a reader re-initialised at that position;
 *  - arguments->specialranges set: all special ranges as "start:end" lines.
 *
 * The readmode is parsed from arguments->readmode; special ranges and stop
 * positions are processed forward unless the readmode has the reverse
 * direction.
 *
 * Returns 0 on success and -1 if loading or mirroring failed or if a given
 * position is not smaller than the total length (then <err> is set).
 * The encoded sequence and the loader are deleted in any case.
 */
static int gt_encseq_bitextract_runner(GT_UNUSED int argc, const char **argv,
                                       GT_UNUSED int parsed_args,
                                       void *tool_arguments,
                                       GT_UNUSED GtError *err)
{
  GtEncseqBitextractArguments *arguments = tool_arguments;
  GtEncseqLoader *loader;
  GtEncseq *encseq;
  GtEncseqReader *reader;
  GtReadmode rm;
  bool fwd;
  int had_err = 0;

  gt_error_check(err);
  gt_assert(arguments);

  loader = gt_encseq_loader_new();
  encseq = gt_encseq_loader_load(loader, argv[parsed_args], err);
  if (!encseq)
    had_err = -1;

  if (!had_err && arguments->mirror) {
    had_err = gt_encseq_mirror(encseq, err);
  }

  if (!had_err) {
    rm = gt_readmode_parse(gt_str_get(arguments->readmode), NULL);
    if (GT_ISDIRREVERSE(rm))
      fwd = false;
    else
      fwd = true;
  }

  /* extraction of a two-bit encoding at a given position */
  if (!had_err && arguments->bitpos != GT_UNDEF_ULONG) {
    if (arguments->bitpos >= gt_encseq_total_length(encseq)) {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->bitpos, gt_encseq_total_length(encseq));
      had_err = -1;
    } else {
      char buffer[BUFSIZ];
      GtEndofTwobitencoding etbe;
      unsigned long ret;

      reader = gt_encseq_create_reader_with_readmode(encseq, rm,
                                                     arguments->bitpos);
      ret = gt_encseq_extract2bitencwithtwobitencodingstoppos(&etbe, reader,
                                                            encseq, rm,
                                                            arguments->bitpos);
      gt_bitsequence_tostring(buffer, etbe.tbe);
      printf("Twobitencoding      %s\n"
             "unitsnotspecial     %u\n"
             "position            %lu\n"
             "returnvalue         %lu\n",
             buffer,
             etbe.unitsnotspecial,
             arguments->bitpos,
             ret);
      gt_encseq_reader_delete(reader);
    }
  }

  /* check stoppos stuff */
  if (!had_err && arguments->stoppos != GT_UNDEF_ULONG) {
    if (arguments->stoppos >= gt_encseq_total_length(encseq)) {
      gt_error_set(err, "position %lu exceeds encoded sequence length of %lu",
                   arguments->stoppos, gt_encseq_total_length(encseq));
      had_err = -1;
    } else {
      /* the reader is created at position 0 and then re-initialised at the
         requested position */
      reader = gt_encseq_create_reader_with_readmode(encseq, rm, 0);
      gt_encseq_reader_reinit_with_readmode(reader, encseq, rm,
                                            arguments->stoppos);
      printf("%lu: %lu\n", arguments->stoppos,
                           gt_getnexttwobitencodingstoppos(fwd, reader));
      gt_encseq_reader_delete(reader);
    }
  }

  /* check specialrangeiterator stuff */
  if (!had_err && arguments->specialranges
        && gt_encseq_has_specialranges(encseq)) {
    GtRange srng;
    GtSpecialrangeiterator *sri = gt_specialrangeiterator_new(encseq, fwd);

    while (gt_specialrangeiterator_next(sri, &srng)) {
      printf("%lu:%lu\n", srng.start, srng.end);
    }
    gt_specialrangeiterator_delete(sri);
  }

  gt_encseq_delete(encseq);
  gt_encseq_loader_delete(loader);
  return had_err;
}
/*
 * Derive the order of the suffixes in <suffixsortspace> from the buckets
 * described by <bucketspec2>.
 *
 * The characters are processed in the order given by bucketspec2->order.
 * For each such character (source):
 *  - every subbucket (source,second) which is not yet marked as sorted and
 *    for which source != second is accounted as "hard work" (logged via
 *    <logger>) and marked as sorted;
 *  - forwardderive() is called if the first subbucket of source starts
 *    before subbucket (source,source), with the start indexes of all
 *    subbuckets (idx,source) as target offsets;
 *  - backwardderive() is called if subbucket (source,source) ends before
 *    the index getendidx(source,numofchars), with the last indexes of all
 *    subbuckets (idx,source) as target offsets;
 *  - finally all subbuckets (idx,source) and the superbucket of source are
 *    marked as sorted.
 *
 * At the end the accumulated hard work and its ratio to the total length of
 * the encoded sequence are logged.
 */
void gt_copysort_derivesorting(const GtBucketspec2 *bucketspec2,
                               GtSuffixsortspace *suffixsortspace,
                               GtLogger *logger)
{
  GtUword hardworktotal = 0, *offsets;
  unsigned int fill, orderidx, first, snd;

#ifdef WITHSUFFIXES
  {
    GtUword idx;
    for (idx = 0; idx < bucketspec2->partwidth; idx++)
    {
      gt_encseq_showatstartpos(
                            stdout,
                            GT_ISDIRREVERSE(readmode) ? false : true,
                            GT_ISDIRCOMPLEMENT(readmode) ? true : false,
                            encseq,
                            gt_suffixsortspace_getdirect(suffixsortspace,idx));
    }
  }
#endif
  offsets = gt_malloc(sizeof (*offsets) * bucketspec2->numofchars);
  for (orderidx = 0; orderidx < bucketspec2->numofchars; orderidx++)
  {
    first = bucketspec2->order[orderidx];

    /* account for the subbuckets of first which are not yet sorted */
    for (snd = 0; snd < bucketspec2->numofchars; snd++)
    {
      if (bucketspec2->subbuckettab[first][snd].sorted || first == snd)
      {
        gt_assert(!bucketspec2->subbuckettab[first][snd].hardworktodo);
      } else
      {
        gt_assert(bucketspec2->subbuckettab[first][snd].hardworktodo);
        gt_logger_log(logger,"hard work for %u %u",first,snd);
        hardworktotal += getendidx(bucketspec2,first,snd) -
                         getstartidx(bucketspec2,first,snd);
        bucketspec2->subbuckettab[first][snd].sorted = true;
      }
    }

    if (getstartidx(bucketspec2,first,0) <
        getstartidx(bucketspec2,first,first))
    {
      for (fill = 0; fill < bucketspec2->numofchars; fill++)
      {
        offsets[fill] = getstartidx(bucketspec2,fill,first);
      }
      forwardderive(bucketspec2,
                    suffixsortspace,
                    offsets,
                    first,
                    getstartidx(bucketspec2,first,0));
    }

    if (getendidx(bucketspec2,first,first) <
        getendidx(bucketspec2,first,bucketspec2->numofchars))
    {
      for (fill = 0; fill < bucketspec2->numofchars; fill++)
      {
        /* no assertion that getendidx(fill,first) > 0 is needed here, as
           the value stored in offsets is incremented later */
        offsets[fill] = getendidx(bucketspec2,fill,first) - 1;
      }
      gt_assert(getendidx(bucketspec2,first,bucketspec2->numofchars) > 0);
      backwardderive(bucketspec2,
                     suffixsortspace,
                     offsets,
                     first,
                     getendidx(bucketspec2,first,bucketspec2->numofchars) - 1);
    }

    /* everything ending with first is now in order */
    for (fill = 0; fill < bucketspec2->numofchars; fill++)
    {
      bucketspec2->subbuckettab[fill][first].sorted = true;
    }
    bucketspec2->superbuckettab[first].sorted = true;
  }
  gt_free(offsets);
  gt_logger_log(logger,"hardwork = "GT_WU" (%.2f)",
                hardworktotal,
                (double) hardworktotal/
                         gt_encseq_total_length(bucketspec2->encseq));
}
/*
 * Count, for each of the <numofchars> characters, how often it occurs as
 * the neighbour of a special range of <encseq> (or of the sequence
 * boundary) that is inspected for <readmode>.
 *
 * For a non-reverse <readmode> the character directly before each special
 * range (position range.start-1, if range.start > 0) is read in <readmode>;
 * additionally, if <encseq> has no special suffix, the last character of
 * the sequence is counted.
 *
 * For a reverse <readmode> the character directly after each special range
 * (position range.end, if smaller than the total length) is read in a
 * converted readmode (forward for GT_READMODE_REVERSE, complement
 * otherwise); additionally, if <encseq> has no special prefix, the first
 * character of the sequence is counted.
 *
 * Only non-special characters are counted. Returns a table of <numofchars>
 * counters allocated with gt_malloc().
 */
static GtUword *leftcontextofspecialchardist(unsigned int numofchars,
                                             const GtEncseq *encseq,
                                             GtReadmode readmode)
{
  GtUchar cc;
  unsigned int charidx;
  GtUword *counts, totallength = gt_encseq_total_length(encseq);
  GtReadmode convertedreadmode = (readmode == GT_READMODE_REVERSE)
                                      ? GT_READMODE_FORWARD
                                      : GT_READMODE_COMPL;
  const bool reversed = GT_ISDIRREVERSE(readmode) ? true : false;

  counts = gt_malloc(sizeof (*counts) * numofchars);
  for (charidx = 0; charidx < numofchars; charidx++)
  {
    counts[charidx] = 0;
  }

  if (gt_encseq_has_specialranges(encseq))
  {
    GtRange range;
    /* the special ranges are always enumerated forward here */
    GtSpecialrangeiterator *sri = gt_specialrangeiterator_new(encseq,true);

    while (gt_specialrangeiterator_next(sri,&range))
    {
      if (reversed)
      {
        if (range.end < totallength)
        {
          cc = gt_encseq_get_encoded_char(encseq,range.end,convertedreadmode);
          if (ISNOTSPECIAL(cc))
          {
            counts[cc]++;
          }
        }
      } else
      {
        if (range.start > 0)
        {
          cc = gt_encseq_get_encoded_char(encseq,range.start-1,readmode);
          if (ISNOTSPECIAL(cc))
          {
            counts[cc]++;
          }
        }
      }
    }
    gt_specialrangeiterator_delete(sri);
  }

  /* the boundary of the sequence, unless it is covered by a special range */
  if (reversed)
  {
    if (gt_encseq_lengthofspecialprefix(encseq) == 0)
    {
      cc = gt_encseq_get_encoded_char(encseq,0,convertedreadmode);
      gt_assert(ISNOTSPECIAL(cc));
      counts[cc]++;
    }
  } else
  {
    if (gt_encseq_lengthofspecialsuffix(encseq) == 0)
    {
      cc = gt_encseq_get_encoded_char(encseq,totallength-1,readmode);
      gt_assert(ISNOTSPECIAL(cc));
      counts[cc]++;
    }
  }
  return counts;
}