Beispiel #1
0
/**
 * Fetch the bases of the interval [*beg, *end), restricted to the single
 * reference sequence that contains position `mid`.
 *
 * @param bns  reference annotations (per-sequence offset/len, total l_pac)
 * @param pac  packed reference passed through to bns_get_seq
 * @param beg  in/out: interval start; on return, clamped to the start of the
 *             reference containing `mid`
 * @param mid  position that selects the reference; must satisfy
 *             *beg <= mid < *end once beg/end are ordered
 * @param end  in/out: interval end; on return, clamped to the end of the
 *             reference containing `mid`
 * @param rid  out: index into bns->anns of the reference containing `mid`
 * @return     the buffer returned by bns_get_seq for the clamped interval.
 *             NOTE(review): ownership follows bns_get_seq -- presumably the
 *             caller releases it; confirm against that function.
 *
 * If *end < *beg on entry the two values are exchanged before use.
 */
uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid)
{
	int64_t far_beg, far_end, len;
	int is_rev;
	uint8_t *seq;

	if (*end < *beg) { // interval was given reversed; put it in ascending order
		int64_t tmp = *beg;
		*beg = *end;
		*end = tmp;
	}
	assert(*beg <= mid && mid < *end);

	// Locate the reference that holds `mid`; is_rev is filled in by bns_depos.
	*rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev));
	far_beg = bns->anns[*rid].offset;
	far_end = far_beg + bns->anns[*rid].len;
	if (is_rev) { // flip to the reverse strand: mirror the bounds about 2*l_pac
		int64_t tmp = far_beg;
		far_beg = (bns->l_pac<<1) - far_end;
		far_end = (bns->l_pac<<1) - tmp;
	}

	// Never let the requested interval spill over into a neighbouring reference.
	*beg = *beg > far_beg? *beg : far_beg;
	*end = *end < far_end? *end : far_end;

	seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len);
	if (seq == 0 || *end - *beg != len) {
		// %p requires a void* argument, hence the explicit cast.
		fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n",
				__func__, (long)*beg, (long)mid, (long)*end, (long)len, (void*)seq, *rid, (long)far_beg, (long)far_end);
	}
	assert(seq && *end - *beg == len); // assertion failure should never happen
	return seq;
}
Beispiel #2
0
    /**
     * Populate transcripts_ from the BWA index exposed by salmonIndex_.
     *
     * First pass: one temporary Transcript per annotation record, built from
     * the record's id, name and length. Second pass: for each of those, append
     * a Transcript to transcripts_, decode its bases from the packed reference
     * with bns_get_seq, and attach
     *   - a heap-allocated, NUL-terminated copy of the decoded text (Sequence)
     *   - the result of salmon::stringtools::encodeSequenceInSAM (SAMSequence).
     *
     * Prints a message to stderr and calls std::exit(1) when the length
     * reported by bns_get_seq differs from the length stored in the index.
     */
    void loadTranscriptsFromFMD() {
        bwaidx_t* idx_ = salmonIndex_->bwaIndex();
        size_t numRecords = idx_->bns->n_seqs;
        std::vector<Transcript> transcripts_tmp;

        fmt::print(stderr, "Index contained {} targets\n", numRecords);
        for (auto i : boost::irange(size_t(0), numRecords)) {
            uint32_t id = i;
            char* name = idx_->bns->anns[i].name;
            uint32_t len = idx_->bns->anns[i].len;
            // Only id, name and length are recorded here; the sequence itself
            // is decoded in the second pass below.
            transcripts_tmp.emplace_back(id, name, len);
        }

        std::sort(transcripts_tmp.begin(), transcripts_tmp.end(),
                  [](const Transcript& t1, const Transcript& t2) -> bool {
                      return t1.id < t2.id;
                  });

        double alpha = 0.005;

        // Decoder for the byte codes returned by bns_get_seq:
        // 0..3 -> A, C, G, T; every other value -> N.
        char nucTab[256];
        nucTab[0] = 'A'; nucTab[1] = 'C'; nucTab[2] = 'G'; nucTab[3] = 'T';
        for (size_t i = 4; i < 256; ++i) { nucTab[i] = 'N'; }

        // Load the transcript sequence from the index
        for (auto& t : transcripts_tmp) {
            transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength, alpha);

            /* from BWA */
            int64_t compLen = 0;
            int64_t l_pac = idx_->bns->l_pac;
            int64_t tstart = idx_->bns->anns[t.id].offset;
            int64_t tend = tstart + t.RefLength;
            uint8_t* rseq = bns_get_seq(l_pac, idx_->pac, tstart, tend, &compLen);
            if (compLen != t.RefLength) {
                // t.RefLength is the length stored in the index; compLen is the
                // length bns_get_seq actually produced.
                fmt::print(stderr,
                           "For transcript {}, stored length ({}) != computed length ({}) --- index may be corrupt. exiting\n",
                           t.RefName, t.RefLength, compLen);
                std::exit(1);
            }

            // Positions stay ' ' if bns_get_seq returned no buffer.
            std::string seq(t.RefLength, ' ');
            if (rseq != nullptr) {
                for (int64_t i = 0; i < compLen; ++i) { seq[i] = nucTab[rseq[i]]; }
            }

            auto& txp = transcripts_.back();

            // Hand the transcript its own NUL-terminated copy of the sequence.
            char* seqCopy = new char[seq.length() + 1];
            std::strcpy(seqCopy, seq.c_str());
            txp.Sequence = seqCopy;
            // NOTE(review): with this flag false the Transcript presumably does
            // not release Sequence itself -- confirm who deletes seqCopy.
            txp.freeSeqOnDestruct = false;

            txp.SAMSequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength);

            // Length classes taken from
            // ======
            // Roberts, Adam, et al.
            // "Improving RNA-Seq expression estimates by correcting for fragment bias."
            // Genome Biol 12.3 (2011): R22.
            // ======
            // The class boundaries there are 1334, 2104, 2988 and 4389, but
            // every class is currently assigned index 0, so the length does
            // not affect the result.
            // perhaps, define these in a more data-driven way
            txp.lengthClassIndex(0);

            free(rseq);
            /* end BWA code */
        }

        // Since we have the de-coded reference sequences, we no longer need
        // the encoded sequences, so free them.
        /** TEST OPT **/
        // free(idx_->pac); idx_->pac = nullptr;
        /** END TEST OPT **/
        transcripts_tmp.clear();
        // ====== Done loading the transcripts from file
    }