uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) { int64_t far_beg, far_end, len; int is_rev; uint8_t *seq; if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap assert(*beg <= mid && mid < *end); *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); far_beg = bns->anns[*rid].offset; far_end = far_beg + bns->anns[*rid].len; if (is_rev) { // flip to the reverse strand int64_t tmp = far_beg; far_beg = (bns->l_pac<<1) - far_end; far_end = (bns->l_pac<<1) - tmp; } *beg = *beg > far_beg? *beg : far_beg; *end = *end < far_end? *end : far_end; seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); if (seq == 0 || *end - *beg != len) { fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end); } assert(seq && *end - *beg == len); // assertion failure should never happen return seq; }
void loadTranscriptsFromFMD() { bwaidx_t* idx_ = salmonIndex_->bwaIndex(); size_t numRecords = idx_->bns->n_seqs; std::vector<Transcript> transcripts_tmp; //transcripts_tmp.reserve(numRecords); //transcripts_.reserve(numRecords); fmt::print(stderr, "Index contained {} targets\n", numRecords); //transcripts_.resize(numRecords); for (auto i : boost::irange(size_t(0), numRecords)) { uint32_t id = i; char* name = idx_->bns->anns[i].name; uint32_t len = idx_->bns->anns[i].len; // copy over the length, then we're done. transcripts_tmp.emplace_back(id, name, len); } std::sort(transcripts_tmp.begin(), transcripts_tmp.end(), [](const Transcript& t1, const Transcript& t2) -> bool { return t1.id < t2.id; }); double alpha = 0.005; char nucTab[256]; nucTab[0] = 'A'; nucTab[1] = 'C'; nucTab[2] = 'G'; nucTab[3] = 'T'; for (size_t i = 4; i < 256; ++i) { nucTab[i] = 'N'; } size_t tnum = 0; // Load the transcript sequence from file for (auto& t : transcripts_tmp) { transcripts_.emplace_back(t.id, t.RefName.c_str(), t.RefLength, alpha); /* from BWA */ uint8_t* rseq = nullptr; int64_t tstart, tend, compLen, l_pac = idx_->bns->l_pac; tstart = idx_->bns->anns[t.id].offset; tend = tstart + t.RefLength; rseq = bns_get_seq(l_pac, idx_->pac, tstart, tend, &compLen); if (compLen != t.RefLength) { fmt::print(stderr, "For transcript {}, stored length ({}) != computed length ({}) --- index may be corrupt. exiting\n", t.RefName, compLen, t.RefLength); std::exit(1); } std::string seq(t.RefLength, ' '); if (rseq != 0) { for (int64_t i = 0; i < compLen; ++i) { seq[i] = nucTab[rseq[i]]; } } auto& txp = transcripts_.back(); // allocate space for the new copy char* seqCopy = new char[seq.length()+1]; std::strcpy(seqCopy, seq.c_str()); txp.Sequence = seqCopy; txp.freeSeqOnDestruct = false; txp.SAMSequence = salmon::stringtools::encodeSequenceInSAM(seq.c_str(), t.RefLength); // Length classes taken from // ====== // Roberts, Adam, et al. // "Improving RNA-Seq expression estimates by correcting for fragment bias." // Genome Biol 12.3 (2011): R22. // ====== // perhaps, define these in a more data-driven way if (t.RefLength <= 1334) { txp.lengthClassIndex(0); } else if (t.RefLength <= 2104) { txp.lengthClassIndex(0); } else if (t.RefLength <= 2988) { txp.lengthClassIndex(0); } else if (t.RefLength <= 4389) { txp.lengthClassIndex(0); } else { txp.lengthClassIndex(0); } /* std::cerr << "TS = " << t.RefName << " : \n"; std::cerr << seq << "\n VS \n"; for (size_t i = 0; i < t.RefLength; ++i) { std::cerr << transcripts_.back().charBaseAt(i); } std::cerr << "\n\n"; */ free(rseq); /* end BWA code */ ++tnum; } // Since we have the de-coded reference sequences, we no longer need // the encoded sequences, so free them. /** TEST OPT **/ // free(idx_->pac); idx_->pac = nullptr; /** END TEST OPT **/ transcripts_tmp.clear(); // ====== Done loading the transcripts from file }