inline void PROBerReadModel::update(InMemAlignG* ag_in_mem, InMemAlign* aligns, AlignmentGroup& ag, double noise_frac) { SEQstring seq; QUALstring qual; CIGARstring cigar; const RefSeq* refseq = NULL; assert(ag.getSEQ(seq)); if (model_type & 1) assert(ag.getQUAL(qual)); // update noise prob npro->update(seq, noise_frac); // update alignment probs for (int i = 0; i < ag_in_mem->size; ++i) if (aligns[i].frac > 0.0) { refseq = refs->getRef(aligns[i].tid); assert(ag.getAlignment(i)->getCIGAR(cigar)); seqmodel->update(aligns[i].frac, '+', aligns[i].pos, refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL)); } if (model_type >= 2) { // paired-end reads assert(ag.getSEQ(seq, 2)); if (model_type & 1) assert(ag.getQUAL(qual, 2)); // update noise prob npro->update(seq, noise_frac); // update alignment probs for (int i = 0; i < ag_in_mem->size; ++i) if (aligns[i].frac > 0.0) { refseq = refs->getRef(aligns[i].tid); assert(ag.getAlignment(i)->getCIGAR(cigar, 2)); assert(aligns[i].fragment_length > 0); seqmodel->update(aligns[i].frac, '-', refseq->getLen() - aligns[i].pos - aligns[i].fragment_length, refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL)); } } }
// Recompute eel[1..M] (eel[0] is left at 0.0) from the distribution obtained
// through model.getGLD().copyTo(...).
//
// copyTo is expected to hand back freshly allocated pdf / cdf arrays together
// with lb, ub and span; both arrays are released with delete[] at the end.
// Values that come out below MINEEL are clamped to 0.0.
void calcExpectedEffectiveLengths(ModelType& model) {
  int lb, ub, span;
  double *pdf = NULL, *cdf = NULL;

  model.getGLD().copyTo(pdf, cdf, lb, ub, span);

  // Prefix sums: clen[k] = sum_{j=1}^{k} pdf[j] * (lb + j)
  double *clen = new double[span + 1];
  clen[0] = 0.0;
  for (int k = 1; k <= span; ++k)
    clen[k] = clen[k - 1] + pdf[k] * (lb + k);

  eel.clear();
  eel.resize(M + 1, 0.0);

  for (int tid = 1; tid <= M; ++tid) {
    int totLen = refs.getRef(tid).getTotLen();
    int fullLen = refs.getRef(tid).getFullLen();

    // Indices into cdf / clen, clipped to the range [0, ub - lb]
    int loIdx = max(min(totLen - fullLen + 1, ub) - lb, 0);
    int hiIdx = max(min(totLen, ub) - lb, 0);

    if (hiIdx != 0) {
      eel[tid] = fullLen * cdf[loIdx] +
                 ((cdf[hiIdx] - cdf[loIdx]) * (totLen + 1) - (clen[hiIdx] - clen[loIdx]));
      assert(eel[tid] >= 0);
      if (eel[tid] < MINEEL)
        eel[tid] = 0.0;
    } else {
      // index clips to 0 (min(totLen, ub) <= lb): expected effective length is zero
      eel[tid] = 0.0;
    }
  }

  delete[] pdf;
  delete[] cdf;
  delete[] clen;
}
inline void PROBerReadModel::setConProbs(InMemAlignG* ag_in_mem, InMemAlign* aligns, AlignmentGroup& ag) { int seqlen; SEQstring seq; // seq, qual and cigar must be in each function since we have multiple threads! QUALstring qual; CIGARstring cigar; const RefSeq* refseq = NULL; // Get read sequences and quality scores assert(ag.getSEQ(seq)); seqlen = read_length < 0 ? ag.getSeqLength() : read_length; if (model_type & 1) assert(ag.getQUAL(qual)); // set noise probability ag_in_mem->noise_conprb = mld1->getProb(seqlen) * npro->getProb(seq); // set alignment probabilities for (int i = 0; i < ag_in_mem->size; ++i) if (aligns[i].conprb != -1.0) { refseq = refs->getRef(aligns[i].tid); assert(ag.getAlignment(i)->getCIGAR(cigar)); aligns[i].conprb = (aligns[i].fragment_length > 0 ? mld1->getProb(seqlen, aligns[i].fragment_length) : mld1->getProb(seqlen)) * \ seqmodel->getProb('+', aligns[i].pos, refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL)); } if (model_type >= 2) { // paired-end reads assert(ag.getSEQ(seq, 2)); seqlen = read_length < 0 ? ag.getSeqLength(2) : read_length; if (model_type & 1) assert(ag.getQUAL(qual, 2)); ag_in_mem->noise_conprb *= mld2->getProb(seqlen) * npro->getProb(seq); for (int i = 0; i < ag_in_mem->size; ++i) if (aligns[i].conprb != -1.0) { refseq = refs->getRef(aligns[i].tid); assert(ag.getAlignment(i)->getCIGAR(cigar, 2)); assert(aligns[i].fragment_length > 0); aligns[i].conprb *= mld2->getProb(seqlen, aligns[i].fragment_length) * \ seqmodel->getProb('-', refseq->getLen() - aligns[i].pos - aligns[i].fragment_length, refseq, &cigar, &seq, ((model_type & 1) ? &qual : NULL)); } } }
inline void PROBerReadModel::simulate(READ_INT_TYPE rid, int tid, int pos, int fragment_length, std::ofstream* out1, std::ofstream* out2) { int m2pos; int mateL1, mateL2; std::string qual1, qual2, cigar1, cigar2, readseq1, readseq2; // Simulate reads if (tid == 0) { cigar1 = cigar2 = "*"; mateL1 = mld1->simulate(sampler); if (model_type & 1) qd->simulate(sampler, mateL1, qual1); npro->simulate(sampler, mateL1, readseq1); if (model_type >= 2) { mateL2 = mld2->simulate(sampler); if (model_type & 1) qd->simulate(sampler, mateL2, qual2); npro->simulate(sampler, mateL2, readseq2); } } else { const RefSeq* ref = refs->getRef(tid); mateL1 = mld1->simulate(sampler, fragment_length); if (model_type & 1) qd->simulate(sampler, mateL1, qual1); seqmodel->simulate(sampler, mateL1, '+', pos, ref, qual1, cigar1, readseq1); if (model_type >= 2) { mateL2 = mld2->simulate(sampler, fragment_length); if (model_type & 1) qd->simulate(sampler, mateL2, qual2); m2pos = ref->getLen() - pos - fragment_length; seqmodel->simulate(sampler, mateL2, '-', m2pos, ref, qual2, cigar2, readseq2); } } // Output reads (*out1)<< ((model_type & 1) ? '@' : '>') << rid<< '_'<< tid<< '_'<< pos<< '_'<< fragment_length<< '_'<< cigar1<< (model_type >= 2 ? "/1" : "") << std::endl; (*out1)<< readseq1<< std::endl; if (model_type & 1) (*out1)<< '+'<< std::endl<< qual1<< std::endl; if (model_type >= 2) { (*out2)<< ((model_type & 1) ? '@' : '>') << rid<< '_'<< tid<< '_'<< pos<< '_'<< fragment_length<< '_'<< cigar2<< "/2"<< std::endl; (*out2)<< readseq2<< std::endl; if (model_type & 1) (*out2)<< '+'<< std::endl<< qual2<< std::endl; } }
void writeToDisk(char* refName) { ofstream fout; sprintf(tiF, "%s.ti", refName); transcripts.writeTo(tiF); if (verbose) { printf("Transcript Information File is generated!\n"); } sprintf(refFastaF, "%s.transcripts.fa", refName); refs.writeTo(refFastaF); sprintf(transListF, "%s.translist", refName); refs.writeTransListTo(transListF); sprintf(chromListF, "%s.chrlist", refName); fout.open(chromListF); for (int i = 0; i < (int)chrvec.size(); ++i) fout<< chrvec[i].name<< '\t'<< chrvec[i].len<< endl; fout.close(); if (verbose) { printf("Chromosome List File is generated!\n"); } string cur_gene_id, cur_transcript_id, name; vector<int> gi, gt, ta; cur_gene_id = ""; gi.clear(); if (mappingType == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); } for (int i = 1; i <= M; ++i) { const Transcript& transcript = transcripts.getTranscriptAt(i); if (cur_gene_id != transcript.getGeneID()) { gi.push_back(i); if (mappingType == 2) gt.push_back((int)ta.size()); cur_gene_id = transcript.getGeneID(); } if ((mappingType == 2) && (cur_transcript_id != transcript.getTranscriptID())) { ta.push_back(i); cur_transcript_id = transcript.getTranscriptID(); } } gi.push_back(M + 1); if (mappingType == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); } sprintf(groupF, "%s.grp", refName); fout.open(groupF); for (int i = 0; i < (int)gi.size(); ++i) fout<< gi[i]<< endl; fout.close(); if (verbose) { printf("Group File is generated!\n"); } if (mappingType == 2) { sprintf(gtF, "%s.gt", refName); fout.open(gtF); for (int i = 0; i < (int)gt.size(); ++i) fout<< gt[i]<< endl; fout.close(); sprintf(taF, "%s.ta", refName); fout.open(taF); for (int i = 0; i < (int)ta.size(); ++i) fout<< ta[i]<< endl; fout.close(); if (verbose) { printf("Allele-specific group files are generated!\n"); } } if (n2g_idx) { sprintf(n2g_idxF, "%s.n2g.idx.fa", refName); fout.open(n2g_idxF); for (int i = 1; i <= M; ++i) fout<< '>'<< refs.getRef(i)->getName()<< endl<< n2g(refs.getRef(i)->getSeq())<< endl; fout.close(); if 
(verbose) printf("%s is generated!\n", n2g_idxF); } }