/**
 * Add one read (or read pair) to the model's accumulated statistics.
 *
 * @param ag_in_mem   in-memory alignment group; only its `size` is read here
 * @param aligns      per-alignment records (frac, tid, pos, fragment_length),
 *                    indexed the same way as ag's alignments
 * @param ag          alignment group supplying the read sequence(s), quality
 *                    string(s) and CIGAR strings
 * @param noise_frac  weight forwarded to the noise profile update
 *
 * Quality strings are fetched and used only when bit 0 of model_type is set;
 * the second mate is processed only when model_type >= 2 (paired-end reads).
 * Alignments whose frac is not positive are skipped.
 */
inline void PROBerReadModel::update(InMemAlignG* ag_in_mem, InMemAlign* aligns, AlignmentGroup& ag, double noise_frac) {
	SEQstring seq;
	QUALstring qual;
	CIGARstring cigar;
	const bool has_qual = (model_type & 1) != 0;

	// The fetch calls below fill seq/qual/cigar, so they must execute even
	// when NDEBUG compiles assert() away; only the result check is asserted.
	bool ok = ag.getSEQ(seq);
	assert(ok);
	if (has_qual) {
		ok = ag.getQUAL(qual);
		assert(ok);
	}

	// update noise prob
	npro->update(seq, noise_frac);

	// update alignment probs
	for (int i = 0; i < ag_in_mem->size; ++i) {
		if (!(aligns[i].frac > 0.0)) continue;

		const RefSeq* refseq = refs->getRef(aligns[i].tid);
		ok = ag.getAlignment(i)->getCIGAR(cigar);
		assert(ok);
		seqmodel->update(aligns[i].frac, '+', aligns[i].pos, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
	}

	if (model_type >= 2) {
		// paired-end reads: repeat the procedure for mate 2
		ok = ag.getSEQ(seq, 2);
		assert(ok);
		if (has_qual) {
			ok = ag.getQUAL(qual, 2);
			assert(ok);
		}

		// update noise prob
		npro->update(seq, noise_frac);

		// update alignment probs
		for (int i = 0; i < ag_in_mem->size; ++i) {
			if (!(aligns[i].frac > 0.0)) continue;

			const RefSeq* refseq = refs->getRef(aligns[i].tid);
			ok = ag.getAlignment(i)->getCIGAR(cigar, 2);
			assert(ok);
			assert(aligns[i].fragment_length > 0);
			// mate 2 is scored on the '-' strand, so its position is measured from the other end of the reference
			seqmodel->update(aligns[i].frac, '-', refseq->getLen() - aligns[i].pos - aligns[i].fragment_length, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
		}
	}

	(void)ok; // silence "set but unused" when assertions are disabled
}
Exemple #2
0
/**
 * Recompute the expected effective length eel[i] of every reference i in 1..M
 * from the model's fragment length distribution.
 *
 * The distribution is obtained via model.getGLD().copyTo(), which hands back
 * pdf/cdf arrays together with the bounds lb, ub and span = number of entries
 * above index 0. The arrays are released with delete[] before returning.
 * Values below MINEEL are clamped to 0.
 */
void calcExpectedEffectiveLengths(ModelType& model) {
  int lb, ub, span;
  double *pdf = NULL, *cdf = NULL;

  model.getGLD().copyTo(pdf, cdf, lb, ub, span);

  // weighted[k] = sum_{j=1}^{k} pdf[j] * (lb + j): prefix sums of length-weighted probabilities
  double *weighted = new double[span + 1];
  weighted[0] = 0.0;
  for (int k = 1; k <= span; ++k)
    weighted[k] = weighted[k - 1] + pdf[k] * (lb + k);

  eel.clear();
  eel.resize(M + 1, 0.0);

  for (int tid = 1; tid <= M; ++tid) {
    const int totLen = refs.getRef(tid).getTotLen();
    const int fullLen = refs.getRef(tid).getFullLen();

    // Offsets into cdf/weighted, clamped to the range [0, ub - lb].
    const int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
    const int pos2 = max(min(totLen, ub) - lb, 0);

    if (pos2 == 0) {
      // No offset above lb fits within totLen for this reference.
      eel[tid] = 0.0;
      continue;
    }

    eel[tid] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (weighted[pos2] - weighted[pos1]));
    assert(eel[tid] >= 0);
    if (eel[tid] < MINEEL) eel[tid] = 0.0;
  }

  delete[] weighted;
  delete[] cdf;
  delete[] pdf;
}
/**
 * Fill in the conditional probabilities of one read (or read pair): the noise
 * probability in ag_in_mem->noise_conprb and, for each alignment, aligns[i].conprb.
 *
 * @param ag_in_mem  in-memory alignment group; `size` is read, `noise_conprb` is written
 * @param aligns     per-alignment records; entries whose conprb equals -1.0 are left untouched
 * @param ag         alignment group supplying sequences, qualities, lengths and CIGARs
 *
 * Quality strings are used only when bit 0 of model_type is set. When
 * model_type >= 2 (paired-end), the mate-2 terms are multiplied into the
 * values computed for mate 1. The read length comes from ag unless
 * read_length is non-negative, in which case read_length is used.
 */
inline void PROBerReadModel::setConProbs(InMemAlignG* ag_in_mem, InMemAlign* aligns, AlignmentGroup& ag) {
	// seq, qual and cigar must be local to each function since multiple threads call in concurrently
	SEQstring seq;
	QUALstring qual;
	CIGARstring cigar;
	const bool has_qual = (model_type & 1) != 0;

	// The fetch calls below fill seq/qual/cigar, so they must execute even
	// when NDEBUG compiles assert() away; only the result check is asserted.
	bool ok = ag.getSEQ(seq);
	assert(ok);
	int seqlen = read_length < 0 ? ag.getSeqLength() : read_length;
	if (has_qual) {
		ok = ag.getQUAL(qual);
		assert(ok);
	}

	// set noise probability
	ag_in_mem->noise_conprb = mld1->getProb(seqlen) * npro->getProb(seq);

	// set alignment probabilities
	for (int i = 0; i < ag_in_mem->size; ++i) {
		if (aligns[i].conprb == -1.0) continue;

		const RefSeq* refseq = refs->getRef(aligns[i].tid);
		ok = ag.getAlignment(i)->getCIGAR(cigar);
		assert(ok);

		// the mate length probability is conditioned on the fragment length only when one is known
		const double len_prob = aligns[i].fragment_length > 0
			? mld1->getProb(seqlen, aligns[i].fragment_length)
			: mld1->getProb(seqlen);
		aligns[i].conprb = len_prob *
			seqmodel->getProb('+', aligns[i].pos, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
	}

	if (model_type >= 2) {
		// paired-end reads: fold in mate 2
		ok = ag.getSEQ(seq, 2);
		assert(ok);
		seqlen = read_length < 0 ? ag.getSeqLength(2) : read_length;
		if (has_qual) {
			ok = ag.getQUAL(qual, 2);
			assert(ok);
		}

		ag_in_mem->noise_conprb *= mld2->getProb(seqlen) * npro->getProb(seq);

		for (int i = 0; i < ag_in_mem->size; ++i) {
			if (aligns[i].conprb == -1.0) continue;

			const RefSeq* refseq = refs->getRef(aligns[i].tid);
			ok = ag.getAlignment(i)->getCIGAR(cigar, 2);
			assert(ok);
			assert(aligns[i].fragment_length > 0);
			// mate 2 is scored on the '-' strand, so its position is measured from the other end of the reference
			aligns[i].conprb *= mld2->getProb(seqlen, aligns[i].fragment_length) *
				seqmodel->getProb('-', refseq->getLen() - aligns[i].pos - aligns[i].fragment_length, refseq, &cigar, &seq, (has_qual ? &qual : NULL));
		}
	}

	(void)ok; // silence "set but unused" when assertions are disabled
}
/**
 * Simulate one read (or read pair) and append it to the output stream(s).
 *
 * @param rid              read id, written into the read name
 * @param tid              reference id; 0 selects the noise branch (sequence drawn
 *                         from npro, CIGAR reported as "*")
 * @param pos              position passed to the sequencing model for mate 1
 * @param fragment_length  fragment length used to draw mate lengths and to place mate 2
 * @param out1             destination for mate 1 (always written)
 * @param out2             destination for mate 2 (written only when model_type >= 2)
 *
 * Records carry a quality block ('@' header, '+' line, qualities) when bit 0
 * of model_type is set and a '>' header with sequence only otherwise.
 * Read names have the form rid_tid_pos_fragmentLength_cigar, with a "/1" or
 * "/2" suffix for paired-end data.
 */
inline void PROBerReadModel::simulate(READ_INT_TYPE rid, int tid, int pos, int fragment_length, std::ofstream* out1, std::ofstream* out2) {
	const bool has_qual = (model_type & 1) != 0;
	const bool paired = model_type >= 2;
	std::string qual1, qual2, cigar1, cigar2, readseq1, readseq2;

	// Simulate reads. The order of sampler calls is kept exactly as before so
	// that a given random stream produces the same reads.
	if (tid == 0) {
		cigar1 = cigar2 = "*";

		const int mateL1 = mld1->simulate(sampler);
		if (has_qual) qd->simulate(sampler, mateL1, qual1);
		npro->simulate(sampler, mateL1, readseq1);

		if (paired) {
			const int mateL2 = mld2->simulate(sampler);
			if (has_qual) qd->simulate(sampler, mateL2, qual2);
			npro->simulate(sampler, mateL2, readseq2);
		}
	}
	else {
		const RefSeq* ref = refs->getRef(tid);

		const int mateL1 = mld1->simulate(sampler, fragment_length);
		if (has_qual) qd->simulate(sampler, mateL1, qual1);
		seqmodel->simulate(sampler, mateL1, '+', pos, ref, qual1, cigar1, readseq1);

		if (paired) {
			const int mateL2 = mld2->simulate(sampler, fragment_length);
			if (has_qual) qd->simulate(sampler, mateL2, qual2);
			// mate 2 is simulated on the '-' strand, so its position is measured from the other end of the reference
			const int m2pos = ref->getLen() - pos - fragment_length;
			seqmodel->simulate(sampler, mateL2, '-', m2pos, ref, qual2, cigar2, readseq2);
		}
	}

	// Output reads. '\n' is used instead of std::endl so that the file is not
	// flushed after every line of every read; the bytes written are identical.
	const char header = has_qual ? '@' : '>';

	(*out1)<< header<< rid<< '_'<< tid<< '_'<< pos<< '_'<< fragment_length<< '_'<< cigar1<< (paired ? "/1" : "")<< '\n';
	(*out1)<< readseq1<< '\n';
	if (has_qual) (*out1)<< '+'<< '\n'<< qual1<< '\n';

	if (paired) {
		(*out2)<< header<< rid<< '_'<< tid<< '_'<< pos<< '_'<< fragment_length<< '_'<< cigar2<< "/2"<< '\n';
		(*out2)<< readseq2<< '\n';
		if (has_qual) (*out2)<< '+'<< '\n'<< qual2<< '\n';
	}
}
Exemple #5
0
void writeToDisk(char* refName) {
	ofstream fout;

	sprintf(tiF, "%s.ti", refName);
	transcripts.writeTo(tiF);
	if (verbose) { printf("Transcript Information File is generated!\n"); }
	
	sprintf(refFastaF, "%s.transcripts.fa", refName);
	refs.writeTo(refFastaF);

	sprintf(transListF, "%s.translist", refName);
	refs.writeTransListTo(transListF);

	sprintf(chromListF, "%s.chrlist", refName);
	fout.open(chromListF);
	for (int i = 0; i < (int)chrvec.size(); ++i)
		fout<< chrvec[i].name<< '\t'<< chrvec[i].len<< endl;
	fout.close();
	if (verbose) { printf("Chromosome List File is generated!\n"); }
	
	string cur_gene_id, cur_transcript_id, name;
	vector<int> gi, gt, ta;

	cur_gene_id = ""; gi.clear(); 
	if (mappingType == 2) { cur_transcript_id = ""; gt.clear(); ta.clear(); }
	for (int i = 1; i <= M; ++i) {
		const Transcript& transcript = transcripts.getTranscriptAt(i);
		if (cur_gene_id != transcript.getGeneID()) {
			gi.push_back(i);
			if (mappingType == 2) gt.push_back((int)ta.size());
			cur_gene_id = transcript.getGeneID();
		}
		if ((mappingType == 2) && (cur_transcript_id != transcript.getTranscriptID())) {
			ta.push_back(i);
			cur_transcript_id = transcript.getTranscriptID();
		}
	}
	
	gi.push_back(M + 1);
	if (mappingType == 2) { gt.push_back((int)ta.size()); ta.push_back(M + 1); }

	sprintf(groupF, "%s.grp", refName);
	fout.open(groupF);
	for (int i = 0; i < (int)gi.size(); ++i) fout<< gi[i]<< endl;
	fout.close();
	if (verbose) { printf("Group File is generated!\n"); }

	if (mappingType == 2) {
		sprintf(gtF, "%s.gt", refName);
		fout.open(gtF);
		for (int i = 0; i < (int)gt.size(); ++i) fout<< gt[i]<< endl;
		fout.close();
		sprintf(taF, "%s.ta", refName);
		fout.open(taF);
		for (int i = 0; i < (int)ta.size(); ++i) fout<< ta[i]<< endl;
		fout.close();
		if (verbose) { printf("Allele-specific group files are generated!\n"); }
	}

	if (n2g_idx) {
		sprintf(n2g_idxF, "%s.n2g.idx.fa", refName);
		fout.open(n2g_idxF);
		for (int i = 1; i <= M; ++i) 
			fout<< '>'<< refs.getRef(i)->getName()<< endl<< n2g(refs.getRef(i)->getSeq())<< endl;
		fout.close();
		if (verbose) printf("%s is generated!\n", n2g_idxF);
	}
}