void CHMM::genotypeProbability(vector<SNP*>::iterator snp_it, double probs[2]) { probs[0] = probs[1] = 1.0; char ref = (*snp_it)->GetRef(); char alt = (*snp_it)->GetAlt(); int read_count = (*snp_it)->GetReadCount(); double errate = 0.01; // REVISIT: I may need to take the nth root here before returning the probabilities // n here is read_count for(int count=0; count<read_count; count++) { READ *rd = (*snp_it)->GetRead(count); char all = rd->GetAllele(count); int hap = rd->GetHap(); double happrob = rd->GetHapProb(); if(all==ref) { probs[0] = probs[0]; probs[1] *= (1-happrob); } else if(all==alt) { probs[0] *= (1-happrob); probs[1] = probs[1]; } else { probs[0] *= errate; probs[1] *= errate; } } // Experimental: Logs for revisit above probs[0] = pow(2.7182, log(probs[0])/read_count); probs[1] = pow(2.7182, log(probs[1])/read_count); probs[2] = pow(2.7182, log(probs[2])/read_count); }
void CHMM::FindFBDistance(CObsSeq *obsSeq, ostream &outFile, long snp_start, long snp_end) { double *scale; double **alpha, **beta; double logProb; int i, j, em, t, k; double **posterior; double post; for(i=1;i<=obsSeq->mNbSequences;i++){ // Loop over observation files: long T = obsSeq->mNbObs[i]; CObs **obs = obsSeq->mObs[i]; alpha = SetMatrix(T, mN);// different size every time beta = SetMatrix(T, mN); scale = SetVector(T); int *q = SetIntVector(T); posterior = SetMatrix(T,mN); // logProb contains log P(x) // Haplotype Calling logProb = ForwardAlgo(alpha, scale, obs, T, TRUE); BackwardAlgo(beta, scale, obs, T); for(t=1;t<=T;t++) { READ *nd = (((CFlexibleObs<READ*>*)(obs[t]))->Get(1)); for(k=1;k<=mN;k++) { // Experimental: P(x) cancels scaling for alpha and beta // so posterior is just their product double st = 1.0; for(j=T; j>t; j--) { st *= scale[j]; } post = (alpha[t][k]*beta[t][k]*scale[t]); #ifdef DEBUG cout << endl; cout << "Scale = " << scale[t] << endl; cout << "Scaled alpha = " << alpha[t][k] << endl; cout << "beta = " << beta[t][k] << endl; cout << "Scaled beta = " << beta[t][k]*scale[t] << endl; #endif posterior[t][k] = post; } // REVISIT: hardcoding #states. q[t] = posterior[t][1] > posterior[t][2] ? 1 : 2; #ifdef DEBUG //cout << "q[t] = " << q[t] << endl; //cout << "posterior = " << posterior[t][q[t]] << endl; #endif nd->assignHaplotype(q[t],posterior[t][q[t]]); } UpdateGenotypes(snp_start, snp_end); for(j=2;j<=T;j++) { outFile << q[j] << "\t" << ((reads_list)[reads_list.size()-T+j-1])->GetPos() << endl; } delete [] q; } }
// Works on known snp list // Given t-th read in list, returns allele of snp with position pos. char GetPosAllele(int t, long pos) { READ *rd = reads_list[t]; int snp_count = rd->GetKnownCount(); SNP ** snp_list = rd->GetKnownList(); for(int i=0; i<snp_count; i++) { if(snp_list[i]->GetPos()==pos) return rd->GetKnownAllele(i); } return 'N'; }
//double CHMM::haplotypeProbability(int refct, int altct, int errct, int g) void CHMM::haplotypeProbability(vector<SNP*>::iterator snp_it, double probs[3]) { probs[0] = 1.0; probs[1] = 1.0; probs[2] = 1.0; char ref = (*snp_it)->GetRef(); char alt = (*snp_it)->GetAlt(); double qual = (*snp_it)->GetQualScore(); double qualscore = pow(10,-(qual/10)); // P(alt==wrong) double errate = 0.01; //cout << "SNP: " << (*snp_it)->GetPos() << " Qual: " << qual << ", " << qualscore << endl; // Here I assume that if an observed allele is not ref (or alt in case of BB), // then it belongs to the other haplotype. for(int count=0; count<(*snp_it)->GetReadCount(); count++) { READ *rd = (*snp_it)->GetRead(count); char all = rd->GetAllele(count); double haprob = rd->GetHapProb(); if(all == ref) { probs[0] = probs[0]; //Do nothing probs[1] *= (1-haprob);//*qualscore; probs[2] *= ((1-haprob)*(qualscore)); } else if(all = alt) { probs[0] *= ((1-haprob)*(qualscore)); probs[1] *= (1-haprob);//*qualscore; probs[2] = probs[2]; //Do nothing } else { probs[0] *= errate; probs[1] *= errate; probs[2] *= errate; } #ifdef DEBUG //cout << "qual = " << qual << endl; //cout << "qualscore = " << qualscore << endl; //cout << "haprob = " << haprob << endl; //cout << "Computing happrobs:" << "\t" << probs[0] << "\t" << probs[1] << "\t" << probs[2] << endl; #endif } }
void NaiveBayes(int snp_start, int snp_end, int T) { int i, j, t; SNP **known_snp_list = new SNP*[100]; int *known_index = new int[200]; int reads_list_size = reads_list.size(); for (t = 1+reads_list_size-T; t <= reads_list_size-1; t++) { int known_snp_count = 0, flag = 0, direction = 1, hap = 0, mismatchsum = 0; double norm = 0.0; double emission_list[3], haprob[3], emission_set[3][100]; READ *pd = reads_list[t-1]; READ *nd = reads_list[t]; int plen = pd->GetLen(); int nlen = nd->GetLen(); int ppos = pd->GetPos(); int npos = nd->GetPos(); int rd_start = ppos < npos ? npos : ppos; int rd_end = ppos+plen > npos+nlen ? npos+nlen-1 : ppos+plen-1; //mismatchsum = GetMismatchSum(nd); //if(mismatchsum>=50) // continue; GetKnownSnpList(known_snp_list, &known_snp_count, known_index, t); // This adds the count of known overlapping snps between adjacent reads (*nd).AddKnownCount(known_snp_count); #ifdef DEBUG if(known_snp_count>0) cout << "Read " << nd->GetPos() << " ; "<< t << " and " << t+1 << ", " << rd_start << ", " << rd_end << " with " << known_snp_count << " known snps." << endl << endl; cout << "SnpPos R A LL..\tAA_gen AA_obs AA_genob\tAB_gen AB_obs AB_genob\tBB_gen BB_obs BB_genob\tprob\n"; cout << "Throwing emissions from read: " << nd->GetPos() << endl; #endif for(j=1;j<=2;j++) { #ifdef DEBUG cout << "Haplotype " << j << endl; #endif emission_list[j] = 0.0; for(int count=0; count<known_snp_count; count++) { double emission; SNP *sp = known_snp_list[count]; #ifdef DEBUG cout << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt(); #endif // Obtains naive bayes score for the count-th overlapping known snp // between the adjacent reads for haplotype, 'j'. // Working with relative haplotype here. emission = compute_new_emission(known_snp_list, count, t, known_index, j); if(emission==0.0) continue; emission_list[j] += ((emission)); emission_set[j][count] = emission; } #ifdef DEBUG cout << "Total Emission = " << emission_list[j] << endl; #endif } if(emission_set[1][0]<emission_set[2][0]) direction = 2; for(int count=1; count<known_snp_count; count++) { if(emission_set[1][count]>emission_set[2][count]&&direction==2) flag = 1; if(emission_set[1][count]<emission_set[2][count]&&direction==1) flag = 1; } #ifdef DEBUG cout << "Discordance = " << flag << endl; #endif norm = emission_list[1] + emission_list[2]; if(norm!=0.0) { haprob[1] = emission_list[1] / norm; haprob[2] = emission_list[2] / norm; } else { haprob[1] = 0.5; haprob[2] = 0.5; } #ifdef DEBUG cout << "Happrob[1] = " << haprob[1] << endl; cout << "Happrob[2] = " << haprob[2] << endl; #endif // If haplotype probabilities are equal, randomly assign haplotype hap = haprob[1] > haprob[2] ? 1 : haprob[1] == haprob[2] ? t%2+1 : 2; double happrob = haprob[hap]; // convert relative haplotype to absolute haplotype hap = (*pd).GetHap() == hap ? 1 : 2; nd->assignHaplotype(hap, happrob, flag); flag = 0; direction = 1; } delete [] known_index; delete [] known_snp_list; }
// Calculate posterior probability int somaticHaplotypeProbability(vector<SNP*>::iterator snp_it, double probs[3], int *known_hap_count) { char ref = (*snp_it)->GetRef(); char alt = (*snp_it)->GetAlt(); int real_count = 0, ref_ct = 0, alt_ct = 0; int ref1_ct = 0, alt1_ct = 0, ref2_ct = 0, alt2_ct = 0, err_ct=0; int contig_max = 0, contig_new = 1, max_stretch = 0; int count = 0, gap = 0, stretch = 0, contiguous = 1, contig_start = 0; int mapq1 = 0, proximal_ins_sum = 0, proximal_del_sum = 0; int clus_flag = 0, clus_in_flag = 1, clus_del_flag = 1; int max_reads = (*snp_it)->GetReadCount(); double probs1[3], probs2[3]; double prevalence; ref_ct = (*snp_it)->GetRefCount(); alt_ct = (*snp_it)->GetAltCount(); // dynamically calculate the prevalence prevalence = (double)(alt_ct)/(double)(ref_ct+alt_ct); *known_hap_count = 0; probs[0] = 1.0; probs[1] = 1.0; probs[2] = 1.0; probs1[0] = 1.0; probs1[1] = 1.0; probs1[2] = 1.0; probs2[0] = 1.0; probs2[1] = 1.0; probs2[2] = 1.0; #ifdef DEBUG cout << " Count on snp " << (*snp_it)->GetPos() << " is " << max_reads << endl; #endif // First for loop looks for maximum contiguous region for(count=0; count<max_reads; count++) { READ *rd = (*snp_it)->GetRead(count); READ *pd = (*snp_it)->GetRead(count-1); if(rd->GetKnownOverlapCount()>0&&rd->GetDiscordance()==0) { // No gap exists in hap assignment (*known_hap_count)++; stretch++; if(contig_new==1) { contig_start = count; contig_new = 0; } } else { // Gap exists in hap assignment. Not contiguous. contiguous = 0; contig_new = 1; if(stretch>max_stretch) { // Identify longest stretch of contiguous assignments max_stretch = stretch; contig_max = contig_start; } stretch = 0; } if(contiguous==1) { max_stretch=stretch; contig_max=0; } else if(stretch>max_stretch) { max_stretch = stretch; contig_max = contig_start; } #ifdef DEBUG cout << "SNP=" << (*snp_it)->GetPos() << " Gap=" << gap << " contiguous=" << contiguous << " max_reads=" << max_reads << " max_stretch=" << max_stretch << " contig_max=" << contig_max << endl; #endif } if(contiguous==0) gap+=2; if(max_stretch<(int)(9*max_reads/10)) // If max stretch of contiguous assigned reads is at least 90% of total number, consider reads for genotype score assignments. gap++; #ifdef DEBUG cout << " Gap=" << gap << " contiguous=" << contiguous << " max_reads=" << max_reads << " max_stretch=" << max_stretch << " contig_max=" << contig_max << " prevalence = " << prevalence << endl; #endif // Second for loop performs actual genotype score calculation for(count=0; count<max_reads; count++) { bool proximal_ins = 0, proximal_del = 0; char all, pall = 'N'; int flag = 0; double qual, qualscore; READ *rd = (*snp_it)->GetRead(count); READ *pd = (*snp_it)->GetRead(count-1); int hap = rd->GetHap(); double haprob = rd->GetHapProb(); double hethap = sqrt((1.0-haprob)*haprob); int snp_count = rd->GetSnpCount(); if(snp_count==0) { #ifdef DEBUG cout << "For som " << (*snp_it)->GetPos() << " skipping read " << rd->GetPos() << " for happrob calculation" << endl; #endif haprob = 0.5; } // Obtain allele, base qual and #proximal insertions/deletions on current read long snp_position = (*snp_it)->GetPos(); for(int s_pos=0; s_pos<snp_count; s_pos++) { if(rd->GetSnp(s_pos)->GetPos() == snp_position) { proximal_ins = rd->GetProximalInsert(s_pos); proximal_del = rd->GetProximalDelete(s_pos); all = rd->GetAllele(s_pos); qual = rd->GetQualScore(s_pos)-33; qualscore = pow(10.0,-(qual/10.0)); break; } } if(hap==1) { // See comments later if(all == ref) { ref1_ct++; } else if(all == alt) { alt1_ct++; } else err_ct++; } else if(hap==2) { if(all == ref) { ref2_ct++; } else if(all == alt) { alt2_ct++; } else err_ct++; } real_count++; if(qual<5) // No base with quality < 5 should be considered continue; if(all==alt && qual>=20) // At least one alternate allele with base quality >20 mapq1++; if(proximal_ins==1) { // Read contains insertion in vicinity proximal_ins_sum+= 1; } if(proximal_del==1) { // Read contains deletion in vicinity proximal_del_sum+= 1; } if(all==alt) { clus_in_flag--; clus_del_flag--; if(proximal_ins==1) clus_in_flag++; if(proximal_del==1) clus_del_flag++; } proximal_ins = 0; proximal_del = 0; // Work on contiguous stretch only if(count>=contig_max&&count<contig_max+max_stretch) { // Since we are haplotype-aware, two separate set of calculations are performed // First set assumes somatic mutation lies on haplotype 1 // Second set assumes somatic mutation lies on haplotype 2 // In the end we consider that haplotype to contain somatic mutation, which has higher het probability assigned if(hap==1) { if(all == ref) { probs1[0] *= ((haprob)*(1.0-qualscore)); probs1[1] *= ((1.0-prevalence-qualscore/4.0)*haprob); probs1[2] *= ((1.0-haprob) * (qualscore/3.0)); probs2[0] *= ((haprob)*(1.0-qualscore)); probs2[1] *= ((1.0-prevalence-qualscore/4.0)*(1.0-haprob)); probs2[2] *= ((1.0-haprob) * (qualscore/3.0)); } else if(all == alt) { probs1[0] *= (haprob*(1.0*qualscore/3.0)); probs1[1] *= ((1.0*qualscore/3.0)*(1.0-haprob)); probs1[2] *= ((1.0-haprob)*(1.0*qualscore/3.0)); probs2[0] *= ((1.0-haprob) * (qualscore/3.0)); probs2[1] *= (prevalence*haprob); probs2[2] *= ((haprob)*(1.0-qualscore)); } } else if(hap==2) { if(all == ref) { probs1[0] *= ((haprob)*(1.0-qualscore)); probs1[1] *= ((1.0-prevalence-qualscore/4.0)*(1.0-haprob)); probs1[2] *= ((1.0-haprob) * (qualscore/3.0)); probs2[0] *= ((haprob)*(1.0-qualscore)); probs2[1] *= ((1.0-prevalence-qualscore/4.0)*haprob); probs2[2] *= ((1.0-haprob) * (qualscore/3.0)); } else if(all == alt) { probs1[0] *= ((1.0-haprob) * (qualscore/3.0)); probs1[1] *= (prevalence*haprob); probs1[2] *= ((haprob)*(1.0-qualscore)); probs2[0] *= (haprob*(1.0*qualscore/3.0)); probs2[1] *= ((1.0*qualscore/3.0)*(1.0-haprob)); probs2[2] *= ((1.0-haprob)*(1.0*qualscore/3.0)); } } } else if(rd->GetKnownOverlapCount()>=0) { // Insufficient contiguous regions. Haplotype unaware if(all == ref) { probs1[0] *= ((1.0-qualscore)); probs1[1] *= (1.0-prevalence-qualscore/3.0); probs1[2] *= ((qualscore/3.0)); probs2[0] *= ((1.0-qualscore)); probs2[1] *= (1.0-prevalence-qualscore/3.0); probs2[2] *= ((qualscore/3.0)); } else if(all == alt) { probs1[0] *= ((qualscore/3.0)); probs1[1] *= (prevalence-qualscore/3.0); probs1[2] *= ((1.0-qualscore)); probs2[0] *= ((qualscore/3.0)); probs2[1] *= (prevalence-qualscore/3.0); probs2[2] *= ((1.0-qualscore)); } else { probs1[0] *= ((2.0*qualscore/3.0)); probs1[1] *= (qualscore/2.0); probs1[2] *= ((2.0*qualscore/3.0)); probs2[0] *= ((2.0*qualscore/3.0)); probs2[1] *= (qualscore/2.0); probs2[2] *= ((2.0*qualscore/3.0)); } } #ifdef DEBUG cout << "For som " << (*snp_it)->GetPos() << " bearing allele " << all << ref << alt << " on read " << rd->GetPos() << ", qualscore: " << qualscore << " has known_hap_count " << rd->GetKnownOverlapCount() << " and flag " << flag << " bearing haplotype " << hap << " with haprob " << haprob << " for happrob calculation " << probs1[0] << ":" << probs1[1] << ":" << probs1[2] << " and " << probs2[0] << ":" << probs2[1] << ":" << probs2[2] << " and proxdel " << proximal_del << " and proxins " << proximal_ins << " and clus_in " << clus_in_flag << " and clus_del " << clus_del_flag << " and clus " << clus_flag << endl; #endif } // Pick one of two sets of scores double norm1 = probs1[0] + probs1[1] + probs1[2]; double norm2 = probs2[0] + probs2[1] + probs2[2]; if(probs1[1]/norm1>probs2[1]/norm1) { probs[0]=probs1[0];probs[1]=probs1[1];probs[2]=probs1[2]; } else if(probs2[1]/norm2>probs1[1]/norm1) { probs[0]=probs2[0];probs[1]=probs2[1];probs[2]=probs2[2]; } else if(probs1[0]/norm1<probs2[0]/norm2) { probs[0]=probs1[0];probs[1]=probs1[1];probs[2]=probs1[2]; } else if(probs2[0]/norm2<probs1[0]/norm1) { probs[0]=probs2[0];probs[1]=probs2[1];probs[2]=probs2[2]; } else { probs[0]=probs1[0];probs[1]=probs1[1];probs[2]=probs1[2]; } PROXIMAL_READ_CT = 2 + (int)ceil((double)(real_count)/10.0); if(clus_in_flag>=1||clus_del_flag>=1) clus_flag = 1; // FILTERS if(ref_ct<3 || alt_ct<1 || (gap!=3&&alt1_ct>=2&&alt2_ct>=2) || mapq1==0 || proximal_ins_sum>=PROXIMAL_READ_CT&&alt_ct<5 || proximal_del_sum>=PROXIMAL_READ_CT&&alt_ct<5 || clus_flag==1) { // || (gap==3&&alt_ct>=ref_ct)) gap=5; if(mapq1==0) gap = 6; if(proximal_ins_sum>=PROXIMAL_READ_CT&&alt_ct<5 || proximal_del_sum>=PROXIMAL_READ_CT&&alt_ct<5) gap = 7; if(clus_flag==1) gap = 8; probs[0] = 1.0; probs[1] = probs[2] = 0.0; } if(probs[0]<0.0||probs[1]<0.0||probs[2]<0.0) { cout << "Negative: " << probs[0] << ":" << probs[1] << ":" << probs[2] << endl; probs[0] = 1.0; probs[1] = probs[2] = 0.0; gap=5; } #ifdef DEBUG cout << "Accepted mutation " << (*snp_it)->GetPos() << " Prox=" << PROXIMAL_READ_CT << " Gap=" << gap << " known=" << (*snp_it)->GetKnown() << " real_count=" << real_count << " ref_ct=" << ref_ct << " ref1_ct=" << ref1_ct << " ref2_ct=" << ref2_ct << " alt_ct=" << alt_ct << " alt1_ct=" << alt1_ct << " alt2_ct=" << alt2_ct << " proxdelsum=" << proximal_del_sum << " proxinssum=" << proximal_ins_sum << " clus_in=" << clus_in_flag << " clus_del=" << clus_del_flag << " clus=" << clus_flag << endl; #endif return gap; }