void CHMM::genotypeProbability(vector<SNP*>::iterator snp_it, double probs[2]) { probs[0] = probs[1] = 1.0; char ref = (*snp_it)->GetRef(); char alt = (*snp_it)->GetAlt(); int read_count = (*snp_it)->GetReadCount(); double errate = 0.01; // REVISIT: I may need to take the nth root here before returning the probabilities // n here is read_count for(int count=0; count<read_count; count++) { READ *rd = (*snp_it)->GetRead(count); char all = rd->GetAllele(count); int hap = rd->GetHap(); double happrob = rd->GetHapProb(); if(all==ref) { probs[0] = probs[0]; probs[1] *= (1-happrob); } else if(all==alt) { probs[0] *= (1-happrob); probs[1] = probs[1]; } else { probs[0] *= errate; probs[1] *= errate; } } // Experimental: Logs for revisit above probs[0] = pow(2.7182, log(probs[0])/read_count); probs[1] = pow(2.7182, log(probs[1])/read_count); probs[2] = pow(2.7182, log(probs[2])/read_count); }
//double CHMM::haplotypeProbability(int refct, int altct, int errct, int g) void CHMM::haplotypeProbability(vector<SNP*>::iterator snp_it, double probs[3]) { probs[0] = 1.0; probs[1] = 1.0; probs[2] = 1.0; char ref = (*snp_it)->GetRef(); char alt = (*snp_it)->GetAlt(); double qual = (*snp_it)->GetQualScore(); double qualscore = pow(10,-(qual/10)); // P(alt==wrong) double errate = 0.01; //cout << "SNP: " << (*snp_it)->GetPos() << " Qual: " << qual << ", " << qualscore << endl; // Here I assume that if an observed allele is not ref (or alt in case of BB), // then it belongs to the other haplotype. for(int count=0; count<(*snp_it)->GetReadCount(); count++) { READ *rd = (*snp_it)->GetRead(count); char all = rd->GetAllele(count); double haprob = rd->GetHapProb(); if(all == ref) { probs[0] = probs[0]; //Do nothing probs[1] *= (1-haprob);//*qualscore; probs[2] *= ((1-haprob)*(qualscore)); } else if(all = alt) { probs[0] *= ((1-haprob)*(qualscore)); probs[1] *= (1-haprob);//*qualscore; probs[2] = probs[2]; //Do nothing } else { probs[0] *= errate; probs[1] *= errate; probs[2] *= errate; } #ifdef DEBUG //cout << "qual = " << qual << endl; //cout << "qualscore = " << qualscore << endl; //cout << "haprob = " << haprob << endl; //cout << "Computing happrobs:" << "\t" << probs[0] << "\t" << probs[1] << "\t" << probs[2] << endl; #endif } }
// Calculate posterior probability int somaticHaplotypeProbability(vector<SNP*>::iterator snp_it, double probs[3], int *known_hap_count) { char ref = (*snp_it)->GetRef(); char alt = (*snp_it)->GetAlt(); int real_count = 0, ref_ct = 0, alt_ct = 0; int ref1_ct = 0, alt1_ct = 0, ref2_ct = 0, alt2_ct = 0, err_ct=0; int contig_max = 0, contig_new = 1, max_stretch = 0; int count = 0, gap = 0, stretch = 0, contiguous = 1, contig_start = 0; int mapq1 = 0, proximal_ins_sum = 0, proximal_del_sum = 0; int clus_flag = 0, clus_in_flag = 1, clus_del_flag = 1; int max_reads = (*snp_it)->GetReadCount(); double probs1[3], probs2[3]; double prevalence; ref_ct = (*snp_it)->GetRefCount(); alt_ct = (*snp_it)->GetAltCount(); // dynamically calculate the prevalence prevalence = (double)(alt_ct)/(double)(ref_ct+alt_ct); *known_hap_count = 0; probs[0] = 1.0; probs[1] = 1.0; probs[2] = 1.0; probs1[0] = 1.0; probs1[1] = 1.0; probs1[2] = 1.0; probs2[0] = 1.0; probs2[1] = 1.0; probs2[2] = 1.0; #ifdef DEBUG cout << " Count on snp " << (*snp_it)->GetPos() << " is " << max_reads << endl; #endif // First for loop looks for maximum contiguous region for(count=0; count<max_reads; count++) { READ *rd = (*snp_it)->GetRead(count); READ *pd = (*snp_it)->GetRead(count-1); if(rd->GetKnownOverlapCount()>0&&rd->GetDiscordance()==0) { // No gap exists in hap assignment (*known_hap_count)++; stretch++; if(contig_new==1) { contig_start = count; contig_new = 0; } } else { // Gap exists in hap assignment. Not contiguous. contiguous = 0; contig_new = 1; if(stretch>max_stretch) { // Identify longest stretch of contiguous assignments max_stretch = stretch; contig_max = contig_start; } stretch = 0; } if(contiguous==1) { max_stretch=stretch; contig_max=0; } else if(stretch>max_stretch) { max_stretch = stretch; contig_max = contig_start; } #ifdef DEBUG cout << "SNP=" << (*snp_it)->GetPos() << " Gap=" << gap << " contiguous=" << contiguous << " max_reads=" << max_reads << " max_stretch=" << max_stretch << " contig_max=" << contig_max << endl; #endif } if(contiguous==0) gap+=2; if(max_stretch<(int)(9*max_reads/10)) // If max stretch of contiguous assigned reads is at least 90% of total number, consider reads for genotype score assignments. gap++; #ifdef DEBUG cout << " Gap=" << gap << " contiguous=" << contiguous << " max_reads=" << max_reads << " max_stretch=" << max_stretch << " contig_max=" << contig_max << " prevalence = " << prevalence << endl; #endif // Second for loop performs actual genotype score calculation for(count=0; count<max_reads; count++) { bool proximal_ins = 0, proximal_del = 0; char all, pall = 'N'; int flag = 0; double qual, qualscore; READ *rd = (*snp_it)->GetRead(count); READ *pd = (*snp_it)->GetRead(count-1); int hap = rd->GetHap(); double haprob = rd->GetHapProb(); double hethap = sqrt((1.0-haprob)*haprob); int snp_count = rd->GetSnpCount(); if(snp_count==0) { #ifdef DEBUG cout << "For som " << (*snp_it)->GetPos() << " skipping read " << rd->GetPos() << " for happrob calculation" << endl; #endif haprob = 0.5; } // Obtain allele, base qual and #proximal insertions/deletions on current read long snp_position = (*snp_it)->GetPos(); for(int s_pos=0; s_pos<snp_count; s_pos++) { if(rd->GetSnp(s_pos)->GetPos() == snp_position) { proximal_ins = rd->GetProximalInsert(s_pos); proximal_del = rd->GetProximalDelete(s_pos); all = rd->GetAllele(s_pos); qual = rd->GetQualScore(s_pos)-33; qualscore = pow(10.0,-(qual/10.0)); break; } } if(hap==1) { // See comments later if(all == ref) { ref1_ct++; } else if(all == alt) { alt1_ct++; } else err_ct++; } else if(hap==2) { if(all == ref) { ref2_ct++; } else if(all == alt) { alt2_ct++; } else err_ct++; } real_count++; if(qual<5) // No base with quality < 5 should be considered continue; if(all==alt && qual>=20) // At least one alternate allele with base quality >20 mapq1++; if(proximal_ins==1) { // Read contains insertion in vicinity proximal_ins_sum+= 1; } if(proximal_del==1) { // Read contains deletion in vicinity proximal_del_sum+= 1; } if(all==alt) { clus_in_flag--; clus_del_flag--; if(proximal_ins==1) clus_in_flag++; if(proximal_del==1) clus_del_flag++; } proximal_ins = 0; proximal_del = 0; // Work on contiguous stretch only if(count>=contig_max&&count<contig_max+max_stretch) { // Since we are haplotype-aware, two separate set of calculations are performed // First set assumes somatic mutation lies on haplotype 1 // Second set assumes somatic mutation lies on haplotype 2 // In the end we consider that haplotype to contain somatic mutation, which has higher het probability assigned if(hap==1) { if(all == ref) { probs1[0] *= ((haprob)*(1.0-qualscore)); probs1[1] *= ((1.0-prevalence-qualscore/4.0)*haprob); probs1[2] *= ((1.0-haprob) * (qualscore/3.0)); probs2[0] *= ((haprob)*(1.0-qualscore)); probs2[1] *= ((1.0-prevalence-qualscore/4.0)*(1.0-haprob)); probs2[2] *= ((1.0-haprob) * (qualscore/3.0)); } else if(all == alt) { probs1[0] *= (haprob*(1.0*qualscore/3.0)); probs1[1] *= ((1.0*qualscore/3.0)*(1.0-haprob)); probs1[2] *= ((1.0-haprob)*(1.0*qualscore/3.0)); probs2[0] *= ((1.0-haprob) * (qualscore/3.0)); probs2[1] *= (prevalence*haprob); probs2[2] *= ((haprob)*(1.0-qualscore)); } } else if(hap==2) { if(all == ref) { probs1[0] *= ((haprob)*(1.0-qualscore)); probs1[1] *= ((1.0-prevalence-qualscore/4.0)*(1.0-haprob)); probs1[2] *= ((1.0-haprob) * (qualscore/3.0)); probs2[0] *= ((haprob)*(1.0-qualscore)); probs2[1] *= ((1.0-prevalence-qualscore/4.0)*haprob); probs2[2] *= ((1.0-haprob) * (qualscore/3.0)); } else if(all == alt) { probs1[0] *= ((1.0-haprob) * (qualscore/3.0)); probs1[1] *= (prevalence*haprob); probs1[2] *= ((haprob)*(1.0-qualscore)); probs2[0] *= (haprob*(1.0*qualscore/3.0)); probs2[1] *= ((1.0*qualscore/3.0)*(1.0-haprob)); probs2[2] *= ((1.0-haprob)*(1.0*qualscore/3.0)); } } } else if(rd->GetKnownOverlapCount()>=0) { // Insufficient contiguous regions. Haplotype unaware if(all == ref) { probs1[0] *= ((1.0-qualscore)); probs1[1] *= (1.0-prevalence-qualscore/3.0); probs1[2] *= ((qualscore/3.0)); probs2[0] *= ((1.0-qualscore)); probs2[1] *= (1.0-prevalence-qualscore/3.0); probs2[2] *= ((qualscore/3.0)); } else if(all == alt) { probs1[0] *= ((qualscore/3.0)); probs1[1] *= (prevalence-qualscore/3.0); probs1[2] *= ((1.0-qualscore)); probs2[0] *= ((qualscore/3.0)); probs2[1] *= (prevalence-qualscore/3.0); probs2[2] *= ((1.0-qualscore)); } else { probs1[0] *= ((2.0*qualscore/3.0)); probs1[1] *= (qualscore/2.0); probs1[2] *= ((2.0*qualscore/3.0)); probs2[0] *= ((2.0*qualscore/3.0)); probs2[1] *= (qualscore/2.0); probs2[2] *= ((2.0*qualscore/3.0)); } } #ifdef DEBUG cout << "For som " << (*snp_it)->GetPos() << " bearing allele " << all << ref << alt << " on read " << rd->GetPos() << ", qualscore: " << qualscore << " has known_hap_count " << rd->GetKnownOverlapCount() << " and flag " << flag << " bearing haplotype " << hap << " with haprob " << haprob << " for happrob calculation " << probs1[0] << ":" << probs1[1] << ":" << probs1[2] << " and " << probs2[0] << ":" << probs2[1] << ":" << probs2[2] << " and proxdel " << proximal_del << " and proxins " << proximal_ins << " and clus_in " << clus_in_flag << " and clus_del " << clus_del_flag << " and clus " << clus_flag << endl; #endif } // Pick one of two sets of scores double norm1 = probs1[0] + probs1[1] + probs1[2]; double norm2 = probs2[0] + probs2[1] + probs2[2]; if(probs1[1]/norm1>probs2[1]/norm1) { probs[0]=probs1[0];probs[1]=probs1[1];probs[2]=probs1[2]; } else if(probs2[1]/norm2>probs1[1]/norm1) { probs[0]=probs2[0];probs[1]=probs2[1];probs[2]=probs2[2]; } else if(probs1[0]/norm1<probs2[0]/norm2) { probs[0]=probs1[0];probs[1]=probs1[1];probs[2]=probs1[2]; } else if(probs2[0]/norm2<probs1[0]/norm1) { probs[0]=probs2[0];probs[1]=probs2[1];probs[2]=probs2[2]; } else { probs[0]=probs1[0];probs[1]=probs1[1];probs[2]=probs1[2]; } PROXIMAL_READ_CT = 2 + (int)ceil((double)(real_count)/10.0); if(clus_in_flag>=1||clus_del_flag>=1) clus_flag = 1; // FILTERS if(ref_ct<3 || alt_ct<1 || (gap!=3&&alt1_ct>=2&&alt2_ct>=2) || mapq1==0 || proximal_ins_sum>=PROXIMAL_READ_CT&&alt_ct<5 || proximal_del_sum>=PROXIMAL_READ_CT&&alt_ct<5 || clus_flag==1) { // || (gap==3&&alt_ct>=ref_ct)) gap=5; if(mapq1==0) gap = 6; if(proximal_ins_sum>=PROXIMAL_READ_CT&&alt_ct<5 || proximal_del_sum>=PROXIMAL_READ_CT&&alt_ct<5) gap = 7; if(clus_flag==1) gap = 8; probs[0] = 1.0; probs[1] = probs[2] = 0.0; } if(probs[0]<0.0||probs[1]<0.0||probs[2]<0.0) { cout << "Negative: " << probs[0] << ":" << probs[1] << ":" << probs[2] << endl; probs[0] = 1.0; probs[1] = probs[2] = 0.0; gap=5; } #ifdef DEBUG cout << "Accepted mutation " << (*snp_it)->GetPos() << " Prox=" << PROXIMAL_READ_CT << " Gap=" << gap << " known=" << (*snp_it)->GetKnown() << " real_count=" << real_count << " ref_ct=" << ref_ct << " ref1_ct=" << ref1_ct << " ref2_ct=" << ref2_ct << " alt_ct=" << alt_ct << " alt1_ct=" << alt1_ct << " alt2_ct=" << alt2_ct << " proxdelsum=" << proximal_del_sum << " proxinssum=" << proximal_ins_sum << " clus_in=" << clus_in_flag << " clus_del=" << clus_del_flag << " clus=" << clus_flag << endl; #endif return gap; }