void CHMM::FindFBDistance(CObsSeq *obsSeq, ostream &outFile, long snp_start, long snp_end) { double *scale; double **alpha, **beta; double logProb; int i, j, em, t, k; double **posterior; double post; for(i=1;i<=obsSeq->mNbSequences;i++){ // Loop over observation files: long T = obsSeq->mNbObs[i]; CObs **obs = obsSeq->mObs[i]; alpha = SetMatrix(T, mN);// different size every time beta = SetMatrix(T, mN); scale = SetVector(T); int *q = SetIntVector(T); posterior = SetMatrix(T,mN); // logProb contains log P(x) // Haplotype Calling logProb = ForwardAlgo(alpha, scale, obs, T, TRUE); BackwardAlgo(beta, scale, obs, T); for(t=1;t<=T;t++) { READ *nd = (((CFlexibleObs<READ*>*)(obs[t]))->Get(1)); for(k=1;k<=mN;k++) { // Experimental: P(x) cancels scaling for alpha and beta // so posterior is just their product double st = 1.0; for(j=T; j>t; j--) { st *= scale[j]; } post = (alpha[t][k]*beta[t][k]*scale[t]); #ifdef DEBUG cout << endl; cout << "Scale = " << scale[t] << endl; cout << "Scaled alpha = " << alpha[t][k] << endl; cout << "beta = " << beta[t][k] << endl; cout << "Scaled beta = " << beta[t][k]*scale[t] << endl; #endif posterior[t][k] = post; } // REVISIT: hardcoding #states. q[t] = posterior[t][1] > posterior[t][2] ? 1 : 2; #ifdef DEBUG //cout << "q[t] = " << q[t] << endl; //cout << "posterior = " << posterior[t][q[t]] << endl; #endif nd->assignHaplotype(q[t],posterior[t][q[t]]); } UpdateGenotypes(snp_start, snp_end); for(j=2;j<=T;j++) { outFile << q[j] << "\t" << ((reads_list)[reads_list.size()-T+j-1])->GetPos() << endl; } delete [] q; } }
void NaiveBayes(int snp_start, int snp_end, int T) { int i, j, t; SNP **known_snp_list = new SNP*[100]; int *known_index = new int[200]; int reads_list_size = reads_list.size(); for (t = 1+reads_list_size-T; t <= reads_list_size-1; t++) { int known_snp_count = 0, flag = 0, direction = 1, hap = 0, mismatchsum = 0; double norm = 0.0; double emission_list[3], haprob[3], emission_set[3][100]; READ *pd = reads_list[t-1]; READ *nd = reads_list[t]; int plen = pd->GetLen(); int nlen = nd->GetLen(); int ppos = pd->GetPos(); int npos = nd->GetPos(); int rd_start = ppos < npos ? npos : ppos; int rd_end = ppos+plen > npos+nlen ? npos+nlen-1 : ppos+plen-1; //mismatchsum = GetMismatchSum(nd); //if(mismatchsum>=50) // continue; GetKnownSnpList(known_snp_list, &known_snp_count, known_index, t); // This adds the count of known overlapping snps between adjacent reads (*nd).AddKnownCount(known_snp_count); #ifdef DEBUG if(known_snp_count>0) cout << "Read " << nd->GetPos() << " ; "<< t << " and " << t+1 << ", " << rd_start << ", " << rd_end << " with " << known_snp_count << " known snps." << endl << endl; cout << "SnpPos R A LL..\tAA_gen AA_obs AA_genob\tAB_gen AB_obs AB_genob\tBB_gen BB_obs BB_genob\tprob\n"; cout << "Throwing emissions from read: " << nd->GetPos() << endl; #endif for(j=1;j<=2;j++) { #ifdef DEBUG cout << "Haplotype " << j << endl; #endif emission_list[j] = 0.0; for(int count=0; count<known_snp_count; count++) { double emission; SNP *sp = known_snp_list[count]; #ifdef DEBUG cout << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt(); #endif // Obtains naive bayes score for the count-th overlapping known snp // between the adjacent reads for haplotype, 'j'. // Working with relative haplotype here. emission = compute_new_emission(known_snp_list, count, t, known_index, j); if(emission==0.0) continue; emission_list[j] += ((emission)); emission_set[j][count] = emission; } #ifdef DEBUG cout << "Total Emission = " << emission_list[j] << endl; #endif } if(emission_set[1][0]<emission_set[2][0]) direction = 2; for(int count=1; count<known_snp_count; count++) { if(emission_set[1][count]>emission_set[2][count]&&direction==2) flag = 1; if(emission_set[1][count]<emission_set[2][count]&&direction==1) flag = 1; } #ifdef DEBUG cout << "Discordance = " << flag << endl; #endif norm = emission_list[1] + emission_list[2]; if(norm!=0.0) { haprob[1] = emission_list[1] / norm; haprob[2] = emission_list[2] / norm; } else { haprob[1] = 0.5; haprob[2] = 0.5; } #ifdef DEBUG cout << "Happrob[1] = " << haprob[1] << endl; cout << "Happrob[2] = " << haprob[2] << endl; #endif // If haplotype probabilities are equal, randomly assign haplotype hap = haprob[1] > haprob[2] ? 1 : haprob[1] == haprob[2] ? t%2+1 : 2; double happrob = haprob[hap]; // convert relative haplotype to absolute haplotype hap = (*pd).GetHap() == hap ? 1 : 2; nd->assignHaplotype(hap, happrob, flag); flag = 0; direction = 1; } delete [] known_index; delete [] known_snp_list; }