// Integration test: builds a DataFilesReader from the mocked configuration,
// reads the SNP information and checks that 10 SNPs are returned, of which
// 8 report shouldInclude(). The reader, every SNP and the container are
// released by hand at the end of the test.
TEST_F(FileIOIntegrationTest, ReadSNPInfo) {
  EXPECT_CALL(configMock, excludeSNPsWithNegativePosition())
      .Times(AtLeast(1))
      .WillRepeatedly(Return(true));
  EXPECT_CALL(configMock, getPhenotypeCoding())
      .Times(1)
      .WillRepeatedly(Return(ZERO_ONE_CODING));

  DataFilesReaderFactory dataFilesReaderFactory;
  DataFilesReader* dataFilesReader = dataFilesReaderFactory.constructDataFilesReader(configMock);
  std::vector<SNP*>* snpInformation = dataFilesReader->readSNPInformation();

  // Count the SNPs that are flagged for inclusion.
  const int snpSize = snpInformation->size();
  int numSNPToInclude = 0;
  for (std::vector<SNP*>::iterator snpIt = snpInformation->begin();
       snpIt != snpInformation->end(); ++snpIt) {
    if ((*snpIt)->shouldInclude()) {
      ++numSNPToInclude;
    }
  }

  EXPECT_EQ(10, snpSize);
  EXPECT_EQ(8, numSNPToInclude);

  // Clean up: the reader first, then each SNP, then the vector itself.
  delete dataFilesReader;
  for (std::vector<SNP*>::iterator snpIt = snpInformation->begin();
       snpIt != snpInformation->end(); ++snpIt) {
    delete *snpIt;
  }
  delete snpInformation;
}
// Sums GetQualScore() over every SNP of `read` whose GetKnown() equals 0
// and returns the total truncated to int.
int GetMismatchSum(READ *read)
{
  const int snp_total = read->GetSnpCount();
  SNP **snps = read->GetSnpList();

  double qual_total = 0.0;
  for (int k = 0; k < snp_total; k++) {
    SNP *current = snps[k];
    if (current->GetKnown() == 0) {
      qual_total += current->GetQualScore();
    }
  }
  return (int)qual_total;
}
// Renders the table as text: one line per idMap entry whose branch length is
// positive. Each line holds length / tTot followed by the tab-separated
// elements of the SNP key.
//
// NOTE(review): this also prints "tTot:\t<tTot>" to stdout on every call.
// That looks like leftover debug output, but it is preserved here because
// callers may depend on it — confirm before removing.
string FreqTable::toString() {
  stringstream ss;
  std::cout <<"tTot:"<<"\t"<<this->tTot<<endl;

  boost::unordered_map<SNP, int>::iterator iter;
  for (iter = this->idMap.begin(); iter != this->idMap.end(); ++iter) {
    // Bind by reference: the key is only read, so copying it per entry is waste.
    const SNP& v = iter->first;
    const double length = this->branchLengths[iter->second];
    if (length > 0) {
      ss << length / this->tTot;
      for (vector<int>::const_iterator it2 = v.begin(); it2 != v.end(); ++it2) {
        ss << "\t" << (*it2);
      }
      ss << endl;
    }
  }
  return ss.str();
}
// Two SNPs compare equal when this object's id equals the other's getId().
bool SNP::operator==(const SNP& otherSNP) const
{
  return this->id == otherSNP.getId();
}
// Orders SNPs by id: this object sorts first when its id is less than the
// other's getId().
bool SNP::operator<(const SNP& otherSNP) const
{
  return this->id < otherSNP.getId();
}
void NaiveBayes(int snp_start, int snp_end, int T) { int i, j, t; SNP **known_snp_list = new SNP*[100]; int *known_index = new int[200]; int reads_list_size = reads_list.size(); for (t = 1+reads_list_size-T; t <= reads_list_size-1; t++) { int known_snp_count = 0, flag = 0, direction = 1, hap = 0, mismatchsum = 0; double norm = 0.0; double emission_list[3], haprob[3], emission_set[3][100]; READ *pd = reads_list[t-1]; READ *nd = reads_list[t]; int plen = pd->GetLen(); int nlen = nd->GetLen(); int ppos = pd->GetPos(); int npos = nd->GetPos(); int rd_start = ppos < npos ? npos : ppos; int rd_end = ppos+plen > npos+nlen ? npos+nlen-1 : ppos+plen-1; //mismatchsum = GetMismatchSum(nd); //if(mismatchsum>=50) // continue; GetKnownSnpList(known_snp_list, &known_snp_count, known_index, t); // This adds the count of known overlapping snps between adjacent reads (*nd).AddKnownCount(known_snp_count); #ifdef DEBUG if(known_snp_count>0) cout << "Read " << nd->GetPos() << " ; "<< t << " and " << t+1 << ", " << rd_start << ", " << rd_end << " with " << known_snp_count << " known snps." << endl << endl; cout << "SnpPos R A LL..\tAA_gen AA_obs AA_genob\tAB_gen AB_obs AB_genob\tBB_gen BB_obs BB_genob\tprob\n"; cout << "Throwing emissions from read: " << nd->GetPos() << endl; #endif for(j=1;j<=2;j++) { #ifdef DEBUG cout << "Haplotype " << j << endl; #endif emission_list[j] = 0.0; for(int count=0; count<known_snp_count; count++) { double emission; SNP *sp = known_snp_list[count]; #ifdef DEBUG cout << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt(); #endif // Obtains naive bayes score for the count-th overlapping known snp // between the adjacent reads for haplotype, 'j'. // Working with relative haplotype here. 
emission = compute_new_emission(known_snp_list, count, t, known_index, j); if(emission==0.0) continue; emission_list[j] += ((emission)); emission_set[j][count] = emission; } #ifdef DEBUG cout << "Total Emission = " << emission_list[j] << endl; #endif } if(emission_set[1][0]<emission_set[2][0]) direction = 2; for(int count=1; count<known_snp_count; count++) { if(emission_set[1][count]>emission_set[2][count]&&direction==2) flag = 1; if(emission_set[1][count]<emission_set[2][count]&&direction==1) flag = 1; } #ifdef DEBUG cout << "Discordance = " << flag << endl; #endif norm = emission_list[1] + emission_list[2]; if(norm!=0.0) { haprob[1] = emission_list[1] / norm; haprob[2] = emission_list[2] / norm; } else { haprob[1] = 0.5; haprob[2] = 0.5; } #ifdef DEBUG cout << "Happrob[1] = " << haprob[1] << endl; cout << "Happrob[2] = " << haprob[2] << endl; #endif // If haplotype probabilities are equal, randomly assign haplotype hap = haprob[1] > haprob[2] ? 1 : haprob[1] == haprob[2] ? t%2+1 : 2; double happrob = haprob[hap]; // convert relative haplotype to absolute haplotype hap = (*pd).GetHap() == hap ? 1 : 2; nd->assignHaplotype(hap, happrob, flag); flag = 0; direction = 1; } delete [] known_index; delete [] known_snp_list; }
double CHMM::ViterbiLog(CObs **obs, long T, int *q, double *probarray) // returns sequence q of most probable states // and probability of seeing that sequence // if A, B, pi are already given // This implementation uses logarithms to avoid underflows. { int i, j; // state indices int t; // time index int argmaxval; double prob; double maxval, val, bVal, logVal; double logProb; double *prevDelta, *delta, *tmp; int **psi; double **logBiOt; int zeroProbCount; delta = SetVector(mN); prevDelta = SetVector(mN); psi = SetIntMatrix(T, mN); // We do not preprocess the logs for B // because the data have variable length T logBiOt = SetMatrix(mN, T); for (t = 1; t <= 1; t++){ zeroProbCount = 0; for (i = 1; i <= mN; i++){ bVal = mB->at(i, t); // obs[t] is an entire seq of snps in a read if(bVal<=0.0){ logVal = -10.0; zeroProbCount++; } else{ logVal = log(bVal); } logBiOt[i][t] = logVal; }// for i if(zeroProbCount == mN){// unseen obs, Viterbi decides which seen obs to use cerr << "*** Unseen data, renormalizing logBiOt to equiprobs ***"<<endl; for (i = 1; i <= mN; i++){ logBiOt[i][t] = log(1.0/mN); } } }// for t // 1. Initialization // Initialization is performed for prevDelta and psi only. // PrevDelta stores probability to stage k-1 for all states for (i = 1; i <= mN; i++){ prevDelta[i] = mPi->logAt(i) + logBiOt[i][1]; psi[1][i] = 0; // What's this for? mA->InitViterbiDurations(i); // NO-OP } // 2. Recursion int nread = 2; SNP **reads_snp_list = new SNP*[1000]; int *index = new int[2000]; for (t = 2; t <= T; t++) { // successive reads zeroProbCount = 0; int common_snp_count = 0; double obslik[1000][2][3], genlik[1000][2][3]; READ *pd = (((CFlexibleObs<READ*>*)(obs[t-1]))->Get(1)); READ *nd = (((CFlexibleObs<READ*>*)(obs[t]))->Get(1)); int rd_start = (*pd).GetPos() < (*nd).GetPos() ? (*nd).GetPos() : (*pd).GetPos(); int rd_end = (*pd).GetPos()+(*pd).GetLen() > (*nd).GetPos()+(*nd).GetLen() ? 
(*nd).GetPos()+(*nd).GetLen()-1 : (*pd).GetPos()+(*pd).GetLen()-1; GetCommonSnpList(obs, reads_snp_list, &common_snp_count, index, t); //Ignore this for now: REVISIT: There is a bug here. In case two consecutive reads with 0 overlapping snps are encountered, it may not //continue to work as expected. We might have to reset haplotype assumptions, or compare the last read with overlapping //snps to the next such one (again to which there might be very little chance) if(common_snp_count==0) { cout << "Read " << t-1 << " and " << t << " have no overlapping snps. Skipping to the next pair.." << endl << endl; //exit(1); (nd)->assignHaplotype(1,0.5); continue; } for (j = 1; j <= mN; j++) { // prevDelta[i] = v(k)(i) maxval = prevDelta[1] + mA->logAt(1, j); argmaxval = 1; prob = maxval; for (i = 1; i <= mN; i++) {// previous state val = prevDelta[i] + mA->logAt(i, j); if (val > maxval) { maxval = val; argmaxval = i; prob = maxval; } } // maxval = max(k) (v(k)(i)a(k)(l)) // need to add emission here and find max state for next read // I end up not assigning the actual emission matrix, but only the log matrix for the computations logBiOt[j][nread] = 0.0; int hap = prevDelta[j] > prevDelta[(j%mN)+1] ? 1 : (prevDelta[j] < prevDelta[(j%mN)+1] ? 2 : j) ; int abs_hap = prevDelta[j] >= prevDelta[(j%mN)+1] ? 
j : (j%mN)+1; #ifdef DEBUG cout << "Read " << t-1 << " and " << t << ", Hap " << hap << ", Abs Hap = " << abs_hap << ", " << rd_start << ", " << rd_end << " with " << common_snp_count << " overlapping snps" << endl << endl; cout << "K SnpPos R A L L\tAA_gen\tAA_obs\tAA_genob\tAB_gen\tAB_obs\tAB_genob\tBB_gen\tBB_obs\tBB_genob\tprob\tlogprob\n"; #endif for(int count=0; count<common_snp_count; count++) { SNP *sp = reads_snp_list[count]; // if(sp->GetKnown()==1) { #ifdef DEBUG cout << sp->GetKnown() << " " << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt() << "\t" << (*pd).GetAllele(index[2*count]) << " " << (*nd).GetAllele(index[2*count+1]); #endif double emission = compute_new_emission(reads_snp_list, count, obs, t, index, hap, obslik[count][j-1], genlik[count][j-1]); if(emission <= 0.0) { logVal = -100.0; zeroProbCount++; } else{ logVal = log(emission); } #ifdef DEBUG //cout << "\t" << emission << "\t" << logVal << endl; #endif // Experimental: Do I divide by common_snp_count or not? logBiOt[j][nread] += logVal/common_snp_count; //logBiOt[j][nread] += logVal; // } } // logBiOt[j][nread] now contains the mN different emissions to be added to maxval to determine the max state at this stage #ifdef DEBUG cout << "Total emission = " << logBiOt[j][nread] << "," << maxval << endl << endl; #endif delta[j] = maxval + logBiOt[j][nread]; psi[nread][j] = argmaxval; // What's this for? mA->UpdateViterbiDurations(argmaxval, j);// ***dfd 4-16-99 } // By now all v(l)(i+1)s are computed from which the max has to be found if(zeroProbCount > common_snp_count) { // need to check per snp cerr << "*** Unseen data, renormalizing logBiOt to equiprobs ***"<<endl; for (i = 1; i <= mN; i++){ logBiOt[i][nread] = log(1.0/mN); } } // Updating posteriors here int ind_snp = 0; double hap_prob = 0.0; ind_snp = delta[1] > delta[2] ? 
1 : 2; // Ignore this for now: REVISIT: hap_prob assignment seems incorrect //hap_prob = pow(2.7182, -(logBiOt[ind_snp][nread] - logBiOt[ind_snp%mN + 1][nread]) ); double prob1 = pow(2.7182, logBiOt[ind_snp][nread]); double prob2 = pow(2.7182, logBiOt[ind_snp%mN+1][nread]); hap_prob = prob1/(prob1+prob2); #ifdef DEBUG cout << "leading emission " << prob1 << ", trailing emission " << prob2 << endl; cout << "Assigning read " << (*nd).GetPos() << " haplotype " << ind_snp << " and happrob " << hap_prob << endl << endl << endl; #endif (nd)->assignHaplotype(ind_snp, hap_prob); nread++; // prevDelta updated to find the max from in the next read tmp = delta; delta = prevDelta; prevDelta = tmp; } nread--; // 3. Termination logProb = prevDelta[1]; //q[T] = 1; q[nread] = 1; for (i = 1; i <= mN; i++) { if (prevDelta[i] > logProb) { logProb = prevDelta[i]; q[nread] = i; //q[T] = i; } } // 4. Path (state sequence) backtracking //for (t = T - 1; t >= 1; t--){ for (t = nread - 1; t >= 1; t--){ q[t] = psi[t+1][q[t+1]]; } delete [] psi[1]; delete [] psi; delete [] prevDelta; delete [] delta; delete [] logBiOt[1]; delete [] logBiOt; #if 0 delete [] logA[1]; delete [] logA; delete [] logPi; #endif return logProb; }
double CHMM::ForwardAlgo(double **alpha, double *scale, CObs **obs, long T, boolean doLog) // Scaling is used to prevent roundoff errors // Same scaling is used for backward and forward procedures // so that the scales cancel out in the Baum Welch formula // Quantity returned is actually - log(P(O | model), // i.e. exponential of this quantity is 1 / P(O | model) { int i, j; /* state indices */ int t; /* time index */ double sum; /* partial sum */ double logProb; double bi1, aij, bjt1; // 1. Initialization // alpha = f(i) // bjt1 = emission for (i = 1; i <= mN; i++) { // Experimental: Need to confirm the emission for the first read bi1 = mB->at(i, 1); alpha[1][i] = mPi->at(i) * bi1; } scale[1] = Normalize(alpha[1], mN); // 2. Induction int nread = 2; SNP **reads_snp_list = new SNP*[1000]; int *index = new int[2000]; for (t = 1; t <= T - 1; t++) { int common_snp_count = 0; bool overlap; overlap = TRUE; double obslik[1000][2][3], genlik[1000][2][3]; READ *pd = (((CFlexibleObs<READ*>*)(obs[t]))->Get(1)); READ *nd = (((CFlexibleObs<READ*>*)(obs[t+1]))->Get(1)); int rd_start = (*pd).GetPos() < (*nd).GetPos() ? (*nd).GetPos() : (*pd).GetPos(); int rd_end = (*pd).GetPos()+(*pd).GetLen() > (*nd).GetPos()+(*nd).GetLen() ? (*nd).GetPos()+(*nd).GetLen()-1 : (*pd).GetPos()+(*pd).GetLen()-1; GetCommonSnpList(obs, reads_snp_list, &common_snp_count, index, t+1); if(common_snp_count==0) { cout << "Read " << t << " and " << t+1 << " have no overlapping snps. Skipping to the next pair.." 
<< endl << endl; //exit(1); // (nd)->assignHaplotype(1,0.5); overlap = FALSE; // continue; } for (j = 1; j <= mN; j++) { sum = 0.0; for (i = 1; i <= mN; i++){ aij = mA->at(i, j); // sum += alpha[nread-1][i] * aij; // This is the scaled alpha sum += alpha[t][i] * aij; } #ifdef DEBUG cout << "Read " << t << " and " << t+1 << ", " << rd_start << ", " << rd_end << " with " << common_snp_count << " overlapping snps" << endl << endl; cout << "K SnpPos R A L L\tAA_gen\tAA_obs\tAA_genob\tAB_gen\tAB_obs\tAB_genob\tBB_gen\tBB_obs\tBB_genob\tprob\tlogprob\n"; #endif double logBiOt = 0; //cout << "Read check: " << rd_start << ", " << rd_end << " with " << common_snp_count << " common snps" << endl; for(int count=0; count<common_snp_count; count++) { SNP *sp = reads_snp_list[count]; #ifdef DEBUG cout << sp->GetKnown() << " " << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt() << " " << (*pd).GetAllele(index[2*count]) << " " << (*nd).GetAllele(index[2*count+1]); cout << endl; #endif double emission = compute_new_emission(reads_snp_list, count, obs, t+1, index, j, obslik[count][j-1], genlik[count][j-1]); logBiOt += log(emission)/common_snp_count; } if(common_snp_count==0) { logBiOt = log(0.5); } bjt1 = pow(2.7182, logBiOt); mB->addEmission(j,t+1,bjt1); bjt1 = mB->at(j, t+1); // Computing emission here // alpha[nread][j] = sum * bjt1; // Unscaled yet alpha[t+1][j] = sum * bjt1; } // scale is the normalization factor s(i) // alpha is the normalized ~f(i) // scale[nread] = Normalize(alpha[nread], mN); scale[t+1] = Normalize(alpha[t+1], mN); nread++; } nread--; //logProb = 0.0; // Experimental: logProb contains P(x) logProb = 1.0; // 3. Termination if(doLog){ for (t = 1; t <= T; t++){ // logProb += log(scale[nread-1]); logProb += log(scale[t]); } }// endif delete [] index; delete [] reads_snp_list; return logProb;// zero returned if doLog is false }