Пример #1
0
// Integration test: builds a DataFilesReader from the mocked configuration,
// reads the SNP information and checks both the total number of SNPs and the
// number of SNPs flagged for inclusion.
TEST_F(FileIOIntegrationTest, ReadSNPInfo) {
  EXPECT_CALL(configMock, excludeSNPsWithNegativePosition())
      .Times(AtLeast(1))
      .WillRepeatedly(Return(true));
  EXPECT_CALL(configMock, getPhenotypeCoding())
      .Times(1)
      .WillRepeatedly(Return(ZERO_ONE_CODING));

  DataFilesReaderFactory readerFactory;
  DataFilesReader* reader = readerFactory.constructDataFilesReader(configMock);

  std::vector<SNP*>* snps = reader->readSNPInformation();
  int totalSnps = snps->size();

  // Count the SNPs that report they should be included.
  int includedSnps = 0;
  for(std::vector<SNP*>::iterator it = snps->begin(); it != snps->end(); ++it){
    if((*it)->shouldInclude()){
      includedSnps += 1;
    }
  }

  EXPECT_EQ(10, totalSnps);
  EXPECT_EQ(8, includedSnps);

  // The reader is released first; the test then frees every SNP and the
  // vector that held them.
  delete reader;

  for(std::vector<SNP*>::iterator it = snps->begin(); it != snps->end(); ++it){
    delete *it;
  }
  delete snps;
}
Пример #2
0
// Adds up GetQualScore() over the SNPs of `read` whose GetKnown() is 0 and
// returns the accumulated (floating point) total truncated to int.
int GetMismatchSum(READ *read)
{
	const int snp_total = read->GetSnpCount();
	SNP **snps = read->GetSnpList();
	double qual_total = 0.0;

	for (int k = 0; k < snp_total; ++k) {
		SNP *current = snps[k];
		// Only SNPs reported as not known contribute to the sum.
		if (current->GetKnown() != 0)
			continue;
		qual_total += current->GetQualScore();
	}
	return static_cast<int>(qual_total);
}
Пример #3
0
// Renders the frequency table as text.
//
// For every entry of idMap whose branch length is strictly positive, one line
// is produced: the branch length divided by tTot, followed by the integers of
// the SNP key, each preceded by a tab.
//
// Returns the accumulated lines as a single string.
//
// NOTE(review): tTot is printed to std::cout and is NOT part of the returned
// string. That side effect is preserved here because callers may rely on it;
// confirm whether it was meant to go into the string instead.
string FreqTable::toString() {
    stringstream ss;
    std::cout <<"tTot:"<<"\t"<<this->tTot<<endl;
    boost::unordered_map<SNP, int>::iterator iter;
    for (iter = this->idMap.begin(); iter != this->idMap.end(); ++iter) {
        // Bind the key by reference: the map is not modified while iterating,
        // so copying the whole SNP for every entry is unnecessary work.
        const SNP& v = iter->first;
        const double length = this->branchLengths[iter->second];
        if (length > 0) {
            ss << length / this->tTot;
            for (vector<int>::const_iterator it2 = v.begin(); it2 != v.end(); ++it2) {
                ss << "\t" << (*it2);
            }
            // A plain newline yields the same text as endl; flushing an
            // in-memory stream has no purpose.
            ss << '\n';
        }
    }
    return ss.str();
}
Пример #4
0
// Equality is decided solely by comparing this SNP's id with the other's id.
bool SNP::operator==(const SNP& otherSNP) const {
  const bool sameId = (this->id == otherSNP.getId());
  return sameId;
}
Пример #5
0
// Ordering is decided solely by comparing this SNP's id with the other's id.
bool SNP::operator<(const SNP& otherSNP) const {
  const bool idIsSmaller = (this->id < otherSNP.getId());
  return idIsSmaller;
}
Пример #6
0
// Assigns a haplotype to reads of the global reads_list by comparing each read
// with its predecessor over the known SNPs the two reads share.
//
// For every adjacent pair (reads_list[t-1], reads_list[t]) with
// t = 1+size-T .. size-1, the scores returned by compute_new_emission are
// summed for relative haplotypes 1 and 2 and normalised into probabilities.
// The more probable relative haplotype is converted to an absolute one using
// the previous read's haplotype and stored on the later read through
// assignHaplotype, together with its probability and a discordance flag.
//
// snp_start, snp_end  not used by this implementation.
// T                   window size controlling the first pair processed.
//
// NOTE(review): the scratch buffers hold 100 SNP pointers / 200 indices and
// emission_set holds 100 scores per haplotype; nothing here checks that
// GetKnownSnpList stays within those limits.
void NaiveBayes(int snp_start, int snp_end, int T)
{
	int j, t;
	SNP **known_snp_list = new SNP*[100];
	int *known_index = new int[200];
	int reads_list_size = reads_list.size();

	// reads_list[t-1] is accessed below, so t must never drop below 1. Without
	// this clamp a T larger than the list size indexed before the first read.
	int first_t = 1+reads_list_size-T;
	if (first_t < 1)
		first_t = 1;

	for (t = first_t; t <= reads_list_size-1; t++) {
		int known_snp_count = 0, flag = 0, direction = 1, hap = 0;
		double norm = 0.0;
		// Arrays are indexed by haplotype 1..2; slot 0 is unused.
		double emission_list[3], haprob[3];
		// Zero-initialised: entries are skipped when an emission is exactly 0.0
		// and nothing is written when there are no known SNPs, yet the
		// discordance check below reads them. Previously those reads hit
		// uninitialised memory.
		double emission_set[3][100] = {};
		READ *pd = reads_list[t-1];
		READ *nd = reads_list[t];
		int plen = pd->GetLen();
		int nlen = nd->GetLen();
		int ppos = pd->GetPos();
		int npos = nd->GetPos();
		// Overlap window of the two reads (only reported in DEBUG output).
		int rd_start = ppos < npos ? npos : ppos;
		int rd_end = ppos+plen > npos+nlen ? npos+nlen-1 : ppos+plen-1;

		//int mismatchsum = GetMismatchSum(nd);
		//if(mismatchsum>=50)
		//	continue;

		GetKnownSnpList(known_snp_list, &known_snp_count, known_index, t);
		// This adds the count of known overlapping snps between adjacent reads
		(*nd).AddKnownCount(known_snp_count);
#ifdef DEBUG
if(known_snp_count>0)
cout << "Read " << nd->GetPos() << " ; "<< t << " and " << t+1 << ", " << rd_start << ", " << rd_end << " with " << known_snp_count << " known snps." << endl << endl;
cout << "SnpPos  R A LL..\tAA_gen AA_obs AA_genob\tAB_gen AB_obs AB_genob\tBB_gen BB_obs BB_genob\tprob\n";
cout << "Throwing emissions from read: " << nd->GetPos() << endl;
#endif

		for(j=1;j<=2;j++) {
#ifdef DEBUG
cout << "Haplotype " << j << endl;
#endif
			emission_list[j] = 0.0;
			for(int count=0; count<known_snp_count; count++) {
				double emission;
				SNP *sp = known_snp_list[count];
#ifdef DEBUG
cout << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt();
#endif
				// Obtains naive bayes score for the count-th overlapping known snp
				// between the adjacent reads for haplotype, 'j'.
				// Working with relative haplotype here.
				emission = compute_new_emission(known_snp_list, count, t, known_index, j);
				// A zero score contributes nothing; its emission_set slot keeps
				// the initial 0.0.
				if(emission==0.0)
					continue;
				emission_list[j] += emission;
				emission_set[j][count] = emission;
			}
#ifdef DEBUG
cout << "Total Emission = " << emission_list[j] << endl;
#endif
		}

		// The first SNP decides which haplotype is "leading"; any later SNP that
		// prefers the other haplotype marks the pair as discordant.
		if(emission_set[1][0]<emission_set[2][0])
			direction = 2;
		for(int count=1; count<known_snp_count; count++) {
			if(emission_set[1][count]>emission_set[2][count]&&direction==2)
				flag = 1;
			if(emission_set[1][count]<emission_set[2][count]&&direction==1)
				flag = 1;
		}
#ifdef DEBUG
cout << "Discordance = " << flag << endl;
#endif
		norm = emission_list[1] + emission_list[2];
		if(norm!=0.0) {
			haprob[1] = emission_list[1] / norm;
			haprob[2] = emission_list[2] / norm;
		} else {
			// No evidence either way.
			haprob[1] = 0.5;
			haprob[2] = 0.5;
		}
#ifdef DEBUG
cout << "Happrob[1] = " << haprob[1] << endl;
cout << "Happrob[2] = " << haprob[2] << endl;
#endif
		// If haplotype probabilities are equal, the tie is broken by the parity
		// of t (deterministic, alternating between 1 and 2).
		hap = haprob[1] > haprob[2] ? 1 : haprob[1] == haprob[2] ? t%2+1 : 2;
		double happrob = haprob[hap];
		// convert relative haplotype to absolute haplotype
		hap = (*pd).GetHap() == hap ? 1 : 2;
		nd->assignHaplotype(hap, happrob, flag);
	}

	delete [] known_index;
	delete [] known_snp_list;
}
Пример #7
0
Файл: hmm.C Проект: bnv2103/NGS
// Log-space Viterbi decoding over a sequence of reads.
//
// Writes the most probable state sequence into q and returns the log
// probability of that sequence, given that A, B and pi are already set on this
// model. Logarithms are used to avoid underflows. As a side effect every read
// from the second one onwards receives a haplotype through assignHaplotype.
//
// obs        1-based array of observations; each entry is cast to
//            CFlexibleObs<READ*> and its first element is the read.
// T          number of observations.
// q          output state sequence, written at indices 1..nread.
// probarray  not used by this implementation.
//
// NOTE(review): a read that shares no SNP with its predecessor is skipped
// without advancing nread, so fewer than T entries of q may be written. The
// posterior update compares delta[1] with delta[2], i.e. it presumes two
// states - confirm mN == 2 for all callers.
double CHMM::ViterbiLog(CObs **obs, long T, int *q, double *probarray)
{

	int	i, j;	// state indices
	int	t;	// time index

	int	argmaxval;
	double	maxval, val, bVal, logVal;
	double	logProb;
	double	*prevDelta, *delta, *tmp;
	int	**psi;
	double	**logBiOt;
	int	zeroProbCount;

	delta = SetVector(mN);
	prevDelta = SetVector(mN);
	psi = SetIntMatrix(T, mN);

// We do not preprocess the logs for B
// because the data have variable length T.
// Only the first observation (t = 1) is filled in here; later columns are
// computed inside the recursion.
	logBiOt = SetMatrix(mN, T);
	for (t = 1; t <= 1; t++){
		zeroProbCount = 0;
		for (i = 1; i <= mN; i++){
			bVal = mB->at(i, t); // obs[t] is an entire seq of snps in a read
			if(bVal<=0.0){
				// Floor used in place of log(0).
				logVal = -10.0;
				zeroProbCount++;
			}
			else{
				logVal = log(bVal);
			}
			logBiOt[i][t] = logVal;
		}// for i
		if(zeroProbCount == mN){// unseen obs, Viterbi decides which seen obs to use
			cerr << "*** Unseen data, renormalizing logBiOt to equiprobs ***"<<endl;
			for (i = 1; i <= mN; i++){
				logBiOt[i][t] = log(1.0/mN);
			}
		}
	}// for t

// 1. Initialization
// Initialization is performed for prevDelta and psi only.
// PrevDelta stores probability to stage k-1 for all states

	for (i = 1; i <= mN; i++){
		prevDelta[i] = mPi->logAt(i) + logBiOt[i][1];
		psi[1][i] = 0; // first column has no predecessor to point back to
		mA->InitViterbiDurations(i); // NO-OP
	}

// 2. Recursion
	// nread counts the reads actually processed; it trails t whenever a read
	// without overlapping SNPs is skipped.
	int nread = 2;
	SNP **reads_snp_list = new SNP*[1000];
	int *index = new int[2000];
	for (t = 2; t <= T; t++) { // successive reads
		zeroProbCount = 0;
		int common_snp_count = 0;
		double obslik[1000][2][3], genlik[1000][2][3];

		READ *pd = (((CFlexibleObs<READ*>*)(obs[t-1]))->Get(1));
		READ *nd = (((CFlexibleObs<READ*>*)(obs[t]))->Get(1));
		// Overlap window of the two reads (only reported in DEBUG output).
		int rd_start = (*pd).GetPos() < (*nd).GetPos() ? (*nd).GetPos() : (*pd).GetPos();
		int rd_end = (*pd).GetPos()+(*pd).GetLen() > (*nd).GetPos()+(*nd).GetLen() ? (*nd).GetPos()+(*nd).GetLen()-1 : (*pd).GetPos()+(*pd).GetLen()-1;

		GetCommonSnpList(obs, reads_snp_list, &common_snp_count, index, t);

		//Ignore this for now: REVISIT: There is a bug here. In case two consecutive reads with 0 overlapping snps are encountered, it may not
		//continue to work as expected. We might have to reset haplotype assumptions, or compare the last read with overlapping
		//snps to the next such one (again to which there might be very little chance)
		if(common_snp_count==0) {
			cout << "Read " << t-1 << " and " << t << " have no overlapping snps. Skipping to the next pair.." << endl << endl;
			//exit(1);
			(nd)->assignHaplotype(1,0.5);
			continue;
		}
		for (j = 1; j <= mN; j++) {
			// prevDelta[i] = v(k)(i)
			maxval = prevDelta[1] + mA->logAt(1, j);
			argmaxval = 1;
			for (i = 1; i <= mN; i++) {// previous state
				val = prevDelta[i] + mA->logAt(i, j);
				if (val > maxval) {
					maxval = val;
					argmaxval = i;
				}
			}
			// maxval = max(k) (v(k)(i)a(k)(l))
			// need to add emission here and find max state for next read

			// I end up not assigning the actual emission matrix, but only the log matrix for the computations

			logBiOt[j][nread] = 0.0;
			int hap = prevDelta[j] > prevDelta[(j%mN)+1] ? 1 : (prevDelta[j] < prevDelta[(j%mN)+1] ? 2 : j) ;
			int abs_hap = prevDelta[j] >= prevDelta[(j%mN)+1] ? j : (j%mN)+1;

#ifdef DEBUG
cout << "Read " << t-1 << " and " << t << ", Hap " << hap << ", Abs Hap = " << abs_hap << ", " << rd_start << ", " << rd_end << " with " << common_snp_count << " overlapping snps" << endl << endl;
cout << "K SnpPos  R A L L\tAA_gen\tAA_obs\tAA_genob\tAB_gen\tAB_obs\tAB_genob\tBB_gen\tBB_obs\tBB_genob\tprob\tlogprob\n";
#endif
			for(int count=0; count<common_snp_count; count++) {
				SNP *sp = reads_snp_list[count];
//				if(sp->GetKnown()==1) {
#ifdef DEBUG
cout << sp->GetKnown() << " " << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt() << "\t" << (*pd).GetAllele(index[2*count]) << " " << (*nd).GetAllele(index[2*count+1]);
#endif
				double emission = compute_new_emission(reads_snp_list, count, obs, t, index, hap, obslik[count][j-1], genlik[count][j-1]);
				if(emission <= 0.0) {
					// Floor used in place of log(0).
					logVal = -100.0;
					zeroProbCount++;
				} else{
					logVal = log(emission);
				}
#ifdef DEBUG
//cout << "\t" << emission << "\t" << logVal << endl;
#endif
				// Experimental: Do I divide by common_snp_count or not?
				logBiOt[j][nread] += logVal/common_snp_count;
				//logBiOt[j][nread] += logVal;
//				}
			}
			// logBiOt[j][nread] now contains the mN different emissions to be added to maxval to determine the max state at this stage
#ifdef DEBUG
cout << "Total emission = " << logBiOt[j][nread] << "," << maxval << endl << endl;
#endif

			delta[j] = maxval + logBiOt[j][nread];
			psi[nread][j] = argmaxval; // back-pointer consumed by the backtracking step
			mA->UpdateViterbiDurations(argmaxval, j);// ***dfd 4-16-99
		}
		// By now all v(l)(i+1)s are computed from which the max has to be found
		if(zeroProbCount > common_snp_count) { // need to check per snp
			cerr << "*** Unseen data, renormalizing logBiOt to equiprobs ***"<<endl;
			for (i = 1; i <= mN; i++){
				logBiOt[i][nread] = log(1.0/mN);
			}
		}

		// Updating posteriors here
		int ind_snp = 0;
		double hap_prob = 0.0;
		ind_snp = delta[1] > delta[2] ? 1 : 2;
		// Ignore this for now: REVISIT: hap_prob assignment seems incorrect
		//hap_prob = pow(2.7182, -(logBiOt[ind_snp][nread] - logBiOt[ind_snp%mN + 1][nread]) );
		// exp() is the exact inverse of the log() used above; the former
		// pow(2.7182, x) relied on a truncated value of e.
		double prob1 = exp(logBiOt[ind_snp][nread]);
		double prob2 = exp(logBiOt[ind_snp%mN+1][nread]);
		hap_prob = prob1/(prob1+prob2);
#ifdef DEBUG
cout << "leading emission " << prob1 << ", trailing emission " << prob2 << endl;
cout << "Assigning read " << (*nd).GetPos() << " haplotype " << ind_snp << " and happrob " << hap_prob << endl << endl << endl;
#endif
		(nd)->assignHaplotype(ind_snp, hap_prob);
		nread++;
		// prevDelta updated to find the max from in the next read
		tmp = delta; delta = prevDelta; prevDelta = tmp;
	}
	nread--;

// 3. Termination
	logProb = prevDelta[1];
	//q[T] = 1;
	q[nread] = 1;
	for (i = 1; i <= mN; i++) {
		if (prevDelta[i] > logProb) {
			logProb = prevDelta[i];
			q[nread] = i;
			//q[T] = i;
		}
	}

// 4. Path (state sequence) backtracking
	//for (t = T - 1; t >= 1; t--){
	for (t = nread - 1; t >= 1; t--){
		q[t] = psi[t+1][q[t+1]];
	}

	// Release the scratch buffers used by the recursion; they were previously
	// leaked on every call.
	delete [] index;
	delete [] reads_snp_list;

	delete [] psi[1];
	delete [] psi;
	delete [] prevDelta;
	delete [] delta;
	delete [] logBiOt[1];
	delete [] logBiOt;
#if 0
	delete [] logA[1];
	delete [] logA;
	delete [] logPi;
#endif
	return logProb;
}
Пример #8
0
Файл: hmm.C Проект: bnv2103/NGS
// Scaled forward pass over a sequence of reads.
//
// Scaling is used to prevent roundoff errors. The same scaling is used for the
// backward and forward procedures so that the scales cancel out in the Baum
// Welch formula.
//
// alpha  output, alpha[t][i] for t = 1..T and i = 1..mN, normalised per t.
// scale  output, scale[t] = value returned by Normalize for alpha[t].
// obs    1-based array of observations; each entry is cast to
//        CFlexibleObs<READ*> and its first element is the read.
// T      number of observations.
// doLog  when true, log(scale[t]) for t = 1..T is added to the result.
//
// Returns 1.0 plus the sum of log(scale[t]) when doLog is set, and 1.0
// otherwise. Emissions for t >= 2 are computed here and stored in the model
// through mB->addEmission.
//
// NOTE(review): the accumulator starts at 1.0 (marked "Experimental" by the
// author) rather than 0.0, so the result is offset by one from a pure sum of
// logs; older comments described the quantity as -log(P(O | model)). Confirm
// the intended value with the callers before changing it.
double CHMM::ForwardAlgo(double **alpha, double *scale, CObs **obs, long T, boolean doLog)
{
	int	i, j; 	/* state indices */
	int	t;	/* time index */

	double sum;	/* partial sum */
	double logProb;
	double bi1, aij, bjt1;

// 1. Initialization
// alpha = f(i)
// bjt1 = emission
	for (i = 1; i <= mN; i++) {
// Experimental: Need to confirm the emission for the first read
		bi1 = mB->at(i, 1);
		alpha[1][i] = mPi->at(i) *  bi1;
	}
	scale[1] = Normalize(alpha[1], mN);

// 2. Induction
	SNP **reads_snp_list = new SNP*[1000];
	int *index = new int[2000];

	for (t = 1; t <= T - 1; t++) {
		int common_snp_count = 0;
		double obslik[1000][2][3], genlik[1000][2][3];
		READ *pd = (((CFlexibleObs<READ*>*)(obs[t]))->Get(1));
		READ *nd = (((CFlexibleObs<READ*>*)(obs[t+1]))->Get(1));
		// Overlap window of the two reads (only reported in DEBUG output).
		int rd_start = (*pd).GetPos() < (*nd).GetPos() ? (*nd).GetPos() : (*pd).GetPos();
		int rd_end = (*pd).GetPos()+(*pd).GetLen() > (*nd).GetPos()+(*nd).GetLen() ? (*nd).GetPos()+(*nd).GetLen()-1 : (*pd).GetPos()+(*pd).GetLen()-1;

		GetCommonSnpList(obs, reads_snp_list, &common_snp_count, index, t+1);
		if(common_snp_count==0) {
			// The pair is still processed below, with a fixed emission of 0.5.
			cout << "Read " << t << " and " << t+1 << " have no overlapping snps. Skipping to the next pair.." << endl << endl;
			//exit(1);
			// (nd)->assignHaplotype(1,0.5);
			// continue;
		}

		for (j = 1; j <= mN; j++) {
			sum = 0.0;
			for (i = 1; i <= mN; i++){
				aij = mA->at(i, j);
				// This is the scaled alpha
				sum += alpha[t][i] * aij;
			}

#ifdef DEBUG
cout << "Read " << t << " and " << t+1 << ", " << rd_start << ", " << rd_end << " with " << common_snp_count << " overlapping snps" << endl << endl;
cout << "K SnpPos  R A L L\tAA_gen\tAA_obs\tAA_genob\tAB_gen\tAB_obs\tAB_genob\tBB_gen\tBB_obs\tBB_genob\tprob\tlogprob\n";
#endif
			// Mean of the per-SNP log emissions for state j.
			double logBiOt = 0;
//cout << "Read check: " << rd_start << ", " << rd_end << " with " << common_snp_count << " common snps" << endl;
			for(int count=0; count<common_snp_count; count++) {
				SNP *sp = reads_snp_list[count];
#ifdef DEBUG
cout << sp->GetKnown() << " " << sp->GetPos() << " " << sp->GetRef() << " " << sp->GetAlt() << " " << (*pd).GetAllele(index[2*count]) << " " << (*nd).GetAllele(index[2*count+1]);
cout << endl;
#endif
				double emission = compute_new_emission(reads_snp_list, count, obs, t+1, index, j, obslik[count][j-1], genlik[count][j-1]);
				logBiOt += log(emission)/common_snp_count;
			}
			if(common_snp_count==0) {
				logBiOt = log(0.5);
			}
			// exp() is the exact inverse of the log() accumulation above; the
			// former pow(2.7182, x) relied on a truncated value of e.
			bjt1 = exp(logBiOt);
			mB->addEmission(j,t+1,bjt1);
			// Read the emission back from the model after storing it.
			bjt1 =  mB->at(j, t+1);
			// Unscaled yet
			alpha[t+1][j] = sum * bjt1;
		}
		// scale is the normalization factor s(i)
		// alpha is the normalized ~f(i)
		scale[t+1] = Normalize(alpha[t+1], mN);
	}
	//logProb = 0.0;
	// Experimental: logProb contains P(x)
	logProb = 1.0;

// 3. Termination
	if(doLog){
		for (t = 1; t <= T; t++){
			logProb += log(scale[t]);
		}
	}// endif

	delete [] index;
	delete [] reads_snp_list;
	return logProb;// 1.0 is returned if doLog is false
}