void Consolidator::findTruePctErrors( ErrorCalculator &e_obj,int ma_snp_ends, bool holdOut,int window,float ma_threshold, float empirical_ma_threshold ) { for(int i=0;i<person_count;i++) { for(int j=i;j<person_count;j++) { for(int l=0;l<m_trueMatches[i][j].size();l++) { if(m_trueMatches[i][j][l].end==-1) { continue; } //handle moving averages calculation // int t1 = m_trueMatches[ i ][ j ][ l ].start + ( m_trueMatches[ i ][ j ][ l ].end - m_trueMatches[ i ][ j ][ l ].start ) * 0.25; int t2 = m_trueMatches[ i ][ j ][ l ].end - ( m_trueMatches[ i ][ j ][ l ].end - m_trueMatches[ i ][ j ][ l ].start ) * 0.25; //now we have the positions of the first and last 25% of the truly ibd SH //all that's left to do is to pass them into the moving averages function, and obtain the max ma //then store that in a vector, sort them, and find the xth percentile of that vector. That will be //the ma that we use later //for that "finalErrors" parameters, need to get the number of errors along the truly IBD SH first... vector<vector<int> > trueErrors=e_obj.checkErrors( i, j, t1, t2); vector<int>finalTrueErrors=e_obj.getFinalErrors( trueErrors ); //handles MA calculations std::vector<float> av; float current_max; if(empirical_ma_threshold < 0.0){ av = e_obj.getTrueMovingAverages(finalTrueErrors,t1,t2,window); current_max = av[0]; for(int q = 1; q < av.size(); q++){ if(av[q] > current_max){ current_max = av[q]; } } e_obj.addMaxAverage(current_max); } // int temp1 = m_trueMatches[ i ][ j ][ l ].start + ( m_trueMatches[ i ][ j ][ l ].end - m_trueMatches[ i ][ j ][ l ].start ) * 0.15; //Should probably stop doing this int temp2 = m_trueMatches[ i ][ j ][ l ].end - ( m_trueMatches[ i ][ j ][ l ].end - m_trueMatches[ i ][ j ][ l ].start ) * 0.15; int start =0, end =0, fend = ( temp2 -temp1 ) ; //since we are using MOL at this point, this will pick out a random SH from the set of non-truly IBD SH //and use that length to define the region over which we find PIE. Unless you are changing something with MOL, //don't ever read this next block int randPers1, randPers2, pos; randPers1 = std::rand() % person_count; randPers2 = std::rand() % person_count; if( randPers1 > randPers2 ) { randPers1 = randPers1 + randPers2; randPers2 = randPers1 - randPers2; randPers1 = randPers1 - randPers2; } while( m_matches[ randPers1 ][ randPers2 ].size() <= 0 ) { randPers1 = std::rand() % person_count; randPers2 = std::rand() % person_count; if( randPers1 > randPers2 ) { randPers1 = randPers1 + randPers2; randPers2 = randPers1 - randPers2; randPers1 = randPers1 - randPers2; } } pos = std::rand() % m_matches[ randPers1 ][ randPers2 ].size(); int len = m_matches[ randPers1 ][ randPers2 ][ pos ].end - m_matches[ randPers1 ][ randPers2 ][ pos ].start; if( len >= fend || len <= 0) { continue; } temp1 = temp1; temp2 = temp1 + len; //end crazy MOL stuff vector<vector<int> > errors=e_obj.checkErrors( i, j, temp1, temp2); vector<int>finalErrors=e_obj.getFinalErrors( errors ); float per_err = e_obj.getThreshold(finalErrors,temp1,temp2);//overload m_errors.push_back( per_err ); if( holdOut ) { float oppHom = ( e_obj.getOppHomThreshold( i, j, temp1, temp2 ) ) / ( temp2 -temp1 ); m_holdOutErrors.push_back( oppHom ); } } } } vector<float>maxes; float cutoff = empirical_ma_threshold; if(empirical_ma_threshold < 0.0){ maxes = e_obj.getMaxAverages(); std::sort(maxes.begin(),maxes.end()); e_obj.setMaxAverage(maxes); cutoff = e_obj.getXthPercentile(ma_threshold); } e_obj.setCutoff(cutoff);//set the actual threshold to be used when calculating MA in all other SH // std::sort( m_errors.begin(), m_errors.end() ); std::sort( m_holdOutErrors.begin(), m_holdOutErrors.end() ); std::string str = " \n No of elements in error check are: " + NumberToString( m_errors.size() ); str = str + " \n No of elements in hold error check are: " + NumberToString( m_holdOutErrors.size() ); e_obj.log( str ); }
//overloaded version void Consolidator::performTrim(ErrorCalculator& e_obj,int window, int ma_snp_ends, float ma_threshold, int min_snp,float min_cm, float per_err_threshold, string option, float hThreshold, bool holdOut,float empirical_threshold, float empirical_pie_threshold,float cut_value,float cm_cut_value) { int removed1 =0, removed2 = 0, removed3 = 0, removed4 = 0; int not_removed = 0; int total_count = global_initial; bool wrongOption = false; if((cut_value >= 0.0) && (cm_cut_value >= 0.0) ){ cerr << "ERROR: You have specified both a cut value percentage and a cM cut value, but only one is allowed. Please try again." << endl; exit(1); } //8/14/14 //Adding header for output cout << "id1" << "\t" << "id2" << "\t" << "marker_id1" << "\t" << "marker_id2" << "\t" << "snp_length" << "\t" << "cm_length" << "\t" << "start/end" << "\t" << "pie" << "\t" << "ma_max" << "\t" << "random_pie" << "\t" << "random_ma_max" << "\t" << "errors" << "\t" << "moving_averages" << endl; for(int i=0;i<person_count;i++) { for(int j=i;j<person_count;j++) { for(int l=0;l<m_matches[i][j].size();l++) { total_count++; if(m_matches[i][j][l].end==-1) { continue; } if(i == j){ continue; //ignore same person matches } int pers1 = i, pers2 = j; //cerr << "For this snp, before applying the cut argument, the length is: " << ((m_matches[i][j][l].end - m_matches[i][j][l].start)) << " SNPs" << endl; if((m_matches[i][j][l].end - m_matches[i][j][l].start) < min_snp){ //reduced argument forces an initial drop due to SNPS continue; } //reduced argument check for mincm if(e_obj.isInitialCmDrop(m_matches[i][j][l].start,m_matches[i][j][l].end,min_cm)){ continue; } int temp1,temp2; if(cut_value >= 0.0){ //remove first and last 25% by default, this can also be a user-specified value float new_cut_value = (1 - cut_value) * 0.5; //cut it in half for each side temp1 = m_matches[ i ][ j ][ l ].start + (int)(( m_matches[ i ][ j ][ l ].end - m_matches[ i ][ j ][ l ].start ) * new_cut_value); temp2 = m_matches[ i ][ j ][ l ].end - (int)(( m_matches[ i ][ j ][ l ].end - m_matches[ i ][ j ][ l ].start ) * new_cut_value); } else { //check that cm arg does not exceed the SH's cM length float new_cm_length = e_obj.newCmLength(m_matches[i][j][l].start, m_matches[i][j][l].end,cm_cut_value); //change that cut_value parameter. new_cm_length = (new_cm_length / 2.0); if(new_cm_length <= 0.0){ cerr << "Error: The cM length that you specified was larger than one or more of the SH lengths." << endl; exit(1); } float new_cm_start = e_obj.adjustCmLength(m_matches[i][j][l].start, new_cm_length); float new_cm_end = e_obj.adjustCmLength(m_matches[i][j][l].end, ((-1.0) * new_cm_length) ); if(new_cm_start >= new_cm_end){ cerr << "ERROR: start value in cM greater than end value. Exiting..." << endl; exit(1); } temp1 = e_obj.snpFinder(new_cm_start,m_matches[i][j][l].start,m_matches[i][j][l].end,0); temp2 = e_obj.snpFinder(new_cm_end,m_matches[i][j][l].start,m_matches[i][j][l].end,1); } //we need to calculate for both this person and a random person. int randpers1; int randpers2; //for now, let's enable randomness by default randpers1 = std::rand() % e_obj.getNoOfPersons(); randpers2 = std::rand() % e_obj.getNoOfPersons(); if( randpers1 > randpers2 ) { randpers1 = randpers1 + randpers2; randpers2 = randpers1 - randpers2; randpers1 = randpers1 - randpers2; } //we now have pers1,2 and randpers1,2 vector<vector<int> > errors=e_obj.checkErrors(pers1, pers2, temp1, temp2); vector<int>finalErrors=e_obj.getFinalErrors(errors); vector<vector<int> > random_errors = e_obj.checkErrors(randpers1,randpers2,temp1,temp2); vector<int>random_final_errors = e_obj.getFinalErrors(random_errors); vector<float>movingAverages; vector<float>random_moving_averages; float threshold; movingAverages = e_obj.getMovingAverages(finalErrors,temp1,temp2,window); random_moving_averages = e_obj.getMovingAverages(random_final_errors,temp1,temp2,window); if(empirical_threshold < 0.0){ threshold = e_obj.getCutoff(); //this is the empirical average threshold for moving averages } else { threshold = empirical_threshold; } float per_err = e_obj.getThreshold(finalErrors,temp1,temp2); float random_per_err = e_obj.getThreshold(random_final_errors,temp1,temp2); //find maximum of moving averages int max_pos = 0; int rmax_pos = 0; float max_ma = 0.0; float rmax_ma = 0.0; for(int z = 1; z < movingAverages.size();z++){ if(movingAverages[z] > movingAverages[max_pos]){ max_pos = z; } } max_ma = movingAverages[max_pos]; for(int c = 1; c < random_moving_averages.size(); c++){ if(random_moving_averages[c] > random_moving_averages[rmax_pos]){ rmax_pos = c; } } rmax_ma = random_moving_averages[rmax_pos]; if( (option.compare("Error1") == 0 ) || (option.compare("ErrorRandom1") == 0) || (option.compare("Error") == 0) ){ not_removed++; e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,per_err,max_ma,random_per_err,rmax_ma); continue; }//end error1 }//l }//j }//i ma_drop_str = "No of matches removed due to length of trimming by moving averages: " + NumberToString( removed2 ); pie_drop_str = "No of matches removed due to percentage error: " + NumberToString( removed1 ); }//endperftrim()
void Consolidator::findTrueSimplePctErrors( ErrorCalculator &e_obj, float PIElength, bool holdOut,int window, float ma_threshold, float empirical_ma_threshold ) { for(int i=0;i<person_count;i++) { for(int j=i;j<person_count;j++) { for(int l=0;l<m_trueMatches[i][j].size();l++) { if(m_trueMatches[i][j][l].end==-1) { continue; } //------------------------------------------------------------------------------------------------- int t1 = m_trueMatches[ i ][ j ][ l ].start + ( m_trueMatches[ i ][ j ][ l ].end - m_trueMatches[ i ][ j ][ l ].start ) * 0.25; int t2 = m_trueMatches[ i ][ j ][ l ].end - ( m_trueMatches[ i ][ j ][ l ].end - m_trueMatches[ i ][ j ][ l ].start ) * 0.25; /*What do these two functions do? Is this necessary for being able to find errors, or is it only useful for MA calculations?*/ vector<vector<int> > trueErrors=e_obj.checkErrors( i, j, t1, t2); vector<int>finalTrueErrors=e_obj.getFinalErrors( trueErrors ); /*x*/ //This section handles finding the maximum moving averages amongst trulyIBD segments std::vector<float> av; float current_max; if(empirical_ma_threshold < 0.0){ av = e_obj.getTrueMovingAverages(finalTrueErrors,t1,t2,window); current_max = av[0]; for(int q = 1; q < av.size(); q++){ if(av[q] > current_max){ current_max = av[q]; } } e_obj.addMaxAverage(current_max); } //------------------------------------------------------------------------------ int temp1 = m_trueMatches[i][j][l].start; int temp2 = m_trueMatches[i][j][l].end; float startCM = e_obj.getCMDistance( temp1 ); float endCM = e_obj.getCMDistance( temp2 ); float mid1CM = startCM + ( endCM - startCM ) / 2 - PIElength / 2; float mid2CM = startCM + ( endCM - startCM ) / 2 + PIElength / 2; while( e_obj.getCMDistance( temp1 ) <= mid1CM || e_obj.getCMDistance( temp2 ) >=mid2CM ) { if( e_obj.getCMDistance( temp1 ) <= mid1CM ) { ++temp1; } if( e_obj.getCMDistance( temp2 ) >=mid2CM ) { --temp2; } } /*Here they are again. */ vector<vector<int> > errors=e_obj.checkErrors( i, j, temp1, temp2); vector<int>finalErrors=e_obj.getFinalErrors( errors ); // float per_err = e_obj.getThreshold(finalErrors,temp1, temp2, 0 ); float per_err = e_obj.getThreshold(finalErrors,temp1,temp2); //overload! m_errors.push_back( per_err ); /*x*/ }//end for(l) }//end for(j) }//end for(i) //this section actually handles the sorting of the max averages, and the setting of the user supplied percentile. vector<float>maxes; float cutoff = empirical_ma_threshold; //assume the user wanted to supply a value. This value will be overwritten shortly if they did not. if(empirical_ma_threshold < 0.0){ maxes = e_obj.getMaxAverages(); std::sort(maxes.begin(),maxes.end()); e_obj.setMaxAverage(maxes); cutoff = e_obj.getXthPercentile(ma_threshold); //<-make that an actual user input value } e_obj.setCutoff(cutoff);//set the actual threshold to be used when calculating MA in all other SH if(empirical_ma_threshold < 0.0){ ma_thresh_str = "User supplied ma-threshold is: " + NumberToString(ma_threshold); emp_ma_thresh_str = "Moving Averages will be tested usign the empirical threshold: " + NumberToString(cutoff); } else { emp_ma_thresh_str = "Moving Averages will be tested usign the empirical threshold: " + NumberToString(cutoff); } //---------------------------------------- std::sort( m_errors.begin(), m_errors.end() ); std::sort( m_holdOutErrors.begin(), m_holdOutErrors.end() ); ibg_str = "No of segments deemed to be IBD for finding empirical error threshold " + NumberToString( m_errors.size() ); }//end ftspe
//overloaded version void Consolidator::performTrim(ErrorCalculator& e_obj,int window, int ma_snp_ends, float ma_threshold, int min_snp,float min_cm, float per_err_threshold, std::string option, float hThreshold, bool holdOut,float empirical_threshold, float empirical_pie_threshold,int extendSnp)//<piyush> added the param int EXTENDSNP for calculating moving window avg) { int removed1 =0, removed2 = 0, removed3 = 0, removed4 = 0; int not_removed = 0; int total_count = global_initial; bool wrongOption = false; float per_err_threshold1; if(empirical_pie_threshold >= 0.0){ per_err_threshold1 = empirical_pie_threshold; } else { per_err_threshold1 = getPctErrThreshold( per_err_threshold ); } std::stringstream sstr; sstr << std::fixed << std::setprecision(10) << per_err_threshold1; std::string per_err_value = sstr.str(); emp_pie_thresh_str = "empirical pie threshold is : " + per_err_value + " \n"; float hThreshold1 = 0; if( holdOut ) { hThreshold1 = getHoldOutThreshold( hThreshold ); } per_err_threshold = per_err_threshold1; hThreshold = hThreshold1; for(int i=0;i<person_count;i++) { for(int j=i;j<person_count;j++) { for(int l=0;l<m_matches[i][j].size();l++) { total_count++; if(m_matches[i][j][l].end==-1) { continue; } int temp1=m_matches[i][j][l].start; //cout<<"temp1 start begin= "<<temp1<<endl; int temp2=m_matches[i][j][l].end; //cout<<"temp2 end begin= "<<temp2<<endl; if (extendSnp != 0) { /*<piyush1>*/ if(temp1-extendSnp <0) { temp1=0; //cout<<"New value of temp1= "<<temp1<<endl; } else { temp1=m_matches[i][j][l].start-extendSnp; //cout<<"New value of temp1= "<<temp1<<endl; } if(temp2+extendSnp > 4443)// change this constant { temp2=m_matches[i][j][l].end; //cout<<"New value of temp2= "<<temp2<<endl; } else { temp2=m_matches[i][j][l].end+extendSnp; //cout<<"New value of temp2= "<<temp2<<endl; } /*till here*/ /*cout<<"temp1 start after= "<<temp1<<endl; cout<<"temp2 end after= "<<temp2<<endl;*/ //cout<<"perform trim temp1= "<<temp1<<endl; //cout<<"perform trim temp2= "<<temp2<<endl; } int pers1 = i, pers2 = j; if( option.compare( "ErrorRandom1" ) == 0 || option.compare( "ErrorRandom2" ) == 0 || option.compare( "ErrorRandom3" ) == 0 ) { pers1 = std::rand() % e_obj.getNoOfPersons(); pers2 = std::rand() % e_obj.getNoOfPersons(); if( pers1 > pers2 ) { pers1 = pers1 + pers2; pers2 = pers1 - pers2; pers1 = pers1 - pers2; } } std::vector<std::vector<int> > errors=e_obj.checkErrors(pers1, pers2, temp1, temp2); std::vector<int>finalErrors=e_obj.getFinalErrors(errors);//<piyush for errors> //cout<<"finalErrors size= "<<finalErrors.size()<<endl; /*Inject implied error at start/end of SH here*/ std::vector<int>::iterator it; it = finalErrors.begin(); //go to the start of the vector if(finalErrors[0] != 1){ finalErrors.insert(it,1); //inject an error at position 1, if not already there } /*End inject implied error section*/ std::vector<int>trimPositions; std::vector<float>movingAverages; float threshold; if( (e_obj.isInitialCmDrop(temp1,temp2,min_cm)) || ((temp2-temp1) < min_snp) ){ //initial drop. Don't calculate MA trimPositions.push_back(temp1); trimPositions.push_back(temp2); trimPositions.push_back(1); }else{ movingAverages = e_obj.getMovingAverages(finalErrors,temp1,temp2,window,extendSnp);//<piyush> get moving averages are calculated from this part if(empirical_threshold < 0.0){ threshold = e_obj.getCutoff(); } else { threshold = empirical_threshold; } trimPositions = e_obj.getTrimPositions(movingAverages,temp1,temp2,threshold,min_cm); } //----------------- int beforeTrimStart = temp1; int beforeTrimEnd = temp2; m_matches[i][j][l].end = temp2 = temp1+trimPositions[1]; m_matches[i][j][l].start = temp1 = temp1+trimPositions[0]; int del0 = trimPositions[0]; int del1 = trimPositions[1]; float per_err = e_obj.getThreshold(finalErrors,del0,del1,ma_snp_ends); //add new weighted option /* For this new option, we only output SH that are not dropped. So, the output is finalOutput + weighted column. */ if( (option.compare("weightedOutput") == 0) || (option.compare("weightedOutputBP") == 0) ){ int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } if( ( (beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; continue; } if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ){ //removed2 a tpos.size of 3 indicates trimming due ot cM removed2++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold){ removed1++; continue; } if( holdOut && hThreshold < ( noOfOppHom ) / hlength ){ removed3++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } //removed3 not_removed++; m_weighted_sh.push_back(Weighted_SH(temp1,temp2,i,j)); //build the vector of SH that passed continue; }//end weghtedOutput /*Add new finalErrorsOutput*/ if( (option.compare("finalErrorsOutput") == 0) ){ int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } if( ( (beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ std::vector<float>movingAverages; temp1 = beforeTrimStart; temp2 = beforeTrimEnd; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; continue; } if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ){ //removed2 a tpos.size of 3 indicates trimming due ot cM removed2++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold){ removed1++; continue; } if( holdOut && hThreshold < ( noOfOppHom ) / hlength ){ removed3++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } //removed3 not_removed++; e_obj.finalErrorsOutput(i,j,temp1,temp2,min_cm,per_err);// <piyush>this is where the final output is written is called at continue; }//end finalErrorsOutput if( (option.compare("FullPlusDropped") == 0) ){ int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } if( ( (beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ std::vector<float>movingAverages; temp1 = beforeTrimStart; temp2 = beforeTrimEnd; e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,1);//standardize the error codes m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; continue; } if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ){ //removed2 a tpos.size of 3 indicates trimming due ot cM e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,2); removed2++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold){ e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,3); removed1++; continue; } if( holdOut && hThreshold < ( noOfOppHom ) / hlength ){ e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,4); removed3++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } //removed3 not_removed++; e_obj.finalOutPut(i,j,temp1,temp2,min_cm); continue; } //end FullPlusDropped //Calculate Error1 if( (option.compare("Error1") == 0 ) || (option.compare("ErrorRandom1") == 0) || (option.compare("Error") == 0) ){ if( ( (beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ //dropped before trimming //don't bother printing out ma for this one. But go back and change it so that it doesn't actually calc it std::vector<float>movingAverages;//null //trying something special in this case. This can be removed once idrops aren't being trimmed //test code temp1 = beforeTrimStart; temp2 = beforeTrimEnd; // e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,1); m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; //seems ok continue; } if( (( temp2-temp1 ) < min_snp) || ((trimPositions.size() == 3) && (trimPositions[2] == 2) ) ) //dropped after trimming { e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,2); ++removed2; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold ) //dropped due to pie { e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,3); ++removed1; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } not_removed++; e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,0);//no drop continue; }//end error1 int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } //update drop order 2/26/14 if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ) { ++removed2; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold ) { ++removed1; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } //probably not removed? not_removed++; if( option.compare("MovingAverages")==0 ) //make this ma2 { if( holdOut) { e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp,min_cm,movingAverages,trimPositions,per_err, noOfOppHom, hlength ); } else { e_obj.middleOutPut(i,j,temp1,temp2, min_snp, min_cm,movingAverages, trimPositions,per_err ); } continue; } if(option.compare("Error2")==0 || option.compare( "ErrorRandom2" ) == 0) { if( holdOut) { e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err, noOfOppHom, hlength ); } else { e_obj.middleOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err); } continue; } if ( holdOut && hThreshold < ( noOfOppHom ) / hlength ) { ++removed3; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( option.compare("Error3")==0 || option.compare( "ErrorRandom3" ) == 0 ) { e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err, noOfOppHom, hlength ); } }//l }//j }//i /*ENTERING TESTING AREA DEC 4th 2014*/ /***************************** ******************************/ /*Now, let's handle weighted output if need be*/ if( option.compare("weightedOutput") == 0 ){ float snp_average_count = 0.0; int start_position; int end_position; int genome_length; if(isUserSuppliedWeights()){ //the user has supplied their own weights. //in this case, the min and max values correspond to the number of lines in the input file, //since each line represents a snp. So the min is always 0, and the max is always the number of lines-1. start_position = 0; end_position = user_supplied_snp_weights.size() - 1; }else { start_position = find_genome_min(); end_position = find_genome_max(); }//end else genome_length = (end_position - start_position)+1; genome_vector.resize(genome_length,0); if(isUserSuppliedWeights()){ for(int i = 0; i < user_supplied_snp_weights.size(); i++){ update_genome(i,user_supplied_snp_weights[i]); } }else{ /*This next for loop adds one to each snp in a SH. Bypass it if the user gives a files of weights*/ for(int i = 0; i < m_weighted_sh.size(); i++){ update_genome(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2); } } //this part is next...will probably need to add stuff to that weighted object... snp_average_count = average_snp_count(); for(int i = 0; i < m_weighted_sh.size(); i++){ m_weighted_sh[i].snp_weight = update_snp_weight(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2); } for(int i = 0; i < m_weighted_sh.size(); i++){ m_weighted_sh[i].final_weight = ( snp_average_count / (m_weighted_sh[i].snp_weight)); e_obj.weightedOutput(m_weighted_sh[i].per1, m_weighted_sh[i].per2, m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].final_weight); } } if (option.compare("weightedOutputBP") == 0){ //begin new test code section here: Dec 4th 2014 int genome_length = e_obj.getGenomeBPLength(); float adjusted_genome_length = genome_length / 1000.0; //L using kbp for now int genome_min = e_obj.getMinimumBP(); std::cout<<"genome_min= "<<genome_min<<std::endl; int genome_max = e_obj.getMaximumBP(); std::cout<<"genome_max= "<<genome_max<<std::endl; int genome_size_snps = (find_genome_max() - find_genome_min())+1; //used for genome_vector float wprime_numerator = 0.0; //This is Ci / L float total_sh_length_sum = 0.0; float w2prime_denominator = 0.0; genome_vector.resize(genome_size_snps,0); //resize and zero out the genome. shit that needs to be snps. //update all of the snp counts in the genome. This looks fine. for(int i = 0; i < m_weighted_sh.size(); i++){ update_genome(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2); } //calculate the w' numerator by summing up all of the snp counts and dividing by the genome length. //WARNING: This can cause wprime_numerator to overflow. Currently using kbp units to avoid this, but //this needs to be addressed. for(int i = 0; i < genome_vector.size(); i++){ wprime_numerator += genome_vector[i] / adjusted_genome_length; } //Calculate w' for each SH. for(int i = 0; i < m_weighted_sh.size(); i++){ float wprime_denominator = 0.0; m_weighted_sh[i].mbp_length = (e_obj.getSHBPLength(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2)/1000.0); wprime_denominator = get_snps_over_range(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].mbp_length); m_weighted_sh[i].wprime = wprime_numerator / wprime_denominator; } //This is the total length of all SH. This can probably overflow as well...ugh. for(int i = 0; i < m_weighted_sh.size(); i++){ total_sh_length_sum += m_weighted_sh[i].mbp_length; } //Calculate the w2prime denominator - this value is a constant for(int i = 0; i < m_weighted_sh.size(); i++){ float temp = m_weighted_sh[i].mbp_length * m_weighted_sh[i].wprime; w2prime_denominator += temp / total_sh_length_sum; } //Calculate and output w2' for each SH for(int i = 0; i < m_weighted_sh.size(); i++){ m_weighted_sh[i].w2prime = (m_weighted_sh[i].wprime) / w2prime_denominator; e_obj.weightedOutput(m_weighted_sh[i].per1, m_weighted_sh[i].per2, m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].w2prime); } } /*End weighted output*/ /*END TESTING AREA DEC 4th 2014*/ /***************************** ******************************/ ma_drop_str = "No of matches removed due to length of trimming by moving averages: " + NumberToString( removed2 ); pie_drop_str = "No of matches removed due to percentage error: " + NumberToString( removed1 ); if(holdOut){ // str = str+ " \n No of matches removed due hold out ped file checking: "+ NumberToString( removed3 ); } //begin log output std::string parameter_string_1 = "\n\n**********Parameters used in program**********\n"; e_obj.log(parameter_string_1); e_obj.log(emp_ma_thresh_str); //keep e_obj.log(emp_pie_thresh_str);//keep parameter_string_1 = "**********************************************\n\n"; e_obj.log(parameter_string_1); std::string total_count_str = "The total number of SH in the input file was: " + NumberToString(total_count); e_obj.log(total_count_str); e_obj.log(consolidated_str); e_obj.log(initial_drop_str); // e_obj.log(ibg_str); e_obj.log(ma_drop_str); e_obj.log(pie_drop_str); final_sh_str = "Total number of SH that were not dropped is: " + NumberToString(not_removed); e_obj.log(final_sh_str); }//end performTrim