/*
 * Hold-out trimming pass over every stored match.
 *
 * For each match that is still active (end != -1) the SNP span is obtained
 * from ecal.getNewSnp() on the match start and end, and the opposite-homozygous
 * value is obtained from ecal.getOppHomThreshold(). When the span is non-zero
 * and that value divided by the span exceeds the hold-out threshold, the match
 * is invalidated by setting both start and end to -1.
 *
 * ecal      - error calculator queried for SNP positions and opp-hom values.
 * threshold - raw value; it is passed through getHoldOutThreshold() before use.
 * hMiss     - not referenced in this function body.
 * option    - not referenced in this function body.
 */
void Consolidator::performHoldOutTrim( ErrorCalculator& ecal, float threshold, std::string hMiss, std::string option )
{
    threshold = getHoldOutThreshold( threshold );

    for( int first = 0; first < person_count; ++first )
    {
        for( int second = first; second < person_count; ++second )
        {
            for( int idx = 0; idx < m_matches[first][second].size(); ++idx )
            {
                // Matches already marked as removed are skipped.
                if( m_matches[first][second][idx].end == -1 )
                {
                    continue;
                }

                int startPos = m_matches[first][second][idx].start;
                int endPos = m_matches[first][second][idx].end;

                int mappedStart = ecal.getNewSnp( startPos );
                int mappedEnd = ecal.getNewSnp( endPos );
                int span = mappedEnd - mappedStart;

                float oppHom = ecal.getOppHomThreshold( first, second,
                                                        m_matches[first][second][idx].start,
                                                        m_matches[first][second][idx].end );

                // A zero-length span can never be rated, so the match is kept.
                if( span == 0 )
                {
                    continue;
                }

                if( threshold < oppHom / span )
                {
                    m_matches[first][second][idx].start = m_matches[first][second][idx].end = -1;
                }
            }
        }
    }
}
//overloaded version void Consolidator::performTrim(ErrorCalculator& e_obj,int window, int ma_snp_ends, float ma_threshold, int min_snp,float min_cm, float per_err_threshold, std::string option, float hThreshold, bool holdOut,float empirical_threshold, float empirical_pie_threshold,int extendSnp)//<piyush> added the param int EXTENDSNP for calculating moving window avg) { int removed1 =0, removed2 = 0, removed3 = 0, removed4 = 0; int not_removed = 0; int total_count = global_initial; bool wrongOption = false; float per_err_threshold1; if(empirical_pie_threshold >= 0.0){ per_err_threshold1 = empirical_pie_threshold; } else { per_err_threshold1 = getPctErrThreshold( per_err_threshold ); } std::stringstream sstr; sstr << std::fixed << std::setprecision(10) << per_err_threshold1; std::string per_err_value = sstr.str(); emp_pie_thresh_str = "empirical pie threshold is : " + per_err_value + " \n"; float hThreshold1 = 0; if( holdOut ) { hThreshold1 = getHoldOutThreshold( hThreshold ); } per_err_threshold = per_err_threshold1; hThreshold = hThreshold1; for(int i=0;i<person_count;i++) { for(int j=i;j<person_count;j++) { for(int l=0;l<m_matches[i][j].size();l++) { total_count++; if(m_matches[i][j][l].end==-1) { continue; } int temp1=m_matches[i][j][l].start; //cout<<"temp1 start begin= "<<temp1<<endl; int temp2=m_matches[i][j][l].end; //cout<<"temp2 end begin= "<<temp2<<endl; if (extendSnp != 0) { /*<piyush1>*/ if(temp1-extendSnp <0) { temp1=0; //cout<<"New value of temp1= "<<temp1<<endl; } else { temp1=m_matches[i][j][l].start-extendSnp; //cout<<"New value of temp1= "<<temp1<<endl; } if(temp2+extendSnp > 4443)// change this constant { temp2=m_matches[i][j][l].end; //cout<<"New value of temp2= "<<temp2<<endl; } else { temp2=m_matches[i][j][l].end+extendSnp; //cout<<"New value of temp2= "<<temp2<<endl; } /*till here*/ /*cout<<"temp1 start after= "<<temp1<<endl; cout<<"temp2 end after= "<<temp2<<endl;*/ //cout<<"perform trim temp1= "<<temp1<<endl; //cout<<"perform trim temp2= 
"<<temp2<<endl; } int pers1 = i, pers2 = j; if( option.compare( "ErrorRandom1" ) == 0 || option.compare( "ErrorRandom2" ) == 0 || option.compare( "ErrorRandom3" ) == 0 ) { pers1 = std::rand() % e_obj.getNoOfPersons(); pers2 = std::rand() % e_obj.getNoOfPersons(); if( pers1 > pers2 ) { pers1 = pers1 + pers2; pers2 = pers1 - pers2; pers1 = pers1 - pers2; } } std::vector<std::vector<int> > errors=e_obj.checkErrors(pers1, pers2, temp1, temp2); std::vector<int>finalErrors=e_obj.getFinalErrors(errors);//<piyush for errors> //cout<<"finalErrors size= "<<finalErrors.size()<<endl; /*Inject implied error at start/end of SH here*/ std::vector<int>::iterator it; it = finalErrors.begin(); //go to the start of the vector if(finalErrors[0] != 1){ finalErrors.insert(it,1); //inject an error at position 1, if not already there } /*End inject implied error section*/ std::vector<int>trimPositions; std::vector<float>movingAverages; float threshold; if( (e_obj.isInitialCmDrop(temp1,temp2,min_cm)) || ((temp2-temp1) < min_snp) ){ //initial drop. Don't calculate MA trimPositions.push_back(temp1); trimPositions.push_back(temp2); trimPositions.push_back(1); }else{ movingAverages = e_obj.getMovingAverages(finalErrors,temp1,temp2,window,extendSnp);//<piyush> get moving averages are calculated from this part if(empirical_threshold < 0.0){ threshold = e_obj.getCutoff(); } else { threshold = empirical_threshold; } trimPositions = e_obj.getTrimPositions(movingAverages,temp1,temp2,threshold,min_cm); } //----------------- int beforeTrimStart = temp1; int beforeTrimEnd = temp2; m_matches[i][j][l].end = temp2 = temp1+trimPositions[1]; m_matches[i][j][l].start = temp1 = temp1+trimPositions[0]; int del0 = trimPositions[0]; int del1 = trimPositions[1]; float per_err = e_obj.getThreshold(finalErrors,del0,del1,ma_snp_ends); //add new weighted option /* For this new option, we only output SH that are not dropped. So, the output is finalOutput + weighted column. 
*/ if( (option.compare("weightedOutput") == 0) || (option.compare("weightedOutputBP") == 0) ){ int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } if( ( (beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; continue; } if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ){ //removed2 a tpos.size of 3 indicates trimming due ot cM removed2++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold){ removed1++; continue; } if( holdOut && hThreshold < ( noOfOppHom ) / hlength ){ removed3++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } //removed3 not_removed++; m_weighted_sh.push_back(Weighted_SH(temp1,temp2,i,j)); //build the vector of SH that passed continue; }//end weghtedOutput /*Add new finalErrorsOutput*/ if( (option.compare("finalErrorsOutput") == 0) ){ int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } if( ( (beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ std::vector<float>movingAverages; temp1 = beforeTrimStart; temp2 = beforeTrimEnd; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; continue; } if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ){ //removed2 a tpos.size of 3 indicates trimming due ot cM removed2++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } 
if( per_err > per_err_threshold){ removed1++; continue; } if( holdOut && hThreshold < ( noOfOppHom ) / hlength ){ removed3++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } //removed3 not_removed++; e_obj.finalErrorsOutput(i,j,temp1,temp2,min_cm,per_err);// <piyush>this is where the final output is written is called at continue; }//end finalErrorsOutput if( (option.compare("FullPlusDropped") == 0) ){ int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } if( ( (beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ std::vector<float>movingAverages; temp1 = beforeTrimStart; temp2 = beforeTrimEnd; e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,1);//standardize the error codes m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; continue; } if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ){ //removed2 a tpos.size of 3 indicates trimming due ot cM e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,2); removed2++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold){ e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,3); removed1++; continue; } if( holdOut && hThreshold < ( noOfOppHom ) / hlength ){ e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,4); removed3++; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } //removed3 not_removed++; e_obj.finalOutPut(i,j,temp1,temp2,min_cm); continue; } //end FullPlusDropped //Calculate Error1 if( (option.compare("Error1") == 0 ) || (option.compare("ErrorRandom1") == 0) || (option.compare("Error") == 0) ){ if( ( 
(beforeTrimEnd - beforeTrimStart) < min_snp) || ( (trimPositions.size() == 3) && (trimPositions[2] == 1) ) ){ //dropped before trimming //don't bother printing out ma for this one. But go back and change it so that it doesn't actually calc it std::vector<float>movingAverages;//null //trying something special in this case. This can be removed once idrops aren't being trimmed //test code temp1 = beforeTrimStart; temp2 = beforeTrimEnd; // e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,1); m_matches[i][j][l].start= m_matches[i][j][l].end=-1; removed4++; //seems ok continue; } if( (( temp2-temp1 ) < min_snp) || ((trimPositions.size() == 3) && (trimPositions[2] == 2) ) ) //dropped after trimming { e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,2); ++removed2; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold ) //dropped due to pie { e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,3); ++removed1; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } not_removed++; e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,0);//no drop continue; }//end error1 int snp1 = 0, snp2 = 0, hlength = 0; float noOfOppHom = 0; if( holdOut ) { snp1 = e_obj.getNewSnp( temp1 ); snp2 = e_obj.getNewSnp( temp2 ); hlength = snp2 - snp1; if( hlength <= 0 ) { hlength = 1; } noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end ); } //update drop order 2/26/14 if( (( temp2-temp1 ) < min_snp) || (trimPositions.size() == 3) ) { ++removed2; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( per_err > per_err_threshold ) { ++removed1; m_matches[i][j][l].start= 
m_matches[i][j][l].end=-1; continue; } //probably not removed? not_removed++; if( option.compare("MovingAverages")==0 ) //make this ma2 { if( holdOut) { e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp,min_cm,movingAverages,trimPositions,per_err, noOfOppHom, hlength ); } else { e_obj.middleOutPut(i,j,temp1,temp2, min_snp, min_cm,movingAverages, trimPositions,per_err ); } continue; } if(option.compare("Error2")==0 || option.compare( "ErrorRandom2" ) == 0) { if( holdOut) { e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err, noOfOppHom, hlength ); } else { e_obj.middleOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err); } continue; } if ( holdOut && hThreshold < ( noOfOppHom ) / hlength ) { ++removed3; m_matches[i][j][l].start= m_matches[i][j][l].end=-1; continue; } if( option.compare("Error3")==0 || option.compare( "ErrorRandom3" ) == 0 ) { e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err, noOfOppHom, hlength ); } }//l }//j }//i /*ENTERING TESTING AREA DEC 4th 2014*/ /***************************** ******************************/ /*Now, let's handle weighted output if need be*/ if( option.compare("weightedOutput") == 0 ){ float snp_average_count = 0.0; int start_position; int end_position; int genome_length; if(isUserSuppliedWeights()){ //the user has supplied their own weights. //in this case, the min and max values correspond to the number of lines in the input file, //since each line represents a snp. So the min is always 0, and the max is always the number of lines-1. 
start_position = 0; end_position = user_supplied_snp_weights.size() - 1; }else { start_position = find_genome_min(); end_position = find_genome_max(); }//end else genome_length = (end_position - start_position)+1; genome_vector.resize(genome_length,0); if(isUserSuppliedWeights()){ for(int i = 0; i < user_supplied_snp_weights.size(); i++){ update_genome(i,user_supplied_snp_weights[i]); } }else{ /*This next for loop adds one to each snp in a SH. Bypass it if the user gives a files of weights*/ for(int i = 0; i < m_weighted_sh.size(); i++){ update_genome(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2); } } //this part is next...will probably need to add stuff to that weighted object... snp_average_count = average_snp_count(); for(int i = 0; i < m_weighted_sh.size(); i++){ m_weighted_sh[i].snp_weight = update_snp_weight(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2); } for(int i = 0; i < m_weighted_sh.size(); i++){ m_weighted_sh[i].final_weight = ( snp_average_count / (m_weighted_sh[i].snp_weight)); e_obj.weightedOutput(m_weighted_sh[i].per1, m_weighted_sh[i].per2, m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].final_weight); } } if (option.compare("weightedOutputBP") == 0){ //begin new test code section here: Dec 4th 2014 int genome_length = e_obj.getGenomeBPLength(); float adjusted_genome_length = genome_length / 1000.0; //L using kbp for now int genome_min = e_obj.getMinimumBP(); std::cout<<"genome_min= "<<genome_min<<std::endl; int genome_max = e_obj.getMaximumBP(); std::cout<<"genome_max= "<<genome_max<<std::endl; int genome_size_snps = (find_genome_max() - find_genome_min())+1; //used for genome_vector float wprime_numerator = 0.0; //This is Ci / L float total_sh_length_sum = 0.0; float w2prime_denominator = 0.0; genome_vector.resize(genome_size_snps,0); //resize and zero out the genome. shit that needs to be snps. //update all of the snp counts in the genome. This looks fine. 
for(int i = 0; i < m_weighted_sh.size(); i++){ update_genome(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2); } //calculate the w' numerator by summing up all of the snp counts and dividing by the genome length. //WARNING: This can cause wprime_numerator to overflow. Currently using kbp units to avoid this, but //this needs to be addressed. for(int i = 0; i < genome_vector.size(); i++){ wprime_numerator += genome_vector[i] / adjusted_genome_length; } //Calculate w' for each SH. for(int i = 0; i < m_weighted_sh.size(); i++){ float wprime_denominator = 0.0; m_weighted_sh[i].mbp_length = (e_obj.getSHBPLength(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2)/1000.0); wprime_denominator = get_snps_over_range(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].mbp_length); m_weighted_sh[i].wprime = wprime_numerator / wprime_denominator; } //This is the total length of all SH. This can probably overflow as well...ugh. for(int i = 0; i < m_weighted_sh.size(); i++){ total_sh_length_sum += m_weighted_sh[i].mbp_length; } //Calculate the w2prime denominator - this value is a constant for(int i = 0; i < m_weighted_sh.size(); i++){ float temp = m_weighted_sh[i].mbp_length * m_weighted_sh[i].wprime; w2prime_denominator += temp / total_sh_length_sum; } //Calculate and output w2' for each SH for(int i = 0; i < m_weighted_sh.size(); i++){ m_weighted_sh[i].w2prime = (m_weighted_sh[i].wprime) / w2prime_denominator; e_obj.weightedOutput(m_weighted_sh[i].per1, m_weighted_sh[i].per2, m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].w2prime); } } /*End weighted output*/ /*END TESTING AREA DEC 4th 2014*/ /***************************** ******************************/ ma_drop_str = "No of matches removed due to length of trimming by moving averages: " + NumberToString( removed2 ); pie_drop_str = "No of matches removed due to percentage error: " + NumberToString( removed1 ); if(holdOut){ // str = str+ " \n No of matches removed due hold out ped file checking: "+ 
NumberToString( removed3 ); } //begin log output std::string parameter_string_1 = "\n\n**********Parameters used in program**********\n"; e_obj.log(parameter_string_1); e_obj.log(emp_ma_thresh_str); //keep e_obj.log(emp_pie_thresh_str);//keep parameter_string_1 = "**********************************************\n\n"; e_obj.log(parameter_string_1); std::string total_count_str = "The total number of SH in the input file was: " + NumberToString(total_count); e_obj.log(total_count_str); e_obj.log(consolidated_str); e_obj.log(initial_drop_str); // e_obj.log(ibg_str); e_obj.log(ma_drop_str); e_obj.log(pie_drop_str); final_sh_str = "Total number of SH that were not dropped is: " + NumberToString(not_removed); e_obj.log(final_sh_str); }//end performTrim