Ejemplo n.º 1
0
/**
 * Emit every match that is still alive through the error calculator.
 *
 * Walks the upper triangle of the person-by-person match table (second index
 * starts at the first index) and forwards each stored match to
 * ErrorCalculator::finalOutPut.
 *
 * A match is skipped when either of its endpoints is -1 (the marker used for
 * dropped matches) or when (end - start) is smaller than min_snp.
 *
 * @param e        calculator that receives the surviving matches
 * @param min_cm   forwarded unchanged to e.finalOutPut
 * @param min_snp  minimum (end - start) a match needs in order to be emitted
 */
void Consolidator::finalOutPut(ErrorCalculator &e,float min_cm, int min_snp )const
{
    for (int first = 0; first < person_count; ++first)
    {
        for (int second = first; second < person_count; ++second)
        {
            for (int idx = 0; idx < m_matches[first][second].size(); ++idx)
            {
                const int segStart = m_matches[first][second][idx].start;
                const int segEnd   = m_matches[first][second][idx].end;

                // Only complete matches of sufficient length are reported.
                const bool isAlive = (segStart != -1) && (segEnd != -1);
                if (isAlive && (segEnd - segStart) >= min_snp)
                {
                    e.finalOutPut(first, second, segStart, segEnd, min_cm);
                }
            }
        }
    }
}
/**
 * Overloaded version of performTrim: trims every stored match, applies the
 * drop filters, writes the output selected by `option`, and logs a summary.
 *
 * For each live match (end != -1) in the upper triangle of m_matches:
 *   1. optionally widen the [start, end] window by extendSnp on both sides;
 *   2. collect the errors for the pair via e_obj.checkErrors/getFinalErrors
 *      and make sure the error list starts with an implied error at 1;
 *   3. either record an "initial drop" (trimPositions = {start, end, 1}) or
 *      compute moving averages and ask e_obj for the trim positions;
 *   4. write the trimmed start/end back into m_matches;
 *   5. run the drop cascade for the selected output mode.
 *
 * Drop reasons and their counters:
 *   removed4 - dropped before trimming (window shorter than min_snp, or
 *              trimPositions has a third element equal to 1)
 *   removed2 - dropped after trimming (too short, or trimPositions has three
 *              elements)
 *   removed1 - percentage error above the resolved threshold
 *   removed3 - hold-out check failed
 * Only removed1 and removed2 are written to the log; removed3 and removed4
 * are counted but not reported.
 *
 * After the loop, the "weightedOutput" and "weightedOutputBP" modes compute
 * and emit a weight for every match collected in m_weighted_sh.
 *
 * @param e_obj                   calculator used for error detection, trimming and all output
 * @param window                  forwarded to e_obj.getMovingAverages
 * @param ma_snp_ends             forwarded to e_obj.getThreshold
 * @param ma_threshold            not used by this overload
 * @param min_snp                 minimum (end - start) for a match to survive
 * @param min_cm                  forwarded to e_obj (initial drop check, trimming, output)
 * @param per_err_threshold       input to getPctErrThreshold() when no empirical value is given
 * @param option                  output mode; compared against "weightedOutput", "weightedOutputBP",
 *                                "finalErrorsOutput", "FullPlusDropped", "Error", "Error1..3",
 *                                "ErrorRandom1..3" and "MovingAverages"
 * @param hThreshold              input to getHoldOutThreshold() when holdOut is set
 * @param holdOut                 enables the hold-out check
 * @param empirical_threshold     moving-average cutoff; negative means use e_obj.getCutoff()
 * @param empirical_pie_threshold percentage-error threshold; negative means derive it from
 *                                per_err_threshold
 * @param extendSnp               number of positions to widen each match by before the
 *                                moving-average calculation (0 disables)
 */
void Consolidator::performTrim(ErrorCalculator& e_obj,int window,
                               int ma_snp_ends, float ma_threshold,
                               int min_snp,float min_cm,
                               float per_err_threshold, std::string option,
                               float hThreshold, bool holdOut,float empirical_threshold, float empirical_pie_threshold,int extendSnp)
{
  // NOTE(review): upper bound for the extended end position. The original
  // code carried a "change this constant" remark; it presumably should be
  // derived from the number of markers in the data set - confirm.
  const int kMaxExtendedEnd = 4443;

  int removed1 = 0, removed2 = 0, removed3 = 0, removed4 = 0;
  int not_removed = 0;
  int total_count = global_initial;

  // Resolve the percentage-error threshold: an explicit empirical value wins.
  if( empirical_pie_threshold >= 0.0 )
  {
    per_err_threshold = empirical_pie_threshold;
  }
  else
  {
    per_err_threshold = getPctErrThreshold( per_err_threshold );
  }
  std::stringstream sstr;
  sstr << std::fixed << std::setprecision(10) << per_err_threshold;
  std::string per_err_value = sstr.str();
  emp_pie_thresh_str = "empirical pie threshold is : " + per_err_value  + " \n";

  // The hold-out threshold is only meaningful when the check is enabled.
  if( holdOut )
  {
    hThreshold = getHoldOutThreshold( hThreshold );
  }
  else
  {
    hThreshold = 0;
  }

  // The output mode does not change while iterating, so classify it once.
  const bool isWeighted        = (option == "weightedOutput") || (option == "weightedOutputBP");
  const bool isFinalErrors     = (option == "finalErrorsOutput");
  const bool isFullPlusDropped = (option == "FullPlusDropped");
  const bool isError1          = (option == "Error1") || (option == "ErrorRandom1") || (option == "Error");
  const bool isError2          = (option == "Error2") || (option == "ErrorRandom2");
  const bool isError3          = (option == "Error3") || (option == "ErrorRandom3");
  const bool isMovingAverages  = (option == "MovingAverages");
  const bool isRandomPair      = (option == "ErrorRandom1") || (option == "ErrorRandom2") || (option == "ErrorRandom3");

  for(int i=0;i<person_count;i++)
  {
    for(int j=i;j<person_count;j++)
    {
      for(int l=0;l<static_cast<int>(m_matches[i][j].size());l++)
      {
        total_count++;
        if(m_matches[i][j][l].end==-1)
        {
          continue; // already dropped
        }

        int temp1 = m_matches[i][j][l].start;
        int temp2 = m_matches[i][j][l].end;

        if( extendSnp != 0 )
        {
          // Widen the window; the start is clamped at 0, the end is left
          // untouched (not clamped) when the extension would pass the bound.
          temp1 = ( temp1 - extendSnp < 0 ) ? 0 : temp1 - extendSnp;
          if( temp2 + extendSnp <= kMaxExtendedEnd )
          {
            temp2 = temp2 + extendSnp;
          }
        }

        // The ErrorRandom* modes evaluate the window on a random ordered pair.
        int pers1 = i, pers2 = j;
        if( isRandomPair )
        {
          pers1 = std::rand() % e_obj.getNoOfPersons();
          pers2 = std::rand() % e_obj.getNoOfPersons();
          if( pers1 > pers2 )
          {
            int swapTmp = pers1;
            pers1 = pers2;
            pers2 = swapTmp;
          }
        }

        std::vector<std::vector<int> > errors = e_obj.checkErrors(pers1, pers2, temp1, temp2);
        std::vector<int> finalErrors = e_obj.getFinalErrors(errors);

        // Inject the implied error at position 1 unless it is already there.
        // The emptiness check prevents reading element 0 of an empty vector.
        if( finalErrors.empty() || finalErrors[0] != 1 )
        {
          finalErrors.insert(finalErrors.begin(), 1);
        }

        std::vector<int> trimPositions;
        std::vector<float> movingAverages;
        if( (e_obj.isInitialCmDrop(temp1,temp2,min_cm)) || ((temp2-temp1) < min_snp) )
        {
          // Initial drop: skip the moving-average work, tag with code 1.
          trimPositions.push_back(temp1);
          trimPositions.push_back(temp2);
          trimPositions.push_back(1);
        }
        else
        {
          movingAverages = e_obj.getMovingAverages(finalErrors,temp1,temp2,window,extendSnp);
          float threshold;
          if( empirical_threshold < 0.0 )
          {
            threshold = e_obj.getCutoff();
          }
          else
          {
            threshold = empirical_threshold;
          }
          trimPositions = e_obj.getTrimPositions(movingAverages,temp1,temp2,threshold,min_cm);
        }

        // Store the trimmed coordinates. The end must be updated first
        // because both offsets are relative to the untrimmed start.
        int beforeTrimStart = temp1;
        int beforeTrimEnd = temp2;
        m_matches[i][j][l].end = temp2 = temp1+trimPositions[1];
        m_matches[i][j][l].start = temp1 = temp1+trimPositions[0];
        int del0 = trimPositions[0];
        int del1 = trimPositions[1];
        float per_err = e_obj.getThreshold(finalErrors,del0,del1,ma_snp_ends);

        // Hold-out statistics. Every mode except the Error1 family computes
        // them right after per_err, so they are evaluated once here.
        int hlength = 0;
        float noOfOppHom = 0;
        if( holdOut && !isError1 )
        {
          int snp1 = e_obj.getNewSnp( temp1 );
          int snp2 = e_obj.getNewSnp( temp2 );
          hlength = snp2 - snp1;
          if( hlength <= 0 )
          {
            hlength = 1; // keeps the ratio below well defined
          }
          noOfOppHom = e_obj.getOppHomThreshold( pers1, pers2, m_matches[i][j][l].start, m_matches[i][j][l].end );
        }

        // Shared drop predicates (pure expressions over the values above).
        const bool droppedBeforeTrim =
            ( (beforeTrimEnd - beforeTrimStart) < min_snp ) ||
            ( (trimPositions.size() == 3) && (trimPositions[2] == 1) );
        // Three trim positions indicate trimming due to cM.
        const bool droppedAfterTrim =
            ( (temp2 - temp1) < min_snp ) || ( trimPositions.size() == 3 );

        // ---- weightedOutput / weightedOutputBP: only collect survivors ----
        if( isWeighted )
        {
          if( droppedBeforeTrim )
          {
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            removed4++;
            continue;
          }
          if( droppedAfterTrim )
          {
            removed2++;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }
          if( per_err > per_err_threshold )
          {
            // NOTE(review): unlike the Error1 and default paths, the match is
            // not marked as dropped (-1) here - confirm this is intended.
            removed1++;
            continue;
          }
          if( holdOut && hThreshold < ( noOfOppHom ) / hlength )
          {
            removed3++;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }

          not_removed++;
          m_weighted_sh.push_back(Weighted_SH(temp1,temp2,i,j)); // weights are computed after the loop
          continue;
        }

        // ---- finalErrorsOutput: survivors plus their percentage error ----
        if( isFinalErrors )
        {
          if( droppedBeforeTrim )
          {
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            removed4++;
            continue;
          }
          if( droppedAfterTrim )
          {
            removed2++;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }
          if( per_err > per_err_threshold )
          {
            // NOTE(review): match is not marked -1 here - confirm.
            removed1++;
            continue;
          }
          if( holdOut && hThreshold < ( noOfOppHom ) / hlength )
          {
            removed3++;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }

          not_removed++;
          e_obj.finalErrorsOutput(i,j,temp1,temp2,min_cm,per_err);
          continue;
        }

        // ---- FullPlusDropped: survivors plus every drop with a reason code ----
        if( isFullPlusDropped )
        {
          if( droppedBeforeTrim )
          {
            // Report the untrimmed coordinates for matches dropped up front.
            temp1 = beforeTrimStart;
            temp2 = beforeTrimEnd;
            e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,1);
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            removed4++;
            continue;
          }
          if( droppedAfterTrim )
          {
            e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,2);
            removed2++;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }
          if( per_err > per_err_threshold )
          {
            // NOTE(review): match is not marked -1 here - confirm.
            e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,3);
            removed1++;
            continue;
          }
          if( holdOut && hThreshold < ( noOfOppHom ) / hlength )
          {
            e_obj.fullPlusDroppedOutput(i,j,temp1,temp2,min_snp,min_cm,finalErrors,per_err,4);
            removed3++;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }

          not_removed++;
          e_obj.finalOutPut(i,j,temp1,temp2,min_cm);
          continue;
        }

        // ---- Error / Error1 / ErrorRandom1: one error record per match ----
        if( isError1 )
        {
          if( droppedBeforeTrim )
          {
            // Matches dropped before trimming are reported with an empty
            // moving-average list and their untrimmed coordinates.
            std::vector<float> noMovingAverages;
            temp1 = beforeTrimStart;
            temp2 = beforeTrimEnd;
            e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,noMovingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,1);
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            removed4++;
            continue;
          }
          // This mode only treats code 2 as a post-trim drop.
          if( (( temp2-temp1 ) < min_snp) || ((trimPositions.size() == 3) && (trimPositions[2] == 2) ) )
          {
            e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,2);
            ++removed2;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }
          if( per_err > per_err_threshold )
          {
            e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,3);
            ++removed1;
            m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
            continue;
          }
          not_removed++;
          e_obj.errorOutput(i,j,temp1,temp2,min_snp,min_cm,movingAverages,finalErrors,per_err,temp1,temp2,beforeTrimStart,beforeTrimEnd,0); // no drop
          continue;
        }

        // ---- Every other mode (MovingAverages, Error2/3, ErrorRandom2/3, plain) ----
        if( droppedAfterTrim )
        {
          ++removed2;
          m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
          continue;
        }
        if( per_err > per_err_threshold )
        {
          ++removed1;
          m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
          continue;
        }

        // NOTE(review): the match is counted as kept before the hold-out
        // check further down, so a hold-out drop is counted in both
        // not_removed and removed3 - confirm whether that is intended.
        not_removed++;

        if( isMovingAverages )
        {
          if( holdOut )
          {
            e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp,min_cm,movingAverages,trimPositions,per_err, noOfOppHom, hlength );
          }
          else
          {
            e_obj.middleOutPut(i,j,temp1,temp2, min_snp, min_cm,movingAverages, trimPositions,per_err );
          }
          continue;
        }

        if( isError2 )
        {
          if( holdOut )
          {
            e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err, noOfOppHom, hlength );
          }
          else
          {
            e_obj.middleOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err);
          }
          continue;
        }

        if( holdOut && hThreshold < ( noOfOppHom ) / hlength )
        {
          ++removed3;
          m_matches[i][j][l].start = m_matches[i][j][l].end = -1;
          continue;
        }

        if( isError3 )
        {
          e_obj.middleHoldOutPut(i,j,temp1,temp2, min_snp, min_cm, finalErrors, trimPositions, per_err, noOfOppHom, hlength );
        }
      }//l
    }//j
  }//i

  /* Weighted output based on per-position coverage counts. */
  if( option.compare("weightedOutput") == 0 )
  {
    int start_position;
    int end_position;
    if( isUserSuppliedWeights() )
    {
      // With user-supplied weights each entry stands for one position, so
      // the range is always 0 .. (number of entries - 1).
      start_position = 0;
      end_position = user_supplied_snp_weights.size() - 1;
    }
    else
    {
      start_position = find_genome_min();
      end_position = find_genome_max();
    }
    int genome_length = (end_position - start_position)+1;
    genome_vector.resize(genome_length,0);

    if( isUserSuppliedWeights() )
    {
      for(int i = 0; i < static_cast<int>(user_supplied_snp_weights.size()); i++)
      {
        update_genome(i,user_supplied_snp_weights[i]);
      }
    }
    else
    {
      // Register every surviving match in the genome vector; skipped when
      // the user provides the weights directly.
      for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
      {
        update_genome(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2);
      }
    }

    float snp_average_count = average_snp_count();
    for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
    {
      m_weighted_sh[i].snp_weight = update_snp_weight(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2);
    }
    for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
    {
      m_weighted_sh[i].final_weight = ( snp_average_count / (m_weighted_sh[i].snp_weight));
      e_obj.weightedOutput(m_weighted_sh[i].per1, m_weighted_sh[i].per2, m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].final_weight);
    }
  }

  /* Weighted output normalised by base-pair length (test code, Dec 4th 2014). */
  if( option.compare("weightedOutputBP") == 0 )
  {
    int genome_length = e_obj.getGenomeBPLength();
    float adjusted_genome_length = genome_length / 1000.0; // lengths are scaled by 1/1000 (kbp) for now
    int genome_min = e_obj.getMinimumBP(); std::cout<<"genome_min= "<<genome_min<<std::endl;
    int genome_max = e_obj.getMaximumBP(); std::cout<<"genome_max= "<<genome_max<<std::endl;
    int genome_size_snps = (find_genome_max() - find_genome_min())+1; // genome_vector is indexed by position, not bp
    float wprime_numerator = 0.0;  // this is Ci / L
    float total_sh_length_sum = 0.0;
    float w2prime_denominator = 0.0;

    genome_vector.resize(genome_size_snps,0);

    // Register every surviving match in the genome vector.
    for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
    {
      update_genome(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2);
    }

    // w' numerator: sum of all per-position counts divided by the scaled
    // genome length.
    // WARNING: this sum can exceed float range/precision; kbp units are
    // used to reduce the risk, but this still needs to be addressed.
    for(int i = 0; i < static_cast<int>(genome_vector.size()); i++)
    {
      wprime_numerator += genome_vector[i] / adjusted_genome_length;
    }

    // w' for each match. Note that mbp_length is filled with the bp length
    // divided by 1000.
    for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
    {
      float wprime_denominator = 0.0;
      m_weighted_sh[i].mbp_length = (e_obj.getSHBPLength(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2)/1000.0);
      wprime_denominator = get_snps_over_range(m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].mbp_length);
      m_weighted_sh[i].wprime = wprime_numerator / wprime_denominator;
    }

    // Total length of all matches (same float range caveat as above).
    for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
    {
      total_sh_length_sum += m_weighted_sh[i].mbp_length;
    }

    // w2' denominator: constant across matches.
    for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
    {
      float temp = m_weighted_sh[i].mbp_length * m_weighted_sh[i].wprime;
      w2prime_denominator += temp / total_sh_length_sum;
    }

    // Compute and emit w2' for each match.
    for(int i = 0; i < static_cast<int>(m_weighted_sh.size()); i++)
    {
      m_weighted_sh[i].w2prime = (m_weighted_sh[i].wprime) / w2prime_denominator;
      e_obj.weightedOutput(m_weighted_sh[i].per1, m_weighted_sh[i].per2, m_weighted_sh[i].snp1, m_weighted_sh[i].snp2, m_weighted_sh[i].w2prime);
    }
  }

  /* Summary log. removed3 (hold-out) and removed4 (pre-trim) are counted
     above but are currently not part of the log. */
  ma_drop_str = "No of matches removed due to length of trimming by moving averages: " + NumberToString( removed2 );
  pie_drop_str = "No of matches removed due to percentage error: " + NumberToString( removed1 );

  std::string parameter_string_1 = "\n\n**********Parameters used in program**********\n";
  e_obj.log(parameter_string_1);
  e_obj.log(emp_ma_thresh_str);
  e_obj.log(emp_pie_thresh_str);
  parameter_string_1 = "**********************************************\n\n";
  e_obj.log(parameter_string_1);
  std::string total_count_str = "The total number of SH in the input file was: " + NumberToString(total_count);
  e_obj.log(total_count_str);
  e_obj.log(consolidated_str);
  e_obj.log(initial_drop_str);
  e_obj.log(ma_drop_str);
  e_obj.log(pie_drop_str);
  final_sh_str = "Total number of SH that were not dropped is: " + NumberToString(not_removed);
  e_obj.log(final_sh_str);
}//end performTrim