Ejemplo n.º 1
0
/**
 * Scans every segment stored in m_trueMatches (pairs i <= j) and gathers two
 * kinds of statistics from it.
 *
 *  1. Moving averages: the middle half of the segment (25% trimmed from each
 *     side) is run through e_obj.checkErrors() / e_obj.getFinalErrors(). When
 *     empirical_ma_threshold is negative, the largest value returned by
 *     e_obj.getTrueMovingAverages() for that region is handed to
 *     e_obj.addMaxAverage().
 *  2. Error rate: a segment is drawn at random from m_matches and its length
 *     is used as the width of a window that starts 15% into the true segment.
 *     The e_obj.getThreshold() value for that window is appended to m_errors;
 *     when holdOut is set, e_obj.getOppHomThreshold() divided by the window
 *     width is appended to m_holdOutErrors.
 *
 * Segments whose `end` is -1 are skipped entirely. A segment contributes no
 * error rate when the randomly drawn length is <= 0 or not shorter than the
 * 15%-trimmed true segment.
 *
 * After the scan, the cutoff passed to e_obj.setCutoff() is either
 * empirical_ma_threshold (when it is >= 0) or
 * e_obj.getXthPercentile(ma_threshold) computed over the sorted maxima.
 * m_errors and m_holdOutErrors are sorted and their sizes are logged through
 * e_obj.log().
 *
 * @param e_obj                   Error calculator that performs the per-segment work.
 * @param ma_snp_ends             Not used by this function; kept so callers stay valid.
 * @param holdOut                 When true, also fills m_holdOutErrors.
 * @param window                  Forwarded to e_obj.getTrueMovingAverages().
 * @param ma_threshold            Forwarded to e_obj.getXthPercentile().
 * @param empirical_ma_threshold  Cutoff to use as-is when >= 0; a negative value
 *                                requests that the cutoff be derived from the data.
 */
void Consolidator::findTruePctErrors( ErrorCalculator &e_obj,int ma_snp_ends, bool holdOut,int window,float ma_threshold, float empirical_ma_threshold )
{
   (void) ma_snp_ends; // unused here, silence the compiler warning

   const bool deriveCutoff = ( empirical_ma_threshold < 0.0 );

   // The random draw below retries until it lands on a non-empty m_matches
   // cell. If no such cell exists the retry loop can never finish, so the
   // pool is scanned once (lazily) before the first draw.
   bool poolScanned = false;
   bool poolHasSegments = false;

   for( int i = 0; i < person_count; ++i )
   {
      for( int j = i; j < person_count; ++j )
      {
         for( std::size_t l = 0; l < m_trueMatches[ i ][ j ].size(); ++l )
         {
            // NOTE(review): start/end are treated as int here, as the rest of
            // this file does when copying them into int locals - confirm.
            const int segStart = m_trueMatches[ i ][ j ][ l ].start;
            const int segEnd   = m_trueMatches[ i ][ j ][ l ].end;
            if( segEnd == -1 )
            {
               continue;
            }

            // ---- moving averages over the middle half of the segment ----
            int midStart = static_cast<int>( segStart + ( segEnd - segStart ) * 0.25 );
            int midEnd   = static_cast<int>( segEnd - ( segEnd - segStart ) * 0.25 );

            // Evaluated even when the cutoff is user supplied, exactly as before:
            // it is not visible from here whether these calls have side effects.
            std::vector<std::vector<int> > trueErrors = e_obj.checkErrors( i, j, midStart, midEnd );
            std::vector<int> finalTrueErrors = e_obj.getFinalErrors( trueErrors );

            if( deriveCutoff )
            {
               std::vector<float> av = e_obj.getTrueMovingAverages( finalTrueErrors, midStart, midEnd, window );
               // Guard: reading av[0] of an empty vector is undefined behaviour.
               if( !av.empty() )
               {
                  e_obj.addMaxAverage( *std::max_element( av.begin(), av.end() ) );
               }
            }

            // ---- error rate over a window of randomly drawn length ----
            int pieStart = static_cast<int>( segStart + ( segEnd - segStart ) * 0.15 );
            const int trimmedEnd = static_cast<int>( segEnd - ( segEnd - segStart ) * 0.15 );
            const int trimmedLen = trimmedEnd - pieStart;

            if( !poolScanned )
            {
               poolScanned = true;
               for( int a = 0; a < person_count && !poolHasSegments; ++a )
               {
                  for( int b = a; b < person_count; ++b )
                  {
                     if( m_matches[ a ][ b ].size() > 0 )
                     {
                        poolHasSegments = true;
                        break;
                     }
                  }
               }
            }
            if( !poolHasSegments )
            {
               // Nothing to draw from; the retry loop below would spin forever.
               continue;
            }

            // Draw an ordered pair (randPers1 <= randPers2) until it has at
            // least one segment. The order of std::rand() calls is kept so a
            // seeded run produces the same sequence as before.
            int randPers1 = 0;
            int randPers2 = 0;
            do
            {
               randPers1 = std::rand() % person_count;
               randPers2 = std::rand() % person_count;
               if( randPers1 > randPers2 )
               {
                  std::swap( randPers1, randPers2 );
               }
            }
            while( m_matches[ randPers1 ][ randPers2 ].size() == 0 );

            const int pos = std::rand() % m_matches[ randPers1 ][ randPers2 ].size();
            const int len = m_matches[ randPers1 ][ randPers2 ][ pos ].end
                            - m_matches[ randPers1 ][ randPers2 ][ pos ].start;
            if( len >= trimmedLen || len <= 0 )
            {
               continue;
            }
            int pieEnd = pieStart + len;

            std::vector<std::vector<int> > errors = e_obj.checkErrors( i, j, pieStart, pieEnd );
            std::vector<int> finalErrors = e_obj.getFinalErrors( errors );
            float per_err = e_obj.getThreshold( finalErrors, pieStart, pieEnd );
            m_errors.push_back( per_err );

            if( holdOut )
            {
               // pieEnd - pieStart equals len, which is > 0 at this point.
               // NOTE(review): if getOppHomThreshold() returns an integer type
               // this division truncates - confirm against its declaration.
               float oppHom = ( e_obj.getOppHomThreshold( i, j, pieStart, pieEnd ) ) / ( pieEnd - pieStart );
               m_holdOutErrors.push_back( oppHom );
            }
         }
      }
   }

   // Choose the cutoff used for the moving-average test on all other segments.
   float cutoff = empirical_ma_threshold;
   if( deriveCutoff )
   {
      std::vector<float> maxes = e_obj.getMaxAverages();
      std::sort( maxes.begin(), maxes.end() );
      e_obj.setMaxAverage( maxes );
      // NOTE(review): behaviour of getXthPercentile() when no maxima were
      // recorded is not visible from here - confirm it tolerates an empty set.
      cutoff = e_obj.getXthPercentile( ma_threshold );
   }
   e_obj.setCutoff( cutoff );

   std::sort( m_errors.begin(), m_errors.end() );
   std::sort( m_holdOutErrors.begin(), m_holdOutErrors.end() );

   std::string str = " \n No of elements in error check are: "
                     + NumberToString( m_errors.size() );
   str = str + " \n No of elements in hold  error check are: "
             + NumberToString( m_holdOutErrors.size() );

   e_obj.log( str );
}
Ejemplo n.º 2
0
/**
 * Scans every segment stored in m_trueMatches (pairs i <= j) and gathers two
 * kinds of statistics from it.
 *
 *  1. Moving averages: the middle half of the segment (25% trimmed from each
 *     side) is run through e_obj.checkErrors() / e_obj.getFinalErrors(). When
 *     empirical_ma_threshold is negative, the largest value returned by
 *     e_obj.getTrueMovingAverages() for that region is handed to
 *     e_obj.addMaxAverage().
 *  2. Error rate: the segment is narrowed from both ends, using
 *     e_obj.getCMDistance(), towards a window of width PIElength centred on the
 *     segment's midpoint in getCMDistance() units. The e_obj.getThreshold()
 *     value for the narrowed window is appended to m_errors.
 *
 * Segments whose `end` is -1 are skipped entirely.
 *
 * After the scan, the cutoff passed to e_obj.setCutoff() is either
 * empirical_ma_threshold (when it is >= 0) or
 * e_obj.getXthPercentile(ma_threshold) computed over the sorted maxima.
 * The summary strings ma_thresh_str, emp_ma_thresh_str and ibg_str are
 * updated, and m_errors / m_holdOutErrors are sorted.
 *
 * @param e_obj                   Error calculator that performs the per-segment work.
 * @param PIElength               Width of the centred window, in getCMDistance() units.
 * @param holdOut                 Not used by this function; kept so callers stay valid.
 * @param window                  Forwarded to e_obj.getTrueMovingAverages().
 * @param ma_threshold            Forwarded to e_obj.getXthPercentile().
 * @param empirical_ma_threshold  Cutoff to use as-is when >= 0; a negative value
 *                                requests that the cutoff be derived from the data.
 */
void Consolidator::findTrueSimplePctErrors( ErrorCalculator &e_obj, float PIElength, bool holdOut,int window, float ma_threshold, float empirical_ma_threshold )
{
  (void) holdOut; // unused here, silence the compiler warning

  const bool deriveCutoff = ( empirical_ma_threshold < 0.0 );

  for( int i = 0; i < person_count; ++i )
  {
    for( int j = i; j < person_count; ++j )
    {
      for( std::size_t l = 0; l < m_trueMatches[ i ][ j ].size(); ++l )
      {
        const int segStart = m_trueMatches[ i ][ j ][ l ].start;
        const int segEnd   = m_trueMatches[ i ][ j ][ l ].end;
        if( segEnd == -1 )
        {
          continue;
        }

        // ---- moving averages over the middle half of the segment ----
        int midStart = static_cast<int>( segStart + ( segEnd - segStart ) * 0.25 );
        int midEnd   = static_cast<int>( segEnd - ( segEnd - segStart ) * 0.25 );

        // Evaluated even when the cutoff is user supplied, exactly as before:
        // it is not visible from here whether these calls have side effects.
        std::vector<std::vector<int> > trueErrors = e_obj.checkErrors( i, j, midStart, midEnd );
        std::vector<int> finalTrueErrors = e_obj.getFinalErrors( trueErrors );

        if( deriveCutoff )
        {
          std::vector<float> av = e_obj.getTrueMovingAverages( finalTrueErrors, midStart, midEnd, window );
          // Guard: reading av[0] of an empty vector is undefined behaviour.
          if( !av.empty() )
          {
            e_obj.addMaxAverage( *std::max_element( av.begin(), av.end() ) );
          }
        }

        // ---- error rate over a PIElength-wide window around the midpoint ----
        int temp1 = segStart;
        int temp2 = segEnd;
        const float startCM = e_obj.getCMDistance( temp1 );
        const float endCM = e_obj.getCMDistance( temp2 );
        const float mid1CM = startCM + ( endCM - startCM ) / 2 - PIElength / 2;
        const float mid2CM = startCM + ( endCM - startCM ) / 2 + PIElength / 2;

        // Move the left cursor right until it lies past mid1CM and the right
        // cursor left until it lies before mid2CM.
        // NOTE(review): nothing stops the cursors from crossing or leaving the
        // segment (e.g. no position inside the window, or PIElength <= 0);
        // confirm the callers rule that out.
        while( e_obj.getCMDistance( temp1 ) <= mid1CM || e_obj.getCMDistance( temp2 ) >= mid2CM )
        {
          if( e_obj.getCMDistance( temp1 ) <= mid1CM )
          {
            ++temp1;
          }
          if( e_obj.getCMDistance( temp2 ) >= mid2CM )
          {
            --temp2;
          }
        }

        std::vector<std::vector<int> > errors = e_obj.checkErrors( i, j, temp1, temp2 );
        std::vector<int> finalErrors = e_obj.getFinalErrors( errors );
        float per_err = e_obj.getThreshold( finalErrors, temp1, temp2 );
        m_errors.push_back( per_err );
      }
    }
  }

  // Choose the cutoff used for the moving-average test on all other segments.
  float cutoff = empirical_ma_threshold;
  if( deriveCutoff )
  {
    std::vector<float> maxes = e_obj.getMaxAverages();
    std::sort( maxes.begin(), maxes.end() );
    e_obj.setMaxAverage( maxes );
    // NOTE(review): behaviour of getXthPercentile() when no maxima were
    // recorded is not visible from here - confirm it tolerates an empty set.
    cutoff = e_obj.getXthPercentile( ma_threshold );
  }
  e_obj.setCutoff( cutoff );

  // ma_thresh_str is only refreshed when the cutoff was derived from the data;
  // emp_ma_thresh_str is written in both cases with the same wording.
  // NOTE(review): the "usign" typo is kept verbatim in case the output is
  // matched by downstream tooling.
  if( deriveCutoff )
  {
    ma_thresh_str = "User supplied ma-threshold is: " + NumberToString(ma_threshold);
  }
  emp_ma_thresh_str = "Moving Averages will be tested usign the empirical threshold: " + NumberToString(cutoff);

  std::sort( m_errors.begin(), m_errors.end() );
  // This function never appends to m_holdOutErrors; the sort is kept so the
  // member is left in the same state as before.
  std::sort( m_holdOutErrors.begin(), m_holdOutErrors.end() );
  ibg_str = "No of segments deemed to be IBD for finding empirical error threshold "
            + NumberToString( m_errors.size() );
}