/** 
 * Train one of the cross-validation bins 
 * @param set identification number of the bin that is processed
 * @param updateDOC whether to recalculate the retention features 
 *        @see DescriptionOfCorrect
 * @param cposCandidates candidate soft margin parameters for positives
 * @param cfracCandidates candidate soft margin parameters for the fraction Cneg/Cpos
 * @param bestCpos output: best soft margin parameter for positives
 * @param bestCfrac output: best soft margin parameter for the fraction Cneg/Cpos
 * @param pWeights output weight vector from the SVM algorithm
 * @param pOptions options for the SVM algorithm
 * @return estimated number of target PSMs below testFdr_ for the best parameters
 */
int CrossValidation::processSingleFold(unsigned int set, bool updateDOC, 
    const vector<double>& cposCandidates, const vector<double>& cfracCandidates, 
    double &bestCpos, double &bestCfrac, vector_double* pWeights, 
    options * pOptions) {
  int bestTruePos = 0;
  if (VERB > 2) {
    cerr << "cross validation - fold " << set + 1 << " out of "
         << numFolds_ << endl;
  }
  
  vector<double> ww = w_[set]; // SVM weights initial guess and result holder
  vector<double> bestW = w_[set]; // SVM weights with highest true pos estimate
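  // Rank this fold's PSMs with the current weights so that the training set
  // selection below is based on up-to-date scores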
  trainScores_[set].calcScores(ww, selectionFdr_);
  if (DataSet::getCalcDoc() && updateDOC) {
    trainScores_[set].recalculateDescriptionOfCorrect(selectionFdr_);
  }
  
  AlgIn* svmInput = svmInputs_[set % numAlgInObjects_];
  
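  // Populate svmInput: decoys serve as negative examples, targets scoring
  // within selectionFdr_ as positive examples (the 1.0 arguments are assumed
  // to be sampling fractions)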
  trainScores_[set].generateNegativeTrainingSet(*svmInput, 1.0);
  trainScores_[set].generatePositiveTrainingSet(*svmInput, selectionFdr_, 1.0);
  if (VERB > 2) {
    cerr << "Calling with " << svmInput->positives << " positives and "
         << svmInput->negatives << " negatives\n";
  }
  
  // Create storage vector for SVM algorithm
  struct vector_double* Outputs = new vector_double;
  size_t numInputs = svmInput->positives + svmInput->negatives;
  Outputs->vec = new double[numInputs];
  Outputs->d = numInputs;
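  // Outputs receives one SVM decision value per training example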
  
  // Find soft margin parameters with highest estimate of true positives
  std::vector<double>::const_iterator itCpos = cposCandidates.begin();
  for ( ; itCpos != cposCandidates.end(); ++itCpos) {
    double cpos = *itCpos;  
    std::vector<double>::const_iterator itCfrac = cfracCandidates.begin();
    for ( ; itCfrac != cfracCandidates.end(); ++itCfrac) {
      double cfrac = *itCfrac;
      if (VERB > 2) cerr << "- cross validation with cpos=" << cpos
          << ", cfrac=" << cfrac << endl;
      int tp = 0;
      for (int ix = 0; ix < pWeights->d; ix++) {
        pWeights->vec[ix] = 0;
      }
      for (int ix = 0; ix < Outputs->d; ix++) {
        Outputs->vec[ix] = 0;
      }
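      // The negative-class cost is specified relative to the positive-class
      // cost: Cneg = Cpos * Cfrac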
      svmInput->setCost(cpos, cpos * cfrac);
      
      // Call SVM algorithm (see ssl.cpp)
      L2_SVM_MFN(*svmInput, pOptions, pWeights, Outputs);
      
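      // Copy the learned weights back into ww; the vector holds one entry per
      // feature plus one for the bias term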
      for (int i = FeatureNames::getNumFeatures() + 1; i--;) {
        ww[i] = pWeights->vec[i];
      }
      // sub-optimal cross validation (it would be better to measure
      // performance on a set disjoint from the training set)
      tp = trainScores_[set].calcScores(ww, testFdr_);
      if (VERB > 2) {
        cerr << "- cross validation estimates " << tp
             << " target PSMs over " << testFdr_ * 100 << "% FDR level"
             << endl;
      }
      if (tp >= bestTruePos) {
        if (VERB > 2) {
          cerr << "Better than previous result, store this" << endl;
        }
        bestTruePos = tp;
        bestW = ww;
        bestCpos = cpos;
        bestCfrac = cfrac;
      }
    }
    if (VERB > 2) {
      std::cerr << "cross validation estimates " << 
          bestTruePos / (numFolds_-1) << " target PSMs with q<" << testFdr_ <<
          " for hyperparameters Cpos=" << bestCpos << 
          ", Cneg=" << bestCfrac * bestCpos << std::endl;
    }
  }
  w_[set] = bestW;
  delete[] Outputs->vec;
  delete Outputs;
  return bestTruePos;
}
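
For orientation, the sketch below shows roughly how a caller could drive this
routine over all folds. It is a minimal, hypothetical driver, not Percolator's
actual training loop: the trainAllFoldsSketch name, the candidate grid values,
and the defaulted options are illustrative assumptions; only processSingleFold,
vector_double, options, and FeatureNames::getNumFeatures() come from the code
above.

// Hypothetical driver (illustrative only): run every fold with shared
// hyperparameter grids, reusing one weight buffer across folds.
void CrossValidation::trainAllFoldsSketch() {
  std::vector<double> cposCandidates;
  cposCandidates.push_back(0.1);   // grid values are illustrative,
  cposCandidates.push_back(1.0);   // not Percolator's actual defaults
  cposCandidates.push_back(10.0);
  std::vector<double> cfracCandidates(cposCandidates); // same grid for Cfrac

  // One slot per feature plus one for the bias term, as in the copy loop above
  vector_double* pWeights = new vector_double;
  pWeights->d = FeatureNames::getNumFeatures() + 1;
  pWeights->vec = new double[pWeights->d];

  options* pOptions = new options; // solver options; field setup omitted here

  for (unsigned int set = 0; set < numFolds_; ++set) {
    double bestCpos = 1.0, bestCfrac = 1.0;
    processSingleFold(set, true /*updateDOC*/, cposCandidates, cfracCandidates,
                      bestCpos, bestCfrac, pWeights, pOptions);
  }

  delete[] pWeights->vec;
  delete pWeights;
  delete pOptions;
}

Note that sharing pWeights across folds is safe here: processSingleFold zeroes
it before every solver call, so no state leaks between candidate runs or folds.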
Example #2
/** 
 * Train one of the cross-validation bins.
 * Uses the member w, the list of normal vectors (in the linear-algebra sense)
 * of the SVM hyperplanes, one for each bin.
 * @param set identification number of the bin that is processed
 * @param updateDOC whether to recalculate the retention features @see DescriptionOfCorrect
 * @param cpos_vec candidate soft margin parameters for positives
 * @param cfrac_vec candidate soft margin parameters for the fraction Cneg/Cpos
 * @param best_cpos output: best soft margin parameter for positives
 * @param best_cfrac output: best soft margin parameter for the fraction Cneg/Cpos
 * @param pWeights output weight vector from the SVM algorithm
 * @param pOptions options for the SVM algorithm
 * @return estimated number of target PSMs below testFdr for the best parameters
 */
int CrossValidation::xv_process_one_bin(unsigned int set, bool updateDOC, vector<double>& cpos_vec, 
         vector<double>& cfrac_vec, double &best_cpos, double &best_cfrac, vector_double* pWeights,
         options * pOptions) {
  int bestTP = 0;
  if (VERB > 2) {
    cerr << "cross validation - fold " << set + 1 << " out of "
         << xval_fold << endl;
  }
  
  vector<double> ww = w[set]; // normal vector initial guess and result holder
  vector<double> bestW = w[set]; // normal vector with highest true positive estimate
  xv_train[set].calcScores(ww, selectionFdr);
  if (DataSet::getCalcDoc() && updateDOC) {
    xv_train[set].recalculateDescriptionOfCorrect(selectionFdr);
  }
  xv_train[set].generateNegativeTrainingSet(*svmInput, 1.0);
  xv_train[set].generatePositiveTrainingSet(*svmInput, selectionFdr, 1.0);
  if (VERB > 2) {
    cerr << "Calling with " << svmInput->positives << " positives and "
         << svmInput->negatives << " negatives\n";
  }
  
  // Create storage vector for SVM algorithm
  struct vector_double* Outputs = new vector_double;
  Outputs->vec = new double[svmInput->positives + svmInput->negatives];
  Outputs->d = svmInput->positives + svmInput->negatives;
  
  // Find combination of soft margin parameters with highest estimate of true positives
  BOOST_FOREACH (const double cpos, cpos_vec) {
    BOOST_FOREACH (const double cfrac, cfrac_vec) {
      if (VERB > 2) cerr << "- cross validation with cpos=" << cpos
          << ", cfrac=" << cfrac << endl;
      int tp = 0;
      for (int ix = 0; ix < pWeights->d; ix++) {
        pWeights->vec[ix] = 0;
      }
      for (int ix = 0; ix < Outputs->d; ix++) {
        Outputs->vec[ix] = 0;
      }
      svmInput->setCost(cpos, cpos * cfrac);
      
      // Call SVM algorithm (see ssl.cpp)
      L2_SVM_MFN(*svmInput, pOptions, pWeights, Outputs);
      
      for (int i = FeatureNames::getNumFeatures() + 1; i--;) {
        ww[i] = pWeights->vec[i];
      }
      // sub-optimal cross validation (it would be better to measure
      // performance on a set disjoint from the training set)
      tp = xv_train[set].calcScores(ww, testFdr);
      if (VERB > 2) {
        cerr << "- cross validation estimates " << tp
             << " target PSMs over " << testFdr * 100 << "% FDR level"
             << endl;
      }
      if (tp >= bestTP) {
        if (VERB > 2) {
          cerr << "Better than previous result, store this" << endl;
        }
        bestTP = tp;
        bestW = ww;
        best_cpos = cpos;
        best_cfrac = cfrac;
      }
    }
    if (VERB > 2) cerr << "cross validation estimates " << bestTP
        / (xval_fold - 1) << " target PSMs with q<" << testFdr
        << " for hyperparameters Cpos=" << best_cpos << ", Cneg="
        << best_cfrac * best_cpos << endl;
  }
  w[set] = bestW;
  delete[] Outputs->vec;
  delete Outputs;
  return bestTP;
}