/** * Train one of the crossvalidation bins * @param set identification number of the bin that is processed * @param updateDOC boolean deciding to calculate retention features * @see DescriptionOfCorrect * @param cposCandidates candidate soft margin parameters for positives * @param cfracCandidates candidate soft margin parameters for fraction neg/pos * @param bestCpos best soft margin parameter for positives * @param bestCfrac best soft margin parameter for fraction neg/pos * @param pWeights results vector from the SVM algorithm * @param pOptions options for the SVM algorithm */ int CrossValidation::processSingleFold(unsigned int set, bool updateDOC, const vector<double>& cposCandidates, const vector<double>& cfracCandidates, double &bestCpos, double &bestCfrac, vector_double* pWeights, options * pOptions) { int bestTruePos = 0; if (VERB > 2) { cerr << "cross validation - fold " << set + 1 << " out of " << numFolds_ << endl; } vector<double> ww = w_[set]; // SVM weights initial guess and result holder vector<double> bestW = w_[set]; // SVM weights with highest true pos estimate trainScores_[set].calcScores(ww, selectionFdr_); if (DataSet::getCalcDoc() && updateDOC) { trainScores_[set].recalculateDescriptionOfCorrect(selectionFdr_); } AlgIn* svmInput = svmInputs_[set % numAlgInObjects_]; trainScores_[set].generateNegativeTrainingSet(*svmInput, 1.0); trainScores_[set].generatePositiveTrainingSet(*svmInput, selectionFdr_, 1.0); if (VERB > 2) { cerr << "Calling with " << svmInput->positives << " positives and " << svmInput->negatives << " negatives\n"; } // Create storage vector for SVM algorithm struct vector_double* Outputs = new vector_double; size_t numInputs = svmInput->positives + svmInput->negatives; Outputs->vec = new double[numInputs]; Outputs->d = numInputs; // Find soft margin parameters with highest estimate of true positives std::vector<double>::const_iterator itCpos = cposCandidates.begin(); for ( ; itCpos != cposCandidates.end(); ++itCpos) { double cpos = *itCpos; std::vector<double>::const_iterator itCfrac = cfracCandidates.begin(); for ( ; itCfrac != cfracCandidates.end(); ++itCfrac) { double cfrac = *itCfrac; if (VERB > 2) cerr << "-cross validation with cpos=" << cpos << ", cfrac=" << cfrac << endl; int tp = 0; for (int ix = 0; ix < pWeights->d; ix++) { pWeights->vec[ix] = 0; } for (int ix = 0; ix < Outputs->d; ix++) { Outputs->vec[ix] = 0; } svmInput->setCost(cpos, cpos * cfrac); // Call SVM algorithm (see ssl.cpp) L2_SVM_MFN(*svmInput, pOptions, pWeights, Outputs); for (int i = FeatureNames::getNumFeatures() + 1; i--;) { ww[i] = pWeights->vec[i]; } // sub-optimal cross validation (better would be to measure // performance on a set disjoint of the training set) tp = trainScores_[set].calcScores(ww, testFdr_); if (VERB > 2) { cerr << "- cross validation estimates " << tp << " target PSMs over " << testFdr_ * 100 << "% FDR level" << endl; } if (tp >= bestTruePos) { if (VERB > 2) { cerr << "Better than previous result, store this" << endl; } bestTruePos = tp; bestW = ww; bestCpos = cpos; bestCfrac = cfrac; } } if (VERB > 2) { std::cerr << "cross validation estimates " << bestTruePos / (numFolds_-1) << " target PSMs with q<" << testFdr_ << " for hyperparameters Cpos=" << bestCpos << ", Cneg=" << bestCfrac * bestCpos << std::endl; } } w_[set] = bestW; delete[] Outputs->vec; delete Outputs; return bestTruePos; }
/** * Train one of the crossvalidation bins * @param set identification number of the bin that is processed * @param w list of normal vectors (in the linear algebra sense) of the hyperplane from SVM, one for each bin * @param updateDOC boolean deciding to calculate retention features @see DescriptionOfCorrect * @param cpos_vec vector with soft margin parameter for positives * @param cfrac_vec vector with soft margin parameter for fraction negatives / positives * @param best_cpos best soft margin parameter for positives * @param best_cfrac best soft margin parameter for fraction negatives / positives * @param pWeights results vector from the SVM algorithm * @param pOptions options for the SVM algorithm */ int CrossValidation::xv_process_one_bin(unsigned int set, bool updateDOC, vector<double>& cpos_vec, vector<double>& cfrac_vec, double &best_cpos, double &best_cfrac, vector_double* pWeights, options * pOptions) { int bestTP = 0; if (VERB > 2) { cerr << "cross validation - fold " << set + 1 << " out of " << xval_fold << endl; } vector<double> ww = w[set]; // normal vector initial guess and result holder vector<double> bestW = w[set]; // normal vector with highest true positive estimate xv_train[set].calcScores(ww, selectionFdr); if (DataSet::getCalcDoc() && updateDOC) { xv_train[set].recalculateDescriptionOfCorrect(selectionFdr); } xv_train[set].generateNegativeTrainingSet(*svmInput, 1.0); xv_train[set].generatePositiveTrainingSet(*svmInput, selectionFdr, 1.0); if (VERB > 2) { cerr << "Calling with " << svmInput->positives << " positives and " << svmInput->negatives << " negatives\n"; } // Create storage vector for SVM algorithm struct vector_double* Outputs = new vector_double; Outputs->vec = new double[svmInput->positives + svmInput->negatives]; Outputs->d = svmInput->positives + svmInput->negatives; // Find combination of soft margin parameters with highest estimate of true positives BOOST_FOREACH (const double cpos, cpos_vec) { BOOST_FOREACH (const double cfrac, cfrac_vec) { if (VERB > 2) cerr << "-cross validation with cpos=" << cpos << ", cfrac=" << cfrac << endl; int tp = 0; for (int ix = 0; ix < pWeights->d; ix++) { pWeights->vec[ix] = 0; } for (int ix = 0; ix < Outputs->d; ix++) { Outputs->vec[ix] = 0; } svmInput->setCost(cpos, (cpos) * (cfrac)); // Call SVM algorithm (see ssl.cpp) L2_SVM_MFN(*svmInput, pOptions, pWeights, Outputs); for (int i = FeatureNames::getNumFeatures() + 1; i--;) { ww[i] = pWeights->vec[i]; } // sub-optimal cross validation (better would be to measure // performance on a set disjoint of the training set) tp = xv_train[set].calcScores(ww, testFdr); if (VERB > 2) { cerr << "- cross validation estimates " << tp << " target PSMs over " << testFdr * 100 << "% FDR level" << endl; } if (tp >= bestTP) { if (VERB > 2) { cerr << "Better than previous result, store this" << endl; } bestTP = tp; bestW = ww; best_cpos = cpos; best_cfrac = cfrac; } } if (VERB > 2) cerr << "cross validation estimates " << bestTP / (xval_fold - 1) << " target PSMs with q<" << testFdr << " for hyperparameters Cpos=" << best_cpos << ", Cneg=" << best_cfrac * best_cpos << endl; }