/**
 @brief Keeps only those features whose predicted detectability exceeds min_detect_

 For every feature the unmodified sequence of the first hit of its first
 peptide identification is handed to predictDetectabilities(). Features whose
 score is strictly greater than min_detect_ are annotated with the meta value
 "detectability" and retained; all others are dropped from the map.

 @param features Feature map to filter; modified in place
*/
void DetectabilitySimulation::svmFilter_(FeatureMapSim& features)
{
  const Size feature_count = features.size();

  // gather one plain (unmodified) sequence string per feature
  vector<String> sequences;
  sequences.reserve(feature_count);
  for (Size idx = 0; idx < feature_count; ++idx)
  {
    sequences.push_back(features[idx].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString());
  }

  vector<DoubleReal> labels;
  vector<DoubleReal> detectabilities;
  predictDetectabilities(sequences, labels, detectabilities);

  // start from a copy of the input map and clear it with clear(false),
  // then refill it with the surviving features only
  FeatureMapSim retained(features);
  retained.clear(false);

  for (Size idx = 0; idx < feature_count; ++idx)
  {
    const DoubleReal score = detectabilities[idx];
    if (score > min_detect_)
    {
      features[idx].setMetaValue("detectability", score);
      retained.push_back(features[idx]);
    }
#ifdef DEBUG_SIM
    cout << score << " " << min_detect_ << endl;
#endif
  }

  // hand the filtered map back to the caller
  features.swap(retained);
}
/** @brief Gets a feature map containing the peptides and predicts for those the retention times */ void RTSimulation::predictRT(FeatureMapSim& features) { LOG_INFO << "RT Simulation ... started" << std::endl; vector<DoubleReal> predicted_retention_times; bool is_relative = (param_.getValue("auto_scale") == "true"); if (param_.getValue("rt_column") == "none") { noRTColumn_(features); return; } // CE or HPLC: else if (param_.getValue("rt_column") == "CE") { calculateMT_(features, predicted_retention_times); } else if (param_.getValue("rt_column") == "HPLC") { vector<AASequence> peptides_aa_vector(features.size()); for (Size i = 0; i < features.size(); ++i) { peptides_aa_vector[i] = features[i].getPeptideIdentifications()[0].getHits()[0].getSequence(); } wrapSVM(peptides_aa_vector, predicted_retention_times); } // rt error dicing SimCoordinateType rt_offset = param_.getValue("variation:affine_offset"); SimCoordinateType rt_scale = param_.getValue("variation:affine_scale"); SimCoordinateType rt_ft_stddev = param_.getValue("variation:feature_stddev"); FeatureMapSim fm_tmp(features); fm_tmp.clear(false); StringList deleted_features; for (Size i = 0; i < predicted_retention_times.size(); ++i) { // relative -> absolute RT's (with border) if (is_relative) { predicted_retention_times[i] *= total_gradient_time_; } //overwrite RT (if given by user) if (features[i].metaValueExists("rt")) { predicted_retention_times[i] = features[i].getMetaValue("rt"); } // add variation SimCoordinateType rt_error = gsl_ran_gaussian(rnd_gen_->technical_rng, rt_ft_stddev) + rt_offset; predicted_retention_times[i] = predicted_retention_times[i] * rt_scale + rt_error; //overwrite RT [no randomization] (if given by user) if (features[i].metaValueExists("RT")) { predicted_retention_times[i] = features[i].getMetaValue("RT"); } // remove invalid peptides & (later) display removed ones if ( (predicted_retention_times[i] < 0.0) || // check for invalid RT (predicted_retention_times[i] > gradient_max_) || 
// check if RT is not in scan window (predicted_retention_times[i] < gradient_min_) // check if RT is not in scan window ) { deleted_features.push_back(features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString() + " [" + String::number(predicted_retention_times[i], 2) + "]"); continue; } features[i].setRT(predicted_retention_times[i]); // determine shape parameters for EGH DoubleReal variance = egh_variance_location_ + (egh_variance_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_)); DoubleReal tau = egh_tau_location_ + (egh_tau_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_)); // resample variance if it is below 0 // try this only 10 times to avoid endless loop in case of // a bad parameter combination Size retry_variance_sampling = 0; while ((variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_)) && retry_variance_sampling < 9) { variance = egh_variance_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_); ++retry_variance_sampling; } if (variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_)) { LOG_ERROR << "Sigma^2 was negative, resulting in a feature with width=0. Tried to resample 10 times and then stopped. Setting it to the user defined width value of " << egh_variance_location_ << "!" << std::endl; variance = egh_variance_location_; } // resample tau if the value is to big // try this only 10 times to avoid endless loop in case of // a bad parameter combination Size retry_tau_sampling = 0; while (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_ && retry_tau_sampling < 9) { tau = egh_tau_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_); ++retry_tau_sampling; } if (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_) { LOG_ERROR << "Tau is to big for a reasonable feature. Tried to resample 10 times and then stopped. 
Setting it to the user defined skewness value of " << egh_tau_location_ << "!" << std::endl; tau = egh_tau_location_; } features[i].setMetaValue("RT_egh_variance", variance); features[i].setMetaValue("RT_egh_tau", tau); fm_tmp.push_back(features[i]); } // print invalid features: if (deleted_features.size() > 0) { LOG_WARN << "RT prediction gave 'invalid' results for " << deleted_features.size() << " peptide(s), making them unobservable.\n"; if (deleted_features.size() < 100) LOG_WARN << " " << ListUtils::concatenate(deleted_features, "\n ") << std::endl; else LOG_WARN << " (List is too big to show)" << std::endl; } // only retain valid features: features.swap(fm_tmp); features.sortByPosition(); features.updateRanges(); }