void DetectabilitySimulation::svmFilter_(FeatureMapSim& features)
{
  // Collect the unmodified peptide sequence of every feature as input for the SVM.
  vector<String> sequences(features.size());
  for (Size idx = 0; idx < features.size(); ++idx)
  {
    sequences[idx] = features[idx].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString();
  }

  // Run the detectability prediction for all peptides at once.
  vector<DoubleReal> labels;
  vector<DoubleReal> detectabilities;
  predictDetectabilities(sequences, labels, detectabilities);

  // Keep all map-level meta data but start with an empty feature list.
  FeatureMapSim filtered(features);
  filtered.clear(false);

  // Retain only those features whose predicted detectability exceeds the threshold.
  for (Size idx = 0; idx < sequences.size(); ++idx)
  {
    if (detectabilities[idx] > min_detect_)
    {
      features[idx].setMetaValue("detectability", detectabilities[idx]);
      filtered.push_back(features[idx]);
    }
#ifdef DEBUG_SIM
    cout << detectabilities[idx] << " " << min_detect_ << endl;
#endif
  }
  features.swap(filtered);
}
void RTSimulation::noRTColumn_(FeatureMapSim& features)
{
  // No chromatographic column is simulated: every feature gets the sentinel RT of -1.
  for (Size i = 0; i < features.size(); ++i)
  {
    features[i].setRT(-1);
  }
}
void DetectabilitySimulation::noFilter_(FeatureMapSim& features)
{
  // Filtering is disabled: mark every peptide as perfectly detectable (1.0).
  const DoubleReal default_detectability = 1.0;
  for (Size i = 0; i < features.size(); ++i)
  {
    features[i].setMetaValue("detectability", default_detectability);
  }
}
void RTSimulation::predictContaminantsRT(FeatureMapSim& contaminants)
{
  // Contaminants elute at a uniformly random point within the gradient time.
  for (FeatureMapSim::iterator cont_it = contaminants.begin(); cont_it != contaminants.end(); ++cont_it)
  {
    cont_it->setRT(gsl_ran_flat(rnd_gen_->technical_rng, 0, total_gradient_time_));
  }
}
/// Rebuilds the labeling consensus map so its consensus features point at the
/// features of @p simulated_features (matched via the "parent_feature" meta value).
/// Only consensus features whose elements were ALL found again are kept; groups
/// are split by identical "charge_adducts" composition.
void BaseLabeler::recomputeConsensus_(const FeatureMapSim & simulated_features)
{
  // iterate over all given features stored in the labeling consensus and try to find the corresponding feature in
  // in the feature map

  // build index for faster access:
  // id_map:     parent-feature unique id -> indices of derived features in simulated_features
  // features_per_labeled_map: map_index  -> number of features carrying that map_index
  Map<String, IntList> id_map;
  Map<UInt64, Size> features_per_labeled_map;
  for (Size i = 0; i < simulated_features.size(); ++i)
  {
    if (simulated_features[i].metaValueExists("parent_feature"))
    {
      LOG_DEBUG << "Checking [" << i << "]: " << simulated_features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toString()
                << " with charge " << simulated_features[i].getCharge() << " (" << simulated_features[i].getMetaValue("charge_adducts") << ")"
                << " parent was " << simulated_features[i].getMetaValue("parent_feature") << std::endl;
      id_map[simulated_features[i].getMetaValue("parent_feature")].push_back((Int)i);

      // features without an explicit "map_index" fall into submap 0
      UInt64 map_index = 0;
      if (simulated_features[i].metaValueExists("map_index"))
      {
        map_index = simulated_features[i].getMetaValue("map_index");
      }
      ++features_per_labeled_map[map_index];
    }
  }

  for (Map<String, IntList>::iterator it = id_map.begin(); it != id_map.end(); ++it)
  {
    LOG_DEBUG << it->first << " " << it->second << std::endl;
  }

  // new consensus map
  ConsensusMap new_cm;

  // initialize submaps in consensus map
  // NOTE(review): every file description gets the SAME unique id (that of the
  // simulated feature map) — presumably intentional since all submaps derive
  // from one simulation; verify against downstream consumers.
  for (Map<UInt64, Size>::Iterator it = features_per_labeled_map.begin(); it != features_per_labeled_map.end(); ++it)
  {
    new_cm.getFileDescriptions()[it->first].size = it->second;
    new_cm.getFileDescriptions()[it->first].unique_id = simulated_features.getUniqueId();
  }

  for (ConsensusMap::iterator cm_iter = consensus_.begin(); cm_iter != consensus_.end(); ++cm_iter)
  {
    bool complete = true;
    LOG_DEBUG << "Checking consensus feature containing: " << std::endl;

    // check if we have all elements of current CF in the new feature map (simulated_features)
    for (ConsensusFeature::iterator cf_iter = (*cm_iter).begin(); cf_iter != (*cm_iter).end(); ++cf_iter)
    {
      complete &= id_map.has(String((*cf_iter).getUniqueId()));
      LOG_DEBUG << "\t" << String((*cf_iter).getUniqueId()) << std::endl;
    }

    if (complete)
    {
      // get all elements sorted by charge state; since the same charge can be achieved by different
      // adduct compositions we use the adduct-string as indicator to find the groups
      Map<String, std::set<FeatureHandle, FeatureHandle::IndexLess> > charge_mapping;

      for (ConsensusFeature::iterator cf_iter = (*cm_iter).begin(); cf_iter != (*cm_iter).end(); ++cf_iter)
      {
        IntList feature_indices = id_map[String((*cf_iter).getUniqueId())];
        for (IntList::iterator it = feature_indices.begin(); it != feature_indices.end(); ++it)
        {
          UInt64 map_index = 0;
          if (simulated_features[*it].metaValueExists("map_index"))
          {
            map_index = simulated_features[*it].getMetaValue("map_index");
          }

          // group by adduct composition string; create the set on first sight
          if (charge_mapping.has(simulated_features[*it].getMetaValue("charge_adducts")))
          {
            charge_mapping[simulated_features[*it].getMetaValue("charge_adducts")].insert(FeatureHandle(map_index, simulated_features[*it]));
          }
          else
          {
            LOG_DEBUG << "Create new set with charge composition " << simulated_features[*it].getMetaValue("charge_adducts") << std::endl;
            std::set<FeatureHandle, FeatureHandle::IndexLess> fh_set;
            fh_set.insert(FeatureHandle(map_index, simulated_features[*it]));
            charge_mapping.insert(std::make_pair(simulated_features[*it].getMetaValue("charge_adducts"), fh_set));
          }
        }
      }

      // create new consensus feature from derived features (separated by charge, if charge != 0)
      for (Map<String, std::set<FeatureHandle, FeatureHandle::IndexLess> >::const_iterator charge_group_it = charge_mapping.begin(); charge_group_it != charge_mapping.end(); ++charge_group_it)
      {
        ConsensusFeature cf;
        // charge is taken from the first handle of the group; all members share it by construction
        cf.setCharge((*(*charge_group_it).second.begin()).getCharge());
        cf.setMetaValue("charge_adducts", charge_group_it->first);

        std::vector<PeptideIdentification> ids;
        for (std::set<FeatureHandle, FeatureHandle::IndexLess>::const_iterator fh_it = (charge_group_it->second).begin(); fh_it != (charge_group_it->second).end(); ++fh_it)
        {
          cf.insert(*fh_it);
          // append identifications
          Size f_index = simulated_features.uniqueIdToIndex(fh_it->getUniqueId());
          std::vector<PeptideIdentification> ids_feature = simulated_features[f_index].getPeptideIdentifications();
          ids.insert(ids.end(), ids_feature.begin(), ids_feature.end());
        }

        cf.computeMonoisotopicConsensus();
        cf.setPeptideIdentifications(ids);
        new_cm.push_back(cf);
      }
    }
  }
  new_cm.setProteinIdentifications(simulated_features.getProteinIdentifications());
  consensus_.swap(new_cm);
  consensus_.applyMemberFunction(&UniqueIdInterface::ensureUniqueId);
}
void RTSimulation::calculateMT_(FeatureMapSim& features, std::vector<DoubleReal>& predicted_retention_times) { Map<String, double> q_cterm, q_nterm, q_aa_basic, q_aa_acidic; getChargeContribution_(q_cterm, q_nterm, q_aa_basic, q_aa_acidic); DoubleReal alpha = param_.getValue("CE:alpha"); bool auto_scale = (param_.getValue("auto_scale") == "true"); DoubleReal c = (auto_scale ? 1 : (DoubleReal)param_.getValue("CE:lenght_d") * (DoubleReal)param_.getValue("CE:length_total") / (DoubleReal)param_.getValue("CE:voltage")); predicted_retention_times.resize(features.size()); for (Size i = 0; i < features.size(); ++i) { String seq = features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString(); // ** determine charge of peptide ** DoubleReal charge = 0; // C&N term charge contribution if (q_nterm.has(seq[0])) charge += q_nterm[seq[0]]; if (q_cterm.has(seq.suffix(1))) charge += q_cterm[seq.suffix(1)]; // sidechains ... Map<String, Size> frequency_table; features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().getAAFrequencies(frequency_table); for (Map<String, Size>::const_iterator it = frequency_table.begin(); it != frequency_table.end(); ++it) { if (q_aa_basic.has(it->first)) charge += q_aa_basic[it->first] * it->second; if (q_aa_acidic.has(it->first)) charge += q_aa_acidic[it->first] * it->second; } // ** determine mass of peptide DoubleReal mass = features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().getFormula().getAverageWeight(); // ** mobility (mu = mu_ep + mu_eo = (q/MW^alpha) + mu_eo DoubleReal mu = (charge / std::pow(mass, alpha)) + (auto_scale ? 
0 : (DoubleReal)param_.getValue("CE:mu_eo")); predicted_retention_times[i] = c / mu; // this is L_d*L_t / (mu * V) as "c = L_d*L_t/V" } // ** only when Auto-Scaling is active ** / std::vector<DoubleReal> rt_sorted(predicted_retention_times); std::sort(rt_sorted.begin(), rt_sorted.end()); DoubleReal max_rt = rt_sorted.back(); if (auto_scale) { max_rt = 1; // highest will be scaled to 1 //std::cerr << "minRT: " << rt_sorted[0] << " max: " << rt_sorted.back() << "\n"; // normalize to 5th - 95th percentile (we want to avoid that few outliers with huge/small MT can compress the others to a small MT range): DoubleReal mt_5p = rt_sorted[rt_sorted.size() * 5 / 100]; DoubleReal mt_95p = rt_sorted[rt_sorted.size() * 95 / 100]; // ... assume 95% MT range at 95th percentile DoubleReal range = std::max(1.0, (mt_95p - mt_5p) * 0.9); //std::cerr << " 5% MT: " << mt_5p << ", 95% MT: " << mt_95p << " Range: " << range << "\n"; DoubleReal new_offset = mt_5p - range * 0.05; // scale MT's between 0 and 1 (except for outliers --> which will get <0 or >1) for (Size i = 0; i < features.size(); ++i) { predicted_retention_times[i] = (predicted_retention_times[i] - new_offset) / range; } } // the width factor is 1.0 at MT=0 and reaches its max (default 2.0) at MT=max DoubleReal rt_widening_max = 2.0; for (Size i = 0; i < features.size(); ++i) { features[i].setMetaValue("RT_CE_width_factor", (predicted_retention_times[i] / max_rt * (rt_widening_max - 1) + 1)); } }
/**
  @brief Gets a feature map containing the peptides and predicts for those the retention times.

  Depending on the "rt_column" parameter the RTs are either disabled ("none",
  all set to -1), computed as CE migration times ("CE") or SVM-predicted HPLC
  retention times ("HPLC"). Afterwards an affine transformation plus Gaussian
  per-feature noise is applied; features whose final RT falls outside
  [gradient_min_, gradient_max_] (or below 0) are removed from the map.
  EGH peak-shape parameters ("RT_egh_variance", "RT_egh_tau") are diced per
  feature from Cauchy distributions with bounded resampling.
*/
void RTSimulation::predictRT(FeatureMapSim& features)
{
  LOG_INFO << "RT Simulation ... started" << std::endl;

  vector<DoubleReal> predicted_retention_times;
  bool is_relative = (param_.getValue("auto_scale") == "true");
  if (param_.getValue("rt_column") == "none")
  {
    noRTColumn_(features);
    return;
  }
  // CE or HPLC:
  else if (param_.getValue("rt_column") == "CE")
  {
    calculateMT_(features, predicted_retention_times);
  }
  else if (param_.getValue("rt_column") == "HPLC")
  {
    // HPLC prediction works on the (modified) AASequence objects
    vector<AASequence> peptides_aa_vector(features.size());
    for (Size i = 0; i < features.size(); ++i)
    {
      peptides_aa_vector[i] = features[i].getPeptideIdentifications()[0].getHits()[0].getSequence();
    }
    wrapSVM(peptides_aa_vector, predicted_retention_times);
  }
  // NOTE(review): an unknown "rt_column" value leaves predicted_retention_times
  // empty, silently clearing the feature map below — presumably validated
  // upstream; confirm against parameter registration.

  // rt error dicing
  SimCoordinateType rt_offset = param_.getValue("variation:affine_offset");
  SimCoordinateType rt_scale = param_.getValue("variation:affine_scale");
  SimCoordinateType rt_ft_stddev = param_.getValue("variation:feature_stddev");

  // fm_tmp keeps the map meta data but collects only the surviving features
  FeatureMapSim fm_tmp(features);
  fm_tmp.clear(false);
  StringList deleted_features;
  for (Size i = 0; i < predicted_retention_times.size(); ++i)
  {
    // relative -> absolute RT's (with border)
    if (is_relative)
    {
      predicted_retention_times[i] *= total_gradient_time_;
    }

    //overwrite RT (if given by user) — lowercase "rt" is applied BEFORE noise
    if (features[i].metaValueExists("rt"))
    {
      predicted_retention_times[i] = features[i].getMetaValue("rt");
    }
    // add variation
    SimCoordinateType rt_error = gsl_ran_gaussian(rnd_gen_->technical_rng, rt_ft_stddev) + rt_offset;
    predicted_retention_times[i] = predicted_retention_times[i] * rt_scale + rt_error;
    //overwrite RT [no randomization] (if given by user) — uppercase "RT" wins over everything
    if (features[i].metaValueExists("RT"))
    {
      predicted_retention_times[i] = features[i].getMetaValue("RT");
    }

    // remove invalid peptides & (later) display removed ones
    if (
      (predicted_retention_times[i] < 0.0) || // check for invalid RT
      (predicted_retention_times[i] > gradient_max_) || // check if RT is not in scan window
      (predicted_retention_times[i] < gradient_min_) // check if RT is not in scan window
      )
    {
      deleted_features.push_back(features[i].getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString() + " [" + String::number(predicted_retention_times[i], 2) + "]");
      continue;
    }
    features[i].setRT(predicted_retention_times[i]);

    // determine shape parameters for EGH
    DoubleReal variance = egh_variance_location_ + (egh_variance_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_));
    DoubleReal tau = egh_tau_location_ + (egh_tau_scale_ == 0 ? 0 : gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_));

    // resample variance if it is below 0
    // try this only 10 times to avoid endless loop in case of
    // a bad parameter combination
    Size retry_variance_sampling = 0;
    while ((variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_)) && retry_variance_sampling < 9)
    {
      variance = egh_variance_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_variance_scale_);
      ++retry_variance_sampling;
    }

    // fall back to the user-defined location if resampling failed
    if (variance <= 0 || (fabs(variance - egh_variance_location_) > 10 * egh_variance_scale_))
    {
      LOG_ERROR << "Sigma^2 was negative, resulting in a feature with width=0. Tried to resample 10 times and then stopped. Setting it to the user defined width value of " << egh_variance_location_ << "!" << std::endl;
      variance = egh_variance_location_;
    }

    // resample tau if the value is to big
    // try this only 10 times to avoid endless loop in case of
    // a bad parameter combination
    Size retry_tau_sampling = 0;
    while (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_ && retry_tau_sampling < 9)
    {
      tau = egh_tau_location_ + gsl_ran_cauchy(rnd_gen_->technical_rng, egh_tau_scale_);
      ++retry_tau_sampling;
    }

    // fall back to the user-defined location if resampling failed
    if (fabs(tau - egh_tau_location_) > 10 * egh_tau_scale_)
    {
      LOG_ERROR << "Tau is to big for a reasonable feature. Tried to resample 10 times and then stopped. Setting it to the user defined skewness value of " << egh_tau_location_ << "!" << std::endl;
      tau = egh_tau_location_;
    }

    features[i].setMetaValue("RT_egh_variance", variance);
    features[i].setMetaValue("RT_egh_tau", tau);

    fm_tmp.push_back(features[i]);
  }

  // print invalid features:
  if (deleted_features.size() > 0)
  {
    LOG_WARN << "RT prediction gave 'invalid' results for " << deleted_features.size() << " peptide(s), making them unobservable.\n";
    if (deleted_features.size() < 100) LOG_WARN << " " << ListUtils::concatenate(deleted_features, "\n ") << std::endl;
    else LOG_WARN << " (List is too big to show)" << std::endl;
  }

  // only retain valid features:
  features.swap(fm_tmp);

  features.sortByPosition();
  features.updateRanges();
}