// Tool entry point: extracts ion chromatogram (EIC) intensities for a list of
// target positions (EDTA file) from one or more mzML files and writes one
// text table (one row per compound, one column group per input file).
// Optionally detects RT peaks automatically from the TIC ('auto_rt:enabled').
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------
  StringList in = getStringList_("in");
  String edta = getStringOption_("pos");
  String out = getStringOption_("out");
  String out_sep = getStringOption_("out_separator");
  String out_TIC_debug = getStringOption_("auto_rt:out_debug_TIC");

  StringList in_header = getStringList_("in_header");

  // number of out_debug_TIC files and input files must be identical
  /*if (out_TIC_debug.size() > 0 && in.size() != out_TIC_debug.size())
  {
    LOG_FATAL_ERROR << "Error: number of input file 'in' and auto_rt:out_debug_TIC files must be identical!" << std::endl;
    return ILLEGAL_PARAMETERS;
  }*/

  // number of header files and input files must be identical
  if (in_header.size() > 0 && in.size() != in_header.size())
  {
    LOG_FATAL_ERROR << "Error: number of input file 'in' and 'in_header' files must be identical!" << std::endl;
    return ILLEGAL_PARAMETERS;
  }

  // a TIC debug file only makes sense when auto-RT detection actually runs
  if (!getFlag_("auto_rt:enabled") && !out_TIC_debug.empty())
  {
    LOG_FATAL_ERROR << "Error: TIC output file requested, but auto_rt is not enabled! Either do not request the file or switch on 'auto_rt:enabled'." << std::endl;
    return ILLEGAL_PARAMETERS;
  }

  double rttol = getDoubleOption_("rt_tol");
  double mztol = getDoubleOption_("mz_tol");
  Size rt_collect = getIntOption_("rt_collect");

  //-------------------------------------------------------------
  // loading input
  //-------------------------------------------------------------
  MzMLFile mzml_file;
  mzml_file.setLogType(log_type_);
  // NOTE(review): 'exp_pp' is declared but never used in this function.
  MSExperiment<Peak1D> exp, exp_pp;

  EDTAFile ed;
  ConsensusMap cm;
  ed.load(edta, cm);

  StringList tf_single_header0, tf_single_header1, tf_single_header2; // header content, for each column

  std::vector<String> vec_single; // one line for each compound, multiple columns per experiment
  vec_single.resize(cm.size());
  for (Size fi = 0; fi < in.size(); ++fi)
  {
    // load raw data
    mzml_file.load(in[fi], exp);
    exp.sortSpectra(true);

    if (exp.empty())
    {
      LOG_WARN << "The given file does not contain any conventional peak data, but might" " contain chromatograms. This tool currently cannot handle them, sorry." << std::endl;
      return INCOMPATIBLE_INPUT_DATA;
    }

    // try to detect RT peaks (only for the first input file -- all others should align!)
    // cm.size() might change in here...
    if (getFlag_("auto_rt:enabled") && fi == 0)
    {
      ConsensusMap cm_local = cm; // we might have different RT peaks for each map if 'auto_rt' is enabled
      cm.clear(false); // reset global list (about to be filled)

      // compute TIC
      MSChromatogram<> tic = exp.getTIC();
      MSSpectrum<> tics, tic_gf, tics_pp, tics_sn;
      for (Size ic = 0; ic < tic.size(); ++ic)
      { // rewrite Chromatogram to MSSpectrum (GaussFilter requires it);
        // the chromatogram's RT is stored in the spectrum's m/z dimension
        Peak1D peak;
        peak.setMZ(tic[ic].getRT());
        peak.setIntensity(tic[ic].getIntensity());
        tics.push_back(peak);
      }
      // smooth (no PP_CWT here due to efficiency reasons -- large FWHM take longer!)
      // NOTE(review): parameter key is spelled "FHWM" (not "FWHM") -- presumably
      // a typo carried through from the parameter registration; must stay in
      // sync with registerOptionsAndFlags_ -- TODO confirm before renaming.
      double fwhm = getDoubleOption_("auto_rt:FHWM");
      GaussFilter gf;
      Param p = gf.getParameters();
      p.setValue("gaussian_width", fwhm * 2); // wider than FWHM, just to be sure we have a fully smoothed peak. Merging two peaks is unlikely
      p.setValue("use_ppm_tolerance", "false");
      gf.setParameters(p);
      tic_gf = tics;
      gf.filter(tic_gf);

      // pick peaks on the smoothed TIC
      PeakPickerHiRes pp;
      p = pp.getParameters();
      p.setValue("signal_to_noise", getDoubleOption_("auto_rt:SNThreshold"));
      pp.setParameters(p);
      pp.pick(tic_gf, tics_pp);

      if (tics_pp.size())
      {
        LOG_INFO << "Found " << tics_pp.size() << " auto-rt peaks at: ";
        // picked peak positions are RT values (stored in m/z -- see rewrite above)
        for (Size ipp = 0; ipp != tics_pp.size(); ++ipp) LOG_INFO << " " << tics_pp[ipp].getMZ();
      }
      else
      {
        LOG_INFO << "Found no auto-rt peaks. Change threshold parameters!";
      }
      LOG_INFO << std::endl;

      if (!out_TIC_debug.empty()) // if debug file was given
      { // store intermediate steps for debug
        MSExperiment<> out_debug;
        out_debug.addChromatogram(toChromatogram(tics));
        out_debug.addChromatogram(toChromatogram(tic_gf));

        SignalToNoiseEstimatorMedian<MSSpectrum<> > snt;
        snt.init(tics);
        for (Size is = 0; is < tics.size(); ++is)
        {
          Peak1D peak;
          // NOTE(review): indexes 'tic' (the chromatogram) with the loop index
          // of 'tics' (the rewritten spectrum); 'tics[is].getMZ()' looks like
          // the intended source -- TODO confirm (both containers have equal
          // length here, so any difference would be in the accessor semantics).
          peak.setMZ(tic[is].getMZ());
          peak.setIntensity(snt.getSignalToNoise(tics[is]));
          tics_sn.push_back(peak);
        }
        out_debug.addChromatogram(toChromatogram(tics_sn));

        out_debug.addChromatogram(toChromatogram(tics_pp));
        // get rid of "native-id" missing warning
        for (Size id = 0; id < out_debug.size(); ++id) out_debug[id].setNativeID(String("spectrum=") + id);

        mzml_file.store(out_TIC_debug, out_debug);
        LOG_DEBUG << "Storing debug AUTO-RT: " << out_TIC_debug << std::endl;
      }

      // add target EICs: for each m/z with no/negative RT, add all combinations of that m/z with auto-RTs
      // duplicate m/z entries will be ignored!
      // all other lines with positive RT values are copied unaffected
      //do not allow doubles
      std::set<double> mz_doubles;
      for (ConsensusMap::Iterator cit = cm_local.begin(); cit != cm_local.end(); ++cit)
      {
        if (cit->getRT() < 0)
        {
          if (mz_doubles.find(cit->getMZ()) == mz_doubles.end())
          {
            mz_doubles.insert(cit->getMZ());
          }
          else
          {
            LOG_INFO << "Found duplicate m/z entry (" << cit->getMZ() << ") for auto-rt. Skipping ..." << std::endl;
            continue;
          }

          // NOTE(review): 'cm_RT_multiplex' is declared but never used.
          ConsensusMap cm_RT_multiplex;
          // one copy of the feature per detected auto-RT peak
          for (MSSpectrum<>::ConstIterator itp = tics_pp.begin(); itp != tics_pp.end(); ++itp)
          {
            ConsensusFeature f = *cit;
            f.setRT(itp->getMZ()); // picked peak's m/z holds the RT (see TIC rewrite)
            cm.push_back(f);
          }
        }
        else
        { // default feature with no auto-rt
          LOG_INFO << "copying feature with RT " << cit->getRT() << std::endl;
          cm.push_back(*cit);
        }
      }

      // resize, since we have more positions now
      vec_single.resize(cm.size());
    }

    // search for each EIC and add up
    Int not_found(0);
    // NOTE(review): 'quant' is declared but never used in this function.
    Map<Size, double> quant;

    String description;
    if (fi < in_header.size())
    {
      HeaderInfo info(in_header[fi]);
      description = info.header_description;
    }

    if (fi == 0)
    { // two additional columns for first file (theoretical RT and m/z)
      tf_single_header0 << "" << "";
      tf_single_header1 << "" << "";
      tf_single_header2 << "RT" << "mz";
    }

    // 5 entries for each input file
    tf_single_header0 << File::basename(in[fi]) << "" << "" << "" << "";
    tf_single_header1 << description << "" << "" << "" << "";
    tf_single_header2 << "RTobs" << "dRT" << "mzobs" << "dppm" << "intensity";

    for (Size i = 0; i < cm.size(); ++i)
    {
      //std::cerr << "Rt" << cm[i].getRT() << "  mz: " << cm[i].getMZ() << " R " <<  cm[i].getMetaValue("rank") << "\n";
      double mz_da = mztol * cm[i].getMZ() / 1e6; // mz tolerance in Dalton
      // scan the RT x m/z window around the target for the most intense peak
      MSExperiment<>::ConstAreaIterator it = exp.areaBeginConst(cm[i].getRT() - rttol / 2, cm[i].getRT() + rttol / 2, cm[i].getMZ() - mz_da, cm[i].getMZ() + mz_da);
      Peak2D max_peak;
      max_peak.setIntensity(0);
      max_peak.setRT(cm[i].getRT());
      max_peak.setMZ(cm[i].getMZ());
      for (; it != exp.areaEndConst(); ++it)
      {
        if (max_peak.getIntensity() < it->getIntensity())
        {
          max_peak.setIntensity(it->getIntensity());
          max_peak.setRT(it.getRT());
          max_peak.setMZ(it->getMZ());
        }
      }
      double ppm = 0; // observed m/z offset

      if (max_peak.getIntensity() == 0)
      {
        ++not_found;
      }
      else
      {
        // take median for m/z found: average the m/z over up to 'rt_collect'
        // neighboring scans on each side of the apex
        std::vector<double> mz;
        MSExperiment<>::Iterator itm = exp.RTBegin(max_peak.getRT());
        SignedSize low = std::min<SignedSize>(std::distance(exp.begin(), itm), rt_collect);
        SignedSize high = std::min<SignedSize>(std::distance(itm, exp.end()) - 1, rt_collect);
        MSExperiment<>::AreaIterator itt = exp.areaBegin((itm - low)->getRT() - 0.01, (itm + high)->getRT() + 0.01, cm[i].getMZ() - mz_da, cm[i].getMZ() + mz_da);
        for (; itt != exp.areaEnd(); ++itt)
        {
          mz.push_back(itt->getMZ());
          //std::cerr << "ppm: " << itt.getRT() << " " <<  itt->getMZ() << " " << itt->getIntensity() << std::endl;
        }

        // more data points than scans means several peaks fell into the m/z window
        if ((SignedSize)mz.size() > (low + high + 1)) LOG_WARN << "Compound " << i << " has overlapping peaks [" << mz.size() << "/" << low + high + 1 << "]" << std::endl;

        if (!mz.empty())
        {
          double avg_mz = std::accumulate(mz.begin(), mz.end(), 0.0) / double(mz.size());
          //std::cerr << "avg: " << avg_mz << "\n";
          ppm = (avg_mz - cm[i].getMZ()) / cm[i].getMZ() * 1e6;
        }
      }

      // appending the second column set requires separator
      String append_sep = (fi == 0 ? "" : out_sep);

      vec_single[i] += append_sep; // new line
      if (fi == 0)
      {
        vec_single[i] += String(cm[i].getRT()) + out_sep + String(cm[i].getMZ()) + out_sep;
      }
      vec_single[i] += String(max_peak.getRT()) + out_sep + String(max_peak.getRT() - cm[i].getRT()) + out_sep + String(max_peak.getMZ()) + out_sep + String(ppm) + out_sep + String(max_peak.getIntensity());
    }

    if (not_found) LOG_INFO << "Missing peaks for " << not_found << " compounds in file '" << in[fi] << "'.\n";
  }

  //-------------------------------------------------------------
  // create header
  //-------------------------------------------------------------
  // inserted in reverse order so header0 ends up first
  vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header2, out_sep));
  vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header1, out_sep));
  vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header0, out_sep));

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  TextFile tf;
  for (std::vector<String>::iterator v_it = vec_single.begin(); v_it != vec_single.end(); ++v_it)
  {
    tf.addLine(*v_it);
  }
  tf.store(out);

  return EXECUTION_OK;
}
void SimplePairFinder::run(const std::vector<ConsensusMap> & input_maps, ConsensusMap & result_map) { if (input_maps.size() != 2) throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "exactly two input maps required"); checkIds_(input_maps); // progress dots Int progress_dots = 0; if (this->param_.exists("debug::progress_dots")) { progress_dots = (Int) this->param_.getValue("debug:progress_dots"); } Int number_of_considered_element_pairs = 0; // For each element in map 0, find its best friend in map 1 std::vector<UInt> best_companion_index_0(input_maps[0].size(), UInt(-1)); std::vector<double> best_companion_quality_0(input_maps[0].size(), 0); for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0) { double best_quality = -std::numeric_limits<double>::max(); for (UInt fi1 = 0; fi1 < input_maps[1].size(); ++fi1) { double quality = similarity_(input_maps[0][fi0], input_maps[1][fi1]); if (quality > best_quality) { best_quality = quality; best_companion_index_0[fi0] = fi1; } ++number_of_considered_element_pairs; if (progress_dots && !(number_of_considered_element_pairs % progress_dots)) { std::cout << '-' << std::flush; } } best_companion_quality_0[fi0] = best_quality; } // For each element in map 1, find its best friend in map 0 std::vector<UInt> best_companion_index_1(input_maps[1].size(), UInt(-1)); std::vector<double> best_companion_quality_1(input_maps[1].size(), 0); for (UInt fi1 = 0; fi1 < input_maps[1].size(); ++fi1) { double best_quality = -std::numeric_limits<double>::max(); for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0) { double quality = similarity_(input_maps[0][fi0], input_maps[1][fi1]); if (quality > best_quality) { best_quality = quality; best_companion_index_1[fi1] = fi0; } ++number_of_considered_element_pairs; if (progress_dots && !(number_of_considered_element_pairs % progress_dots)) { std::cout << '+' << std::flush; } } best_companion_quality_1[fi1] = best_quality; } // And if both like each other, they become a pair. 
// element_pairs_->clear(); for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0) { // fi0 likes someone ... if (best_companion_quality_0[fi0] > pair_min_quality_) { // ... who likes him too ... UInt best_companion_of_fi0 = best_companion_index_0[fi0]; if (best_companion_index_1[best_companion_of_fi0] == fi0 && best_companion_quality_1[best_companion_of_fi0] > pair_min_quality_ ) { ConsensusFeature f; f.insert(input_maps[0][fi0]); f.insert(input_maps[1][best_companion_of_fi0]); f.computeConsensus(); f.setQuality(best_companion_quality_0[fi0] + best_companion_quality_1[best_companion_of_fi0]); result_map.push_back(f); } } } return; }
void IsobaricChannelExtractor::extractChannels(const MSExperiment<Peak1D>& ms_exp_data, ConsensusMap& consensus_map) { if (ms_exp_data.empty()) { LOG_WARN << "The given file does not contain any conventional peak data, but might" " contain chromatograms. This tool currently cannot handle them, sorry.\n"; throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Experiment has no scans!"); } // clear the output map consensus_map.clear(false); consensus_map.setExperimentType("labeled_MS2"); // create predicate for spectrum checking LOG_INFO << "Selecting scans with activation mode: " << (selected_activation_ == "" ? "any" : selected_activation_) << "\n"; HasActivationMethod<MSExperiment<Peak1D>::SpectrumType> activation_predicate(StringList::create(selected_activation_)); // now we have picked data // --> assign peaks to channels UInt64 element_index(0); // remember the current precusor spectrum MSExperiment<Peak1D>::ConstIterator prec_spec = ms_exp_data.end(); for (MSExperiment<Peak1D>::ConstIterator it = ms_exp_data.begin(); it != ms_exp_data.end(); ++it) { // remember the last MS1 spectra as we assume it to be the precursor spectrum if (it->getMSLevel() == 1) prec_spec = it; if (selected_activation_ == "" || activation_predicate(*it)) { // check if precursor is available if (it->getPrecursors().empty()) { throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, String("No precursor information given for scan native ID ") + it->getNativeID() + " with RT " + String(it->getRT())); } // check precursor constraints if (!isValidPrecursor_(it->getPrecursors()[0])) { LOG_DEBUG << "Skip spectrum " << it->getNativeID() << ": Precursor doesn't fulfill all constraints." << std::endl; continue; } // check precursor purity if we have a valid precursor .. 
if (prec_spec != ms_exp_data.end()) { const DoubleReal purity = computePrecursorPurity_(it, prec_spec); if (purity < min_precursor_purity_) { LOG_DEBUG << "Skip spectrum " << it->getNativeID() << ": Precursor purity is below the threshold. [purity = " << purity << "]" << std::endl; continue; } } else { LOG_INFO << "No precursor available for spectrum: " << it->getNativeID() << std::endl; } if (!(prec_spec == ms_exp_data.end()) && computePrecursorPurity_(it, prec_spec) < min_precursor_purity_) { LOG_DEBUG << "Skip spectrum " << it->getNativeID() << ": Precursor purity is below the threshold." << std::endl; continue; } // store RT&MZ of parent ion as centroid of ConsensusFeature ConsensusFeature cf; cf.setUniqueId(); cf.setRT(it->getRT()); cf.setMZ(it->getPrecursors()[0].getMZ()); Peak2D channel_value; channel_value.setRT(it->getRT()); // for each each channel UInt64 map_index = 0; Peak2D::IntensityType overall_intensity = 0; for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator cl_it = quant_method_->getChannelInformation().begin(); cl_it != quant_method_->getChannelInformation().end(); ++cl_it) { // set mz-position of channel channel_value.setMZ(cl_it->center); // reset intensity channel_value.setIntensity(0); // as every evaluation requires time, we cache the MZEnd iterator const MSExperiment<Peak1D>::SpectrumType::ConstIterator mz_end = it->MZEnd(cl_it->center + reporter_mass_shift_); // add up all signals for (MSExperiment<Peak1D>::SpectrumType::ConstIterator mz_it = it->MZBegin(cl_it->center - reporter_mass_shift_); mz_it != mz_end; ++mz_it) { channel_value.setIntensity(channel_value.getIntensity() + mz_it->getIntensity()); } // discard contribution of this channel as it is below the required intensity threshold if (channel_value.getIntensity() < min_reporter_intensity_) { channel_value.setIntensity(0); } overall_intensity += channel_value.getIntensity(); // add channel to ConsensusFeature cf.insert(map_index++, channel_value, element_index); } 
// ! channel_iterator // check if we keep this feature or if it contains low-intensity quantifications if (remove_low_intensity_quantifications_ && hasLowIntensityReporter_(cf)) { continue; } // check featureHandles are not empty if (overall_intensity == 0) { cf.setMetaValue("all_empty", String("true")); } cf.setIntensity(overall_intensity); consensus_map.push_back(cf); // the tandem-scan in the order they appear in the experiment ++element_index; } } // ! Experiment iterator /// add meta information to the map registerChannelsInOutputMap_(consensus_map); }
/// @brief extracts the iTRAQ channels from the MS data and stores intensity values in a consensus map
///
/// @param ms_exp_data Raw data to read
/// @param consensus_map Output each MS² scan as a consensus feature
/// @throws Exception::MissingInformation if no scans present or MS² scan has no precursor
void ItraqChannelExtractor::run(const MSExperiment<Peak1D>& ms_exp_data, ConsensusMap& consensus_map)
{
  if (ms_exp_data.empty())
  {
    LOG_WARN << "The given file does not contain any conventional peak data, but might" " contain chromatograms. This tool currently cannot handle them, sorry.";
    throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Experiment has no scans!");
  }

  MSExperiment<> ms_exp_MS2;
  String mode = (String) param_.getValue("select_activation");
  std::cout << "Selecting scans with activation mode: " << (mode == "" ? "any" : mode) << "\n";
  HasActivationMethod<MSExperiment<Peak1D>::SpectrumType> activation_predicate(ListUtils::create<String>(mode));

  // keep only MS2 scans, optionally filtered by activation mode
  for (size_t idx = 0; idx < ms_exp_data.size(); ++idx)
  {
    if (ms_exp_data[idx].getMSLevel() == 2)
    {
      if (mode == "" || activation_predicate(ms_exp_data[idx]))
      {
        // copy only MS² scans
        ms_exp_MS2.addSpectrum(ms_exp_data[idx]);
      }
      else
      {
        //std::cout << "deleting spectrum # " << idx << " with RT: " << ms_exp_data[idx].getRT() << "\n";
      }
    }
  }

#ifdef ITRAQ_DEBUG
  std::cout << "we have " << ms_exp_MS2.size() << " scans left of level " << ms_exp_MS2[0].getMSLevel() << std::endl;
  std::cout << "run: channel_map_ has " << channel_map_.size() << " entries!" << std::endl;
#endif

  consensus_map.clear(false);

  // set <mapList> header: one FileDescription per quantitation channel
  Int index_cnt = 0;
  for (ChannelMapType::const_iterator cm_it = channel_map_.begin(); cm_it != channel_map_.end(); ++cm_it)
  {
    // structure of Map cm_it
    //  first  == channel-name as Int e.g. 114
    //  second == ChannelInfo struct
    ConsensusMap::FileDescription channel_as_map;
    // label is the channel + description provided in the Params
    if (itraq_type_ != TMT_SIXPLEX) channel_as_map.label = "iTRAQ_" + String(cm_it->second.name) + "_" + String(cm_it->second.description);
    else channel_as_map.label = "TMT_" + String(cm_it->second.name) + "_" + String(cm_it->second.description);
    channel_as_map.size = ms_exp_MS2.size();
    //TODO what about .filename? leave empty?
    // add some more MetaInfo
    channel_as_map.setMetaValue("channel_name", cm_it->second.name);
    channel_as_map.setMetaValue("channel_id", cm_it->second.id);
    channel_as_map.setMetaValue("channel_description", cm_it->second.description);
    channel_as_map.setMetaValue("channel_center", cm_it->second.center);
    channel_as_map.setMetaValue("channel_active", String(cm_it->second.active ? "true" : "false"));
    consensus_map.getFileDescriptions()[index_cnt++] = channel_as_map;
  }

  // create consensusElements

  Peak2D::CoordinateType allowed_deviation = (Peak2D::CoordinateType) param_.getValue("reporter_mass_shift");
  // now we have picked data
  // --> assign peaks to channels
  UInt element_index(0);

  for (MSExperiment<>::ConstIterator it = ms_exp_MS2.begin(); it != ms_exp_MS2.end(); ++it)
  {
    // store RT&MZ of parent ion as centroid of ConsensusFeature
    ConsensusFeature cf;
    cf.setUniqueId();
    cf.setRT(it->getRT());
    if (it->getPrecursors().size() >= 1)
    {
      cf.setMZ(it->getPrecursors()[0].getMZ());
    }
    else
    {
      throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, String("No precursor information given for scan native ID ") + String(it->getNativeID()) + " with RT " + String(it->getRT()));
    }

    Peak2D channel_value;
    channel_value.setRT(it->getRT());
    // sum up reporter-ion intensity within +/- allowed_deviation for each channel
    Int index = 0;
    Peak2D::IntensityType overall_intensity = 0;
    for (ChannelMapType::const_iterator cm_it = channel_map_.begin(); cm_it != channel_map_.end(); ++cm_it)
    {
      // set mz-position of channel
      channel_value.setMZ(cm_it->second.center);
      // reset intensity
      channel_value.setIntensity(0);

      //add up all signals
      for (MSExperiment<>::SpectrumType::ConstIterator mz_it = it->MZBegin(cm_it->second.center - allowed_deviation)
          ; mz_it != it->MZEnd(cm_it->second.center + allowed_deviation)
          ; ++mz_it
          )
      {
        channel_value.setIntensity(channel_value.getIntensity() + mz_it->getIntensity());
      }

      overall_intensity += channel_value.getIntensity();

      // add channel to ConsensusFeature
      cf.insert(index++, channel_value, element_index);
    } // ! channel_iterator

    // check featureHandles are not empty
    if (overall_intensity == 0)
    {
      cf.setMetaValue("all_empty", String("true"));
    }
    cf.setIntensity(overall_intensity);
    consensus_map.push_back(cf);

    // the tandem-scan in the order they appear in the experiment
    ++element_index;
  } // ! Experiment iterator

#ifdef ITRAQ_DEBUG
  std::cout << "processed " << element_index << " scans" << std::endl;
#endif

  consensus_map.setExperimentType("itraq");

  return;
}
// Pairs features of two consensus maps using mutual nearest neighbors with a
// "stability" criterion: a pair is accepted only if, for both partners, the
// best distance is at least 'second_nearest_gap_' times smaller than the
// distance to the second-nearest neighbor. Unmatched features are passed
// through as singletons with quality 0.
void StablePairFinder::run(const std::vector<ConsensusMap>& input_maps, ConsensusMap& result_map)
{
  // empty output destination:
  result_map.clear(false);

  // sanity checks:
  if (input_maps.size() != 2)
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly two input maps required");
  }
  checkIds_(input_maps);

  // set up the distance functor:
  double max_intensity = max(input_maps[0].getMaxInt(), input_maps[1].getMaxInt());
  Param distance_params = param_.copy("");
  // these two are StablePairFinder-only parameters, unknown to FeatureDistance:
  distance_params.remove("use_identifications");
  distance_params.remove("second_nearest_gap");
  FeatureDistance feature_distance(max_intensity, false);
  feature_distance.setParameters(distance_params);

  // keep track of pairing:
  std::vector<bool> is_singleton[2];
  is_singleton[0].resize(input_maps[0].size(), true);
  is_singleton[1].resize(input_maps[1].size(), true);

  typedef pair<double, double> DoublePair;
  DoublePair init = make_pair(FeatureDistance::infinity, FeatureDistance::infinity);

  // for every element in map 0:
  // - index of nearest neighbor in map 1:
  vector<UInt> nn_index_0(input_maps[0].size(), UInt(-1));
  // - distances to nearest and second-nearest neighbors in map 1:
  vector<DoublePair> nn_distance_0(input_maps[0].size(), init);

  // for every element in map 1:
  // - index of nearest neighbor in map 0:
  vector<UInt> nn_index_1(input_maps[1].size(), UInt(-1));
  // - distances to nearest and second-nearest neighbors in map 0:
  vector<DoublePair> nn_distance_1(input_maps[1].size(), init);

  // iterate over all feature pairs, find nearest neighbors:
  // TODO: iterate over SENSIBLE RT (and m/z) window -- sort the maps beforehand
  // to save a lot of processing time...
  // Once done, remove the warning in the description of the 'use_identifications' parameter
  for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
  {
    const ConsensusFeature& feat0 = input_maps[0][fi0];
    for (UInt fi1 = 0; fi1 < input_maps[1].size(); ++fi1)
    {
      const ConsensusFeature& feat1 = input_maps[1][fi1];

      if (use_IDs_ && !compatibleIDs_(feat0, feat1)) // check peptide IDs
      {
        continue; // mismatch
      }

      pair<bool, double> result = feature_distance(feat0, feat1);
      double distance = result.second;
      // we only care if distance constraints are satisfied for "best
      // matches", not for second-best; this means that second-best distances
      // can become smaller than best distances
      // (e.g. the RT is larger than allowed (->invalid pair), but m/z is perfect and has the most weight --> better score!)
      bool valid = result.first;

      // update entries for map 0:
      if (distance < nn_distance_0[fi0].second)
      {
        if (valid && (distance < nn_distance_0[fi0].first))
        {
          // new nearest neighbor; old nearest becomes second-nearest
          nn_distance_0[fi0].second = nn_distance_0[fi0].first;
          nn_distance_0[fi0].first = distance;
          nn_index_0[fi0] = fi1;
        }
        else
        {
          nn_distance_0[fi0].second = distance;
        }
      }
      // update entries for map 1:
      if (distance < nn_distance_1[fi1].second)
      {
        if (valid && (distance < nn_distance_1[fi1].first))
        {
          nn_distance_1[fi1].second = nn_distance_1[fi1].first;
          nn_distance_1[fi1].first = distance;
          nn_index_1[fi1] = fi0;
        }
        else
        {
          nn_distance_1[fi1].second = distance;
        }
      }
    }
  }

  // if features from the two maps are nearest neighbors of each other, they
  // can become a pair:
  for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
  {
    UInt fi1 = nn_index_0[fi0]; // nearest neighbor of "fi0" in map 1

    // cout << "index: " << fi0 << ", RT: " << input_maps[0][fi0].getRT()
    //      << ", MZ: " << input_maps[0][fi0].getMZ() << endl
    //      << "neighbor: " << fi1 << ", RT: " << input_maps[1][fi1].getRT()
    //      << ", MZ: " << input_maps[1][fi1].getMZ() << endl
    //      << "d(i,j): " << nn_distance_0[fi0].first << endl
    //      << "d2(i): " << nn_distance_0[fi0].second << endl
    //      << "d2(j): " << nn_distance_1[fi1].second << endl;

    // criteria set by the parameters must be fulfilled:
    // (the first test also guards against fi1 == UInt(-1): "first" is only
    // below infinity if a valid neighbor was actually recorded)
    if ((nn_distance_0[fi0].first < FeatureDistance::infinity) && (nn_distance_0[fi0].first * second_nearest_gap_ <= nn_distance_0[fi0].second))
    {
      // "fi0" satisfies constraints...
      if ((nn_index_1[fi1] == fi0) && (nn_distance_1[fi1].first * second_nearest_gap_ <= nn_distance_1[fi1].second))
      {
        // ...nearest neighbor of "fi0" also satisfies constraints (yay!)
        // cout << "match!" << endl;
        result_map.push_back(ConsensusFeature());
        ConsensusFeature& f = result_map.back();
        f.insert(input_maps[0][fi0]);
        f.getPeptideIdentifications().insert(f.getPeptideIdentifications().end(), input_maps[0][fi0].getPeptideIdentifications().begin(), input_maps[0][fi0].getPeptideIdentifications().end());
        f.insert(input_maps[1][fi1]);
        f.getPeptideIdentifications().insert(f.getPeptideIdentifications().end(), input_maps[1][fi1].getPeptideIdentifications().begin(), input_maps[1][fi1].getPeptideIdentifications().end());
        f.computeConsensus();
        // base quality: product of (1 - distance) and both gap ratios
        double quality = 1.0 - nn_distance_0[fi0].first;
        double quality0 = 1.0 - nn_distance_0[fi0].first * second_nearest_gap_ / nn_distance_0[fi0].second;
        double quality1 = 1.0 - nn_distance_1[fi1].first * second_nearest_gap_ / nn_distance_1[fi1].second;
        quality = quality * quality0 * quality1; // TODO other formula?

        // incorporate existing quality values:
        Size size0 = max(input_maps[0][fi0].size(), size_t(1));
        Size size1 = max(input_maps[1][fi1].size(), size_t(1));
        // quality contribution from first map:
        quality0 = input_maps[0][fi0].getQuality() * (size0 - 1);
        // quality contribution from second map:
        quality1 = input_maps[1][fi1].getQuality() * (size1 - 1);
        f.setQuality((quality + quality0 + quality1) / (size0 + size1 - 1));

        is_singleton[0][fi0] = false;
        is_singleton[1][fi1] = false;
      }
    }
  }

  // write out unmatched consensus features
  for (UInt input = 0; input <= 1; ++input)
  {
    for (UInt index = 0; index < input_maps[input].size(); ++index)
    {
      if (is_singleton[input][index])
      {
        result_map.push_back(input_maps[input][index]);
        if (result_map.back().size() < 2) // singleton consensus feature
        {
          result_map.back().setQuality(0.0);
        }
      }
    }
  }

  // canonical ordering for checking the results, and the ids have no real meaning anyway
  result_map.sortByMZ();

  // protein IDs and unassigned peptide IDs are added to the result by the
  // FeatureGroupingAlgorithm!
}
// Finds light/heavy feature pairs within ONE input map: for each feature it
// searches a partner at RT + rt_pair_dist (within rt_dev_low/high) and
// m/z + mz_pair_dist/charge (within mz_dev). Optionally estimates the RT shift
// distribution from the data first (Gauss fit on a histogram of candidate RT
// distances). Greedy best-quality assignment resolves conflicting matches.
void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps, ConsensusMap& result_map)
{
  if (input_maps.size() != 1) throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly one input map required");
  if (result_map.getFileDescriptions().size() != 2) throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "two file descriptions required");
  if (result_map.getFileDescriptions().begin()->second.filename != result_map.getFileDescriptions().rbegin()->second.filename) throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the two file descriptions have to contain the same file name");
  checkIds_(input_maps);

  //look up the light and heavy index
  Size light_index = numeric_limits<Size>::max();
  Size heavy_index = numeric_limits<Size>::max();
  for (ConsensusMap::FileDescriptions::const_iterator it = result_map.getFileDescriptions().begin(); it != result_map.getFileDescriptions().end(); ++it)
  {
    if (it->second.label == "heavy")
    {
      heavy_index = it->first;
    }
    else if (it->second.label == "light")
    {
      light_index = it->first;
    }
  }
  if (light_index == numeric_limits<Size>::max() || heavy_index == numeric_limits<Size>::max())
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the input maps have to be labeled 'light' and 'heavy'");
  }

  result_map.clear(false);

  // sort consensus features by RT (and MZ) to speed up searching afterwards
  typedef ConstRefVector<ConsensusMap> RefMap;
  RefMap model_ref(input_maps[0].begin(), input_maps[0].end());
  model_ref.sortByPosition();

  //calculate matches
  ConsensusMap matches;
  //settings
  double rt_pair_dist = param_.getValue("rt_pair_dist");
  double rt_dev_low = param_.getValue("rt_dev_low");
  double rt_dev_high = param_.getValue("rt_dev_high");
  double mz_dev = param_.getValue("mz_dev");
  DoubleList mz_pair_dists = param_.getValue("mz_pair_dists");
  bool mrm = param_.getValue("mrm").toBool();

  //estimate RT parameters
  if (param_.getValue("rt_estimate") == "true")
  {
    //find all possible RT distances of features with the same charge and a good m/z distance
    vector<double> dists;
    dists.reserve(model_ref.size());
    for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
    {
      for (RefMap::const_iterator it2 = model_ref.begin(); it2 != model_ref.end(); ++it2)
      {
        for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
        {
          double mz_pair_dist = *dist_it;
          if (it2->getCharge() == it->getCharge()
             && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
             && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev)
          {
            dists.push_back(it2->getRT() - it->getRT());
          }
        }
      }
    }
    if (dists.empty())
    {
      cout << "Warning: Could not find pairs for RT distance estimation. The manual settings are used!" << endl;
    }
    else
    {
      if (dists.size() < 50)
      {
        cout << "Warning: Found only " << dists.size() << " pairs. The estimated shift and std deviation are probably not reliable!" << endl;
      }
      //--------------------------- estimate initial parameters of fit ---------------------------
      GaussFitter::GaussFitResult result(-1, -1, -1);
      //first estimate of the optimal shift: median of the distances
      sort(dists.begin(), dists.end());
      Size median_index = dists.size() / 2;
      result.x0 = dists[median_index];
      //create histogram of distances
      //consider only the maximum of pairs, centered around the optimal shift
      Size max_pairs = model_ref.size() / 2;
      // signed arithmetic guards against underflow when median_index < max_pairs/2
      Size start_index = (Size) max((SignedSize)0, (SignedSize)(median_index - max_pairs / 2));
      Size end_index = (Size) min((SignedSize)(dists.size() - 1), (SignedSize)(median_index + max_pairs / 2));
      double start_value = dists[start_index];
      double end_value = dists[end_index];
      double bin_step = fabs(end_value - start_value) / 99.999; //ensure that we have 100 bins
      Math::Histogram<> hist(start_value, end_value, bin_step);
      //std::cout << "HIST from " << start_value << " to " << end_value << " (bin size " << bin_step << ")" << endl;
      for (Size i = start_index; i <= end_index; ++i)
      {
        hist.inc(dists[i]);
      }
      //cout << hist << endl;
      dists.clear();
      //determine median of bins (uniform background distribution)
      vector<Size> bins(hist.begin(), hist.end());
      sort(bins.begin(), bins.end());
      Size bin_median = bins[bins.size() / 2];
      bins.clear();
      //estimate scale A: maximum of the histogram
      Size max_value = hist.maxValue();
      result.A = max_value - bin_median;
      //overwrite estimate of x0 with the position of the highest bin
      for (Size i = 0; i < hist.size(); ++i)
      {
        if (hist[i] == max_value)
        {
          result.x0 = hist.centerOfBin(i);
          break;
        }
      }
      //estimate sigma: first time the count is less or equal the median count in the histogram
      double pos = result.x0;
      while (pos > start_value && hist.binValue(pos) > bin_median)
      {
        pos -= bin_step;
      }
      double sigma_low = result.x0 - pos;
      pos = result.x0;
      while (pos < end_value && hist.binValue(pos) > bin_median)
      {
        pos += bin_step;
      }
      double sigma_high = pos - result.x0;
      // (sigma_low + sigma_high) spans ~3 sigma on each side -> divide by 6
      result.sigma = (sigma_high + sigma_low) / 6.0;
      //cout << "estimated optimal RT distance (before fit): " << result.x0 << endl;
      //cout << "estimated allowed deviation (before fit): " << result.sigma*3.0 << endl;
      //--------------------------- do gauss fit ---------------------------
      vector<DPosition<2> > points(hist.size());
      for (Size i = 0; i < hist.size(); ++i)
      {
        points[i][0] = hist.centerOfBin(i);
        points[i][1] = max(0u, hist[i]);
      }
      GaussFitter fitter;
      fitter.setInitialParameters(result);
      result = fitter.fit(points);
      cout << "estimated optimal RT distance: " << result.x0 << endl;
      cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl;
      rt_pair_dist = result.x0;
      rt_dev_low = fabs(result.sigma) * 3.0;
      rt_dev_high = fabs(result.sigma) * 3.0;
    }
  }

  // check each feature for possible heavy partners in the expected RT window
  for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
  {
    for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
    {
      double mz_pair_dist = *dist_it;
      // binary search for the first candidate in the RT window (map is sorted)
      RefMap::const_iterator it2 = lower_bound(model_ref.begin(), model_ref.end(), it->getRT() + rt_pair_dist - rt_dev_low, ConsensusFeature::RTLess());
      while (it2 != model_ref.end() && it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high)
      {
        // if in mrm mode, we need to compare precursor mass difference and fragment mass difference, charge remains the same
        double prec_mz_diff(0);
        if (mrm)
        {
          prec_mz_diff = fabs((double)it2->getMetaValue("MZ") - (double)it->getMetaValue("MZ"));
          if (it->getCharge() != 0)
          {
            prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge());
          }
          else
          {
            prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist);
          }
        }

        bool mrm_correct_dist(false);
        double frag_mz_diff = fabs(it->getMZ() - it2->getMZ());

        //cerr << it->getRT() << " charge1=" << it->getCharge() << ", charge2=" << it2->getCharge() << ", prec_diff=" << prec_mz_diff << ", frag_diff=" << frag_mz_diff << endl;

        if (mrm && it2->getCharge() == it->getCharge() && prec_mz_diff < mz_dev && (frag_mz_diff < mz_dev || fabs(frag_mz_diff - mz_pair_dist) < mz_dev))
        {
          mrm_correct_dist = true;
          //cerr << "mrm_correct_dist" << endl;
        }

        if ((mrm && mrm_correct_dist) || (!mrm
                                         && it2->getCharge() == it->getCharge()
                                         && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
                                         && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev
                                          ))
        {
          //cerr << "dist correct" << endl;
          double score = sqrt(
            PValue_(it2->getMZ() - it->getMZ(), mz_pair_dist / it->getCharge(), mz_dev, mz_dev)
            * PValue_(it2->getRT() - it->getRT(), rt_pair_dist, rt_dev_low, rt_dev_high)
            );

          // Note: we used to copy the id from the light feature here, but that strategy does not generalize to more than two labels.
          // We might want to report consensus features where the light one is missing but more than one heavier variant was found.
          // Also, the old strategy is inconsistent with what was done in the unlabeled case. Thus now we assign a new unique id here.
          matches.push_back(ConsensusFeature());
          matches.back().setUniqueId();

          matches.back().insert(light_index, *it);
          matches.back().clearMetaInfo();
          matches.back().insert(heavy_index, *it2);
          matches.back().setQuality(score);
          matches.back().setCharge(it->getCharge());
          matches.back().computeMonoisotopicConsensus();
        }
        ++it2;
      }
    }
  }

  //compute best pairs
  // - sort matches by quality
  // - take highest-quality matches first (greedy) and mark them as used
  set<Size> used_features;
  matches.sortByQuality(true);
  for (ConsensusMap::const_iterator match = matches.begin(); match != matches.end(); ++match)
  {
    //check if features are not used yet
    if (used_features.find(match->begin()->getUniqueId()) == used_features.end()
       && used_features.find(match->rbegin()->getUniqueId()) == used_features.end()
        )
    {
      //if unused, add it to the final set of elements
      result_map.push_back(*match);
      used_features.insert(match->begin()->getUniqueId());
      used_features.insert(match->rbegin()->getUniqueId());
    }
  }

  //Add protein identifications to result map
  for (Size i = 0; i < input_maps.size(); ++i)
  {
    result_map.getProteinIdentifications().insert(result_map.getProteinIdentifications().end(), input_maps[i].getProteinIdentifications().begin(), input_maps[i].getProteinIdentifications().end());
  }

  //Add unassigned peptide identifications to result map
  for (Size i = 0; i < input_maps.size(); ++i)
  {
    result_map.getUnassignedPeptideIdentifications().insert(result_map.getUnassignedPeptideIdentifications().end(), input_maps[i].getUnassignedPeptideIdentifications().begin(), input_maps[i].getUnassignedPeptideIdentifications().end());
  }

  // Very useful for checking the results, and the ids have no real meaning anyway
  result_map.sortByMZ();
}
void EDTAFile::load(const String& filename, ConsensusMap& consensus_map)
{
  // Parse an EDTA text file into a ConsensusMap.
  //
  // The layout is auto-detected from the first line:
  //   - 3 columns:  RT, m/z, intensity                  (TYPE_OLD_NOCHARGE)
  //   - 4 columns:  RT, m/z, intensity, charge          (TYPE_OLD_CHARGE)
  //   - >=5 columns whose 5th header is "RT1":          (TYPE_CONSENSUS)
  //     groups of four columns (RT, m/z, intensity, charge) per sub-feature,
  //     any trailing columns are treated as meta data.
  // The separator (tab, space or comma) is likewise auto-detected.
  //
  // @param filename       path of the EDTA file to read
  // @param consensus_map  output map; reset here and filled with one
  //                       ConsensusFeature per non-empty data line
  // @throws Exception::ParseError on malformed input (too few columns,
  //         non-numeric values, missing header where one is required, or
  //         missing meta-data column headers)

  // load input
  TextFile input(filename);
  TextFile::ConstIterator input_it = input.begin();

  // reset map
  consensus_map = ConsensusMap();
  consensus_map.setUniqueId();

  // detect the column separator from the first line (tab > space > comma)
  char separator = ' ';
  if (input_it->hasSubstring("\t")) separator = '\t';
  else if (input_it->hasSubstring(" ")) separator = ' ';
  else if (input_it->hasSubstring(",")) separator = ',';

  // parsing header line
  std::vector<String> headers;
  input_it->split(separator, headers);
  int offset = 0; // becomes 1 if the first line turns out to be a header line
  for (Size i = 0; i < headers.size(); ++i)
  {
    headers[i].trim();
  }
  String header_trimmed = *input.begin();
  header_trimmed.trim();

  enum
  {
    TYPE_UNDEFINED,
    TYPE_OLD_NOCHARGE,
    TYPE_OLD_CHARGE,
    TYPE_CONSENSUS
  }
  input_type = TYPE_UNDEFINED;
  Size input_features = 1; // number of 4-column feature groups per line

  double rt = 0.0;
  double mz = 0.0;
  double it = 0.0;
  Int ch = 0;

  // classify the file by its column count (>=5 is refined further below)
  if (headers.size() <= 2)
  {
    throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "", String("Failed parsing in line 1: not enough columns! Expected at least 3 columns!\nOffending line: '") + header_trimmed + "' (line 1)\n");
  }
  else if (headers.size() == 3) input_type = TYPE_OLD_NOCHARGE;
  else if (headers.size() == 4) input_type = TYPE_OLD_CHARGE;

  // see if we have a header
  try
  {
    // try to convert... if not: thats a header
    rt = headers[0].toDouble();
    mz = headers[1].toDouble();
    it = headers[2].toDouble();
  }
  catch (Exception::BaseException&)
  {
    offset = 1;
    ++input_it; // skip the header line when parsing features below
    LOG_INFO << "Detected a header line.\n";
  }

  // with >=5 columns, "RT1" in the 5th header marks consensus style
  if (headers.size() >= 5)
  {
    if (String(headers[4].trim()).toUpper() == "RT1") input_type = TYPE_CONSENSUS;
    else input_type = TYPE_OLD_CHARGE;
  }
  if (input_type == TYPE_CONSENSUS)
  {
    // Every consensus style line includes features with four columns.
    // The remainder is meta data
    input_features = headers.size() / 4;
  }

  // charge/consensus layouts are ambiguous without a header line
  if (offset == 0 && (input_type == TYPE_OLD_CHARGE || input_type == TYPE_CONSENSUS))
  {
    throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "", String("Failed parsing in line 1: No HEADER provided. This is only allowed for three columns. You have more!\nOffending line: '") + header_trimmed + "' (line 1)\n");
  }

  SignedSize input_size = input.end() - input.begin();

  // preliminary file description (size = number of data lines)
  ConsensusMap::FileDescription desc;
  desc.filename = filename;
  desc.size = (input_size) - offset;
  consensus_map.getFileDescriptions()[0] = desc;

  // parsing features
  consensus_map.reserve(input_size);

  for (; input_it != input.end(); ++input_it)
  {
    //do nothing for empty lines
    String line_trimmed = *input_it;
    line_trimmed.trim();
    if (line_trimmed == "")
    {
      // only warn for empty lines that are not the trailing final line
      if ((input_it - input.begin()) < input_size - 1) LOG_WARN << "Notice: Empty line ignored (line " << ((input_it - input.begin()) + 1) << ").";
      continue;
    }

    //split line to tokens
    std::vector<String> parts;
    input_it->split(separator, parts);

    //abort if line does not contain enough fields
    if (parts.size() < 3)
    {
      throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "", String("Failed parsing in line ") + String((input_it - input.begin()) + 1) + ": At least three columns are needed! (got " + String(parts.size()) + ")\nOffending line: '" + line_trimmed + "' (line " + String((input_it - input.begin()) + 1) + ")\n");
    }

    // the consensus (centroid) feature built from the first four columns
    ConsensusFeature cf;
    cf.setUniqueId();

    try
    {
      // Convert values. Will return -1 if not available.
      rt = checkedToDouble_(parts, 0);
      mz = checkedToDouble_(parts, 1);
      it = checkedToDouble_(parts, 2);
      ch = checkedToInt_(parts, 3);
      cf.setRT(rt);
      cf.setMZ(mz);
      cf.setIntensity(it);
      // charge column only exists for the charge-aware layouts
      if (input_type != TYPE_OLD_NOCHARGE) cf.setCharge(ch);
    }
    catch (Exception::BaseException&)
    {
      throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "", String("Failed parsing in line ") + String((input_it - input.begin()) + 1) + ": Could not convert the first three columns to a number!\nOffending line: '" + line_trimmed + "' (line " + String((input_it - input.begin()) + 1) + ")\n");
    }

    // Check all features in one line
    // (consensus style: every further group of four columns is one sub-feature)
    for (Size j = 1; j < input_features; ++j)
    {
      try
      {
        Feature f;
        f.setUniqueId();
        // Convert values. Will return -1 if not available.
        rt = checkedToDouble_(parts, j * 4 + 0);
        mz = checkedToDouble_(parts, j * 4 + 1);
        it = checkedToDouble_(parts, j * 4 + 2);
        ch = checkedToInt_(parts, j * 4 + 3);

        // Only accept features with at least RT and MZ set
        if (rt != -1 && mz != -1)
        {
          f.setRT(rt);
          f.setMZ(mz);
          f.setIntensity(it);
          f.setCharge(ch);
          cf.insert(j - 1, f); // map index of sub-feature j is j-1
        }
      }
      catch (Exception::BaseException&)
      {
        throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "", String("Failed parsing in line ") + String((input_it - input.begin()) + 1) + ": Could not convert one of the four sub-feature columns (starting at column " + (j * 4 + 1) + ") to a number! Is the correct separator specified?\nOffending line: '" + line_trimmed + "' (line " + String((input_it - input.begin()) + 1) + ")\n");
      }
    }

    //parse meta data
    // (every column beyond the feature groups; requires a named header column)
    for (Size j = input_features * 4; j < parts.size(); ++j)
    {
      String part_trimmed = parts[j];
      part_trimmed.trim();
      if (part_trimmed != "")
      {
        //check if column name is ok
        if (headers.size() <= j || headers[j] == "")
        {
          throw Exception::ParseError(__FILE__, __LINE__, __PRETTY_FUNCTION__, "", String("Error: Missing meta data header for column ") + (j + 1) + "!" + String("Offending header line: '") + header_trimmed + "' (line 1)");
        }
        //add meta value
        cf.setMetaValue(headers[j], part_trimmed);
      }
    }

    //insert feature to map
    consensus_map.push_back(cf);
  }

  // register FileDescriptions
  // one map entry per sub-feature map (overwrites the preliminary desc above)
  ConsensusMap::FileDescription fd;
  fd.filename = filename;
  fd.size = consensus_map.size();
  Size maps = std::max(input_features - 1, Size(1)); // its either a simple feature or a consensus map
  // (in this case the 'input_features' includes the centroid, which we do not count)
  for (Size i = 0; i < maps; ++i)
  {
    fd.label = String("EDTA_Map ") + String(i);
    consensus_map.getFileDescriptions()[i] = fd;
  }
}
void QTClusterFinder::run_(const vector<MapType> & input_maps, ConsensusMap & result_map) { num_maps_ = input_maps.size(); if (num_maps_ < 2) { throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "At least two input maps required"); } // set up the distance functor (and set other parameters): DoubleReal max_intensity = input_maps[0].getMaxInt(); DoubleReal max_mz = input_maps[0].getMax()[1]; for (Size map_index = 1; map_index < num_maps_; ++map_index) { max_intensity = max(max_intensity, input_maps[map_index].getMaxInt()); max_mz = max(max_mz, input_maps[map_index].getMax()[0]); } setParameters_(max_intensity, max_mz); // create the hash grid and fill it with features: //cout << "Hashing..." << endl; list<GridFeature> grid_features; Grid grid(Grid::ClusterCenter(max_diff_rt_, max_diff_mz_)); for (Size map_index = 0; map_index < num_maps_; ++map_index) { for (Size feature_index = 0; feature_index < input_maps[map_index].size(); ++feature_index) { grid_features.push_back( GridFeature(input_maps[map_index][feature_index], map_index, feature_index)); GridFeature & gfeature = grid_features.back(); // sort peptide hits once now, instead of multiple times later: BaseFeature & feature = const_cast<BaseFeature &>( grid_features.back().getFeature()); for (vector<PeptideIdentification>::iterator pep_it = feature.getPeptideIdentifications().begin(); pep_it != feature.getPeptideIdentifications().end(); ++pep_it) { pep_it->sort(); } grid.insert(std::make_pair(Grid::ClusterCenter(gfeature.getRT(), gfeature.getMZ()), &gfeature)); } } // compute QT clustering: //cout << "Clustering..." 
<< endl; list<QTCluster> clustering; computeClustering_(grid, clustering); // number of clusters == number of data points: Size size = clustering.size(); // Create a temporary map where we store which GridFeatures are next to which Clusters OpenMSBoost::unordered_map<GridFeature *, std::vector< QTCluster * > > element_mapping; for (list<QTCluster>::iterator it = clustering.begin(); it != clustering.end(); ++it) { OpenMSBoost::unordered_map<Size, GridFeature *> elements; typedef std::multimap<DoubleReal, GridFeature *> InnerNeighborMap; typedef OpenMSBoost::unordered_map<Size, InnerNeighborMap > NeighborMap; NeighborMap neigh = it->getNeighbors(); for (NeighborMap::iterator n_it = neigh.begin(); n_it != neigh.end(); ++n_it) { for (InnerNeighborMap::iterator i_it = n_it->second.begin(); i_it != n_it->second.end(); ++i_it) { element_mapping[i_it->second].push_back( &(*it) ); } } } ProgressLogger logger; logger.setLogType(ProgressLogger::CMD); logger.startProgress(0, size, "linking features"); Size progress = 0; result_map.clear(false); while (!clustering.empty()) { // cout << "Clusters: " << clustering.size() << endl; ConsensusFeature consensus_feature; makeConsensusFeature_(clustering, consensus_feature, element_mapping); if (!clustering.empty()) { result_map.push_back(consensus_feature); } logger.setProgress(progress++); } logger.endProgress(); }