void StablePairFinder::run(const std::vector<ConsensusMap>& input_maps,
                           ConsensusMap& result_map)
{
  // empty output destination:
  result_map.clear(false);

  // sanity checks:
  if (input_maps.size() != 2)
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                     "exactly two input maps required");
  }
  checkIds_(input_maps);

  // set up the distance functor:
  double max_intensity = max(input_maps[0].getMaxInt(),
                             input_maps[1].getMaxInt());
  Param distance_params = param_.copy("");
  distance_params.remove("use_identifications");
  distance_params.remove("second_nearest_gap");
  FeatureDistance feature_distance(max_intensity, false);
  feature_distance.setParameters(distance_params);

  // keep track of pairing:
  std::vector<bool> is_singleton[2];
  is_singleton[0].resize(input_maps[0].size(), true);
  is_singleton[1].resize(input_maps[1].size(), true);

  typedef pair<double, double> DoublePair;
  DoublePair init = make_pair(FeatureDistance::infinity,
                              FeatureDistance::infinity);

  // for every element in map 0:
  // - index of nearest neighbor in map 1:
  vector<UInt> nn_index_0(input_maps[0].size(), UInt(-1));
  // - distances to nearest and second-nearest neighbors in map 1:
  vector<DoublePair> nn_distance_0(input_maps[0].size(), init);

  // for every element in map 1:
  // - index of nearest neighbor in map 0:
  vector<UInt> nn_index_1(input_maps[1].size(), UInt(-1));
  // - distances to nearest and second-nearest neighbors in map 0:
  vector<DoublePair> nn_distance_1(input_maps[1].size(), init);

  // iterate over all feature pairs, find nearest neighbors:
  // TODO: iterate over a SENSIBLE RT (and m/z) window -- sort the maps
  // beforehand to save a lot of processing time. Once done, remove the
  // warning in the description of the 'use_identifications' parameter.
  for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
  {
    const ConsensusFeature& feat0 = input_maps[0][fi0];
    for (UInt fi1 = 0; fi1 < input_maps[1].size(); ++fi1)
    {
      const ConsensusFeature& feat1 = input_maps[1][fi1];
      if (use_IDs_ && !compatibleIDs_(feat0, feat1)) // check peptide IDs
      {
        continue; // mismatch
      }
      pair<bool, double> result = feature_distance(feat0, feat1);
      double distance = result.second;
      // We only care whether the distance constraints are satisfied for
      // "best matches", not for second-best; this means that second-best
      // distances can become smaller than best distances (e.g. the RT
      // deviation is larger than allowed -> invalid pair, but m/z is
      // perfect and has the most weight -> better score!).
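      // Worked example (hypothetical numbers, for illustration only):
      // suppose the current best valid distance for feat0 is 0.4. A new
      // candidate at distance 0.2 that violates the RT constraint is not a
      // valid pair, so it only tightens the second-nearest distance to 0.2
      // -- which is now smaller than the best distance of 0.4 and makes
      // the gap criterion below harder to satisfy.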
      bool valid = result.first;

      // update entries for map 0:
      if (distance < nn_distance_0[fi0].second)
      {
        if (valid && (distance < nn_distance_0[fi0].first))
        {
          nn_distance_0[fi0].second = nn_distance_0[fi0].first;
          nn_distance_0[fi0].first = distance;
          nn_index_0[fi0] = fi1;
        }
        else
        {
          nn_distance_0[fi0].second = distance;
        }
      }

      // update entries for map 1:
      if (distance < nn_distance_1[fi1].second)
      {
        if (valid && (distance < nn_distance_1[fi1].first))
        {
          nn_distance_1[fi1].second = nn_distance_1[fi1].first;
          nn_distance_1[fi1].first = distance;
          nn_index_1[fi1] = fi0;
        }
        else
        {
          nn_distance_1[fi1].second = distance;
        }
      }
    }
  }

  // if features from the two maps are nearest neighbors of each other, they
  // can become a pair:
  for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
  {
    UInt fi1 = nn_index_0[fi0]; // nearest neighbor of "fi0" in map 1

    // cout << "index: " << fi0 << ", RT: " << input_maps[0][fi0].getRT()
    //      << ", MZ: " << input_maps[0][fi0].getMZ() << endl
    //      << "neighbor: " << fi1 << ", RT: " << input_maps[1][fi1].getRT()
    //      << ", MZ: " << input_maps[1][fi1].getMZ() << endl
    //      << "d(i,j): " << nn_distance_0[fi0].first << endl
    //      << "d2(i): " << nn_distance_0[fi0].second << endl
    //      << "d2(j): " << nn_distance_1[fi1].second << endl;

    // criteria set by the parameters must be fulfilled:
    if ((nn_distance_0[fi0].first < FeatureDistance::infinity) &&
        (nn_distance_0[fi0].first * second_nearest_gap_ <=
         nn_distance_0[fi0].second))
    {
      // "fi0" satisfies constraints...
      if ((nn_index_1[fi1] == fi0) &&
          (nn_distance_1[fi1].first * second_nearest_gap_ <=
           nn_distance_1[fi1].second))
      {
        // ...nearest neighbor of "fi0" also satisfies constraints (yay!)
        // cout << "match!" << endl;
        result_map.push_back(ConsensusFeature());
        ConsensusFeature& f = result_map.back();

        f.insert(input_maps[0][fi0]);
        f.getPeptideIdentifications().insert(
          f.getPeptideIdentifications().end(),
          input_maps[0][fi0].getPeptideIdentifications().begin(),
          input_maps[0][fi0].getPeptideIdentifications().end());

        f.insert(input_maps[1][fi1]);
        f.getPeptideIdentifications().insert(
          f.getPeptideIdentifications().end(),
          input_maps[1][fi1].getPeptideIdentifications().begin(),
          input_maps[1][fi1].getPeptideIdentifications().end());

        f.computeConsensus();
        double quality = 1.0 - nn_distance_0[fi0].first;
        double quality0 = 1.0 - nn_distance_0[fi0].first *
          second_nearest_gap_ / nn_distance_0[fi0].second;
        double quality1 = 1.0 - nn_distance_1[fi1].first *
          second_nearest_gap_ / nn_distance_1[fi1].second;
        quality = quality * quality0 * quality1; // TODO other formula?
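        // Worked example (hypothetical numbers, for illustration only):
        // with second_nearest_gap_ = 2, best distances of 0.1 in both
        // directions and second-nearest distances of 0.5, this yields
        //   quality = (1 - 0.1) * (1 - 0.1*2/0.5) * (1 - 0.1*2/0.5)
        //           = 0.9 * 0.6 * 0.6 = 0.324,
        // i.e. the score rewards both a small absolute distance and a
        // clear margin to the runner-up candidates.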
        // incorporate existing quality values:
        Size size0 = max(input_maps[0][fi0].size(), size_t(1));
        Size size1 = max(input_maps[1][fi1].size(), size_t(1));
        // quality contribution from the first map:
        quality0 = input_maps[0][fi0].getQuality() * (size0 - 1);
        // quality contribution from the second map:
        quality1 = input_maps[1][fi1].getQuality() * (size1 - 1);
        f.setQuality((quality + quality0 + quality1) /
                     (size0 + size1 - 1));

        is_singleton[0][fi0] = false;
        is_singleton[1][fi1] = false;
      }
    }
  }

  // write out unmatched consensus features:
  for (UInt input = 0; input <= 1; ++input)
  {
    for (UInt index = 0; index < input_maps[input].size(); ++index)
    {
      if (is_singleton[input][index])
      {
        result_map.push_back(input_maps[input][index]);
        if (result_map.back().size() < 2) // singleton consensus feature
        {
          result_map.back().setQuality(0.0);
        }
      }
    }
  }

  // canonical ordering for checking the results; the IDs have no real
  // meaning anyway
  result_map.sortByMZ();

  // protein IDs and unassigned peptide IDs are added to the result by the
  // FeatureGroupingAlgorithm!
}
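// Usage sketch (not part of the original source; a minimal illustration of
// how this finder might be driven, assuming two populated ConsensusMap
// instances 'map_a' and 'map_b'):
//
//   std::vector<ConsensusMap> inputs;
//   inputs.push_back(map_a);
//   inputs.push_back(map_b);
//   ConsensusMap result;
//   StablePairFinder finder;
//   finder.run(inputs, result); // throws IllegalArgument unless exactly two maps are given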
void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps,
                            ConsensusMap& result_map)
{
  if (input_maps.size() != 1)
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                     "exactly one input map required");
  }
  if (result_map.getFileDescriptions().size() != 2)
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                     "two file descriptions required");
  }
  if (result_map.getFileDescriptions().begin()->second.filename !=
      result_map.getFileDescriptions().rbegin()->second.filename)
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                     "the two file descriptions have to contain the same file name");
  }
  checkIds_(input_maps);

  // look up the light and heavy index:
  Size light_index = numeric_limits<Size>::max();
  Size heavy_index = numeric_limits<Size>::max();
  for (ConsensusMap::FileDescriptions::const_iterator it =
         result_map.getFileDescriptions().begin();
       it != result_map.getFileDescriptions().end(); ++it)
  {
    if (it->second.label == "heavy")
    {
      heavy_index = it->first;
    }
    else if (it->second.label == "light")
    {
      light_index = it->first;
    }
  }
  if (light_index == numeric_limits<Size>::max() ||
      heavy_index == numeric_limits<Size>::max())
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                     "the input maps have to be labeled 'light' and 'heavy'");
  }

  result_map.clear(false);

  // sort consensus features by RT (and m/z) to speed up searching afterwards:
  typedef ConstRefVector<ConsensusMap> RefMap;
  RefMap model_ref(input_maps[0].begin(), input_maps[0].end());
  model_ref.sortByPosition();

  // calculate matches:
  ConsensusMap matches;

  // settings:
  double rt_pair_dist = param_.getValue("rt_pair_dist");
  double rt_dev_low = param_.getValue("rt_dev_low");
  double rt_dev_high = param_.getValue("rt_dev_high");
  double mz_dev = param_.getValue("mz_dev");
  DoubleList mz_pair_dists = param_.getValue("mz_pair_dists");
  bool mrm = param_.getValue("mrm").toBool();

  // estimate RT parameters:
  if (param_.getValue("rt_estimate") == "true")
  {
    // find all possible RT distances of features with the same charge and a
    // good m/z distance:
    vector<double> dists;
    dists.reserve(model_ref.size());
    for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end();
         ++it)
    {
      for (RefMap::const_iterator it2 = model_ref.begin();
           it2 != model_ref.end(); ++it2)
      {
        for (DoubleList::const_iterator dist_it = mz_pair_dists.begin();
             dist_it != mz_pair_dists.end(); ++dist_it)
        {
          double mz_pair_dist = *dist_it;
          if (it2->getCharge() == it->getCharge() &&
              it2->getMZ() >=
                it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev &&
              it2->getMZ() <=
                it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev)
          {
            dists.push_back(it2->getRT() - it->getRT());
          }
        }
      }
    }
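    // Worked example of the m/z window above (hypothetical numbers, for
    // illustration only): for a heavy label adding 8.0142 Da (e.g.
    // 13C6/15N2-lysine) and a light partner at charge 2, the heavy feature
    // is expected at an m/z shifted by 8.0142 / 2 = 4.0071 Th; any feature
    // within +/- mz_dev of that position counts as a candidate pair.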
    if (dists.empty())
    {
      cout << "Warning: Could not find pairs for RT distance estimation. "
           << "The manual settings are used!" << endl;
    }
    else
    {
      if (dists.size() < 50)
      {
        cout << "Warning: Found only " << dists.size()
             << " pairs. The estimated shift and standard deviation are "
             << "probably not reliable!" << endl;
      }

      //------------- estimate initial parameters of the fit -------------
      GaussFitter::GaussFitResult result(-1, -1, -1);
      // first estimate of the optimal shift: median of the distances
      sort(dists.begin(), dists.end());
      Size median_index = dists.size() / 2;
      result.x0 = dists[median_index];

      // create a histogram of the distances, considering only the maximum
      // number of pairs, centered around the optimal shift:
      Size max_pairs = model_ref.size() / 2;
      Size start_index = (Size) max((SignedSize)0,
                                    (SignedSize)(median_index - max_pairs / 2));
      Size end_index = (Size) min((SignedSize)(dists.size() - 1),
                                  (SignedSize)(median_index + max_pairs / 2));
      double start_value = dists[start_index];
      double end_value = dists[end_index];
      double bin_step = fabs(end_value - start_value) / 99.999; // ensure that we have 100 bins
      Math::Histogram<> hist(start_value, end_value, bin_step);
      // std::cout << "HIST from " << start_value << " to " << end_value
      //           << " (bin size " << bin_step << ")" << endl;
      for (Size i = start_index; i <= end_index; ++i)
      {
        hist.inc(dists[i]);
      }
      // cout << hist << endl;
      dists.clear();

      // determine the median of the bins (uniform background distribution):
      vector<Size> bins(hist.begin(), hist.end());
      sort(bins.begin(), bins.end());
      Size bin_median = bins[bins.size() / 2];
      bins.clear();

      // estimate scale A: maximum of the histogram
      Size max_value = hist.maxValue();
      result.A = max_value - bin_median;

      // overwrite the estimate of x0 with the position of the highest bin:
      for (Size i = 0; i < hist.size(); ++i)
      {
        if (hist[i] == max_value)
        {
          result.x0 = hist.centerOfBin(i);
          break;
        }
      }

      // estimate sigma: distance to where the count first drops to (or
      // below) the median count of the histogram
      double pos = result.x0;
      while (pos > start_value && hist.binValue(pos) > bin_median)
      {
        pos -= bin_step;
      }
      double sigma_low = result.x0 - pos;
      pos = result.x0;
      while (pos < end_value && hist.binValue(pos) > bin_median)
      {
        pos += bin_step;
      }
      double sigma_high = pos - result.x0;
      result.sigma = (sigma_high + sigma_low) / 6.0;
      // cout << "estimated optimal RT distance (before fit): " << result.x0 << endl;
      // cout << "estimated allowed deviation (before fit): " << result.sigma * 3.0 << endl;

      //------------------------- do the Gauss fit -------------------------
      vector<DPosition<2> > points(hist.size());
      for (Size i = 0; i < hist.size(); ++i)
      {
        points[i][0] = hist.centerOfBin(i);
        points[i][1] = max(0u, hist[i]);
      }
      GaussFitter fitter;
      fitter.setInitialParameters(result);
      result = fitter.fit(points);
      cout << "estimated optimal RT distance: " << result.x0 << endl;
      cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl;
      rt_pair_dist = result.x0;
      rt_dev_low = fabs(result.sigma) * 3.0;
      rt_dev_high = fabs(result.sigma) * 3.0;
    }
  }
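  // Note on the estimate above (editorial remark, one plausible reading):
  // walking out from the peak until the counts drop to the background level
  // spans roughly the region where the Gaussian exceeds the background, so
  // sigma_low + sigma_high approximates the full visible width of the peak
  // (taken here as about 6 sigma, i.e. +/- 3 sigma), hence the division by
  // 6. Consistently, the allowed RT deviation is then set to 3 * |sigma| of
  // the fitted Gaussian A * exp(-(x - x0)^2 / (2 * sigma^2)).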
  // check each feature:
  for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end();
       ++it)
  {
    for (DoubleList::const_iterator dist_it = mz_pair_dists.begin();
         dist_it != mz_pair_dists.end(); ++dist_it)
    {
      double mz_pair_dist = *dist_it;
      RefMap::const_iterator it2 =
        lower_bound(model_ref.begin(), model_ref.end(),
                    it->getRT() + rt_pair_dist - rt_dev_low,
                    ConsensusFeature::RTLess());
      while (it2 != model_ref.end() &&
             it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high)
      {
        // in MRM mode we need to compare the precursor mass difference and
        // the fragment mass difference; the charge remains the same:
        double prec_mz_diff(0);
        if (mrm)
        {
          prec_mz_diff = fabs((double)it2->getMetaValue("MZ") -
                              (double)it->getMetaValue("MZ"));
          if (it->getCharge() != 0)
          {
            prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge());
          }
          else
          {
            prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist);
          }
        }

        bool mrm_correct_dist(false);
        double frag_mz_diff = fabs(it->getMZ() - it2->getMZ());
        // cerr << it->getRT() << " charge1=" << it->getCharge()
        //      << ", charge2=" << it2->getCharge()
        //      << ", prec_diff=" << prec_mz_diff
        //      << ", frag_diff=" << frag_mz_diff << endl;
        if (mrm && it2->getCharge() == it->getCharge() &&
            prec_mz_diff < mz_dev &&
            (frag_mz_diff < mz_dev ||
             fabs(frag_mz_diff - mz_pair_dist) < mz_dev))
        {
          mrm_correct_dist = true;
          // cerr << "mrm_correct_dist" << endl;
        }

        if ((mrm && mrm_correct_dist) ||
            (!mrm && it2->getCharge() == it->getCharge() &&
             it2->getMZ() >=
               it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev &&
             it2->getMZ() <=
               it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev))
        {
          // cerr << "dist correct" << endl;
          double score = sqrt(
            PValue_(it2->getMZ() - it->getMZ(),
                    mz_pair_dist / it->getCharge(), mz_dev, mz_dev) *
            PValue_(it2->getRT() - it->getRT(),
                    rt_pair_dist, rt_dev_low, rt_dev_high));

          // Note: we used to copy the ID from the light feature here, but
          // that strategy does not generalize to more than two labels.
          // We might want to report consensus features where the light one
          // is missing, but more than one heavier variant was found.
          // Also, the old strategy is inconsistent with what was done in
          // the unlabeled case. Thus we now assign a new unique ID here.
          matches.push_back(ConsensusFeature());
          matches.back().setUniqueId();

          matches.back().insert(light_index, *it);
          matches.back().clearMetaInfo();
          matches.back().insert(heavy_index, *it2);
          matches.back().setQuality(score);
          matches.back().setCharge(it->getCharge());
          matches.back().computeMonoisotopicConsensus();
        }
        ++it2;
      }
    }
  }

  // compute the best pairs:
  // - sort the matches by quality
  // - take the highest-quality matches first (greedy) and mark the involved
  //   features as used
  set<Size> used_features;
  matches.sortByQuality(true);
  for (ConsensusMap::const_iterator match = matches.begin();
       match != matches.end(); ++match)
  {
    // check that neither feature has been used yet:
    if (used_features.find(match->begin()->getUniqueId()) ==
          used_features.end() &&
        used_features.find(match->rbegin()->getUniqueId()) ==
          used_features.end())
    {
      // if unused, add the match to the final set of elements:
      result_map.push_back(*match);
      used_features.insert(match->begin()->getUniqueId());
      used_features.insert(match->rbegin()->getUniqueId());
    }
  }

  // add protein identifications to the result map:
  for (Size i = 0; i < input_maps.size(); ++i)
  {
    result_map.getProteinIdentifications().insert(
      result_map.getProteinIdentifications().end(),
      input_maps[i].getProteinIdentifications().begin(),
      input_maps[i].getProteinIdentifications().end());
  }

  // add unassigned peptide identifications to the result map:
  for (Size i = 0; i < input_maps.size(); ++i)
  {
    result_map.getUnassignedPeptideIdentifications().insert(
      result_map.getUnassignedPeptideIdentifications().end(),
      input_maps[i].getUnassignedPeptideIdentifications().begin(),
      input_maps[i].getUnassignedPeptideIdentifications().end());
  }

  // very useful for checking the results; the IDs have no real meaning anyway
  result_map.sortByMZ();
}
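// Usage sketch (not part of the original source; a minimal illustration of
// the preconditions checked at the top of LabeledPairFinder::run, assuming a
// populated ConsensusMap 'features' from one labeled run):
//
//   std::vector<ConsensusMap> inputs(1, features);
//   ConsensusMap result;
//   // both file descriptions must reference the same file and be labeled
//   // "light" and "heavy", respectively:
//   result.getFileDescriptions()[0].filename = "run.mzML"; // hypothetical name
//   result.getFileDescriptions()[0].label = "light";
//   result.getFileDescriptions()[1].filename = "run.mzML";
//   result.getFileDescriptions()[1].label = "heavy";
//   LabeledPairFinder finder;
//   finder.run(inputs, result);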