// Example no. 1
// 0
  void StablePairFinder::run(const std::vector<ConsensusMap>& input_maps,
                             ConsensusMap& result_map)
  {
    // empty output destination:
    result_map.clear(false);

    // sanity checks:
    if (input_maps.size() != 2)
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                       "exactly two input maps required");
    }
    checkIds_(input_maps);

    // set up the distance functor:
    double max_intensity = max(input_maps[0].getMaxInt(),
                               input_maps[1].getMaxInt());
    Param distance_params = param_.copy("");
    distance_params.remove("use_identifications");
    distance_params.remove("second_nearest_gap");
    FeatureDistance feature_distance(max_intensity, false);
    feature_distance.setParameters(distance_params);

    // keep track of pairing:
    std::vector<bool> is_singleton[2];
    is_singleton[0].resize(input_maps[0].size(), true);
    is_singleton[1].resize(input_maps[1].size(), true);

    typedef pair<double, double> DoublePair;
    DoublePair init = make_pair(FeatureDistance::infinity,
                                FeatureDistance::infinity);

    // for every element in map 0:
    // - index of nearest neighbor in map 1:
    vector<UInt> nn_index_0(input_maps[0].size(), UInt(-1));
    // - distances to nearest and second-nearest neighbors in map 1:
    vector<DoublePair> nn_distance_0(input_maps[0].size(), init);

    // for every element in map 1:
    // - index of nearest neighbor in map 0:
    vector<UInt> nn_index_1(input_maps[1].size(), UInt(-1));
    // - distances to nearest and second-nearest neighbors in map 0:
    vector<DoublePair> nn_distance_1(input_maps[1].size(), init);

    // iterate over all feature pairs, find nearest neighbors:
    // TODO: iterate over SENSIBLE RT (and m/z) window -- sort the maps beforehand
    //       to save a lot of processing time...
    //       Once done, remove the warning in the description of the 'use_identifications' parameter
    for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
    {
      const ConsensusFeature& feat0 = input_maps[0][fi0];

      for (UInt fi1 = 0; fi1 < input_maps[1].size(); ++fi1)
      {
        const ConsensusFeature& feat1 = input_maps[1][fi1];

        if (use_IDs_ && !compatibleIDs_(feat0, feat1)) // check peptide IDs
        {
          continue; // mismatch
        }

        pair<bool, double> result = feature_distance(feat0, feat1);
        double distance = result.second;
        // we only care if distance constraints are satisfied for "best
        // matches", not for second-best; this means that second-best distances
        // can become smaller than best distances
        // (e.g. the RT is larger than allowed (->invalid pair), but m/z is perfect and has the most weight --> better score!)
        bool valid = result.first;

        // update entries for map 0:
        if (distance < nn_distance_0[fi0].second)
        {
          if (valid && (distance < nn_distance_0[fi0].first))
          {
            nn_distance_0[fi0].second = nn_distance_0[fi0].first;
            nn_distance_0[fi0].first = distance;
            nn_index_0[fi0] = fi1;
          }
          else
          {
            nn_distance_0[fi0].second = distance;
          }
        }
        // update entries for map 1:
        if (distance < nn_distance_1[fi1].second)
        {
          if (valid && (distance < nn_distance_1[fi1].first))
          {
            nn_distance_1[fi1].second = nn_distance_1[fi1].first;
            nn_distance_1[fi1].first = distance;
            nn_index_1[fi1] = fi0;
          }
          else
          {
            nn_distance_1[fi1].second = distance;
          }
        }
      }
    }

    // if features from the two maps are nearest neighbors of each other, they
    // can become a pair:
    for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
    {
      UInt fi1 = nn_index_0[fi0]; // nearest neighbor of "fi0" in map 1
      // cout << "index: " << fi0 << ", RT: " << input_maps[0][fi0].getRT()
      //         << ", MZ: " << input_maps[0][fi0].getMZ() << endl
      //         << "neighbor: " << fi1 << ", RT: " << input_maps[1][fi1].getRT()
      //         << ", MZ: " << input_maps[1][fi1].getMZ() << endl
      //         << "d(i,j): " << nn_distance_0[fi0].first << endl
      //         << "d2(i): " << nn_distance_0[fi0].second << endl
      //         << "d2(j): " << nn_distance_1[fi1].second << endl;

      // criteria set by the parameters must be fulfilled:
      if ((nn_distance_0[fi0].first < FeatureDistance::infinity) &&
          (nn_distance_0[fi0].first * second_nearest_gap_ <= nn_distance_0[fi0].second))
      {
        // "fi0" satisfies constraints...
        if ((nn_index_1[fi1] == fi0) &&
            (nn_distance_1[fi1].first * second_nearest_gap_ <= nn_distance_1[fi1].second))
        {
          // ...nearest neighbor of "fi0" also satisfies constraints (yay!)
          // cout << "match!" << endl;
          result_map.push_back(ConsensusFeature());
          ConsensusFeature& f = result_map.back();

          f.insert(input_maps[0][fi0]);
          f.getPeptideIdentifications().insert(f.getPeptideIdentifications().end(),
                                               input_maps[0][fi0].getPeptideIdentifications().begin(),
                                               input_maps[0][fi0].getPeptideIdentifications().end());

          f.insert(input_maps[1][fi1]);
          f.getPeptideIdentifications().insert(f.getPeptideIdentifications().end(),
                                               input_maps[1][fi1].getPeptideIdentifications().begin(),
                                               input_maps[1][fi1].getPeptideIdentifications().end());

          f.computeConsensus();
          double quality = 1.0 - nn_distance_0[fi0].first;
          double quality0 = 1.0 - nn_distance_0[fi0].first * second_nearest_gap_ / nn_distance_0[fi0].second;
          double quality1 = 1.0 - nn_distance_1[fi1].first * second_nearest_gap_ / nn_distance_1[fi1].second;
          quality = quality * quality0 * quality1; // TODO other formula?

          // incorporate existing quality values:
          Size size0 = max(input_maps[0][fi0].size(), size_t(1));
          Size size1 = max(input_maps[1][fi1].size(), size_t(1));
          // quality contribution from first map:
          quality0 = input_maps[0][fi0].getQuality() * (size0 - 1);
          // quality contribution from second map:
          quality1 = input_maps[1][fi1].getQuality() * (size1 - 1);
          f.setQuality((quality + quality0 + quality1) / (size0 + size1 - 1));

          is_singleton[0][fi0] = false;
          is_singleton[1][fi1] = false;
        }
      }
    }

    // write out unmatched consensus features
    for (UInt input = 0; input <= 1; ++input)
    {
      for (UInt index = 0; index < input_maps[input].size(); ++index)
      {
        if (is_singleton[input][index])
        {
          result_map.push_back(input_maps[input][index]);
          if (result_map.back().size() < 2) // singleton consensus feature
          {
            result_map.back().setQuality(0.0);
          }
        }
      }
    }

    // canonical ordering for checking the results, and the ids have no real meaning anyway
    result_map.sortByMZ();

    // protein IDs and unassigned peptide IDs are added to the result by the
    // FeatureGroupingAlgorithm!
  }
// Example no. 2
// 0
  void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps, ConsensusMap& result_map)
  {
    if (input_maps.size() != 1)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly one input map required");
    if (result_map.getFileDescriptions().size() != 2)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "two file descriptions required");
    if (result_map.getFileDescriptions().begin()->second.filename != result_map.getFileDescriptions().rbegin()->second.filename)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the two file descriptions have to contain the same file name");
    checkIds_(input_maps);

    //look up the light and heavy index
    Size light_index = numeric_limits<Size>::max();
    Size heavy_index = numeric_limits<Size>::max();
    for (ConsensusMap::FileDescriptions::const_iterator it = result_map.getFileDescriptions().begin();
         it != result_map.getFileDescriptions().end();
         ++it)
    {
      if (it->second.label == "heavy")
      {
        heavy_index = it->first;
      }
      else if (it->second.label == "light")
      {
        light_index = it->first;
      }
    }
    if (light_index == numeric_limits<Size>::max() || heavy_index == numeric_limits<Size>::max())
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the input maps have to be labeled 'light' and 'heavy'");
    }

    result_map.clear(false);

    // sort consensus features by RT (and MZ) to speed up searching afterwards
    typedef ConstRefVector<ConsensusMap> RefMap;
    RefMap model_ref(input_maps[0].begin(), input_maps[0].end());
    model_ref.sortByPosition();

    //calculate matches
    ConsensusMap matches;
    //settings
    double rt_pair_dist = param_.getValue("rt_pair_dist");
    double rt_dev_low = param_.getValue("rt_dev_low");
    double rt_dev_high = param_.getValue("rt_dev_high");
    double mz_dev = param_.getValue("mz_dev");
    DoubleList mz_pair_dists = param_.getValue("mz_pair_dists");
    bool mrm = param_.getValue("mrm").toBool();

    //estimate RT parameters
    if (param_.getValue("rt_estimate") == "true")
    {
      //find all possible RT distances of features with the same charge and a good m/z distance
      vector<double> dists;
      dists.reserve(model_ref.size());
      for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
      {
        for (RefMap::const_iterator it2 = model_ref.begin(); it2 != model_ref.end(); ++it2)
        {
          for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
          {
            double mz_pair_dist = *dist_it;
            if (it2->getCharge() == it->getCharge()
               && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
               && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev)
            {
              dists.push_back(it2->getRT() - it->getRT());
            }
          }
        }
      }
      if (dists.empty())
      {
        cout << "Warning: Could not find pairs for RT distance estimation. The manual settings are used!" << endl;
      }
      else
      {
        if (dists.size() < 50)
        {
          cout << "Warning: Found only " << dists.size() << " pairs. The estimated shift and std deviation are probably not reliable!" << endl;
        }
        //--------------------------- estimate initial parameters of fit ---------------------------
        GaussFitter::GaussFitResult result(-1, -1, -1);
        //first estimate of the optimal shift: median of the distances
        sort(dists.begin(), dists.end());
        Size median_index = dists.size() / 2;
        result.x0 = dists[median_index];
        //create histogram of distances
        //consider only the maximum of pairs, centered around the optimal shift
        Size max_pairs = model_ref.size() / 2;
        Size start_index = (Size) max((SignedSize)0, (SignedSize)(median_index - max_pairs / 2));
        Size end_index = (Size) min((SignedSize)(dists.size() - 1), (SignedSize)(median_index + max_pairs / 2));
        double start_value = dists[start_index];
        double end_value = dists[end_index];
        double bin_step = fabs(end_value - start_value) / 99.999; //ensure that we have 100 bins
        Math::Histogram<> hist(start_value, end_value, bin_step);
        //std::cout << "HIST from " << start_value << " to " << end_value << " (bin size " << bin_step << ")" << endl;
        for (Size i = start_index; i <= end_index; ++i)
        {
          hist.inc(dists[i]);
        }
        //cout << hist << endl;
        dists.clear();
        //determine median of bins (uniform background distribution)
        vector<Size> bins(hist.begin(), hist.end());
        sort(bins.begin(), bins.end());
        Size bin_median = bins[bins.size() / 2];
        bins.clear();
        //estimate scale A: maximum of the histogram
        Size max_value = hist.maxValue();
        result.A = max_value - bin_median;
        //overwrite estimate of x0 with the position of the highest bin
        for (Size i = 0; i < hist.size(); ++i)
        {
          if (hist[i] == max_value)
          {
            result.x0 = hist.centerOfBin(i);
            break;
          }
        }
        //estimate sigma: first time the count is less or equal the median count in the histogram
        double pos = result.x0;
        while (pos > start_value && hist.binValue(pos) > bin_median)
        {
          pos -= bin_step;
        }
        double sigma_low =  result.x0 - pos;
        pos = result.x0;
        while (pos<end_value&& hist.binValue(pos)> bin_median)
        {
          pos += bin_step;
        }
        double sigma_high = pos - result.x0;
        result.sigma = (sigma_high + sigma_low) / 6.0;
        //cout << "estimated optimal RT distance (before fit): " << result.x0 << endl;
        //cout << "estimated allowed deviation (before fit): " << result.sigma*3.0 << endl;
        //--------------------------- do gauss fit ---------------------------
        vector<DPosition<2> > points(hist.size());
        for (Size i = 0; i < hist.size(); ++i)
        {
          points[i][0] = hist.centerOfBin(i);
          points[i][1] = max(0u, hist[i]);
        }
        GaussFitter fitter;
        fitter.setInitialParameters(result);
        result = fitter.fit(points);
        cout << "estimated optimal RT distance: " << result.x0 << endl;
        cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl;
        rt_pair_dist = result.x0;
        rt_dev_low = fabs(result.sigma) * 3.0;
        rt_dev_high = fabs(result.sigma) * 3.0;
      }
    }


    // check each feature
    for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
    {
      for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
      {
        double mz_pair_dist = *dist_it;
        RefMap::const_iterator it2 = lower_bound(model_ref.begin(), model_ref.end(), it->getRT() + rt_pair_dist - rt_dev_low, ConsensusFeature::RTLess());
        while (it2 != model_ref.end() && it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high)
        {
          // if in mrm mode, we need to compare precursor mass difference and fragment mass difference, charge remains the same

          double prec_mz_diff(0);
          if (mrm)
          {
            prec_mz_diff = fabs((double)it2->getMetaValue("MZ") - (double)it->getMetaValue("MZ"));
            if (it->getCharge() != 0)
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge());
            }
            else
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist);
            }
          }

          bool mrm_correct_dist(false);
          double frag_mz_diff = fabs(it->getMZ() - it2->getMZ());

          //cerr << it->getRT() << " charge1=" << it->getCharge() << ", charge2=" << it2->getCharge() << ", prec_diff=" << prec_mz_diff << ", frag_diff=" << frag_mz_diff << endl;

          if (mrm &&
              it2->getCharge() == it->getCharge() &&
              prec_mz_diff < mz_dev &&
              (frag_mz_diff < mz_dev || fabs(frag_mz_diff - mz_pair_dist) < mz_dev))
          {
            mrm_correct_dist = true;
            //cerr << "mrm_correct_dist" << endl;
          }

          if ((mrm && mrm_correct_dist) || (!mrm &&
                                            it2->getCharge() == it->getCharge() &&
                                            it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev &&
                                            it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev
                                            ))
          {
            //cerr << "dist correct" << endl;
            double score = sqrt(
              PValue_(it2->getMZ() - it->getMZ(), mz_pair_dist / it->getCharge(), mz_dev, mz_dev) *
              PValue_(it2->getRT() - it->getRT(), rt_pair_dist, rt_dev_low, rt_dev_high)
              );

            // Note: we used to copy the id from the light feature here, but that strategy does not generalize to more than two labels.
            // We might want to report consensus features where the light one is missing but more than one heavier variant was found.
            // Also, the old strategy is inconsistent with what was done in the unlabeled case.  Thus now we assign a new unique id here.
            matches.push_back(ConsensusFeature());
            matches.back().setUniqueId();

            matches.back().insert(light_index, *it);
            matches.back().clearMetaInfo();
            matches.back().insert(heavy_index, *it2);
            matches.back().setQuality(score);
            matches.back().setCharge(it->getCharge());
            matches.back().computeMonoisotopicConsensus();
          }
          ++it2;
        }
      }
    }

    //compute best pairs
    // - sort matches by quality
    // - take highest-quality matches first (greedy) and mark them as used
    set<Size> used_features;
    matches.sortByQuality(true);
    for (ConsensusMap::const_iterator match = matches.begin(); match != matches.end(); ++match)
    {
      //check if features are not used yet
      if (used_features.find(match->begin()->getUniqueId()) == used_features.end() &&
          used_features.find(match->rbegin()->getUniqueId()) == used_features.end()
          )
      {
        //if unused, add it to the final set of elements
        result_map.push_back(*match);
        used_features.insert(match->begin()->getUniqueId());
        used_features.insert(match->rbegin()->getUniqueId());
      }
    }

    //Add protein identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getProteinIdentifications().insert(result_map.getProteinIdentifications().end(), input_maps[i].getProteinIdentifications().begin(), input_maps[i].getProteinIdentifications().end());
    }

    //Add unassigned peptide identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getUnassignedPeptideIdentifications().insert(result_map.getUnassignedPeptideIdentifications().end(), input_maps[i].getUnassignedPeptideIdentifications().begin(), input_maps[i].getUnassignedPeptideIdentifications().end());
    }

    // Very useful for checking the results, and the ids have no real meaning anyway
    result_map.sortByMZ();
  }