예제 #1
0
  /// @brief Tool entry point: looks up the apex of each target ion chromatogram in every
  ///        input file and writes one table row per target.
  ///
  /// Target positions (RT, m/z) are loaded from the EDTA file given by 'pos'. For every
  /// input mzML file and every target, the most intense raw peak inside the window
  /// RT +/- rt_tol/2 and m/z +/- mz_tol (in ppm) is searched. Per file, five columns are
  /// appended to the target's row: observed RT, RT deviation, observed m/z, m/z deviation
  /// in ppm (based on the mean m/z of the peaks found in up to 'rt_collect' spectra on
  /// each side of the apex) and apex intensity. Three header lines precede the rows.
  ///
  /// If 'auto_rt:enabled' is set, the TIC of the FIRST input file is smoothed and
  /// peak-picked; every target with a negative RT is then replaced by one target per
  /// picked TIC peak (duplicate m/z values among those targets are skipped).
  ///
  /// @return EXECUTION_OK on success, ILLEGAL_PARAMETERS if the parameter combination is
  ///         inconsistent, INCOMPATIBLE_INPUT_DATA if an input file contains no spectra.
  ExitCodes main_(int, const char**)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    StringList in = getStringList_("in");
    String edta = getStringOption_("pos");
    String out = getStringOption_("out");
    String out_sep = getStringOption_("out_separator");
    String out_TIC_debug = getStringOption_("auto_rt:out_debug_TIC");

    StringList in_header = getStringList_("in_header");

    // number of header files and input files must be identical
    if (in_header.size() > 0 && in.size() != in_header.size())
    {
      LOG_FATAL_ERROR << "Error: number of input file 'in' and 'in_header' files must be identical!" << std::endl;
      return ILLEGAL_PARAMETERS;
    }

    // read the flag once: it is needed for the sanity check and again inside the file loop
    const bool auto_rt_enabled = getFlag_("auto_rt:enabled");
    if (!auto_rt_enabled && !out_TIC_debug.empty())
    {
      LOG_FATAL_ERROR << "Error: TIC output file requested, but auto_rt is not enabled! Either do not request the file or switch on 'auto_rt:enabled'." << std::endl;
      return ILLEGAL_PARAMETERS;
    }

    double rttol = getDoubleOption_("rt_tol");
    double mztol = getDoubleOption_("mz_tol");
    Size rt_collect = getIntOption_("rt_collect");

    //-------------------------------------------------------------
    // loading input
    //-------------------------------------------------------------
    MzMLFile mzml_file;
    mzml_file.setLogType(log_type_);
    MSExperiment<Peak1D> exp;

    EDTAFile ed;
    ConsensusMap cm;
    ed.load(edta, cm);

    StringList tf_single_header0, tf_single_header1, tf_single_header2; // header content, for each column

    std::vector<String> vec_single; // one line for each compound, multiple columns per experiment
    vec_single.resize(cm.size());
    for (Size fi = 0; fi < in.size(); ++fi)
    {
      // load raw data
      mzml_file.load(in[fi], exp);
      exp.sortSpectra(true);

      if (exp.empty())
      {
        LOG_WARN << "The given file does not contain any conventional peak data, but might"
                    " contain chromatograms. This tool currently cannot handle them, sorry." << std::endl;
        return INCOMPATIBLE_INPUT_DATA;
      }

      // try to detect RT peaks (only for the first input file -- all others should align!)
      // cm.size() might change in here...
      if (auto_rt_enabled && fi == 0)
      {
        ConsensusMap cm_local = cm; // we might have different RT peaks for each map if 'auto_rt' is enabled
        cm.clear(false); // reset global list (about to be filled)

        // compute TIC
        MSChromatogram<> tic = exp.getTIC();
        MSSpectrum<> tics, tic_gf, tics_pp, tics_sn;
        for (Size ic = 0; ic < tic.size(); ++ic)
        { // rewrite Chromatogram to MSSpectrum (GaussFilter requires it); RT is stored in the m/z slot
          Peak1D peak;
          peak.setMZ(tic[ic].getRT());
          peak.setIntensity(tic[ic].getIntensity());
          tics.push_back(peak);
        }
        // smooth (no PP_CWT here due to efficiency reasons -- large FWHM take longer!)
        double fwhm = getDoubleOption_("auto_rt:FHWM");
        GaussFilter gf;
        Param p = gf.getParameters();
        p.setValue("gaussian_width", fwhm * 2); // wider than FWHM, just to be sure we have a fully smoothed peak. Merging two peaks is unlikely
        p.setValue("use_ppm_tolerance", "false");
        gf.setParameters(p);
        tic_gf = tics;
        gf.filter(tic_gf);
        // pick peaks
        PeakPickerHiRes pp;
        p = pp.getParameters();
        p.setValue("signal_to_noise", getDoubleOption_("auto_rt:SNThreshold"));
        pp.setParameters(p);
        pp.pick(tic_gf, tics_pp);

        if (!tics_pp.empty())
        {
          LOG_INFO << "Found " << tics_pp.size() << " auto-rt peaks at: ";
          for (Size ipp = 0; ipp != tics_pp.size(); ++ipp) LOG_INFO << " " << tics_pp[ipp].getMZ();
        }
        else
        {
          LOG_INFO << "Found no auto-rt peaks. Change threshold parameters!";
        }
        LOG_INFO << std::endl;

        if (!out_TIC_debug.empty()) // if debug file was given
        { // store intermediate steps for debug
          MSExperiment<> out_debug;
          out_debug.addChromatogram(toChromatogram(tics));
          out_debug.addChromatogram(toChromatogram(tic_gf));

          SignalToNoiseEstimatorMedian<MSSpectrum<> > snt;
          snt.init(tics);
          for (Size is = 0; is < tics.size(); ++is)
          {
            Peak1D peak;
            // position comes from the converted spectrum (it holds the RT of the TIC point, see above)
            peak.setMZ(tics[is].getMZ());
            peak.setIntensity(snt.getSignalToNoise(tics[is]));
            tics_sn.push_back(peak);
          }
          out_debug.addChromatogram(toChromatogram(tics_sn));

          out_debug.addChromatogram(toChromatogram(tics_pp));
          // get rid of "native-id" missing warning
          // NOTE(review): only chromatograms were added to 'out_debug'; if size() counts
          // spectra, this loop never executes -- confirm against MSExperiment.
          for (Size id = 0; id < out_debug.size(); ++id) out_debug[id].setNativeID(String("spectrum=") + id);

          mzml_file.store(out_TIC_debug, out_debug);
          LOG_DEBUG << "Storing debug AUTO-RT: " << out_TIC_debug << std::endl;
        }

        // add target EICs: for each m/z with no/negative RT, add all combinations of that m/z with auto-RTs
        // duplicate m/z entries will be ignored!
        // all other lines with positive RT values are copied unaffected
        std::set<double> mz_doubles; // m/z values already expanded (no duplicates allowed)
        for (ConsensusMap::Iterator cit = cm_local.begin(); cit != cm_local.end(); ++cit)
        {
          if (cit->getRT() < 0)
          {
            // insert() reports via '.second' whether this m/z was seen for the first time
            if (!mz_doubles.insert(cit->getMZ()).second)
            {
              LOG_INFO << "Found duplicate m/z entry (" << cit->getMZ() << ") for auto-rt. Skipping ..." << std::endl;
              continue;
            }

            // one target per picked TIC peak (its RT sits in the m/z slot of 'tics_pp')
            for (MSSpectrum<>::ConstIterator itp = tics_pp.begin(); itp != tics_pp.end(); ++itp)
            {
              ConsensusFeature f = *cit;
              f.setRT(itp->getMZ());
              cm.push_back(f);
            }

          }
          else
          { // default feature with no auto-rt
            LOG_INFO << "copying feature with RT " << cit->getRT() << std::endl;
            cm.push_back(*cit);
          }
        }

        // resize, since we have more positions now
        vec_single.resize(cm.size());
      }


      // search for each EIC and add up
      Int not_found(0);

      String description;
      if (fi < in_header.size())
      {
        HeaderInfo info(in_header[fi]);
        description = info.header_description;
      }

      if (fi == 0)
      { // two additional columns for first file (theoretical RT and m/z)
        tf_single_header0 << "" << "";
        tf_single_header1 << "" << "";
        tf_single_header2 << "RT" << "mz";
      }

      // 5 entries for each input file
      tf_single_header0 << File::basename(in[fi]) << "" << "" << "" << "";
      tf_single_header1 << description << "" << "" << "" << "";
      tf_single_header2 << "RTobs" << "dRT" << "mzobs" << "dppm" << "intensity";

      for (Size i = 0; i < cm.size(); ++i)
      {
        //std::cerr << "Rt" << cm[i].getRT() << "  mz: " << cm[i].getMZ() << " R " <<  cm[i].getMetaValue("rank") << "\n";

        double mz_da = mztol * cm[i].getMZ() / 1e6; // mz tolerance in Dalton
        MSExperiment<>::ConstAreaIterator it = exp.areaBeginConst(cm[i].getRT() - rttol / 2,
                                                                  cm[i].getRT() + rttol / 2,
                                                                  cm[i].getMZ() - mz_da,
                                                                  cm[i].getMZ() + mz_da);
        // apex candidate; keeps the theoretical position with intensity 0 if nothing is found
        Peak2D max_peak;
        max_peak.setIntensity(0);
        max_peak.setRT(cm[i].getRT());
        max_peak.setMZ(cm[i].getMZ());
        for (; it != exp.areaEndConst(); ++it)
        {
          if (max_peak.getIntensity() < it->getIntensity())
          {
            max_peak.setIntensity(it->getIntensity());
            max_peak.setRT(it.getRT());
            max_peak.setMZ(it->getMZ());
          }
        }
        double ppm = 0; // observed m/z offset

        if (max_peak.getIntensity() == 0)
        {
          ++not_found;
        }
        else
        {
          // collect the m/z values found in up to 'rt_collect' spectra on each side of the
          // apex spectrum (clamped to the experiment bounds) and use their mean
          std::vector<double> mz;
          MSExperiment<>::Iterator itm = exp.RTBegin(max_peak.getRT());
          SignedSize low = std::min<SignedSize>(std::distance(exp.begin(), itm), rt_collect);
          SignedSize high = std::min<SignedSize>(std::distance(itm, exp.end()) - 1, rt_collect);
          MSExperiment<>::AreaIterator itt = exp.areaBegin((itm - low)->getRT() - 0.01, (itm + high)->getRT() + 0.01, cm[i].getMZ() - mz_da, cm[i].getMZ() + mz_da);
          for (; itt != exp.areaEnd(); ++itt)
          {
            mz.push_back(itt->getMZ());
            //std::cerr << "ppm: " << itt.getRT() << " " <<  itt->getMZ() << " " << itt->getIntensity() << std::endl;
          }

          // more peaks than spectra in the window: at least one spectrum contributed several peaks
          if ((SignedSize)mz.size() > (low + high + 1)) LOG_WARN << "Compound " << i << " has overlapping peaks [" << mz.size() << "/" << low + high + 1 << "]" << std::endl;

          if (!mz.empty())
          {
            double avg_mz = std::accumulate(mz.begin(), mz.end(), 0.0) / double(mz.size());
            //std::cerr << "avg: " << avg_mz << "\n";
            ppm = (avg_mz - cm[i].getMZ()) / cm[i].getMZ() * 1e6;
          }

        }

        // appending the second column set requires separator
        String append_sep = (fi == 0 ? "" : out_sep);

        vec_single[i] += append_sep; // new line
        if (fi == 0)
        {
          vec_single[i] += String(cm[i].getRT()) + out_sep +
                           String(cm[i].getMZ()) + out_sep;
        }
        vec_single[i] += String(max_peak.getRT()) + out_sep +
                         String(max_peak.getRT() - cm[i].getRT()) + out_sep +
                         String(max_peak.getMZ()) + out_sep +
                         String(ppm)  + out_sep +
                         String(max_peak.getIntensity());
      }

      if (not_found) LOG_INFO << "Missing peaks for " << not_found << " compounds in file '" << in[fi] << "'.\n";
    }

    //-------------------------------------------------------------
    // create header
    //-------------------------------------------------------------
    // inserted in reverse order so that header0 ends up as the first line
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header2, out_sep));
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header1, out_sep));
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header0, out_sep));

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------
    TextFile tf;
    for (std::vector<String>::iterator v_it = vec_single.begin(); v_it != vec_single.end(); ++v_it)
    {
      tf.addLine(*v_it);
    }
    tf.store(out);

    return EXECUTION_OK;
  }
  void IsobaricChannelExtractor::extractChannels(const MSExperiment<Peak1D>& ms_exp_data, ConsensusMap& consensus_map)
  {
    if (ms_exp_data.empty())
    {
      LOG_WARN << "The given file does not contain any conventional peak data, but might"
                  " contain chromatograms. This tool currently cannot handle them, sorry.\n";
      throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Experiment has no scans!");
    }

    // clear the output map
    consensus_map.clear(false);
    consensus_map.setExperimentType("labeled_MS2");

    // create predicate for spectrum checking
    LOG_INFO << "Selecting scans with activation mode: " << (selected_activation_ == "" ? "any" : selected_activation_) << "\n";
    HasActivationMethod<MSExperiment<Peak1D>::SpectrumType> activation_predicate(StringList::create(selected_activation_));

    // now we have picked data
    // --> assign peaks to channels
    UInt64 element_index(0);

    // remember the current precusor spectrum
    MSExperiment<Peak1D>::ConstIterator prec_spec = ms_exp_data.end();

    for (MSExperiment<Peak1D>::ConstIterator it = ms_exp_data.begin(); it != ms_exp_data.end(); ++it)
    {
      // remember the last MS1 spectra as we assume it to be the precursor spectrum
      if (it->getMSLevel() ==  1) prec_spec = it;

      if (selected_activation_ == "" || activation_predicate(*it))
      {
        // check if precursor is available
        if (it->getPrecursors().empty())
        {
          throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, String("No precursor information given for scan native ID ") + it->getNativeID() + " with RT " + String(it->getRT()));
        }

        // check precursor constraints
        if (!isValidPrecursor_(it->getPrecursors()[0]))
        {
          LOG_DEBUG << "Skip spectrum " << it->getNativeID() << ": Precursor doesn't fulfill all constraints." << std::endl;
          continue;
        }

        // check precursor purity if we have a valid precursor ..
        if (prec_spec != ms_exp_data.end())
        {
          const DoubleReal purity = computePrecursorPurity_(it, prec_spec);
          if (purity < min_precursor_purity_)
          {
            LOG_DEBUG << "Skip spectrum " << it->getNativeID() << ": Precursor purity is below the threshold. [purity = " << purity << "]" << std::endl;
            continue;
          }
        }
        else
        {
          LOG_INFO << "No precursor available for spectrum: " << it->getNativeID() << std::endl;
        }
        if (!(prec_spec == ms_exp_data.end()) && computePrecursorPurity_(it, prec_spec) < min_precursor_purity_)
        {
          LOG_DEBUG << "Skip spectrum " << it->getNativeID() << ": Precursor purity is below the threshold." << std::endl;
          continue;
        }

        // store RT&MZ of parent ion as centroid of ConsensusFeature
        ConsensusFeature cf;
        cf.setUniqueId();
        cf.setRT(it->getRT());
        cf.setMZ(it->getPrecursors()[0].getMZ());

        Peak2D channel_value;
        channel_value.setRT(it->getRT());
        // for each each channel
        UInt64 map_index = 0;
        Peak2D::IntensityType overall_intensity = 0;
        for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator cl_it = quant_method_->getChannelInformation().begin();
             cl_it != quant_method_->getChannelInformation().end();
             ++cl_it)
        {
          // set mz-position of channel
          channel_value.setMZ(cl_it->center);
          // reset intensity
          channel_value.setIntensity(0);

          // as every evaluation requires time, we cache the MZEnd iterator
          const MSExperiment<Peak1D>::SpectrumType::ConstIterator mz_end = it->MZEnd(cl_it->center + reporter_mass_shift_);

          // add up all signals
          for (MSExperiment<Peak1D>::SpectrumType::ConstIterator mz_it = it->MZBegin(cl_it->center - reporter_mass_shift_);
               mz_it != mz_end;
               ++mz_it)
          {
            channel_value.setIntensity(channel_value.getIntensity() + mz_it->getIntensity());
          }

          // discard contribution of this channel as it is below the required intensity threshold
          if (channel_value.getIntensity() < min_reporter_intensity_)
          {
            channel_value.setIntensity(0);
          }

          overall_intensity += channel_value.getIntensity();
          // add channel to ConsensusFeature
          cf.insert(map_index++, channel_value, element_index);
        } // ! channel_iterator

        // check if we keep this feature or if it contains low-intensity quantifications
        if (remove_low_intensity_quantifications_ && hasLowIntensityReporter_(cf))
        {
          continue;
        }

        // check featureHandles are not empty
        if (overall_intensity == 0)
        {
          cf.setMetaValue("all_empty", String("true"));
        }
        cf.setIntensity(overall_intensity);
        consensus_map.push_back(cf);

        // the tandem-scan in the order they appear in the experiment
        ++element_index;
      }
    } // ! Experiment iterator

    /// add meta information to the map
    registerChannelsInOutputMap_(consensus_map);
  }
  /// @brief extracts the iTRAQ channels from the MS data and stores intensity values in a consensus map
  ///
  /// Only MS² scans are used; if the 'select_activation' parameter is non-empty, they
  /// must additionally match that activation method. One file description is written
  /// per entry of channel_map_, and each selected scan becomes one consensus feature
  /// whose handles hold the summed peak intensity within +/- 'reporter_mass_shift'
  /// of each channel center.
  ///
  /// @param ms_exp_data Raw data to read
  /// @param consensus_map Output each MS² scan as a consensus feature
  /// @throws Exception::MissingInformation if no scans present or MS² scan has no precursor
  void ItraqChannelExtractor::run(const MSExperiment<Peak1D>& ms_exp_data, ConsensusMap& consensus_map)
  {
    if (ms_exp_data.empty())
    {
      // message is terminated with a newline so it forms a complete log line before the throw
      LOG_WARN << "The given file does not contain any conventional peak data, but might"
                  " contain chromatograms. This tool currently cannot handle them, sorry.\n";
      throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, "Experiment has no scans!");
    }

    MSExperiment<> ms_exp_MS2;

    String mode = (String) param_.getValue("select_activation");
    std::cout << "Selecting scans with activation mode: " << (mode == "" ? "any" : mode) << "\n";
    HasActivationMethod<MSExperiment<Peak1D>::SpectrumType> activation_predicate(ListUtils::create<String>(mode));

    for (size_t idx = 0; idx < ms_exp_data.size(); ++idx)
    {
      // copy only MS² scans that pass the (optional) activation filter
      if (ms_exp_data[idx].getMSLevel() == 2 &&
          (mode == "" || activation_predicate(ms_exp_data[idx])))
      {
        ms_exp_MS2.addSpectrum(ms_exp_data[idx]);
      }
    }

#ifdef ITRAQ_DEBUG
    // every retained scan has MS level 2 (see filter above); printing the literal avoids
    // indexing ms_exp_MS2[0] when no scan was selected
    std::cout << "we have " << ms_exp_MS2.size() << " scans left of level 2" << std::endl;
    std::cout << "run: channel_map_ has " << channel_map_.size() << " entries!" << std::endl;
#endif
    consensus_map.clear(false);
    // set <mapList> header
    Int index_cnt = 0;
    for (ChannelMapType::const_iterator cm_it = channel_map_.begin(); cm_it != channel_map_.end(); ++cm_it)
    {
      // structure of Map cm_it
      //  first == channel-name as Int e.g. 114
      //  second == ChannelInfo struct
      ConsensusMap::FileDescription channel_as_map;
      // label is the channel + description provided in the Params
      if (itraq_type_ != TMT_SIXPLEX)
        channel_as_map.label = "iTRAQ_" + String(cm_it->second.name) + "_" + String(cm_it->second.description);
      else
        channel_as_map.label = "TMT_" + String(cm_it->second.name) + "_" + String(cm_it->second.description);

      channel_as_map.size = ms_exp_MS2.size();
      //TODO what about .filename? leave empty?
      // add some more MetaInfo
      channel_as_map.setMetaValue("channel_name", cm_it->second.name);
      channel_as_map.setMetaValue("channel_id", cm_it->second.id);
      channel_as_map.setMetaValue("channel_description", cm_it->second.description);
      channel_as_map.setMetaValue("channel_center", cm_it->second.center);
      channel_as_map.setMetaValue("channel_active", String(cm_it->second.active ? "true" : "false"));
      consensus_map.getFileDescriptions()[index_cnt++] = channel_as_map;
    }

    // create consensusElements

    Peak2D::CoordinateType allowed_deviation = (Peak2D::CoordinateType) param_.getValue("reporter_mass_shift");
    // now we have picked data
    // --> assign peaks to channels
    UInt element_index(0);

    for (MSExperiment<>::ConstIterator it = ms_exp_MS2.begin(); it != ms_exp_MS2.end(); ++it)
    {
      // store RT&MZ of parent ion as centroid of ConsensusFeature
      ConsensusFeature cf;
      cf.setUniqueId();
      cf.setRT(it->getRT());
      if (it->getPrecursors().size() >= 1)
      {
        cf.setMZ(it->getPrecursors()[0].getMZ());
      }
      else
      {
        throw Exception::MissingInformation(__FILE__, __LINE__, __PRETTY_FUNCTION__, String("No precursor information given for scan native ID ") + String(it->getNativeID()) + " with RT " + String(it->getRT()));
      }

      Peak2D channel_value;
      channel_value.setRT(it->getRT());
      // for each channel
      Int index = 0;
      Peak2D::IntensityType overall_intensity = 0;
      for (ChannelMapType::const_iterator cm_it = channel_map_.begin(); cm_it != channel_map_.end(); ++cm_it)
      {
        // set mz-position of channel
        channel_value.setMZ(cm_it->second.center);
        // reset intensity
        channel_value.setIntensity(0);

        // the end of the m/z window does not change while summing, so look it up only once
        const MSExperiment<>::SpectrumType::ConstIterator mz_end = it->MZEnd(cm_it->second.center + allowed_deviation);

        //add up all signals
        for (MSExperiment<>::SpectrumType::ConstIterator mz_it = it->MZBegin(cm_it->second.center - allowed_deviation);
             mz_it != mz_end;
             ++mz_it)
        {
          channel_value.setIntensity(channel_value.getIntensity() + mz_it->getIntensity());
        }

        overall_intensity += channel_value.getIntensity();

        // add channel to ConsensusFeature
        cf.insert(index++, channel_value, element_index);

      } // ! channel_iterator


      // check featureHandles are not empty
      if (overall_intensity == 0)
      {
        cf.setMetaValue("all_empty", String("true"));
      }
      cf.setIntensity(overall_intensity);
      consensus_map.push_back(cf);

      // the tandem-scan in the order they appear in the experiment
      ++element_index;
    } // ! Experiment iterator


#ifdef ITRAQ_DEBUG
    std::cout << "processed " << element_index << " scans" << std::endl;
#endif

    consensus_map.setExperimentType("itraq");
  }
예제 #4
0
  void StablePairFinder::run(const std::vector<ConsensusMap>& input_maps,
                             ConsensusMap& result_map)
  {
    // empty output destination:
    result_map.clear(false);

    // sanity checks:
    if (input_maps.size() != 2)
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                       "exactly two input maps required");
    }
    checkIds_(input_maps);

    // set up the distance functor:
    double max_intensity = max(input_maps[0].getMaxInt(),
                               input_maps[1].getMaxInt());
    Param distance_params = param_.copy("");
    distance_params.remove("use_identifications");
    distance_params.remove("second_nearest_gap");
    FeatureDistance feature_distance(max_intensity, false);
    feature_distance.setParameters(distance_params);

    // keep track of pairing:
    std::vector<bool> is_singleton[2];
    is_singleton[0].resize(input_maps[0].size(), true);
    is_singleton[1].resize(input_maps[1].size(), true);

    typedef pair<double, double> DoublePair;
    DoublePair init = make_pair(FeatureDistance::infinity,
                                FeatureDistance::infinity);

    // for every element in map 0:
    // - index of nearest neighbor in map 1:
    vector<UInt> nn_index_0(input_maps[0].size(), UInt(-1));
    // - distances to nearest and second-nearest neighbors in map 1:
    vector<DoublePair> nn_distance_0(input_maps[0].size(), init);

    // for every element in map 1:
    // - index of nearest neighbor in map 0:
    vector<UInt> nn_index_1(input_maps[1].size(), UInt(-1));
    // - distances to nearest and second-nearest neighbors in map 0:
    vector<DoublePair> nn_distance_1(input_maps[1].size(), init);

    // iterate over all feature pairs, find nearest neighbors:
    // TODO: iterate over SENSIBLE RT (and m/z) window -- sort the maps beforehand
    //       to save a lot of processing time...
    //       Once done, remove the warning in the description of the 'use_identifications' parameter
    for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
    {
      const ConsensusFeature& feat0 = input_maps[0][fi0];

      for (UInt fi1 = 0; fi1 < input_maps[1].size(); ++fi1)
      {
        const ConsensusFeature& feat1 = input_maps[1][fi1];

        if (use_IDs_ && !compatibleIDs_(feat0, feat1)) // check peptide IDs
        {
          continue; // mismatch
        }

        pair<bool, double> result = feature_distance(feat0, feat1);
        double distance = result.second;
        // we only care if distance constraints are satisfied for "best
        // matches", not for second-best; this means that second-best distances
        // can become smaller than best distances
        // (e.g. the RT is larger than allowed (->invalid pair), but m/z is perfect and has the most weight --> better score!)
        bool valid = result.first;

        // update entries for map 0:
        if (distance < nn_distance_0[fi0].second)
        {
          if (valid && (distance < nn_distance_0[fi0].first))
          {
            nn_distance_0[fi0].second = nn_distance_0[fi0].first;
            nn_distance_0[fi0].first = distance;
            nn_index_0[fi0] = fi1;
          }
          else
          {
            nn_distance_0[fi0].second = distance;
          }
        }
        // update entries for map 1:
        if (distance < nn_distance_1[fi1].second)
        {
          if (valid && (distance < nn_distance_1[fi1].first))
          {
            nn_distance_1[fi1].second = nn_distance_1[fi1].first;
            nn_distance_1[fi1].first = distance;
            nn_index_1[fi1] = fi0;
          }
          else
          {
            nn_distance_1[fi1].second = distance;
          }
        }
      }
    }

    // if features from the two maps are nearest neighbors of each other, they
    // can become a pair:
    for (UInt fi0 = 0; fi0 < input_maps[0].size(); ++fi0)
    {
      UInt fi1 = nn_index_0[fi0]; // nearest neighbor of "fi0" in map 1
      // cout << "index: " << fi0 << ", RT: " << input_maps[0][fi0].getRT()
      //         << ", MZ: " << input_maps[0][fi0].getMZ() << endl
      //         << "neighbor: " << fi1 << ", RT: " << input_maps[1][fi1].getRT()
      //         << ", MZ: " << input_maps[1][fi1].getMZ() << endl
      //         << "d(i,j): " << nn_distance_0[fi0].first << endl
      //         << "d2(i): " << nn_distance_0[fi0].second << endl
      //         << "d2(j): " << nn_distance_1[fi1].second << endl;

      // criteria set by the parameters must be fulfilled:
      if ((nn_distance_0[fi0].first < FeatureDistance::infinity) &&
          (nn_distance_0[fi0].first * second_nearest_gap_ <= nn_distance_0[fi0].second))
      {
        // "fi0" satisfies constraints...
        if ((nn_index_1[fi1] == fi0) &&
            (nn_distance_1[fi1].first * second_nearest_gap_ <= nn_distance_1[fi1].second))
        {
          // ...nearest neighbor of "fi0" also satisfies constraints (yay!)
          // cout << "match!" << endl;
          result_map.push_back(ConsensusFeature());
          ConsensusFeature& f = result_map.back();

          f.insert(input_maps[0][fi0]);
          f.getPeptideIdentifications().insert(f.getPeptideIdentifications().end(),
                                               input_maps[0][fi0].getPeptideIdentifications().begin(),
                                               input_maps[0][fi0].getPeptideIdentifications().end());

          f.insert(input_maps[1][fi1]);
          f.getPeptideIdentifications().insert(f.getPeptideIdentifications().end(),
                                               input_maps[1][fi1].getPeptideIdentifications().begin(),
                                               input_maps[1][fi1].getPeptideIdentifications().end());

          f.computeConsensus();
          double quality = 1.0 - nn_distance_0[fi0].first;
          double quality0 = 1.0 - nn_distance_0[fi0].first * second_nearest_gap_ / nn_distance_0[fi0].second;
          double quality1 = 1.0 - nn_distance_1[fi1].first * second_nearest_gap_ / nn_distance_1[fi1].second;
          quality = quality * quality0 * quality1; // TODO other formula?

          // incorporate existing quality values:
          Size size0 = max(input_maps[0][fi0].size(), size_t(1));
          Size size1 = max(input_maps[1][fi1].size(), size_t(1));
          // quality contribution from first map:
          quality0 = input_maps[0][fi0].getQuality() * (size0 - 1);
          // quality contribution from second map:
          quality1 = input_maps[1][fi1].getQuality() * (size1 - 1);
          f.setQuality((quality + quality0 + quality1) / (size0 + size1 - 1));

          is_singleton[0][fi0] = false;
          is_singleton[1][fi1] = false;
        }
      }
    }

    // write out unmatched consensus features
    for (UInt input = 0; input <= 1; ++input)
    {
      for (UInt index = 0; index < input_maps[input].size(); ++index)
      {
        if (is_singleton[input][index])
        {
          result_map.push_back(input_maps[input][index]);
          if (result_map.back().size() < 2) // singleton consensus feature
          {
            result_map.back().setQuality(0.0);
          }
        }
      }
    }

    // canonical ordering for checking the results, and the ids have no real meaning anyway
    result_map.sortByMZ();

    // protein IDs and unassigned peptide IDs are added to the result by the
    // FeatureGroupingAlgorithm!
  }
예제 #5
0
  void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps, ConsensusMap& result_map)
  {
    if (input_maps.size() != 1)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly one input map required");
    if (result_map.getFileDescriptions().size() != 2)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "two file descriptions required");
    if (result_map.getFileDescriptions().begin()->second.filename != result_map.getFileDescriptions().rbegin()->second.filename)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the two file descriptions have to contain the same file name");
    checkIds_(input_maps);

    //look up the light and heavy index
    Size light_index = numeric_limits<Size>::max();
    Size heavy_index = numeric_limits<Size>::max();
    for (ConsensusMap::FileDescriptions::const_iterator it = result_map.getFileDescriptions().begin();
         it != result_map.getFileDescriptions().end();
         ++it)
    {
      if (it->second.label == "heavy")
      {
        heavy_index = it->first;
      }
      else if (it->second.label == "light")
      {
        light_index = it->first;
      }
    }
    if (light_index == numeric_limits<Size>::max() || heavy_index == numeric_limits<Size>::max())
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the input maps have to be labeled 'light' and 'heavy'");
    }

    result_map.clear(false);

    // sort consensus features by RT (and MZ) to speed up searching afterwards
    typedef ConstRefVector<ConsensusMap> RefMap;
    RefMap model_ref(input_maps[0].begin(), input_maps[0].end());
    model_ref.sortByPosition();

    //calculate matches
    ConsensusMap matches;
    //settings
    double rt_pair_dist = param_.getValue("rt_pair_dist");
    double rt_dev_low = param_.getValue("rt_dev_low");
    double rt_dev_high = param_.getValue("rt_dev_high");
    double mz_dev = param_.getValue("mz_dev");
    DoubleList mz_pair_dists = param_.getValue("mz_pair_dists");
    bool mrm = param_.getValue("mrm").toBool();

    //estimate RT parameters
    if (param_.getValue("rt_estimate") == "true")
    {
      //find all possible RT distances of features with the same charge and a good m/z distance
      vector<double> dists;
      dists.reserve(model_ref.size());
      for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
      {
        for (RefMap::const_iterator it2 = model_ref.begin(); it2 != model_ref.end(); ++it2)
        {
          for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
          {
            double mz_pair_dist = *dist_it;
            if (it2->getCharge() == it->getCharge()
               && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
               && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev)
            {
              dists.push_back(it2->getRT() - it->getRT());
            }
          }
        }
      }
      if (dists.empty())
      {
        cout << "Warning: Could not find pairs for RT distance estimation. The manual settings are used!" << endl;
      }
      else
      {
        if (dists.size() < 50)
        {
          cout << "Warning: Found only " << dists.size() << " pairs. The estimated shift and std deviation are probably not reliable!" << endl;
        }
        //--------------------------- estimate initial parameters of fit ---------------------------
        GaussFitter::GaussFitResult result(-1, -1, -1);
        //first estimate of the optimal shift: median of the distances
        sort(dists.begin(), dists.end());
        Size median_index = dists.size() / 2;
        result.x0 = dists[median_index];
        //create histogram of distances
        //consider only the maximum of pairs, centered around the optimal shift
        Size max_pairs = model_ref.size() / 2;
        Size start_index = (Size) max((SignedSize)0, (SignedSize)(median_index - max_pairs / 2));
        Size end_index = (Size) min((SignedSize)(dists.size() - 1), (SignedSize)(median_index + max_pairs / 2));
        double start_value = dists[start_index];
        double end_value = dists[end_index];
        double bin_step = fabs(end_value - start_value) / 99.999; //ensure that we have 100 bins
        Math::Histogram<> hist(start_value, end_value, bin_step);
        //std::cout << "HIST from " << start_value << " to " << end_value << " (bin size " << bin_step << ")" << endl;
        for (Size i = start_index; i <= end_index; ++i)
        {
          hist.inc(dists[i]);
        }
        //cout << hist << endl;
        dists.clear();
        //determine median of bins (uniform background distribution)
        vector<Size> bins(hist.begin(), hist.end());
        sort(bins.begin(), bins.end());
        Size bin_median = bins[bins.size() / 2];
        bins.clear();
        //estimate scale A: maximum of the histogram
        Size max_value = hist.maxValue();
        result.A = max_value - bin_median;
        //overwrite estimate of x0 with the position of the highest bin
        for (Size i = 0; i < hist.size(); ++i)
        {
          if (hist[i] == max_value)
          {
            result.x0 = hist.centerOfBin(i);
            break;
          }
        }
        //estimate sigma: first time the count is less or equal the median count in the histogram
        double pos = result.x0;
        while (pos > start_value && hist.binValue(pos) > bin_median)
        {
          pos -= bin_step;
        }
        double sigma_low =  result.x0 - pos;
        pos = result.x0;
        while (pos<end_value&& hist.binValue(pos)> bin_median)
        {
          pos += bin_step;
        }
        double sigma_high = pos - result.x0;
        result.sigma = (sigma_high + sigma_low) / 6.0;
        //cout << "estimated optimal RT distance (before fit): " << result.x0 << endl;
        //cout << "estimated allowed deviation (before fit): " << result.sigma*3.0 << endl;
        //--------------------------- do gauss fit ---------------------------
        vector<DPosition<2> > points(hist.size());
        for (Size i = 0; i < hist.size(); ++i)
        {
          points[i][0] = hist.centerOfBin(i);
          points[i][1] = max(0u, hist[i]);
        }
        GaussFitter fitter;
        fitter.setInitialParameters(result);
        result = fitter.fit(points);
        cout << "estimated optimal RT distance: " << result.x0 << endl;
        cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl;
        rt_pair_dist = result.x0;
        rt_dev_low = fabs(result.sigma) * 3.0;
        rt_dev_high = fabs(result.sigma) * 3.0;
      }
    }


    // check each feature
    for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
    {
      for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
      {
        double mz_pair_dist = *dist_it;
        RefMap::const_iterator it2 = lower_bound(model_ref.begin(), model_ref.end(), it->getRT() + rt_pair_dist - rt_dev_low, ConsensusFeature::RTLess());
        while (it2 != model_ref.end() && it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high)
        {
          // if in mrm mode, we need to compare precursor mass difference and fragment mass difference, charge remains the same

          double prec_mz_diff(0);
          if (mrm)
          {
            prec_mz_diff = fabs((double)it2->getMetaValue("MZ") - (double)it->getMetaValue("MZ"));
            if (it->getCharge() != 0)
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge());
            }
            else
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist);
            }
          }

          bool mrm_correct_dist(false);
          double frag_mz_diff = fabs(it->getMZ() - it2->getMZ());

          //cerr << it->getRT() << " charge1=" << it->getCharge() << ", charge2=" << it2->getCharge() << ", prec_diff=" << prec_mz_diff << ", frag_diff=" << frag_mz_diff << endl;

          if (mrm &&
              it2->getCharge() == it->getCharge() &&
              prec_mz_diff < mz_dev &&
              (frag_mz_diff < mz_dev || fabs(frag_mz_diff - mz_pair_dist) < mz_dev))
          {
            mrm_correct_dist = true;
            //cerr << "mrm_correct_dist" << endl;
          }

          if ((mrm && mrm_correct_dist) || (!mrm &&
                                            it2->getCharge() == it->getCharge() &&
                                            it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev &&
                                            it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev
                                            ))
          {
            //cerr << "dist correct" << endl;
            double score = sqrt(
              PValue_(it2->getMZ() - it->getMZ(), mz_pair_dist / it->getCharge(), mz_dev, mz_dev) *
              PValue_(it2->getRT() - it->getRT(), rt_pair_dist, rt_dev_low, rt_dev_high)
              );

            // Note: we used to copy the id from the light feature here, but that strategy does not generalize to more than two labels.
            // We might want to report consensus features where the light one is missing but more than one heavier variant was found.
            // Also, the old strategy is inconsistent with what was done in the unlabeled case.  Thus now we assign a new unique id here.
            matches.push_back(ConsensusFeature());
            matches.back().setUniqueId();

            matches.back().insert(light_index, *it);
            matches.back().clearMetaInfo();
            matches.back().insert(heavy_index, *it2);
            matches.back().setQuality(score);
            matches.back().setCharge(it->getCharge());
            matches.back().computeMonoisotopicConsensus();
          }
          ++it2;
        }
      }
    }

    //compute best pairs
    // - sort matches by quality
    // - take highest-quality matches first (greedy) and mark them as used
    set<Size> used_features;
    matches.sortByQuality(true);
    for (ConsensusMap::const_iterator match = matches.begin(); match != matches.end(); ++match)
    {
      //check if features are not used yet
      if (used_features.find(match->begin()->getUniqueId()) == used_features.end() &&
          used_features.find(match->rbegin()->getUniqueId()) == used_features.end()
          )
      {
        //if unused, add it to the final set of elements
        result_map.push_back(*match);
        used_features.insert(match->begin()->getUniqueId());
        used_features.insert(match->rbegin()->getUniqueId());
      }
    }

    //Add protein identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getProteinIdentifications().insert(result_map.getProteinIdentifications().end(), input_maps[i].getProteinIdentifications().begin(), input_maps[i].getProteinIdentifications().end());
    }

    //Add unassigned peptide identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getUnassignedPeptideIdentifications().insert(result_map.getUnassignedPeptideIdentifications().end(), input_maps[i].getUnassignedPeptideIdentifications().begin(), input_maps[i].getUnassignedPeptideIdentifications().end());
    }

    // Very useful for checking the results, and the ids have no real meaning anyway
    result_map.sortByMZ();
  }
예제 #6
0
  /**
    @brief Links the features of all input maps into consensus features using QT clustering.

    The method
    - determines the maximum intensity and maximum m/z over all input maps and hands
      them to setParameters_(),
    - hashes every input feature into a grid keyed by (RT, m/z),
    - computes the clustering via computeClustering_(),
    - records for every grid feature the clusters it is a neighbor in, and
    - calls makeConsensusFeature_() until the clustering is empty, appending each
      produced consensus feature to @p result_map.

    @param input_maps Maps whose features are linked; at least two are required.
    @param result_map Output map; reset with clear(false) before results are appended.

    @throws Exception::IllegalArgument if fewer than two input maps are given.
  */
  void QTClusterFinder::run_(const vector<MapType> & input_maps,
                             ConsensusMap & result_map)
  {
    num_maps_ = input_maps.size();
    if (num_maps_ < 2)
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__,
                                       "At least two input maps required");
    }

    // set up the distance functor (and set other parameters):
    // The m/z maximum is read at coordinate index 1 for *every* map. Reading
    // index 0 (presumably RT) for the remaining maps would mix two different
    // dimensions into max_mz.
    DoubleReal max_intensity = input_maps[0].getMaxInt();
    DoubleReal max_mz = input_maps[0].getMax()[1];
    for (Size map_index = 1; map_index < num_maps_; ++map_index)
    {
      max_intensity = max(max_intensity, input_maps[map_index].getMaxInt());
      max_mz = max(max_mz, input_maps[map_index].getMax()[1]);
    }

    setParameters_(max_intensity, max_mz);

    // create the hash grid and fill it with features:
    // grid_features is a std::list so that the element addresses stored in the
    // grid (and later in the clusters) stay valid while more features are added.
    list<GridFeature> grid_features;
    Grid grid(Grid::ClusterCenter(max_diff_rt_, max_diff_mz_));
    for (Size map_index = 0; map_index < num_maps_; ++map_index)
    {
      for (Size feature_index = 0; feature_index < input_maps[map_index].size();
           ++feature_index)
      {
        grid_features.push_back(
          GridFeature(input_maps[map_index][feature_index], map_index,
                      feature_index));
        GridFeature & gfeature = grid_features.back();
        // sort peptide hits once now, instead of multiple times later:
        // NOTE(review): the const_cast removes the constness of what getFeature()
        // returns so the identifications can be sorted in place -- confirm that
        // modifying the referenced feature is acceptable for the callers.
        BaseFeature & feature = const_cast<BaseFeature &>(gfeature.getFeature());
        for (vector<PeptideIdentification>::iterator pep_it =
               feature.getPeptideIdentifications().begin(); pep_it !=
             feature.getPeptideIdentifications().end(); ++pep_it)
        {
          pep_it->sort();
        }
        grid.insert(std::make_pair(Grid::ClusterCenter(gfeature.getRT(), gfeature.getMZ()), &gfeature));
      }
    }

    // compute QT clustering:
    list<QTCluster> clustering;
    computeClustering_(grid, clustering);
    // number of clusters == number of data points:
    Size size = clustering.size();

    // Create a temporary map where we store which GridFeatures are next to which Clusters
    typedef std::multimap<DoubleReal, GridFeature *> InnerNeighborMap;
    typedef OpenMSBoost::unordered_map<Size, InnerNeighborMap> NeighborMap;
    OpenMSBoost::unordered_map<GridFeature *, std::vector<QTCluster *> > element_mapping;
    for (list<QTCluster>::iterator it = clustering.begin(); it != clustering.end(); ++it)
    {
      // The neighbors are only read here; binding to a const reference avoids a
      // copy if the accessor returns a reference and is equally valid (lifetime
      // extension) if it returns by value.
      const NeighborMap & neigh = it->getNeighbors();
      for (NeighborMap::const_iterator n_it = neigh.begin(); n_it != neigh.end(); ++n_it)
      {
        for (InnerNeighborMap::const_iterator i_it = n_it->second.begin(); i_it != n_it->second.end(); ++i_it)
        {
          element_mapping[i_it->second].push_back(&(*it));
        }
      }
    }

    ProgressLogger logger;
    logger.setLogType(ProgressLogger::CMD);
    logger.startProgress(0, size, "linking features");
    Size progress = 0;
    result_map.clear(false);

    while (!clustering.empty())
    {
      ConsensusFeature consensus_feature;
      makeConsensusFeature_(clustering, consensus_feature, element_mapping);
      // Only keep the feature if clusters remain after the call.
      // NOTE(review): presumably makeConsensusFeature_() empties the clustering
      // when it found nothing to build a feature from -- confirm against its
      // implementation.
      if (!clustering.empty())
      {
        result_map.push_back(consensus_feature);
      }
      logger.setProgress(progress++);
    }

    logger.endProgress();
  }