Exemplo n.º 1
0
  ExitCodes main_(int, const char**)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    StringList in = getStringList_("in");
    String edta = getStringOption_("pos");
    String out = getStringOption_("out");
    String out_sep = getStringOption_("out_separator");
    String out_TIC_debug = getStringOption_("auto_rt:out_debug_TIC");

    StringList in_header = getStringList_("in_header");


    // number of out_debug_TIC files and input files must be identical
    /*if (out_TIC_debug.size() > 0 && in.size() != out_TIC_debug.size())
    {
        LOG_FATAL_ERROR << "Error: number of input file 'in' and auto_rt:out_debug_TIC files must be identical!" << std::endl;
        return ILLEGAL_PARAMETERS;
    }*/

    // number of header files and input files must be identical
    if (in_header.size() > 0 && in.size() != in_header.size())
    {
      LOG_FATAL_ERROR << "Error: number of input file 'in' and 'in_header' files must be identical!" << std::endl;
      return ILLEGAL_PARAMETERS;
    }

    if (!getFlag_("auto_rt:enabled") && !out_TIC_debug.empty())
    {
      LOG_FATAL_ERROR << "Error: TIC output file requested, but auto_rt is not enabled! Either do not request the file or switch on 'auto_rt:enabled'." << std::endl;
      return ILLEGAL_PARAMETERS;
    }

    double rttol = getDoubleOption_("rt_tol");
    double mztol = getDoubleOption_("mz_tol");
    Size rt_collect = getIntOption_("rt_collect");

    //-------------------------------------------------------------
    // loading input
    //-------------------------------------------------------------
    MzMLFile mzml_file;
    mzml_file.setLogType(log_type_);
    MSExperiment<Peak1D> exp, exp_pp;

    EDTAFile ed;
    ConsensusMap cm;
    ed.load(edta, cm);

    StringList tf_single_header0, tf_single_header1, tf_single_header2; // header content, for each column

    std::vector<String> vec_single; // one line for each compound, multiple columns per experiment
    vec_single.resize(cm.size());
    for (Size fi = 0; fi < in.size(); ++fi)
    {
      // load raw data
      mzml_file.load(in[fi], exp);
      exp.sortSpectra(true);

      if (exp.empty())
      {
        LOG_WARN << "The given file does not contain any conventional peak data, but might"
                    " contain chromatograms. This tool currently cannot handle them, sorry." << std::endl;
        return INCOMPATIBLE_INPUT_DATA;
      }

      // try to detect RT peaks (only for the first input file -- all others should align!)
      // cm.size() might change in here...
      if (getFlag_("auto_rt:enabled") && fi == 0)
      {
        ConsensusMap cm_local = cm; // we might have different RT peaks for each map if 'auto_rt' is enabled
        cm.clear(false); // reset global list (about to be filled)

        // compute TIC
        MSChromatogram<> tic = exp.getTIC();
        MSSpectrum<> tics, tic_gf, tics_pp, tics_sn;
        for (Size ic = 0; ic < tic.size(); ++ic)
        { // rewrite Chromatogram to MSSpectrum (GaussFilter requires it)
          Peak1D peak;
          peak.setMZ(tic[ic].getRT());
          peak.setIntensity(tic[ic].getIntensity());
          tics.push_back(peak);
        }
        // smooth (no PP_CWT here due to efficiency reasons -- large FWHM take longer!)
        double fwhm = getDoubleOption_("auto_rt:FHWM");
        GaussFilter gf;
        Param p = gf.getParameters();
        p.setValue("gaussian_width", fwhm * 2); // wider than FWHM, just to be sure we have a fully smoothed peak. Merging two peaks is unlikely
        p.setValue("use_ppm_tolerance", "false");
        gf.setParameters(p);
        tic_gf = tics;
        gf.filter(tic_gf);
        // pick peaks
        PeakPickerHiRes pp;
        p = pp.getParameters();
        p.setValue("signal_to_noise", getDoubleOption_("auto_rt:SNThreshold"));
        pp.setParameters(p);
        pp.pick(tic_gf, tics_pp);

        if (tics_pp.size())
        {
          LOG_INFO << "Found " << tics_pp.size() << " auto-rt peaks at: ";
          for (Size ipp = 0; ipp != tics_pp.size(); ++ipp) LOG_INFO << " " << tics_pp[ipp].getMZ();
        }
        else
        {
          LOG_INFO << "Found no auto-rt peaks. Change threshold parameters!";
        }
        LOG_INFO << std::endl;

        if (!out_TIC_debug.empty()) // if debug file was given
        { // store intermediate steps for debug
          MSExperiment<> out_debug;
          out_debug.addChromatogram(toChromatogram(tics));
          out_debug.addChromatogram(toChromatogram(tic_gf));

          SignalToNoiseEstimatorMedian<MSSpectrum<> > snt;
          snt.init(tics);
          for (Size is = 0; is < tics.size(); ++is)
          {
            Peak1D peak;
            peak.setMZ(tic[is].getMZ());
            peak.setIntensity(snt.getSignalToNoise(tics[is]));
            tics_sn.push_back(peak);
          }
          out_debug.addChromatogram(toChromatogram(tics_sn));

          out_debug.addChromatogram(toChromatogram(tics_pp));
          // get rid of "native-id" missing warning
          for (Size id = 0; id < out_debug.size(); ++id) out_debug[id].setNativeID(String("spectrum=") + id);

          mzml_file.store(out_TIC_debug, out_debug);
          LOG_DEBUG << "Storing debug AUTO-RT: " << out_TIC_debug << std::endl;
        }

        // add target EICs: for each m/z with no/negative RT, add all combinations of that m/z with auto-RTs
        // duplicate m/z entries will be ignored!
        // all other lines with positive RT values are copied unaffected
        //do not allow doubles
        std::set<double> mz_doubles;
        for (ConsensusMap::Iterator cit = cm_local.begin(); cit != cm_local.end(); ++cit)
        {
          if (cit->getRT() < 0)
          {
            if (mz_doubles.find(cit->getMZ()) == mz_doubles.end())
            {
              mz_doubles.insert(cit->getMZ());
            }
            else
            {
              LOG_INFO << "Found duplicate m/z entry (" << cit->getMZ() << ") for auto-rt. Skipping ..." << std::endl;
              continue;
            }

            ConsensusMap cm_RT_multiplex;
            for (MSSpectrum<>::ConstIterator itp = tics_pp.begin(); itp != tics_pp.end(); ++itp)
            {
              ConsensusFeature f = *cit;
              f.setRT(itp->getMZ());
              cm.push_back(f);
            }

          }
          else
          { // default feature with no auto-rt
            LOG_INFO << "copying feature with RT " << cit->getRT() << std::endl;
            cm.push_back(*cit);
          }
        }

        // resize, since we have more positions now
        vec_single.resize(cm.size());
      }


      // search for each EIC and add up
      Int not_found(0);
      Map<Size, double> quant;

      String description;
      if (fi < in_header.size())
      {
        HeaderInfo info(in_header[fi]);
        description = info.header_description;
      }

      if (fi == 0)
      { // two additional columns for first file (theoretical RT and m/z)
        tf_single_header0 << "" << "";
        tf_single_header1 << "" << "";
        tf_single_header2 << "RT" << "mz";
      }

      // 5 entries for each input file
      tf_single_header0 << File::basename(in[fi]) << "" << "" << "" << "";
      tf_single_header1 << description << "" << "" << "" << "";
      tf_single_header2 << "RTobs" << "dRT" << "mzobs" << "dppm" << "intensity";

      for (Size i = 0; i < cm.size(); ++i)
      {
        //std::cerr << "Rt" << cm[i].getRT() << "  mz: " << cm[i].getMZ() << " R " <<  cm[i].getMetaValue("rank") << "\n";

        double mz_da = mztol * cm[i].getMZ() / 1e6; // mz tolerance in Dalton
        MSExperiment<>::ConstAreaIterator it = exp.areaBeginConst(cm[i].getRT() - rttol / 2,
                                                                  cm[i].getRT() + rttol / 2,
                                                                  cm[i].getMZ() - mz_da,
                                                                  cm[i].getMZ() + mz_da);
        Peak2D max_peak;
        max_peak.setIntensity(0);
        max_peak.setRT(cm[i].getRT());
        max_peak.setMZ(cm[i].getMZ());
        for (; it != exp.areaEndConst(); ++it)
        {
          if (max_peak.getIntensity() < it->getIntensity())
          {
            max_peak.setIntensity(it->getIntensity());
            max_peak.setRT(it.getRT());
            max_peak.setMZ(it->getMZ());
          }
        }
        double ppm = 0; // observed m/z offset

        if (max_peak.getIntensity() == 0)
        {
          ++not_found;
        }
        else
        {
          // take median for m/z found
          std::vector<double> mz;
          MSExperiment<>::Iterator itm = exp.RTBegin(max_peak.getRT());
          SignedSize low = std::min<SignedSize>(std::distance(exp.begin(), itm), rt_collect);
          SignedSize high = std::min<SignedSize>(std::distance(itm, exp.end()) - 1, rt_collect);
          MSExperiment<>::AreaIterator itt = exp.areaBegin((itm - low)->getRT() - 0.01, (itm + high)->getRT() + 0.01, cm[i].getMZ() - mz_da, cm[i].getMZ() + mz_da);
          for (; itt != exp.areaEnd(); ++itt)
          {
            mz.push_back(itt->getMZ());
            //std::cerr << "ppm: " << itt.getRT() << " " <<  itt->getMZ() << " " << itt->getIntensity() << std::endl;
          }

          if ((SignedSize)mz.size() > (low + high + 1)) LOG_WARN << "Compound " << i << " has overlapping peaks [" << mz.size() << "/" << low + high + 1 << "]" << std::endl;

          if (!mz.empty())
          {
            double avg_mz = std::accumulate(mz.begin(), mz.end(), 0.0) / double(mz.size());
            //std::cerr << "avg: " << avg_mz << "\n";
            ppm = (avg_mz - cm[i].getMZ()) / cm[i].getMZ() * 1e6;
          }

        }

        // appending the second column set requires separator
        String append_sep = (fi == 0 ? "" : out_sep);

        vec_single[i] += append_sep; // new line
        if (fi == 0)
        {
          vec_single[i] += String(cm[i].getRT()) + out_sep +
                           String(cm[i].getMZ()) + out_sep;
        }
        vec_single[i] += String(max_peak.getRT()) + out_sep +
                         String(max_peak.getRT() - cm[i].getRT()) + out_sep +
                         String(max_peak.getMZ()) + out_sep +
                         String(ppm)  + out_sep +
                         String(max_peak.getIntensity());
      }

      if (not_found) LOG_INFO << "Missing peaks for " << not_found << " compounds in file '" << in[fi] << "'.\n";
    }

    //-------------------------------------------------------------
    // create header
    //-------------------------------------------------------------
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header2, out_sep));
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header1, out_sep));
    vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header0, out_sep));

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------
    TextFile tf;
    for (std::vector<String>::iterator v_it = vec_single.begin(); v_it != vec_single.end(); ++v_it)
    {
      tf.addLine(*v_it);
    }
    tf.store(out);

    return EXECUTION_OK;
  }
Exemplo n.º 2
0
  void PeakPickerHiRes::pick(const MSSpectrum& input, MSSpectrum& output, std::vector<PeakBoundary>& boundaries, bool check_spacings) const
  {
    // copy meta data of the input spectrum
    output.clear(true);
    output.SpectrumSettings::operator=(input);
    output.MetaInfoInterface::operator=(input);
    output.setRT(input.getRT());
    output.setMSLevel(input.getMSLevel());
    output.setName(input.getName());
    output.setType(SpectrumSettings::CENTROID);
    if (report_FWHM_)
    {
      output.getFloatDataArrays().resize(1);
      output.getFloatDataArrays()[0].setName( report_FWHM_as_ppm_ ? "FWHM_ppm" : "FWHM");
    }

    // don't pick a spectrum with less than 5 data points
    if (input.size() < 5) return;

    // if both spacing constraints are disabled, don't check spacings at all:
    if ((spacing_difference_ == std::numeric_limits<double>::infinity()) &&
      (spacing_difference_gap_ == std::numeric_limits<double>::infinity()))
    {
      check_spacings = false;
    }

    // signal-to-noise estimation
    SignalToNoiseEstimatorMedian<MSSpectrum > snt;
    snt.setParameters(param_.copy("SignalToNoise:", true));

    if (signal_to_noise_ > 0.0)
    {
      snt.init(input);
    }

    // find local maxima in profile data
    for (Size i = 2; i < input.size() - 2; ++i)
    {
      double central_peak_mz = input[i].getMZ(), central_peak_int = input[i].getIntensity();
      double left_neighbor_mz = input[i - 1].getMZ(), left_neighbor_int = input[i - 1].getIntensity();
      double right_neighbor_mz = input[i + 1].getMZ(), right_neighbor_int = input[i + 1].getIntensity();

      // do not interpolate when the left or right support is a zero-data-point
      if (std::fabs(left_neighbor_int) < std::numeric_limits<double>::epsilon()) continue;
      if (std::fabs(right_neighbor_int) < std::numeric_limits<double>::epsilon()) continue;

      // MZ spacing sanity checks
      double left_to_central = 0.0, central_to_right = 0.0, min_spacing = 0.0;
      if (check_spacings)
      {
        left_to_central = central_peak_mz - left_neighbor_mz;
        central_to_right = right_neighbor_mz - central_peak_mz;
        min_spacing = (left_to_central < central_to_right) ? left_to_central : central_to_right;
      }

      double act_snt = 0.0, act_snt_l1 = 0.0, act_snt_r1 = 0.0;
      if (signal_to_noise_ > 0.0)
      {
        act_snt = snt.getSignalToNoise(input[i]);
        act_snt_l1 = snt.getSignalToNoise(input[i - 1]);
        act_snt_r1 = snt.getSignalToNoise(input[i + 1]);
      }

      // look for peak cores meeting MZ and intensity/SNT criteria
      if ((central_peak_int > left_neighbor_int) && 
        (central_peak_int > right_neighbor_int) && 
        (act_snt >= signal_to_noise_) && 
        (act_snt_l1 >= signal_to_noise_) && 
        (act_snt_r1 >= signal_to_noise_) &&
        (!check_spacings || 
        ((left_to_central < spacing_difference_ * min_spacing) && 
          (central_to_right < spacing_difference_ * min_spacing))))
      {
        // special case: if a peak core is surrounded by more intense
        // satellite peaks (indicates oscillation rather than
        // real peaks) -> remove

        double act_snt_l2 = 0.0, act_snt_r2 = 0.0;

        if (signal_to_noise_ > 0.0)
        {
          act_snt_l2 = snt.getSignalToNoise(input[i - 2]);
          act_snt_r2 = snt.getSignalToNoise(input[i + 2]);
        }

        // checking signal-to-noise?
        if ((i > 1) &&
          (i + 2 < input.size()) &&
          (left_neighbor_int < input[i - 2].getIntensity()) &&
          (right_neighbor_int < input[i + 2].getIntensity()) &&
          (act_snt_l2 >= signal_to_noise_) &&
          (act_snt_r2 >= signal_to_noise_) &&
          (!check_spacings ||
          ((left_neighbor_mz - input[i - 2].getMZ() < spacing_difference_ * min_spacing) && 
            (input[i + 2].getMZ() - right_neighbor_mz < spacing_difference_ * min_spacing))))
        {
          ++i;
          continue;
        }

        std::map<double, double> peak_raw_data;

        peak_raw_data[central_peak_mz] = central_peak_int;
        peak_raw_data[left_neighbor_mz] = left_neighbor_int;
        peak_raw_data[right_neighbor_mz] = right_neighbor_int;

        // peak core found, now extend it
        // to the left
        Size k = 2;

        bool previous_zero_left(false); // no need to extend peak if previous intensity was zero
        Size missing_left(0);
        Size left_boundary(i - 1); // index of the left boundary for the spline interpolation

        while ((k <= i) && // prevent underflow
          (i - k + 1 > 0) && 
          !previous_zero_left && 
          (missing_left <= missing_) && 
          (input[i - k].getIntensity() <= peak_raw_data.begin()->second) &&
          (!check_spacings || 
          (peak_raw_data.begin()->first - input[i - k].getMZ() < spacing_difference_gap_ * min_spacing)))
        {
          double act_snt_lk = 0.0;

          if (signal_to_noise_ > 0.0)
          {
            act_snt_lk = snt.getSignalToNoise(input[i - k]);
          }

          if ((act_snt_lk >= signal_to_noise_) && 
            (!check_spacings ||
            (peak_raw_data.begin()->first - input[i - k].getMZ() < spacing_difference_ * min_spacing)))
          {
            peak_raw_data[input[i - k].getMZ()] = input[i - k].getIntensity();
          }
          else
          {
            ++missing_left;
            if (missing_left <= missing_)
            {
              peak_raw_data[input[i - k].getMZ()] = input[i - k].getIntensity();
            }
          }

          previous_zero_left = (input[i - k].getIntensity() == 0);
          left_boundary = i - k;
          ++k;
        }

        // to the right
        k = 2;

        bool previous_zero_right(false); // no need to extend peak if previous intensity was zero
        Size missing_right(0);
        Size right_boundary(i+1); // index of the right boundary for the spline interpolation

        while ((i + k < input.size()) && 
          !previous_zero_right && 
          (missing_right <= missing_) && 
          (input[i + k].getIntensity() <= peak_raw_data.rbegin()->second) &&
          (!check_spacings ||
          (input[i + k].getMZ() - peak_raw_data.rbegin()->first < spacing_difference_gap_ * min_spacing)))
        {
          double act_snt_rk = 0.0;

          if (signal_to_noise_ > 0.0)
          {
            act_snt_rk = snt.getSignalToNoise(input[i + k]);
          }

          if ((act_snt_rk >= signal_to_noise_) && 
            (!check_spacings ||
            (input[i + k].getMZ() - peak_raw_data.rbegin()->first < spacing_difference_ * min_spacing)))
          {
            peak_raw_data[input[i + k].getMZ()] = input[i + k].getIntensity();
          }
          else
          {
            ++missing_right;
            if (missing_right <= missing_)
            {
              peak_raw_data[input[i + k].getMZ()] = input[i + k].getIntensity();
            }
          }

          previous_zero_right = (input[i + k].getIntensity() == 0);
          right_boundary = i + k;
          ++k;
        }

        // skip if the minimal number of 3 points for fitting is not reached
        if (peak_raw_data.size() < 3) continue;

        CubicSpline2d peak_spline (peak_raw_data);

        // calculate maximum by evaluating the spline's 1st derivative
        // (bisection method)
        double max_peak_mz = central_peak_mz;
        double max_peak_int = central_peak_int;
        double threshold = 1e-6;
        OpenMS::Math::spline_bisection(peak_spline, left_neighbor_mz, right_neighbor_mz, max_peak_mz, max_peak_int, threshold);

        //
        // compute FWHM
        //
        if (report_FWHM_)
        {
          double fwhm_int = max_peak_int / 2.0;
          threshold = 0.01 * fwhm_int;
          double mz_mid, int_mid; 
          // left:
          double mz_left = peak_raw_data.begin()->first;
          double mz_center = max_peak_mz;
          if (peak_spline.eval(mz_left) > fwhm_int)
          { // the spline ends before half max is reached -- take the leftmost point (probably an underestimation)
            mz_mid = mz_left;
          }
          else
          {
            do 
            {
              mz_mid = mz_left / 2 + mz_center / 2;
              int_mid = peak_spline.eval(mz_mid);
              if (int_mid < fwhm_int)
              {
                mz_left = mz_mid;
              }
              else
              {
                mz_center = mz_mid;
              }
            } while (fabs(int_mid - fwhm_int) > threshold);
          }
          const double fwhm_left_mz = mz_mid;

          // right ...
          double mz_right = peak_raw_data.rbegin()->first;
          mz_center = max_peak_mz;
          if (peak_spline.eval(mz_right) > fwhm_int)
          { // the spline ends before half max is reached -- take the rightmost point (probably an underestimation)
            mz_mid = mz_right;
          }
          else
          {
            do 
            {
              mz_mid = mz_right / 2 + mz_center / 2;
              int_mid = peak_spline.eval(mz_mid);
              if (int_mid < fwhm_int)
              {
                mz_right = mz_mid;
              }
              else
              {
                mz_center = mz_mid;
              }

            } while (fabs(int_mid - fwhm_int) > threshold);
          }
          const double fwhm_right_mz = mz_mid;
          const double fwhm_absolute = fwhm_right_mz - fwhm_left_mz;
          output.getFloatDataArrays()[0].push_back( report_FWHM_as_ppm_ ? fwhm_absolute / max_peak_mz  * 1e6 : fwhm_absolute);
        } // FWHM

          // save picked peak into output spectrum
        Peak1D peak;
        PeakBoundary peak_boundary;
        peak.setMZ(max_peak_mz);
        peak.setIntensity(max_peak_int);
        peak_boundary.mz_min = input[left_boundary].getMZ();
        peak_boundary.mz_max = input[right_boundary].getMZ();
        output.push_back(peak);

        boundaries.push_back(peak_boundary);

        // jump over profile data points that have been considered already
        i = i + k - 1;
      }
    }

    return;
  }