コード例 #1
0
ファイル: FileMerger.cpp プロジェクト: chahuistle/OpenMS
 void adjustRetentionTimes_(MapType& map, const String& trafo_out,
                            bool first_file)
 {
   map.updateRanges();
   TransformationDescription trafo;
   if (first_file) // no transformation necessary
   {
     rt_offset_ = map.getMax()[0] + rt_gap_;
     trafo.fitModel("identity");
   }
   else // subsequent file -> apply transformation
   {
     TransformationDescription::DataPoints points(2);
     double rt_min = map.getMin()[0], rt_max = map.getMax()[0];
     points[0] = make_pair(rt_min, rt_offset_);
     rt_offset_ += rt_max - rt_min;
     points[1] = make_pair(rt_max, rt_offset_);
     trafo.setDataPoints(points);
     trafo.fitModel("linear");
     MapAlignmentTransformer::transformRetentionTimes(map, trafo, true);
     rt_offset_ += rt_gap_;
   }
   if (!trafo_out.empty())
   {
     TransformationXMLFile().store(trafo_out, trafo);
   }
 }
  void MapAlignmentAlgorithmPoseClustering::align(const ConsensusMap & map, TransformationDescription & trafo)
  {
    // TODO: move this to updateMembers_? (if consensusMap prevails)
    // TODO: why does superimposer work on consensus map???
    const ConsensusMap & map_model = reference_;
    ConsensusMap map_scene = map;

    // run superimposer to find the global transformation
    TransformationDescription si_trafo;
    superimposer_.run(map_model, map_scene, si_trafo);

    // apply transformation to consensus features and contained feature
    // handles
    for (Size j = 0; j < map_scene.size(); ++j)
    {
      //Calculate new RT
      double rt = map_scene[j].getRT();
      rt = si_trafo.apply(rt);
      //Set RT of consensus feature centroid
      map_scene[j].setRT(rt);
      //Set RT of consensus feature handles
      map_scene[j].begin()->asMutable().setRT(rt);
    }

    //run pairfinder to find pairs
    ConsensusMap result;
    //TODO: add another 2map interface to pairfinder?
    std::vector<ConsensusMap> input(2);
    input[0] = map_model;
    input[1] = map_scene;
    pairfinder_.run(input, result);

    // calculate the local transformation
    si_trafo.invert();         // to undo the transformation applied above
    TransformationDescription::DataPoints data;
    for (ConsensusMap::Iterator it = result.begin(); it != result.end();
         ++it)
    {
      if (it->size() == 2)           // two matching features
      {
        ConsensusFeature::iterator feat_it = it->begin();
        double y = feat_it->getRT();
        double x = si_trafo.apply((++feat_it)->getRT());
        // one feature should be from the reference map:
        if (feat_it->getMapIndex() != 0)
        {
          data.push_back(make_pair(x, y));
        }
        else
        {
          data.push_back(make_pair(y, x));
        }
      }
    }
    trafo = TransformationDescription(data);
    trafo.fitModel("linear");
  }
コード例 #3
0
  void MapAlignmentAlgorithmIdentification::computeTransformations_(
    vector<SeqToList>& rt_data, vector<TransformationDescription>& transforms,
    bool sorted)
  {
    Int size = rt_data.size(); // not Size because we compare to Ints later
    transforms.clear();

    // filter RT data (remove peptides that elute in several fractions):
    // TODO

    // compute RT medians:
    LOG_DEBUG << "Computing RT medians..." << endl;
    vector<SeqToValue> medians_per_run(size);
    for (Int i = 0; i < size; ++i)
    {
      computeMedians_(rt_data[i], medians_per_run[i], sorted);
    }
    SeqToList medians_per_seq;
    for (vector<SeqToValue>::iterator run_it = medians_per_run.begin();
         run_it != medians_per_run.end(); ++run_it)
    {
      for (SeqToValue::iterator med_it = run_it->begin();
           med_it != run_it->end(); ++med_it)
      {
        medians_per_seq[med_it->first].push_back(med_it->second);
      }
    }

    // get reference retention time scale: either directly from reference file,
    // or compute consensus time scale
    bool reference_given = !reference_.empty(); // reference file given
    if (reference_given)
    {
      // remove peptides that don't occur in enough runs:
      LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl;
      SeqToValue temp;
      for (SeqToValue::iterator ref_it = reference_.begin();
           ref_it != reference_.end(); ++ref_it)
      {
        SeqToList::iterator med_it = medians_per_seq.find(ref_it->first);
        if ((med_it != medians_per_seq.end()) &&
            (med_it->second.size() + 1 >= min_run_occur_))
        {
          temp.insert(temp.end(), *ref_it); // new items should go at the end
        }
      }
      LOG_DEBUG << "Removed " << reference_.size() - temp.size() << " of "
                << reference_.size() << " peptides." << endl;
      temp.swap(reference_);
    }
    else // compute overall RT median per sequence (median of medians per run)
    {
      LOG_DEBUG << "Computing overall RT medians per sequence..." << endl;

      // remove peptides that don't occur in enough runs (at least two):
      LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl;
      SeqToList temp;
      for (SeqToList::iterator med_it = medians_per_seq.begin();
           med_it != medians_per_seq.end(); ++med_it)
      {
        if (med_it->second.size() >= min_run_occur_)
        {
          temp.insert(temp.end(), *med_it);
        }
      }
      LOG_DEBUG << "Removed " << medians_per_seq.size() - temp.size() << " of "
                << medians_per_seq.size() << " peptides." << endl;
      temp.swap(medians_per_seq);
      computeMedians_(medians_per_seq, reference_);
    }
    if (reference_.empty())
    {
      throw Exception::MissingInformation(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No reference RT information left after filtering");
    }

    double max_rt_shift = param_.getValue("max_rt_shift");
    if (max_rt_shift <= 1)
    {
      // compute max. allowed shift from overall retention time range:
      double rt_min = numeric_limits<double>::infinity(), rt_max = -rt_min;
      for (SeqToValue::iterator it = reference_.begin(); it != reference_.end();
           ++it)
      {
        rt_min = min(rt_min, it->second);
        rt_max = max(rt_max, it->second);
      }
      double rt_range = rt_max - rt_min;
      max_rt_shift *= rt_range;
      // in the degenerate case of only one reference point, "max_rt_shift"
      // should be zero (because "rt_range" is zero) - this is covered below
    }
    if (max_rt_shift == 0)
    {
      max_rt_shift = numeric_limits<double>::max();
    }
    LOG_DEBUG << "Max. allowed RT shift (in seconds): " << max_rt_shift << endl;

    // generate RT transformations:
    LOG_DEBUG << "Generating RT transformations..." << endl;
    LOG_INFO << "\nAlignment based on:" << endl; // diagnostic output
    Size offset = 0; // offset in case of internal reference
    for (Int i = 0; i < size + 1; ++i)
    {
      if (i == reference_index_)
      {
        // if one of the input maps was used as reference, it has been skipped
        // so far - now we have to consider it again:
        TransformationDescription trafo;
        trafo.fitModel("identity");
        transforms.push_back(trafo);
        LOG_INFO << "- " << reference_.size() << " data points for sample "
                 << i + 1 << " (reference)\n";
        offset = 1;
      }
      if (i >= size) break;

      // to be useful for the alignment, a peptide sequence has to occur in the
      // current run ("medians_per_run[i]"), but also in at least one other run
      // ("medians_overall"):
      TransformationDescription::DataPoints data;
      Size n_outliers = 0;
      for (SeqToValue::iterator med_it = medians_per_run[i].begin();
           med_it != medians_per_run[i].end(); ++med_it)
      {
        SeqToValue::const_iterator pos = reference_.find(med_it->first);
        if (pos != reference_.end())
        {
          if (abs(med_it->second - pos->second) <= max_rt_shift)
          { // found, and satisfies "max_rt_shift" condition!
            TransformationDescription::DataPoint point(med_it->second,
                                                       pos->second, pos->first);
            data.push_back(point);
          }
          else
          {
            n_outliers++;
          }
        }
      }
      transforms.push_back(TransformationDescription(data));
      LOG_INFO << "- " << data.size() << " data points for sample "
               << i + offset + 1;
      if (n_outliers) LOG_INFO << " (" << n_outliers << " outliers removed)";
      LOG_INFO << "\n";
    }
    LOG_INFO << endl;

    // delete temporary reference
    if (!reference_given) reference_.clear();
  }
コード例 #4
0
  ExitCodes main_(int, const char**)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    String in = getStringOption_("in");
    String out = getStringOption_("out");
    String trafo_in = getStringOption_("trafo_in");
    String trafo_out = getStringOption_("trafo_out");
    Param model_params = getParam_().copy("model:", true);
    String model_type = model_params.getValue("type");
    model_params = model_params.copy(model_type + ":", true);

    ProgressLogger progresslogger;
    progresslogger.setLogType(log_type_);

    //-------------------------------------------------------------
    // check for valid input
    //-------------------------------------------------------------
    if (out.empty() && trafo_out.empty())
    {
      writeLog_("Error: Either a data or a transformation output file has to be provided (parameters 'out'/'trafo_out')");
      return ILLEGAL_PARAMETERS;
    }
    if (in.empty() != out.empty())
    {
      writeLog_("Error: Data input and output parameters ('in'/'out') must be used together");
      return ILLEGAL_PARAMETERS;
    }

    //-------------------------------------------------------------
    // apply transformation
    //-------------------------------------------------------------
    TransformationXMLFile trafoxml;
    TransformationDescription trafo;
    trafoxml.load(trafo_in, trafo);
    if (model_type != "none")
    {
      trafo.fitModel(model_type, model_params);
    }
    if (getFlag_("invert"))
    {
      trafo.invert();
    }
    if (!trafo_out.empty())
    {
      trafoxml.store(trafo_out, trafo);
    }
    if (!in.empty()) // load input
    {
      FileTypes::Type in_type = FileHandler::getType(in);
      if (in_type == FileTypes::MZML)
      {
        MzMLFile file;
        MSExperiment<> map;
        applyTransformation_(in, out, trafo, file, map);
      }
      else if (in_type == FileTypes::FEATUREXML)
      {
        FeatureXMLFile file;
        FeatureMap map;
        applyTransformation_(in, out, trafo, file, map);
      }
      else if (in_type == FileTypes::CONSENSUSXML)
      {
        ConsensusXMLFile file;
        ConsensusMap map;
        applyTransformation_(in, out, trafo, file, map);
      }
      else if (in_type == FileTypes::IDXML)
      {
        IdXMLFile file;
        vector<ProteinIdentification> proteins;
        vector<PeptideIdentification> peptides;
        file.load(in, proteins, peptides);
        bool store_original_rt = getFlag_("store_original_rt");
        MapAlignmentTransformer::transformRetentionTimes(peptides, trafo,
                                                         store_original_rt);
        // no "data processing" section in idXML
        file.store(out, proteins, peptides);
      }
    }

    return EXECUTION_OK;
  }
コード例 #5
0
  ExitCodes main_(int, const char**) override
  {
    ExitCodes ret = TOPPMapAlignerBase::checkParameters_();
    if (ret != EXECUTION_OK) return ret;

    MapAlignmentAlgorithmPoseClustering algorithm;
    Param algo_params = getParam_().copy("algorithm:", true);
    algorithm.setParameters(algo_params);
    algorithm.setLogType(log_type_);

    StringList in_files = getStringList_("in");
    StringList out_files = getStringList_("out");
    StringList out_trafos = getStringList_("trafo_out");

    Size reference_index = getIntOption_("reference:index");
    String reference_file = getStringOption_("reference:file");

    FileTypes::Type in_type = FileHandler::getType(in_files[0]);
    String file;
    if (!reference_file.empty())
    {
      file = reference_file;
      reference_index = in_files.size(); // points to invalid index
    }
    else if (reference_index > 0) // normal reference (index was checked before)
    {
      file = in_files[--reference_index]; // ref. index is 1-based in parameters, but should be 0-based here
    }
    else if (reference_index == 0) // no reference given
    {
      LOG_INFO << "Picking a reference (by size) ..." << std::flush;
      // use map with highest number of features as reference:
      Size max_count(0);
      FeatureXMLFile f;
      for (Size i = 0; i < in_files.size(); ++i)
      {
        Size s = 0;
        if (in_type == FileTypes::FEATUREXML) 
        {
          s = f.loadSize(in_files[i]);
        }
        else if (in_type == FileTypes::MZML) // this is expensive!
        {
          PeakMap exp;
          MzMLFile().load(in_files[i], exp);
          exp.updateRanges(1);
          s = exp.getSize();
        }
        if (s > max_count)
        {
          max_count = s;
          reference_index = i;
        }
      }
      LOG_INFO << " done" << std::endl;
      file = in_files[reference_index];
    }

    FeatureXMLFile f_fxml;
    if (out_files.empty()) // no need to store featureXML, thus we can load only minimum required information
    {
      f_fxml.getOptions().setLoadConvexHull(false);
      f_fxml.getOptions().setLoadSubordinates(false);
    }
    if (in_type == FileTypes::FEATUREXML)
    {
      FeatureMap map_ref;
      FeatureXMLFile f_fxml_tmp; // for the reference, we never need CH or subordinates
      f_fxml_tmp.getOptions().setLoadConvexHull(false);
      f_fxml_tmp.getOptions().setLoadSubordinates(false);
      f_fxml_tmp.load(file, map_ref);
      algorithm.setReference(map_ref);
    }
    else if (in_type == FileTypes::MZML)
    {
      PeakMap map_ref;
      MzMLFile().load(file, map_ref);
      algorithm.setReference(map_ref);
    }

    ProgressLogger plog;
    plog.setLogType(log_type_);

    plog.startProgress(0, in_files.size(), "Aligning input maps");
    Size progress(0); // thread-safe progress
    // TODO: it should all work on featureXML files, since we might need them for output anyway. Converting to consensusXML is just wasting memory!
#ifdef _OPENMP
#pragma omp parallel for schedule(dynamic, 1)
#endif
    for (int i = 0; i < static_cast<int>(in_files.size()); ++i)
    {
      TransformationDescription trafo;
      if (in_type == FileTypes::FEATUREXML)
      {
        FeatureMap map;
        // workaround for loading: use temporary FeatureXMLFile since it is not thread-safe
        FeatureXMLFile f_fxml_tmp; // do not use OMP-firstprivate, since FeatureXMLFile has no copy c'tor
        f_fxml_tmp.getOptions() = f_fxml.getOptions();
        f_fxml_tmp.load(in_files[i], map);
        if (i == static_cast<int>(reference_index)) trafo.fitModel("identity");
        else algorithm.align(map, trafo);
        if (out_files.size())
        {
          MapAlignmentTransformer::transformRetentionTimes(map, trafo);
          // annotate output with data processing info
          addDataProcessing_(map, getProcessingInfo_(DataProcessing::ALIGNMENT));
          f_fxml_tmp.store(out_files[i], map);
        }
      }
      else if (in_type == FileTypes::MZML)
      {
        PeakMap map;
        MzMLFile().load(in_files[i], map);
        if (i == static_cast<int>(reference_index)) trafo.fitModel("identity");
        else algorithm.align(map, trafo);
        if (out_files.size())
        {
          MapAlignmentTransformer::transformRetentionTimes(map, trafo);
          // annotate output with data processing info
          addDataProcessing_(map, getProcessingInfo_(DataProcessing::ALIGNMENT));
          MzMLFile().store(out_files[i], map);
        }
      }

      if (!out_trafos.empty())
      {
        TransformationXMLFile().store(out_trafos[i], trafo);
      }

#ifdef _OPENMP
#pragma omp critical (MAPose_Progress)
#endif
      {
        plog.setProgress(++progress); // thread safe progress counter
      }

    }

    plog.endProgress();
    return EXECUTION_OK;
  }
コード例 #6
0
  ExitCodes main_(int, const char **)
  {
    StringList file_list = getStringList_("in");
    String tr_file_str = getStringOption_("tr");
    String out = getStringOption_("out");
    bool is_swath = getFlag_("is_swath");
    bool ppm = getFlag_("ppm");
    bool extract_MS1 = getFlag_("extract_MS1");
    double min_upper_edge_dist = getDoubleOption_("min_upper_edge_dist");
    double mz_extraction_window = getDoubleOption_("mz_window");
    double rt_extraction_window = getDoubleOption_("rt_window");

    String extraction_function = getStringOption_("extraction_function");

    // If we have a transformation file, trafo will transform the RT in the
    // scoring according to the model. If we dont have one, it will apply the
    // null transformation.
    String trafo_in = getStringOption_("rt_norm");
    TransformationDescription trafo;
    if (trafo_in.size() > 0) 
    {
      TransformationXMLFile trafoxml;

      String model_type = getStringOption_("model:type");
      Param model_params = getParam_().copy("model:", true);
      trafoxml.load(trafo_in, trafo);
      trafo.fitModel(model_type, model_params);
    }
    TransformationDescription trafo_inverse = trafo;
    trafo_inverse.invert();

    const char * tr_file = tr_file_str.c_str();

    MapType out_exp;
    std::vector< OpenMS::MSChromatogram > chromatograms;
    TraMLFile traml;
    OpenMS::TargetedExperiment targeted_exp;

    std::cout << "Loading TraML file" << std::endl;
    traml.load(tr_file, targeted_exp);
    std::cout << "Loaded TraML file" << std::endl;

    // Do parallelization over the different input files
    // Only in OpenMP 3.0 are unsigned loop variables allowed
#ifdef _OPENMP
#pragma omp parallel for
#endif
    for (SignedSize i = 0; i < boost::numeric_cast<SignedSize>(file_list.size()); ++i)
    {
      boost::shared_ptr<PeakMap > exp(new PeakMap);
      MzMLFile f;
      // Logging and output to the console
      // IF_MASTERTHREAD f.setLogType(log_type_); 

      // Find the transitions to extract and extract them
      MapType tmp_out;
      OpenMS::TargetedExperiment transition_exp_used;
      f.load(file_list[i], *exp);
      if (exp->empty() ) { continue; } // if empty, go on
      OpenSwath::SpectrumAccessPtr expptr = SimpleOpenMSSpectraFactory::getSpectrumAccessOpenMSPtr(exp);
      bool do_continue = true;
      if (is_swath)
      {
        do_continue = OpenSwathHelper::checkSwathMapAndSelectTransitions(*exp, targeted_exp, transition_exp_used, min_upper_edge_dist);  
      }
      else
      {
        transition_exp_used = targeted_exp;
      }

#ifdef _OPENMP
#pragma omp critical (OpenSwathChromatogramExtractor_metadata)
#endif
      // after loading the first file, copy the meta data from that experiment
      // this may happen *after* chromatograms were already added to the
      // output, thus we do NOT fill the experiment here but rather store all
      // the chromatograms in the "chromatograms" array and store them in
      // out_exp afterwards.
      if (i == 0) 
      {
        out_exp = *exp;
        out_exp.clear(false);
      }

      std::cout << "Extracting " << transition_exp_used.getTransitions().size() << " transitions" << std::endl;
      std::vector< OpenSwath::ChromatogramPtr > chromatogram_ptrs;
      std::vector< ChromatogramExtractor::ExtractionCoordinates > coordinates;

      // continue if the map is not empty
      if (do_continue)
      {

        // Prepare the coordinates (with or without rt extraction) and then extract the chromatograms
        ChromatogramExtractor extractor;
        if (rt_extraction_window < 0)
        {
          extractor.prepare_coordinates(chromatogram_ptrs, coordinates, transition_exp_used, rt_extraction_window, extract_MS1);
        }
        else
        {
          // Use an rt extraction window of 0.0 which will just write the retention time in start / end positions
          extractor.prepare_coordinates(chromatogram_ptrs, coordinates, transition_exp_used, 0.0, extract_MS1);
          for (std::vector< ChromatogramExtractor::ExtractionCoordinates >::iterator it = coordinates.begin(); it != coordinates.end(); ++it)
          {
            it->rt_start = trafo_inverse.apply(it->rt_start) - rt_extraction_window / 2.0;
            it->rt_end = trafo_inverse.apply(it->rt_end) + rt_extraction_window / 2.0;
          }
        }
        extractor.extractChromatograms(expptr, chromatogram_ptrs, coordinates, 
            mz_extraction_window, ppm, extraction_function);

#ifdef _OPENMP
#pragma omp critical (OpenSwathChromatogramExtractor_insertMS1)
#endif
        {
          // Remove potential meta value indicating cached data
          SpectrumSettings exp_settings = (*exp)[0];
          for (Size j = 0; j < exp_settings.getDataProcessing().size(); j++)
          {
            if (exp_settings.getDataProcessing()[j]->metaValueExists("cached_data"))
            { exp_settings.getDataProcessing()[j]->removeMetaValue("cached_data"); }
          }
          extractor.return_chromatogram(chromatogram_ptrs, coordinates, transition_exp_used, exp_settings, chromatograms, extract_MS1);
        }

      } // end of do_continue
    } // end of loop over all files / end of OpenMP

    // TODO check that no chromatogram IDs occur multiple times !
    
    // store the output
    out_exp.setChromatograms(chromatograms);
    MzMLFile mzf;
    mzf.setLogType(log_type_); 
    addDataProcessing_(out_exp, getProcessingInfo_(DataProcessing::SMOOTHING));
    mzf.store(out, out_exp);

    return EXECUTION_OK;
  }
コード例 #7
0
  void MapAlignmentAlgorithmIdentification::computeTransformations_(
    vector<SeqToList> & rt_data, vector<TransformationDescription> & transforms,
    bool sorted)
  {
    Size size = rt_data.size();
    transforms.clear();

    // filter RT data (remove peptides that elute in several fractions):
    // TODO

    // compute RT medians:
    LOG_DEBUG << "Computing RT medians..." << endl;
    vector<SeqToValue> medians_per_run(size);
    for (Size i = 0; i < size; ++i)
    {
      computeMedians_(rt_data[i], medians_per_run[i], sorted);
    }
    SeqToList medians_per_seq;
    for (vector<SeqToValue>::iterator run_it = medians_per_run.begin();
         run_it != medians_per_run.end(); ++run_it)
    {
      for (SeqToValue::iterator med_it = run_it->begin();
           med_it != run_it->end(); ++med_it)
      {
        medians_per_seq[med_it->first] << med_it->second;
      }
    }

    // get reference retention time scale: either directly from reference file,
    // or compute consensus time scale
    bool reference_given = !reference_.empty();     // reference file given
    if (reference_given)
    {
      // remove peptides that don't occur in enough runs:
      LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl;
      SeqToValue temp;
      SeqToValue::iterator pos = temp.begin();       // to prevent segfault below
      for (SeqToValue::iterator ref_it = reference_.begin();
           ref_it != reference_.end(); ++ref_it)
      {
        SeqToList::iterator med_it = medians_per_seq.find(ref_it->first);
        if ((med_it != medians_per_seq.end()) &&
            (med_it->second.size() + 1 >= min_run_occur_))
        {
          temp.insert(pos, *ref_it);
          pos = --temp.end();           // would cause segfault if "temp" was empty
        }
      }
      temp.swap(reference_);
    }
    else     // compute overall RT median per sequence (median of medians per run)
    {
      LOG_DEBUG << "Computing overall RT medians per sequence..." << endl;

      // remove peptides that don't occur in enough runs (at least two):
      LOG_DEBUG << "Removing peptides that occur in too few runs..." << endl;
      SeqToList temp;
      SeqToList::iterator pos = temp.begin();       // to prevent segfault below
      for (SeqToList::iterator med_it = medians_per_seq.begin();
           med_it != medians_per_seq.end(); ++med_it)
      {
        if (med_it->second.size() >= min_run_occur_)
        {
          temp.insert(pos, *med_it);
          pos = --temp.end();           // would cause segfault if "temp" was empty
        }
      }
      temp.swap(medians_per_seq);
      computeMedians_(medians_per_seq, reference_);
    }

    DoubleReal max_rt_shift = param_.getValue("max_rt_shift");
    if (max_rt_shift == 0)
    {
      max_rt_shift = numeric_limits<DoubleReal>::max();
    }
    else if (max_rt_shift <= 1) // compute max. allowed shift from overall retention time range:
    {
      DoubleReal rt_range, rt_min = reference_.begin()->second,
                 rt_max = rt_min;
      for (SeqToValue::iterator it = ++reference_.begin();
           it != reference_.end(); ++it)
      {
        rt_min = min(rt_min, it->second);
        rt_max = max(rt_max, it->second);
      }
      rt_range = rt_max - rt_min;
      max_rt_shift *= rt_range;
    }
    LOG_DEBUG << "Max. allowed RT shift (in seconds): " << max_rt_shift << endl;

    // generate RT transformations:
    LOG_DEBUG << "Generating RT transformations..." << endl;
    LOG_INFO << "\nAlignment based on:" << endl;     // diagnostic output
    for (Size i = 0, offset = 0; i < size + 1; ++i)
    {
      if (i == reference_index_ - 1)
      {
        // if one of the input maps was used as reference, it has been skipped
        // so far - now we have to consider it again:
        TransformationDescription trafo;
        trafo.fitModel("identity");
        transforms.push_back(trafo);
        LOG_INFO << "- 0 data points for sample " << i + 1 << " (reference)\n";
        offset = 1;
      }

      if (i >= size)
        break;

      // to be useful for the alignment, a peptide sequence has to occur in the
      // current run ("medians_per_run[i]"), but also in at least one other run
      // ("medians_overall"):
      TransformationDescription::DataPoints data;
      for (SeqToValue::iterator med_it = medians_per_run[i].begin();
           med_it != medians_per_run[i].end(); ++med_it)
      {
        SeqToValue::const_iterator pos = reference_.find(med_it->first);
        if ((pos != reference_.end()) &&
            (fabs(med_it->second - pos->second) <= max_rt_shift))
        {         // found, and satisfies "max_rt_shift" condition!
          data.push_back(make_pair(med_it->second, pos->second));
        }
      }
      transforms.push_back(TransformationDescription(data));
      LOG_INFO << "- " << data.size() << " data points for sample "
               << i + offset + 1 << "\n";
    }
    LOG_INFO << endl;

    // delete temporary reference
    if (!reference_given)
      reference_.clear();

  }
コード例 #8
0
  void PoseClusteringShiftSuperimposer::run(const ConsensusMap & map_model, const ConsensusMap & map_scene, TransformationDescription & transformation)
  {
    typedef ConstRefVector<ConsensusMap> PeakPointerArray_;
    typedef Math::LinearInterpolation<double, double> LinearInterpolationType_;

    LinearInterpolationType_ shift_hash_;

    // OLD STUFF
    //    LinearInterpolationType_ scaling_hash_1;
    //    LinearInterpolationType_ scaling_hash_2;
    //    LinearInterpolationType_ shift_hash_;
    //    LinearInterpolationType_ rt_high_hash_;

    /// Maximum deviation in mz of two partner points
    const double mz_pair_max_distance = param_.getValue("mz_pair_max_distance");

    /// Size of each shift bucket
    const double shift_bucket_size = param_.getValue("shift_bucket_size");

    const UInt struc_elem_length_datapoints = 21; // MAGIC ALERT: number of data points in structuring element for tophat filter, which removes baseline from histogram
    const double scaling_histogram_crossing_slope = 3.0; // MAGIC ALERT: used when distinguishing noise level and enriched histogram bins
    const double scaling_cutoff_stdev_multiplier = 1.5; // MAGIC ALERT: multiplier for stdev in cutoff for outliers
    const UInt loops_mean_stdev_cutoff = 3; // MAGIC ALERT: number of loops in stdev cutoff for outliers

    startProgress(0, 100, "shift pose clustering");
    UInt actual_progress = 0;
    setProgress(++actual_progress);

    // Optionally, we will write dumps of the hash table buckets.
    bool do_dump_buckets = false;
    String dump_buckets_basename;
    if (param_.getValue("dump_buckets") != "")
    {
      do_dump_buckets = true;
      dump_buckets_basename = param_.getValue("dump_buckets");
    }
    setProgress(++actual_progress);

    // Even more optionally, we will write dumps of the hashed pairs.
    bool do_dump_pairs = false;
    String dump_pairs_basename;
    if (param_.getValue("dump_pairs") != "")
    {
      do_dump_pairs = true;
      dump_pairs_basename = param_.getValue("dump_pairs");
    }
    setProgress(++actual_progress);

    //**************************************************************************
    // Select the most abundant data points only.  After that, disallow modifications
    // (we tend to have annoying issues with const_iterator versus iterator).
    PeakPointerArray_ model_map_ini(map_model.begin(), map_model.end());
    const PeakPointerArray_ & model_map(model_map_ini);
    PeakPointerArray_ scene_map_ini(map_scene.begin(), map_scene.end());
    const PeakPointerArray_ & scene_map(scene_map_ini);
    {
      // truncate the data as necessary
      // casting to SignedSize is done on PURPOSE here! (num_used_points will be maximal if -1 is used)
      const Size num_used_points = (SignedSize) param_.getValue("num_used_points");
      if (model_map_ini.size() > num_used_points)
      {
        model_map_ini.sortByIntensity(true);
        model_map_ini.resize(num_used_points);
      }
      model_map_ini.sortByComparator(Peak2D::MZLess());
      setProgress(++actual_progress);
      if (scene_map_ini.size() > num_used_points)
      {
        scene_map_ini.sortByIntensity(true);
        scene_map_ini.resize(num_used_points);
      }
      scene_map_ini.sortByComparator(Peak2D::MZLess());
      setProgress(++actual_progress);
      // Note: model_map_ini and scene_map_ini will not be used further below
    }
    setProgress((actual_progress = 10));

    //**************************************************************************
    // Preprocessing

    // get RT ranges (NOTE: we trust that min and max have been updated in the
    // ConsensusMap::convert() method !)

    const double model_low = map_model.getMin()[ConsensusFeature::RT];
    const double scene_low = map_scene.getMin()[ConsensusFeature::RT];
    const double model_high = map_model.getMax()[ConsensusFeature::RT];
    const double scene_high = map_scene.getMax()[ConsensusFeature::RT];

    // OLD STUFF
    //    const double rt_low = (maps[0].getMin()[ConsensusFeature::RT] + maps[1].getMin()[ConsensusFeature::RT]) / 2.;
    //    const double rt_high = (maps[0].getMax()[ConsensusFeature::RT] + maps[1].getMax()[ConsensusFeature::RT]) / 2.;

    // Initialize the hash tables: shift_hash_
    // OLD STUFF: was:  rt_scaling_hash_, rt_low_hash_, and rt_high_hash_
    {
      // (over)estimate the required number of buckets for shifting
      double max_shift = param_.getValue("max_shift");
      // actually the largest possible shift can be much smaller, depending on the data
      do
      {
        if (max_shift < 0)
          max_shift = -max_shift;
        //     ...ml@@@mh........    ,    ........ml@@@mh...
        //     ........sl@@@sh...    ,    ...sl@@@sh........
        double diff;
        diff = model_high - scene_low;
        if (diff < 0)
          diff = -diff;
        if (max_shift > diff)
          max_shift = diff;
        diff = model_low - scene_high;
        if (diff < 0)
          diff = -diff;
        if (max_shift > diff)
          max_shift = diff;
      }
      while (0);

      const Int shift_buckets_num_half = 4 + (Int) ceil((max_shift) / shift_bucket_size);
      const Int shift_buckets_num = 1 + 2 * shift_buckets_num_half;

      shift_hash_.getData().clear();
      shift_hash_.getData().resize(shift_buckets_num);
      shift_hash_.setMapping(shift_bucket_size, shift_buckets_num_half, 0);
    }
    setProgress(++actual_progress);

    //**************************************************************************
    // compute the ratio of the total intensities of both maps, for normalization
    double total_intensity_ratio;
    do
    {
      double total_int_model_map = 0;
      for (Size i = 0; i < model_map.size(); ++i)
      {
        total_int_model_map += model_map[i].getIntensity();
      }
      setProgress(++actual_progress);
      double total_int_scene_map = 0;
      for (Size i = 0; i < scene_map.size(); ++i)
      {
        total_int_scene_map += scene_map[i].getIntensity();
      }
      setProgress(++actual_progress);
      // ... and finally ...
      total_intensity_ratio = total_int_model_map / total_int_scene_map;
    }
    while (0);   // (the extra syntax helps with code folding in eclipse!)
    setProgress((actual_progress = 20));

    /// The serial number is incremented for each invocation of this, to avoid overwriting of hash table dumps.
    static Int dump_buckets_serial = 0;
    ++dump_buckets_serial;

    //**************************************************************************
    // Hashing

    // Compute the transformations between each point pair in the model map
    // and each point pair in the scene map and hash the shift
    // transformation.

    // To speed up the calculation of the final transformation, we confine the number of
    // considered point pairs.  We match a point p in the model map only onto those points p'
    // in the scene map that lie in a certain mz interval.

    Size const model_map_size = model_map.size(); // i  /* OLD STUFF: also: j */
    Size const scene_map_size = scene_map.size(); // k  /* OLD STUFF: also: l */

    const double winlength_factor_baseline = 0.1; // MAGIC ALERT: Each window is given unit weight.  If there are too many pairs for a window, the individual contributions will be very small, but running time will be high, so we provide a cutoff for this.  Typically this will exclude compounds which elute over the whole retention time range from consideration.


    ///////////////////////////////////////////////////////////////////
    // Hashing:  Estimate the shift

    do // begin of hashing (the extra syntax helps with code folding in eclipse!)
    {
      String dump_pairs_filename;
      std::ofstream dump_pairs_file;
      if (do_dump_pairs)
      {
        dump_pairs_filename = dump_pairs_basename + String(dump_buckets_serial);
        dump_pairs_file.open(dump_pairs_filename.c_str());
        dump_pairs_file << "#" << ' ' << "i" << ' ' << "k" << std::endl;
      }
      setProgress(++actual_progress);

      // first point in model map
      for (Size i = 0, i_low = 0, i_high = 0, k_low = 0, k_high = 0; i < model_map_size - 1; ++i)
      {
        setProgress(actual_progress + float(i) / model_map_size * 10.f);

        // Adjust window around i in model map
        while (i_low < model_map_size && model_map[i_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance)
          ++i_low;
        while (i_high < model_map_size && model_map[i_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance)
          ++i_high;
        double i_winlength_factor = 1. / (i_high - i_low);
        i_winlength_factor -= winlength_factor_baseline;
        if (i_winlength_factor <= 0)
          continue;

        // Adjust window around k in scene map
        while (k_low < scene_map_size && scene_map[k_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance)
          ++k_low;
        while (k_high < scene_map_size && scene_map[k_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance)
          ++k_high;

        // first point in scene map
        for (Size k = k_low; k < k_high; ++k)
        {
          double k_winlength_factor = 1. / (k_high - k_low);
          k_winlength_factor -= winlength_factor_baseline;
          if (k_winlength_factor <= 0)
            continue;

          // compute similarity of intensities i k
          double similarity_ik;
          {
            const double int_i = model_map[i].getIntensity();
            const double int_k = scene_map[k].getIntensity() * total_intensity_ratio;
            similarity_ik = (int_i < int_k) ? int_i / int_k : int_k / int_i;
            // weight is inverse proportional to number of elements with similar mz
            similarity_ik *= i_winlength_factor;
            similarity_ik *= k_winlength_factor;
            // VV_(int_i<<' '<<int_k<<' '<<int_similarity_ik);
          }

          // compute the transformation (i) -> (k)
          double shift = model_map[i].getRT() - scene_map[k].getRT();

          // hash the images of scaling, rt_low and rt_high into their respective hash tables
          shift_hash_.addValue(shift, similarity_ik);

          if (do_dump_pairs)
          {
            dump_pairs_file << i << ' ' << model_map[i].getRT() << ' ' << model_map[i].getMZ() << ' ' << k << ' ' << scene_map[k].getRT() << ' '
                            << scene_map[k].getMZ() << ' ' << similarity_ik << ' ' << std::endl;
          }

        } // k
      } // i
    }
    while (0);   // end of hashing (the extra syntax helps with code folding in eclipse!)

    setProgress((actual_progress = 30));

    ///////////////////////////////////////////////////////////////////
    // work on shift_hash_
    //   double shift_low;
    //   double shift_centroid;
    //   double shift_high;

    // OLD STUFF
    // double shift_low;
    double shift_centroid;
    // double shift_high;
    do
    {

      UInt filtering_stage = 0;

      // optionally, dump before filtering
      String dump_buckets_filename;
      std::ofstream dump_buckets_file;
      if (do_dump_buckets)
      {
        dump_buckets_filename = dump_buckets_basename + "_" + String(dump_buckets_serial);
        dump_buckets_file.open(dump_buckets_filename.c_str());
        VV_(dump_buckets_filename);

        dump_buckets_file << "# shift hash table buckets dump ( scale, height ) : " << dump_buckets_filename << std::endl;
        dump_buckets_file << "# unfiltered hash data\n";
        for (Size index = 0; index < shift_hash_.getData().size(); ++index)
        {
          const double image = shift_hash_.index2key(index);
          const double height = shift_hash_.getData()[index];
          dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n';
        }
        dump_buckets_file << '\n';
      }

      ++filtering_stage;
      setProgress(++actual_progress);

      // apply tophat filter to histogram
      MorphologicalFilter morph_filter;
      Param morph_filter_param;
      morph_filter_param.setValue("struc_elem_unit", "DataPoints");
      morph_filter_param.setValue("struc_elem_length", double(struc_elem_length_datapoints));
      morph_filter_param.setValue("method", "tophat");
      morph_filter.setParameters(morph_filter_param);

      LinearInterpolationType_::container_type buffer(shift_hash_.getData().size());
      morph_filter.filterRange(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin());
      shift_hash_.getData().swap(buffer);

      // optionally, dump after filtering
      if (do_dump_buckets)
      {
        dump_buckets_file << "# tophat filtered hash data\n";
        for (Size index = 0; index < shift_hash_.getData().size(); ++index)
        {
          const double image = shift_hash_.index2key(index);
          const double height = shift_hash_.getData()[index];
          dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n';
        }
        dump_buckets_file << '\n';
      }
      setProgress(++actual_progress);

      ++filtering_stage;

      // compute freq_cutoff using a fancy criterion to distinguish between the noise level of the histogram and enriched histogram bins
      double freq_cutoff_low;
      do
      {
        {
          std::copy(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin());
          std::sort(buffer.begin(), buffer.end(), std::greater<double>());
          double freq_intercept = shift_hash_.getData().front();
          double freq_slope = (shift_hash_.getData().back() - shift_hash_.getData().front()) / double(buffer.size())
                                  / scaling_histogram_crossing_slope;
          if (!freq_slope || !buffer.size())
          {
            // in fact these conditions are actually impossible, but let's be really sure ;-)
            freq_cutoff_low = 0;
          }
          else
          {
            Size index = 1; // not 0 (!)
            while (buffer[index] >= freq_intercept + freq_slope * double(index))
            {
              ++index;
            }
            freq_cutoff_low = buffer[--index]; // note that we have index >= 1
          }
        }
      }
      while (0);
      setProgress(++actual_progress);

      // apply freq_cutoff, setting smaller values to zero
      for (Size index = 0; index < shift_hash_.getData().size(); ++index)
      {
        if (shift_hash_.getData()[index] < freq_cutoff_low)
        {
          shift_hash_.getData()[index] = 0;
        }
      }
      setProgress(++actual_progress);

      // optionally, dump after noise filtering using freq_cutoff
      if (do_dump_buckets)
      {
        dump_buckets_file << "# after freq_cutoff, which is: " << freq_cutoff_low << '\n';
        for (Size index = 0; index < shift_hash_.getData().size(); ++index)
        {
          const double image = shift_hash_.index2key(index);
          const double height = shift_hash_.getData()[index];
          dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n';
        }
        dump_buckets_file << '\n';
      }
      setProgress(++actual_progress);

      // iterative cut-off based on mean and stdev - relies upon scaling_cutoff_stdev_multiplier which is a bit hard to set right.
      {
        Math::BasicStatistics<double> statistics;
        std::vector<double>::const_iterator data_begin = shift_hash_.getData().begin();
        const Size data_size = shift_hash_.getData().size();
        Size data_range_begin = 0;
        Size data_range_end = data_size;
        for (UInt loop = 0; loop < loops_mean_stdev_cutoff; ++loop)   // MAGIC ALERT: number of loops
        {
          statistics.update(data_begin + data_range_begin, data_begin + data_range_end);
          double mean = statistics.mean() + data_range_begin;
          double stdev = sqrt(statistics.variance());
          data_range_begin = floor(std::max<double>(mean - scaling_cutoff_stdev_multiplier * stdev, 0));
          data_range_end = ceil(std::min<double>(mean + scaling_cutoff_stdev_multiplier * stdev + 1, data_size));
          const double outside_mean = shift_hash_.index2key(mean);
          const double outside_stdev = stdev * shift_hash_.getScale();
          // shift_low = (outside_mean - outside_stdev);
          shift_centroid = (outside_mean);
          // shift_high = (outside_mean + outside_stdev);
          if (do_dump_buckets)
          {
            dump_buckets_file << "# loop: " << loop << "  mean: " << outside_mean << "  stdev: " << outside_stdev << "  (mean-stdev): "
                              << outside_mean - outside_stdev << "  (mean+stdev): " << outside_mean + outside_stdev
                              << "  data_range_begin: " << data_range_begin << "  data_range_end: "
                              << data_range_end << std::endl;
          }
        }
        setProgress(++actual_progress);
      }
      if (do_dump_buckets)
      {
        dump_buckets_file << "# EOF" << std::endl;
        dump_buckets_file.close();
      }
      setProgress(80);

    }
    while (0);

    //************************************************************************************
    // Estimate transform

    // Compute the shifts at the low and high ends by looking at (around) the fullest bins.
    double intercept;
#if 1 // yes of course, use centroids for images of rt_low and rt_high
    intercept = shift_centroid;
#else // ooh, use maximum bins instead (Note: this is a fossil which would disregard most of the above computations!  The code is left here for developers/debugging only.)
    const Size rt_low_max_index = std::distance(shift_hash_.getData().begin(),
                                                std::max_element(shift_hash_.getData().begin(), shift_hash_.getData().end()));
    intercept = shift_hash_.index2key(rt_low_max_index);
#endif

    VV_(intercept);

    setProgress(++actual_progress);

    // set trafo
    {
      Param params;
      params.setValue("slope", 1.0);
      params.setValue("intercept", intercept);

      TransformationDescription trafo;
      trafo.fitModel("linear", params);
      transformation = trafo;
    }

    setProgress(++actual_progress);
    endProgress();

    return;
  } // run()