Ejemplo n.º 1
0
 ExitCodes main_(int, const char **)
 {
   String in = getStringOption_("in"), out = getStringOption_("out");
   FileTypes::Type in_type = FileHandler::getType(in);
   if (in_type == FileTypes::FEATUREXML)
   {
     FeatureMap<> features;
     FeatureXMLFile().load(in, features);
     for (FeatureMap<>::Iterator feat_it = features.begin();
          feat_it != features.end(); ++feat_it)
     {
       resolveConflict_(feat_it->getPeptideIdentifications());
     }
     addDataProcessing_(features,
                        getProcessingInfo_(DataProcessing::FILTERING));
     FeatureXMLFile().store(out, features);
   }
   else     // consensusXML
   {
     ConsensusMap consensus;
     ConsensusXMLFile().load(in, consensus);
     for (ConsensusMap::Iterator cons_it = consensus.begin();
          cons_it != consensus.end(); ++cons_it)
     {
       resolveConflict_(cons_it->getPeptideIdentifications());
     }
     addDataProcessing_(consensus,
                        getProcessingInfo_(DataProcessing::FILTERING));
     ConsensusXMLFile().store(out, consensus);
   }
   return EXECUTION_OK;
 }
 void SeedListGenerator::generateSeedLists(const ConsensusMap& consensus,
                                           Map<UInt64, SeedList>& seed_lists)
 {
   seed_lists.clear();
   // iterate over all consensus features...
   for (ConsensusMap::ConstIterator cons_it = consensus.begin();
        cons_it != consensus.end(); ++cons_it)
   {
     DPosition<2> point(cons_it->getRT(), cons_it->getMZ());
     // for each sub-map in the consensus map, add a seed at the position of
     // this consensus feature:
     for (ConsensusMap::FileDescriptions::const_iterator file_it =
            consensus.getFileDescriptions().begin(); file_it !=
          consensus.getFileDescriptions().end(); ++file_it)
       seed_lists[file_it->first].push_back(point);
     // for each feature contained in the consensus feature, remove the seed of
     // the corresponding map:
     for (ConsensusFeature::HandleSetType::const_iterator feat_it =
            cons_it->getFeatures().begin(); feat_it !=
          cons_it->getFeatures().end(); ++feat_it)
     {
       seed_lists[feat_it->getMapIndex()].pop_back();
     }
     // this leaves seeds for maps where no feature was found near the
     // consensus position
   }
 }
  void MapAlignmentAlgorithmPoseClustering::align(const ConsensusMap & map, TransformationDescription & trafo)
  {
    // TODO: move this to updateMembers_? (if consensusMap prevails)
    // TODO: why does superimposer work on consensus map???
    const ConsensusMap & map_model = reference_;
    ConsensusMap map_scene = map;

    // run superimposer to find the global transformation
    TransformationDescription si_trafo;
    superimposer_.run(map_model, map_scene, si_trafo);

    // apply transformation to consensus features and contained feature
    // handles
    for (Size j = 0; j < map_scene.size(); ++j)
    {
      //Calculate new RT
      double rt = map_scene[j].getRT();
      rt = si_trafo.apply(rt);
      //Set RT of consensus feature centroid
      map_scene[j].setRT(rt);
      //Set RT of consensus feature handles
      map_scene[j].begin()->asMutable().setRT(rt);
    }

    //run pairfinder to find pairs
    ConsensusMap result;
    //TODO: add another 2map interface to pairfinder?
    std::vector<ConsensusMap> input(2);
    input[0] = map_model;
    input[1] = map_scene;
    pairfinder_.run(input, result);

    // calculate the local transformation
    si_trafo.invert();         // to undo the transformation applied above
    TransformationDescription::DataPoints data;
    for (ConsensusMap::Iterator it = result.begin(); it != result.end();
         ++it)
    {
      if (it->size() == 2)           // two matching features
      {
        ConsensusFeature::iterator feat_it = it->begin();
        double y = feat_it->getRT();
        double x = si_trafo.apply((++feat_it)->getRT());
        // one feature should be from the reference map:
        if (feat_it->getMapIndex() != 0)
        {
          data.push_back(make_pair(x, y));
        }
        else
        {
          data.push_back(make_pair(y, x));
        }
      }
    }
    trafo = TransformationDescription(data);
    trafo.fitModel("linear");
  }
  void FeatureGroupingAlgorithm::transferSubelements(const vector<ConsensusMap>& maps, ConsensusMap& out) const
  {
    // accumulate file descriptions from the input maps:
    // cout << "Updating file descriptions..." << endl;
    out.getFileDescriptions().clear();
    // mapping: (map index, original id) -> new id
    map<pair<Size, UInt64>, Size> mapid_table;
    for (Size i = 0; i < maps.size(); ++i)
    {
      const ConsensusMap& consensus = maps[i];
      for (ConsensusMap::FileDescriptions::const_iterator desc_it = consensus.getFileDescriptions().begin(); desc_it != consensus.getFileDescriptions().end(); ++desc_it)
      {
        Size counter = mapid_table.size();
        mapid_table[make_pair(i, desc_it->first)] = counter;
        out.getFileDescriptions()[counter] = desc_it->second;
      }
    }

    // look-up table: input map -> unique ID -> consensus feature
    // cout << "Creating look-up table..." << endl;
    vector<map<UInt64, ConsensusMap::ConstIterator> > feat_lookup(maps.size());
    for (Size i = 0; i < maps.size(); ++i)
    {
      const ConsensusMap& consensus = maps[i];
      for (ConsensusMap::ConstIterator feat_it = consensus.begin();
           feat_it != consensus.end(); ++feat_it)
      {
        // do NOT use "id_lookup[i][feat_it->getUniqueId()] = feat_it;" here as
        // you will get "attempt to copy-construct an iterator from a singular
        // iterator" in STL debug mode:
        feat_lookup[i].insert(make_pair(feat_it->getUniqueId(), feat_it));
      }
    }
    // adjust the consensus features:
    // cout << "Adjusting consensus features..." << endl;
    for (ConsensusMap::iterator cons_it = out.begin(); cons_it != out.end(); ++cons_it)
    {
      ConsensusFeature adjusted = ConsensusFeature(
        static_cast<BaseFeature>(*cons_it)); // remove sub-features
      for (ConsensusFeature::HandleSetType::const_iterator sub_it = cons_it->getFeatures().begin(); sub_it != cons_it->getFeatures().end(); ++sub_it)
      {
        UInt64 id = sub_it->getUniqueId();
        Size map_index = sub_it->getMapIndex();
        ConsensusMap::ConstIterator origin = feat_lookup[map_index][id];
        for (ConsensusFeature::HandleSetType::const_iterator handle_it = origin->getFeatures().begin(); handle_it != origin->getFeatures().end(); ++handle_it)
        {
          FeatureHandle handle = *handle_it;
          Size new_id = mapid_table[make_pair(map_index, handle.getMapIndex())];
          handle.setMapIndex(new_id);
          adjusted.insert(handle);
        }
      }
      *cons_it = adjusted;
    }
  }
Ejemplo n.º 5
0
  void MapAlignmentTransformer::transformSingleConsensusMap(ConsensusMap & cmap,
                                                            const TransformationDescription & trafo)
  {
    for (ConsensusMap::Iterator cmit = cmap.begin(); cmit != cmap.end();
         ++cmit)
    {
      applyToConsensusFeature_(*cmit, trafo);
    }

    // adapt RT values of unassigned peptides:
    if (!cmap.getUnassignedPeptideIdentifications().empty())
    {
      transformSinglePeptideIdentification(
        cmap.getUnassignedPeptideIdentifications(), trafo);
    }
  }
 void ConsensusMapNormalizerAlgorithmThreshold::normalizeMaps(ConsensusMap& map, const vector<double>& ratios)
 {
   ConsensusMap::Iterator cf_it;
   ProgressLogger progresslogger;
   progresslogger.setLogType(ProgressLogger::CMD);
   progresslogger.startProgress(0, map.size(), "normalizing maps");
   for (cf_it = map.begin(); cf_it != map.end(); ++cf_it)
   {
     progresslogger.setProgress(cf_it - map.begin());
     ConsensusFeature::HandleSetType::const_iterator f_it;
     for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
     {
       f_it->asMutable().setIntensity(f_it->getIntensity() * ratios[f_it->getMapIndex()]);
     }
   }
   progresslogger.endProgress();
 }
 void ConsensusMapNormalizerAlgorithmQuantile::setNormalizedIntensityValues(const vector<vector<double> >& feature_ints, ConsensusMap& map)
 {
   //assumes the input map and feature_ints are in the same order as in the beginning,
   //although feature_ints has normalized values now (but the same ranks as before)
   Size number_of_maps = map.getColumnHeaders().size();
   ConsensusMap::ConstIterator cf_it;
   vector<Size> progress_indices(number_of_maps);
   for (cf_it = map.begin(); cf_it != map.end(); ++cf_it)
   {
     ConsensusFeature::HandleSetType::const_iterator f_it;
     for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
     {
       Size map_idx = f_it->getMapIndex();
       double intensity = feature_ints[map_idx][progress_indices[map_idx]++];
       f_it->asMutable().setIntensity(intensity);
     }
   }
 }
void QuantitativeExperimentalDesign::mergeConsensusMaps_(ConsensusMap & out, const String & experiment, StringList & file_paths)
{
    ConsensusMap map;

    LOG_INFO << "Merge consensus maps: " << endl;
    UInt counter = 1;
    for (StringList::Iterator file_it = file_paths.begin(); file_it != file_paths.end(); ++file_it, ++counter)
    {
        //load should clear the map
        ConsensusXMLFile().load(*file_it, map);
        for (ConsensusMap::iterator it = map.begin(); it != map.end(); ++it)
        {
            it->setMetaValue("experiment", DataValue(experiment));
        }
        out += map;
    }
    LOG_INFO << endl;
}
 void ConsensusMapNormalizerAlgorithmQuantile::extractIntensityVectors(const ConsensusMap& map, vector<vector<double> >& out_intensities)
 {
   //reserve space for out_intensities (unequal vector lengths, 0-features omitted)
   Size number_of_maps = map.getColumnHeaders().size();
   out_intensities.clear();
   out_intensities.resize(number_of_maps);
   for (UInt i = 0; i < number_of_maps; i++)
   {
     ConsensusMap::ColumnHeaders::const_iterator it = map.getColumnHeaders().find(i);
     if (it == map.getColumnHeaders().end()) throw Exception::ElementNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, String(i));
     out_intensities[i].reserve(it->second.size);
   }
   //fill out_intensities
   ConsensusMap::ConstIterator cf_it;
   for (cf_it = map.begin(); cf_it != map.end(); ++cf_it)
   {
     ConsensusFeature::HandleSetType::const_iterator f_it;
     for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
     {
       out_intensities[f_it->getMapIndex()].push_back(f_it->getIntensity());
     }
   }
 }
Ejemplo n.º 10
0
  ExitCodes main_(int, const char**)
  {
    vector<ProteinIdentification> prot_ids;
    vector<PeptideIdentification> pep_ids;
    ProteinHit temp_protein_hit;

    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------
    String inputfile_id               = getStringOption_("id");
    String inputfile_feature       = getStringOption_("feature");
    String inputfile_consensus  = getStringOption_("consensus");
    String inputfile_raw            = getStringOption_("in");
    String outputfile_name       = getStringOption_("out");

    //~ bool Ms1(getFlag_("MS1"));
    //~ bool Ms2(getFlag_("MS2"));
    bool remove_duplicate_features(getFlag_("remove_duplicate_features"));
    
    //-------------------------------------------------------------
    // fetch vocabularies
    //------------------------------------------------------------
    ControlledVocabulary cv;
    cv.loadFromOBO("PSI-MS", File::find("/CV/psi-ms.obo"));
    cv.loadFromOBO("QC", File::find("/CV/qc-cv.obo"));
 
     QcMLFile qcmlfile;

    //-------------------------------------------------------------
    // MS  aqiusition
    //------------------------------------------------------------
    String base_name = QFileInfo(QString::fromStdString(inputfile_raw)).baseName();

    cout << "Reading mzML file..." << endl;
    MzMLFile mz_data_file;
    MSExperiment<Peak1D> exp;
    MzMLFile().load(inputfile_raw, exp);
    
    //---prep input
    exp.sortSpectra();
    UInt min_mz = std::numeric_limits<UInt>::max();
    UInt max_mz = 0;
    std::map<Size, UInt> mslevelcounts;
    
    qcmlfile.registerRun(base_name,base_name); //TODO use UIDs
    
    //---base MS aquisition qp
    String msaq_ref = base_name + "_msaq";
    QcMLFile::QualityParameter qp;
    qp.id = msaq_ref; ///< Identifier
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000004";
    try
    {
      //~ const ControlledVocabulary::CVTerm& test = cv.getTermByName("MS aquisition result details");
      //~ cout << test.name << test.id << endl;
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      //~ const ControlledVocabulary::CVTerm& term = cv.getTerm("0000004");
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "mzML file"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    //---file origin qp
    qp = QcMLFile::QualityParameter();
    qp.name = "mzML file"; ///< Name
    qp.id = base_name + "_run_name"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000577";
    qp.value = base_name;
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    qp = QcMLFile::QualityParameter();
    qp.name = "instrument model"; ///< Name
    qp.id = base_name + "_instrument_name"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000031";
    qp.value = exp.getInstrument().getName();
    qcmlfile.addRunQualityParameter(base_name, qp);    

    qp = QcMLFile::QualityParameter();
    qp.name = "completion time"; ///< Name
    qp.id = base_name + "_date"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000747";
    qp.value = exp.getDateTime().getDate();
    qcmlfile.addRunQualityParameter(base_name, qp);

    //---precursors at
    QcMLFile::Attachment at;
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000044";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_precursors"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "precursors"; ///< Name
    }

    at.colTypes.push_back("MS:1000894_[sec]"); //RT
    at.colTypes.push_back("MS:1000040"); //MZ
    for (Size i = 0; i < exp.size(); ++i)
    {
      mslevelcounts[exp[i].getMSLevel()]++;
      if (exp[i].getMSLevel() == 2)
      {
        if (exp[i].getPrecursors().front().getMZ() < min_mz)
        {
          min_mz = exp[i].getPrecursors().front().getMZ();
        }
        if (exp[i].getPrecursors().front().getMZ() > max_mz)
        {
          max_mz = exp[i].getPrecursors().front().getMZ();
        }
        std::vector<String> row;
        row.push_back(exp[i].getRT());
        row.push_back(exp[i].getPrecursors().front().getMZ());
        at.tableRows.push_back(row);
      }
    }
    qcmlfile.addRunAttachment(base_name, at);

    //---aquisition results qp
    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000006"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_ms1aquisition"; ///< Identifier
    qp.value = String(mslevelcounts[1]);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of ms1 spectra"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    

    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000007"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_ms2aquisition"; ///< Identifier
    qp.value = String(mslevelcounts[2]);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of ms2 spectra"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);

    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000008"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_Chromaquisition"; ///< Identifier
    qp.value = String(exp.getChromatograms().size());
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of chromatograms"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000009";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_mzrange"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS MZ aquisition ranges"; ///< Name
    }

    at.colTypes.push_back("QC:0000010"); //MZ
    at.colTypes.push_back("QC:0000011"); //MZ
    std::vector<String> rowmz;
    rowmz.push_back(String(min_mz));
    rowmz.push_back(String(max_mz));
    at.tableRows.push_back(rowmz);
    qcmlfile.addRunAttachment(base_name, at);

    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000012";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_rtrange"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS RT aquisition ranges"; ///< Name
    }

    at.colTypes.push_back("QC:0000013"); //MZ
    at.colTypes.push_back("QC:0000014"); //MZ
    std::vector<String> rowrt;
    rowrt.push_back(String(exp.begin()->getRT()));
    rowrt.push_back(String(exp.getSpectra().back().getRT()));
    at.tableRows.push_back(rowrt);
    qcmlfile.addRunAttachment(base_name, at);
    

    //---ion current stability ( & tic ) qp
    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000022";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_tics"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS TICs"; ///< Name
    }
    
    at.colTypes.push_back("MS:1000894_[sec]");
    at.colTypes.push_back("MS:1000285");
    UInt max = 0;
    Size below_10k = 0;
    for (Size i = 0; i < exp.size(); ++i)
    {
      if (exp[i].getMSLevel() == 1)
      {
        UInt sum = 0;
        for (Size j = 0; j < exp[i].size(); ++j)
        {
          sum += exp[i][j].getIntensity();
        }
        if (sum > max)
        {
          max = sum;
        }
        if (sum < 10000)
        {
          ++below_10k;
        }
        std::vector<String> row;
        row.push_back(exp[i].getRT());
        row.push_back(sum);
        at.tableRows.push_back(row);
      }
    }
    qcmlfile.addRunAttachment(base_name, at);
    

    qp = QcMLFile::QualityParameter();
    qp.id = base_name + "_ticslump"; ///< Identifier
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000023";
    qp.value = String((100 / exp.size()) * below_10k);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "percentage of tic slumps"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);

    
    //-------------------------------------------------------------
    // MS  id
    //------------------------------------------------------------
    if (inputfile_id != "")
    {
      IdXMLFile().load(inputfile_id, prot_ids, pep_ids);
      cerr << "idXML read ended. Found " << pep_ids.size() << " peptide identifications." << endl;

      ProteinIdentification::SearchParameters params = prot_ids[0].getSearchParameters();
      vector<String> var_mods = params.variable_modifications;
      //~ boost::regex re("(?<=[KR])(?=[^P])");
     
      String msid_ref = base_name + "_msid";
      QcMLFile::QualityParameter qp;
      qp.id = msid_ref; ///< Identifier
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000025";
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "MS identification result details"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000026";
      at.qualityRef = msid_ref;
      at.id = base_name + "_idsetting"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "MS id settings"; ///< Name
      }
      
      at.colTypes.push_back("MS:1001013"); //MS:1001013 db name  MS:1001016 version  MS:1001020 taxonomy
      at.colTypes.push_back("MS:1001016");
      at.colTypes.push_back("MS:1001020");
      std::vector<String> row;
      row.push_back(String(prot_ids.front().getSearchParameters().db));
      row.push_back(String(prot_ids.front().getSearchParameters().db_version));
      row.push_back(String(prot_ids.front().getSearchParameters().taxonomy));
      at.tableRows.push_back(row);
      qcmlfile.addRunAttachment(base_name, at);


      UInt spectrum_count = 0;
      Size peptide_hit_count = 0;
      UInt runs_count = 0;
      Size protein_hit_count = 0;
      set<String> peptides;
      set<String> proteins;
      Size missedcleavages = 0;
      for (Size i = 0; i < pep_ids.size(); ++i)
      {
        if (!pep_ids[i].empty())
        {
          ++spectrum_count;
          peptide_hit_count += pep_ids[i].getHits().size();
          const vector<PeptideHit>& temp_hits = pep_ids[i].getHits();
          for (Size j = 0; j < temp_hits.size(); ++j)
          {
            peptides.insert(temp_hits[j].getSequence().toString());
          }
        }
      }
      for (set<String>::iterator it = peptides.begin(); it != peptides.end(); ++it)
      {
        for (String::const_iterator st = it->begin(); st != it->end() - 1; ++st)
        {
          if (*st == 'K' || *st == 'R')
          {
            ++missedcleavages;
          }
        }
      }

      for (Size i = 0; i < prot_ids.size(); ++i)
      {
        ++runs_count;
        protein_hit_count += prot_ids[i].getHits().size();
        const vector<ProteinHit>& temp_hits = prot_ids[i].getHits();
        for (Size j = 0; j < temp_hits.size(); ++j)
        {
          proteins.insert(temp_hits[j].getAccession());
        }
      }
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000037"; ///< cv accession
      qp.id = base_name + "_misscleave"; ///< Identifier
      qp.value = missedcleavages;
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "total number of missed cleavages"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);

      
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000032"; ///< cv accession
      qp.id = base_name + "_totprot"; ///< Identifier
      qp.value = protein_hit_count;
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "total number of identified proteins"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000033"; ///< cv accession
      qp.id = base_name + "_totuniqprot"; ///< Identifier
      qp.value = String(proteins.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of uniquely identified proteins"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000029"; ///< cv accession
      qp.id = base_name + "_psms"; ///< Identifier
      qp.value = String(spectrum_count);
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of PSM"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000030"; ///< cv accession
      qp.id = base_name + "_totpeps"; ///< Identifier
      qp.value = String(peptide_hit_count);
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of identified peptides"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000031"; ///< cv accession
      qp.id = base_name + "_totuniqpeps"; ///< Identifier
      qp.value = String(peptides.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of uniquely identified peptides"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000038";
      at.qualityRef = msid_ref;
      at.id = base_name + "_massacc"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "delta ppm tables";
      }
      
      //~ delta ppm QC:0000039 RT MZ uniqueness ProteinID MS:1000885 target/decoy Score PeptideSequence MS:1000889 Annots string Similarity Charge UO:0000219 TheoreticalWeight UO:0000221 Oxidation_(M)
      at.colTypes.push_back("RT");
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("Score");
      at.colTypes.push_back("PeptideSequence");
      at.colTypes.push_back("Charge");
      at.colTypes.push_back("TheoreticalWeight");
      at.colTypes.push_back("delta_ppm");
      for (UInt w = 0; w < var_mods.size(); ++w)
      {
        at.colTypes.push_back(String(var_mods[w]).substitute(' ', '_'));
      }

      std::vector<double> deltas;
      //~ prot_ids[0].getSearchParameters();
      for (vector<PeptideIdentification>::iterator it = pep_ids.begin(); it != pep_ids.end(); ++it)
      {
        if (it->getHits().size() > 0)
        {
          std::vector<String> row;
          row.push_back(it->getRT());
          row.push_back(it->getMZ());
          PeptideHit tmp = it->getHits().front(); //TODO depends on score & sort
          vector<UInt> pep_mods;
          for (UInt w = 0; w < var_mods.size(); ++w)
          {
            pep_mods.push_back(0);
          }
          for (AASequence::ConstIterator z =  tmp.getSequence().begin(); z != tmp.getSequence().end(); ++z)
          {
            Residue res = *z;
            String temp;
            if (res.getModification().size() > 0 && res.getModification() != "Carbamidomethyl")
            {
              temp = res.getModification() + " (" + res.getOneLetterCode()  + ")";
              //cout<<res.getModification()<<endl;
              for (UInt w = 0; w < var_mods.size(); ++w)
              {
                if (temp == var_mods[w])
                {
                  //cout<<temp;
                  pep_mods[w] += 1;
                }
              }
            }
          }
          row.push_back(tmp.getScore());
          row.push_back(tmp.getSequence().toString().removeWhitespaces());
          row.push_back(tmp.getCharge());
          row.push_back(String((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge()));
          double dppm = /* std::abs */ (getMassDifference(((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge()), it->getMZ(), true));
          row.push_back(String(dppm));
          deltas.push_back(dppm);
          for (UInt w = 0; w < var_mods.size(); ++w)
          {
            row.push_back(pep_mods[w]);
          }
          at.tableRows.push_back(row);
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
      

      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000040"; ///< cv accession
      qp.id = base_name + "_mean_delta"; ///< Identifier
      qp.value = String(OpenMS::Math::mean(deltas.begin(), deltas.end()));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "mean delta ppm"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000041"; ///< cv accession
      qp.id = base_name + "_median_delta"; ///< Identifier
      qp.value = String(OpenMS::Math::median(deltas.begin(), deltas.end(), false));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "median delta ppm"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000035"; ///< cv accession
      qp.id = base_name + "_ratio_id"; ///< Identifier
      qp.value = String(double(pep_ids.size()) / double(mslevelcounts[2]));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "id ratio"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);
    }

    //-------------------------------------------------------------
    // MS quantitation
    //------------------------------------------------------------
    FeatureMap map;
    String msqu_ref = base_name + "_msqu";
    if (inputfile_feature != "")
    {
      FeatureXMLFile f;
      f.load(inputfile_feature, map);

      cout << "Read featureXML file..." << endl;

      //~ UInt fiter = 0;
      map.sortByRT();
      map.updateRanges();

      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000045"; ///< cv accession
      qp.id = msqu_ref; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "MS quantification result details"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);
      
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000046"; ///< cv accession
      qp.id = base_name + "_feature_count"; ///< Identifier
      qp.value = String(map.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "number of features"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);      
    }

    if (inputfile_feature != "" && !remove_duplicate_features)
    {
      
      QcMLFile::Attachment at;
      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000047";
      at.qualityRef = msqu_ref;
      at.id = base_name + "_features"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "features"; ///< Name
      }
      
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("RT");
      at.colTypes.push_back("Intensity");
      at.colTypes.push_back("Charge");
      at.colTypes.push_back("Quality");
      at.colTypes.push_back("FWHM");
      at.colTypes.push_back("IDs");
      UInt fiter = 0;
      map.sortByRT();
      //ofstream out(outputfile_name.c_str());
      while (fiter < map.size())
      {
        std::vector<String> row;
        row.push_back(map[fiter].getMZ());
        row.push_back(map[fiter].getRT());
        row.push_back(map[fiter].getIntensity());
        row.push_back(map[fiter].getCharge());
        row.push_back(map[fiter].getOverallQuality());
        row.push_back(map[fiter].getWidth());
        row.push_back(map[fiter].getPeptideIdentifications().size());
        fiter++;
        at.tableRows.push_back(row);
      }     
      qcmlfile.addRunAttachment(base_name, at);
    }
    else if (inputfile_feature != "" && remove_duplicate_features)
    {
      QcMLFile::Attachment at;
      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000047";
      at.qualityRef = msqu_ref;
      at.id = base_name + "_features"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "features"; ///< Name
      }
      
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("RT");
      at.colTypes.push_back("Intensity");
      at.colTypes.push_back("Charge");
      FeatureMap map, map_out;
      FeatureXMLFile f;
      f.load(inputfile_feature, map);
      UInt fiter = 0;
      map.sortByRT();
      while (fiter < map.size())
      {
        FeatureMap map_tmp;
        for (UInt k = fiter; k <= map.size(); ++k)
        {
          if (abs(map[fiter].getRT() - map[k].getRT()) < 0.1)
          {
            //~ cout << fiter << endl;
            map_tmp.push_back(map[k]);
          }
          else
          {
            fiter = k;
            break;
          }
        }
        map_tmp.sortByMZ();
        UInt retif = 1;
        map_out.push_back(map_tmp[0]);
        while (retif < map_tmp.size())
        {
          if (abs(map_tmp[retif].getMZ() - map_tmp[retif - 1].getMZ()) > 0.01)
          {
            cout << "equal RT, but mass different" << endl;
            map_out.push_back(map_tmp[retif]);
          }
          retif++;
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
    }
    if (inputfile_consensus != "")
    {
      cout << "Reading consensusXML file..." << endl;
      ConsensusXMLFile f;
      ConsensusMap map;
      f.load(inputfile_consensus, map);
      //~ String CONSENSUS_NAME = "_consensus.tsv";
      //~ String combined_out = outputfile_name + CONSENSUS_NAME;
      //~ ofstream out(combined_out.c_str());

      at = QcMLFile::Attachment();
      qp.name = "consensuspoints"; ///< Name
      //~ qp.id = base_name + "_consensuses"; ///< Identifier
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:xxxxxxxx"; ///< cv accession "featuremapper results"

      at.colTypes.push_back("Native_spectrum_ID");
      at.colTypes.push_back("DECON_RT_(sec)");
      at.colTypes.push_back("DECON_MZ_(Th)");
      at.colTypes.push_back("DECON_Intensity");
      at.colTypes.push_back("Feature_RT_(sec)");
      at.colTypes.push_back("Feature_MZ_(Th)");
      at.colTypes.push_back("Feature_Intensity");
      at.colTypes.push_back("Feature_Charge");
      for (ConsensusMap::const_iterator cmit = map.begin(); cmit != map.end(); ++cmit)
      {
        const ConsensusFeature& CF = *cmit;
        for (ConsensusFeature::const_iterator cfit = CF.begin(); cfit != CF.end(); ++cfit)
        {
          std::vector<String> row;
          FeatureHandle FH = *cfit;
          row.push_back(CF.getMetaValue("spectrum_native_id"));
          row.push_back(CF.getRT()); row.push_back(CF.getMZ());
          row.push_back(CF.getIntensity());
          row.push_back(FH.getRT());
          row.push_back(FH.getMZ());
          row.push_back(FH.getCharge());
          at.tableRows.push_back(row);
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
    }
    
    
    //-------------------------------------------------------------
    // finalize
    //------------------------------------------------------------
    qcmlfile.store(outputfile_name);
    return EXECUTION_OK;
  }
  vector<double> ConsensusMapNormalizerAlgorithmThreshold::computeCorrelation(const ConsensusMap& map, const double& ratio_threshold, const String& acc_filter, const String& desc_filter)
  {
    Size number_of_features = map.size();
    Size number_of_maps = map.getFileDescriptions().size();
    vector<vector<double> > feature_int(number_of_maps);

    //get map with most features, resize feature_int
    UInt map_with_most_features_idx = 0;
    ConsensusMap::FileDescriptions::const_iterator map_with_most_features = map.getFileDescriptions().find(0);
    for (UInt i = 0; i < number_of_maps; i++)
    {
      feature_int[i].resize(number_of_features);
      ConsensusMap::FileDescriptions::const_iterator it = map.getFileDescriptions().find(i);
      if (it->second.size > map_with_most_features->second.size)
      {
        map_with_most_features = it;
        map_with_most_features_idx = i;
      }
    }

    //fill feature_int with intensities
    Size pass_counter = 0;
    ConsensusMap::ConstIterator cf_it;
    UInt idx = 0;
    for (cf_it = map.begin(); cf_it != map.end(); ++cf_it, ++idx)
    {
      if (!ConsensusMapNormalizerAlgorithmMedian::passesFilters_(cf_it, map, acc_filter, desc_filter))
      {
        continue;
      }
      ++pass_counter;

      ConsensusFeature::HandleSetType::const_iterator f_it;
      for (f_it = cf_it->getFeatures().begin(); f_it != cf_it->getFeatures().end(); ++f_it)
      {
        feature_int[f_it->getMapIndex()][idx] = f_it->getIntensity();
      }
    }

    LOG_INFO << endl << "Using " << pass_counter << "/" << map.size() <<  " consensus features for computing normalization coefficients" << endl << endl;

    //determine ratio
    vector<double> ratio_vector(number_of_maps);
    for (UInt j = 0; j < number_of_maps; j++)
    {
      vector<double> ratios;
      for (UInt k = 0; k < number_of_features; ++k)
      {
        if (feature_int[map_with_most_features_idx][k] != 0.0 && feature_int[j][k] != 0.0)
        {
          double ratio = feature_int[map_with_most_features_idx][k] / feature_int[j][k];
          if (ratio > ratio_threshold && ratio < 1 / ratio_threshold)
          {
            ratios.push_back(ratio);
          }
        }
      }
      if (ratios.empty())
      {
        LOG_WARN << endl << "Not enough features passing filters. Cannot compute normalization coefficients for all maps. Result will be unnormalized." << endl << endl;
        return vector<double>(number_of_maps, 1.0);
      }
      ratio_vector[j] = Math::mean(ratios.begin(), ratios.end());
    }
    return ratio_vector;
  }
Ejemplo n.º 12
0
  ExitCodes outputTo(ostream& os)
  {
    //-------------------------------------------------------------
    // Parameter handling
    //-------------------------------------------------------------

    // File names
    String in = getStringOption_("in");

    // File type
    FileHandler fh;
    FileTypes::Type in_type = FileTypes::nameToType(getStringOption_("in_type"));

    if (in_type == FileTypes::UNKNOWN)
    {
      in_type = fh.getType(in);
      writeDebug_(String("Input file type: ") + FileTypes::typeToName(in_type), 2);
    }

    if (in_type == FileTypes::UNKNOWN)
    {
      writeLog_("Error: Could not determine input file type!");
      return PARSE_ERROR;
    }

    MSExperiment<Peak1D> exp;
    FeatureMap feat;
    ConsensusMap cons;

    if (in_type == FileTypes::FEATUREXML) //features
    {
      FeatureXMLFile().load(in, feat);
      feat.updateRanges();
    }
    else if (in_type == FileTypes::CONSENSUSXML)     //consensus features
    {
      ConsensusXMLFile().load(in, cons);
      cons.updateRanges();
    }

    //-------------------------------------------------------------
    // meta information
    //-------------------------------------------------------------
    if (getFlag_("m"))
    {
      os << endl
         << "-- General information --" << endl
         << endl
         << "file name: " << in << endl
         << "file type: " <<  FileTypes::typeToName(in_type) << endl;

      //basic info
      os << endl
         << "-- Meta information --" << endl
         << endl;

      if (in_type == FileTypes::FEATUREXML) //features
      {
        os << "Document id       : " << feat.getIdentifier() << endl << endl;
      }
      else if (in_type == FileTypes::CONSENSUSXML)       //consensus features
      {
        os << "Document id       : " << cons.getIdentifier() << endl << endl;
      }
    }

    //-------------------------------------------------------------
    // data processing
    //-------------------------------------------------------------
    if (getFlag_("p"))
    {
      //basic info
      os << endl
         << "-- Data processing information --" << endl
         << endl;

      //get data processing info
      vector<DataProcessing> dp;
      if (in_type == FileTypes::FEATUREXML) //features
      {
        dp = feat.getDataProcessing();
      }
      else if (in_type == FileTypes::CONSENSUSXML)       //consensus features
      {
        dp = cons.getDataProcessing();
      }
      int i = 0;
      for (vector<DataProcessing>::iterator it = dp.begin(); it != dp.end(); ++it)
      {
        os << "Data processing " << i << endl;
        os << "\tcompletion_time:   " << (*it).getCompletionTime().getDate() << 'T' << (*it).getCompletionTime().getTime() << endl;
        os << "\tsoftware name:     " << (*it).getSoftware().getName() << " version " << (*it).getSoftware().getVersion() << endl;
        for (set<DataProcessing::ProcessingAction>::const_iterator paIt = (*it).getProcessingActions().begin(); paIt != (*it).getProcessingActions().end(); ++paIt)
        {
          os << "\t\tprocessing action: " << DataProcessing::NamesOfProcessingAction[*paIt] << endl;
        }
      }
      ++i;
    }

    //-------------------------------------------------------------
    // statistics
    //-------------------------------------------------------------
    if (getFlag_("s"))
    {
      //-------------------------------------------------------------
      // Content statistics
      //-------------------------------------------------------------
      Map<String, int> meta_names;
      if (in_type == FileTypes::FEATUREXML) //features
      {
        os << "Number of features: " << feat.size() << endl
           << endl
           << "Ranges:" << endl
           << "  retention time:  " << String::number(feat.getMin()[Peak2D::RT], 2) << " : " << String::number(feat.getMax()[Peak2D::RT], 2) << endl
           << "  mass-to-charge:  " << String::number(feat.getMin()[Peak2D::MZ], 2) << " : " << String::number(feat.getMax()[Peak2D::MZ], 2) << endl
           << "  intensity:       " << String::number(feat.getMinInt(), 2) << " : " << String::number(feat.getMaxInt(), 2) << endl
           << endl;

        // Charge distribution
        Map<UInt, UInt> charges;
        for (Size i = 0; i < feat.size(); ++i)
        {
          charges[feat[i].getCharge()]++;
        }

        os << "Charge distribution" << endl;
        for (Map<UInt, UInt>::const_iterator it = charges.begin();
             it != charges.end(); ++it)
        {
          os << "charge " << it->first << ": " << it->second << endl;
        }
      }
      else if (in_type == FileTypes::CONSENSUSXML)       //consensus features
      {
        map<Size, UInt> num_consfeat_of_size;
        for (ConsensusMap::const_iterator cmit = cons.begin();
             cmit != cons.end(); ++cmit)
        {
          ++num_consfeat_of_size[cmit->size()];
        }

        os << endl << "Number of consensus features:" << endl;
        for (map<Size, UInt>::reverse_iterator i = num_consfeat_of_size.rbegin(); i != num_consfeat_of_size.rend(); ++i)
        {
          os << "  of size " << setw(2) << i->first << ": " << setw(6) << i->second << endl;
        }
        os << "  total:      " << setw(6) << cons.size() << endl << endl;

        os << "Ranges:" << endl
           << "  retention time:  " << String::number(cons.getMin()[Peak2D::RT], 2) << " : " << String::number(cons.getMax()[Peak2D::RT], 2) << endl
           << "  mass-to-charge:  " << String::number(cons.getMin()[Peak2D::MZ], 2) << " : " << String::number(cons.getMax()[Peak2D::MZ], 2) << endl
           << "  intensity:       " << String::number(cons.getMinInt(), 2) << " : " << String::number(cons.getMaxInt(), 2) << endl;

        // file descriptions
        const ConsensusMap::FileDescriptions& descs = cons.getFileDescriptions();
        if (!descs.empty())
        {
          os << endl <<
          "File descriptions:" << endl;
          for (ConsensusMap::FileDescriptions::const_iterator it = descs.begin(); it != descs.end(); ++it)
          {
            os << " - " << it->second.filename << endl
               << "   identifier: " << it->first << endl
               << "   label     : " << it->second.label << endl
               << "   size      : " << it->second.size << endl;
          }
        }
      }

      os << endl
         << "-- Summary Statistics --" << endl
         << endl;

    }

    if (in_type == FileTypes::FEATUREXML) //features
    {
      feat.sortByRT();

      vector<double> slice_stats;
      Size n = getIntOption_("n");

      Size begin = 0;
      Size end = 0;
      os << "#slice\tRT_begin\tRT_end\tnumber_of_features\ttic\t"
         << "int_mean\tint_stddev\tint_min\tint_max\tint_median\tint_lowerq\tint_upperq\t"
         << "mz_mean\tmz_stddev\tmz_min\tmz_max\tmz_median\tmz_lowerq\tmz_upperq\t"
         << "width_mean\twidth_stddev\twidth_min\twidth_max\twidth_median\twidth_lowerq\twidth_upperq\t"
         << "qual_mean\tqual_stddev\tqual_min\tqual_max\tqual_median\tqual_lowerq\tqual_upperq\t"
         << "rt_qual_mean\trt_qual_stddev\trt_qual_min\trt_qual_max\trt_qual_median\trt_qual_lowerq\trt_qual_upperq\t"
         << "mz_qual_mean\tmz_qual_stddev\tmz_qual_min\tmz_qual_max\tmz_qual_median\tmz_qual_lowerq\tmz_qual_upperq"
         << endl;

      double rt_begin = 0.0;
      for (Size slice = 0; slice < n; ++slice)
      {
        // Determine slice boundaries.
        double rt_end = feat.back().getRT() / (double)n * (slice + 1);
        for (end = begin; end < feat.size() && feat[end].getRT() < rt_end; ++end) {}

        // Compute statistics on all features in this slice.
        slice_stats = sliceStatistics(feat, begin, end);

        // Write the beginning and end of the slices to the output as well as the slice index.
        os << slice << "\t" << rt_begin << "\t" << rt_end << "\t" << end - begin << "\t";

        // Write the statistics as a line of an csv file
        copy(slice_stats.begin(), slice_stats.end(), ostream_iterator<double>(os, "\t"));
        os << endl;

        begin = end;
        rt_begin = rt_end;
      }
    }
    else if (in_type == FileTypes::CONSENSUSXML)     //consensus features
    {
      Size size = cons.size();

      vector<double> intensities;
      intensities.reserve(size);
      vector<double> qualities(size);
      qualities.reserve(size);
      vector<double> widths(size);
      widths.reserve(size);

      vector<double> rt_delta_by_elems;
      vector<double> rt_aad_by_elems;
      vector<double> rt_aad_by_cfs;
      rt_aad_by_cfs.reserve(size);

      vector<double> mz_delta_by_elems;
      vector<double> mz_aad_by_elems;
      vector<double> mz_aad_by_cfs;
      mz_aad_by_cfs.reserve(size);

      vector<double> it_delta_by_elems;
      vector<double> it_aad_by_elems;
      vector<double> it_aad_by_cfs;
      it_aad_by_cfs.reserve(size);

      for (ConsensusMap::const_iterator cm_iter = cons.begin();
           cm_iter != cons.end(); ++cm_iter)
      {
        double rt_aad = 0;
        double mz_aad = 0;
        double it_aad = 0;
        intensities.push_back(cm_iter->getIntensity());
        qualities.push_back(cm_iter->getQuality());
        widths.push_back(cm_iter->getWidth());
        for (ConsensusFeature::HandleSetType::const_iterator hs_iter = cm_iter->begin();
             hs_iter != cm_iter->end(); ++hs_iter)
        {
          double rt_diff = hs_iter->getRT() - cm_iter->getRT();
          rt_delta_by_elems.push_back(rt_diff);
          if (rt_diff < 0)
          {
            rt_diff = -rt_diff;
          }
          rt_aad_by_elems.push_back(rt_diff);
          rt_aad += rt_diff;
          double mz_diff = hs_iter->getMZ() - cm_iter->getMZ();
          mz_delta_by_elems.push_back(mz_diff);
          if (mz_diff < 0)
          {
            mz_diff = -mz_diff;
          }
          mz_aad_by_elems.push_back(mz_diff);
          mz_aad += mz_diff;
          double it_ratio = hs_iter->getIntensity() / (cm_iter->getIntensity() ? cm_iter->getIntensity() : 1.);
          it_delta_by_elems.push_back(it_ratio);
          if (it_ratio < 1.)
          {
            it_ratio = 1. / it_ratio;
          }
          it_aad_by_elems.push_back(it_ratio);
          it_aad += it_ratio;
        }
        if (!cm_iter->empty())
        {
          rt_aad /= cm_iter->size();
          mz_aad /= cm_iter->size();
          it_aad /= cm_iter->size();
        } // otherwise rt_aad etc. are 0 anyway
        rt_aad_by_cfs.push_back(rt_aad);
        mz_aad_by_cfs.push_back(mz_aad);
        it_aad_by_cfs.push_back(it_aad);
      }

      OpenMS::SomeStatistics some_statistics;

      os.precision(writtenDigits(ConsensusFeature::IntensityType()));
      os << "Intensities of consensus features:" << endl << some_statistics(intensities) << endl;

      os.precision(writtenDigits(ConsensusFeature::QualityType()));
      os << "Qualities of consensus features:" << endl << some_statistics(qualities) << endl;

      os.precision(writtenDigits(ConsensusFeature::CoordinateType()));
      os << "Retention time differences ( element-center, weight 1 per element):" << endl << some_statistics(rt_delta_by_elems) << endl;
      os << "Absolute retention time differences ( |element-center|, weight 1 per element):" << endl << some_statistics(rt_aad_by_elems) << endl;
      os << "Average absolute differences of retention time within consensus features ( |element-center|, weight 1 per consensus features):" << endl << some_statistics(rt_aad_by_cfs) << endl;

      os.precision(writtenDigits(ConsensusFeature::CoordinateType()));
      os << "Mass-to-charge differences ( element-center, weight 1 per element):" << endl << some_statistics(mz_delta_by_elems) << endl;
      os << "Absolute differences of mass-to-charge ( |element-center|, weight 1 per element):" << endl << some_statistics(mz_aad_by_elems) << endl;
      os << "Average absolute differences of mass-to-charge within consensus features ( |element-center|, weight 1 per consensus features):" << endl << some_statistics(mz_aad_by_cfs) << endl;

      os.precision(writtenDigits(ConsensusFeature::IntensityType()));
      os << "Intensity ratios ( element/center, weight 1 per element):" << endl << some_statistics(it_delta_by_elems) << endl;
      os << "Relative intensity error ( max{(element/center),(center/element)}, weight 1 per element):" << endl << some_statistics(it_aad_by_elems) << endl;
      os << "Average relative intensity error within consensus features ( max{(element/center),(center/element)}, weight 1 per consensus features):" << endl << some_statistics(it_aad_by_cfs) << endl;
    }

    return EXECUTION_OK;
  }
Ejemplo n.º 13
0
  ExitCodes main_(int, const char**)
  {

    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    // file list
    StringList file_list = getStringList_("in");

    // file type
    FileHandler file_handler;
    FileTypes::Type force_type;
    if (getStringOption_("in_type").size() > 0)
    {
      force_type = FileTypes::nameToType(getStringOption_("in_type"));
    }
    else
    {
      force_type = file_handler.getType(file_list[0]);
    }

    // output file names and types
    String out_file = getStringOption_("out");

    bool annotate_file_origin =  getFlag_("annotate_file_origin");
    rt_gap_ = getDoubleOption_("rt_concat:gap");
    vector<String> trafo_out = getStringList_("rt_concat:trafo_out");
    if (trafo_out.empty())
    {
      // resize now so we don't have to worry about indexing out of bounds:
      trafo_out.resize(file_list.size());
    }
    else if (trafo_out.size() != file_list.size())
    {
      writeLog_("Error: Number of transformation output files must equal the number of input files (parameters 'rt_concat:trafo_out'/'in')!");
      return ILLEGAL_PARAMETERS;
    }

    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------

    if (force_type == FileTypes::FEATUREXML)
    {
      FeatureMap out;
      FeatureXMLFile fh;
      for (Size i = 0; i < file_list.size(); ++i)
      {
        FeatureMap map;
        fh.load(file_list[i], map);

        if (annotate_file_origin)
        {
          for (FeatureMap::iterator it = map.begin(); it != map.end(); ++it)
          {
            it->setMetaValue("file_origin", DataValue(file_list[i]));
          }
        }

        if (rt_gap_ > 0.0) // concatenate in RT
        {
          adjustRetentionTimes_(map, trafo_out[i], i == 0);
        }

        out += map;
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      // annotate output with data processing info
      addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));

      fh.store(out_file, out);
    }

    else if (force_type == FileTypes::CONSENSUSXML)
    {
      ConsensusMap out;
      ConsensusXMLFile fh;
      fh.load(file_list[0], out);
      // skip first file
      for (Size i = 1; i < file_list.size(); ++i)
      {
        ConsensusMap map;
        fh.load(file_list[i], map);

        if (annotate_file_origin)
        {
          for (ConsensusMap::iterator it = map.begin(); it != map.end(); ++it)
          {
            it->setMetaValue("file_origin", DataValue(file_list[i]));
          }
        }

        if (rt_gap_ > 0.0) // concatenate in RT
        {
          adjustRetentionTimes_(map, trafo_out[i], i == 0);
        }

        out += map;
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      // annotate output with data processing info
      addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));

      fh.store(out_file, out);
    }

    else if (force_type == FileTypes::TRAML)
    {
      TargetedExperiment out;
      TraMLFile fh;
      for (Size i = 0; i < file_list.size(); ++i)
      {
        TargetedExperiment map;
        fh.load(file_list[i], map);
        out += map;
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      // annotate output with data processing info
      Software software;
      software.setName("FileMerger");
      software.setVersion(VersionInfo::getVersion());
      out.addSoftware(software);

      fh.store(out_file, out);
    }
    else // raw data input (e.g. mzML)
    {
      // RT
      bool rt_auto_number = getFlag_("raw:rt_auto");
      bool rt_filename = getFlag_("raw:rt_filename");
      bool rt_custom = false;
      DoubleList custom_rts = getDoubleList_("raw:rt_custom");
      if (!custom_rts.empty())
      {
        rt_custom = true;
        if (custom_rts.size() != file_list.size())
        {
          writeLog_("Custom retention time list (parameter 'raw:rt_custom') must have as many elements as there are input files (parameter 'in')!");
          return ILLEGAL_PARAMETERS;
        }
      }

      // MS level
      Int ms_level = getIntOption_("raw:ms_level");

      MSExperiment<> out;
      UInt rt_auto = 0;
      UInt native_id = 0;
      for (Size i = 0; i < file_list.size(); ++i)
      {
        String filename = file_list[i];

        // load file
        force_type = file_handler.getType(file_list[i]);
        MSExperiment<> in;
        file_handler.loadExperiment(filename, in, force_type, log_type_);

        if (in.empty() && in.getChromatograms().empty())
        {
          writeLog_(String("Warning: Empty file '") + filename + "'!");
          continue;
        }
        out.reserve(out.size() + in.size());

        // warn if custom RT and more than one scan in input file
        if (rt_custom && in.size() > 1)
        {
          writeLog_(String("Warning: More than one scan in file '") + filename + "'! All scans will have the same retention time!");
        }

        // handle special raw data options:
        for (MSExperiment<>::iterator spec_it = in.begin();
             spec_it != in.end(); ++spec_it)
        {
          float rt_final = spec_it->getRT();
          if (rt_auto_number)
          {
            rt_final = ++rt_auto;
          }
          else if (rt_custom)
          {
            rt_final = custom_rts[i];
          }
          else if (rt_filename)
          {
            static const boost::regex re("rt(\\d+(\\.\\d+)?)");
            boost::smatch match;
            bool found = boost::regex_search(filename, match, re);
            if (found)
            {
              rt_final = String(match[1]).toFloat();
            }
            else
            {
              writeLog_("Warning: could not extract retention time from filename '" + filename + "'");
            }
          }

          // none of the rt methods were successful
          if (rt_final < 0)
          {
            writeLog_(String("Warning: No valid retention time for output scan '") + rt_auto + "' from file '" + filename + "'");
          }

          spec_it->setRT(rt_final);
          spec_it->setNativeID("spectrum=" + String(native_id));
          if (ms_level > 0)
          {
            spec_it->setMSLevel(ms_level);
          }
          ++native_id;
        }

        // if we have only one spectrum, we can annotate it directly, for more spectra, we just name the source file leaving the spectra unannotated (to avoid a long and redundant list of sourceFiles)
        if (in.size() == 1)
        {
          in[0].setSourceFile(in.getSourceFiles()[0]);
          in.getSourceFiles().clear(); // delete source file annotated from source file (it's in the spectrum anyways)
        }

        if (rt_gap_ > 0.0) // concatenate in RT
        {
          adjustRetentionTimes_(in, trafo_out[i], i == 0);
        }

        // add spectra to output
        for (MSExperiment<>::const_iterator spec_it = in.begin();
             spec_it != in.end(); ++spec_it)
        {
          out.addSpectrum(*spec_it);
        }
        // also add the chromatograms
        for (vector<MSChromatogram<ChromatogramPeak> >::const_iterator
               chrom_it = in.getChromatograms().begin(); chrom_it != 
               in.getChromatograms().end(); ++chrom_it)
        {
          out.addChromatogram(*chrom_it);
        }

        // copy experimental settings from first file
        if (i == 0)
        {
          out.ExperimentalSettings::operator=(in);
        }
        else // otherwise append
        {
          out.getSourceFiles().insert(out.getSourceFiles().end(), in.getSourceFiles().begin(), in.getSourceFiles().end()); // could be emtpty if spectrum was annotated above, but that's ok then
        }
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      // annotate output with data processing info
      addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));

      MzMLFile f;
      f.setLogType(log_type_);
      f.store(out_file, out);
    }

    return EXECUTION_OK;
  }
Ejemplo n.º 14
0
  void IBSpectraFile::store(const String& filename, const ConsensusMap& cm)
  {
    // typdefs for shorter code
    typedef std::vector<ProteinHit>::iterator ProtHitIt;

    // general settings .. do we need to expose these?
    // ----------------------------------------------------------------------
    /// Allow also non-unique peptides to be exported
    bool allow_non_unique = true;
    /// Intensities below this value will be set to 0.0 to avoid numerical problems when quantifying
    double intensity_threshold = 0.00001;
    // ----------------------------------------------------------------------


    // guess experiment type
    boost::shared_ptr<IsobaricQuantitationMethod> quantMethod = guessExperimentType_(cm);

    // we need the protein identifications to reference the protein names
    ProteinIdentification protIdent;
    bool has_proteinIdentifications = false;
    if (cm.getProteinIdentifications().size() > 0)
    {
      protIdent = cm.getProteinIdentifications()[0];
      has_proteinIdentifications = true;
    }

    // start the file by adding the tsv header
    TextFile textFile;
    textFile.addLine(ListUtils::concatenate(constructHeader_(*quantMethod), "\t"));

    for (ConsensusMap::ConstIterator cm_iter = cm.begin();
         cm_iter != cm.end();
         ++cm_iter)
    {
      const ConsensusFeature& cFeature = *cm_iter;
      std::vector<IdCSV> entries;

      /// 1st we extract the identification information from the consensus feature
      if (cFeature.getPeptideIdentifications().size() == 0 || !has_proteinIdentifications)
      {
        // we store unidentified hits anyway, because the iTRAQ quant is still helpful for normalization
        entries.push_back(IdCSV());
      }
      else
      {
        // protein name:
        const PeptideHit& peptide_hit = cFeature.getPeptideIdentifications()[0].getHits()[0];
        std::set<String> protein_accessions = peptide_hit.extractProteinAccessions();
        if (protein_accessions.size() != 1)
        {
          if (!allow_non_unique) continue; // we only want unique peptides
        }

        for (std::set<String>::const_iterator prot_ac = protein_accessions.begin(); prot_ac != protein_accessions.end(); ++prot_ac)
        {
          IdCSV entry;
          entry.charge = cFeature.getPeptideIdentifications()[0].getHits()[0].getCharge();
          entry.peptide = cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence().toUnmodifiedString();
          entry.theo_mass = cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence().getMonoWeight(Residue::Full, cFeature.getPeptideIdentifications()[0].getHits()[0].getCharge());

          // write modif
          entry.modif = getModifString_(cFeature.getPeptideIdentifications()[0].getHits()[0].getSequence());

          ProtHitIt proteinHit = protIdent.findHit(*prot_ac);
          if (proteinHit == protIdent.getHits().end())
          {
            std::cerr << "Protein referenced in peptide not found...\n";
            continue; // protein not found
          }

          entry.accession = proteinHit->getAccession();
          entries.push_back(entry);
        }
      }

      // 2nd we add the quantitative information of the channels

      // .. skip features with 0 intensity
      if (cFeature.getIntensity() == 0)
      {
        continue;
      }

      for (std::vector<IdCSV>::iterator entry = entries.begin();
           entry != entries.end();
           ++entry)
      {
        // set parent intensity
        entry->parent_intens = cFeature.getIntensity();
        entry->retention_time = cFeature.getRT();
        entry->spectrum = cFeature.getUniqueId();
        entry->exp_mass = cFeature.getMZ();

        // create output line
        StringList currentLine;

        // add entry to currentLine
        entry->toStringList(currentLine);

        // extract channel intensities and positions
        std::map<Int, double> intensityMap;
        ConsensusFeature::HandleSetType features = cFeature.getFeatures();

        for (ConsensusFeature::HandleSetType::const_iterator fIt = features.begin();
             fIt != features.end();
             ++fIt)
        {
          intensityMap[Int(fIt->getMZ())] = (fIt->getIntensity() > intensity_threshold ? fIt->getIntensity() : 0.0);
        }
        for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
             it != quantMethod->getChannelInformation().end();
             ++it)
        {
          currentLine.push_back(String(it->center));
        }
        for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
             it != quantMethod->getChannelInformation().end();
             ++it)
        {
          currentLine.push_back(String(intensityMap[int(it->center)]));
        }

        textFile.addLine(ListUtils::concatenate(currentLine, "\t"));
      }
    }

    // write to file
    textFile.store(filename);
  }
Ejemplo n.º 15
0
  ExitCodes main_(int, const char **)
  {
    String in = getStringOption_("in"), out = getStringOption_("out"),
           id_out = getStringOption_("id_out");

    if (out.empty() && id_out.empty())
    {
      throw Exception::RequiredParameterNotGiven(__FILE__, __LINE__,
                                                 __PRETTY_FUNCTION__,
                                                 "out/id_out");
    }

    vector<ProteinIdentification> proteins;
    vector<PeptideIdentification> peptides;

    FileTypes::Type in_type = FileHandler::getType(in);

    if (in_type == FileTypes::MZML)
    {
      MSExperiment<> experiment;
      MzMLFile().load(in, experiment);
      // what about unassigned peptide IDs?
      for (MSExperiment<>::Iterator exp_it = experiment.begin();
           exp_it != experiment.end(); ++exp_it)
      {
        peptides.insert(peptides.end(),
                        exp_it->getPeptideIdentifications().begin(),
                        exp_it->getPeptideIdentifications().end());
        exp_it->getPeptideIdentifications().clear();
      }
      experiment.getProteinIdentifications().swap(proteins);
      if (!out.empty())
      {
        addDataProcessing_(experiment,
                           getProcessingInfo_(DataProcessing::FILTERING));
        MzMLFile().store(out, experiment);
      }
    }
    else if (in_type == FileTypes::FEATUREXML)
    {
      FeatureMap features;
      FeatureXMLFile().load(in, features);
      features.getUnassignedPeptideIdentifications().swap(peptides);
      for (FeatureMap::Iterator feat_it = features.begin();
           feat_it != features.end(); ++feat_it)
      {
        peptides.insert(peptides.end(),
                        feat_it->getPeptideIdentifications().begin(),
                        feat_it->getPeptideIdentifications().end());
        feat_it->getPeptideIdentifications().clear();
      }
      features.getProteinIdentifications().swap(proteins);
      if (!out.empty())
      {
        addDataProcessing_(features,
                           getProcessingInfo_(DataProcessing::FILTERING));
        FeatureXMLFile().store(out, features);
      }
    }
    else         // consensusXML
    {
      ConsensusMap consensus;
      ConsensusXMLFile().load(in, consensus);
      consensus.getUnassignedPeptideIdentifications().swap(peptides);
      for (ConsensusMap::Iterator cons_it = consensus.begin();
           cons_it != consensus.end(); ++cons_it)
      {
        peptides.insert(peptides.end(),
                        cons_it->getPeptideIdentifications().begin(),
                        cons_it->getPeptideIdentifications().end());
        cons_it->getPeptideIdentifications().clear();
      }
      consensus.getProteinIdentifications().swap(proteins);
      if (!out.empty())
      {
        addDataProcessing_(consensus,
                           getProcessingInfo_(DataProcessing::FILTERING));
        ConsensusXMLFile().store(out, consensus);
      }
    }

    if (!id_out.empty())
    {
      // IDMapper can match a peptide ID to several overlapping features,
      // resulting in duplicates; this shouldn't be the case for peak data
      if (in_type != FileTypes::MZML) removeDuplicates_(peptides);
      IdXMLFile().store(id_out, proteins, peptides);
    }

    return EXECUTION_OK;
  }
Ejemplo n.º 16
0
  void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps, ConsensusMap& result_map)
  {
    if (input_maps.size() != 1)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly one input map required");
    if (result_map.getFileDescriptions().size() != 2)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "two file descriptions required");
    if (result_map.getFileDescriptions().begin()->second.filename != result_map.getFileDescriptions().rbegin()->second.filename)
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the two file descriptions have to contain the same file name");
    checkIds_(input_maps);

    //look up the light and heavy index
    Size light_index = numeric_limits<Size>::max();
    Size heavy_index = numeric_limits<Size>::max();
    for (ConsensusMap::FileDescriptions::const_iterator it = result_map.getFileDescriptions().begin();
         it != result_map.getFileDescriptions().end();
         ++it)
    {
      if (it->second.label == "heavy")
      {
        heavy_index = it->first;
      }
      else if (it->second.label == "light")
      {
        light_index = it->first;
      }
    }
    if (light_index == numeric_limits<Size>::max() || heavy_index == numeric_limits<Size>::max())
    {
      throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the input maps have to be labeled 'light' and 'heavy'");
    }

    result_map.clear(false);

    // sort consensus features by RT (and MZ) to speed up searching afterwards
    typedef ConstRefVector<ConsensusMap> RefMap;
    RefMap model_ref(input_maps[0].begin(), input_maps[0].end());
    model_ref.sortByPosition();

    //calculate matches
    ConsensusMap matches;
    //settings
    double rt_pair_dist = param_.getValue("rt_pair_dist");
    double rt_dev_low = param_.getValue("rt_dev_low");
    double rt_dev_high = param_.getValue("rt_dev_high");
    double mz_dev = param_.getValue("mz_dev");
    DoubleList mz_pair_dists = param_.getValue("mz_pair_dists");
    bool mrm = param_.getValue("mrm").toBool();

    //estimate RT parameters
    if (param_.getValue("rt_estimate") == "true")
    {
      //find all possible RT distances of features with the same charge and a good m/z distance
      vector<double> dists;
      dists.reserve(model_ref.size());
      for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
      {
        for (RefMap::const_iterator it2 = model_ref.begin(); it2 != model_ref.end(); ++it2)
        {
          for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
          {
            double mz_pair_dist = *dist_it;
            if (it2->getCharge() == it->getCharge()
               && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
               && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev)
            {
              dists.push_back(it2->getRT() - it->getRT());
            }
          }
        }
      }
      if (dists.empty())
      {
        cout << "Warning: Could not find pairs for RT distance estimation. The manual settings are used!" << endl;
      }
      else
      {
        if (dists.size() < 50)
        {
          cout << "Warning: Found only " << dists.size() << " pairs. The estimated shift and std deviation are probably not reliable!" << endl;
        }
        //--------------------------- estimate initial parameters of fit ---------------------------
        GaussFitter::GaussFitResult result(-1, -1, -1);
        //first estimate of the optimal shift: median of the distances
        sort(dists.begin(), dists.end());
        Size median_index = dists.size() / 2;
        result.x0 = dists[median_index];
        //create histogram of distances
        //consider only the maximum of pairs, centered around the optimal shift
        Size max_pairs = model_ref.size() / 2;
        Size start_index = (Size) max((SignedSize)0, (SignedSize)(median_index - max_pairs / 2));
        Size end_index = (Size) min((SignedSize)(dists.size() - 1), (SignedSize)(median_index + max_pairs / 2));
        double start_value = dists[start_index];
        double end_value = dists[end_index];
        double bin_step = fabs(end_value - start_value) / 99.999; //ensure that we have 100 bins
        Math::Histogram<> hist(start_value, end_value, bin_step);
        //std::cout << "HIST from " << start_value << " to " << end_value << " (bin size " << bin_step << ")" << endl;
        for (Size i = start_index; i <= end_index; ++i)
        {
          hist.inc(dists[i]);
        }
        //cout << hist << endl;
        dists.clear();
        //determine median of bins (uniform background distribution)
        vector<Size> bins(hist.begin(), hist.end());
        sort(bins.begin(), bins.end());
        Size bin_median = bins[bins.size() / 2];
        bins.clear();
        //estimate scale A: maximum of the histogram
        Size max_value = hist.maxValue();
        result.A = max_value - bin_median;
        //overwrite estimate of x0 with the position of the highest bin
        for (Size i = 0; i < hist.size(); ++i)
        {
          if (hist[i] == max_value)
          {
            result.x0 = hist.centerOfBin(i);
            break;
          }
        }
        //estimate sigma: first time the count is less or equal the median count in the histogram
        double pos = result.x0;
        while (pos > start_value && hist.binValue(pos) > bin_median)
        {
          pos -= bin_step;
        }
        double sigma_low =  result.x0 - pos;
        pos = result.x0;
        while (pos<end_value&& hist.binValue(pos)> bin_median)
        {
          pos += bin_step;
        }
        double sigma_high = pos - result.x0;
        result.sigma = (sigma_high + sigma_low) / 6.0;
        //cout << "estimated optimal RT distance (before fit): " << result.x0 << endl;
        //cout << "estimated allowed deviation (before fit): " << result.sigma*3.0 << endl;
        //--------------------------- do gauss fit ---------------------------
        vector<DPosition<2> > points(hist.size());
        for (Size i = 0; i < hist.size(); ++i)
        {
          points[i][0] = hist.centerOfBin(i);
          points[i][1] = max(0u, hist[i]);
        }
        GaussFitter fitter;
        fitter.setInitialParameters(result);
        result = fitter.fit(points);
        cout << "estimated optimal RT distance: " << result.x0 << endl;
        cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl;
        rt_pair_dist = result.x0;
        rt_dev_low = fabs(result.sigma) * 3.0;
        rt_dev_high = fabs(result.sigma) * 3.0;
      }
    }


    // check each feature
    for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
    {
      for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
      {
        double mz_pair_dist = *dist_it;
        RefMap::const_iterator it2 = lower_bound(model_ref.begin(), model_ref.end(), it->getRT() + rt_pair_dist - rt_dev_low, ConsensusFeature::RTLess());
        while (it2 != model_ref.end() && it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high)
        {
          // if in mrm mode, we need to compare precursor mass difference and fragment mass difference, charge remains the same

          double prec_mz_diff(0);
          if (mrm)
          {
            prec_mz_diff = fabs((double)it2->getMetaValue("MZ") - (double)it->getMetaValue("MZ"));
            if (it->getCharge() != 0)
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge());
            }
            else
            {
              prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist);
            }
          }

          bool mrm_correct_dist(false);
          double frag_mz_diff = fabs(it->getMZ() - it2->getMZ());

          //cerr << it->getRT() << " charge1=" << it->getCharge() << ", charge2=" << it2->getCharge() << ", prec_diff=" << prec_mz_diff << ", frag_diff=" << frag_mz_diff << endl;

          if (mrm &&
              it2->getCharge() == it->getCharge() &&
              prec_mz_diff < mz_dev &&
              (frag_mz_diff < mz_dev || fabs(frag_mz_diff - mz_pair_dist) < mz_dev))
          {
            mrm_correct_dist = true;
            //cerr << "mrm_correct_dist" << endl;
          }

          if ((mrm && mrm_correct_dist) || (!mrm &&
                                            it2->getCharge() == it->getCharge() &&
                                            it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev &&
                                            it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev
                                            ))
          {
            //cerr << "dist correct" << endl;
            double score = sqrt(
              PValue_(it2->getMZ() - it->getMZ(), mz_pair_dist / it->getCharge(), mz_dev, mz_dev) *
              PValue_(it2->getRT() - it->getRT(), rt_pair_dist, rt_dev_low, rt_dev_high)
              );

            // Note: we used to copy the id from the light feature here, but that strategy does not generalize to more than two labels.
            // We might want to report consensus features where the light one is missing but more than one heavier variant was found.
            // Also, the old strategy is inconsistent with what was done in the unlabeled case.  Thus now we assign a new unique id here.
            matches.push_back(ConsensusFeature());
            matches.back().setUniqueId();

            matches.back().insert(light_index, *it);
            matches.back().clearMetaInfo();
            matches.back().insert(heavy_index, *it2);
            matches.back().setQuality(score);
            matches.back().setCharge(it->getCharge());
            matches.back().computeMonoisotopicConsensus();
          }
          ++it2;
        }
      }
    }

    //compute best pairs
    // - sort matches by quality
    // - take highest-quality matches first (greedy) and mark them as used
    set<Size> used_features;
    matches.sortByQuality(true);
    for (ConsensusMap::const_iterator match = matches.begin(); match != matches.end(); ++match)
    {
      //check if features are not used yet
      if (used_features.find(match->begin()->getUniqueId()) == used_features.end() &&
          used_features.find(match->rbegin()->getUniqueId()) == used_features.end()
          )
      {
        //if unused, add it to the final set of elements
        result_map.push_back(*match);
        used_features.insert(match->begin()->getUniqueId());
        used_features.insert(match->rbegin()->getUniqueId());
      }
    }

    //Add protein identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getProteinIdentifications().insert(result_map.getProteinIdentifications().end(), input_maps[i].getProteinIdentifications().begin(), input_maps[i].getProteinIdentifications().end());
    }

    //Add unassigned peptide identifications to result map
    for (Size i = 0; i < input_maps.size(); ++i)
    {
      result_map.getUnassignedPeptideIdentifications().insert(result_map.getUnassignedPeptideIdentifications().end(), input_maps[i].getUnassignedPeptideIdentifications().begin(), input_maps[i].getUnassignedPeptideIdentifications().end());
    }

    // Very useful for checking the results, and the ids have no real meaning anyway
    result_map.sortByMZ();
  }
Ejemplo n.º 17
0
  ExitCodes main_(int, const char**)
  {

    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------
    //file list
    StringList file_list = getStringList_("in");

    //file type
    FileHandler fh;
    FileTypes::Type force_type;
    if (getStringOption_("in_type").size() > 0)
    {
      force_type = FileTypes::nameToType(getStringOption_("in_type"));
    }
    else
    {
      force_type = fh.getType(file_list[0]);
    }

    //output file names and types
    String out_file = getStringOption_("out");

    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------

    bool annotate_file_origin =  getFlag_("annotate_file_origin");

    if (force_type == FileTypes::FEATUREXML)
    {
      FeatureMap<> out;
      for (Size i = 0; i < file_list.size(); ++i)
      {
        FeatureMap<> map;
        FeatureXMLFile fh;
        fh.load(file_list[i], map);

        if (annotate_file_origin)
        {
          for (FeatureMap<>::iterator it = map.begin(); it != map.end(); ++it)
          {
            it->setMetaValue("file_origin", DataValue(file_list[i]));
          }
        }
        out += map;
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      //annotate output with data processing info
      addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));

      FeatureXMLFile f;
      f.store(out_file, out);

    }
    else if (force_type == FileTypes::CONSENSUSXML)
    {
      ConsensusMap out;
      ConsensusXMLFile fh;
      fh.load(file_list[0], out);
      //skip first file
      for (Size i = 1; i < file_list.size(); ++i)
      {
        ConsensusMap map;
        ConsensusXMLFile fh;
        fh.load(file_list[i], map);

        if (annotate_file_origin)
        {
          for (ConsensusMap::iterator it = map.begin(); it != map.end(); ++it)
          {
            it->setMetaValue("file_origin", DataValue(file_list[i]));
          }
        }
        out += map;
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      //annotate output with data processing info
      addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));

      ConsensusXMLFile f;
      f.store(out_file, out);
    }
    else if (force_type == FileTypes::TRAML)
    {
      TargetedExperiment out;
      for (Size i = 0; i < file_list.size(); ++i)
      {
        TargetedExperiment map;
        TraMLFile fh;
        fh.load(file_list[i], map);
        out += map;
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      //annotate output with data processing info
      Software software;
      software.setName("FileMerger");
      software.setVersion(VersionInfo::getVersion());
      out.addSoftware(software);

      TraMLFile f;
      f.store(out_file, out);
    }
    else
    {
      // we might want to combine different types, thus we only
      // query in_type (which applies to all files)
      // and not the suffix or content of a single file
      force_type = FileTypes::nameToType(getStringOption_("in_type"));

      //rt
      bool rt_auto_number = getFlag_("raw:rt_auto");
      bool rt_filename = getFlag_("raw:rt_filename");
      bool rt_custom = false;
      DoubleList custom_rts = getDoubleList_("raw:rt_custom");
      if (custom_rts.size() != 0)
      {
        rt_custom = true;
        if (custom_rts.size() != file_list.size())
        {
          writeLog_("Custom retention time list must have as many elements as there are input files!");
          printUsage_();
          return ILLEGAL_PARAMETERS;
        }
      }

      //ms level
      bool user_ms_level = getFlag_("raw:user_ms_level");

      MSExperiment<> out;
      out.reserve(file_list.size());
      UInt rt_auto = 0;
      UInt native_id = 0;
      std::vector<MSChromatogram<ChromatogramPeak> > all_chromatograms;
      for (Size i = 0; i < file_list.size(); ++i)
      {
        String filename = file_list[i];

        //load file
        MSExperiment<> in;
        fh.loadExperiment(filename, in, force_type, log_type_);
        if (in.empty() && in.getChromatograms().empty())
        {
          writeLog_(String("Warning: Empty file '") + filename + "'!");
          continue;
        }
        out.reserve(out.size() + in.size());

        //warn if custom RT and more than one scan in input file
        if (rt_custom && in.size() > 1)
        {
          writeLog_(String("Warning: More than one scan in file '") + filename + "'! All scans will have the same retention time!");
        }

        for (MSExperiment<>::const_iterator it2 = in.begin(); it2 != in.end(); ++it2)
        {
          //handle rt
          Real rt_final = it2->getRT();
          if (rt_auto_number)
          {
            rt_final = ++rt_auto;
          }
          else if (rt_custom)
          {
            rt_final = custom_rts[i];
          }
          else if (rt_filename)
          {
            if (!filename.hasSubstring("rt"))
            {
              writeLog_(String("Warning: cannot guess retention time from filename as it does not contain 'rt'"));
            }
            for (Size i = 0; i < filename.size(); ++i)
            {
              if (filename[i] == 'r' && ++i != filename.size() && filename[i] == 't' && ++i != filename.size() && isdigit(filename[i]))
              {
                String rt;
                while (i != filename.size() && (filename[i] == '.' || isdigit(filename[i])))
                {
                  rt += filename[i++];
                }
                if (rt.size() > 0)
                {
                  // remove dot from rt3892.98.dta
                  //                          ^
                  if (rt[rt.size() - 1] == '.')
                  {
                    // remove last character
                    rt.erase(rt.end() - 1);
                  }
                }
                try
                {
                  float tmp = rt.toFloat();
                  rt_final = tmp;
                }
                catch (Exception::ConversionError)
                {
                  writeLog_(String("Warning: cannot convert the found retention time in a value '" + rt + "'."));
                }
              }
            }
          }

          // none of the rt methods were successful
          if (rt_final == -1)
          {
            writeLog_(String("Warning: No valid retention time for output scan '") + rt_auto + "' from file '" + filename + "'");
          }

          out.addSpectrum(*it2);
          out.getSpectra().back().setRT(rt_final);
          out.getSpectra().back().setNativeID(native_id);

          if (user_ms_level)
          {
            out.getSpectra().back().setMSLevel((int)getIntOption_("raw:ms_level"));
          }
          ++native_id;
        }

        // if we had only one spectrum, we can annotate it directly, for more spectra, we just name the source file leaving the spectra unannotated (to avoid a long and redundant list of sourceFiles)
        if (in.size() == 1)
        {
          out.getSpectra().back().setSourceFile(in.getSourceFiles()[0]);
          in.getSourceFiles().clear();   // delete source file annotated from source file (its in the spectrum anyways)
        }
        // copy experimental settings from first file
        if (i == 0)
        {
          out.ExperimentalSettings::operator=(in);
        }
        else // otherwise append
        {
          out.getSourceFiles().insert(out.getSourceFiles().end(), in.getSourceFiles().begin(), in.getSourceFiles().end()); // could be emtpty if spectrum was annotated above, but that's ok then
        }

        // also add the chromatograms
        for (std::vector<MSChromatogram<ChromatogramPeak> >::const_iterator it2 = in.getChromatograms().begin(); it2 != in.getChromatograms().end(); ++it2)
        {
          all_chromatograms.push_back(*it2);
        }

      }
      // set the chromatograms
      out.setChromatograms(all_chromatograms);

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------

      //annotate output with data processing info
      addDataProcessing_(out, getProcessingInfo_(DataProcessing::FORMAT_CONVERSION));

      MzMLFile f;
      f.setLogType(log_type_);
      f.store(out_file, out);

    }

    return EXECUTION_OK;
  }
  void PoseClusteringShiftSuperimposer::run(const ConsensusMap & map_model, const ConsensusMap & map_scene, TransformationDescription & transformation)
  {
    typedef ConstRefVector<ConsensusMap> PeakPointerArray_;
    typedef Math::LinearInterpolation<double, double> LinearInterpolationType_;

    LinearInterpolationType_ shift_hash_;

    // OLD STUFF
    //    LinearInterpolationType_ scaling_hash_1;
    //    LinearInterpolationType_ scaling_hash_2;
    //    LinearInterpolationType_ shift_hash_;
    //    LinearInterpolationType_ rt_high_hash_;

    /// Maximum deviation in mz of two partner points
    const double mz_pair_max_distance = param_.getValue("mz_pair_max_distance");

    /// Size of each shift bucket
    const double shift_bucket_size = param_.getValue("shift_bucket_size");

    const UInt struc_elem_length_datapoints = 21; // MAGIC ALERT: number of data points in structuring element for tophat filter, which removes baseline from histogram
    const double scaling_histogram_crossing_slope = 3.0; // MAGIC ALERT: used when distinguishing noise level and enriched histogram bins
    const double scaling_cutoff_stdev_multiplier = 1.5; // MAGIC ALERT: multiplier for stdev in cutoff for outliers
    const UInt loops_mean_stdev_cutoff = 3; // MAGIC ALERT: number of loops in stdev cutoff for outliers

    startProgress(0, 100, "shift pose clustering");
    UInt actual_progress = 0;
    setProgress(++actual_progress);

    // Optionally, we will write dumps of the hash table buckets.
    bool do_dump_buckets = false;
    String dump_buckets_basename;
    if (param_.getValue("dump_buckets") != "")
    {
      do_dump_buckets = true;
      dump_buckets_basename = param_.getValue("dump_buckets");
    }
    setProgress(++actual_progress);

    // Even more optionally, we will write dumps of the hashed pairs.
    bool do_dump_pairs = false;
    String dump_pairs_basename;
    if (param_.getValue("dump_pairs") != "")
    {
      do_dump_pairs = true;
      dump_pairs_basename = param_.getValue("dump_pairs");
    }
    setProgress(++actual_progress);

    //**************************************************************************
    // Select the most abundant data points only.  After that, disallow modifications
    // (we tend to have annoying issues with const_iterator versus iterator).
    PeakPointerArray_ model_map_ini(map_model.begin(), map_model.end());
    const PeakPointerArray_ & model_map(model_map_ini);
    PeakPointerArray_ scene_map_ini(map_scene.begin(), map_scene.end());
    const PeakPointerArray_ & scene_map(scene_map_ini);
    {
      // truncate the data as necessary
      // casting to SignedSize is done on PURPOSE here! (num_used_points will be maximal if -1 is used)
      const Size num_used_points = (SignedSize) param_.getValue("num_used_points");
      if (model_map_ini.size() > num_used_points)
      {
        model_map_ini.sortByIntensity(true);
        model_map_ini.resize(num_used_points);
      }
      model_map_ini.sortByComparator(Peak2D::MZLess());
      setProgress(++actual_progress);
      if (scene_map_ini.size() > num_used_points)
      {
        scene_map_ini.sortByIntensity(true);
        scene_map_ini.resize(num_used_points);
      }
      scene_map_ini.sortByComparator(Peak2D::MZLess());
      setProgress(++actual_progress);
      // Note: model_map_ini and scene_map_ini will not be used further below
    }
    setProgress((actual_progress = 10));

    //**************************************************************************
    // Preprocessing

    // get RT ranges (NOTE: we trust that min and max have been updated in the
    // ConsensusMap::convert() method !)

    const double model_low = map_model.getMin()[ConsensusFeature::RT];
    const double scene_low = map_scene.getMin()[ConsensusFeature::RT];
    const double model_high = map_model.getMax()[ConsensusFeature::RT];
    const double scene_high = map_scene.getMax()[ConsensusFeature::RT];

    // OLD STUFF
    //    const double rt_low = (maps[0].getMin()[ConsensusFeature::RT] + maps[1].getMin()[ConsensusFeature::RT]) / 2.;
    //    const double rt_high = (maps[0].getMax()[ConsensusFeature::RT] + maps[1].getMax()[ConsensusFeature::RT]) / 2.;

    // Initialize the hash tables: shift_hash_
    // OLD STUFF: was:  rt_scaling_hash_, rt_low_hash_, and rt_high_hash_
    {
      // (over)estimate the required number of buckets for shifting
      double max_shift = param_.getValue("max_shift");
      // actually the largest possible shift can be much smaller, depending on the data
      do
      {
        if (max_shift < 0)
          max_shift = -max_shift;
        //     ...ml@@@mh........    ,    ........ml@@@mh...
        //     ........sl@@@sh...    ,    ...sl@@@sh........
        double diff;
        diff = model_high - scene_low;
        if (diff < 0)
          diff = -diff;
        if (max_shift > diff)
          max_shift = diff;
        diff = model_low - scene_high;
        if (diff < 0)
          diff = -diff;
        if (max_shift > diff)
          max_shift = diff;
      }
      while (0);

      const Int shift_buckets_num_half = 4 + (Int) ceil((max_shift) / shift_bucket_size);
      const Int shift_buckets_num = 1 + 2 * shift_buckets_num_half;

      shift_hash_.getData().clear();
      shift_hash_.getData().resize(shift_buckets_num);
      shift_hash_.setMapping(shift_bucket_size, shift_buckets_num_half, 0);
    }
    setProgress(++actual_progress);

    //**************************************************************************
    // compute the ratio of the total intensities of both maps, for normalization
    double total_intensity_ratio;
    do
    {
      double total_int_model_map = 0;
      for (Size i = 0; i < model_map.size(); ++i)
      {
        total_int_model_map += model_map[i].getIntensity();
      }
      setProgress(++actual_progress);
      double total_int_scene_map = 0;
      for (Size i = 0; i < scene_map.size(); ++i)
      {
        total_int_scene_map += scene_map[i].getIntensity();
      }
      setProgress(++actual_progress);
      // ... and finally ...
      total_intensity_ratio = total_int_model_map / total_int_scene_map;
    }
    while (0);   // (the extra syntax helps with code folding in eclipse!)
    setProgress((actual_progress = 20));

    /// The serial number is incremented for each invocation of this, to avoid overwriting of hash table dumps.
    static Int dump_buckets_serial = 0;
    ++dump_buckets_serial;

    //**************************************************************************
    // Hashing

    // Compute the transformations between each point pair in the model map
    // and each point pair in the scene map and hash the shift
    // transformation.

    // To speed up the calculation of the final transformation, we confine the number of
    // considered point pairs.  We match a point p in the model map only onto those points p'
    // in the scene map that lie in a certain mz interval.

    Size const model_map_size = model_map.size(); // i  /* OLD STUFF: also: j */
    Size const scene_map_size = scene_map.size(); // k  /* OLD STUFF: also: l */

    const double winlength_factor_baseline = 0.1; // MAGIC ALERT: Each window is given unit weight.  If there are too many pairs for a window, the individual contributions will be very small, but running time will be high, so we provide a cutoff for this.  Typically this will exclude compounds which elute over the whole retention time range from consideration.


    ///////////////////////////////////////////////////////////////////
    // Hashing:  Estimate the shift

    do // begin of hashing (the extra syntax helps with code folding in eclipse!)
    {
      String dump_pairs_filename;
      std::ofstream dump_pairs_file;
      if (do_dump_pairs)
      {
        dump_pairs_filename = dump_pairs_basename + String(dump_buckets_serial);
        dump_pairs_file.open(dump_pairs_filename.c_str());
        dump_pairs_file << "#" << ' ' << "i" << ' ' << "k" << std::endl;
      }
      setProgress(++actual_progress);

      // first point in model map
      for (Size i = 0, i_low = 0, i_high = 0, k_low = 0, k_high = 0; i < model_map_size - 1; ++i)
      {
        setProgress(actual_progress + float(i) / model_map_size * 10.f);

        // Adjust window around i in model map
        while (i_low < model_map_size && model_map[i_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance)
          ++i_low;
        while (i_high < model_map_size && model_map[i_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance)
          ++i_high;
        double i_winlength_factor = 1. / (i_high - i_low);
        i_winlength_factor -= winlength_factor_baseline;
        if (i_winlength_factor <= 0)
          continue;

        // Adjust window around k in scene map
        while (k_low < scene_map_size && scene_map[k_low].getMZ() < model_map[i].getMZ() - mz_pair_max_distance)
          ++k_low;
        while (k_high < scene_map_size && scene_map[k_high].getMZ() <= model_map[i].getMZ() + mz_pair_max_distance)
          ++k_high;

        // first point in scene map
        for (Size k = k_low; k < k_high; ++k)
        {
          double k_winlength_factor = 1. / (k_high - k_low);
          k_winlength_factor -= winlength_factor_baseline;
          if (k_winlength_factor <= 0)
            continue;

          // compute similarity of intensities i k
          double similarity_ik;
          {
            const double int_i = model_map[i].getIntensity();
            const double int_k = scene_map[k].getIntensity() * total_intensity_ratio;
            similarity_ik = (int_i < int_k) ? int_i / int_k : int_k / int_i;
            // weight is inverse proportional to number of elements with similar mz
            similarity_ik *= i_winlength_factor;
            similarity_ik *= k_winlength_factor;
            // VV_(int_i<<' '<<int_k<<' '<<int_similarity_ik);
          }

          // compute the transformation (i) -> (k)
          double shift = model_map[i].getRT() - scene_map[k].getRT();

          // hash the images of scaling, rt_low and rt_high into their respective hash tables
          shift_hash_.addValue(shift, similarity_ik);

          if (do_dump_pairs)
          {
            dump_pairs_file << i << ' ' << model_map[i].getRT() << ' ' << model_map[i].getMZ() << ' ' << k << ' ' << scene_map[k].getRT() << ' '
                            << scene_map[k].getMZ() << ' ' << similarity_ik << ' ' << std::endl;
          }

        } // k
      } // i
    }
    while (0);   // end of hashing (the extra syntax helps with code folding in eclipse!)

    setProgress((actual_progress = 30));

    ///////////////////////////////////////////////////////////////////
    // work on shift_hash_
    //   double shift_low;
    //   double shift_centroid;
    //   double shift_high;

    // OLD STUFF
    // double shift_low;
    double shift_centroid;
    // double shift_high;
    do
    {

      UInt filtering_stage = 0;

      // optionally, dump before filtering
      String dump_buckets_filename;
      std::ofstream dump_buckets_file;
      if (do_dump_buckets)
      {
        dump_buckets_filename = dump_buckets_basename + "_" + String(dump_buckets_serial);
        dump_buckets_file.open(dump_buckets_filename.c_str());
        VV_(dump_buckets_filename);

        dump_buckets_file << "# shift hash table buckets dump ( scale, height ) : " << dump_buckets_filename << std::endl;
        dump_buckets_file << "# unfiltered hash data\n";
        for (Size index = 0; index < shift_hash_.getData().size(); ++index)
        {
          const double image = shift_hash_.index2key(index);
          const double height = shift_hash_.getData()[index];
          dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n';
        }
        dump_buckets_file << '\n';
      }

      ++filtering_stage;
      setProgress(++actual_progress);

      // apply tophat filter to histogram
      MorphologicalFilter morph_filter;
      Param morph_filter_param;
      morph_filter_param.setValue("struc_elem_unit", "DataPoints");
      morph_filter_param.setValue("struc_elem_length", double(struc_elem_length_datapoints));
      morph_filter_param.setValue("method", "tophat");
      morph_filter.setParameters(morph_filter_param);

      LinearInterpolationType_::container_type buffer(shift_hash_.getData().size());
      morph_filter.filterRange(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin());
      shift_hash_.getData().swap(buffer);

      // optionally, dump after filtering
      if (do_dump_buckets)
      {
        dump_buckets_file << "# tophat filtered hash data\n";
        for (Size index = 0; index < shift_hash_.getData().size(); ++index)
        {
          const double image = shift_hash_.index2key(index);
          const double height = shift_hash_.getData()[index];
          dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n';
        }
        dump_buckets_file << '\n';
      }
      setProgress(++actual_progress);

      ++filtering_stage;

      // compute freq_cutoff using a fancy criterion to distinguish between the noise level of the histogram and enriched histogram bins
      double freq_cutoff_low;
      do
      {
        {
          std::copy(shift_hash_.getData().begin(), shift_hash_.getData().end(), buffer.begin());
          std::sort(buffer.begin(), buffer.end(), std::greater<double>());
          double freq_intercept = shift_hash_.getData().front();
          double freq_slope = (shift_hash_.getData().back() - shift_hash_.getData().front()) / double(buffer.size())
                                  / scaling_histogram_crossing_slope;
          if (!freq_slope || !buffer.size())
          {
            // in fact these conditions are actually impossible, but let's be really sure ;-)
            freq_cutoff_low = 0;
          }
          else
          {
            Size index = 1; // not 0 (!)
            while (buffer[index] >= freq_intercept + freq_slope * double(index))
            {
              ++index;
            }
            freq_cutoff_low = buffer[--index]; // note that we have index >= 1
          }
        }
      }
      while (0);
      setProgress(++actual_progress);

      // apply freq_cutoff, setting smaller values to zero
      for (Size index = 0; index < shift_hash_.getData().size(); ++index)
      {
        if (shift_hash_.getData()[index] < freq_cutoff_low)
        {
          shift_hash_.getData()[index] = 0;
        }
      }
      setProgress(++actual_progress);

      // optionally, dump after noise filtering using freq_cutoff
      if (do_dump_buckets)
      {
        dump_buckets_file << "# after freq_cutoff, which is: " << freq_cutoff_low << '\n';
        for (Size index = 0; index < shift_hash_.getData().size(); ++index)
        {
          const double image = shift_hash_.index2key(index);
          const double height = shift_hash_.getData()[index];
          dump_buckets_file << filtering_stage << '\t' << index << '\t' << image << '\t' << height << '\n';
        }
        dump_buckets_file << '\n';
      }
      setProgress(++actual_progress);

      // iterative cut-off based on mean and stdev - relies upon scaling_cutoff_stdev_multiplier which is a bit hard to set right.
      {
        Math::BasicStatistics<double> statistics;
        std::vector<double>::const_iterator data_begin = shift_hash_.getData().begin();
        const Size data_size = shift_hash_.getData().size();
        Size data_range_begin = 0;
        Size data_range_end = data_size;
        for (UInt loop = 0; loop < loops_mean_stdev_cutoff; ++loop)   // MAGIC ALERT: number of loops
        {
          statistics.update(data_begin + data_range_begin, data_begin + data_range_end);
          double mean = statistics.mean() + data_range_begin;
          double stdev = sqrt(statistics.variance());
          data_range_begin = floor(std::max<double>(mean - scaling_cutoff_stdev_multiplier * stdev, 0));
          data_range_end = ceil(std::min<double>(mean + scaling_cutoff_stdev_multiplier * stdev + 1, data_size));
          const double outside_mean = shift_hash_.index2key(mean);
          const double outside_stdev = stdev * shift_hash_.getScale();
          // shift_low = (outside_mean - outside_stdev);
          shift_centroid = (outside_mean);
          // shift_high = (outside_mean + outside_stdev);
          if (do_dump_buckets)
          {
            dump_buckets_file << "# loop: " << loop << "  mean: " << outside_mean << "  stdev: " << outside_stdev << "  (mean-stdev): "
                              << outside_mean - outside_stdev << "  (mean+stdev): " << outside_mean + outside_stdev
                              << "  data_range_begin: " << data_range_begin << "  data_range_end: "
                              << data_range_end << std::endl;
          }
        }
        setProgress(++actual_progress);
      }
      if (do_dump_buckets)
      {
        dump_buckets_file << "# EOF" << std::endl;
        dump_buckets_file.close();
      }
      setProgress(80);

    }
    while (0);

    //************************************************************************************
    // Estimate transform

    // Compute the shifts at the low and high ends by looking at (around) the fullest bins.
    double intercept;
#if 1 // yes of course, use centroids for images of rt_low and rt_high
    intercept = shift_centroid;
#else // ooh, use maximum bins instead (Note: this is a fossil which would disregard most of the above computations!  The code is left here for developers/debugging only.)
    const Size rt_low_max_index = std::distance(shift_hash_.getData().begin(),
                                                std::max_element(shift_hash_.getData().begin(), shift_hash_.getData().end()));
    intercept = shift_hash_.index2key(rt_low_max_index);
#endif

    VV_(intercept);

    setProgress(++actual_progress);

    // set trafo
    {
      Param params;
      params.setValue("slope", 1.0);
      params.setValue("intercept", intercept);

      TransformationDescription trafo;
      trafo.fitModel("linear", params);
      transformation = trafo;
    }

    setProgress(++actual_progress);
    endProgress();

    return;
  } // run()