예제 #1
0
 /**
   @brief Backs up the current score of @p hit as a meta value named after @p score_type.

   The meta value key is @p score_type itself if that name already contains
   "score", otherwise @p score_type with "_score" appended. Nothing is written
   if a meta value exists under either of the two names.

   Limitation: if the score type is stored under a different name in the meta
   values (e.g. Percolator_qvalue <-> q-value), it cannot be found here. Unique
   names for score types would be a future improvement.
   LuciphorAdapter uses the same strategy to back up previous scores.
 */
 void addScoreToMetaValues_(PeptideHit& hit, const String score_type)
 {
   const String suffixed_name = score_type + "_score";

   // an earlier backup (under either spelling) must not be overwritten
   if (hit.metaValueExists(score_type) || hit.metaValueExists(suffixed_name))
   {
     return;
   }

   const String& key = score_type.hasSubstring("score") ? score_type : suffixed_name;
   hit.setMetaValue(key, hit.getScore());
 }
예제 #2
0
  /**
    @brief Converts the score of @p hit into the engine-specific value used by this class.

    - OMSSA: -log10 of the hit score
    - MyriMatch: the hit score, unchanged
    - XTandem: -log10 of the "E-Value" meta value
    - MASCOT: -log10 of the "EValue" meta value, or else of the "expect" meta value
    - SpectraST: 100 times the hit score (SpectraST f-val)
    - SimTandem: -log10 of the "E-Value" meta value

    Every value passed to log10 is first raised to at least smallest_e_value_.

    @param engine Name of the search engine that produced @p hit
    @param hit Peptide hit to read the score / meta values from
    @return The transformed score; std::numeric_limits<double>::max() if the
            engine is MASCOT or SimTandem but the hit carries none of the
            meta values listed above
    @throws Exception::UnableToFit if @p engine is none of the names above
  */
  double get_score_(String& engine, const PeptideHit& hit)
  {
    // handed back when a supported engine lacks the meta value we need
    const double no_score_available = std::numeric_limits<double>::max();

    if (engine == "OMSSA")
    {
      return -log10(max(hit.getScore(), smallest_e_value_));
    }

    if (engine == "MyriMatch")
    {
      return hit.getScore();
    }

    if (engine.compare("XTandem") == 0)
    {
      return -log10(max(static_cast<DoubleReal>(hit.getMetaValue("E-Value")), smallest_e_value_));
    }

    if (engine == "MASCOT")
    {
      // "EValue" takes precedence over "expect"
      if (hit.metaValueExists("EValue"))
      {
        return -log10(max(static_cast<DoubleReal>(hit.getMetaValue("EValue")), smallest_e_value_));
      }
      if (hit.metaValueExists("expect"))
      {
        return -log10(max(static_cast<DoubleReal>(hit.getMetaValue("expect")), smallest_e_value_));
      }
      return no_score_available;
    }

    if (engine == "SpectraST")
    {
      return 100 * hit.getScore(); // SpectraST f-val
    }

    if (engine == "SimTandem")
    {
      if (hit.metaValueExists("E-Value"))
      {
        return -log10(max(static_cast<DoubleReal>(hit.getMetaValue("E-Value")), smallest_e_value_));
      }
      return no_score_available;
    }

    throw Exception::UnableToFit(__FILE__, __LINE__, __PRETTY_FUNCTION__, "No parameters for chosen search engine", "The chosen search engine is currently not supported");
  }
예제 #3
0
  /**
    @brief Adds a tree entry and a visualizer widget for a PeptideHit.

    A PeptideHitVisualizer is created and loaded with @p meta, registered with
    the widget stack, and represented by a new tree item labelled
    "Pep <sequence> (<score>)". The item is attached to @p parent, or to the
    tree view itself if @p parent is null. The meta info of the hit is
    visualized below the new item.

    @param meta The peptide hit to visualize
    @param parent Tree item to attach to (may be null)
  */
  void MetaDataBrowser::visualize_(PeptideHit & meta, QTreeWidgetItem * parent)
  {
    PeptideHitVisualizer * hit_view = new PeptideHitVisualizer(isEditable(), this);
    hit_view->load(meta);

    // column 0: caption, column 1: index of the widget in the stack, column 2: score
    const String caption = String("Pep ") + meta.getSequence().toString() + " (" + meta.getScore() + ')';
    QStringList columns;
    columns.append(QString(caption.c_str()));
    columns.append(QString::number(ws_->addWidget(hit_view)));
    columns.append(QString::number(meta.getScore()));

    // top-level entry if there is no parent item
    QTreeWidgetItem * entry = (parent == nullptr)
                              ? new QTreeWidgetItem(treeview_, columns)
                              : new QTreeWidgetItem(parent, columns);

    // the hit's meta info goes below the new entry
    visualize_(dynamic_cast<MetaInfoInterface &>(meta), entry);

    connectVisualizer_(hit_view);
  }
예제 #4
0
  /**
    @brief Replaces @p ids by a single identification holding the consensus hits.

    Steps:
    - every identification is sorted and, if considered_hits_ is positive,
      truncated to that many hits;
    - duplicate hits (by sequence) are removed;
    - the subclass-specific apply_() computes the grouped results;
    - @p ids is reduced to one element that keeps score type and score
      orientation of the former first element and receives one hit per result.

    A result carrying exactly two values is interpreted as (score, support);
    it is dropped if the support is below min_support_, otherwise the support
    is stored as meta value "consensus_support".

    @param ids Input identifications; overwritten with the consensus result.
               Left untouched if empty.
    @param number_of_runs Number of runs to assume; 0 means "use ids.size()"
  */
  void ConsensusIDAlgorithm::apply(vector<PeptideIdentification>& ids,
                                   Size number_of_runs)
  {
    // nothing to do without input
    if (ids.empty())
    {
      return;
    }

    if (number_of_runs == 0)
    {
      number_of_runs_ = ids.size();
    }
    else
    {
      number_of_runs_ = number_of_runs;
    }

    // common preprocessing, so the individual algorithms don't have to do it:
    for (Size i = 0; i < ids.size(); ++i)
    {
      PeptideIdentification& current = ids[i];
      current.sort();
      const bool limit_active = (considered_hits_ > 0);
      if (limit_active && (current.getHits().size() > considered_hits_))
      {
        current.getHits().resize(considered_hits_);
      }
    }
    // each sequence may occur only once per identification:
    IDFilter::removeDuplicatePeptideHits(ids, true);

    // the actual (subclass-specific) work
    SequenceGrouping grouped;
    apply_(ids, grouped);

    // remember the score properties before the input is discarded
    const String kept_score_type = ids[0].getScoreType();
    const bool kept_orientation = ids[0].isHigherScoreBetter();
    ids.clear();
    ids.resize(1);
    PeptideIdentification& consensus = ids[0];
    consensus.setScoreType(kept_score_type);
    consensus.setHigherScoreBetter(kept_orientation);

    SequenceGrouping::iterator group = grouped.begin();
    for (; group != grouped.end(); ++group)
    {
      OPENMS_PRECONDITION(!group->second.second.empty(),
                          "Consensus score for peptide required");
      PeptideHit hit;

      const bool has_support = (group->second.second.size() == 2);
      if (has_support)
      {
        // second value is the "support"; results below the threshold are skipped
        double support = group->second.second[1];
        if (support < min_support_)
        {
          continue;
        }
        hit.setMetaValue("consensus_support", support);
      }

      hit.setSequence(group->first);
      hit.setCharge(group->second.first);
      hit.setScore(group->second.second[0]);
      consensus.insertHit(hit);
#ifdef DEBUG_ID_CONSENSUS
      LOG_DEBUG << " - Output hit: " << hit.getSequence() << " "
                << hit.getScore() << endl;
#endif
    }
    consensus.assignRanks();
  }
예제 #5
0
  ExitCodes main_(int, const char**)
  {
    vector<ProteinIdentification> prot_ids;
    vector<PeptideIdentification> pep_ids;
    ProteinHit temp_protein_hit;

    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------
    String inputfile_id               = getStringOption_("id");
    String inputfile_feature       = getStringOption_("feature");
    String inputfile_consensus  = getStringOption_("consensus");
    String inputfile_raw            = getStringOption_("in");
    String outputfile_name       = getStringOption_("out");

    //~ bool Ms1(getFlag_("MS1"));
    //~ bool Ms2(getFlag_("MS2"));
    bool remove_duplicate_features(getFlag_("remove_duplicate_features"));
    
    //-------------------------------------------------------------
    // fetch vocabularies
    //------------------------------------------------------------
    ControlledVocabulary cv;
    cv.loadFromOBO("PSI-MS", File::find("/CV/psi-ms.obo"));
    cv.loadFromOBO("QC", File::find("/CV/qc-cv.obo"));
 
     QcMLFile qcmlfile;

    //-------------------------------------------------------------
    // MS  aqiusition
    //------------------------------------------------------------
    String base_name = QFileInfo(QString::fromStdString(inputfile_raw)).baseName();

    cout << "Reading mzML file..." << endl;
    MzMLFile mz_data_file;
    MSExperiment<Peak1D> exp;
    MzMLFile().load(inputfile_raw, exp);
    
    //---prep input
    exp.sortSpectra();
    UInt min_mz = std::numeric_limits<UInt>::max();
    UInt max_mz = 0;
    std::map<Size, UInt> mslevelcounts;
    
    qcmlfile.registerRun(base_name,base_name); //TODO use UIDs
    
    //---base MS aquisition qp
    String msaq_ref = base_name + "_msaq";
    QcMLFile::QualityParameter qp;
    qp.id = msaq_ref; ///< Identifier
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000004";
    try
    {
      //~ const ControlledVocabulary::CVTerm& test = cv.getTermByName("MS aquisition result details");
      //~ cout << test.name << test.id << endl;
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      //~ const ControlledVocabulary::CVTerm& term = cv.getTerm("0000004");
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "mzML file"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    //---file origin qp
    qp = QcMLFile::QualityParameter();
    qp.name = "mzML file"; ///< Name
    qp.id = base_name + "_run_name"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000577";
    qp.value = base_name;
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    qp = QcMLFile::QualityParameter();
    qp.name = "instrument model"; ///< Name
    qp.id = base_name + "_instrument_name"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000031";
    qp.value = exp.getInstrument().getName();
    qcmlfile.addRunQualityParameter(base_name, qp);    

    qp = QcMLFile::QualityParameter();
    qp.name = "completion time"; ///< Name
    qp.id = base_name + "_date"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000747";
    qp.value = exp.getDateTime().getDate();
    qcmlfile.addRunQualityParameter(base_name, qp);

    //---precursors at
    QcMLFile::Attachment at;
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000044";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_precursors"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "precursors"; ///< Name
    }

    at.colTypes.push_back("MS:1000894_[sec]"); //RT
    at.colTypes.push_back("MS:1000040"); //MZ
    for (Size i = 0; i < exp.size(); ++i)
    {
      mslevelcounts[exp[i].getMSLevel()]++;
      if (exp[i].getMSLevel() == 2)
      {
        if (exp[i].getPrecursors().front().getMZ() < min_mz)
        {
          min_mz = exp[i].getPrecursors().front().getMZ();
        }
        if (exp[i].getPrecursors().front().getMZ() > max_mz)
        {
          max_mz = exp[i].getPrecursors().front().getMZ();
        }
        std::vector<String> row;
        row.push_back(exp[i].getRT());
        row.push_back(exp[i].getPrecursors().front().getMZ());
        at.tableRows.push_back(row);
      }
    }
    qcmlfile.addRunAttachment(base_name, at);

    //---aquisition results qp
    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000006"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_ms1aquisition"; ///< Identifier
    qp.value = String(mslevelcounts[1]);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of ms1 spectra"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    

    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000007"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_ms2aquisition"; ///< Identifier
    qp.value = String(mslevelcounts[2]);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of ms2 spectra"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);

    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000008"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_Chromaquisition"; ///< Identifier
    qp.value = String(exp.getChromatograms().size());
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of chromatograms"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000009";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_mzrange"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS MZ aquisition ranges"; ///< Name
    }

    at.colTypes.push_back("QC:0000010"); //MZ
    at.colTypes.push_back("QC:0000011"); //MZ
    std::vector<String> rowmz;
    rowmz.push_back(String(min_mz));
    rowmz.push_back(String(max_mz));
    at.tableRows.push_back(rowmz);
    qcmlfile.addRunAttachment(base_name, at);

    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000012";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_rtrange"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS RT aquisition ranges"; ///< Name
    }

    at.colTypes.push_back("QC:0000013"); //MZ
    at.colTypes.push_back("QC:0000014"); //MZ
    std::vector<String> rowrt;
    rowrt.push_back(String(exp.begin()->getRT()));
    rowrt.push_back(String(exp.getSpectra().back().getRT()));
    at.tableRows.push_back(rowrt);
    qcmlfile.addRunAttachment(base_name, at);
    

    //---ion current stability ( & tic ) qp
    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000022";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_tics"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS TICs"; ///< Name
    }
    
    at.colTypes.push_back("MS:1000894_[sec]");
    at.colTypes.push_back("MS:1000285");
    UInt max = 0;
    Size below_10k = 0;
    for (Size i = 0; i < exp.size(); ++i)
    {
      if (exp[i].getMSLevel() == 1)
      {
        UInt sum = 0;
        for (Size j = 0; j < exp[i].size(); ++j)
        {
          sum += exp[i][j].getIntensity();
        }
        if (sum > max)
        {
          max = sum;
        }
        if (sum < 10000)
        {
          ++below_10k;
        }
        std::vector<String> row;
        row.push_back(exp[i].getRT());
        row.push_back(sum);
        at.tableRows.push_back(row);
      }
    }
    qcmlfile.addRunAttachment(base_name, at);
    

    qp = QcMLFile::QualityParameter();
    qp.id = base_name + "_ticslump"; ///< Identifier
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000023";
    qp.value = String((100 / exp.size()) * below_10k);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "percentage of tic slumps"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);

    
    //-------------------------------------------------------------
    // MS  id
    //------------------------------------------------------------
    if (inputfile_id != "")
    {
      IdXMLFile().load(inputfile_id, prot_ids, pep_ids);
      cerr << "idXML read ended. Found " << pep_ids.size() << " peptide identifications." << endl;

      ProteinIdentification::SearchParameters params = prot_ids[0].getSearchParameters();
      vector<String> var_mods = params.variable_modifications;
      //~ boost::regex re("(?<=[KR])(?=[^P])");
     
      String msid_ref = base_name + "_msid";
      QcMLFile::QualityParameter qp;
      qp.id = msid_ref; ///< Identifier
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000025";
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "MS identification result details"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000026";
      at.qualityRef = msid_ref;
      at.id = base_name + "_idsetting"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "MS id settings"; ///< Name
      }
      
      at.colTypes.push_back("MS:1001013"); //MS:1001013 db name  MS:1001016 version  MS:1001020 taxonomy
      at.colTypes.push_back("MS:1001016");
      at.colTypes.push_back("MS:1001020");
      std::vector<String> row;
      row.push_back(String(prot_ids.front().getSearchParameters().db));
      row.push_back(String(prot_ids.front().getSearchParameters().db_version));
      row.push_back(String(prot_ids.front().getSearchParameters().taxonomy));
      at.tableRows.push_back(row);
      qcmlfile.addRunAttachment(base_name, at);


      UInt spectrum_count = 0;
      Size peptide_hit_count = 0;
      UInt runs_count = 0;
      Size protein_hit_count = 0;
      set<String> peptides;
      set<String> proteins;
      Size missedcleavages = 0;
      for (Size i = 0; i < pep_ids.size(); ++i)
      {
        if (!pep_ids[i].empty())
        {
          ++spectrum_count;
          peptide_hit_count += pep_ids[i].getHits().size();
          const vector<PeptideHit>& temp_hits = pep_ids[i].getHits();
          for (Size j = 0; j < temp_hits.size(); ++j)
          {
            peptides.insert(temp_hits[j].getSequence().toString());
          }
        }
      }
      for (set<String>::iterator it = peptides.begin(); it != peptides.end(); ++it)
      {
        for (String::const_iterator st = it->begin(); st != it->end() - 1; ++st)
        {
          if (*st == 'K' || *st == 'R')
          {
            ++missedcleavages;
          }
        }
      }

      for (Size i = 0; i < prot_ids.size(); ++i)
      {
        ++runs_count;
        protein_hit_count += prot_ids[i].getHits().size();
        const vector<ProteinHit>& temp_hits = prot_ids[i].getHits();
        for (Size j = 0; j < temp_hits.size(); ++j)
        {
          proteins.insert(temp_hits[j].getAccession());
        }
      }
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000037"; ///< cv accession
      qp.id = base_name + "_misscleave"; ///< Identifier
      qp.value = missedcleavages;
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "total number of missed cleavages"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);

      
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000032"; ///< cv accession
      qp.id = base_name + "_totprot"; ///< Identifier
      qp.value = protein_hit_count;
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "total number of identified proteins"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000033"; ///< cv accession
      qp.id = base_name + "_totuniqprot"; ///< Identifier
      qp.value = String(proteins.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of uniquely identified proteins"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000029"; ///< cv accession
      qp.id = base_name + "_psms"; ///< Identifier
      qp.value = String(spectrum_count);
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of PSM"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000030"; ///< cv accession
      qp.id = base_name + "_totpeps"; ///< Identifier
      qp.value = String(peptide_hit_count);
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of identified peptides"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000031"; ///< cv accession
      qp.id = base_name + "_totuniqpeps"; ///< Identifier
      qp.value = String(peptides.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of uniquely identified peptides"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000038";
      at.qualityRef = msid_ref;
      at.id = base_name + "_massacc"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "delta ppm tables";
      }
      
      //~ delta ppm QC:0000039 RT MZ uniqueness ProteinID MS:1000885 target/decoy Score PeptideSequence MS:1000889 Annots string Similarity Charge UO:0000219 TheoreticalWeight UO:0000221 Oxidation_(M)
      at.colTypes.push_back("RT");
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("Score");
      at.colTypes.push_back("PeptideSequence");
      at.colTypes.push_back("Charge");
      at.colTypes.push_back("TheoreticalWeight");
      at.colTypes.push_back("delta_ppm");
      for (UInt w = 0; w < var_mods.size(); ++w)
      {
        at.colTypes.push_back(String(var_mods[w]).substitute(' ', '_'));
      }

      std::vector<double> deltas;
      //~ prot_ids[0].getSearchParameters();
      for (vector<PeptideIdentification>::iterator it = pep_ids.begin(); it != pep_ids.end(); ++it)
      {
        if (it->getHits().size() > 0)
        {
          std::vector<String> row;
          row.push_back(it->getRT());
          row.push_back(it->getMZ());
          PeptideHit tmp = it->getHits().front(); //TODO depends on score & sort
          vector<UInt> pep_mods;
          for (UInt w = 0; w < var_mods.size(); ++w)
          {
            pep_mods.push_back(0);
          }
          for (AASequence::ConstIterator z =  tmp.getSequence().begin(); z != tmp.getSequence().end(); ++z)
          {
            Residue res = *z;
            String temp;
            if (res.getModification().size() > 0 && res.getModification() != "Carbamidomethyl")
            {
              temp = res.getModification() + " (" + res.getOneLetterCode()  + ")";
              //cout<<res.getModification()<<endl;
              for (UInt w = 0; w < var_mods.size(); ++w)
              {
                if (temp == var_mods[w])
                {
                  //cout<<temp;
                  pep_mods[w] += 1;
                }
              }
            }
          }
          row.push_back(tmp.getScore());
          row.push_back(tmp.getSequence().toString().removeWhitespaces());
          row.push_back(tmp.getCharge());
          row.push_back(String((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge()));
          double dppm = /* std::abs */ (getMassDifference(((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge()), it->getMZ(), true));
          row.push_back(String(dppm));
          deltas.push_back(dppm);
          for (UInt w = 0; w < var_mods.size(); ++w)
          {
            row.push_back(pep_mods[w]);
          }
          at.tableRows.push_back(row);
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
      

      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000040"; ///< cv accession
      qp.id = base_name + "_mean_delta"; ///< Identifier
      qp.value = String(OpenMS::Math::mean(deltas.begin(), deltas.end()));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "mean delta ppm"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000041"; ///< cv accession
      qp.id = base_name + "_median_delta"; ///< Identifier
      qp.value = String(OpenMS::Math::median(deltas.begin(), deltas.end(), false));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "median delta ppm"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000035"; ///< cv accession
      qp.id = base_name + "_ratio_id"; ///< Identifier
      qp.value = String(double(pep_ids.size()) / double(mslevelcounts[2]));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "id ratio"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);
    }

    //-------------------------------------------------------------
    // MS quantitation
    //------------------------------------------------------------
    FeatureMap map;
    String msqu_ref = base_name + "_msqu";
    if (inputfile_feature != "")
    {
      FeatureXMLFile f;
      f.load(inputfile_feature, map);

      cout << "Read featureXML file..." << endl;

      //~ UInt fiter = 0;
      map.sortByRT();
      map.updateRanges();

      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000045"; ///< cv accession
      qp.id = msqu_ref; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "MS quantification result details"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);
      
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000046"; ///< cv accession
      qp.id = base_name + "_feature_count"; ///< Identifier
      qp.value = String(map.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "number of features"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);      
    }

    if (inputfile_feature != "" && !remove_duplicate_features)
    {
      
      QcMLFile::Attachment at;
      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000047";
      at.qualityRef = msqu_ref;
      at.id = base_name + "_features"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "features"; ///< Name
      }
      
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("RT");
      at.colTypes.push_back("Intensity");
      at.colTypes.push_back("Charge");
      at.colTypes.push_back("Quality");
      at.colTypes.push_back("FWHM");
      at.colTypes.push_back("IDs");
      UInt fiter = 0;
      map.sortByRT();
      //ofstream out(outputfile_name.c_str());
      while (fiter < map.size())
      {
        std::vector<String> row;
        row.push_back(map[fiter].getMZ());
        row.push_back(map[fiter].getRT());
        row.push_back(map[fiter].getIntensity());
        row.push_back(map[fiter].getCharge());
        row.push_back(map[fiter].getOverallQuality());
        row.push_back(map[fiter].getWidth());
        row.push_back(map[fiter].getPeptideIdentifications().size());
        fiter++;
        at.tableRows.push_back(row);
      }     
      qcmlfile.addRunAttachment(base_name, at);
    }
    else if (inputfile_feature != "" && remove_duplicate_features)
    {
      QcMLFile::Attachment at;
      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000047";
      at.qualityRef = msqu_ref;
      at.id = base_name + "_features"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "features"; ///< Name
      }
      
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("RT");
      at.colTypes.push_back("Intensity");
      at.colTypes.push_back("Charge");
      FeatureMap map, map_out;
      FeatureXMLFile f;
      f.load(inputfile_feature, map);
      UInt fiter = 0;
      map.sortByRT();
      while (fiter < map.size())
      {
        FeatureMap map_tmp;
        for (UInt k = fiter; k <= map.size(); ++k)
        {
          if (abs(map[fiter].getRT() - map[k].getRT()) < 0.1)
          {
            //~ cout << fiter << endl;
            map_tmp.push_back(map[k]);
          }
          else
          {
            fiter = k;
            break;
          }
        }
        map_tmp.sortByMZ();
        UInt retif = 1;
        map_out.push_back(map_tmp[0]);
        while (retif < map_tmp.size())
        {
          if (abs(map_tmp[retif].getMZ() - map_tmp[retif - 1].getMZ()) > 0.01)
          {
            cout << "equal RT, but mass different" << endl;
            map_out.push_back(map_tmp[retif]);
          }
          retif++;
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
    }
    if (inputfile_consensus != "")
    {
      cout << "Reading consensusXML file..." << endl;
      ConsensusXMLFile f;
      ConsensusMap map;
      f.load(inputfile_consensus, map);
      //~ String CONSENSUS_NAME = "_consensus.tsv";
      //~ String combined_out = outputfile_name + CONSENSUS_NAME;
      //~ ofstream out(combined_out.c_str());

      at = QcMLFile::Attachment();
      qp.name = "consensuspoints"; ///< Name
      //~ qp.id = base_name + "_consensuses"; ///< Identifier
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:xxxxxxxx"; ///< cv accession "featuremapper results"

      at.colTypes.push_back("Native_spectrum_ID");
      at.colTypes.push_back("DECON_RT_(sec)");
      at.colTypes.push_back("DECON_MZ_(Th)");
      at.colTypes.push_back("DECON_Intensity");
      at.colTypes.push_back("Feature_RT_(sec)");
      at.colTypes.push_back("Feature_MZ_(Th)");
      at.colTypes.push_back("Feature_Intensity");
      at.colTypes.push_back("Feature_Charge");
      for (ConsensusMap::const_iterator cmit = map.begin(); cmit != map.end(); ++cmit)
      {
        const ConsensusFeature& CF = *cmit;
        for (ConsensusFeature::const_iterator cfit = CF.begin(); cfit != CF.end(); ++cfit)
        {
          std::vector<String> row;
          FeatureHandle FH = *cfit;
          row.push_back(CF.getMetaValue("spectrum_native_id"));
          row.push_back(CF.getRT()); row.push_back(CF.getMZ());
          row.push_back(CF.getIntensity());
          row.push_back(FH.getRT());
          row.push_back(FH.getMZ());
          row.push_back(FH.getCharge());
          at.tableRows.push_back(row);
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
    }
    
    
    //-------------------------------------------------------------
    // finalize
    //------------------------------------------------------------
    qcmlfile.store(outputfile_name);
    return EXECUTION_OK;
  }
예제 #6
0
  // Fits the binned score distributions and rewrites the scores of all hits in 'ids'.
  //
  // Steps, in order:
  //  1. rev_scores, fwd_scores and all_scores are binned via normalizeBins_() into
  //     'number_of_bins' bins each (parameter "number_of_bins").
  //  2. A gamma distribution is fitted to the binned rev_scores.
  //  3. fwd_scores and rev_scores are histogrammed on the score range of all_scores.
  //     Per bin the surplus of fwd over rev counts is kept (only where fwd > 1.3 * rev
  //     and the bin lies above the most populated rev bin), scaled by the largest bin count.
  //  4. A Gaussian is fitted to that surplus; if the fit throws Exception::UnableToFit,
  //     the heuristic start parameters are used as the result instead.
  //  5. For every identification with at least one hit: the original hit score is stored
  //     as meta value "<score type>_score", the hit score is replaced by the result of
  //     getProbability_() (called with -log10(score) if lower scores are better),
  //     higher-score-better is set to true and "_DecoyProbability" is appended to the
  //     score type.
  //
  // ids        : in/out; replaced by the re-scored identifications. Identifications
  //              without hits are NOT carried over.
  // rev_scores : scores used for the gamma fit (presumably the decoy/reversed hits - confirm with caller)
  // fwd_scores : scores used for the Gauss fit (presumably the target/forward hits - confirm with caller)
  // all_scores : scores defining the common range (min/diff) used for the histograms;
  //              assumed to cover both rev_scores and fwd_scores - values outside that
  //              range are counted in the first/last bin.
  void IDDecoyProbability::apply_(vector<PeptideIdentification> & ids, const vector<double> & rev_scores, const vector<double> & fwd_scores, const vector<double> & all_scores)
  {
    Size number_of_bins(param_.getValue("number_of_bins"));



    // normalize distribution to [0, 1]
    vector<double> fwd_scores_normalized(number_of_bins, 0.0), rev_scores_normalized(number_of_bins, 0.0), diff_scores(number_of_bins, 0.0), all_scores_normalized(number_of_bins, 0.0);
    Transformation_ rev_trafo, fwd_trafo, all_trafo;
    normalizeBins_(rev_scores, rev_scores_normalized, rev_trafo);
    normalizeBins_(fwd_scores, fwd_scores_normalized, fwd_trafo);
    normalizeBins_(all_scores, all_scores_normalized, all_trafo);

    // rev scores fitting
    vector<DPosition<2> > rev_data;

    for (Size i = 0; i < number_of_bins; ++i)
    {
      DPosition<2> pos;
      pos.setX(((double)i) / (double)number_of_bins + 0.0001);    // necessary????
      pos.setY(rev_scores_normalized[i]);
      rev_data.push_back(pos);
#ifdef IDDECOYPROBABILITY_DEBUG
      cerr << pos.getX() << " " << pos.getY() << endl;
#endif
    }

    Math::GammaDistributionFitter gdf;
    Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma_1st (1.0, 3.0);
    gdf.setInitialParameters(result_gamma_1st);
    // TODO heuristic for good start parameters
    Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma = gdf.fit(rev_data);

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << gdf.getGnuplotFormula() << endl;
    String rev_filename = param_.getValue("rev_filename");
    generateDistributionImage_(rev_scores_normalized, gdf.getGnuplotFormula(), rev_filename);
#endif

    // generate diffs of distributions
    // get the fwd and rev distribution, apply all_trafo and calculate the diff
    vector<Size> fwd_bins(number_of_bins, 0), rev_bins(number_of_bins, 0);
    double min(all_trafo.min_score), diff(all_trafo.diff_score);
    Size max_bin(0);
    for (vector<double>::const_iterator it = fwd_scores.begin(); it != fwd_scores.end(); ++it)
    {
      // Clamp in the floating-point domain before converting to an index: casting a
      // negative, too large or non-finite value to Size is undefined and would index
      // outside of fwd_bins. A NaN position (e.g. diff == 0) fails both comparisons
      // and ends up in bin 0.
      double bin_pos = (*it - min) / diff * (double)(number_of_bins - 1);
      Size bin(0);
      if (bin_pos >= (double)number_of_bins)
      {
        bin = number_of_bins - 1;
      }
      else if (bin_pos > 0.0)
      {
        bin = (Size)bin_pos;
      }
      ++fwd_bins[bin];
      if (fwd_bins[bin] > max_bin)
      {
        max_bin = fwd_bins[bin];
      }
    }

    Size max_reverse_bin(0), max_reverse_bin_value(0);
    //min = rev_trafo.min_score;
    //diff = rev_trafo.diff_score;
    for (vector<double>::const_iterator it = rev_scores.begin(); it != rev_scores.end(); ++it)
    {
      // The scaling factor here is number_of_bins (not number_of_bins - 1 as for the
      // fwd scores), so a score equal to min + diff maps to position number_of_bins,
      // i.e. one past the last valid bin. Clamp it into the last bin instead of
      // writing behind the end of rev_bins.
      // NOTE(review): the differing scale factors of the fwd and rev histograms look
      // unintentional; kept as is so that in-range results do not change.
      double bin_pos = (*it - min) / diff * (double)number_of_bins;
      Size bin(0);
      if (bin_pos >= (double)number_of_bins)
      {
        bin = number_of_bins - 1;
      }
      else if (bin_pos > 0.0)
      {
        bin = (Size)bin_pos;
      }
      ++rev_bins[bin];
      if (rev_bins[bin] > max_bin)
      {
        max_bin = rev_bins[bin];
      }
      if (rev_bins[bin] > max_reverse_bin_value)
      {
        max_reverse_bin = bin;
        max_reverse_bin_value = rev_bins[bin];
      }
    }

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Trying to get diff scores" << endl;
#endif

    // get diff of fwd and rev
    for (Size i = 0; i < number_of_bins; ++i)
    {
      Size fwd(0), rev(0);
      fwd = fwd_bins[i];
      rev = rev_bins[i];
      // fwd > 1.3 * rev implies fwd > rev, so the unsigned subtraction below cannot wrap
      if ((double)fwd > (double)(1.3 * rev) && max_reverse_bin < i)
      {
        diff_scores[i] = (double)(fwd - rev) / (double)max_bin;
      }
      else
      {
        diff_scores[i] = 0.0;
      }
    }
#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Gauss Fitting values size of diff scores=" << diff_scores.size() << endl;
#endif
    // diff scores fitting
    // While collecting the data points, derive start parameters for the Gauss fit:
    // gauss_A is the largest y value, gauss_x0 is based on the y-weighted sum of x.
    vector<DPosition<2> > diff_data;
    double gauss_A(0), gauss_x0(0), norm_factor(0);
    for (Size i = 0; i < number_of_bins; ++i)
    {
      DPosition<2> pos;
      pos.setX((double)i / (double)number_of_bins);
      pos.setY(diff_scores[i]);

      if (pos.getY() > gauss_A)
      {
        gauss_A = pos.getY();
      }
      gauss_x0 += pos.getX() * pos.getY();
      norm_factor += pos.getY();


      diff_data.push_back(pos);
    }

    double gauss_sigma(0);
    // NOTE(review): the weighted sum is divided by the number of points AND by the sum
    // of weights; norm_factor is 0 if all diff scores are 0, which makes gauss_x0
    // non-finite. Only used as start value / fallback, left unchanged.
    gauss_x0 /= (double)diff_data.size();
    gauss_x0 /= norm_factor;

    // NOTE(review): this sums number_of_bins + 1 terms (i <= number_of_bins) but divides
    // by number_of_bins below; left unchanged as it only yields a start value.
    for (Size i = 0; i <= number_of_bins; ++i)
    {
      gauss_sigma += fabs(gauss_x0 - (double)i / (double)number_of_bins);
    }

    gauss_sigma /= (double)diff_data.size();



#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "setting initial parameters: " << endl;
#endif
    Math::GaussFitter gf;
    Math::GaussFitter::GaussFitResult result_1st(gauss_A, gauss_x0, gauss_sigma);
    gf.setInitialParameters(result_1st);
#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Initial Gauss guess: A=" << gauss_A << ", x0=" << gauss_x0 << ", sigma=" << gauss_sigma << endl;
#endif

    //TODO: fail-to-fit correction was done using the GNUPlotFormula. Seemed to be a hack.
    //Changed it to try-catch-block but I am not sure if this correction should be made
    //at all. Can someone please verify?
    Math::GaussFitter::GaussFitResult result_gauss (gauss_A, gauss_x0, gauss_sigma);
    try{
        result_gauss = gf.fit(diff_data);
    }
    catch(Exception::UnableToFit& /* e */)
    {
      // fall back to the heuristic start parameters
      result_gauss.A = gauss_A;
      result_gauss.x0 = gauss_x0;
      result_gauss.sigma = gauss_sigma;
    }

//    // fit failed?
//    if (gf.getGnuplotFormula() == "")
//    {
//      result_gauss.A = gauss_A;
//      result_gauss.x0 = gauss_x0;
//      result_gauss.sigma = gauss_sigma;
//    }

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << gf.getGnuplotFormula() << endl;
    String fwd_filename = param_.getValue("fwd_filename");
    if (gf.getGnuplotFormula() == "")
    {
      String formula("f(x)=" + String(gauss_A) + " * exp(-(x - " + String(gauss_x0) + ") ** 2 / 2 / (" + String(gauss_sigma) + ") ** 2)");
      generateDistributionImage_(diff_scores, formula, fwd_filename);
    }
    else
    {
      generateDistributionImage_(diff_scores, gf.getGnuplotFormula(), fwd_filename);
    }
#endif

#ifdef IDDECOYPROBABILITY_DEBUG
    //all_trafo.diff_score + all_trafo.min_score
    String gauss_formula("f(x)=" + String(result_gauss.A / all_trafo.max_intensity) + " * exp(-(x - " + String(result_gauss.x0 * all_trafo.diff_score + all_trafo.min_score) + ") ** 2 / 2 / (" + String(result_gauss.sigma * all_trafo.diff_score)   + ") ** 2)");

    String b_str(result_gamma.b), p_str(result_gamma.p);
    String gamma_formula = "g(x)=(" + b_str + " ** " + p_str + ") / gamma(" + p_str + ") * x ** (" + p_str + " - 1) * exp(- " + b_str + " * x)";

    generateDistributionImage_(all_scores_normalized, all_trafo, gauss_formula, gamma_formula, (String)param_.getValue("fwd_filename"));
#endif

    vector<PeptideIdentification> new_prob_ids;
    // calculate the probabilities and write them to the IDs
    for (vector<PeptideIdentification>::const_iterator it = ids.begin(); it != ids.end(); ++it)
    {
      // identifications without hits are dropped from the result
      if (it->getHits().size() > 0)
      {
        vector<PeptideHit> hits;
        String score_type = it->getScoreType() + "_score";
        for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
        {
          PeptideHit hit = *pit;
          double score = hit.getScore();
          if (!it->isHigherScoreBetter())
          {
            score = -log10(score);
          }
          // keep the untransformed original score as meta value
          hit.setMetaValue(score_type, hit.getScore());
          hit.setScore(getProbability_(result_gamma, rev_trafo, result_gauss, fwd_trafo, score));
          hits.push_back(hit);
        }
        PeptideIdentification id = *it;
        id.setHigherScoreBetter(true);
        id.setScoreType(id.getScoreType() + "_DecoyProbability");
        id.setHits(hits);

        new_prob_ids.push_back(id);
      }
    }
    ids = new_prob_ids;
  }
  // Maps the score of a peptide hit to the value used for fitting, depending on the
  // search engine that produced it.
  //
  // - OMSSA:      -log10 of the hit score (floored at smallest_e_value_)
  // - MyriMatch:  hit score, unchanged
  // - XTandem:    -log10 of meta value "E-Value" (read without an existence check)
  // - SpectraST:  100 * hit score
  // - MASCOT, SimTandem, MSGFPlus / MS-GF+, Comet: -log10 of the first e-value meta
  //   value that exists on the hit (floored at smallest_e_value_)
  //
  // Returns quiet NaN for MASCOT hits with score 0 (issue #740), and
  // numeric_limits<double>::max() if the engine is known but none of its meta values
  // is present. Throws Exception::UnableToFit for any other engine name.
  double getScore_(String& engine, const PeptideHit& hit)
  {
    // engines whose value is derived without searching for a meta value
    if (engine == "OMSSA")
    {
      return -1.0 * log10(max(hit.getScore(), smallest_e_value_));
    }
    if (engine == "MyriMatch")
    {
      //double e_val = exp(-hit.getScore());
      //double score_val = ((-1)* log10(max(e_val,smallest_e_value_)));
      //printf("myri score: %e ; e_val: %e ; score_val: %e\n",hit.getScore(),e_val,score_val);
      //return score_val;
      return hit.getScore();
    }
    if (engine == "XTandem")
    {
      return -1.0 * log10(max((double)hit.getMetaValue("E-Value"), smallest_e_value_));
    }
    if (engine == "SpectraST")
    {
      return 100 * hit.getScore(); // SpectraST f-val
    }

    // Remaining engines: pick the name of the meta value holding the e-value.
    // The name stays empty if the hit carries none of the candidates.
    String evalue_name;
    if (engine == "MASCOT")
    {
      // issue #740: unable to fit data with score 0
      if (hit.getScore() == 0.0)
      {
        return numeric_limits<double>::quiet_NaN();
      }
      // end issue #740
      if (hit.metaValueExists("EValue"))
      {
        evalue_name = "EValue";
      }
      else if (hit.metaValueExists("expect"))
      {
        evalue_name = "expect";
      }
    }
    else if (engine == "SimTandem")
    {
      if (hit.metaValueExists("E-Value"))
      {
        evalue_name = "E-Value";
      }
    }
    else if ((engine == "MSGFPlus") || (engine == "MS-GF+"))
    {
      if (hit.metaValueExists("MS:1002053"))  // name: MS-GF:EValue
      {
        evalue_name = "MS:1002053";
      }
      else if (hit.metaValueExists("expect"))
      {
        evalue_name = "expect";
      }
    }
    else if (engine == "Comet")
    {
      if (hit.metaValueExists("MS:1002257")) // name: Comet:expectation value
      {
        evalue_name = "MS:1002257";
      }
      else if (hit.metaValueExists("expect"))
      {
        evalue_name = "expect";
      }
    }
    else
    {
      throw Exception::UnableToFit(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "No parameters for chosen search engine", "The chosen search engine is currently not supported");
    }

    if (evalue_name.empty())
    {
      // known engine, but no usable meta value on this hit
      return std::numeric_limits<double>::max();
    }
    return -1.0 * log10(max((double)hit.getMetaValue(evalue_name), smallest_e_value_));
  }
예제 #8
0
 // Builds a short human-readable description of a peptide hit
 // (sequence, charge and score), e.g. for log or error messages.
 String describeHit_(const PeptideHit& hit)
 {
   const String sequence_text = hit.getSequence().toString();
   const String charge_text(hit.getCharge());
   const String score_text(hit.getScore());

   return "peptide hit with sequence '" + sequence_text +
     "', charge " + charge_text + ", score " + score_text;
 }
  // Trains a PILIS model.
  //
  // Reads the spectra given by 'in' (MSP or mzML, decided by the type of the first
  // file) and, if 'id_in' is given, idXML identifications that are mapped onto the
  // spectra. For every spectrum the first hit of its first peptide identification is
  // used for training, provided it passes the modification, sequence length, score
  // (optional) and charge filters. The trained model is written to
  // 'trained_model_file' if that option is set.
  //
  // Returns INCOMPATIBLE_INPUT_DATA if 'in' is empty or if the numbers of 'in' and
  // 'id_in' files differ, EXECUTION_OK otherwise.
  ExitCodes main_(int, const char **)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------

    // input/output files and filter settings
    StringList in = getStringList_("in");
    StringList id_in = getStringList_("id_in");
    String trained_model_file = getStringOption_("trained_model_file");
    String model_file = getStringOption_("model_file");
    bool score_filtering = getFlag_("score_filtering");
    double score_threshold = getDoubleOption_("score_threshold");
    Int min_charge = getIntOption_("min_charge");
    Int max_charge = getIntOption_("max_charge");

    if (in.empty())
    {
      writeLog_("For 'training' mode spectra and identifications are needed.");
      return INCOMPATIBLE_INPUT_DATA;
    }

    //bool duplicates_by_tic(getFlag_("duplicates_by_tic"));
    //bool base_model_from_file(getFlag_("base_model_from_file"));

    // set up the model: load it from file if one was given, otherwise start from the tool parameters
    PILISModel model;
    if (model_file == "")
    {
      writeDebug_("Initializing model", 1);
      model.setParameters(getParam_().copy("PILIS_parameters:", true));
      model.init();
    }
    else
    {
      writeDebug_("Reading model from file '" + model_file + "'", 1);
      model.readFromFile(model_file);
    }

    Param pilis_param(model.getParameters());
    ModificationDefinitionsSet mod_set(pilis_param.getValue("fixed_modifications"), pilis_param.getValue("variable_modifications"));

    //-------------------------------------------------------------
    // reading input
    //-------------------------------------------------------------
    vector<RichPeakMap> exp;
    vector<vector<ProteinIdentification> > prot_ids;
    vector<vector<PeptideIdentification> > pep_ids;

    // 'in' cannot be empty at this point (see the early return above);
    // the type of the first file decides how all files are read
    FileTypes::Type in_file_type = FileHandler().getType(in[0]);
    writeDebug_("File type of parameter 'in' estimated as '" + FileTypes::typeToName(in_file_type) + "'", 1);
    // TODO check all types
    if (in_file_type == FileTypes::MSP)
    {
      writeDebug_("Reading MSP file", 1);
      MSPFile msp_file;
      exp.resize(in.size());
      pep_ids.resize(in.size());
      for (Size file_idx = 0; file_idx != in.size(); ++file_idx)
      {
        msp_file.load(in[file_idx], pep_ids[file_idx], exp[file_idx]);
        // attach the identification read along with each spectrum to that spectrum
        for (Size spec_idx = 0; spec_idx != exp[file_idx].size(); ++spec_idx)
        {
          exp[file_idx][spec_idx].getPeptideIdentifications().push_back(pep_ids[file_idx][spec_idx]);
        }
      }
    }
    else if (in_file_type == FileTypes::MZML)
    {
      MzMLFile mzml_file;
      mzml_file.setLogType(log_type_);

      exp.resize(in.size());
      for (Size file_idx = 0; file_idx != in.size(); ++file_idx)
      {
        mzml_file.load(in[file_idx], exp[file_idx]);
      }
    }

    if (!id_in.empty())
    {
      prot_ids.resize(id_in.size());
      pep_ids.resize(id_in.size());
      IdXMLFile id_file;
      for (Size file_idx = 0; file_idx != id_in.size(); ++file_idx)
      {
        id_file.load(id_in[file_idx], prot_ids[file_idx], pep_ids[file_idx]);
      }

      // identification files are matched to spectra files by position
      if (id_in.size() != in.size())
      {
        writeLog_("If in parameter contains mzML files and id_in contains idXML files, the number should be equal to allow mapping of the identification to the spectra");
        return INCOMPATIBLE_INPUT_DATA;
      }

      // map the ids to the spectra
      IDMapper id_mapper;
      for (Size map_idx = 0; map_idx != exp.size(); ++map_idx)
      {
        id_mapper.annotate(exp[map_idx], pep_ids[map_idx], prot_ids[map_idx]);
      }
    }

    //-------------------------------------------------------------
    // collect the peptides and spectra used for training
    //-------------------------------------------------------------
    vector<PILISCrossValidation::Peptide> peptides;

    for (vector<RichPeakMap>::const_iterator map_it = exp.begin(); map_it != exp.end(); ++map_it)
    {
      for (RichPeakMap::ConstIterator spec_it = map_it->begin(); spec_it != map_it->end(); ++spec_it)
      {
        // only spectra with an identification that has at least one hit are usable
        if (spec_it->getPeptideIdentifications().empty())
        {
          continue;
        }
        if (spec_it->getPeptideIdentifications().begin()->getHits().empty())
        {
          continue;
        }

        // first hit of the first identification
        const PeptideHit hit = *spec_it->getPeptideIdentifications().begin()->getHits().begin();

        // check whether the sequence contains a modification not modelled
        if (!mod_set.isCompatible(hit.getSequence()))
        {
          continue;
        }
        // skip sequences longer than the model's visible depth
        if (hit.getSequence().size() > (UInt)pilis_param.getValue("visible_model_depth"))
        {
          continue;
        }

        if (score_filtering)
        {
          // the threshold is a lower bound if higher scores are better, an upper bound otherwise
          const bool higher_is_better = spec_it->getPeptideIdentifications().begin()->isHigherScoreBetter();
          const bool score_too_bad = higher_is_better ? (hit.getScore() < score_threshold)
                                                      : (hit.getScore() > score_threshold);
          if (score_too_bad)
          {
            continue;
          }
        }

        PILISCrossValidation::Peptide pep_struct;
        pep_struct.sequence = hit.getSequence();
        pep_struct.charge = hit.getCharge();

        // check charges
        if (pep_struct.charge < min_charge || pep_struct.charge > max_charge)
        {
          continue;
        }

        pep_struct.spec = *spec_it;
        pep_struct.hits = spec_it->getPeptideIdentifications().begin()->getHits();
        peptides.push_back(pep_struct);
      }
    }


    getUniquePeptides(peptides);
    writeDebug_("Number of (unique) peptides for training: " + String(peptides.size()), 1);

    //model.writeToFile("pilis_tmp.dat");

    //-------------------------------------------------------------
    // training
    //-------------------------------------------------------------
    model.setParameters(pilis_param);
    for (vector<PILISCrossValidation::Peptide>::const_iterator pep_it = peptides.begin(); pep_it != peptides.end(); ++pep_it)
    {
      model.train(pep_it->spec, pep_it->sequence, pep_it->charge);
    }
    model.evaluate();

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------
    if (!(trained_model_file == ""))
    {
      model.writeToFile(trained_model_file);
    }


    return EXECUTION_OK;
  }