示例#1
0
  //Visualizing PeptideHit object
  void MetaDataBrowser::visualize_(PeptideHit & meta, QTreeWidgetItem * parent)
  {
    PeptideHitVisualizer * visualizer = new PeptideHitVisualizer(isEditable(), this);
    visualizer->load(meta);

    String name = String("Pep ") + meta.getSequence().toString() + " (" + meta.getScore() + ')';
    QString qs_name(name.c_str());

    QStringList labels;
    labels << qs_name << QString::number(ws_->addWidget(visualizer)) << QString::number(meta.getScore());

    QTreeWidgetItem * item;
    if (parent == nullptr)
    {
      item = new QTreeWidgetItem(treeview_, labels);
    }
    else
    {
      item = new QTreeWidgetItem(parent, labels);
    }

    visualize_(dynamic_cast<MetaInfoInterface &>(meta), item);

    connectVisualizer_(visualizer);
  }
示例#2
0
  void ConsensusIDAlgorithm::apply(vector<PeptideIdentification>& ids,
                                   Size number_of_runs)
  {
    // abort if no IDs present
    if (ids.empty())
    {
      return;
    }

    number_of_runs_ = (number_of_runs != 0) ? number_of_runs : ids.size();

    // prepare data here, so that it doesn't have to happen in each algorithm:
    for (vector<PeptideIdentification>::iterator pep_it = ids.begin(); 
         pep_it != ids.end(); ++pep_it)
    {
      pep_it->sort();
      if ((considered_hits_ > 0) &&
          (pep_it->getHits().size() > considered_hits_))
      {
        pep_it->getHits().resize(considered_hits_);
      }
    }
    // make sure there are no duplicated hits (by sequence):
    IDFilter::removeDuplicatePeptideHits(ids, true);

    SequenceGrouping results;
    apply_(ids, results); // actual (subclass-specific) processing

    String score_type = ids[0].getScoreType();
    bool higher_better = ids[0].isHigherScoreBetter();
    ids.clear();
    ids.resize(1);
    ids[0].setScoreType(score_type);
    ids[0].setHigherScoreBetter(higher_better);
    for (SequenceGrouping::iterator res_it = results.begin(); 
         res_it != results.end(); ++res_it)
    {
      OPENMS_PRECONDITION(!res_it->second.second.empty(),
                          "Consensus score for peptide required");
      PeptideHit hit;

      if (res_it->second.second.size() == 2)
      {
        // filter by "support" value:
        double support = res_it->second.second[1];
        if (support < min_support_) continue;
        hit.setMetaValue("consensus_support", support);
      }
      
      hit.setSequence(res_it->first);
      hit.setCharge(res_it->second.first);
      hit.setScore(res_it->second.second[0]);
      ids[0].insertHit(hit);
#ifdef DEBUG_ID_CONSENSUS
      LOG_DEBUG << " - Output hit: " << hit.getSequence() << " "
                << hit.getScore() << endl;
#endif
    }
    ids[0].assignRanks();
  }
  ExitCodes main_(int, const char**)
  {
    vector<ProteinIdentification> prot_ids;
    vector<PeptideIdentification> pep_ids;
    ProteinHit temp_protein_hit;

    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------
    String inputfile_id               = getStringOption_("id");
    String inputfile_feature       = getStringOption_("feature");
    String inputfile_consensus  = getStringOption_("consensus");
    String inputfile_raw            = getStringOption_("in");
    String outputfile_name       = getStringOption_("out");

    //~ bool Ms1(getFlag_("MS1"));
    //~ bool Ms2(getFlag_("MS2"));
    bool remove_duplicate_features(getFlag_("remove_duplicate_features"));
    
    //-------------------------------------------------------------
    // fetch vocabularies
    //------------------------------------------------------------
    ControlledVocabulary cv;
    cv.loadFromOBO("PSI-MS", File::find("/CV/psi-ms.obo"));
    cv.loadFromOBO("QC", File::find("/CV/qc-cv.obo"));
 
     QcMLFile qcmlfile;

    //-------------------------------------------------------------
    // MS  aqiusition
    //------------------------------------------------------------
    String base_name = QFileInfo(QString::fromStdString(inputfile_raw)).baseName();

    cout << "Reading mzML file..." << endl;
    MzMLFile mz_data_file;
    MSExperiment<Peak1D> exp;
    MzMLFile().load(inputfile_raw, exp);
    
    //---prep input
    exp.sortSpectra();
    UInt min_mz = std::numeric_limits<UInt>::max();
    UInt max_mz = 0;
    std::map<Size, UInt> mslevelcounts;
    
    qcmlfile.registerRun(base_name,base_name); //TODO use UIDs
    
    //---base MS aquisition qp
    String msaq_ref = base_name + "_msaq";
    QcMLFile::QualityParameter qp;
    qp.id = msaq_ref; ///< Identifier
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000004";
    try
    {
      //~ const ControlledVocabulary::CVTerm& test = cv.getTermByName("MS aquisition result details");
      //~ cout << test.name << test.id << endl;
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      //~ const ControlledVocabulary::CVTerm& term = cv.getTerm("0000004");
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "mzML file"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    //---file origin qp
    qp = QcMLFile::QualityParameter();
    qp.name = "mzML file"; ///< Name
    qp.id = base_name + "_run_name"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000577";
    qp.value = base_name;
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    qp = QcMLFile::QualityParameter();
    qp.name = "instrument model"; ///< Name
    qp.id = base_name + "_instrument_name"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000031";
    qp.value = exp.getInstrument().getName();
    qcmlfile.addRunQualityParameter(base_name, qp);    

    qp = QcMLFile::QualityParameter();
    qp.name = "completion time"; ///< Name
    qp.id = base_name + "_date"; ///< Identifier
    qp.cvRef = "MS"; ///< cv reference
    qp.cvAcc = "MS:1000747";
    qp.value = exp.getDateTime().getDate();
    qcmlfile.addRunQualityParameter(base_name, qp);

    //---precursors at
    QcMLFile::Attachment at;
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000044";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_precursors"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "precursors"; ///< Name
    }

    at.colTypes.push_back("MS:1000894_[sec]"); //RT
    at.colTypes.push_back("MS:1000040"); //MZ
    for (Size i = 0; i < exp.size(); ++i)
    {
      mslevelcounts[exp[i].getMSLevel()]++;
      if (exp[i].getMSLevel() == 2)
      {
        if (exp[i].getPrecursors().front().getMZ() < min_mz)
        {
          min_mz = exp[i].getPrecursors().front().getMZ();
        }
        if (exp[i].getPrecursors().front().getMZ() > max_mz)
        {
          max_mz = exp[i].getPrecursors().front().getMZ();
        }
        std::vector<String> row;
        row.push_back(exp[i].getRT());
        row.push_back(exp[i].getPrecursors().front().getMZ());
        at.tableRows.push_back(row);
      }
    }
    qcmlfile.addRunAttachment(base_name, at);

    //---aquisition results qp
    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000006"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_ms1aquisition"; ///< Identifier
    qp.value = String(mslevelcounts[1]);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of ms1 spectra"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    

    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000007"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_ms2aquisition"; ///< Identifier
    qp.value = String(mslevelcounts[2]);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of ms2 spectra"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);

    qp = QcMLFile::QualityParameter();
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000008"; ///< cv accession for "aquisition results"
    qp.id = base_name + "_Chromaquisition"; ///< Identifier
    qp.value = String(exp.getChromatograms().size());
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "number of chromatograms"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);
    
    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000009";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_mzrange"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS MZ aquisition ranges"; ///< Name
    }

    at.colTypes.push_back("QC:0000010"); //MZ
    at.colTypes.push_back("QC:0000011"); //MZ
    std::vector<String> rowmz;
    rowmz.push_back(String(min_mz));
    rowmz.push_back(String(max_mz));
    at.tableRows.push_back(rowmz);
    qcmlfile.addRunAttachment(base_name, at);

    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000012";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_rtrange"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS RT aquisition ranges"; ///< Name
    }

    at.colTypes.push_back("QC:0000013"); //MZ
    at.colTypes.push_back("QC:0000014"); //MZ
    std::vector<String> rowrt;
    rowrt.push_back(String(exp.begin()->getRT()));
    rowrt.push_back(String(exp.getSpectra().back().getRT()));
    at.tableRows.push_back(rowrt);
    qcmlfile.addRunAttachment(base_name, at);
    

    //---ion current stability ( & tic ) qp
    at = QcMLFile::Attachment();
    at.cvRef = "QC"; ///< cv reference
    at.cvAcc = "QC:0000022";
    at.qualityRef = msaq_ref;
    at.id = base_name + "_tics"; ///< Identifier
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
      at.name = term.name; ///< Name
    }
    catch (...)
    {
      at.name = "MS TICs"; ///< Name
    }
    
    at.colTypes.push_back("MS:1000894_[sec]");
    at.colTypes.push_back("MS:1000285");
    UInt max = 0;
    Size below_10k = 0;
    for (Size i = 0; i < exp.size(); ++i)
    {
      if (exp[i].getMSLevel() == 1)
      {
        UInt sum = 0;
        for (Size j = 0; j < exp[i].size(); ++j)
        {
          sum += exp[i][j].getIntensity();
        }
        if (sum > max)
        {
          max = sum;
        }
        if (sum < 10000)
        {
          ++below_10k;
        }
        std::vector<String> row;
        row.push_back(exp[i].getRT());
        row.push_back(sum);
        at.tableRows.push_back(row);
      }
    }
    qcmlfile.addRunAttachment(base_name, at);
    

    qp = QcMLFile::QualityParameter();
    qp.id = base_name + "_ticslump"; ///< Identifier
    qp.cvRef = "QC"; ///< cv reference
    qp.cvAcc = "QC:0000023";
    qp.value = String((100 / exp.size()) * below_10k);
    try
    {
      const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
      qp.name = term.name; ///< Name
    }
    catch (...)
    {
      qp.name = "percentage of tic slumps"; ///< Name
    }
    qcmlfile.addRunQualityParameter(base_name, qp);

    
    //-------------------------------------------------------------
    // MS  id
    //------------------------------------------------------------
    if (inputfile_id != "")
    {
      IdXMLFile().load(inputfile_id, prot_ids, pep_ids);
      cerr << "idXML read ended. Found " << pep_ids.size() << " peptide identifications." << endl;

      ProteinIdentification::SearchParameters params = prot_ids[0].getSearchParameters();
      vector<String> var_mods = params.variable_modifications;
      //~ boost::regex re("(?<=[KR])(?=[^P])");
     
      String msid_ref = base_name + "_msid";
      QcMLFile::QualityParameter qp;
      qp.id = msid_ref; ///< Identifier
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000025";
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "MS identification result details"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000026";
      at.qualityRef = msid_ref;
      at.id = base_name + "_idsetting"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "MS id settings"; ///< Name
      }
      
      at.colTypes.push_back("MS:1001013"); //MS:1001013 db name  MS:1001016 version  MS:1001020 taxonomy
      at.colTypes.push_back("MS:1001016");
      at.colTypes.push_back("MS:1001020");
      std::vector<String> row;
      row.push_back(String(prot_ids.front().getSearchParameters().db));
      row.push_back(String(prot_ids.front().getSearchParameters().db_version));
      row.push_back(String(prot_ids.front().getSearchParameters().taxonomy));
      at.tableRows.push_back(row);
      qcmlfile.addRunAttachment(base_name, at);


      UInt spectrum_count = 0;
      Size peptide_hit_count = 0;
      UInt runs_count = 0;
      Size protein_hit_count = 0;
      set<String> peptides;
      set<String> proteins;
      Size missedcleavages = 0;
      for (Size i = 0; i < pep_ids.size(); ++i)
      {
        if (!pep_ids[i].empty())
        {
          ++spectrum_count;
          peptide_hit_count += pep_ids[i].getHits().size();
          const vector<PeptideHit>& temp_hits = pep_ids[i].getHits();
          for (Size j = 0; j < temp_hits.size(); ++j)
          {
            peptides.insert(temp_hits[j].getSequence().toString());
          }
        }
      }
      for (set<String>::iterator it = peptides.begin(); it != peptides.end(); ++it)
      {
        for (String::const_iterator st = it->begin(); st != it->end() - 1; ++st)
        {
          if (*st == 'K' || *st == 'R')
          {
            ++missedcleavages;
          }
        }
      }

      for (Size i = 0; i < prot_ids.size(); ++i)
      {
        ++runs_count;
        protein_hit_count += prot_ids[i].getHits().size();
        const vector<ProteinHit>& temp_hits = prot_ids[i].getHits();
        for (Size j = 0; j < temp_hits.size(); ++j)
        {
          proteins.insert(temp_hits[j].getAccession());
        }
      }
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000037"; ///< cv accession
      qp.id = base_name + "_misscleave"; ///< Identifier
      qp.value = missedcleavages;
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "total number of missed cleavages"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);

      
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000032"; ///< cv accession
      qp.id = base_name + "_totprot"; ///< Identifier
      qp.value = protein_hit_count;
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
        qp.name = "total number of identified proteins"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000033"; ///< cv accession
      qp.id = base_name + "_totuniqprot"; ///< Identifier
      qp.value = String(proteins.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of uniquely identified proteins"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000029"; ///< cv accession
      qp.id = base_name + "_psms"; ///< Identifier
      qp.value = String(spectrum_count);
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of PSM"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000030"; ///< cv accession
      qp.id = base_name + "_totpeps"; ///< Identifier
      qp.value = String(peptide_hit_count);
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of identified peptides"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000031"; ///< cv accession
      qp.id = base_name + "_totuniqpeps"; ///< Identifier
      qp.value = String(peptides.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "total number of uniquely identified peptides"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000038";
      at.qualityRef = msid_ref;
      at.id = base_name + "_massacc"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "delta ppm tables";
      }
      
      //~ delta ppm QC:0000039 RT MZ uniqueness ProteinID MS:1000885 target/decoy Score PeptideSequence MS:1000889 Annots string Similarity Charge UO:0000219 TheoreticalWeight UO:0000221 Oxidation_(M)
      at.colTypes.push_back("RT");
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("Score");
      at.colTypes.push_back("PeptideSequence");
      at.colTypes.push_back("Charge");
      at.colTypes.push_back("TheoreticalWeight");
      at.colTypes.push_back("delta_ppm");
      for (UInt w = 0; w < var_mods.size(); ++w)
      {
        at.colTypes.push_back(String(var_mods[w]).substitute(' ', '_'));
      }

      std::vector<double> deltas;
      //~ prot_ids[0].getSearchParameters();
      for (vector<PeptideIdentification>::iterator it = pep_ids.begin(); it != pep_ids.end(); ++it)
      {
        if (it->getHits().size() > 0)
        {
          std::vector<String> row;
          row.push_back(it->getRT());
          row.push_back(it->getMZ());
          PeptideHit tmp = it->getHits().front(); //TODO depends on score & sort
          vector<UInt> pep_mods;
          for (UInt w = 0; w < var_mods.size(); ++w)
          {
            pep_mods.push_back(0);
          }
          for (AASequence::ConstIterator z =  tmp.getSequence().begin(); z != tmp.getSequence().end(); ++z)
          {
            Residue res = *z;
            String temp;
            if (res.getModification().size() > 0 && res.getModification() != "Carbamidomethyl")
            {
              temp = res.getModification() + " (" + res.getOneLetterCode()  + ")";
              //cout<<res.getModification()<<endl;
              for (UInt w = 0; w < var_mods.size(); ++w)
              {
                if (temp == var_mods[w])
                {
                  //cout<<temp;
                  pep_mods[w] += 1;
                }
              }
            }
          }
          row.push_back(tmp.getScore());
          row.push_back(tmp.getSequence().toString().removeWhitespaces());
          row.push_back(tmp.getCharge());
          row.push_back(String((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge()));
          double dppm = /* std::abs */ (getMassDifference(((tmp.getSequence().getMonoWeight() + tmp.getCharge() * Constants::PROTON_MASS_U) / tmp.getCharge()), it->getMZ(), true));
          row.push_back(String(dppm));
          deltas.push_back(dppm);
          for (UInt w = 0; w < var_mods.size(); ++w)
          {
            row.push_back(pep_mods[w]);
          }
          at.tableRows.push_back(row);
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
      

      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000040"; ///< cv accession
      qp.id = base_name + "_mean_delta"; ///< Identifier
      qp.value = String(OpenMS::Math::mean(deltas.begin(), deltas.end()));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "mean delta ppm"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000041"; ///< cv accession
      qp.id = base_name + "_median_delta"; ///< Identifier
      qp.value = String(OpenMS::Math::median(deltas.begin(), deltas.end(), false));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "median delta ppm"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);


      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000035"; ///< cv accession
      qp.id = base_name + "_ratio_id"; ///< Identifier
      qp.value = String(double(pep_ids.size()) / double(mslevelcounts[2]));
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "id ratio"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);
    }

    //-------------------------------------------------------------
    // MS quantitation
    //------------------------------------------------------------
    FeatureMap map;
    String msqu_ref = base_name + "_msqu";
    if (inputfile_feature != "")
    {
      FeatureXMLFile f;
      f.load(inputfile_feature, map);

      cout << "Read featureXML file..." << endl;

      //~ UInt fiter = 0;
      map.sortByRT();
      map.updateRanges();

      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000045"; ///< cv accession
      qp.id = msqu_ref; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "MS quantification result details"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);
      
      qp = QcMLFile::QualityParameter();
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:0000046"; ///< cv accession
      qp.id = base_name + "_feature_count"; ///< Identifier
      qp.value = String(map.size());
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(qp.cvAcc);
        qp.name = term.name; ///< Name
      }
      catch (...)
      {
         qp.name = "number of features"; ///< Name
      }
      qcmlfile.addRunQualityParameter(base_name, qp);      
    }

    if (inputfile_feature != "" && !remove_duplicate_features)
    {
      
      QcMLFile::Attachment at;
      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000047";
      at.qualityRef = msqu_ref;
      at.id = base_name + "_features"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "features"; ///< Name
      }
      
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("RT");
      at.colTypes.push_back("Intensity");
      at.colTypes.push_back("Charge");
      at.colTypes.push_back("Quality");
      at.colTypes.push_back("FWHM");
      at.colTypes.push_back("IDs");
      UInt fiter = 0;
      map.sortByRT();
      //ofstream out(outputfile_name.c_str());
      while (fiter < map.size())
      {
        std::vector<String> row;
        row.push_back(map[fiter].getMZ());
        row.push_back(map[fiter].getRT());
        row.push_back(map[fiter].getIntensity());
        row.push_back(map[fiter].getCharge());
        row.push_back(map[fiter].getOverallQuality());
        row.push_back(map[fiter].getWidth());
        row.push_back(map[fiter].getPeptideIdentifications().size());
        fiter++;
        at.tableRows.push_back(row);
      }     
      qcmlfile.addRunAttachment(base_name, at);
    }
    else if (inputfile_feature != "" && remove_duplicate_features)
    {
      QcMLFile::Attachment at;
      at = QcMLFile::Attachment();
      at.cvRef = "QC"; ///< cv reference
      at.cvAcc = "QC:0000047";
      at.qualityRef = msqu_ref;
      at.id = base_name + "_features"; ///< Identifier
      try
      {
        const ControlledVocabulary::CVTerm& term = cv.getTerm(at.cvAcc);
        at.name = term.name; ///< Name
      }
      catch (...)
      {
        at.name = "features"; ///< Name
      }
      
      at.colTypes.push_back("MZ");
      at.colTypes.push_back("RT");
      at.colTypes.push_back("Intensity");
      at.colTypes.push_back("Charge");
      FeatureMap map, map_out;
      FeatureXMLFile f;
      f.load(inputfile_feature, map);
      UInt fiter = 0;
      map.sortByRT();
      while (fiter < map.size())
      {
        FeatureMap map_tmp;
        for (UInt k = fiter; k <= map.size(); ++k)
        {
          if (abs(map[fiter].getRT() - map[k].getRT()) < 0.1)
          {
            //~ cout << fiter << endl;
            map_tmp.push_back(map[k]);
          }
          else
          {
            fiter = k;
            break;
          }
        }
        map_tmp.sortByMZ();
        UInt retif = 1;
        map_out.push_back(map_tmp[0]);
        while (retif < map_tmp.size())
        {
          if (abs(map_tmp[retif].getMZ() - map_tmp[retif - 1].getMZ()) > 0.01)
          {
            cout << "equal RT, but mass different" << endl;
            map_out.push_back(map_tmp[retif]);
          }
          retif++;
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
    }
    if (inputfile_consensus != "")
    {
      cout << "Reading consensusXML file..." << endl;
      ConsensusXMLFile f;
      ConsensusMap map;
      f.load(inputfile_consensus, map);
      //~ String CONSENSUS_NAME = "_consensus.tsv";
      //~ String combined_out = outputfile_name + CONSENSUS_NAME;
      //~ ofstream out(combined_out.c_str());

      at = QcMLFile::Attachment();
      qp.name = "consensuspoints"; ///< Name
      //~ qp.id = base_name + "_consensuses"; ///< Identifier
      qp.cvRef = "QC"; ///< cv reference
      qp.cvAcc = "QC:xxxxxxxx"; ///< cv accession "featuremapper results"

      at.colTypes.push_back("Native_spectrum_ID");
      at.colTypes.push_back("DECON_RT_(sec)");
      at.colTypes.push_back("DECON_MZ_(Th)");
      at.colTypes.push_back("DECON_Intensity");
      at.colTypes.push_back("Feature_RT_(sec)");
      at.colTypes.push_back("Feature_MZ_(Th)");
      at.colTypes.push_back("Feature_Intensity");
      at.colTypes.push_back("Feature_Charge");
      for (ConsensusMap::const_iterator cmit = map.begin(); cmit != map.end(); ++cmit)
      {
        const ConsensusFeature& CF = *cmit;
        for (ConsensusFeature::const_iterator cfit = CF.begin(); cfit != CF.end(); ++cfit)
        {
          std::vector<String> row;
          FeatureHandle FH = *cfit;
          row.push_back(CF.getMetaValue("spectrum_native_id"));
          row.push_back(CF.getRT()); row.push_back(CF.getMZ());
          row.push_back(CF.getIntensity());
          row.push_back(FH.getRT());
          row.push_back(FH.getMZ());
          row.push_back(FH.getCharge());
          at.tableRows.push_back(row);
        }
      }
      qcmlfile.addRunAttachment(base_name, at);
    }
    
    
    //-------------------------------------------------------------
    // finalize
    //------------------------------------------------------------
    qcmlfile.store(outputfile_name);
    return EXECUTION_OK;
  }
示例#4
0
 String describeHit_(const PeptideHit& hit)
 {
   return "peptide hit with sequence '" + hit.getSequence().toString() +
     "', charge " + String(hit.getCharge()) + ", score " + 
     String(hit.getScore());
 }
示例#5
0
文件: AScore.cpp 项目: hroest/OpenMS
 PeptideHit AScore::compute(const PeptideHit & hit, PeakSpectrum & real_spectrum, double fragment_mass_tolerance, bool fragment_mass_unit_ppm, Size max_peptide_len, Size max_num_perm)
 {
   PeptideHit phospho = hit;
   
   //reset phospho
   phospho.setScore(-1);
   if (real_spectrum.empty())
   {
     return phospho;
   }
   
   String sequence_str = phospho.getSequence().toString();
   
   Size number_of_phosphorylation_events = numberOfPhosphoEvents_(sequence_str);
   AASequence seq_without_phospho = removePhosphositesFromSequence_(sequence_str);
   
   if (seq_without_phospho.toUnmodifiedString().size() > max_peptide_len)
   {
     LOG_DEBUG << "\tcalculation aborted: peptide too long: " << seq_without_phospho.toString() << std::endl;
     return phospho;
   }
   
   // determine all phospho sites
   vector<Size> sites(getSites_(seq_without_phospho));
   Size number_of_STY = sites.size();
   
   if (number_of_phosphorylation_events == 0 || number_of_STY == 0 || number_of_STY == number_of_phosphorylation_events)
   {
     return phospho;
   }
   
   vector<vector<Size> > permutations(computePermutations_(sites, (Int)number_of_phosphorylation_events));
   LOG_DEBUG << "\tnumber of permutations: " << permutations.size() << std::endl;
   
   // TODO: using a heuristic to calculate the best phospho sites if the number of permutations are exceeding the maximum.
   // A heuristic could be to calculate the best site for the first phosphorylation and based on this the best site for the second 
   // phosphorylation and so on until every site is determined
   if (permutations.size() > max_num_perm) 
   {
     LOG_DEBUG << "\tcalculation aborted: number of permutations exceeded" << std::endl;
     return phospho;
   }
     
   vector<PeakSpectrum> th_spectra(createTheoreticalSpectra_(permutations, seq_without_phospho));
   
   // prepare real spectrum windows
   if (!real_spectrum.isSorted())
   {
     real_spectrum.sortByPosition();
   }
   vector<PeakSpectrum> windows_top10(peakPickingPerWindowsInSpectrum_(real_spectrum));
   
   // calculate peptide score for each possible phospho site permutation
   vector<vector<double> > peptide_site_scores(calculatePermutationPeptideScores_(th_spectra, windows_top10, fragment_mass_tolerance, fragment_mass_unit_ppm));
   
   // rank peptide permutations ascending
   multimap<double, Size> ranking(rankWeightedPermutationPeptideScores_(peptide_site_scores));
   
   multimap<double, Size>::reverse_iterator rev = ranking.rbegin();
   String seq1 = th_spectra[rev->second].getName();
   phospho.setSequence(AASequence::fromString(seq1));
   phospho.setMetaValue("search_engine_sequence", hit.getSequence().toString());
   
   double peptide1_score = rev->first;
   phospho.setMetaValue("AScore_pep_score", peptide1_score); // initialize score with highest peptide score (aka highest weighted score)
   
   ++rev;
   String seq2 = th_spectra[rev->second].getName();
   double peptide2_score = rev->first;
   
   vector<ProbablePhosphoSites> phospho_sites;
   determineHighestScoringPermutations_(peptide_site_scores, phospho_sites, permutations, ranking);
   
   Int rank = 1;
   double best_Ascore = std::numeric_limits<double>::max(); // the lower the better
   for (vector<ProbablePhosphoSites>::iterator s_it = phospho_sites.begin(); s_it != phospho_sites.end(); ++s_it)
   {
     double Ascore = 0;
     if (peptide1_score == peptide2_score) // set Ascore = 0 for each phosphorylation site
     {
       LOG_DEBUG << "\tscore of best (" << seq1 << ") and second best peptide (" << seq2 << ") are equal (" << peptide1_score << ")" << std::endl;
     }
     else
     {
       vector<PeakSpectrum> site_determining_ions;
       
       computeSiteDeterminingIons_(th_spectra, *s_it, site_determining_ions, fragment_mass_tolerance, fragment_mass_unit_ppm);
       Size N = site_determining_ions[0].size(); // all possibilities have the same number so take the first one
       double p = static_cast<double>(s_it->peak_depth) / 100.0;
       
       Size n_first = 0; // number of matching peaks for first peptide
       for (Size window_idx = 0; window_idx != windows_top10.size(); ++window_idx) // for each 100 m/z window
       {
         n_first += numberOfMatchedIons_(site_determining_ions[0], windows_top10[window_idx], s_it->peak_depth, fragment_mass_tolerance, fragment_mass_unit_ppm);        
       }
       double P_first = computeCumulativeScore_(N, n_first, p);
       
       Size n_second = 0; // number of matching peaks for second peptide
       for (Size window_idx = 0; window_idx <  windows_top10.size(); ++window_idx) //each 100 m/z window
       {
         n_second += numberOfMatchedIons_(site_determining_ions[1], windows_top10[window_idx], s_it->peak_depth, fragment_mass_tolerance, fragment_mass_unit_ppm);        
       }
       Size N2 = site_determining_ions[1].size(); // all possibilities have the same number so take the first one
       double P_second = computeCumulativeScore_(N2, n_second, p);
       
       //abs is used to avoid -0 score values
       double score_first = abs(-10 * log10(P_first));
       double score_second = abs(-10 * log10(P_second));
       
       LOG_DEBUG << "\tfirst - N: " << N << ",p: " << p << ",n: " << n_first << ", score: " << score_first << std::endl;
       LOG_DEBUG << "\tsecond - N: " << N2 << ",p: " << p << ",n: " << n_second << ", score: " << score_second << std::endl;
       
       Ascore = score_first - score_second;
       LOG_DEBUG << "\tAscore_" << rank << ": " << Ascore << std::endl;
     }
     if (Ascore < best_Ascore)
     {
       best_Ascore = Ascore;
     }
     phospho.setMetaValue("AScore_" + String(rank), Ascore);
     ++rank;      
   }
   phospho.setScore(best_Ascore);
   return phospho;
 }
  ExitCodes main_(int, const char **)
  {
    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------

    StringList id_in(getStringList_("id_in"));
    StringList in_raw(getStringList_("in"));
    Size number_of_bins((UInt)getIntOption_("number_of_bins"));
    bool precursor_error_ppm(getFlag_("precursor_error_ppm"));
    bool fragment_error_ppm(getFlag_("fragment_error_ppm"));
    bool generate_gnuplot_scripts(DataValue(getStringOption_("generate_gnuplot_scripts")).toBool());

    if (in_raw.size() != id_in.size())
    {
      writeLog_("Number of spectrum files and identification files differs...");
      return ILLEGAL_PARAMETERS;
    }

    //-------------------------------------------------------------
    // reading input
    //-------------------------------------------------------------

    vector<vector<PeptideIdentification> > pep_ids;
    vector<vector<ProteinIdentification> > prot_ids;
    pep_ids.resize(id_in.size());
    prot_ids.resize(id_in.size());

    IdXMLFile idxmlfile;
    for (Size i = 0; i != id_in.size(); ++i)
    {
      String doc_id;
      idxmlfile.load(id_in[i], prot_ids[i], pep_ids[i], doc_id);
    }

    // read mzML files
    vector<RichPeakMap> maps_raw;
    maps_raw.resize(in_raw.size());

    MzMLFile mzml_file;
    for (Size i = 0; i != in_raw.size(); ++i)
    {
      mzml_file.load(in_raw[i], maps_raw[i]);
    }

    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------

    // mapping ids
    IDMapper mapper;
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      mapper.annotate(maps_raw[i], pep_ids[i], prot_ids[i]);
    }

    // normalize the spectra
    Normalizer normalizer;
    for (vector<RichPeakMap>::iterator it1 = maps_raw.begin(); it1 != maps_raw.end(); ++it1)
    {
      for (RichPeakMap::Iterator it2 = it1->begin(); it2 != it1->end(); ++it2)
      {
        normalizer.filterSpectrum(*it2);
      }
    }

    // generate precursor statistics
    vector<MassDifference> precursor_diffs;
    if (getStringOption_("precursor_out") != "")
    {
      for (Size i = 0; i != maps_raw.size(); ++i)
      {
        for (Size j = 0; j != maps_raw[i].size(); ++j)
        {
          if (maps_raw[i][j].getPeptideIdentifications().empty())
          {
            continue;
          }
          for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
          {
            if (it->getHits().size() > 0)
            {
              PeptideHit hit = *it->getHits().begin();
              MassDifference md;
              Int charge = hit.getCharge();
              if (charge == 0)
              {
                charge = 1;
              }
              md.exp_mz = it->getMZ();
              md.theo_mz = (hit.getSequence().getMonoWeight() + (double)charge * Constants::PROTON_MASS_U) / (double)charge;
              md.charge = charge;
              precursor_diffs.push_back(md);
            }
          }
        }
      }
    }

    // generate fragment ions statistics
    vector<MassDifference> fragment_diffs;
    TheoreticalSpectrumGenerator tsg;
    SpectrumAlignment sa;
    double fragment_mass_tolerance(getDoubleOption_("fragment_mass_tolerance"));
    Param sa_param(sa.getParameters());
    sa_param.setValue("tolerance", fragment_mass_tolerance);
    sa.setParameters(sa_param);

    if (getStringOption_("fragment_out") != "")
    {
      for (Size i = 0; i != maps_raw.size(); ++i)
      {
        for (Size j = 0; j != maps_raw[i].size(); ++j)
        {
          if (maps_raw[i][j].getPeptideIdentifications().empty())
          {
            continue;
          }
          for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
          {
            if (it->getHits().size() > 0)
            {
              PeptideHit hit = *it->getHits().begin();

              RichPeakSpectrum theo_spec;
              tsg.addPeaks(theo_spec, hit.getSequence(), Residue::YIon);
              tsg.addPeaks(theo_spec, hit.getSequence(), Residue::BIon);

              vector<pair<Size, Size> > pairs;
              sa.getSpectrumAlignment(pairs, theo_spec, maps_raw[i][j]);
              //cerr << hit.getSequence() << " " << hit.getSequence().getSuffix(1).getFormula() << " " << hit.getSequence().getSuffix(1).getFormula().getMonoWeight() << endl;
              for (vector<pair<Size, Size> >::const_iterator pit = pairs.begin(); pit != pairs.end(); ++pit)
              {
                MassDifference md;
                md.exp_mz = maps_raw[i][j][pit->second].getMZ();
                md.theo_mz = theo_spec[pit->first].getMZ();
                //cerr.precision(15);
                //cerr << md.exp_mz << " " << md.theo_mz << " " << md.exp_mz - md.theo_mz << endl;
                md.intensity = maps_raw[i][j][pit->second].getIntensity();
                md.charge = hit.getCharge();
                fragment_diffs.push_back(md);
              }
            }
          }
        }
      }
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    String precursor_out_file(getStringOption_("precursor_out"));
    if (precursor_out_file != "")
    {
      vector<double> errors;
      ofstream precursor_out(precursor_out_file.c_str());
      double min_diff(numeric_limits<double>::max()), max_diff(numeric_limits<double>::min());
      for (Size i = 0; i != precursor_diffs.size(); ++i)
      {
        double diff = getMassDifference(precursor_diffs[i].theo_mz, precursor_diffs[i].exp_mz, precursor_error_ppm);
        precursor_out << diff << "\n";
        errors.push_back(diff);

        if (diff > max_diff)
        {
          max_diff = diff;
        }
        if (diff < min_diff)
        {
          min_diff = diff;
        }
      }
      precursor_out.close();

      // fill histogram with the collected values
      double bin_size = (max_diff - min_diff) / (double)number_of_bins;
      Histogram<double, double> hist(min_diff, max_diff, bin_size);
      for (Size i = 0; i != errors.size(); ++i)
      {
        hist.inc(errors[i], 1.0);
      }

      writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

      // transform the histogram into a vector<DPosition<2> > for the fitting
      vector<DPosition<2> > values;
      for (Size i = 0; i != hist.size(); ++i)
      {
        DPosition<2> p;
        p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
        p.setY(hist[i]);
        values.push_back(p);
      }

      double mean = Math::mean(errors.begin(), errors.end());
      double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
      double sdv = Math::sd(errors.begin(), errors.end(), mean);
      sort(errors.begin(), errors.end());
      double median = errors[(Size)(errors.size() / 2.0)];

      writeDebug_("Precursor mean error: " + String(mean), 1);
      writeDebug_("Precursor abs. dev.:  " + String(abs_dev), 1);
      writeDebug_("Precursor std. dev.:  " + String(sdv), 1);
      writeDebug_("Precursor median error:  " + String(median), 1);


      // calculate histogram for gauss fitting
      GaussFitter gf;
      GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv/500.0);
      gf.setInitialParameters(init_param);

      try
      {
        gf.fit(values);

        // write gnuplot scripts
        if (generate_gnuplot_scripts)
        {
          ofstream out(String(precursor_out_file + "_gnuplot.dat").c_str());
          for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
          {
            out << it->getX() << " " << it->getY() << endl;
          }
          out.close();

          ofstream gpl_out(String(precursor_out_file + "_gnuplot.gpl").c_str());
          gpl_out << "set terminal png" << endl;
          gpl_out << "set output \"" << precursor_out_file  << "_gnuplot.png\"" << endl;
          if (precursor_error_ppm)
          {
            gpl_out << "set xlabel \"error in ppm\"" << endl;
          }
          else
          {
            gpl_out << "set xlabel \"error in Da\"" << endl;
          }
          gpl_out << "set ylabel \"frequency\"" << endl;
          gpl_out << "plot '" << precursor_out_file << "_gnuplot.dat' title 'Precursor mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
          gpl_out.close();
        }

      }
      catch (Exception::UnableToFit)
      {
        writeLog_("Unable to fit a Gaussian distribution to the precursor mass errors");
      }
    }

    String fragment_out_file(getStringOption_("fragment_out"));
    if (fragment_out_file != "")
    {
      vector<double> errors;
      ofstream fragment_out(fragment_out_file.c_str());
      double min_diff(numeric_limits<double>::max()), max_diff(numeric_limits<double>::min());
      for (Size i = 0; i != fragment_diffs.size(); ++i)
      {
        double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
        fragment_out << diff << endl;
        errors.push_back(diff);

        if (diff > max_diff)
        {
          max_diff = diff;
        }
        if (diff < min_diff)
        {
          min_diff = diff;
        }
      }
      fragment_out.close();

      // fill histogram with the collected values
      // here we use the intensities to scale the error
      // low intensity peaks are likely to be random matches
      double bin_size = (max_diff - min_diff) / (double)number_of_bins;
      Histogram<double, double> hist(min_diff, max_diff, bin_size);
      for (Size i = 0; i != fragment_diffs.size(); ++i)
      {
        double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
        hist.inc(diff, fragment_diffs[i].intensity);
      }

      writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

      // transform the histogram into a vector<DPosition<2> > for the fitting
      vector<DPosition<2> > values;
      for (Size i = 0; i != hist.size(); ++i)
      {
        DPosition<2> p;
        p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
        p.setY(hist[i]);
        values.push_back(p);
      }

      double mean = Math::mean(errors.begin(), errors.end());
      double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
      double sdv = Math::sd(errors.begin(), errors.end(), mean);
      sort(errors.begin(), errors.end());
      double median = errors[(Size)(errors.size() / 2.0)];

      writeDebug_("Fragment mean error:  " + String(mean), 1);
      writeDebug_("Fragment abs. dev.:   " + String(abs_dev), 1);
      writeDebug_("Fragment std. dev.:   " + String(sdv), 1);
      writeDebug_("Fragment median error:   " + String(median), 1);

      // calculate histogram for gauss fitting
      GaussFitter gf;
      GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv / 100.0);
      gf.setInitialParameters(init_param);

      try
      {
        gf.fit(values);


        // write gnuplot script
        if (generate_gnuplot_scripts)
        {
          ofstream out(String(fragment_out_file + "_gnuplot.dat").c_str());
          for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
          {
            out << it->getX() << " " << it->getY() << endl;
          }
          out.close();

          ofstream gpl_out(String(fragment_out_file + "_gnuplot.gpl").c_str());
          gpl_out << "set terminal png" << endl;
          gpl_out << "set output \"" << fragment_out_file  << "_gnuplot.png\"" << endl;
          if (fragment_error_ppm)
          {
            gpl_out << "set xlabel \"error in ppm\"" << endl;
          }
          else
          {
            gpl_out << "set xlabel \"error in Da\"" << endl;
          }
          gpl_out << "set ylabel \"frequency\"" << endl;
          gpl_out << "plot '" << fragment_out_file << "_gnuplot.dat' title 'Fragment mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
          gpl_out.close();
        }
      }
      catch (Exception::UnableToFit)
      {
        writeLog_("Unable to fit a Gaussian distribution to the fragment mass errors");
      }
    }

    return EXECUTION_OK;
  }
  ExitCodes main_(int, const char **)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------

    //input/output files
    StringList in(getStringList_("in"));
    StringList id_in(getStringList_("id_in"));
    String trained_model_file(getStringOption_("trained_model_file"));
    String model_file(getStringOption_("model_file"));
    bool score_filtering(getFlag_("score_filtering"));
    double score_threshold(getDoubleOption_("score_threshold"));
    Int min_charge(getIntOption_("min_charge"));
    Int max_charge(getIntOption_("max_charge"));

    if (in.empty())
    {
      writeLog_("For 'training' mode spectra and identifications are needed.");
      return INCOMPATIBLE_INPUT_DATA;
    }

    //bool duplicates_by_tic(getFlag_("duplicates_by_tic"));
    //bool base_model_from_file(getFlag_("base_model_from_file"));

    // create model, either read from a model file, or initialize with default parameters
    PILISModel model;
    if (model_file != "")
    {
      writeDebug_("Reading model from file '" + model_file + "'", 1);
      model.readFromFile(model_file);
    }
    else
    {
      writeDebug_("Initializing model", 1);
      model.setParameters(getParam_().copy("PILIS_parameters:", true));
      model.init();
    }

    Param pilis_param(model.getParameters());
    ModificationDefinitionsSet mod_set(pilis_param.getValue("fixed_modifications"), pilis_param.getValue("variable_modifications"));

    // read spectra file (if available)
    vector<RichPeakMap> exp;
    vector<vector<ProteinIdentification> > prot_ids;
    vector<vector<PeptideIdentification> > pep_ids;

    if (!in.empty())
    {
      FileTypes::Type in_file_type = FileHandler().getType(in[0]);
      writeDebug_("File type of parameter 'in' estimated as '" + FileTypes::typeToName(in_file_type) + "'", 1);
      // TODO check all types
      if (in_file_type == FileTypes::MSP)
      {
        writeDebug_("Reading MSP file", 1);
        MSPFile f;
        exp.resize(in.size());
        pep_ids.resize(in.size());
        for (Size i = 0; i != in.size(); ++i)
        {
          f.load(in[i], pep_ids[i], exp[i]);
          for (Size j = 0; j != exp[i].size(); ++j)
          {
            exp[i][j].getPeptideIdentifications().push_back(pep_ids[i][j]);
          }
        }
      }

      if (in_file_type == FileTypes::MZML)
      {
        MzMLFile f;
        f.setLogType(log_type_);

        exp.resize(in.size());
        for (Size i = 0; i != in.size(); ++i)
        {
          f.load(in[i], exp[i]);
        }
      }
    }

    if (!id_in.empty())
    {
      prot_ids.resize(id_in.size());
      pep_ids.resize(id_in.size());
      IdXMLFile f;
      for (Size i = 0; i != id_in.size(); ++i)
      {
        f.load(id_in[i], prot_ids[i], pep_ids[i]);
      }
    }

    if (!id_in.empty() && !in.empty())
    {
      // map the
      if (id_in.size() != in.size())
      {
        writeLog_("If in parameter contains mzML files and id_in contains idXML files, the number should be equal to allow mapping of the identification to the spectra");
        return INCOMPATIBLE_INPUT_DATA;
      }

      // map the ids to the spectra
      IDMapper id_mapper;
      for (Size i = 0; i != exp.size(); ++i)
      {
        id_mapper.annotate(exp[i], pep_ids[i], prot_ids[i]);
      }
    }

    // get the peptides and spectra
    vector<PILISCrossValidation::Peptide> peptides;

    for (vector<RichPeakMap>::const_iterator it1 = exp.begin(); it1 != exp.end(); ++it1)
    {
      for (RichPeakMap::ConstIterator it2 = it1->begin(); it2 != it1->end(); ++it2)
      {
        if (it2->getPeptideIdentifications().empty())
        {
          continue;
        }

        PeptideHit hit;

        if (it2->getPeptideIdentifications().begin()->getHits().size() > 0)
        {
          hit = *it2->getPeptideIdentifications().begin()->getHits().begin();
        }
        else
        {
          continue;
        }

        // check whether the sequence contains a modification not modelled
        if (!mod_set.isCompatible(hit.getSequence()) || hit.getSequence().size() > (UInt)pilis_param.getValue("visible_model_depth"))
        {
          continue;
        }

        if (score_filtering &&
            ((hit.getScore() < score_threshold && it2->getPeptideIdentifications().begin()->isHigherScoreBetter()) ||
             (hit.getScore() > score_threshold && !it2->getPeptideIdentifications().begin()->isHigherScoreBetter())))
        {
          continue;
        }

        PILISCrossValidation::Peptide pep_struct;
        pep_struct.sequence = hit.getSequence();
        pep_struct.charge = hit.getCharge();
        pep_struct.spec = *it2;
        pep_struct.hits = it2->getPeptideIdentifications().begin()->getHits();

        // check charges
        if (pep_struct.charge < min_charge || pep_struct.charge > max_charge)
        {
          continue;
        }

        peptides.push_back(pep_struct);
      }
    }


    getUniquePeptides(peptides);
    writeDebug_("Number of (unique) peptides for training: " + String(peptides.size()), 1);

    //model.writeToFile("pilis_tmp.dat");

    model.setParameters(pilis_param);
    for (vector<PILISCrossValidation::Peptide>::const_iterator it = peptides.begin(); it != peptides.end(); ++it)
    {
      model.train(it->spec, it->sequence, it->charge);
    }
    model.evaluate();

    if (trained_model_file != "")
    {
      model.writeToFile(trained_model_file);
    }


    return EXECUTION_OK;
  }