示例#1
0
文件: IDMapper.C 项目: BioITer/OpenMS
  void IDMapper::getIDDetails_(const PeptideIdentification & id, DoubleReal & rt_pep, DoubleList & mz_values, IntList & charges, bool use_avg_mass) const
  {
    mz_values.clear();
    charges.clear();

    rt_pep = id.getMetaValue("RT");

    // collect m/z values of pepId
    if (param_.getValue("mz_reference") == "precursor") // use precursor m/z of pepId
    {
      mz_values.push_back(id.getMetaValue("MZ"));
    }

    for (vector<PeptideHit>::const_iterator hit_it = id.getHits().begin();
         hit_it != id.getHits().end(); ++hit_it)
    {
      Int charge = hit_it->getCharge();
      charges.push_back(charge);

      if (param_.getValue("mz_reference") == "peptide") // use mass of each pepHit (assuming H+ adducts)
      {
        DoubleReal mass = use_avg_mass ?
                          hit_it->getSequence().getAverageWeight(Residue::Full, charge) :
                          hit_it->getSequence().getMonoWeight(Residue::Full, charge);

        mz_values.push_back( mass / (DoubleReal) charge);
      }
    }
  }
示例#2
0
  //Visualizing PeptideIdentification object
  void MetaDataBrowser::visualize_(PeptideIdentification & meta, QTreeWidgetItem * parent)
  {
    PeptideIdentificationVisualizer * visualizer = new PeptideIdentificationVisualizer(isEditable(), this, this);

    QStringList labels;
    int id = ws_->addWidget(visualizer);
    labels << QString("PeptideIdentification %1").arg(meta.getScoreType().c_str()) << QString::number(id);

    visualizer->load(meta, id);

    QTreeWidgetItem * item;
    if (parent == nullptr)
    {
      item = new QTreeWidgetItem(treeview_, labels);
    }
    else
    {
      item = new QTreeWidgetItem(parent, labels);
    }

    //check for proteins and peptides hits
    meta.assignRanks();

    //list all peptides hits in the tree
    for (Size i = 0; i < meta.getHits().size(); ++i)
    {
      visualize_(const_cast<PeptideHit &>(meta.getHits()[i]), item);
    }

    visualize_(dynamic_cast<MetaInfoInterface &>(meta), item);

    connectVisualizer_(visualizer);
  }
示例#3
0
  bool IDFilter::filterIdentificationsByMetaValueRange(const PeptideIdentification& identification, const String& key, DoubleReal low, DoubleReal high, bool missing)
  {
    if (!identification.metaValueExists(key)) return missing;

    DoubleReal value = identification.getMetaValue(key);
    return (value >= low) && (value <= high);
  }
示例#4
0
  void IDFilter::filterIdentificationsByCharge(const PeptideIdentification& identification,
                                               Int min_charge,
                                               PeptideIdentification& filtered_identification)
  {
    vector<Size> new_peptide_indices;
    vector<PeptideHit> filtered_peptide_hits;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());

    const vector<PeptideHit>& temp_peptide_hits = identification.getHits();

    for (Size i = 0; i < temp_peptide_hits.size(); i++)
    {
      if (temp_peptide_hits[i].getCharge() >= min_charge)
      {
        new_peptide_indices.push_back(i);
      }
    }

    for (Size i = 0; i < new_peptide_indices.size(); i++)
    {
      filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]);
    }
    if (!filtered_peptide_hits.empty())
    {
      filtered_identification.setHits(filtered_peptide_hits);
      filtered_identification.assignRanks();
    }
  }
示例#5
0
  void IDFilter::filterIdentificationsByMzError(const PeptideIdentification& identification, DoubleReal mass_error, bool unit_ppm, PeptideIdentification& filtered_identification)
  {
    vector<PeptideHit> hits;
    filtered_identification = identification;
    vector<PeptideHit> temp_hits = identification.getHits();

    for (vector<PeptideHit>::iterator it = temp_hits.begin(); it != temp_hits.end(); ++it)
    {
      Int charge = it->getCharge();

      if (charge == 0)
      {
        charge = 1;
      }

      DoubleReal exp_mz = (DoubleReal)identification.getMetaValue("MZ");
      DoubleReal theo_mz =  (it->getSequence().getMonoWeight() + (DoubleReal)charge * Constants::PROTON_MASS_U) / (DoubleReal)charge;
      DoubleReal error(exp_mz - theo_mz);

      if (unit_ppm)
      {
        error = error / theo_mz * (DoubleReal)1e6;
      }

      if (fabs(error) <= mass_error)
      {
        hits.push_back(*it);
      }
    }
    filtered_identification.setHits(hits);
  }
示例#6
0
void CompNovoIdentificationCID::getIdentifications(vector<PeptideIdentification> & pep_ids, const PeakMap & exp)
{
    Size count(1);
    for (PeakMap::ConstIterator it = exp.begin(); it != exp.end(); ++it, ++count)
    {
        //cerr << count << "/" << exp.size() << endl;
        PeptideIdentification id;
        // TODO check if both CID and ETD is present;
        PeakSpectrum CID_spec(*it);
        id.setRT(it->getRT());
        id.setMZ(it->getPrecursors().begin()->getMZ());

        subspec_to_sequences_.clear();
        permute_cache_.clear();
        decomp_cache_.clear();

        getIdentification(id, CID_spec);
        //cerr << "size_of id=" << id.getHits().size() << endl;
        pep_ids.push_back(id);

        //++it;

        //
        //if (count == 10)
        //{
        //return;
        //}
    }
    return;
}
示例#7
0
 // Equality operator
 bool PeptideIdentification::operator==(const PeptideIdentification & rhs) const
 {
   return MetaInfoInterface::operator==(rhs)
          && id_ == rhs.id_
          && hits_ == rhs.getHits()
          && significance_threshold_ == rhs.getSignificanceThreshold()
          && score_type_ == rhs.score_type_
          && higher_score_better_ == rhs.higher_score_better_;
 }
示例#8
0
 // compare peptide IDs by score of best hit (hits must be sorted first!)
 // (note to self: the "static" is necessary to avoid cryptic "no matching
 // function" errors from gcc when the comparator is used below)
 static bool compareIDs_(const PeptideIdentification & left,
                         const PeptideIdentification & right)
 {
   if (left.getHits()[0].getScore() < right.getHits()[0].getScore())
   {
     return true;
   }
   return false;
 }
 // Equality operator
 bool PeptideIdentification::operator==(const PeptideIdentification& rhs) const
 {
   return MetaInfoInterface::operator==(rhs)
          && id_ == rhs.id_
          && (rt_ == rhs.rt_ || (!this->hasRT() && !rhs.hasRT())) // might be NaN, so comparing == will always be false
          && (mz_ == rhs.mz_ || (!this->hasMZ() && !rhs.hasMZ())) // might be NaN, so comparing == will always be false
          && hits_ == rhs.getHits()
          && significance_threshold_ == rhs.getSignificanceThreshold()
          && score_type_ == rhs.score_type_
          && higher_score_better_ == rhs.higher_score_better_
          && base_name_ == rhs.base_name_;
 }
  // list of peptide hits in "peptide" will be sorted
  bool MapAlignmentAlgorithmIdentification::hasGoodHit_(PeptideIdentification &
                                                        peptide)
  {
    if (peptide.empty() || peptide.getHits().empty())
      return false;

    peptide.sort();
    DoubleReal score = peptide.getHits().begin()->getScore();
    if (peptide.isHigherScoreBetter())
      return score >= score_threshold_;

    return score <= score_threshold_;
  }
示例#11
0
  void IDFilter::filterIdentificationsByBestHits(const PeptideIdentification& identification,
                                                 PeptideIdentification& filtered_identification,
                                                 bool strict)
  {
    vector<PeptideHit> filtered_peptide_hits;
    PeptideHit temp_peptide_hit;
    vector<Size> new_peptide_indices;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());

    if (!identification.getHits().empty())
    {
      Real optimal_value = identification.getHits()[0].getScore();
      new_peptide_indices.push_back(0);

      // searching for peptide(s) with maximal score
      for (Size i = 1; i < identification.getHits().size(); i++)
      {
        Real temp_score = identification.getHits()[i].getScore();
        bool new_leader = false;
        if ((identification.isHigherScoreBetter() && (temp_score > optimal_value))
           || (!identification.isHigherScoreBetter() && (temp_score < optimal_value)))
          new_leader = true;

        if (new_leader)
        {
          optimal_value = temp_score;
          new_peptide_indices.clear();
          new_peptide_indices.push_back(i);
        }
        else if (temp_score == optimal_value)
        {
          new_peptide_indices.push_back(i);
        }
      }
      if (!strict || new_peptide_indices.size() == 1)
      {
        for (Size i = 0; i < new_peptide_indices.size(); i++)
        {
          filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]);
        }
      }
    }

    if (!filtered_peptide_hits.empty())
    {
      filtered_identification.setHits(filtered_peptide_hits);
      filtered_identification.assignRanks();
    }
  }
示例#12
0
  void IDFilter::filterIdentificationsByProteins(const PeptideIdentification& identification,
                                                 const vector<FASTAFile::FASTAEntry>& proteins,
                                                 PeptideIdentification& filtered_identification,
                                                 bool no_protein_identifiers)
  {
    // TODO: this is highly inefficient! the Protein-Index should be build once for all peptide-identifications instead of
    //       doing this once for every ID. Furthermore the index itself is inefficient (use seqan instead)
    String protein_sequences;
    String accession_sequences;
    vector<PeptideHit> filtered_peptide_hits;
    PeptideHit temp_peptide_hit;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());

    for (Size i = 0; i < proteins.size(); i++)
    {
      if (proteins[i].identifier != "")
      {
        accession_sequences.append("*" + proteins[i].identifier);
      }
      if (proteins[i].sequence != "")
      {
        protein_sequences.append("*" + proteins[i].sequence);
      }
    }
    accession_sequences.append("*");
    protein_sequences.append("*");

    for (Size i = 0; i < identification.getHits().size(); i++)
    {
      if (no_protein_identifiers || accession_sequences == "*") // filter by sequence alone if no protein accesssions are available
      {
        if (protein_sequences.find(identification.getHits()[i].getSequence().toUnmodifiedString()) != String::npos)
        {
          filtered_peptide_hits.push_back(identification.getHits()[i]);
        }
      }
      else // filter by protein accessions
      {
        for (vector<String>::const_iterator ac_it = identification.getHits()[i].getProteinAccessions().begin();
             ac_it != identification.getHits()[i].getProteinAccessions().end();
             ++ac_it)
        {
          if (accession_sequences.find("*" + *ac_it) != String::npos)
          {
            filtered_peptide_hits.push_back(identification.getHits()[i]);
            break; // we found a matching protein, the peptide is valid -> exit
          }
        }
      }
    }

    filtered_identification.setHits(filtered_peptide_hits);
    filtered_identification.assignRanks();
  }
std::vector<PeptideIdentification> toPepVec(const QStringList& sl_pep)
{
  std::vector<PeptideIdentification> pep_vec;
  for (Size i = 0; i < sl_pep.size(); ++i)
  {
    PeptideHit hit;
    hit.setSequence(AASequence::fromString(sl_pep[int(i)]));
    std::vector<PeptideHit> hits;
    hits.push_back(hit);
    PeptideIdentification pi;
    pi.setHits(hits);
    pep_vec.push_back(pi);
  }
  return pep_vec;
}
示例#14
0
  void IDFilter::filterIdentificationsByRTPValues(const PeptideIdentification& identification,
                                                  PeptideIdentification& filtered_identification,
                                                  DoubleReal p_value)
  {
    DoubleReal border = 1 - p_value;
    vector<PeptideHit> filtered_peptide_hits;
    PeptideHit temp_peptide_hit;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());

    Size missing_meta_value = 0;

    for (Size i = 0; i < identification.getHits().size(); i++)
    {
      if (identification.getHits()[i].metaValueExists("predicted_RT_p_value"))
      {
        if ((DoubleReal)(identification.getHits()[i].getMetaValue("predicted_RT_p_value")) <= border)
        {
          filtered_peptide_hits.push_back(identification.getHits()[i]);
        }
      }
      else
        ++missing_meta_value;
    }
    if (missing_meta_value > 0)
      LOG_WARN << "Filtering identifications by p-value did not work on " << missing_meta_value << " of " << identification.getHits().size() << " hits. Your data is missing a meta-value ('predicted_RT_p_value') from RTPredict!\n";

    if (!filtered_peptide_hits.empty())
    {
      filtered_identification.setHits(filtered_peptide_hits);
      filtered_identification.assignRanks();
    }
  }
示例#15
0
  void IDFilter::filterIdentificationsByCharge(const PeptideIdentification& identification,
                                               Int min_charge,
                                               PeptideIdentification& filtered_identification)
  {
    filtered_identification = identification;
    const vector<PeptideHit>& temp_peptide_hits = identification.getHits();
    vector<PeptideHit> filtered_peptide_hits;
    for (Size i = 0; i < temp_peptide_hits.size(); ++i)
    {
      if (temp_peptide_hits[i].getCharge() >= min_charge)
      {
        filtered_peptide_hits.push_back(temp_peptide_hits[i]);
      }
    }

    filtered_identification.setHits(filtered_peptide_hits);
    filtered_identification.assignRanks();
  }
示例#16
0
  void IDFilter::filterIdentificationsUnique(const PeptideIdentification& identification,
                                             PeptideIdentification& filtered_identification)
  {
    vector<PeptideHit> hits;
    filtered_identification = identification;
    vector<PeptideHit> temp_hits = identification.getHits();

    for (vector<PeptideHit>::iterator it = temp_hits.begin();
         it != temp_hits.end();
         ++it)
    {
      if (find(hits.begin(), hits.end(), *it) == hits.end())
      {
        hits.push_back(*it);
      }
    }
    filtered_identification.setHits(hits);
  }
示例#17
0
  void IDFilter::filterIdentificationsUnique(const PeptideIdentification& identification,
                                             PeptideIdentification& filtered_identification)
  {
    // there's no "PeptideHit::operator<" defined, so we can't use a set nor
    // "sort" + "unique" from the standard library
    vector<PeptideHit> hits;
    filtered_identification = identification;
    vector<PeptideHit> temp_hits = identification.getHits();

    for (vector<PeptideHit>::iterator it = temp_hits.begin();
         it != temp_hits.end();
         ++it)
    {
      if (find(hits.begin(), hits.end(), *it) == hits.end())
      {
        hits.push_back(*it);
      }
    }
    filtered_identification.setHits(hits);
  }
示例#18
0
  void IDFilter::filterIdentificationsByExclusionPeptides(const PeptideIdentification& identification,
                                                          const set<String>& peptides,
                                                          PeptideIdentification& filtered_identification)
  {
    String protein_sequences;
    String accession_sequences;
    vector<PeptideHit> filtered_peptide_hits;
    PeptideHit temp_peptide_hit;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());

    for (Size i = 0; i < identification.getHits().size(); i++)
    {
      if (find(peptides.begin(), peptides.end(), identification.getHits()[i].getSequence().toString()) == peptides.end())
      {
        filtered_peptide_hits.push_back(identification.getHits()[i]);
      }
    }
    if (!filtered_peptide_hits.empty())
    {
      filtered_identification.setHits(filtered_peptide_hits);
      filtered_identification.assignRanks();
    }
  }
示例#19
0
文件: IDRipper.C 项目: BioITer/OpenMS
  void IDRipper::getProteinIdentification_(ProteinIdentification & result, PeptideIdentification pep_ident, std::vector<ProteinIdentification> & prot_idents)
  {
    const String & identifier = pep_ident.getIdentifier();

    for (vector<ProteinIdentification>::iterator prot_it = prot_idents.begin(); prot_it != prot_idents.end(); ++prot_it)
    {
      if (identifier.compare(prot_it->getIdentifier()) == 0)
      {
        result = *prot_it;
        break;
      }
    }
  }
示例#20
0
  void IDFilter::filterIdentificationsByLength(const PeptideIdentification& identification,
                                               PeptideIdentification& filtered_identification,
                                               Size min_length,
                                               Size max_length)
  {
    vector<Size> new_peptide_indices;
    vector<PeptideHit> filtered_peptide_hits;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());
    Size ml = max_length;
    if (max_length < min_length)
    {
      ml = UINT_MAX;
    }

    const vector<PeptideHit>& temp_peptide_hits = identification.getHits();

    for (Size i = 0; i < temp_peptide_hits.size(); i++)
    {
      if (temp_peptide_hits[i].getSequence().size() >= min_length && temp_peptide_hits[i].getSequence().size() <= ml)
      {
        new_peptide_indices.push_back(i);
      }
    }

    for (Size i = 0; i < new_peptide_indices.size(); i++)
    {
      filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]);
    }
    if (!filtered_peptide_hits.empty())
    {
      filtered_identification.setHits(filtered_peptide_hits);
      filtered_identification.assignRanks();
    }
  }
示例#21
0
  void IDFilter::filterIdentificationsByLength(const PeptideIdentification& identification,
                                               PeptideIdentification& filtered_identification,
                                               Size min_length,
                                               Size max_length)
  {
    filtered_identification = identification;
    if (max_length < min_length)
    {
      max_length = UINT_MAX;
    }

    const vector<PeptideHit>& temp_peptide_hits = identification.getHits();
    vector<PeptideHit> filtered_peptide_hits;
    for (Size i = 0; i < temp_peptide_hits.size(); ++i)
    {
      if (min_length <= temp_peptide_hits[i].getSequence().size() && temp_peptide_hits[i].getSequence().size() <= max_length)
      {
        filtered_peptide_hits.push_back(temp_peptide_hits[i]);
      }
    }

    filtered_identification.setHits(filtered_peptide_hits);
    filtered_identification.assignRanks();
  }
  // static
  bool IDConflictResolverAlgorithm::compareIDsSmallerScores_(const PeptideIdentification & left, const PeptideIdentification & right)
  {
    // if any of them is empty, the other is considered "greater"
    // independent of the score in the first hit
    if (left.getHits().empty() || right.getHits().empty()) 
    { // also: for strict weak ordering, comp(x,x) needs to be false
      return left.getHits().size() < right.getHits().size();
    }

    if (left.getHits()[0].getScore() < right.getHits()[0].getScore())
    {
      return true;
    }
    return false;
  }
示例#23
0
  void IDFilter::filterIdentificationsByExclusionPeptides(const PeptideIdentification& identification,
                                                          const set<String>& peptides,
                                                          bool ignore_modifications,
                                                          PeptideIdentification& filtered_identification)
  {
    vector<PeptideHit> filtered_peptide_hits;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());

    for (Size i = 0; i < identification.getHits().size(); i++)
    {
      String query = ignore_modifications ? identification.getHits()[i].getSequence().toUnmodifiedString() : identification.getHits()[i].getSequence().toString();
      if (find(peptides.begin(), peptides.end(), query) == peptides.end())
      {
        filtered_peptide_hits.push_back(identification.getHits()[i]);
      }
    }
    if (!filtered_peptide_hits.empty())
    {
      filtered_identification.setHits(filtered_peptide_hits);
      filtered_identification.assignRanks();
    }
  }
示例#24
0
void CompNovoIdentificationCID::getIdentification(PeptideIdentification & id, const PeakSpectrum & CID_spec)
{
    //if (CID_spec.getPrecursors().begin()->getMZ() > 1000.0)
    //{
    //cerr << "Weight of precursor has been estimated to exceed 2000.0 Da which is the current limit" << endl;
    //return;
    //}

    PeakSpectrum new_CID_spec(CID_spec);
    windowMower_(new_CID_spec, 0.3, 1);

    Param zhang_param;
    zhang_param = zhang_.getParameters();
    zhang_param.setValue("tolerance", fragment_mass_tolerance_);
    zhang_param.setValue("use_gaussian_factor", "true");
    zhang_param.setValue("use_linear_factor", "false");
    zhang_.setParameters(zhang_param);


    Normalizer normalizer;
    Param n_param(normalizer.getParameters());
    n_param.setValue("method", "to_one");
    normalizer.setParameters(n_param);
    normalizer.filterSpectrum(new_CID_spec);

    Size charge(2);
    double precursor_weight(0);     // [M+H]+
    if (!CID_spec.getPrecursors().empty())
    {
        // believe charge of spectrum?
        if (CID_spec.getPrecursors().begin()->getCharge() != 0)
        {
            charge = CID_spec.getPrecursors().begin()->getCharge();
        }
        else
        {
            // TODO estimate charge state
        }
        precursor_weight = CID_spec.getPrecursors().begin()->getMZ() * charge - ((charge - 1) * Constants::PROTON_MASS_U);
    }

    //cerr << "charge=" << charge << ", [M+H]=" << precursor_weight << endl;

    // now delete all peaks that are right of the estimated precursor weight
    Size peak_counter(0);
    for (PeakSpectrum::ConstIterator it = new_CID_spec.begin(); it != new_CID_spec.end(); ++it, ++peak_counter)
    {
        if (it->getPosition()[0] > precursor_weight)
        {
            break;
        }
    }
    if (peak_counter < new_CID_spec.size())
    {
        new_CID_spec.resize(peak_counter);
    }


    static double oxonium_mass = EmpiricalFormula("H2O+").getMonoWeight();

    Peak1D p;
    p.setIntensity(1);
    p.setPosition(oxonium_mass);

    new_CID_spec.push_back(p);

    p.setPosition(precursor_weight);
    new_CID_spec.push_back(p);

    // add complement to spectrum
    /*
    for (PeakSpectrum::ConstIterator it1 = CID_spec.begin(); it1 != CID_spec.end(); ++it1)
    {
    // get m/z of complement
    double mz_comp = precursor_weight - it1->getPosition()[0] + Constants::PROTON_MASS_U;

    // search if peaks are available that have similar m/z values
    Size count(0);
    bool found(false);
    for (PeakSpectrum::ConstIterator it2 = CID_spec.begin(); it2 != CID_spec.end(); ++it2, ++count)
    {
    if (fabs(mz_comp - it2->getPosition()[0]) < fragment_mass_tolerance)
    {
      // add peak intensity to corresponding peak in new_CID_spec
      new_CID_spec[count].setIntensity(new_CID_spec[count].getIntensity());
    }
    }
    if (!found)
    {
    // infer this peak
    Peak1D p;
    p.setIntensity(it1->getIntensity());
    p.setPosition(mz_comp);
    new_CID_spec.push_back(p);
    }
    }*/

    CompNovoIonScoringCID ion_scoring;
    Param ion_scoring_param(ion_scoring.getParameters());
    ion_scoring_param.setValue("fragment_mass_tolerance", fragment_mass_tolerance_);
    ion_scoring_param.setValue("precursor_mass_tolerance", precursor_mass_tolerance_);
    ion_scoring_param.setValue("decomp_weights_precision", decomp_weights_precision_);
    ion_scoring_param.setValue("double_charged_iso_threshold", (double)param_.getValue("double_charged_iso_threshold"));
    ion_scoring_param.setValue("max_isotope_to_score", param_.getValue("max_isotope_to_score"));
    ion_scoring_param.setValue("max_isotope", max_isotope_);
    ion_scoring.setParameters(ion_scoring_param);

    Map<double, IonScore> ion_scores;
    ion_scoring.scoreSpectrum(ion_scores, new_CID_spec, precursor_weight, charge);

    new_CID_spec.sortByPosition();

    /*
    cerr << "Size of ion_scores " << ion_scores.size() << endl;
    for (Map<double, IonScore>::const_iterator it = ion_scores.begin(); it != ion_scores.end(); ++it)
    {
        cerr << it->first << " " << it->second.score << endl;
    }*/

#ifdef WRITE_SCORED_SPEC
    PeakSpectrum filtered_spec(new_CID_spec);
    filtered_spec.clear();
    for (Map<double, CompNovoIonScoringCID::IonScore>::const_iterator it = ion_scores.begin(); it != ion_scores.end(); ++it)
    {
        Peak1D p;
        p.setIntensity(it->second.score);
        p.setPosition(it->first);
        filtered_spec.push_back(p);
    }
    DTAFile().store("spec_scored.dta", filtered_spec);
#endif

    set<String> sequences;
    getDecompositionsDAC_(sequences, 0, new_CID_spec.size() - 1, precursor_weight, new_CID_spec, ion_scores);

#ifdef SPIKE_IN
    sequences.insert("AFCVDGEGR");
    sequences.insert("APEFAAPWPDFVPR");
    sequences.insert("AVKQFEESQGR");
    sequences.insert("CCTESLVNR");
    sequences.insert("DAFLGSFLYEYSR");
    sequences.insert("DAIPENLPPLTADFAEDK");
    sequences.insert("DDNKVEDIWSFLSK");
    sequences.insert("DDPHACYSTVFDK");
    sequences.insert("DEYELLCLDGSR");
    sequences.insert("DGAESYKELSVLLPNR");
    sequences.insert("DGASCWCVDADGR");
    sequences.insert("DLFIPTCLETGEFAR");
    sequences.insert("DTHKSEIAHR");
    sequences.insert("DVCKNYQEAK");
    sequences.insert("EACFAVEGPK");
    sequences.insert("ECCHGDLLECADDR");
    sequences.insert("EFLGDKFYTVISSLK");
    sequences.insert("EFTPVLQADFQK");
    sequences.insert("ELFLDSGIFQPMLQGR");
    sequences.insert("ETYGDMADCCEK");
    sequences.insert("EVGCPSSSVQEMVSCLR");
    sequences.insert("EYEATLEECCAK");
    sequences.insert("FADLIQSGTFQLHLDSK");
    sequences.insert("FFSASCVPGATIEQK");
    sequences.insert("FLANVSTVLTSK");
    sequences.insert("FLSGSDYAIR");
    sequences.insert("FTASCPPSIK");
    sequences.insert("GAIEWEGIESGSVEQAVAK");
    sequences.insert("GDVAFIQHSTVEENTGGK");
    sequences.insert("GEPPSCAEDQSCPSER");
    sequences.insert("GEYVPTSLTAR");
    sequences.insert("GQEFTITGQKR");
    sequences.insert("GTFAALSELHCDK");
    sequences.insert("HLVDEPQNLIK");
    sequences.insert("HQDCLVTTLQTQPGAVR");
    sequences.insert("HTTVNENAPDQK");
    sequences.insert("ILDCGSPDTEVR");
    sequences.insert("KCPSPCQLQAER");
    sequences.insert("KGTEFTVNDLQGK");
    sequences.insert("KQTALVELLK");
    sequences.insert("KVPQVSTPTLVEVSR");
    sequences.insert("LALQFTTNAKR");
    sequences.insert("LCVLHEKTPVSEK");
    sequences.insert("LFTFHADICTLPDTEK");
    sequences.insert("LGEYGFQNALIVR");
    sequences.insert("LHVDPENFK");
    sequences.insert("LKECCDKPLLEK");
    sequences.insert("LKHLVDEPQNLIK");
    sequences.insert("LKPDPNTLCDEFK");
    sequences.insert("LLGNVLVVVLAR");
    sequences.insert("LLVVYPWTQR");
    sequences.insert("LRVDPVNFK");
    sequences.insert("LTDEELAFPPLSPSR");
    sequences.insert("LVNELTEFAK");
    sequences.insert("MFLSFPTTK");
    sequences.insert("MPCTEDYLSLILNR");
    sequences.insert("NAPYSGYSGAFHCLK");
    sequences.insert("NECFLSHKDDSPDLPK");
    sequences.insert("NEPNKVPACPGSCEEVK");
    sequences.insert("NLQMDDFELLCTDGR");
    sequences.insert("QAGVQAEPSPK");
    sequences.insert("RAPEFAAPWPDFVPR");
    sequences.insert("RHPEYAVSVLLR");
    sequences.insert("RPCFSALTPDETYVPK");
    sequences.insert("RSLLLAPEEGPVSQR");
    sequences.insert("SAFPPEPLLCSVQR");
    sequences.insert("SAGWNIPIGTLLHR");
    sequences.insert("SCWCVDEAGQK");
    sequences.insert("SGNPNYPHEFSR");
    sequences.insert("SHCIAEVEK");
    sequences.insert("SISSGFFECER");
    sequences.insert("SKYLASASTMDHAR");
    sequences.insert("SLHTLFGDELCK");
    sequences.insert("SLLLAPEEGPVSQR");
    sequences.insert("SPPQCSPDGAFRPVQCK");
    sequences.insert("SREGDPLAVYLK");
    sequences.insert("SRQIPQCPTSCER");
    sequences.insert("TAGTPVSIPVCDDSSVK");
    sequences.insert("TCVADESHAGCEK");
    sequences.insert("TQFGCLEGFGR");
    sequences.insert("TVMENFVAFVDK");
    sequences.insert("TYFPHFDLSHGSAQVK");
    sequences.insert("TYMLAFDVNDEK");
    sequences.insert("VDEVGGEALGR");
    sequences.insert("VDLLIGSSQDDGLINR");
    sequences.insert("VEDIWSFLSK");
    sequences.insert("VGGHAAEYGAEALER");
    sequences.insert("VGTRCCTKPESER");
    sequences.insert("VKVDEVGGEALGR");
    sequences.insert("VKVDLLIGSSQDDGLINR");
    sequences.insert("VLDSFSNGMK");
    sequences.insert("VLSAADKGNVK");
    sequences.insert("VPQVSTPTLVEVSR");
    sequences.insert("VTKCCTESLVNR");
    sequences.insert("VVAASDASQDALGCVK");
    sequences.insert("VVAGVANALAHR");
    sequences.insert("YICDNQDTISSK");
    sequences.insert("YLASASTMDHAR");
    sequences.insert("YNGVFQECCQAEDK");
#endif

    SpectrumAlignmentScore spectra_zhang;
    spectra_zhang.setParameters(zhang_param);

    vector<PeptideHit> hits;
    Size missed_cleavages = param_.getValue("missed_cleavages");
    for (set<String>::const_iterator it = sequences.begin(); it != sequences.end(); ++it)
    {

        Size num_missed = countMissedCleavagesTryptic_(*it);
        if (missed_cleavages < num_missed)
        {
            //cerr << "Two many missed cleavages: " << *it << ", found " << num_missed << ", allowed " << missed_cleavages << endl;
            continue;
        }
        PeakSpectrum CID_sim_spec;
        getCIDSpectrum_(CID_sim_spec, *it, charge);

        //normalizer.filterSpectrum(CID_sim_spec);

        double cid_score = zhang_(CID_sim_spec, CID_spec);

        PeptideHit hit;
        hit.setScore(cid_score);

        hit.setSequence(getModifiedAASequence_(*it));
        hit.setCharge((Int)charge);   //TODO unify charge interface: int or size?
        hits.push_back(hit);
        //cerr << getModifiedAASequence_(*it) << " " << cid_score << " " << endl;
    }

    // rescore the top hits
    id.setHits(hits);
    id.assignRanks();

    hits = id.getHits();

    SpectrumAlignmentScore alignment_score;
    Param align_param(alignment_score.getParameters());
    align_param.setValue("tolerance", fragment_mass_tolerance_);
    align_param.setValue("use_linear_factor", "true");
    alignment_score.setParameters(align_param);

    for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it)
    {
        //cerr << "Pre: " << it->getRank() << " " << it->getSequence() << " " << it->getScore() << " " << endl;
    }

    Size number_of_prescoring_hits = param_.getValue("number_of_prescoring_hits");
    if (hits.size() > number_of_prescoring_hits)
    {
        hits.resize(number_of_prescoring_hits);
    }

    for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it)
    {
        PeakSpectrum CID_sim_spec;
        getCIDSpectrum_(CID_sim_spec, getModifiedStringFromAASequence_(it->getSequence()), charge);

        normalizer.filterSpectrum(CID_sim_spec);

        //DTAFile().store("sim_specs/" + it->getSequence().toUnmodifiedString() + "_sim_CID.dta", CID_sim_spec);

        //double cid_score = spectra_zhang(CID_sim_spec, CID_spec);
        double cid_score = alignment_score(CID_sim_spec, CID_spec);

        //cerr << "Final: " << it->getSequence() << " " << cid_score << endl;

        it->setScore(cid_score);
    }

    id.setHits(hits);
    id.assignRanks();
    hits = id.getHits();

    for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it)
    {
        //cerr << "Fin: " << it->getRank() << " " << it->getSequence() << " " << it->getScore() << " " << endl;
    }

    Size number_of_hits = param_.getValue("number_of_hits");
    if (id.getHits().size() > number_of_hits)
    {
        hits.resize(number_of_hits);
    }

    id.setHits(hits);
    id.assignRanks();

    return;
}
///////////////////////////

START_TEST(IDFilter, "$Id$")

/////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////

using namespace OpenMS;
using namespace std;

///load input data
std::vector<ProteinIdentification> protein_identifications;
std::vector<PeptideIdentification> identifications;
String document_id;
IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("IDFilter_test.idXML"), protein_identifications, identifications, document_id);
PeptideIdentification identification = identifications[0];
ProteinIdentification protein_identification = protein_identifications[0];

/// Proteins for search
vector<FASTAFile::FASTAEntry> proteins;
proteins.push_back(FASTAFile::FASTAEntry("Q824A5", "test description 1", "LHASGITVTEIPVTATNFK"));
proteins.push_back(FASTAFile::FASTAEntry("Q872T5", "test description 2", "THPYGHAIVAGIERYPSK"));

IDFilter* ptr = 0;
IDFilter* nullPointer = 0;

START_SECTION((IDFilter()))
  ptr = new IDFilter();
  TEST_NOT_EQUAL(ptr, nullPointer);
END_SECTION
示例#26
0
    ExitCodes main_(int, const char **)
    {
        //-------------------------------------------------------------
        // parameter handling
        //-------------------------------------------------------------

        //input/output files
        String in(getStringOption_("in"));
        String out(getStringOption_("out"));

        //-------------------------------------------------------------
        // loading input
        //-------------------------------------------------------------

        RichPeakMap exp;
        MzMLFile f;
        f.setLogType(log_type_);
        f.load(in, exp);

        writeDebug_("Data set contains " + String(exp.size()) + " spectra", 1);

        //-------------------------------------------------------------
        // calculations
        //-------------------------------------------------------------

        writeDebug_("Reading model file", 2);

        // create model an set the given options
        PILISModel * model = new PILISModel();
        model->readFromFile(getStringOption_("model_file"));
        Param model_param(model->getParameters());
        model_param.setValue("upper_mz", getDoubleOption_("model:upper_mz"));
        model_param.setValue("lower_mz", getDoubleOption_("model:lower_mz"));
        model_param.setValue("charge_directed_threshold", getDoubleOption_("model:charge_directed_threshold"));
        model_param.setValue("charge_remote_threshold", getDoubleOption_("model:charge_remote_threshold"));
        //model_param.setValue("min_main_ion_intensity", getDoubleOption_("model:min_main_ion_intensity"));
        //model_param.setValue("min_loss_ion_intensity", getDoubleOption_("model:min_loss_ion_intensity"));
        model_param.setValue("min_y_ion_intensity", getDoubleOption_("model:min_y_ion_intensity"));
        model_param.setValue("min_b_ion_intensity", getDoubleOption_("model:min_b_ion_intensity"));
        model_param.setValue("min_a_ion_intensity", getDoubleOption_("model:min_a_ion_intensity"));
        model_param.setValue("min_y_loss_intensity", getDoubleOption_("model:min_y_loss_intensity"));
        model_param.setValue("min_b_loss_intensity", getDoubleOption_("model:min_b_loss_intensity"));
        model_param.setValue("charge_loss_factor", getDoubleOption_("model:charge_loss_factor"));
        model_param.setValue("visible_model_depth", getIntOption_("model:visible_model_depth"));
        model_param.setValue("model_depth", getIntOption_("model:model_depth"));
        model_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
        model->setParameters(model_param);

        writeDebug_("Reading sequence db", 2);

        // create sequence db
        SuffixArrayPeptideFinder * sapf = new SuffixArrayPeptideFinder(getStringOption_("peptide_db_file"), "trypticCompressed");
        sapf->setTolerance(getDoubleOption_("precursor_mass_tolerance"));
        sapf->setNumberOfModifications(0);
        sapf->setUseTags(false);

        //exp.resize(50); // TODO

        UInt max_charge(3), min_charge(1);         // TODO
        vector<double> pre_weights;
        for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it)
        {
            double pre_weight(it->getPrecursors()[0].getMZ());
            for (Size z = min_charge; z <= max_charge; ++z)
            {
                pre_weights.push_back((pre_weight * (double)z) - (double)z);
            }
        }

        sort(pre_weights.begin(), pre_weights.end());

        cerr << "Getting candidates from SA...";
        vector<vector<pair<pair<String, String>, String> > > candidates;
        sapf->getCandidates(candidates, pre_weights);
        cerr << "done" << endl;

        delete sapf;

        map<double, vector<pair<pair<String, String>, String> > > sorted_candidates;
        UInt count(0);
        for (Size count = 0; count != candidates.size(); ++count)
        {
            sorted_candidates[pre_weights[count]] = candidates[count];
        }
        candidates.clear();

        // create ProteinIdentification and set the options
        PILISIdentification PILIS_id;

        PILIS_id.setModel(model);

        Param id_param(PILIS_id.getParameters());
        id_param.setValue("precursor_mass_tolerance", getDoubleOption_("precursor_mass_tolerance"));
        id_param.setValue("max_candidates", getIntOption_("max_pre_candidates"));
        // disable evalue scoring, this is done separately to allow for a single id per spectrum
        id_param.setValue("use_evalue_scoring", 0);
        id_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
        PILIS_id.setParameters(id_param);

        vector<PeptideIdentification> ids;

        // perform the ProteinIdentification of the given spectra
        UInt no(0);
        for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it, ++no)
        {
            if (it->getMSLevel() == 0)
            {
                writeLog_("Warning: MSLevel is 0, assuming MSLevel 2");
                it->setMSLevel(2);
            }

            if (it->getMSLevel() == 2)
            {
                writeDebug_(String(no) + "/" + String(exp.size()), 1);
                PeptideIdentification id;

                map<String, UInt> cand;

                for (UInt z = min_charge; z <= max_charge; ++z)
                {
                    double pre_weight = (it->getPrecursors()[0].getMZ() * (double)z) - (double)z;
                    for (vector<pair<pair<String, String>, String> >::const_iterator cit = sorted_candidates[pre_weight].begin(); cit != sorted_candidates[pre_weight].end(); ++cit)
                    {
                        String seq = cit->first.second;
                        if (seq.size() > 39)
                        {
                            continue;
                        }
                        UInt num_cleavages_sites(0);
                        for (Size k = 0; k != seq.size(); ++k)
                        {
                            if (k != seq.size() - 1)
                            {
                                if ((seq[k] == 'K' || seq[k] == 'R') && seq[k + 1] != 'P')
                                {
                                    ++num_cleavages_sites;
                                }
                            }
                        }

                        if (num_cleavages_sites > 1)
                        {
                            continue;
                        }

                        cand[seq] = z;
                    }
                }

                cerr << "#cand=" << cand.size() << endl;
                PILIS_id.getIdentification(cand, id, *it);

                id.setMetaValue("RT", it->getRT());
                id.setMetaValue("MZ", it->getPrecursors()[0].getMZ());

                ids.push_back(id);

                if (!id.getHits().empty())
                {
                    cerr << it->getPrecursors()[0].getMZ() << " " << AASequence(id.getHits().begin()->getSequence()).getAverageWeight() << endl;
                    writeDebug_(id.getHits().begin()->getSequence().toString() + " (z=" + id.getHits().begin()->getCharge() + "), score=" + String(id.getHits().begin()->getScore()), 10);
                }
            }
        }

        // perform the PILIS scoring to the spectra
        if (!getFlag_("scoring:do_not_use_evalue_scoring"))
        {
            PILISScoring scoring;
            Param scoring_param(scoring.getParameters());
            scoring_param.setValue("use_local_scoring", (int)getFlag_("scoring:use_local_scoring"));
            scoring_param.setValue("survival_function_bin_size", getIntOption_("scoring:survival_function_bin_size"));
            scoring_param.setValue("global_linear_fitting_threshold", getDoubleOption_("scoring:global_linear_fitting_threshold"));
            scoring_param.setValue("local_linear_fitting_threshold", getDoubleOption_("scoring:local_linear_fitting_threshold"));
            scoring.setParameters(scoring_param);

            scoring.getScores(ids);
        }

        // write the result to the IdentificationData structure for the storing
        UInt max_candidates = getIntOption_("max_candidates");
        for (Size i = 0; i != ids.size(); ++i)
        {
            if (ids[i].getHits().size() > max_candidates)
            {
                vector<PeptideHit> hits = ids[i].getHits();
                hits.resize(max_candidates);
                ids[i].setHits(hits);
            }
        }

        delete model;


        //-------------------------------------------------------------
        // writing output
        //-------------------------------------------------------------

        DateTime now;
        now.now();

        String date_string;
        //now.get(date_string); // @todo Fix it (Andreas)
        String identifier("PILIS_" + date_string);

        //UInt count(0);
        count = 0;
        for (RichPeakMap::ConstIterator it = exp.begin(); it != exp.end(); ++it)
        {
            if (it->getMSLevel() == 2)
            {
                ids[count].setMetaValue("RT", it->getRT());
                ids[count].setMetaValue("MZ", it->getPrecursors()[0].getMZ());

                ids[count].setIdentifier(identifier);
                ids[count++].setHigherScoreBetter(false);
            }
        }

        // search parameters
        ProteinIdentification::SearchParameters search_parameters;
        search_parameters.db = getStringOption_("peptide_db_file");
        search_parameters.db_version = "";
        search_parameters.taxonomy = "";
        //search_parameters.charges = getStringOption_("charges");
        search_parameters.mass_type = ProteinIdentification::MONOISOTOPIC;
        vector<String> fixed_mods;
        getStringOption_("fixed_modifications").split(',', fixed_mods);
        search_parameters.fixed_modifications = fixed_mods;
        search_parameters.enzyme = ProteinIdentification::TRYPSIN;
        search_parameters.missed_cleavages = 1;
        search_parameters.peak_mass_tolerance = getDoubleOption_("peak_mass_tolerance");
        search_parameters.precursor_tolerance = getDoubleOption_("precursor_mass_tolerance");

        ProteinIdentification protein_identification;
        protein_identification.setDateTime(now);
        protein_identification.setSearchEngine("PILIS");
        protein_identification.setSearchEngineVersion("beta");
        protein_identification.setSearchParameters(search_parameters);
        protein_identification.setIdentifier(identifier);

        vector<ProteinIdentification> protein_identifications;
        protein_identifications.push_back(protein_identification);
        IdXMLFile().store(out, protein_identifications, ids);

        return EXECUTION_OK;
    }
示例#27
0
END_SECTION

START_SECTION(void load(const String &filename, ProteinIdentification &protein_ids, PeptideIdentification &peptide_ids))
{
	ProtXMLFile f;
	ProteinIdentification proteins;
	PeptideIdentification peptides;
	String prot_file;

  StringList ids = ListUtils::create<String>("16627578304933075941,13229490167902618598");

	// we do this twice, just to check that members are correctly reset etc..
	for (Int i=0;i<2;++i)
	{
		prot_file = OPENMS_GET_TEST_DATA_PATH("ProtXMLFile_input_1.protXML");
		f.load(prot_file, proteins, peptides);
		TEST_EQUAL(proteins.getIdentifier(), ids[i]);
		TEST_EQUAL(peptides.getIdentifier(), ids[i]);
	
		// groups	
		TEST_EQUAL(proteins.getProteinGroups().size(), 7);
		TEST_EQUAL(proteins.getProteinGroups()[0].probability, 0.9990);
		TEST_EQUAL(proteins.getProteinGroups()[0].accessions.size(), 1);
		TEST_EQUAL(proteins.getProteinGroups()[3].accessions.size(), 2);
		TEST_EQUAL(proteins.getProteinGroups()[3].accessions[0], 
							 "P01876|IGHA1_HUMAN");
		TEST_EQUAL(proteins.getProteinGroups()[3].accessions[1], 
							 "P01877|IGHA2_HUMAN");
		TEST_EQUAL(proteins.getProteinGroups()[6].probability, 0.2026);
		TEST_EQUAL(proteins.getProteinGroups()[6].accessions.size(), 1);

		TEST_EQUAL(proteins.getIndistinguishableProteins().size(), 7);
		TEST_EQUAL(proteins.getIndistinguishableProteins()[0].accessions.size(), 1);
		TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions.size(), 2);
		TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions[0], 
							 "P01876|IGHA1_HUMAN");
		TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions[1], 
							 "P01877|IGHA2_HUMAN");
		TEST_EQUAL(proteins.getIndistinguishableProteins()[6].accessions.size(), 1);

		// proteins
		TEST_EQUAL(proteins.getHits().size(), 9);
		TEST_EQUAL(proteins.getHits()[0].getAccession(), "P02787|TRFE_HUMAN");
		TEST_EQUAL(proteins.getHits()[0].getCoverage(), 8.6);
		TEST_EQUAL(proteins.getHits()[0].getScore(), 0.9990);
    // this one is indistinguishable... therefore it should have minimal infos
		TEST_EQUAL(proteins.getHits()[6].getAccession(), "P00739|HPTR_HUMAN");
		TEST_EQUAL(proteins.getHits()[6].getCoverage(), -1);
		TEST_EQUAL(proteins.getHits()[6].getScore(), -1);

		TEST_EQUAL(proteins.getHits()[8].getAccession(), "P04217|A1BG_HUMAN");
		TEST_EQUAL(proteins.getHits()[8].getCoverage(), 2.0);
		TEST_EQUAL(proteins.getHits()[8].getScore(), 0.2026);
	
		// peptides
		TEST_EQUAL(peptides.getHits().size(), 16);
		AASequence aa_seq("MYLGYEYVTAIR");
		TEST_EQUAL(peptides.getHits()[0].getSequence(), aa_seq);
		TEST_EQUAL(peptides.getHits()[0].getCharge(), 2);
		TEST_EQUAL(peptides.getHits()[0].getScore(), 0.8633);
		TEST_EQUAL(peptides.getHits()[0].getProteinAccessions().size(), 1);
		TEST_EQUAL(peptides.getHits()[0].getProteinAccessions()[0], "P02787|TRFE_HUMAN");
		TEST_EQUAL(peptides.getHits()[0].getMetaValue("is_unique"), true);
		TEST_EQUAL(peptides.getHits()[0].getMetaValue("is_contributing"), true);
			
		// load 2 nd file and
		prot_file = OPENMS_GET_TEST_DATA_PATH("ProtXMLFile_input_2.protXML");
		
	}
}
  ExitCodes main_(int, const char**)
  {
    //-------------------------------------------------------------
    // parameter handling
    //-------------------------------------------------------------

    StringList in_spec = getStringList_("in");
    StringList out = getStringList_("out");
    String in_lib = getStringOption_("lib");
    String compare_function = getStringOption_("compare_function");
    Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer");
    float precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance");
    //Int min_precursor_charge = getIntOption_("min_precursor_charge");
    //Int max_precursor_charge = getIntOption_("max_precursor_charge");
    float remove_peaks_below_threshold = getDoubleOption_("filter:remove_peaks_below_threshold");
    UInt min_peaks = getIntOption_("filter:min_peaks");
    UInt max_peaks = getIntOption_("filter:max_peaks");
    Int cut_peaks_below = getIntOption_("filter:cut_peaks_below");
    StringList fixed_modifications = getStringList_("fixed_modifications");
    StringList variable_modifications = getStringList_("variable_modifications");
    Int top_hits  = getIntOption_("top_hits");
    if (top_hits < -1)
    {
      writeLog_("top_hits (should be  >= -1 )");
      return ILLEGAL_PARAMETERS;
    }
    //-------------------------------------------------------------
    // loading input
    //-------------------------------------------------------------
    if (out.size() != in_spec.size())
    {
      writeLog_("out (should be as many as input files)");
      return ILLEGAL_PARAMETERS;
    }

    time_t prog_time = time(NULL);
    MSPFile spectral_library;
    RichPeakMap query, library;
    //spectrum which will be identified
    MzMLFile spectra;
    spectra.setLogType(log_type_);

    time_t start_build_time = time(NULL);
    //-------------------------------------------------------------
    //building map for faster search
    //-------------------------------------------------------------

    //library containing already identified peptide spectra
    vector<PeptideIdentification> ids;
    spectral_library.load(in_lib, ids, library);

    map<Size, vector<PeakSpectrum> > MSLibrary;
    {
      RichPeakMap::iterator s;
      vector<PeptideIdentification>::iterator i;
      ModificationsDB* mdb = ModificationsDB::getInstance();
      for (s = library.begin(), i = ids.begin(); s < library.end(); ++s, ++i)
      {
        double precursor_MZ = (*s).getPrecursors()[0].getMZ();
        Size MZ_multi = (Size)precursor_MZ * precursor_mass_multiplier;
        map<Size, vector<PeakSpectrum> >::iterator found;
        found = MSLibrary.find(MZ_multi);

        PeakSpectrum librar;
        bool variable_modifications_ok = true;
        bool fixed_modifications_ok = true;
        const AASequence& aaseq = i->getHits()[0].getSequence();
        //variable fixed modifications
        if (!fixed_modifications.empty())
        {
          for (Size i = 0; i < aaseq.size(); ++i)
          {
            const   Residue& mod  = aaseq.getResidue(i);
            for (Size s = 0; s < fixed_modifications.size(); ++s)
            {
              if (mod.getOneLetterCode() == mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification())
              {
                fixed_modifications_ok = false;
                break;
              }
            }
          }
        }
        //variable modifications
        if (aaseq.isModified() && (!variable_modifications.empty()))
        {
          for (Size i = 0; i < aaseq.size(); ++i)
          {
            if (aaseq.isModified(i))
            {
              const   Residue& mod  = aaseq.getResidue(i);
              for (Size s = 0; s < variable_modifications.size(); ++s)
              {
                if (mod.getOneLetterCode() == mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification())
                {
                  variable_modifications_ok = false;
                  break;
                }
              }
            }
          }
        }
        if (variable_modifications_ok && fixed_modifications_ok)
        {
          PeptideIdentification& translocate_pid = *i;
          librar.getPeptideIdentifications().push_back(translocate_pid);
          librar.setPrecursors(s->getPrecursors());
          //library entry transformation
          for (UInt l = 0; l < s->size(); ++l)
          {
            Peak1D peak;
            if ((*s)[l].getIntensity() >  remove_peaks_below_threshold)
            {
              const String& info = (*s)[l].getMetaValue("MSPPeakInfo");
              if (info[0] == '?')
              {
                peak.setIntensity(sqrt(0.2 * (*s)[l].getIntensity()));
              }
              else
              {
                peak.setIntensity(sqrt((*s)[l].getIntensity()));
              }

              peak.setMZ((*s)[l].getMZ());
              peak.setPosition((*s)[l].getPosition());
              librar.push_back(peak);
            }
          }
          if (found != MSLibrary.end())
          {
            found->second.push_back(librar);
          }
          else
          {
            vector<PeakSpectrum> tmp;
            tmp.push_back(librar);
            MSLibrary.insert(make_pair(MZ_multi, tmp));
          }
        }
      }
    }
    time_t end_build_time = time(NULL);
    cout << "Time needed for preprocessing data: " << (end_build_time - start_build_time) << "\n";
    //compare function
    PeakSpectrumCompareFunctor* comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function);
    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------
    double score;
    StringList::iterator in, out_file;
    for (in  = in_spec.begin(), out_file  = out.begin(); in < in_spec.end(); ++in, ++out_file)
    {
      time_t start_time = time(NULL);
      spectra.load(*in, query);
      //Will hold valuable hits
      vector<PeptideIdentification> peptide_ids;
      vector<ProteinIdentification> protein_ids;
      // Write parameters to ProteinIdentifcation
      ProteinIdentification prot_id;
      //Parameters of identificaion
      prot_id.setIdentifier("test");
      prot_id.setSearchEngineVersion("SpecLibSearcher");
      prot_id.setDateTime(DateTime::now());
      prot_id.setScoreType(compare_function);
      ProteinIdentification::SearchParameters searchparam;
      searchparam.precursor_tolerance = precursor_mass_tolerance;
      prot_id.setSearchParameters(searchparam);
      /***********SEARCH**********/
      for (UInt j = 0; j < query.size(); ++j)
      {
        //Set identifier for each identifications
        PeptideIdentification pid;
        pid.setIdentifier("test");
        pid.setScoreType(compare_function);
        ProteinHit pr_hit;
        pr_hit.setAccession(j);
        prot_id.insertHit(pr_hit);
        //RichPeak1D to Peak1D transformation for the compare function query
        PeakSpectrum quer;
        bool peak_ok = true;
        query[j].sortByIntensity(true);
        double min_high_intensity = 0;

        if (query[j].empty() || query[j].getMSLevel() != 2)
        {
          continue;
        }
        if (query[j].getPrecursors().empty())
        {
          writeLog_("Warning MS2 spectrum without precursor information");
          continue;
        }

        min_high_intensity = (1 / cut_peaks_below) * query[j][0].getIntensity();

        query[j].sortByPosition();
        for (UInt k = 0; k < query[j].size() && k < max_peaks; ++k)
        {
          if (query[j][k].getIntensity() >  remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity)
          {
            Peak1D peak;
            peak.setIntensity(sqrt(query[j][k].getIntensity()));
            peak.setMZ(query[j][k].getMZ());
            peak.setPosition(query[j][k].getPosition());
            quer.push_back(peak);
          }
        }
        if (quer.size() >= min_peaks)
        {
          peak_ok = true;
        }
        else
        {
          peak_ok = false;
        }
        double query_MZ = query[j].getPrecursors()[0].getMZ();
        if (peak_ok)
        {
          bool charge_one = false;
          Int percent = (Int) Math::round((query[j].size() / 100.0) * 3.0);
          Int margin  = (Int) Math::round((query[j].size() / 100.0) * 1.0);
          for (vector<RichPeak1D>::iterator peak = query[j].end() - 1; percent >= 0; --peak, --percent)
          {
            if (peak->getMZ() < query_MZ)
            {
              break;
            }
          }
          if (percent > margin)
          {
            charge_one = true;
          }
          float min_MZ = (query_MZ - precursor_mass_tolerance) * precursor_mass_multiplier;
          float max_MZ = (query_MZ + precursor_mass_tolerance) * precursor_mass_multiplier;
          for (Size mz = (Size)min_MZ; mz <= ((Size)max_MZ) + 1; ++mz)
          {
            map<Size, vector<PeakSpectrum> >::iterator found;
            found = MSLibrary.find(mz);
            if (found != MSLibrary.end())
            {
              vector<PeakSpectrum>& library = found->second;
              for (Size i = 0; i < library.size(); ++i)
              {
                float this_MZ  = library[i].getPrecursors()[0].getMZ() * precursor_mass_multiplier;
                if (this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) || charge_one == false))
                {
                  PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0];
                  PeakSpectrum& librar = library[i];
                  //Special treatment for SpectraST score as it computes a score based on the whole library
                  if (compare_function == "SpectraSTSimilarityScore")
                  {
                    SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor);
                    BinnedSpectrum quer_bin = sp->transform(quer);
                    BinnedSpectrum librar_bin = sp->transform(librar);
                    score = (*sp)(quer, librar); //(*sp)(quer_bin,librar_bin);
                    double dot_bias = sp->dot_bias(quer_bin, librar_bin, score);
                    hit.setMetaValue("DOTBIAS", dot_bias);
                  }
                  else
                  {
                    score = (*comparor)(quer, librar);
                  }

                  DataValue RT(library[i].getRT());
                  DataValue MZ(library[i].getPrecursors()[0].getMZ());
                  hit.setMetaValue("RT", RT);
                  hit.setMetaValue("MZ", MZ);
                  hit.setScore(score);
                  PeptideEvidence pe;
                  pe.setProteinAccession(pr_hit.getAccession());
                  hit.addPeptideEvidence(pe);
                  pid.insertHit(hit);
                }
              }
            }
          }
        }
        pid.setHigherScoreBetter(true);
        pid.sort();
        if (compare_function == "SpectraSTSimilarityScore")
        {
          if (!pid.empty() && !pid.getHits().empty())
          {
            vector<PeptideHit> final_hits;
            final_hits.resize(pid.getHits().size());
            SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor);
            Size runner_up = 1;
            for (; runner_up < pid.getHits().size(); ++runner_up)
            {
              if (pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString() || runner_up > 5)
              {
                break;
              }
            }
            double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore());
            for (Size s = 0; s < pid.getHits().size(); ++s)
            {
              final_hits[s] = pid.getHits()[s];
              final_hits[s].setMetaValue("delta D", delta_D);
              final_hits[s].setMetaValue("dot product", pid.getHits()[s].getScore());
              final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(), delta_D, pid.getHits()[s].getMetaValue("DOTBIAS")));

              //final_hits[s].removeMetaValue("DOTBIAS");
            }
            pid.setHits(final_hits);
            pid.sort();
            pid.setMZ(query[j].getPrecursors()[0].getMZ());
            pid.setRT(query_MZ);
          }
        }
        if (top_hits != -1 && (UInt)top_hits < pid.getHits().size())
        {
          vector<PeptideHit> hits;
          hits.resize(top_hits);
          for (Size i = 0; i < (UInt)top_hits; ++i)
          {
            hits[i] = pid.getHits()[i];
          }
          pid.setHits(hits);
        }
        peptide_ids.push_back(pid);
      }
      protein_ids.push_back(prot_id);
      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------
      IdXMLFile id_xml_file;
      id_xml_file.store(*out_file, protein_ids, peptide_ids);
      time_t end_time = time(NULL);
      cout << "Search time: " << difftime(end_time, start_time) << " seconds for " << *in << "\n";
    }
    time_t end_time = time(NULL);
    cout << "Total time: " << difftime(end_time, prog_time) << " secconds\n";
    return EXECUTION_OK;
  }
示例#29
0
  void IDFilter::filterIdentificationsByVariableModifications(const PeptideIdentification& identification,
                                                              const vector<String>& fixed_modifications,
                                                              PeptideIdentification& filtered_identification)
  {
    vector<Size> new_peptide_indices;
    vector<PeptideHit> filtered_peptide_hits;

    filtered_identification = identification;
    filtered_identification.setHits(vector<PeptideHit>());

    const vector<PeptideHit>& temp_peptide_hits = identification.getHits();

    for (Size i = 0; i < temp_peptide_hits.size(); i++)
    {
      const AASequence& aa_seq = temp_peptide_hits[i].getSequence();

      /*
       TODO: check these cases
      // check terminal modifications
      if (aa_seq.hasNTerminalModification())
      {
        String unimod_name = aa_seq.getNTerminalModification();
        if (find(fixed_modifications.begin(), fixed_modifications.end(), unimod_name) == fixed_modifications.end())
        {
          new_peptide_indices.push_back(i);
          continue;
        }
      }

      if (aa_seq.hasCTerminalModification())
      {
        String unimod_name = aa_seq.getCTerminalModification();
        if (find(fixed_modifications.begin(), fixed_modifications.end(), unimod_name) == fixed_modifications.end())
        {
          new_peptide_indices.push_back(i);
          continue;
        }
      }
      */
      // check internal modifications
      for (Size j = 0; j != aa_seq.size(); ++j)
      {
        if (aa_seq[j].isModified())
        {
          String unimod_name = aa_seq[j].getModification() + " (" + aa_seq[j].getOneLetterCode() + ")";
          if (find(fixed_modifications.begin(), fixed_modifications.end(), unimod_name) == fixed_modifications.end())
          {
            new_peptide_indices.push_back(i);
            continue;
          }
        }
      }
    }

    for (Size i = 0; i < new_peptide_indices.size(); i++)
    {
      const PeptideHit& ph = temp_peptide_hits[new_peptide_indices[i]];
      filtered_peptide_hits.push_back(ph);
    }
    if (!filtered_peptide_hits.empty())
    {
      filtered_identification.setHits(filtered_peptide_hits);
      filtered_identification.assignRanks();
    }
  }
  void IDDecoyProbability::apply_(vector<PeptideIdentification> & ids, const vector<double> & rev_scores, const vector<double> & fwd_scores, const vector<double> & all_scores)
  {
    Size number_of_bins(param_.getValue("number_of_bins"));



    // normalize distribution to [0, 1]
    vector<double> fwd_scores_normalized(number_of_bins, 0.0), rev_scores_normalized(number_of_bins, 0.0), diff_scores(number_of_bins, 0.0), all_scores_normalized(number_of_bins, 0.0);
    Transformation_ rev_trafo, fwd_trafo, all_trafo;
    normalizeBins_(rev_scores, rev_scores_normalized, rev_trafo);
    normalizeBins_(fwd_scores, fwd_scores_normalized, fwd_trafo);
    normalizeBins_(all_scores, all_scores_normalized, all_trafo);

    // rev scores fitting
    vector<DPosition<2> > rev_data;

    for (Size i = 0; i < number_of_bins; ++i)
    {
      DPosition<2> pos;
      pos.setX(((double)i) / (double)number_of_bins + 0.0001);    // necessary????
      pos.setY(rev_scores_normalized[i]);
      rev_data.push_back(pos);
#ifdef IDDECOYPROBABILITY_DEBUG
      cerr << pos.getX() << " " << pos.getY() << endl;
#endif
    }

    Math::GammaDistributionFitter gdf;
    Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma_1st (1.0, 3.0);
    gdf.setInitialParameters(result_gamma_1st);
    // TODO heuristic for good start parameters
    Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma = gdf.fit(rev_data);

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << gdf.getGnuplotFormula() << endl;
    String rev_filename = param_.getValue("rev_filename");
    generateDistributionImage_(rev_scores_normalized, gdf.getGnuplotFormula(), rev_filename);
#endif

    // generate diffs of distributions
    // get the fwd and rev distribution, apply all_trafo and calculate the diff
    vector<Size> fwd_bins(number_of_bins, 0), rev_bins(number_of_bins, 0);
    double min(all_trafo.min_score), diff(all_trafo.diff_score);
    Size max_bin(0);
    for (vector<double>::const_iterator it = fwd_scores.begin(); it != fwd_scores.end(); ++it)
    {
      Size bin = (Size)((*it - min) / diff * (double)(number_of_bins - 1));
      ++fwd_bins[bin];
      if (fwd_bins[bin] > max_bin)
      {
        max_bin = fwd_bins[bin];
      }
    }

    Size max_reverse_bin(0), max_reverse_bin_value(0);
    //min = rev_trafo.min_score;
    //diff = rev_trafo.diff_score;
    for (vector<double>::const_iterator it = rev_scores.begin(); it != rev_scores.end(); ++it)
    {
      Size bin = (Size)((*it - min) / diff * (double)number_of_bins);
      ++rev_bins[bin];
      if (rev_bins[bin] > max_bin)
      {
        max_bin = rev_bins[bin];
      }
      if (rev_bins[bin] > max_reverse_bin_value)
      {
        max_reverse_bin = bin;
        max_reverse_bin_value = rev_bins[bin];
      }
    }

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Trying to get diff scores" << endl;
#endif

    // get diff of fwd and rev
    for (Size i = 0; i < number_of_bins; ++i)
    {
      Size fwd(0), rev(0);
      fwd = fwd_bins[i];
      rev = rev_bins[i];
      if ((double)fwd > (double)(1.3 * rev) && max_reverse_bin < i)
      {
        diff_scores[i] = (double)(fwd - rev) / (double)max_bin;
      }
      else
      {
        diff_scores[i] = 0.0;
      }
    }
#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Gauss Fitting values size of diff scores=" << diff_scores.size() << endl;
#endif
    // diff scores fitting
    vector<DPosition<2> > diff_data;
    double gauss_A(0), gauss_x0(0), norm_factor(0);
    for (Size i = 0; i < number_of_bins; ++i)
    {
      DPosition<2> pos;
      pos.setX((double)i / (double)number_of_bins);
      pos.setY(diff_scores[i]);

      if (pos.getY() > gauss_A)
      {
        gauss_A = pos.getY();
      }
      gauss_x0 += pos.getX() * pos.getY();
      norm_factor += pos.getY();


      diff_data.push_back(pos);
    }

    double gauss_sigma(0);
    gauss_x0 /= (double)diff_data.size();
    gauss_x0 /= norm_factor;

    for (Size i = 0; i <= number_of_bins; ++i)
    {
      gauss_sigma += fabs(gauss_x0 - (double)i / (double)number_of_bins);
    }

    gauss_sigma /= (double)diff_data.size();



#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "setting initial parameters: " << endl;
#endif
    Math::GaussFitter gf;
    Math::GaussFitter::GaussFitResult result_1st(gauss_A, gauss_x0, gauss_sigma);
    gf.setInitialParameters(result_1st);
#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << "Initial Gauss guess: A=" << gauss_A << ", x0=" << gauss_x0 << ", sigma=" << gauss_sigma << endl;
#endif

    //TODO: fail-to-fit correction was done using the GNUPlotFormula. Seemed to be a hack.
    //Changed it to try-catch-block but I am not sure if this correction should be made
    //at all. Can someone please verify?
    Math::GaussFitter::GaussFitResult result_gauss (gauss_A, gauss_x0, gauss_sigma);
    try{
        result_gauss = gf.fit(diff_data);
    }
    catch(Exception::UnableToFit& /* e */)
    {
      result_gauss.A = gauss_A;
      result_gauss.x0 = gauss_x0;
      result_gauss.sigma = gauss_sigma;
    }

//    // fit failed?
//    if (gf.getGnuplotFormula() == "")
//    {
//      result_gauss.A = gauss_A;
//      result_gauss.x0 = gauss_x0;
//      result_gauss.sigma = gauss_sigma;
//    }

#ifdef IDDECOYPROBABILITY_DEBUG
    cerr << gf.getGnuplotFormula() << endl;
    String fwd_filename = param_.getValue("fwd_filename");
    if (gf.getGnuplotFormula() == "")
    {
      String formula("f(x)=" + String(gauss_A) + " * exp(-(x - " + String(gauss_x0) + ") ** 2 / 2 / (" + String(gauss_sigma) + ") ** 2)");
      generateDistributionImage_(diff_scores, formula, fwd_filename);
    }
    else
    {
      generateDistributionImage_(diff_scores, gf.getGnuplotFormula(), fwd_filename);
    }
#endif

#ifdef IDDECOYPROBABILITY_DEBUG
    //all_trafo.diff_score + all_trafo.min_score
    String gauss_formula("f(x)=" + String(result_gauss.A / all_trafo.max_intensity) + " * exp(-(x - " + String(result_gauss.x0 * all_trafo.diff_score + all_trafo.min_score) + ") ** 2 / 2 / (" + String(result_gauss.sigma * all_trafo.diff_score)   + ") ** 2)");

    String b_str(result_gamma.b), p_str(result_gamma.p);
    String gamma_formula = "g(x)=(" + b_str + " ** " + p_str + ") / gamma(" + p_str + ") * x ** (" + p_str + " - 1) * exp(- " + b_str + " * x)";

    generateDistributionImage_(all_scores_normalized, all_trafo, gauss_formula, gamma_formula, (String)param_.getValue("fwd_filename"));
#endif

    vector<PeptideIdentification> new_prob_ids;
    // calculate the probabilities and write them to the IDs
    for (vector<PeptideIdentification>::const_iterator it = ids.begin(); it != ids.end(); ++it)
    {
      if (it->getHits().size() > 0)
      {
        vector<PeptideHit> hits;
        String score_type = it->getScoreType() + "_score";
        for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit)
        {
          PeptideHit hit = *pit;
          double score = hit.getScore();
          if (!it->isHigherScoreBetter())
          {
            score = -log10(score);
          }
          hit.setMetaValue(score_type, hit.getScore());
          hit.setScore(getProbability_(result_gamma, rev_trafo, result_gauss, fwd_trafo, score));
          hits.push_back(hit);
        }
        PeptideIdentification id = *it;
        id.setHigherScoreBetter(true);
        id.setScoreType(id.getScoreType() + "_DecoyProbability");
        id.setHits(hits);

        new_prob_ids.push_back(id);
      }
    }
    ids = new_prob_ids;
  }