void IDMapper::getIDDetails_(const PeptideIdentification & id, DoubleReal & rt_pep, DoubleList & mz_values, IntList & charges, bool use_avg_mass) const { mz_values.clear(); charges.clear(); rt_pep = id.getMetaValue("RT"); // collect m/z values of pepId if (param_.getValue("mz_reference") == "precursor") // use precursor m/z of pepId { mz_values.push_back(id.getMetaValue("MZ")); } for (vector<PeptideHit>::const_iterator hit_it = id.getHits().begin(); hit_it != id.getHits().end(); ++hit_it) { Int charge = hit_it->getCharge(); charges.push_back(charge); if (param_.getValue("mz_reference") == "peptide") // use mass of each pepHit (assuming H+ adducts) { DoubleReal mass = use_avg_mass ? hit_it->getSequence().getAverageWeight(Residue::Full, charge) : hit_it->getSequence().getMonoWeight(Residue::Full, charge); mz_values.push_back( mass / (DoubleReal) charge); } } }
//Visualizing PeptideIdentification object void MetaDataBrowser::visualize_(PeptideIdentification & meta, QTreeWidgetItem * parent) { PeptideIdentificationVisualizer * visualizer = new PeptideIdentificationVisualizer(isEditable(), this, this); QStringList labels; int id = ws_->addWidget(visualizer); labels << QString("PeptideIdentification %1").arg(meta.getScoreType().c_str()) << QString::number(id); visualizer->load(meta, id); QTreeWidgetItem * item; if (parent == nullptr) { item = new QTreeWidgetItem(treeview_, labels); } else { item = new QTreeWidgetItem(parent, labels); } //check for proteins and peptides hits meta.assignRanks(); //list all peptides hits in the tree for (Size i = 0; i < meta.getHits().size(); ++i) { visualize_(const_cast<PeptideHit &>(meta.getHits()[i]), item); } visualize_(dynamic_cast<MetaInfoInterface &>(meta), item); connectVisualizer_(visualizer); }
bool IDFilter::filterIdentificationsByMetaValueRange(const PeptideIdentification& identification, const String& key, DoubleReal low, DoubleReal high, bool missing) { if (!identification.metaValueExists(key)) return missing; DoubleReal value = identification.getMetaValue(key); return (value >= low) && (value <= high); }
void IDFilter::filterIdentificationsByCharge(const PeptideIdentification& identification, Int min_charge, PeptideIdentification& filtered_identification) { vector<Size> new_peptide_indices; vector<PeptideHit> filtered_peptide_hits; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); const vector<PeptideHit>& temp_peptide_hits = identification.getHits(); for (Size i = 0; i < temp_peptide_hits.size(); i++) { if (temp_peptide_hits[i].getCharge() >= min_charge) { new_peptide_indices.push_back(i); } } for (Size i = 0; i < new_peptide_indices.size(); i++) { filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]); } if (!filtered_peptide_hits.empty()) { filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); } }
void IDFilter::filterIdentificationsByMzError(const PeptideIdentification& identification, DoubleReal mass_error, bool unit_ppm, PeptideIdentification& filtered_identification) { vector<PeptideHit> hits; filtered_identification = identification; vector<PeptideHit> temp_hits = identification.getHits(); for (vector<PeptideHit>::iterator it = temp_hits.begin(); it != temp_hits.end(); ++it) { Int charge = it->getCharge(); if (charge == 0) { charge = 1; } DoubleReal exp_mz = (DoubleReal)identification.getMetaValue("MZ"); DoubleReal theo_mz = (it->getSequence().getMonoWeight() + (DoubleReal)charge * Constants::PROTON_MASS_U) / (DoubleReal)charge; DoubleReal error(exp_mz - theo_mz); if (unit_ppm) { error = error / theo_mz * (DoubleReal)1e6; } if (fabs(error) <= mass_error) { hits.push_back(*it); } } filtered_identification.setHits(hits); }
// Runs the de novo identification on every spectrum of 'exp' and appends one
// PeptideIdentification per spectrum to 'pep_ids'.
//
// RT is copied from the spectrum. The m/z is copied from the first precursor
// when the spectrum has one; a spectrum without precursors leaves the m/z of
// the identification untouched instead of dereferencing begin() of an empty
// precursor list.
void CompNovoIdentificationCID::getIdentifications(vector<PeptideIdentification> & pep_ids, const PeakMap & exp)
{
  for (PeakMap::ConstIterator it = exp.begin(); it != exp.end(); ++it)
  {
    PeptideIdentification id;
    // TODO check if both CID and ETD is present;
    PeakSpectrum CID_spec(*it);
    id.setRT(it->getRT());
    if (!it->getPrecursors().empty())
    {
      id.setMZ(it->getPrecursors().begin()->getMZ());
    }

    // the caches are cleared before each spectrum
    subspec_to_sequences_.clear();
    permute_cache_.clear();
    decomp_cache_.clear();

    getIdentification(id, CID_spec);
    //cerr << "size_of id=" << id.getHits().size() << endl;
    pep_ids.push_back(id);
  }
}
// Equality operator bool PeptideIdentification::operator==(const PeptideIdentification & rhs) const { return MetaInfoInterface::operator==(rhs) && id_ == rhs.id_ && hits_ == rhs.getHits() && significance_threshold_ == rhs.getSignificanceThreshold() && score_type_ == rhs.score_type_ && higher_score_better_ == rhs.higher_score_better_; }
// compare peptide IDs by score of best hit (hits must be sorted first!) // (note to self: the "static" is necessary to avoid cryptic "no matching // function" errors from gcc when the comparator is used below) static bool compareIDs_(const PeptideIdentification & left, const PeptideIdentification & right) { if (left.getHits()[0].getScore() < right.getHits()[0].getScore()) { return true; } return false; }
// Equality operator bool PeptideIdentification::operator==(const PeptideIdentification& rhs) const { return MetaInfoInterface::operator==(rhs) && id_ == rhs.id_ && (rt_ == rhs.rt_ || (!this->hasRT() && !rhs.hasRT())) // might be NaN, so comparing == will always be false && (mz_ == rhs.mz_ || (!this->hasMZ() && !rhs.hasMZ())) // might be NaN, so comparing == will always be false && hits_ == rhs.getHits() && significance_threshold_ == rhs.getSignificanceThreshold() && score_type_ == rhs.score_type_ && higher_score_better_ == rhs.higher_score_better_ && base_name_ == rhs.base_name_; }
// list of peptide hits in "peptide" will be sorted bool MapAlignmentAlgorithmIdentification::hasGoodHit_(PeptideIdentification & peptide) { if (peptide.empty() || peptide.getHits().empty()) return false; peptide.sort(); DoubleReal score = peptide.getHits().begin()->getScore(); if (peptide.isHigherScoreBetter()) return score >= score_threshold_; return score <= score_threshold_; }
void IDFilter::filterIdentificationsByBestHits(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, bool strict) { vector<PeptideHit> filtered_peptide_hits; PeptideHit temp_peptide_hit; vector<Size> new_peptide_indices; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); if (!identification.getHits().empty()) { Real optimal_value = identification.getHits()[0].getScore(); new_peptide_indices.push_back(0); // searching for peptide(s) with maximal score for (Size i = 1; i < identification.getHits().size(); i++) { Real temp_score = identification.getHits()[i].getScore(); bool new_leader = false; if ((identification.isHigherScoreBetter() && (temp_score > optimal_value)) || (!identification.isHigherScoreBetter() && (temp_score < optimal_value))) new_leader = true; if (new_leader) { optimal_value = temp_score; new_peptide_indices.clear(); new_peptide_indices.push_back(i); } else if (temp_score == optimal_value) { new_peptide_indices.push_back(i); } } if (!strict || new_peptide_indices.size() == 1) { for (Size i = 0; i < new_peptide_indices.size(); i++) { filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]); } } } if (!filtered_peptide_hits.empty()) { filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); } }
void IDFilter::filterIdentificationsByProteins(const PeptideIdentification& identification, const vector<FASTAFile::FASTAEntry>& proteins, PeptideIdentification& filtered_identification, bool no_protein_identifiers) { // TODO: this is highly inefficient! the Protein-Index should be build once for all peptide-identifications instead of // doing this once for every ID. Furthermore the index itself is inefficient (use seqan instead) String protein_sequences; String accession_sequences; vector<PeptideHit> filtered_peptide_hits; PeptideHit temp_peptide_hit; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); for (Size i = 0; i < proteins.size(); i++) { if (proteins[i].identifier != "") { accession_sequences.append("*" + proteins[i].identifier); } if (proteins[i].sequence != "") { protein_sequences.append("*" + proteins[i].sequence); } } accession_sequences.append("*"); protein_sequences.append("*"); for (Size i = 0; i < identification.getHits().size(); i++) { if (no_protein_identifiers || accession_sequences == "*") // filter by sequence alone if no protein accesssions are available { if (protein_sequences.find(identification.getHits()[i].getSequence().toUnmodifiedString()) != String::npos) { filtered_peptide_hits.push_back(identification.getHits()[i]); } } else // filter by protein accessions { for (vector<String>::const_iterator ac_it = identification.getHits()[i].getProteinAccessions().begin(); ac_it != identification.getHits()[i].getProteinAccessions().end(); ++ac_it) { if (accession_sequences.find("*" + *ac_it) != String::npos) { filtered_peptide_hits.push_back(identification.getHits()[i]); break; // we found a matching protein, the peptide is valid -> exit } } } } filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); }
std::vector<PeptideIdentification> toPepVec(const QStringList& sl_pep) { std::vector<PeptideIdentification> pep_vec; for (Size i = 0; i < sl_pep.size(); ++i) { PeptideHit hit; hit.setSequence(AASequence::fromString(sl_pep[int(i)])); std::vector<PeptideHit> hits; hits.push_back(hit); PeptideIdentification pi; pi.setHits(hits); pep_vec.push_back(pi); } return pep_vec; }
void IDFilter::filterIdentificationsByRTPValues(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, DoubleReal p_value) { DoubleReal border = 1 - p_value; vector<PeptideHit> filtered_peptide_hits; PeptideHit temp_peptide_hit; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); Size missing_meta_value = 0; for (Size i = 0; i < identification.getHits().size(); i++) { if (identification.getHits()[i].metaValueExists("predicted_RT_p_value")) { if ((DoubleReal)(identification.getHits()[i].getMetaValue("predicted_RT_p_value")) <= border) { filtered_peptide_hits.push_back(identification.getHits()[i]); } } else ++missing_meta_value; } if (missing_meta_value > 0) LOG_WARN << "Filtering identifications by p-value did not work on " << missing_meta_value << " of " << identification.getHits().size() << " hits. Your data is missing a meta-value ('predicted_RT_p_value') from RTPredict!\n"; if (!filtered_peptide_hits.empty()) { filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); } }
void IDFilter::filterIdentificationsByCharge(const PeptideIdentification& identification, Int min_charge, PeptideIdentification& filtered_identification) { filtered_identification = identification; const vector<PeptideHit>& temp_peptide_hits = identification.getHits(); vector<PeptideHit> filtered_peptide_hits; for (Size i = 0; i < temp_peptide_hits.size(); ++i) { if (temp_peptide_hits[i].getCharge() >= min_charge) { filtered_peptide_hits.push_back(temp_peptide_hits[i]); } } filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); }
void IDFilter::filterIdentificationsUnique(const PeptideIdentification& identification, PeptideIdentification& filtered_identification) { vector<PeptideHit> hits; filtered_identification = identification; vector<PeptideHit> temp_hits = identification.getHits(); for (vector<PeptideHit>::iterator it = temp_hits.begin(); it != temp_hits.end(); ++it) { if (find(hits.begin(), hits.end(), *it) == hits.end()) { hits.push_back(*it); } } filtered_identification.setHits(hits); }
void IDFilter::filterIdentificationsUnique(const PeptideIdentification& identification, PeptideIdentification& filtered_identification) { // there's no "PeptideHit::operator<" defined, so we can't use a set nor // "sort" + "unique" from the standard library vector<PeptideHit> hits; filtered_identification = identification; vector<PeptideHit> temp_hits = identification.getHits(); for (vector<PeptideHit>::iterator it = temp_hits.begin(); it != temp_hits.end(); ++it) { if (find(hits.begin(), hits.end(), *it) == hits.end()) { hits.push_back(*it); } } filtered_identification.setHits(hits); }
void IDFilter::filterIdentificationsByExclusionPeptides(const PeptideIdentification& identification, const set<String>& peptides, PeptideIdentification& filtered_identification) { String protein_sequences; String accession_sequences; vector<PeptideHit> filtered_peptide_hits; PeptideHit temp_peptide_hit; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); for (Size i = 0; i < identification.getHits().size(); i++) { if (find(peptides.begin(), peptides.end(), identification.getHits()[i].getSequence().toString()) == peptides.end()) { filtered_peptide_hits.push_back(identification.getHits()[i]); } } if (!filtered_peptide_hits.empty()) { filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); } }
// Copies into 'result' the first entry of 'prot_idents' whose identifier equals
// the identifier of 'pep_ident'. 'result' is left untouched if none matches.
void IDRipper::getProteinIdentification_(ProteinIdentification & result, PeptideIdentification pep_ident, std::vector<ProteinIdentification> & prot_idents)
{
  const String & wanted_id = pep_ident.getIdentifier();

  vector<ProteinIdentification>::iterator candidate = prot_idents.begin();
  while (candidate != prot_idents.end())
  {
    if (wanted_id.compare(candidate->getIdentifier()) == 0)
    {
      result = *candidate;
      return;
    }
    ++candidate;
  }
}
void IDFilter::filterIdentificationsByLength(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, Size min_length, Size max_length) { vector<Size> new_peptide_indices; vector<PeptideHit> filtered_peptide_hits; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); Size ml = max_length; if (max_length < min_length) { ml = UINT_MAX; } const vector<PeptideHit>& temp_peptide_hits = identification.getHits(); for (Size i = 0; i < temp_peptide_hits.size(); i++) { if (temp_peptide_hits[i].getSequence().size() >= min_length && temp_peptide_hits[i].getSequence().size() <= ml) { new_peptide_indices.push_back(i); } } for (Size i = 0; i < new_peptide_indices.size(); i++) { filtered_peptide_hits.push_back(identification.getHits()[new_peptide_indices[i]]); } if (!filtered_peptide_hits.empty()) { filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); } }
void IDFilter::filterIdentificationsByLength(const PeptideIdentification& identification, PeptideIdentification& filtered_identification, Size min_length, Size max_length) { filtered_identification = identification; if (max_length < min_length) { max_length = UINT_MAX; } const vector<PeptideHit>& temp_peptide_hits = identification.getHits(); vector<PeptideHit> filtered_peptide_hits; for (Size i = 0; i < temp_peptide_hits.size(); ++i) { if (min_length <= temp_peptide_hits[i].getSequence().size() && temp_peptide_hits[i].getSequence().size() <= max_length) { filtered_peptide_hits.push_back(temp_peptide_hits[i]); } } filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); }
// static bool IDConflictResolverAlgorithm::compareIDsSmallerScores_(const PeptideIdentification & left, const PeptideIdentification & right) { // if any of them is empty, the other is considered "greater" // independent of the score in the first hit if (left.getHits().empty() || right.getHits().empty()) { // also: for strict weak ordering, comp(x,x) needs to be false return left.getHits().size() < right.getHits().size(); } if (left.getHits()[0].getScore() < right.getHits()[0].getScore()) { return true; } return false; }
void IDFilter::filterIdentificationsByExclusionPeptides(const PeptideIdentification& identification, const set<String>& peptides, bool ignore_modifications, PeptideIdentification& filtered_identification) { vector<PeptideHit> filtered_peptide_hits; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); for (Size i = 0; i < identification.getHits().size(); i++) { String query = ignore_modifications ? identification.getHits()[i].getSequence().toUnmodifiedString() : identification.getHits()[i].getSequence().toString(); if (find(peptides.begin(), peptides.end(), query) == peptides.end()) { filtered_peptide_hits.push_back(identification.getHits()[i]); } } if (!filtered_peptide_hits.empty()) { filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); } }
// Generates peptide hits for a single CID spectrum and stores them in 'id'.
//
// Steps, in order:
//  1. work on a copy of the spectrum: windowMower_(copy, 0.3, 1), then
//     normalization with method "to_one";
//  2. determine charge (default 2, or the charge of the first precursor when it
//     is non-zero) and the [M+H]+ precursor weight;
//  3. cut the copy at the first peak located above the precursor weight and
//     append two peaks of intensity 1 (oxonium mass, precursor weight);
//  4. score the ions, sort the copy by position and collect candidate
//     sequences via getDecompositionsDAC_();
//  5. pre-score the candidates with zhang_, rank them, keep at most
//     "number_of_prescoring_hits", re-score those with a SpectrumAlignmentScore,
//     rank again and keep at most "number_of_hits".
//
// Side effect: the parameters of the member 'zhang_' are overwritten.
//
// @param id        [in/out] receives the ranked hits
// @param CID_spec  spectrum to identify (not modified)
void CompNovoIdentificationCID::getIdentification(PeptideIdentification & id, const PeakSpectrum & CID_spec)
{
  //if (CID_spec.getPrecursors().begin()->getMZ() > 1000.0)
  //{
  //cerr << "Weight of precursor has been estimated to exceed 2000.0 Da which is the current limit" << endl;
  //return;
  //}

  // preprocessing happens on a copy; the original spectrum is used for scoring below
  PeakSpectrum new_CID_spec(CID_spec);
  windowMower_(new_CID_spec, 0.3, 1);

  // configure the member scorer used for pre-scoring
  Param zhang_param;
  zhang_param = zhang_.getParameters();
  zhang_param.setValue("tolerance", fragment_mass_tolerance_);
  zhang_param.setValue("use_gaussian_factor", "true");
  zhang_param.setValue("use_linear_factor", "false");
  zhang_.setParameters(zhang_param);

  Normalizer normalizer;
  Param n_param(normalizer.getParameters());
  n_param.setValue("method", "to_one");
  normalizer.setParameters(n_param);
  normalizer.filterSpectrum(new_CID_spec);

  Size charge(2);
  double precursor_weight(0);   // [M+H]+
  if (!CID_spec.getPrecursors().empty())
  {
    // believe charge of spectrum?
    if (CID_spec.getPrecursors().begin()->getCharge() != 0)
    {
      charge = CID_spec.getPrecursors().begin()->getCharge();
    }
    else
    {
      // TODO estimate charge state
    }
    // m/z * z minus (z - 1) proton masses
    precursor_weight = CID_spec.getPrecursors().begin()->getMZ() * charge - ((charge - 1) * Constants::PROTON_MASS_U);
  }

  //cerr << "charge=" << charge << ", [M+H]=" << precursor_weight << endl;

  // now delete all peaks that are right of the estimated precursor weight
  // NOTE(review): cutting at the first such peak presumably relies on the peaks
  // being sorted by position at this point - confirm
  Size peak_counter(0);
  for (PeakSpectrum::ConstIterator it = new_CID_spec.begin(); it != new_CID_spec.end(); ++it, ++peak_counter)
  {
    if (it->getPosition()[0] > precursor_weight)
    {
      break;
    }
  }
  if (peak_counter < new_CID_spec.size())
  {
    new_CID_spec.resize(peak_counter);
  }

  // computed once (function-local static)
  static double oxonium_mass = EmpiricalFormula("H2O+").getMonoWeight();

  // append two peaks of intensity 1: one at the oxonium mass, one at the precursor weight
  Peak1D p;
  p.setIntensity(1);
  p.setPosition(oxonium_mass);

  new_CID_spec.push_back(p);

  p.setPosition(precursor_weight);
  new_CID_spec.push_back(p);

  // add complement to spectrum
  /*
  for (PeakSpectrum::ConstIterator it1 = CID_spec.begin(); it1 != CID_spec.end(); ++it1)
  {
      // get m/z of complement
      double mz_comp = precursor_weight - it1->getPosition()[0] + Constants::PROTON_MASS_U;

      // search if peaks are available that have similar m/z values
      Size count(0);
      bool found(false);
      for (PeakSpectrum::ConstIterator it2 = CID_spec.begin(); it2 != CID_spec.end(); ++it2, ++count)
      {
          if (fabs(mz_comp - it2->getPosition()[0]) < fragment_mass_tolerance)
          {
              // add peak intensity to corresponding peak in new_CID_spec
              new_CID_spec[count].setIntensity(new_CID_spec[count].getIntensity());
          }
      }
      if (!found)
      {
          // infer this peak
          Peak1D p;
          p.setIntensity(it1->getIntensity());
          p.setPosition(mz_comp);
          new_CID_spec.push_back(p);
      }
  }*/

  // ion scoring, configured from members and from param_
  CompNovoIonScoringCID ion_scoring;
  Param ion_scoring_param(ion_scoring.getParameters());
  ion_scoring_param.setValue("fragment_mass_tolerance", fragment_mass_tolerance_);
  ion_scoring_param.setValue("precursor_mass_tolerance", precursor_mass_tolerance_);
  ion_scoring_param.setValue("decomp_weights_precision", decomp_weights_precision_);
  ion_scoring_param.setValue("double_charged_iso_threshold", (double)param_.getValue("double_charged_iso_threshold"));
  ion_scoring_param.setValue("max_isotope_to_score", param_.getValue("max_isotope_to_score"));
  ion_scoring_param.setValue("max_isotope", max_isotope_);
  ion_scoring.setParameters(ion_scoring_param);

  Map<double, IonScore> ion_scores;
  ion_scoring.scoreSpectrum(ion_scores, new_CID_spec, precursor_weight, charge);

  // the two peaks appended above were pushed to the back; sort the spectrum by position
  new_CID_spec.sortByPosition();

  /*
  cerr << "Size of ion_scores " << ion_scores.size() << endl;
  for (Map<double, IonScore>::const_iterator it = ion_scores.begin(); it != ion_scores.end(); ++it)
  {
      cerr << it->first << " " << it->second.score << endl;
  }*/

#ifdef WRITE_SCORED_SPEC
  // debugging aid: write the ion scores as a spectrum to "spec_scored.dta"
  PeakSpectrum filtered_spec(new_CID_spec);
  filtered_spec.clear();
  for (Map<double, CompNovoIonScoringCID::IonScore>::const_iterator it = ion_scores.begin(); it != ion_scores.end(); ++it)
  {
    Peak1D p;
    p.setIntensity(it->second.score);
    p.setPosition(it->first);
    filtered_spec.push_back(p);
  }
  DTAFile().store("spec_scored.dta", filtered_spec);
#endif

  // collect candidate sequences over the whole peak index range [0, size - 1]
  set<String> sequences;
  getDecompositionsDAC_(sequences, 0, new_CID_spec.size() - 1, precursor_weight, new_CID_spec, ion_scores);

#ifdef SPIKE_IN
  // fixed peptide sequences added to the candidate set when SPIKE_IN is defined
  sequences.insert("AFCVDGEGR");
  sequences.insert("APEFAAPWPDFVPR");
  sequences.insert("AVKQFEESQGR");
  sequences.insert("CCTESLVNR");
  sequences.insert("DAFLGSFLYEYSR");
  sequences.insert("DAIPENLPPLTADFAEDK");
  sequences.insert("DDNKVEDIWSFLSK");
  sequences.insert("DDPHACYSTVFDK");
  sequences.insert("DEYELLCLDGSR");
  sequences.insert("DGAESYKELSVLLPNR");
  sequences.insert("DGASCWCVDADGR");
  sequences.insert("DLFIPTCLETGEFAR");
  sequences.insert("DTHKSEIAHR");
  sequences.insert("DVCKNYQEAK");
  sequences.insert("EACFAVEGPK");
  sequences.insert("ECCHGDLLECADDR");
  sequences.insert("EFLGDKFYTVISSLK");
  sequences.insert("EFTPVLQADFQK");
  sequences.insert("ELFLDSGIFQPMLQGR");
  sequences.insert("ETYGDMADCCEK");
  sequences.insert("EVGCPSSSVQEMVSCLR");
  sequences.insert("EYEATLEECCAK");
  sequences.insert("FADLIQSGTFQLHLDSK");
  sequences.insert("FFSASCVPGATIEQK");
  sequences.insert("FLANVSTVLTSK");
  sequences.insert("FLSGSDYAIR");
  sequences.insert("FTASCPPSIK");
  sequences.insert("GAIEWEGIESGSVEQAVAK");
  sequences.insert("GDVAFIQHSTVEENTGGK");
  sequences.insert("GEPPSCAEDQSCPSER");
  sequences.insert("GEYVPTSLTAR");
  sequences.insert("GQEFTITGQKR");
  sequences.insert("GTFAALSELHCDK");
  sequences.insert("HLVDEPQNLIK");
  sequences.insert("HQDCLVTTLQTQPGAVR");
  sequences.insert("HTTVNENAPDQK");
  sequences.insert("ILDCGSPDTEVR");
  sequences.insert("KCPSPCQLQAER");
  sequences.insert("KGTEFTVNDLQGK");
  sequences.insert("KQTALVELLK");
  sequences.insert("KVPQVSTPTLVEVSR");
  sequences.insert("LALQFTTNAKR");
  sequences.insert("LCVLHEKTPVSEK");
  sequences.insert("LFTFHADICTLPDTEK");
  sequences.insert("LGEYGFQNALIVR");
  sequences.insert("LHVDPENFK");
  sequences.insert("LKECCDKPLLEK");
  sequences.insert("LKHLVDEPQNLIK");
  sequences.insert("LKPDPNTLCDEFK");
  sequences.insert("LLGNVLVVVLAR");
  sequences.insert("LLVVYPWTQR");
  sequences.insert("LRVDPVNFK");
  sequences.insert("LTDEELAFPPLSPSR");
  sequences.insert("LVNELTEFAK");
  sequences.insert("MFLSFPTTK");
  sequences.insert("MPCTEDYLSLILNR");
  sequences.insert("NAPYSGYSGAFHCLK");
  sequences.insert("NECFLSHKDDSPDLPK");
  sequences.insert("NEPNKVPACPGSCEEVK");
  sequences.insert("NLQMDDFELLCTDGR");
  sequences.insert("QAGVQAEPSPK");
  sequences.insert("RAPEFAAPWPDFVPR");
  sequences.insert("RHPEYAVSVLLR");
  sequences.insert("RPCFSALTPDETYVPK");
  sequences.insert("RSLLLAPEEGPVSQR");
  sequences.insert("SAFPPEPLLCSVQR");
  sequences.insert("SAGWNIPIGTLLHR");
  sequences.insert("SCWCVDEAGQK");
  sequences.insert("SGNPNYPHEFSR");
  sequences.insert("SHCIAEVEK");
  sequences.insert("SISSGFFECER");
  sequences.insert("SKYLASASTMDHAR");
  sequences.insert("SLHTLFGDELCK");
  sequences.insert("SLLLAPEEGPVSQR");
  sequences.insert("SPPQCSPDGAFRPVQCK");
  sequences.insert("SREGDPLAVYLK");
  sequences.insert("SRQIPQCPTSCER");
  sequences.insert("TAGTPVSIPVCDDSSVK");
  sequences.insert("TCVADESHAGCEK");
  sequences.insert("TQFGCLEGFGR");
  sequences.insert("TVMENFVAFVDK");
  sequences.insert("TYFPHFDLSHGSAQVK");
  sequences.insert("TYMLAFDVNDEK");
  sequences.insert("VDEVGGEALGR");
  sequences.insert("VDLLIGSSQDDGLINR");
  sequences.insert("VEDIWSFLSK");
  sequences.insert("VGGHAAEYGAEALER");
  sequences.insert("VGTRCCTKPESER");
  sequences.insert("VKVDEVGGEALGR");
  sequences.insert("VKVDLLIGSSQDDGLINR");
  sequences.insert("VLDSFSNGMK");
  sequences.insert("VLSAADKGNVK");
  sequences.insert("VPQVSTPTLVEVSR");
  sequences.insert("VTKCCTESLVNR");
  sequences.insert("VVAASDASQDALGCVK");
  sequences.insert("VVAGVANALAHR");
  sequences.insert("YICDNQDTISSK");
  sequences.insert("YLASASTMDHAR");
  sequences.insert("YNGVFQECCQAEDK");
#endif

  // NOTE: 'spectra_zhang' is configured here, but its only use below is commented out
  SpectrumAlignmentScore spectra_zhang;
  spectra_zhang.setParameters(zhang_param);

  // first pass: score every candidate that does not exceed the allowed missed cleavages
  vector<PeptideHit> hits;
  Size missed_cleavages = param_.getValue("missed_cleavages");
  for (set<String>::const_iterator it = sequences.begin(); it != sequences.end(); ++it)
  {
    Size num_missed = countMissedCleavagesTryptic_(*it);
    if (missed_cleavages < num_missed)
    {
      //cerr << "Two many missed cleavages: " << *it << ", found " << num_missed << ", allowed " << missed_cleavages << endl;
      continue;
    }
    PeakSpectrum CID_sim_spec;
    getCIDSpectrum_(CID_sim_spec, *it, charge);

    //normalizer.filterSpectrum(CID_sim_spec);

    // the simulated spectrum is compared against the unprocessed input spectrum
    double cid_score = zhang_(CID_sim_spec, CID_spec);

    PeptideHit hit;
    hit.setScore(cid_score);

    hit.setSequence(getModifiedAASequence_(*it));
    hit.setCharge((Int)charge);   //TODO unify charge interface: int or size?
    hits.push_back(hit);
    //cerr << getModifiedAASequence_(*it) << " " << cid_score << " " << endl;
  }

  // rescore the top hits
  id.setHits(hits);
  id.assignRanks();

  // read the hits back from 'id' after rank assignment
  hits = id.getHits();

  SpectrumAlignmentScore alignment_score;
  Param align_param(alignment_score.getParameters());
  align_param.setValue("tolerance", fragment_mass_tolerance_);
  align_param.setValue("use_linear_factor", "true");
  alignment_score.setParameters(align_param);

  // loop body only holds disabled debug output
  for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it)
  {
    //cerr << "Pre: " << it->getRank() << " " << it->getSequence() << " " << it->getScore() << " " << endl;
  }

  // keep at most "number_of_prescoring_hits" candidates for the second pass
  Size number_of_prescoring_hits = param_.getValue("number_of_prescoring_hits");
  if (hits.size() > number_of_prescoring_hits)
  {
    hits.resize(number_of_prescoring_hits);
  }

  // second pass: normalized simulated spectrum, scored with 'alignment_score'
  for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it)
  {
    PeakSpectrum CID_sim_spec;
    getCIDSpectrum_(CID_sim_spec, getModifiedStringFromAASequence_(it->getSequence()), charge);

    normalizer.filterSpectrum(CID_sim_spec);

    //DTAFile().store("sim_specs/" + it->getSequence().toUnmodifiedString() + "_sim_CID.dta", CID_sim_spec);

    //double cid_score = spectra_zhang(CID_sim_spec, CID_spec);
    double cid_score = alignment_score(CID_sim_spec, CID_spec);

    //cerr << "Final: " << it->getSequence() << " " << cid_score << endl;

    it->setScore(cid_score);
  }

  id.setHits(hits);
  id.assignRanks();

  hits = id.getHits();

  // loop body only holds disabled debug output
  for (vector<PeptideHit>::iterator it = hits.begin(); it != hits.end(); ++it)
  {
    //cerr << "Fin: " << it->getRank() << " " << it->getSequence() << " " << it->getScore() << " " << endl;
  }

  // keep at most "number_of_hits" final hits
  Size number_of_hits = param_.getValue("number_of_hits");
  if (id.getHits().size() > number_of_hits)
  {
    hits.resize(number_of_hits);
  }

  id.setHits(hits);
  id.assignRanks();

  return;
}
/////////////////////////// START_TEST(IDFilter, "$Id$") ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// using namespace OpenMS; using namespace std; ///load input data std::vector<ProteinIdentification> protein_identifications; std::vector<PeptideIdentification> identifications; String document_id; IdXMLFile().load(OPENMS_GET_TEST_DATA_PATH("IDFilter_test.idXML"), protein_identifications, identifications, document_id); PeptideIdentification identification = identifications[0]; ProteinIdentification protein_identification = protein_identifications[0]; /// Proteins for search vector<FASTAFile::FASTAEntry> proteins; proteins.push_back(FASTAFile::FASTAEntry("Q824A5", "test description 1", "LHASGITVTEIPVTATNFK")); proteins.push_back(FASTAFile::FASTAEntry("Q872T5", "test description 2", "THPYGHAIVAGIERYPSK")); IDFilter* ptr = 0; IDFilter* nullPointer = 0; START_SECTION((IDFilter())) ptr = new IDFilter(); TEST_NOT_EQUAL(ptr, nullPointer); END_SECTION
/**
  @brief Tool entry point: identifies the MS2 spectra of the input map with PILIS and stores the result as idXML.

  Steps:
  - load the mzML input and configure the PILIS model from the tool options,
  - collect the precursor weights of all spectra (charges 1 to 3) and fetch the
    peptide candidates for them from the suffix array in a single call,
  - identify every MS2 spectrum (MS level 0 is treated as MS level 2),
  - optionally apply the PILIS e-value scoring to all identifications,
  - truncate every hit list to "max_candidates" and write the idXML output.

  Spectra without precursor information are skipped during the candidate lookup
  and receive an empty identification; element 0 of an empty precursor list is
  never accessed.

  @return EXECUTION_OK after the output file has been written
*/
ExitCodes main_(int, const char **)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------

  //input/output files
  String in(getStringOption_("in"));
  String out(getStringOption_("out"));

  //-------------------------------------------------------------
  // loading input
  //-------------------------------------------------------------

  RichPeakMap exp;
  MzMLFile f;
  f.setLogType(log_type_);
  f.load(in, exp);

  writeDebug_("Data set contains " + String(exp.size()) + " spectra", 1);

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  writeDebug_("Reading model file", 2);

  // create model and set the given options
  PILISModel * model = new PILISModel();
  model->readFromFile(getStringOption_("model_file"));
  Param model_param(model->getParameters());
  model_param.setValue("upper_mz", getDoubleOption_("model:upper_mz"));
  model_param.setValue("lower_mz", getDoubleOption_("model:lower_mz"));
  model_param.setValue("charge_directed_threshold", getDoubleOption_("model:charge_directed_threshold"));
  model_param.setValue("charge_remote_threshold", getDoubleOption_("model:charge_remote_threshold"));
  //model_param.setValue("min_main_ion_intensity", getDoubleOption_("model:min_main_ion_intensity"));
  //model_param.setValue("min_loss_ion_intensity", getDoubleOption_("model:min_loss_ion_intensity"));
  model_param.setValue("min_y_ion_intensity", getDoubleOption_("model:min_y_ion_intensity"));
  model_param.setValue("min_b_ion_intensity", getDoubleOption_("model:min_b_ion_intensity"));
  model_param.setValue("min_a_ion_intensity", getDoubleOption_("model:min_a_ion_intensity"));
  model_param.setValue("min_y_loss_intensity", getDoubleOption_("model:min_y_loss_intensity"));
  model_param.setValue("min_b_loss_intensity", getDoubleOption_("model:min_b_loss_intensity"));
  model_param.setValue("charge_loss_factor", getDoubleOption_("model:charge_loss_factor"));
  model_param.setValue("visible_model_depth", getIntOption_("model:visible_model_depth"));
  model_param.setValue("model_depth", getIntOption_("model:model_depth"));
  model_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
  model->setParameters(model_param);

  writeDebug_("Reading sequence db", 2);

  // create sequence db
  SuffixArrayPeptideFinder * sapf = new SuffixArrayPeptideFinder(getStringOption_("peptide_db_file"), "trypticCompressed");
  sapf->setTolerance(getDoubleOption_("precursor_mass_tolerance"));
  sapf->setNumberOfModifications(0);
  sapf->setUseTags(false);

  //exp.resize(50); // TODO

  UInt max_charge(3), min_charge(1);   // TODO

  // one weight per (spectrum, charge) pair, computed as m/z * z - z
  vector<double> pre_weights;
  for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it)
  {
    if (it->getPrecursors().empty())
    {
      // nothing to look up for spectra without a precursor (e.g. survey scans)
      continue;
    }
    double precursor_mz(it->getPrecursors()[0].getMZ());
    for (Size z = min_charge; z <= max_charge; ++z)
    {
      pre_weights.push_back((precursor_mz * (double)z) - (double)z);
    }
  }

  sort(pre_weights.begin(), pre_weights.end());

  typedef vector<pair<pair<String, String>, String> > CandidateList;

  cerr << "Getting candidates from SA...";
  vector<CandidateList> candidates;
  sapf->getCandidates(candidates, pre_weights);
  cerr << "done" << endl;

  delete sapf;

  // index the candidate lists by the weight they were requested for
  map<double, CandidateList> sorted_candidates;
  for (Size c = 0; c != candidates.size(); ++c)
  {
    sorted_candidates[pre_weights[c]] = candidates[c];
  }
  candidates.clear();

  // create ProteinIdentification and set the options
  PILISIdentification PILIS_id;

  PILIS_id.setModel(model);

  Param id_param(PILIS_id.getParameters());
  id_param.setValue("precursor_mass_tolerance", getDoubleOption_("precursor_mass_tolerance"));
  id_param.setValue("max_candidates", getIntOption_("max_pre_candidates"));
  // disable evalue scoring, this is done separately to allow for a single id per spectrum
  id_param.setValue("use_evalue_scoring", 0);
  id_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
  PILIS_id.setParameters(id_param);

  // exactly one entry per MS2 spectrum, in spectrum order (the output step relies on this)
  vector<PeptideIdentification> ids;

  // perform the ProteinIdentification of the given spectra
  UInt no(0);
  for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it, ++no)
  {
    if (it->getMSLevel() == 0)
    {
      writeLog_("Warning: MSLevel is 0, assuming MSLevel 2");
      it->setMSLevel(2);
    }

    if (it->getMSLevel() != 2)
    {
      continue;
    }

    writeDebug_(String(no) + "/" + String(exp.size()), 1);
    PeptideIdentification id;

    if (it->getPrecursors().empty())
    {
      // keep a placeholder so that ids stays aligned with the MS2 spectra
      writeLog_("Warning: MS2 spectrum without precursor information, skipping identification");
      id.setMetaValue("RT", it->getRT());
      ids.push_back(id);
      continue;
    }

    const double precursor_mz = it->getPrecursors()[0].getMZ();

    // candidate sequence -> charge (a later charge overwrites an earlier one)
    map<String, UInt> cand;

    for (UInt z = min_charge; z <= max_charge; ++z)
    {
      // same expression as used when filling pre_weights, so the map key matches
      double pre_weight = (precursor_mz * (double)z) - (double)z;
      const CandidateList & weight_candidates = sorted_candidates[pre_weight];
      for (CandidateList::const_iterator cit = weight_candidates.begin(); cit != weight_candidates.end(); ++cit)
      {
        String seq = cit->first.second;
        if (seq.size() > 39)
        {
          continue;
        }

        // count K/R not followed by P, ignoring the last residue
        UInt num_cleavages_sites(0);
        for (Size k = 0; k + 1 < seq.size(); ++k)
        {
          if ((seq[k] == 'K' || seq[k] == 'R') && seq[k + 1] != 'P')
          {
            ++num_cleavages_sites;
          }
        }

        // allow at most one internal cleavage site
        if (num_cleavages_sites > 1)
        {
          continue;
        }

        cand[seq] = z;
      }
    }

    cerr << "#cand=" << cand.size() << endl;
    PILIS_id.getIdentification(cand, id, *it);

    id.setMetaValue("RT", it->getRT());
    id.setMetaValue("MZ", precursor_mz);

    ids.push_back(id);

    if (!id.getHits().empty())
    {
      cerr << it->getPrecursors()[0].getMZ() << " " << AASequence(id.getHits().begin()->getSequence()).getAverageWeight() << endl;
      writeDebug_(id.getHits().begin()->getSequence().toString() + " (z=" + id.getHits().begin()->getCharge() + "), score=" + String(id.getHits().begin()->getScore()), 10);
    }
  }

  // perform the PILIS scoring to the spectra
  if (!getFlag_("scoring:do_not_use_evalue_scoring"))
  {
    PILISScoring scoring;
    Param scoring_param(scoring.getParameters());
    scoring_param.setValue("use_local_scoring", (int)getFlag_("scoring:use_local_scoring"));
    scoring_param.setValue("survival_function_bin_size", getIntOption_("scoring:survival_function_bin_size"));
    scoring_param.setValue("global_linear_fitting_threshold", getDoubleOption_("scoring:global_linear_fitting_threshold"));
    scoring_param.setValue("local_linear_fitting_threshold", getDoubleOption_("scoring:local_linear_fitting_threshold"));
    scoring.setParameters(scoring_param);

    scoring.getScores(ids);
  }

  // write the result to the IdentificationData structure for the storing
  UInt max_candidates = getIntOption_("max_candidates");
  for (Size i = 0; i != ids.size(); ++i)
  {
    if (ids[i].getHits().size() > max_candidates)
    {
      vector<PeptideHit> hits = ids[i].getHits();
      hits.resize(max_candidates);
      ids[i].setHits(hits);
    }
  }

  delete model;

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------

  DateTime now;
  now.now();

  String date_string;
  //now.get(date_string); // @todo Fix it (Andreas)
  String identifier("PILIS_" + date_string);

  // walk the MS2 spectra in the same order as above to address their identification
  Size id_index(0);
  for (RichPeakMap::ConstIterator it = exp.begin(); it != exp.end(); ++it)
  {
    if (it->getMSLevel() == 2)
    {
      PeptideIdentification & current_id = ids[id_index++];
      current_id.setMetaValue("RT", it->getRT());
      if (!it->getPrecursors().empty())
      {
        current_id.setMetaValue("MZ", it->getPrecursors()[0].getMZ());
      }
      current_id.setIdentifier(identifier);
      current_id.setHigherScoreBetter(false);
    }
  }

  // search parameters
  ProteinIdentification::SearchParameters search_parameters;
  search_parameters.db = getStringOption_("peptide_db_file");
  search_parameters.db_version = "";
  search_parameters.taxonomy = "";
  //search_parameters.charges = getStringOption_("charges");
  search_parameters.mass_type = ProteinIdentification::MONOISOTOPIC;
  vector<String> fixed_mods;
  getStringOption_("fixed_modifications").split(',', fixed_mods);
  search_parameters.fixed_modifications = fixed_mods;
  search_parameters.enzyme = ProteinIdentification::TRYPSIN;
  search_parameters.missed_cleavages = 1;
  search_parameters.peak_mass_tolerance = getDoubleOption_("peak_mass_tolerance");
  search_parameters.precursor_tolerance = getDoubleOption_("precursor_mass_tolerance");

  ProteinIdentification protein_identification;
  protein_identification.setDateTime(now);
  protein_identification.setSearchEngine("PILIS");
  protein_identification.setSearchEngineVersion("beta");
  protein_identification.setSearchParameters(search_parameters);
  protein_identification.setIdentifier(identifier);

  vector<ProteinIdentification> protein_identifications;
  protein_identifications.push_back(protein_identification);
  IdXMLFile().store(out, protein_identifications, ids);

  return EXECUTION_OK;
}
END_SECTION

// Test of ProtXMLFile::load(): loads a protXML file and checks identifiers,
// protein groups, indistinguishable proteins, protein hits and peptide hits.
START_SECTION(void load(const String &filename, ProteinIdentification &protein_ids, PeptideIdentification &peptide_ids))
{
  ProtXMLFile f;
  ProteinIdentification proteins;
  PeptideIdentification peptides;
  String prot_file;

  // expected identifiers of the first and the second load() call
  StringList ids = ListUtils::create<String>("16627578304933075941,13229490167902618598");

  // we do this twice, just to check that members are correctly reset etc..
  for (Int i=0;i<2;++i)
  {
    prot_file = OPENMS_GET_TEST_DATA_PATH("ProtXMLFile_input_1.protXML");
    f.load(prot_file, proteins, peptides);
    // both result objects must carry the identifier expected for this pass
    TEST_EQUAL(proteins.getIdentifier(), ids[i]);
    TEST_EQUAL(peptides.getIdentifier(), ids[i]);

    // groups
    TEST_EQUAL(proteins.getProteinGroups().size(), 7);
    TEST_EQUAL(proteins.getProteinGroups()[0].probability, 0.9990);
    TEST_EQUAL(proteins.getProteinGroups()[0].accessions.size(), 1);
    TEST_EQUAL(proteins.getProteinGroups()[3].accessions.size(), 2);
    TEST_EQUAL(proteins.getProteinGroups()[3].accessions[0], "P01876|IGHA1_HUMAN");
    TEST_EQUAL(proteins.getProteinGroups()[3].accessions[1], "P01877|IGHA2_HUMAN");
    TEST_EQUAL(proteins.getProteinGroups()[6].probability, 0.2026);
    TEST_EQUAL(proteins.getProteinGroups()[6].accessions.size(), 1);

    // indistinguishable proteins (same sizes and accessions as the groups above)
    TEST_EQUAL(proteins.getIndistinguishableProteins().size(), 7);
    TEST_EQUAL(proteins.getIndistinguishableProteins()[0].accessions.size(), 1);
    TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions.size(), 2);
    TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions[0], "P01876|IGHA1_HUMAN");
    TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions[1], "P01877|IGHA2_HUMAN");
    TEST_EQUAL(proteins.getIndistinguishableProteins()[6].accessions.size(), 1);

    // proteins
    TEST_EQUAL(proteins.getHits().size(), 9);
    TEST_EQUAL(proteins.getHits()[0].getAccession(), "P02787|TRFE_HUMAN");
    TEST_EQUAL(proteins.getHits()[0].getCoverage(), 8.6);
    TEST_EQUAL(proteins.getHits()[0].getScore(), 0.9990);
    // this one is indistinguishable... therefore it should have minimal infos
    // (coverage and score are expected to be -1)
    TEST_EQUAL(proteins.getHits()[6].getAccession(), "P00739|HPTR_HUMAN");
    TEST_EQUAL(proteins.getHits()[6].getCoverage(), -1);
    TEST_EQUAL(proteins.getHits()[6].getScore(), -1);

    TEST_EQUAL(proteins.getHits()[8].getAccession(), "P04217|A1BG_HUMAN");
    TEST_EQUAL(proteins.getHits()[8].getCoverage(), 2.0);
    TEST_EQUAL(proteins.getHits()[8].getScore(), 0.2026);

    // peptides
    TEST_EQUAL(peptides.getHits().size(), 16);
    AASequence aa_seq("MYLGYEYVTAIR");
    TEST_EQUAL(peptides.getHits()[0].getSequence(), aa_seq);
    TEST_EQUAL(peptides.getHits()[0].getCharge(), 2);
    TEST_EQUAL(peptides.getHits()[0].getScore(), 0.8633);
    TEST_EQUAL(peptides.getHits()[0].getProteinAccessions().size(), 1);
    TEST_EQUAL(peptides.getHits()[0].getProteinAccessions()[0], "P02787|TRFE_HUMAN");
    TEST_EQUAL(peptides.getHits()[0].getMetaValue("is_unique"), true);
    TEST_EQUAL(peptides.getHits()[0].getMetaValue("is_contributing"), true);

    // load 2 nd file and
    // NOTE(review): prot_file is reassigned to ProtXMLFile_input_1.protXML at the top of
    // the loop body, so this assignment has no effect and the second pass loads the
    // first file again - confirm whether that is intended.
    prot_file = OPENMS_GET_TEST_DATA_PATH("ProtXMLFile_input_2.protXML");
  }
}
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- StringList in_spec = getStringList_("in"); StringList out = getStringList_("out"); String in_lib = getStringOption_("lib"); String compare_function = getStringOption_("compare_function"); Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer"); float precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance"); //Int min_precursor_charge = getIntOption_("min_precursor_charge"); //Int max_precursor_charge = getIntOption_("max_precursor_charge"); float remove_peaks_below_threshold = getDoubleOption_("filter:remove_peaks_below_threshold"); UInt min_peaks = getIntOption_("filter:min_peaks"); UInt max_peaks = getIntOption_("filter:max_peaks"); Int cut_peaks_below = getIntOption_("filter:cut_peaks_below"); StringList fixed_modifications = getStringList_("fixed_modifications"); StringList variable_modifications = getStringList_("variable_modifications"); Int top_hits = getIntOption_("top_hits"); if (top_hits < -1) { writeLog_("top_hits (should be >= -1 )"); return ILLEGAL_PARAMETERS; } //------------------------------------------------------------- // loading input //------------------------------------------------------------- if (out.size() != in_spec.size()) { writeLog_("out (should be as many as input files)"); return ILLEGAL_PARAMETERS; } time_t prog_time = time(NULL); MSPFile spectral_library; RichPeakMap query, library; //spectrum which will be identified MzMLFile spectra; spectra.setLogType(log_type_); time_t start_build_time = time(NULL); //------------------------------------------------------------- //building map for faster search //------------------------------------------------------------- //library containing already identified peptide spectra vector<PeptideIdentification> ids; spectral_library.load(in_lib, ids, library); map<Size, 
vector<PeakSpectrum> > MSLibrary; { RichPeakMap::iterator s; vector<PeptideIdentification>::iterator i; ModificationsDB* mdb = ModificationsDB::getInstance(); for (s = library.begin(), i = ids.begin(); s < library.end(); ++s, ++i) { double precursor_MZ = (*s).getPrecursors()[0].getMZ(); Size MZ_multi = (Size)precursor_MZ * precursor_mass_multiplier; map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(MZ_multi); PeakSpectrum librar; bool variable_modifications_ok = true; bool fixed_modifications_ok = true; const AASequence& aaseq = i->getHits()[0].getSequence(); //variable fixed modifications if (!fixed_modifications.empty()) { for (Size i = 0; i < aaseq.size(); ++i) { const Residue& mod = aaseq.getResidue(i); for (Size s = 0; s < fixed_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification()) { fixed_modifications_ok = false; break; } } } } //variable modifications if (aaseq.isModified() && (!variable_modifications.empty())) { for (Size i = 0; i < aaseq.size(); ++i) { if (aaseq.isModified(i)) { const Residue& mod = aaseq.getResidue(i); for (Size s = 0; s < variable_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification()) { variable_modifications_ok = false; break; } } } } } if (variable_modifications_ok && fixed_modifications_ok) { PeptideIdentification& translocate_pid = *i; librar.getPeptideIdentifications().push_back(translocate_pid); librar.setPrecursors(s->getPrecursors()); //library entry transformation for (UInt l = 0; l < s->size(); ++l) { Peak1D peak; if ((*s)[l].getIntensity() > remove_peaks_below_threshold) { const String& info = (*s)[l].getMetaValue("MSPPeakInfo"); if (info[0] == '?') { peak.setIntensity(sqrt(0.2 * (*s)[l].getIntensity())); } else { peak.setIntensity(sqrt((*s)[l].getIntensity())); } 
peak.setMZ((*s)[l].getMZ()); peak.setPosition((*s)[l].getPosition()); librar.push_back(peak); } } if (found != MSLibrary.end()) { found->second.push_back(librar); } else { vector<PeakSpectrum> tmp; tmp.push_back(librar); MSLibrary.insert(make_pair(MZ_multi, tmp)); } } } } time_t end_build_time = time(NULL); cout << "Time needed for preprocessing data: " << (end_build_time - start_build_time) << "\n"; //compare function PeakSpectrumCompareFunctor* comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function); //------------------------------------------------------------- // calculations //------------------------------------------------------------- double score; StringList::iterator in, out_file; for (in = in_spec.begin(), out_file = out.begin(); in < in_spec.end(); ++in, ++out_file) { time_t start_time = time(NULL); spectra.load(*in, query); //Will hold valuable hits vector<PeptideIdentification> peptide_ids; vector<ProteinIdentification> protein_ids; // Write parameters to ProteinIdentifcation ProteinIdentification prot_id; //Parameters of identificaion prot_id.setIdentifier("test"); prot_id.setSearchEngineVersion("SpecLibSearcher"); prot_id.setDateTime(DateTime::now()); prot_id.setScoreType(compare_function); ProteinIdentification::SearchParameters searchparam; searchparam.precursor_tolerance = precursor_mass_tolerance; prot_id.setSearchParameters(searchparam); /***********SEARCH**********/ for (UInt j = 0; j < query.size(); ++j) { //Set identifier for each identifications PeptideIdentification pid; pid.setIdentifier("test"); pid.setScoreType(compare_function); ProteinHit pr_hit; pr_hit.setAccession(j); prot_id.insertHit(pr_hit); //RichPeak1D to Peak1D transformation for the compare function query PeakSpectrum quer; bool peak_ok = true; query[j].sortByIntensity(true); double min_high_intensity = 0; if (query[j].empty() || query[j].getMSLevel() != 2) { continue; } if (query[j].getPrecursors().empty()) { writeLog_("Warning MS2 spectrum without 
precursor information"); continue; } min_high_intensity = (1 / cut_peaks_below) * query[j][0].getIntensity(); query[j].sortByPosition(); for (UInt k = 0; k < query[j].size() && k < max_peaks; ++k) { if (query[j][k].getIntensity() > remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity) { Peak1D peak; peak.setIntensity(sqrt(query[j][k].getIntensity())); peak.setMZ(query[j][k].getMZ()); peak.setPosition(query[j][k].getPosition()); quer.push_back(peak); } } if (quer.size() >= min_peaks) { peak_ok = true; } else { peak_ok = false; } double query_MZ = query[j].getPrecursors()[0].getMZ(); if (peak_ok) { bool charge_one = false; Int percent = (Int) Math::round((query[j].size() / 100.0) * 3.0); Int margin = (Int) Math::round((query[j].size() / 100.0) * 1.0); for (vector<RichPeak1D>::iterator peak = query[j].end() - 1; percent >= 0; --peak, --percent) { if (peak->getMZ() < query_MZ) { break; } } if (percent > margin) { charge_one = true; } float min_MZ = (query_MZ - precursor_mass_tolerance) * precursor_mass_multiplier; float max_MZ = (query_MZ + precursor_mass_tolerance) * precursor_mass_multiplier; for (Size mz = (Size)min_MZ; mz <= ((Size)max_MZ) + 1; ++mz) { map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(mz); if (found != MSLibrary.end()) { vector<PeakSpectrum>& library = found->second; for (Size i = 0; i < library.size(); ++i) { float this_MZ = library[i].getPrecursors()[0].getMZ() * precursor_mass_multiplier; if (this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) || charge_one == false)) { PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0]; PeakSpectrum& librar = library[i]; //Special treatment for SpectraST score as it computes a score based on the whole library if (compare_function == "SpectraSTSimilarityScore") { SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor); 
BinnedSpectrum quer_bin = sp->transform(quer); BinnedSpectrum librar_bin = sp->transform(librar); score = (*sp)(quer, librar); //(*sp)(quer_bin,librar_bin); double dot_bias = sp->dot_bias(quer_bin, librar_bin, score); hit.setMetaValue("DOTBIAS", dot_bias); } else { score = (*comparor)(quer, librar); } DataValue RT(library[i].getRT()); DataValue MZ(library[i].getPrecursors()[0].getMZ()); hit.setMetaValue("RT", RT); hit.setMetaValue("MZ", MZ); hit.setScore(score); PeptideEvidence pe; pe.setProteinAccession(pr_hit.getAccession()); hit.addPeptideEvidence(pe); pid.insertHit(hit); } } } } } pid.setHigherScoreBetter(true); pid.sort(); if (compare_function == "SpectraSTSimilarityScore") { if (!pid.empty() && !pid.getHits().empty()) { vector<PeptideHit> final_hits; final_hits.resize(pid.getHits().size()); SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor); Size runner_up = 1; for (; runner_up < pid.getHits().size(); ++runner_up) { if (pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString() || runner_up > 5) { break; } } double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore()); for (Size s = 0; s < pid.getHits().size(); ++s) { final_hits[s] = pid.getHits()[s]; final_hits[s].setMetaValue("delta D", delta_D); final_hits[s].setMetaValue("dot product", pid.getHits()[s].getScore()); final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(), delta_D, pid.getHits()[s].getMetaValue("DOTBIAS"))); //final_hits[s].removeMetaValue("DOTBIAS"); } pid.setHits(final_hits); pid.sort(); pid.setMZ(query[j].getPrecursors()[0].getMZ()); pid.setRT(query_MZ); } } if (top_hits != -1 && (UInt)top_hits < pid.getHits().size()) { vector<PeptideHit> hits; hits.resize(top_hits); for (Size i = 0; i < (UInt)top_hits; ++i) { hits[i] = pid.getHits()[i]; } pid.setHits(hits); } peptide_ids.push_back(pid); } protein_ids.push_back(prot_id); 
//------------------------------------------------------------- // writing output //------------------------------------------------------------- IdXMLFile id_xml_file; id_xml_file.store(*out_file, protein_ids, peptide_ids); time_t end_time = time(NULL); cout << "Search time: " << difftime(end_time, start_time) << " seconds for " << *in << "\n"; } time_t end_time = time(NULL); cout << "Total time: " << difftime(end_time, prog_time) << " secconds\n"; return EXECUTION_OK; }
void IDFilter::filterIdentificationsByVariableModifications(const PeptideIdentification& identification, const vector<String>& fixed_modifications, PeptideIdentification& filtered_identification) { vector<Size> new_peptide_indices; vector<PeptideHit> filtered_peptide_hits; filtered_identification = identification; filtered_identification.setHits(vector<PeptideHit>()); const vector<PeptideHit>& temp_peptide_hits = identification.getHits(); for (Size i = 0; i < temp_peptide_hits.size(); i++) { const AASequence& aa_seq = temp_peptide_hits[i].getSequence(); /* TODO: check these cases // check terminal modifications if (aa_seq.hasNTerminalModification()) { String unimod_name = aa_seq.getNTerminalModification(); if (find(fixed_modifications.begin(), fixed_modifications.end(), unimod_name) == fixed_modifications.end()) { new_peptide_indices.push_back(i); continue; } } if (aa_seq.hasCTerminalModification()) { String unimod_name = aa_seq.getCTerminalModification(); if (find(fixed_modifications.begin(), fixed_modifications.end(), unimod_name) == fixed_modifications.end()) { new_peptide_indices.push_back(i); continue; } } */ // check internal modifications for (Size j = 0; j != aa_seq.size(); ++j) { if (aa_seq[j].isModified()) { String unimod_name = aa_seq[j].getModification() + " (" + aa_seq[j].getOneLetterCode() + ")"; if (find(fixed_modifications.begin(), fixed_modifications.end(), unimod_name) == fixed_modifications.end()) { new_peptide_indices.push_back(i); continue; } } } } for (Size i = 0; i < new_peptide_indices.size(); i++) { const PeptideHit& ph = temp_peptide_hits[new_peptide_indices[i]]; filtered_peptide_hits.push_back(ph); } if (!filtered_peptide_hits.empty()) { filtered_identification.setHits(filtered_peptide_hits); filtered_identification.assignRanks(); } }
void IDDecoyProbability::apply_(vector<PeptideIdentification> & ids, const vector<double> & rev_scores, const vector<double> & fwd_scores, const vector<double> & all_scores) { Size number_of_bins(param_.getValue("number_of_bins")); // normalize distribution to [0, 1] vector<double> fwd_scores_normalized(number_of_bins, 0.0), rev_scores_normalized(number_of_bins, 0.0), diff_scores(number_of_bins, 0.0), all_scores_normalized(number_of_bins, 0.0); Transformation_ rev_trafo, fwd_trafo, all_trafo; normalizeBins_(rev_scores, rev_scores_normalized, rev_trafo); normalizeBins_(fwd_scores, fwd_scores_normalized, fwd_trafo); normalizeBins_(all_scores, all_scores_normalized, all_trafo); // rev scores fitting vector<DPosition<2> > rev_data; for (Size i = 0; i < number_of_bins; ++i) { DPosition<2> pos; pos.setX(((double)i) / (double)number_of_bins + 0.0001); // necessary???? pos.setY(rev_scores_normalized[i]); rev_data.push_back(pos); #ifdef IDDECOYPROBABILITY_DEBUG cerr << pos.getX() << " " << pos.getY() << endl; #endif } Math::GammaDistributionFitter gdf; Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma_1st (1.0, 3.0); gdf.setInitialParameters(result_gamma_1st); // TODO heuristic for good start parameters Math::GammaDistributionFitter::GammaDistributionFitResult result_gamma = gdf.fit(rev_data); #ifdef IDDECOYPROBABILITY_DEBUG cerr << gdf.getGnuplotFormula() << endl; String rev_filename = param_.getValue("rev_filename"); generateDistributionImage_(rev_scores_normalized, gdf.getGnuplotFormula(), rev_filename); #endif // generate diffs of distributions // get the fwd and rev distribution, apply all_trafo and calculate the diff vector<Size> fwd_bins(number_of_bins, 0), rev_bins(number_of_bins, 0); double min(all_trafo.min_score), diff(all_trafo.diff_score); Size max_bin(0); for (vector<double>::const_iterator it = fwd_scores.begin(); it != fwd_scores.end(); ++it) { Size bin = (Size)((*it - min) / diff * (double)(number_of_bins - 1)); ++fwd_bins[bin]; if 
(fwd_bins[bin] > max_bin) { max_bin = fwd_bins[bin]; } } Size max_reverse_bin(0), max_reverse_bin_value(0); //min = rev_trafo.min_score; //diff = rev_trafo.diff_score; for (vector<double>::const_iterator it = rev_scores.begin(); it != rev_scores.end(); ++it) { Size bin = (Size)((*it - min) / diff * (double)number_of_bins); ++rev_bins[bin]; if (rev_bins[bin] > max_bin) { max_bin = rev_bins[bin]; } if (rev_bins[bin] > max_reverse_bin_value) { max_reverse_bin = bin; max_reverse_bin_value = rev_bins[bin]; } } #ifdef IDDECOYPROBABILITY_DEBUG cerr << "Trying to get diff scores" << endl; #endif // get diff of fwd and rev for (Size i = 0; i < number_of_bins; ++i) { Size fwd(0), rev(0); fwd = fwd_bins[i]; rev = rev_bins[i]; if ((double)fwd > (double)(1.3 * rev) && max_reverse_bin < i) { diff_scores[i] = (double)(fwd - rev) / (double)max_bin; } else { diff_scores[i] = 0.0; } } #ifdef IDDECOYPROBABILITY_DEBUG cerr << "Gauss Fitting values size of diff scores=" << diff_scores.size() << endl; #endif // diff scores fitting vector<DPosition<2> > diff_data; double gauss_A(0), gauss_x0(0), norm_factor(0); for (Size i = 0; i < number_of_bins; ++i) { DPosition<2> pos; pos.setX((double)i / (double)number_of_bins); pos.setY(diff_scores[i]); if (pos.getY() > gauss_A) { gauss_A = pos.getY(); } gauss_x0 += pos.getX() * pos.getY(); norm_factor += pos.getY(); diff_data.push_back(pos); } double gauss_sigma(0); gauss_x0 /= (double)diff_data.size(); gauss_x0 /= norm_factor; for (Size i = 0; i <= number_of_bins; ++i) { gauss_sigma += fabs(gauss_x0 - (double)i / (double)number_of_bins); } gauss_sigma /= (double)diff_data.size(); #ifdef IDDECOYPROBABILITY_DEBUG cerr << "setting initial parameters: " << endl; #endif Math::GaussFitter gf; Math::GaussFitter::GaussFitResult result_1st(gauss_A, gauss_x0, gauss_sigma); gf.setInitialParameters(result_1st); #ifdef IDDECOYPROBABILITY_DEBUG cerr << "Initial Gauss guess: A=" << gauss_A << ", x0=" << gauss_x0 << ", sigma=" << gauss_sigma << endl; #endif 
//TODO: fail-to-fit correction was done using the GNUPlotFormula. Seemed to be a hack. //Changed it to try-catch-block but I am not sure if this correction should be made //at all. Can someone please verify? Math::GaussFitter::GaussFitResult result_gauss (gauss_A, gauss_x0, gauss_sigma); try{ result_gauss = gf.fit(diff_data); } catch(Exception::UnableToFit& /* e */) { result_gauss.A = gauss_A; result_gauss.x0 = gauss_x0; result_gauss.sigma = gauss_sigma; } // // fit failed? // if (gf.getGnuplotFormula() == "") // { // result_gauss.A = gauss_A; // result_gauss.x0 = gauss_x0; // result_gauss.sigma = gauss_sigma; // } #ifdef IDDECOYPROBABILITY_DEBUG cerr << gf.getGnuplotFormula() << endl; String fwd_filename = param_.getValue("fwd_filename"); if (gf.getGnuplotFormula() == "") { String formula("f(x)=" + String(gauss_A) + " * exp(-(x - " + String(gauss_x0) + ") ** 2 / 2 / (" + String(gauss_sigma) + ") ** 2)"); generateDistributionImage_(diff_scores, formula, fwd_filename); } else { generateDistributionImage_(diff_scores, gf.getGnuplotFormula(), fwd_filename); } #endif #ifdef IDDECOYPROBABILITY_DEBUG //all_trafo.diff_score + all_trafo.min_score String gauss_formula("f(x)=" + String(result_gauss.A / all_trafo.max_intensity) + " * exp(-(x - " + String(result_gauss.x0 * all_trafo.diff_score + all_trafo.min_score) + ") ** 2 / 2 / (" + String(result_gauss.sigma * all_trafo.diff_score) + ") ** 2)"); String b_str(result_gamma.b), p_str(result_gamma.p); String gamma_formula = "g(x)=(" + b_str + " ** " + p_str + ") / gamma(" + p_str + ") * x ** (" + p_str + " - 1) * exp(- " + b_str + " * x)"; generateDistributionImage_(all_scores_normalized, all_trafo, gauss_formula, gamma_formula, (String)param_.getValue("fwd_filename")); #endif vector<PeptideIdentification> new_prob_ids; // calculate the probabilities and write them to the IDs for (vector<PeptideIdentification>::const_iterator it = ids.begin(); it != ids.end(); ++it) { if (it->getHits().size() > 0) { vector<PeptideHit> hits; 
String score_type = it->getScoreType() + "_score"; for (vector<PeptideHit>::const_iterator pit = it->getHits().begin(); pit != it->getHits().end(); ++pit) { PeptideHit hit = *pit; double score = hit.getScore(); if (!it->isHigherScoreBetter()) { score = -log10(score); } hit.setMetaValue(score_type, hit.getScore()); hit.setScore(getProbability_(result_gamma, rev_trafo, result_gauss, fwd_trafo, score)); hits.push_back(hit); } PeptideIdentification id = *it; id.setHigherScoreBetter(true); id.setScoreType(id.getScoreType() + "_DecoyProbability"); id.setHits(hits); new_prob_ids.push_back(id); } } ids = new_prob_ids; }