void IDFilter::removeUnreferencedProteinHits(const ProteinIdentification& identification, const vector<PeptideIdentification> peptide_identifications, ProteinIdentification& filtered_identification) { const String& run_identifier = identification.getIdentifier(); // build set of protein accessions that are referenced by peptides set<String> proteinaccessions_with_peptides; for (Size i = 0; i != peptide_identifications.size(); ++i) { // run id of protein and peptide identification must match if (run_identifier == peptide_identifications[i].getIdentifier()) { const vector<PeptideHit>& tmp_pep_hits = peptide_identifications[i].getHits(); // extract protein accessions of each peptide hit for (Size j = 0; j != tmp_pep_hits.size(); ++j) { const std::vector<String>& protein_accessions = tmp_pep_hits[j].getProteinAccessions(); for (Size k = 0; k != protein_accessions.size(); ++k) { String key = protein_accessions[k]; proteinaccessions_with_peptides.insert(key); } } } } // add all protein hits referenced by a peptide const vector<ProteinHit>& temp_protein_hits = identification.getHits(); vector<ProteinHit> filtered_protein_hits; for (Size j = 0; j != temp_protein_hits.size(); ++j) { const String& protein_accession = temp_protein_hits[j].getAccession(); if (proteinaccessions_with_peptides.find(protein_accession) != proteinaccessions_with_peptides.end()) { filtered_protein_hits.push_back(temp_protein_hits[j]); } } // copy identification filtered_identification = identification; // assign filtered hits to protein identification filtered_identification.setHits(filtered_protein_hits); }
END_SECTION

// Loads a protXML file through ProtXMLFile::load() and checks the resulting
// protein groups, indistinguishable proteins, protein hits and peptide hits.
START_SECTION(void load(const String &filename, ProteinIdentification &protein_ids, PeptideIdentification &peptide_ids))
{
  ProtXMLFile f;
  ProteinIdentification proteins;
  PeptideIdentification peptides;
  String prot_file;

  // expected identifiers; entry i is compared against the result of the i-th load below
  StringList ids = ListUtils::create<String>("16627578304933075941,13229490167902618598");

  // The same objects are loaded into twice, to check that members are correctly
  // reset between consecutive load() calls.
  for (Int i=0;i<2;++i)
  {
    prot_file = OPENMS_GET_TEST_DATA_PATH("ProtXMLFile_input_1.protXML");
    f.load(prot_file, proteins, peptides);

    // protein and peptide identification must carry the same identifier
    TEST_EQUAL(proteins.getIdentifier(), ids[i]);
    TEST_EQUAL(peptides.getIdentifier(), ids[i]);

    // protein groups
    TEST_EQUAL(proteins.getProteinGroups().size(), 7);
    TEST_EQUAL(proteins.getProteinGroups()[0].probability, 0.9990);
    TEST_EQUAL(proteins.getProteinGroups()[0].accessions.size(), 1);
    TEST_EQUAL(proteins.getProteinGroups()[3].accessions.size(), 2);
    TEST_EQUAL(proteins.getProteinGroups()[3].accessions[0], "P01876|IGHA1_HUMAN");
    TEST_EQUAL(proteins.getProteinGroups()[3].accessions[1], "P01877|IGHA2_HUMAN");
    TEST_EQUAL(proteins.getProteinGroups()[6].probability, 0.2026);
    TEST_EQUAL(proteins.getProteinGroups()[6].accessions.size(), 1);

    // indistinguishable proteins (same sizes and accessions as the groups above)
    TEST_EQUAL(proteins.getIndistinguishableProteins().size(), 7);
    TEST_EQUAL(proteins.getIndistinguishableProteins()[0].accessions.size(), 1);
    TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions.size(), 2);
    TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions[0], "P01876|IGHA1_HUMAN");
    TEST_EQUAL(proteins.getIndistinguishableProteins()[3].accessions[1], "P01877|IGHA2_HUMAN");
    TEST_EQUAL(proteins.getIndistinguishableProteins()[6].accessions.size(), 1);

    // protein hits
    TEST_EQUAL(proteins.getHits().size(), 9);
    TEST_EQUAL(proteins.getHits()[0].getAccession(), "P02787|TRFE_HUMAN");
    TEST_EQUAL(proteins.getHits()[0].getCoverage(), 8.6);
    TEST_EQUAL(proteins.getHits()[0].getScore(), 0.9990);

    // This hit is an indistinguishable protein, therefore it should carry only
    // minimal information: coverage and score are expected to be -1.
    TEST_EQUAL(proteins.getHits()[6].getAccession(), "P00739|HPTR_HUMAN");
    TEST_EQUAL(proteins.getHits()[6].getCoverage(), -1);
    TEST_EQUAL(proteins.getHits()[6].getScore(), -1);

    TEST_EQUAL(proteins.getHits()[8].getAccession(), "P04217|A1BG_HUMAN");
    TEST_EQUAL(proteins.getHits()[8].getCoverage(), 2.0);
    TEST_EQUAL(proteins.getHits()[8].getScore(), 0.2026);

    // peptide hits
    TEST_EQUAL(peptides.getHits().size(), 16);
    AASequence aa_seq("MYLGYEYVTAIR");
    TEST_EQUAL(peptides.getHits()[0].getSequence(), aa_seq);
    TEST_EQUAL(peptides.getHits()[0].getCharge(), 2);
    TEST_EQUAL(peptides.getHits()[0].getScore(), 0.8633);
    TEST_EQUAL(peptides.getHits()[0].getProteinAccessions().size(), 1);
    TEST_EQUAL(peptides.getHits()[0].getProteinAccessions()[0], "P02787|TRFE_HUMAN");
    TEST_EQUAL(peptides.getHits()[0].getMetaValue("is_unique"), true);
    TEST_EQUAL(peptides.getHits()[0].getMetaValue("is_contributing"), true);

    // Intended to switch to the second input file.
    // NOTE(review): this assignment has no effect as written, because 'prot_file'
    // is set back to ProtXMLFile_input_1.protXML at the top of the next iteration
    // and is not used after the loop; both iterations load the first file.
    prot_file = OPENMS_GET_TEST_DATA_PATH("ProtXMLFile_input_2.protXML");
  }
}
void IDFilter::removeUnreferencedPeptideHits(const ProteinIdentification& identification, vector<PeptideIdentification>& peptide_identifications, bool delete_unreferenced_peptide_hits /* = false */) { const String& run_identifier = identification.getIdentifier(); // build set of protein accessions set<String> all_prots; const vector<ProteinHit>& temp_protein_hits = identification.getHits(); for (Size j = 0; j != temp_protein_hits.size(); ++j) { all_prots.insert(temp_protein_hits[j].getAccession()); } vector<PeptideIdentification> filtered_peptide_identifications; // remove peptides which are not referenced for (Size i = 0; i != peptide_identifications.size(); ++i) { // run id of protein and peptide identification must match if (run_identifier == peptide_identifications[i].getIdentifier()) { const vector<PeptideHit>& tmp_pep_hits = peptide_identifications[i].getHits(); vector<PeptideHit> filtered_pep_hits; // check protein accessions of each peptide hit for (Size j = 0; j != tmp_pep_hits.size(); ++j) { vector<PeptideEvidence> hit_peptide_evidences = tmp_pep_hits[j].getPeptideEvidences(); vector<PeptideEvidence> valid_peptide_evidence; for (vector<PeptideEvidence>::const_iterator pe_it = hit_peptide_evidences.begin(); pe_it != hit_peptide_evidences.end(); ++pe_it) { // find valid proteins if (all_prots.find(pe_it->getProteinAccession()) != all_prots.end()) { valid_peptide_evidence.push_back(*pe_it); } } if (!valid_peptide_evidence.empty() || !delete_unreferenced_peptide_hits) { // if present, copy the hit filtered_pep_hits.push_back(tmp_pep_hits[j]); filtered_pep_hits.back().setPeptideEvidences(valid_peptide_evidence); } } // if the peptide has hits, we use it if (!filtered_pep_hits.empty()) { filtered_peptide_identifications.push_back(peptide_identifications[i]); filtered_peptide_identifications.back().setHits(filtered_pep_hits); } } else // peptide is from another run, let it pass the filter { filtered_peptide_identifications.push_back(peptide_identifications[i]); 
} } // exchange with new hits filtered_peptide_identifications.swap(peptide_identifications); }