ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- StringList in_spec = getStringList_("in"); StringList out = getStringList_("out"); String in_lib = getStringOption_("lib"); String compare_function = getStringOption_("compare_function"); Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer"); float precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance"); //Int min_precursor_charge = getIntOption_("min_precursor_charge"); //Int max_precursor_charge = getIntOption_("max_precursor_charge"); float remove_peaks_below_threshold = getDoubleOption_("filter:remove_peaks_below_threshold"); UInt min_peaks = getIntOption_("filter:min_peaks"); UInt max_peaks = getIntOption_("filter:max_peaks"); Int cut_peaks_below = getIntOption_("filter:cut_peaks_below"); StringList fixed_modifications = getStringList_("fixed_modifications"); StringList variable_modifications = getStringList_("variable_modifications"); Int top_hits = getIntOption_("top_hits"); if (top_hits < -1) { writeLog_("top_hits (should be >= -1 )"); return ILLEGAL_PARAMETERS; } //------------------------------------------------------------- // loading input //------------------------------------------------------------- if (out.size() != in_spec.size()) { writeLog_("out (should be as many as input files)"); return ILLEGAL_PARAMETERS; } time_t prog_time = time(NULL); MSPFile spectral_library; RichPeakMap query, library; //spectrum which will be identified MzMLFile spectra; spectra.setLogType(log_type_); time_t start_build_time = time(NULL); //------------------------------------------------------------- //building map for faster search //------------------------------------------------------------- //library containing already identified peptide spectra vector<PeptideIdentification> ids; spectral_library.load(in_lib, ids, library); map<Size, 
vector<PeakSpectrum> > MSLibrary; { RichPeakMap::iterator s; vector<PeptideIdentification>::iterator i; ModificationsDB* mdb = ModificationsDB::getInstance(); for (s = library.begin(), i = ids.begin(); s < library.end(); ++s, ++i) { double precursor_MZ = (*s).getPrecursors()[0].getMZ(); Size MZ_multi = (Size)precursor_MZ * precursor_mass_multiplier; map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(MZ_multi); PeakSpectrum librar; bool variable_modifications_ok = true; bool fixed_modifications_ok = true; const AASequence& aaseq = i->getHits()[0].getSequence(); //variable fixed modifications if (!fixed_modifications.empty()) { for (Size i = 0; i < aaseq.size(); ++i) { const Residue& mod = aaseq.getResidue(i); for (Size s = 0; s < fixed_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification()) { fixed_modifications_ok = false; break; } } } } //variable modifications if (aaseq.isModified() && (!variable_modifications.empty())) { for (Size i = 0; i < aaseq.size(); ++i) { if (aaseq.isModified(i)) { const Residue& mod = aaseq.getResidue(i); for (Size s = 0; s < variable_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification()) { variable_modifications_ok = false; break; } } } } } if (variable_modifications_ok && fixed_modifications_ok) { PeptideIdentification& translocate_pid = *i; librar.getPeptideIdentifications().push_back(translocate_pid); librar.setPrecursors(s->getPrecursors()); //library entry transformation for (UInt l = 0; l < s->size(); ++l) { Peak1D peak; if ((*s)[l].getIntensity() > remove_peaks_below_threshold) { const String& info = (*s)[l].getMetaValue("MSPPeakInfo"); if (info[0] == '?') { peak.setIntensity(sqrt(0.2 * (*s)[l].getIntensity())); } else { peak.setIntensity(sqrt((*s)[l].getIntensity())); } 
peak.setMZ((*s)[l].getMZ()); peak.setPosition((*s)[l].getPosition()); librar.push_back(peak); } } if (found != MSLibrary.end()) { found->second.push_back(librar); } else { vector<PeakSpectrum> tmp; tmp.push_back(librar); MSLibrary.insert(make_pair(MZ_multi, tmp)); } } } } time_t end_build_time = time(NULL); cout << "Time needed for preprocessing data: " << (end_build_time - start_build_time) << "\n"; //compare function PeakSpectrumCompareFunctor* comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function); //------------------------------------------------------------- // calculations //------------------------------------------------------------- double score; StringList::iterator in, out_file; for (in = in_spec.begin(), out_file = out.begin(); in < in_spec.end(); ++in, ++out_file) { time_t start_time = time(NULL); spectra.load(*in, query); //Will hold valuable hits vector<PeptideIdentification> peptide_ids; vector<ProteinIdentification> protein_ids; // Write parameters to ProteinIdentifcation ProteinIdentification prot_id; //Parameters of identificaion prot_id.setIdentifier("test"); prot_id.setSearchEngineVersion("SpecLibSearcher"); prot_id.setDateTime(DateTime::now()); prot_id.setScoreType(compare_function); ProteinIdentification::SearchParameters searchparam; searchparam.precursor_tolerance = precursor_mass_tolerance; prot_id.setSearchParameters(searchparam); /***********SEARCH**********/ for (UInt j = 0; j < query.size(); ++j) { //Set identifier for each identifications PeptideIdentification pid; pid.setIdentifier("test"); pid.setScoreType(compare_function); ProteinHit pr_hit; pr_hit.setAccession(j); prot_id.insertHit(pr_hit); //RichPeak1D to Peak1D transformation for the compare function query PeakSpectrum quer; bool peak_ok = true; query[j].sortByIntensity(true); double min_high_intensity = 0; if (query[j].empty() || query[j].getMSLevel() != 2) { continue; } if (query[j].getPrecursors().empty()) { writeLog_("Warning MS2 spectrum without 
precursor information"); continue; } min_high_intensity = (1 / cut_peaks_below) * query[j][0].getIntensity(); query[j].sortByPosition(); for (UInt k = 0; k < query[j].size() && k < max_peaks; ++k) { if (query[j][k].getIntensity() > remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity) { Peak1D peak; peak.setIntensity(sqrt(query[j][k].getIntensity())); peak.setMZ(query[j][k].getMZ()); peak.setPosition(query[j][k].getPosition()); quer.push_back(peak); } } if (quer.size() >= min_peaks) { peak_ok = true; } else { peak_ok = false; } double query_MZ = query[j].getPrecursors()[0].getMZ(); if (peak_ok) { bool charge_one = false; Int percent = (Int) Math::round((query[j].size() / 100.0) * 3.0); Int margin = (Int) Math::round((query[j].size() / 100.0) * 1.0); for (vector<RichPeak1D>::iterator peak = query[j].end() - 1; percent >= 0; --peak, --percent) { if (peak->getMZ() < query_MZ) { break; } } if (percent > margin) { charge_one = true; } float min_MZ = (query_MZ - precursor_mass_tolerance) * precursor_mass_multiplier; float max_MZ = (query_MZ + precursor_mass_tolerance) * precursor_mass_multiplier; for (Size mz = (Size)min_MZ; mz <= ((Size)max_MZ) + 1; ++mz) { map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(mz); if (found != MSLibrary.end()) { vector<PeakSpectrum>& library = found->second; for (Size i = 0; i < library.size(); ++i) { float this_MZ = library[i].getPrecursors()[0].getMZ() * precursor_mass_multiplier; if (this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) || charge_one == false)) { PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0]; PeakSpectrum& librar = library[i]; //Special treatment for SpectraST score as it computes a score based on the whole library if (compare_function == "SpectraSTSimilarityScore") { SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor); 
BinnedSpectrum quer_bin = sp->transform(quer); BinnedSpectrum librar_bin = sp->transform(librar); score = (*sp)(quer, librar); //(*sp)(quer_bin,librar_bin); double dot_bias = sp->dot_bias(quer_bin, librar_bin, score); hit.setMetaValue("DOTBIAS", dot_bias); } else { score = (*comparor)(quer, librar); } DataValue RT(library[i].getRT()); DataValue MZ(library[i].getPrecursors()[0].getMZ()); hit.setMetaValue("RT", RT); hit.setMetaValue("MZ", MZ); hit.setScore(score); PeptideEvidence pe; pe.setProteinAccession(pr_hit.getAccession()); hit.addPeptideEvidence(pe); pid.insertHit(hit); } } } } } pid.setHigherScoreBetter(true); pid.sort(); if (compare_function == "SpectraSTSimilarityScore") { if (!pid.empty() && !pid.getHits().empty()) { vector<PeptideHit> final_hits; final_hits.resize(pid.getHits().size()); SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor); Size runner_up = 1; for (; runner_up < pid.getHits().size(); ++runner_up) { if (pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString() || runner_up > 5) { break; } } double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore()); for (Size s = 0; s < pid.getHits().size(); ++s) { final_hits[s] = pid.getHits()[s]; final_hits[s].setMetaValue("delta D", delta_D); final_hits[s].setMetaValue("dot product", pid.getHits()[s].getScore()); final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(), delta_D, pid.getHits()[s].getMetaValue("DOTBIAS"))); //final_hits[s].removeMetaValue("DOTBIAS"); } pid.setHits(final_hits); pid.sort(); pid.setMZ(query[j].getPrecursors()[0].getMZ()); pid.setRT(query_MZ); } } if (top_hits != -1 && (UInt)top_hits < pid.getHits().size()) { vector<PeptideHit> hits; hits.resize(top_hits); for (Size i = 0; i < (UInt)top_hits; ++i) { hits[i] = pid.getHits()[i]; } pid.setHits(hits); } peptide_ids.push_back(pid); } protein_ids.push_back(prot_id); 
//------------------------------------------------------------- // writing output //------------------------------------------------------------- IdXMLFile id_xml_file; id_xml_file.store(*out_file, protein_ids, peptide_ids); time_t end_time = time(NULL); cout << "Search time: " << difftime(end_time, start_time) << " seconds for " << *in << "\n"; } time_t end_time = time(NULL); cout << "Total time: " << difftime(end_time, prog_time) << " secconds\n"; return EXECUTION_OK; }
/**
  @brief Turns the protein hits of the first ProteinIdentification of @p feature_map into peptide features.

  With enzyme "none" every protein becomes exactly one feature that carries
  the protein's "intensity" meta value as intensity and a copy of all its
  meta values. Otherwise each protein is digested with the configured enzyme;
  equal peptides (also from different proteins) are merged into one feature
  whose intensity meta values are summed up and whose peptide hit references
  every contributing protein accession once.

  @param feature_map Source of the protein hits; the generated features are appended to it.
*/
void DigestSimulation::digest(SimTypes::FeatureMapSim& feature_map)
{
  LOG_INFO << "Digest Simulation ... started" << std::endl;

  // everything below reads the first ProteinIdentification; without one there is nothing to digest
  if (feature_map.getProteinIdentifications().empty())
  {
    LOG_WARN << "Digest Simulation: no protein identification present, nothing to digest" << std::endl;
    return;
  }
  std::vector<ProteinHit>& protein_hits = feature_map.getProteinIdentifications()[0].getHits();

  if ((String)param_.getValue("enzyme") == String("none"))
  {
    //peptides = proteins;
    // convert all proteins into peptides

    // for each protein_hit in the FeatureMap
    for (std::vector<ProteinHit>::iterator protein_hit = protein_hits.begin(); protein_hit != protein_hits.end(); ++protein_hit)
    {
      // generate a PeptideHit hit with the correct link to the protein
      PeptideHit pep_hit(1.0, 1, 0, AASequence::fromString(protein_hit->getSequence()));
      PeptideEvidence pe;
      pe.setProteinAccession(protein_hit->getAccession());
      pep_hit.addPeptideEvidence(pe);

      // add the PeptideHit to the PeptideIdentification
      PeptideIdentification pep_id;
      pep_id.insertHit(pep_hit);

      // generate Feature with correct Intensity and corresponding PeptideIdentification
      Feature f;
      f.getPeptideIdentifications().push_back(pep_id);
      f.setIntensity(protein_hit->getMetaValue("intensity"));

      // copy intensity meta-values and additional annotations from Protein to Feature
      StringList keys;
      protein_hit->getKeys(keys);
      for (StringList::const_iterator it_key = keys.begin(); it_key != keys.end(); ++it_key)
      {
        f.setMetaValue(*it_key, protein_hit->getMetaValue(*it_key));
      }

      // add Feature to SimTypes::FeatureMapSim
      feature_map.push_back(f);
    }
    return;
  }

  UInt min_peptide_length = param_.getValue("min_peptide_length");
  bool use_log_model = (param_.getValue("model") == "trained");
  UInt missed_cleavages = param_.getValue("model_naive:missed_cleavages");
  double cleave_threshold = param_.getValue("model_trained:threshold");

  EnzymaticDigestion digestion;
  digestion.setEnzyme(digestion.getEnzymeByName((String)param_.getValue("enzyme")));
  digestion.setLogModelEnabled(use_log_model);
  digestion.setLogThreshold(cleave_threshold);

  std::vector<AASequence> digestion_products;

  // keep track of generated features
  std::map<AASequence, Feature> generated_features;

  // Iterate through ProteinHits in the FeatureMap and digest them
  for (std::vector<ProteinHit>::iterator protein_hit = protein_hits.begin(); protein_hit != protein_hits.end(); ++protein_hit)
  {
    // determine abundance of each digestion product (this is quite long now...)
    // we assume that each digestion product will have the same abundance
    // note: missed cleavages reduce overall abundance as they combine two (or more) single peptides

    // how many "atomic"(i.e. non-cleavable) peptides are created?
    digestion.setMissedCleavages(0);
    Size complete_digest_count = digestion.peptideCount(AASequence::fromString(protein_hit->getSequence()));

    // compute average number of "atomic" peptides summed from all digestion products
    Size number_atomic_whole = 0;
    Size number_of_digestion_products = 0;
    for (Size i = 0; (i <= missed_cleavages) && (i < complete_digest_count); ++i)
    {
      number_atomic_whole += (complete_digest_count - i) * (i + 1);
      number_of_digestion_products += (complete_digest_count - i);
    }

    // mean number of "atomic" peptides per digestion product is now: number_atomic_whole / number_of_digestion_products
    // -> thus abundance of a digestion product is: #proteins / avg#of"atomic"peptides
    // i.e.: protein->second / (number_atomic_whole / number_of_digestion_products)
    Map<String, SimTypes::SimIntensityType> intensities;
    StringList keys;
    protein_hit->getKeys(keys);
    for (StringList::const_iterator it_key = keys.begin(); it_key != keys.end(); ++it_key)
    {
      if (!it_key->hasPrefix("intensity"))
      {
        continue;
      }
      // order changed for numeric stability; every product gets an intensity of at least 1
      intensities[*it_key] = std::max(SimTypes::SimIntensityType(1),
                                      SimTypes::SimIntensityType(protein_hit->getMetaValue(*it_key))
                                      * SimTypes::SimIntensityType(number_of_digestion_products)
                                      / SimTypes::SimIntensityType(number_atomic_whole));
    }

    // do real digest
    digestion.setMissedCleavages(missed_cleavages);
    digestion.digest(AASequence::fromString(protein_hit->getSequence()), digestion_products);

    for (std::vector<AASequence>::const_iterator dp_it = digestion_products.begin(); dp_it != digestion_products.end(); ++dp_it)
    {
      if (dp_it->size() < min_peptide_length)
      {
        continue;
      }

      // sum equal peptide's intensities
      // *dp_it -> peptide
      // single lookup; the feature is then accessed through the iterator
      std::map<AASequence, Feature>::iterator gf_it = generated_features.find(*dp_it);

      // If we see this Peptide the first time -> generate corresponding feature
      if (gf_it == generated_features.end())
      {
        PeptideHit pep_hit(1.0, 1, 0, *dp_it);

        PeptideIdentification pep_id;
        pep_id.insertHit(pep_hit);

        // create feature
        Feature f;
        f.getPeptideIdentifications().push_back(pep_id);
        // set intensity to 0 to avoid problems when summing up
        f.setIntensity(0.0);

        // copy all non-intensity meta values
        StringList lkeys;
        protein_hit->getKeys(lkeys);
        for (StringList::iterator key = lkeys.begin(); key != lkeys.end(); ++key)
        {
          if (!key->hasPrefix("intensity"))
          {
            f.setMetaValue(*key, protein_hit->getMetaValue(*key));
          }
        }

        // insert into map
        gf_it = generated_features.insert(std::make_pair(*dp_it, f)).first;
      }
      Feature& feature = gf_it->second;

      // sum up intensity values
      feature.setIntensity(feature.getIntensity() + intensities["intensity"]);

      // ... same for other intensities (iTRAQ...)
      for (Map<String, SimTypes::SimIntensityType>::const_iterator it_other = intensities.begin(); it_other != intensities.end(); ++it_other)
      {
        if (!feature.metaValueExists(it_other->first))
        {
          feature.setMetaValue(it_other->first, it_other->second);
        }
        else
        {
          feature.setMetaValue(it_other->first, SimTypes::SimIntensityType(feature.getMetaValue(it_other->first)) + it_other->second);
        }
      }

      // add current protein accession
      // Only the accession of the current protein can be new. Appending the
      // complete accession set every time (as done before) stored the already
      // known accessions again and again as duplicate evidences.
      std::vector<PeptideIdentification> pep_idents = feature.getPeptideIdentifications();
      std::vector<PeptideHit> pep_hits = pep_idents[0].getHits();
      const std::set<String> known_accessions = pep_hits[0].extractProteinAccessions();
      if (known_accessions.count(protein_hit->getAccession()) == 0)
      {
        PeptideEvidence pe;
        pe.setProteinAccession(protein_hit->getAccession());
        pep_hits[0].addPeptideEvidence(pe);
        pep_idents[0].setHits(pep_hits);
        feature.setPeptideIdentifications(pep_idents);
      }
    }
  }

  // add generated_features to FeatureMap
  for (std::map<AASequence, Feature>::iterator it_gf = generated_features.begin(); it_gf != generated_features.end(); ++it_gf)
  {
    // round up intensity
    (it_gf->second).setIntensity(ceil((it_gf->second).getIntensity()));
    feature_map.push_back(it_gf->second);
  }
}
void ProtXMLFile::startElement(const XMLCh* const /*uri*/, const XMLCh* const /*local_name*/, const XMLCh* const qname, const xercesc::Attributes& attributes) { String tag = sm_.convert(qname); if (tag == "protein_summary_header") { String db = attributeAsString_(attributes, "reference_database"); String enzyme = attributeAsString_(attributes, "sample_enzyme"); ProteinIdentification::SearchParameters sp = prot_id_->getSearchParameters(); sp.db = db; // find a matching enzyme name sp.digestion_enzyme = *(ProteaseDB::getInstance()->getEnzyme(enzyme)); prot_id_->setSearchParameters(sp); prot_id_->setScoreType("ProteinProphet probability"); prot_id_->setHigherScoreBetter(true); pep_id_->setScoreType("ProteinProphet probability"); pep_id_->setHigherScoreBetter(true); } // identifier for Protein & PeptideIdentification // <program_details analysis="proteinprophet" time="2009-11-29T18:30:03" ... if (tag == "program_details") { String analysis = attributeAsString_(attributes, "analysis"); String time = attributeAsString_(attributes, "time"); String version = attributeAsString_(attributes, "version"); QDateTime date = QDateTime::fromString(time.toQString()); if (!date.isValid()) date = QDateTime::fromString(time.toQString(), Qt::ISODate); if (!date.isValid()) LOG_WARN << "Warning: Cannot parse 'time'='" << time << "'.\n"; prot_id_->setDateTime(date); prot_id_->setSearchEngine(analysis); prot_id_->setSearchEngineVersion(version); String id = String(UniqueIdGenerator::getUniqueId()); // was: analysis + "_" + time; prot_id_->setIdentifier(id); pep_id_->setIdentifier(id); } if (tag == "protein_group") { // we group all <protein>'s and <indistinguishable_protein>'s in our // internal group structure protein_group_ = ProteinGroup(); protein_group_.probability = attributeAsDouble_(attributes, "probability"); } else if (tag == "protein") { // usually there will be just one <protein> per <protein_group>, but more // are possible; each <protein> is distinguishable from the other, we 
// nevertheless group them String protein_name = attributeAsString_(attributes, "protein_name"); // open new "indistinguishable" group: prot_id_->insertIndistinguishableProteins(ProteinGroup()); registerProtein_(protein_name); // create new protein // fill protein with life double pc_coverage; if (optionalAttributeAsDouble_(pc_coverage, attributes, "percent_coverage")) { prot_id_->getHits().back().setCoverage(pc_coverage); } else { LOG_WARN << "Required attribute 'percent_coverage' missing\n"; } prot_id_->getHits().back().setScore(attributeAsDouble_(attributes, "probability")); } else if (tag == "indistinguishable_protein") { String protein_name = attributeAsString_(attributes, "protein_name"); // current last protein is from the same "indistinguishable" group: double score = prot_id_->getHits().back().getScore(); registerProtein_(protein_name); // score of group leader might technically not be transferable (due to // protein length etc.), but we still transfer it to allow filtering of // proteins by score without disrupting the groups: prot_id_->getHits().back().setScore(score); } else if (tag == "peptide") { // If a peptide is degenerate it will show in multiple groups, but have different statistics (e.g. 'nsp_adjusted_probability') // We thus treat each instance as a separate peptide // todo/improvement: link them by a group in PeptideIdentification?! 
pep_hit_ = new PeptideHit; pep_hit_->setSequence(AASequence::fromString(String(attributeAsString_(attributes, "peptide_sequence")))); pep_hit_->setScore(attributeAsDouble_(attributes, "nsp_adjusted_probability")); Int charge; if (optionalAttributeAsInt_(charge, attributes, "charge")) { pep_hit_->setCharge(charge); } else { LOG_WARN << "Required attribute 'charge' missing\n"; } // add accessions of all indistinguishable proteins the peptide belongs to ProteinIdentification::ProteinGroup& indist = prot_id_->getIndistinguishableProteins().back(); for (StringList::const_iterator accession = indist.accessions.begin(); accession != indist.accessions.end(); ++accession) { PeptideEvidence pe; pe.setProteinAccession(*accession); pep_hit_->addPeptideEvidence(pe); } pep_hit_->setMetaValue("is_unique", String(attributeAsString_(attributes, "is_nondegenerate_evidence")) == "Y" ? 1 : 0); pep_hit_->setMetaValue("is_contributing", String(attributeAsString_(attributes, "is_contributing_evidence")) == "Y" ? 1 : 0); } else if (tag == "mod_aminoacid_mass") { // relates to the last seen peptide (we hope) Size position = attributeAsInt_(attributes, "position"); double mass = attributeAsDouble_(attributes, "mass"); AASequence temp_aa_sequence(pep_hit_->getSequence()); String temp_description = ""; String origin = temp_aa_sequence[position - 1].getOneLetterCode(); matchModification_(mass, origin, temp_description); if (temp_description.size() > 0) // only if a mod was found { // e.g. 
Carboxymethyl (C) vector<String> mod_split; temp_description.split(' ', mod_split); if (mod_split.size() == 2) { if (mod_split[1] == "(C-term)" || ModificationsDB::getInstance()->getModification(temp_description).getTermSpecificity() == ResidueModification::C_TERM) { temp_aa_sequence.setCTerminalModification(mod_split[0]); } else { if (mod_split[1] == "(N-term)" || ModificationsDB::getInstance()->getModification(temp_description).getTermSpecificity() == ResidueModification::N_TERM) { temp_aa_sequence.setNTerminalModification(mod_split[0]); } else { // search this mod, if not directly use a general one temp_aa_sequence.setModification(position - 1, mod_split[0]); } } } else { error(LOAD, String("Cannot parse modification '") + temp_description + "@" + position + "'"); } } else { error(LOAD, String("Cannot find modification '") + String(mass) + " " + String(origin) + "' @" + String(position)); } pep_hit_->setSequence(temp_aa_sequence); } }