// Appends a simplified fragment spectrum of 'sequence' to 'spec' and sorts
// 'spec' by position.
//
// Two running sums are maintained: b_pos starts at 'prefix' and accumulates
// aa_to_weight_ of the residues read from the front of the sequence; y_pos
// starts at the monoisotopic weight of H2O plus 'suffix' and accumulates the
// residues read from the back. For every step, a peak of intensity 1.0 is
// added at (sum + Constants::PROTON_MASS_U) if the sum itself lies strictly
// between min_mz_ and max_mz_.
//
// Only sequence.size() - 1 steps are performed, i.e. the last residue is never
// added to b_pos and the first residue is never added to y_pos.
void CompNovoIdentificationBase::getCIDSpectrumLight_(PeakSpectrum & spec, const String & sequence, DoubleReal prefix, DoubleReal suffix)
{
  static DoubleReal h2o_mass = EmpiricalFormula("H2O").getMonoWeight();
  Peak1D p;
  DoubleReal b_pos(0.0 + prefix);
  DoubleReal y_pos(h2o_mass + suffix);

  // 'i + 1 < size()' instead of 'i != size() - 1': the latter wraps around for
  // an empty sequence (unsigned arithmetic) and would read out of bounds.
  for (Size i = 0; i + 1 < sequence.size(); ++i)
  {
    char aa(sequence[i]);
    b_pos += aa_to_weight_[aa];

    char aa2(sequence[sequence.size() - i - 1]);
    y_pos += aa_to_weight_[aa2];

    // the range check is done on the sum before the proton mass is added
    if (b_pos > min_mz_ && b_pos < max_mz_)
    {
      p.setPosition(b_pos + Constants::PROTON_MASS_U);
      p.setIntensity(1.0f);
      spec.push_back(p);
    }

    if (y_pos > min_mz_ && y_pos < max_mz_)
    {
      p.setPosition(y_pos + Constants::PROTON_MASS_U);
      p.setIntensity(1.0f);
      spec.push_back(p);
    }
  }

  spec.sortByPosition();
  return;
}
// Thins out 'spec' with a sliding window.
//
// For every start peak, all following peaks whose position differs from the
// start peak by less than 'windowsize' are collected into a window. The window
// is sorted with sortByIntensity(true) (presumably descending intensity --
// confirm against PeakSpectrum) and every element from index 'no_peaks' onwards
// is marked for removal. Scanning stops after the first window that reaches
// the last peak of the spectrum. Finally the peak list of 'spec' is rebuilt
// from all peaks that were never marked, and sorted by position.
void CompNovoIdentificationBase::windowMower_(PeakSpectrum & spec, DoubleReal windowsize, Size no_peaks)
{
  // untouched snapshot used to rebuild the spectrum at the end
  PeakSpectrum original(spec);
  vector<Peak1D> rejected;

  for (Size start = 0; start < spec.size(); ++start)
  {
    PeakSpectrum window;
    bool reached_end = false;

    // grow the window to the right while the distance to the start peak is small enough
    Size idx = start;
    while (spec[idx].getPosition()[0] - spec[start].getPosition()[0] < windowsize)
    {
      window.push_back(spec[idx]);
      ++idx;
      if (idx == spec.size())
      {
        reached_end = true;
        break;
      }
    }

    window.sortByIntensity(true);

    // everything beyond the first 'no_peaks' entries of the sorted window is dropped
    for (Size rank = no_peaks; rank < window.size(); ++rank)
    {
      Peak1D surplus(window[rank]);
      rejected.push_back(surplus);
    }

    if (reached_end)
    {
      break;
    }
  }

  // clear(false) is called as in the original; presumably it keeps the meta data -- confirm
  spec.clear(false);
  for (PeakSpectrum::ConstIterator peak_it = original.begin(); peak_it != original.end(); ++peak_it)
  {
    const bool is_rejected = find(rejected.begin(), rejected.end(), *peak_it) != rejected.end();
    if (!is_rejected)
    {
      spec.push_back(*peak_it);
    }
  }
  spec.sortByPosition();
}
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- StringList in_spec = getStringList_("in"); StringList out = getStringList_("out"); String in_lib = getStringOption_("lib"); String compare_function = getStringOption_("compare_function"); Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer"); float precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance"); //Int min_precursor_charge = getIntOption_("min_precursor_charge"); //Int max_precursor_charge = getIntOption_("max_precursor_charge"); float remove_peaks_below_threshold = getDoubleOption_("filter:remove_peaks_below_threshold"); UInt min_peaks = getIntOption_("filter:min_peaks"); UInt max_peaks = getIntOption_("filter:max_peaks"); Int cut_peaks_below = getIntOption_("filter:cut_peaks_below"); StringList fixed_modifications = getStringList_("fixed_modifications"); StringList variable_modifications = getStringList_("variable_modifications"); Int top_hits = getIntOption_("top_hits"); if (top_hits < -1) { writeLog_("top_hits (should be >= -1 )"); return ILLEGAL_PARAMETERS; } //------------------------------------------------------------- // loading input //------------------------------------------------------------- if (out.size() != in_spec.size()) { writeLog_("out (should be as many as input files)"); return ILLEGAL_PARAMETERS; } time_t prog_time = time(NULL); MSPFile spectral_library; RichPeakMap query, library; //spectrum which will be identified MzMLFile spectra; spectra.setLogType(log_type_); time_t start_build_time = time(NULL); //------------------------------------------------------------- //building map for faster search //------------------------------------------------------------- //library containing already identified peptide spectra vector<PeptideIdentification> ids; spectral_library.load(in_lib, ids, library); map<Size, 
vector<PeakSpectrum> > MSLibrary; { RichPeakMap::iterator s; vector<PeptideIdentification>::iterator i; ModificationsDB* mdb = ModificationsDB::getInstance(); for (s = library.begin(), i = ids.begin(); s < library.end(); ++s, ++i) { double precursor_MZ = (*s).getPrecursors()[0].getMZ(); Size MZ_multi = (Size)precursor_MZ * precursor_mass_multiplier; map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(MZ_multi); PeakSpectrum librar; bool variable_modifications_ok = true; bool fixed_modifications_ok = true; const AASequence& aaseq = i->getHits()[0].getSequence(); //variable fixed modifications if (!fixed_modifications.empty()) { for (Size i = 0; i < aaseq.size(); ++i) { const Residue& mod = aaseq.getResidue(i); for (Size s = 0; s < fixed_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification()) { fixed_modifications_ok = false; break; } } } } //variable modifications if (aaseq.isModified() && (!variable_modifications.empty())) { for (Size i = 0; i < aaseq.size(); ++i) { if (aaseq.isModified(i)) { const Residue& mod = aaseq.getResidue(i); for (Size s = 0; s < variable_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification()) { variable_modifications_ok = false; break; } } } } } if (variable_modifications_ok && fixed_modifications_ok) { PeptideIdentification& translocate_pid = *i; librar.getPeptideIdentifications().push_back(translocate_pid); librar.setPrecursors(s->getPrecursors()); //library entry transformation for (UInt l = 0; l < s->size(); ++l) { Peak1D peak; if ((*s)[l].getIntensity() > remove_peaks_below_threshold) { const String& info = (*s)[l].getMetaValue("MSPPeakInfo"); if (info[0] == '?') { peak.setIntensity(sqrt(0.2 * (*s)[l].getIntensity())); } else { peak.setIntensity(sqrt((*s)[l].getIntensity())); } 
peak.setMZ((*s)[l].getMZ()); peak.setPosition((*s)[l].getPosition()); librar.push_back(peak); } } if (found != MSLibrary.end()) { found->second.push_back(librar); } else { vector<PeakSpectrum> tmp; tmp.push_back(librar); MSLibrary.insert(make_pair(MZ_multi, tmp)); } } } } time_t end_build_time = time(NULL); cout << "Time needed for preprocessing data: " << (end_build_time - start_build_time) << "\n"; //compare function PeakSpectrumCompareFunctor* comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function); //------------------------------------------------------------- // calculations //------------------------------------------------------------- double score; StringList::iterator in, out_file; for (in = in_spec.begin(), out_file = out.begin(); in < in_spec.end(); ++in, ++out_file) { time_t start_time = time(NULL); spectra.load(*in, query); //Will hold valuable hits vector<PeptideIdentification> peptide_ids; vector<ProteinIdentification> protein_ids; // Write parameters to ProteinIdentifcation ProteinIdentification prot_id; //Parameters of identificaion prot_id.setIdentifier("test"); prot_id.setSearchEngineVersion("SpecLibSearcher"); prot_id.setDateTime(DateTime::now()); prot_id.setScoreType(compare_function); ProteinIdentification::SearchParameters searchparam; searchparam.precursor_tolerance = precursor_mass_tolerance; prot_id.setSearchParameters(searchparam); /***********SEARCH**********/ for (UInt j = 0; j < query.size(); ++j) { //Set identifier for each identifications PeptideIdentification pid; pid.setIdentifier("test"); pid.setScoreType(compare_function); ProteinHit pr_hit; pr_hit.setAccession(j); prot_id.insertHit(pr_hit); //RichPeak1D to Peak1D transformation for the compare function query PeakSpectrum quer; bool peak_ok = true; query[j].sortByIntensity(true); double min_high_intensity = 0; if (query[j].empty() || query[j].getMSLevel() != 2) { continue; } if (query[j].getPrecursors().empty()) { writeLog_("Warning MS2 spectrum without 
precursor information"); continue; } min_high_intensity = (1 / cut_peaks_below) * query[j][0].getIntensity(); query[j].sortByPosition(); for (UInt k = 0; k < query[j].size() && k < max_peaks; ++k) { if (query[j][k].getIntensity() > remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity) { Peak1D peak; peak.setIntensity(sqrt(query[j][k].getIntensity())); peak.setMZ(query[j][k].getMZ()); peak.setPosition(query[j][k].getPosition()); quer.push_back(peak); } } if (quer.size() >= min_peaks) { peak_ok = true; } else { peak_ok = false; } double query_MZ = query[j].getPrecursors()[0].getMZ(); if (peak_ok) { bool charge_one = false; Int percent = (Int) Math::round((query[j].size() / 100.0) * 3.0); Int margin = (Int) Math::round((query[j].size() / 100.0) * 1.0); for (vector<RichPeak1D>::iterator peak = query[j].end() - 1; percent >= 0; --peak, --percent) { if (peak->getMZ() < query_MZ) { break; } } if (percent > margin) { charge_one = true; } float min_MZ = (query_MZ - precursor_mass_tolerance) * precursor_mass_multiplier; float max_MZ = (query_MZ + precursor_mass_tolerance) * precursor_mass_multiplier; for (Size mz = (Size)min_MZ; mz <= ((Size)max_MZ) + 1; ++mz) { map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(mz); if (found != MSLibrary.end()) { vector<PeakSpectrum>& library = found->second; for (Size i = 0; i < library.size(); ++i) { float this_MZ = library[i].getPrecursors()[0].getMZ() * precursor_mass_multiplier; if (this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) || charge_one == false)) { PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0]; PeakSpectrum& librar = library[i]; //Special treatment for SpectraST score as it computes a score based on the whole library if (compare_function == "SpectraSTSimilarityScore") { SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor); 
BinnedSpectrum quer_bin = sp->transform(quer); BinnedSpectrum librar_bin = sp->transform(librar); score = (*sp)(quer, librar); //(*sp)(quer_bin,librar_bin); double dot_bias = sp->dot_bias(quer_bin, librar_bin, score); hit.setMetaValue("DOTBIAS", dot_bias); } else { score = (*comparor)(quer, librar); } DataValue RT(library[i].getRT()); DataValue MZ(library[i].getPrecursors()[0].getMZ()); hit.setMetaValue("RT", RT); hit.setMetaValue("MZ", MZ); hit.setScore(score); PeptideEvidence pe; pe.setProteinAccession(pr_hit.getAccession()); hit.addPeptideEvidence(pe); pid.insertHit(hit); } } } } } pid.setHigherScoreBetter(true); pid.sort(); if (compare_function == "SpectraSTSimilarityScore") { if (!pid.empty() && !pid.getHits().empty()) { vector<PeptideHit> final_hits; final_hits.resize(pid.getHits().size()); SpectraSTSimilarityScore* sp = static_cast<SpectraSTSimilarityScore*>(comparor); Size runner_up = 1; for (; runner_up < pid.getHits().size(); ++runner_up) { if (pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString() || runner_up > 5) { break; } } double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore()); for (Size s = 0; s < pid.getHits().size(); ++s) { final_hits[s] = pid.getHits()[s]; final_hits[s].setMetaValue("delta D", delta_D); final_hits[s].setMetaValue("dot product", pid.getHits()[s].getScore()); final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(), delta_D, pid.getHits()[s].getMetaValue("DOTBIAS"))); //final_hits[s].removeMetaValue("DOTBIAS"); } pid.setHits(final_hits); pid.sort(); pid.setMZ(query[j].getPrecursors()[0].getMZ()); pid.setRT(query_MZ); } } if (top_hits != -1 && (UInt)top_hits < pid.getHits().size()) { vector<PeptideHit> hits; hits.resize(top_hits); for (Size i = 0; i < (UInt)top_hits; ++i) { hits[i] = pid.getHits()[i]; } pid.setHits(hits); } peptide_ids.push_back(pid); } protein_ids.push_back(prot_id); 
//------------------------------------------------------------- // writing output //------------------------------------------------------------- IdXMLFile id_xml_file; id_xml_file.store(*out_file, protein_ids, peptide_ids); time_t end_time = time(NULL); cout << "Search time: " << difftime(end_time, start_time) << " seconds for " << *in << "\n"; } time_t end_time = time(NULL); cout << "Total time: " << difftime(end_time, prog_time) << " secconds\n"; return EXECUTION_OK; }
void CompNovoIdentificationBase::getCIDSpectrum_(PeakSpectrum & spec, const String & sequence, Size charge, DoubleReal prefix, DoubleReal suffix) { static DoubleReal h2o_mass = EmpiricalFormula("H2O").getMonoWeight(); static DoubleReal nh3_mass = EmpiricalFormula("NH3").getMonoWeight(); static DoubleReal co_mass = EmpiricalFormula("CO").getMonoWeight(); Peak1D p; DoubleReal b_pos(0 + prefix); DoubleReal y_pos(h2o_mass + suffix); bool b_H2O_loss(false), b_NH3_loss(false), y_NH3_loss(false); for (Size i = 0; i != sequence.size() - 1; ++i) { char aa(sequence[i]); b_pos += aa_to_weight_[aa]; char aa2(sequence[sequence.size() - i - 1]); y_pos += aa_to_weight_[aa2]; for (Size z = 1; z <= charge && z < 3; ++z) { // b-ions if (b_pos >= min_mz_ && b_pos <= max_mz_) { for (Size j = 0; j != max_isotope_; ++j) { if (z == 1 /*|| b_pos > MIN_DOUBLE_MZ*/) { p.setPosition((b_pos + (DoubleReal)z * Constants::PROTON_MASS_U + (DoubleReal)j + Constants::NEUTRON_MASS_U) / (DoubleReal)z); p.setIntensity(isotope_distributions_[(Size)b_pos][j] * 0.8 / (z * z)); spec.push_back(p); } } } // b-ion losses if (b_pos - h2o_mass > min_mz_ && b_pos - h2o_mass < max_mz_) { if (b_H2O_loss || aa == 'S' || aa == 'T' || aa == 'E' || aa == 'D') { b_H2O_loss = true; p.setPosition((b_pos + z * Constants::PROTON_MASS_U - h2o_mass) / z); p.setIntensity(0.02 / (DoubleReal)(z * z)); if (z == 1 /* || b_pos > MIN_DOUBLE_MZ*/) { spec.push_back(p); } } if (b_NH3_loss || aa == 'Q' || aa == 'N' || aa == 'R' || aa == 'K') { b_NH3_loss = true; p.setPosition((b_pos + z * Constants::PROTON_MASS_U - nh3_mass) / z); p.setIntensity(0.02 / (DoubleReal)(z * z)); if (z == 1 /* || b_pos > MIN_DOUBLE_MZ*/) { spec.push_back(p); } } } // a-ions only for charge 1 if (z == 1) { if (b_pos - co_mass > min_mz_ && b_pos - co_mass < max_mz_) { // a-ions p.setPosition((b_pos + z * Constants::PROTON_MASS_U - co_mass) / (DoubleReal)z); p.setIntensity(0.1f); spec.push_back(p); } } if (y_pos > min_mz_ && y_pos < max_mz_) { // y-ions for 
(Size j = 0; j != max_isotope_; ++j) { if (z == 1 /* || y_pos > MIN_DOUBLE_MZ*/) { p.setPosition((y_pos + (DoubleReal)z * Constants::PROTON_MASS_U + (DoubleReal)j * Constants::NEUTRON_MASS_U) / (DoubleReal)z); p.setIntensity(isotope_distributions_[(Size)y_pos][j] / (DoubleReal) (z * z)); spec.push_back(p); } } // H2O loss p.setPosition((y_pos + z * Constants::PROTON_MASS_U - h2o_mass) / (DoubleReal)z); p.setIntensity(0.1 / (DoubleReal)(z * z)); if (aa2 == 'Q') // pyroglutamic acid formation { p.setIntensity(0.5f); } if (z == 1 /* || y_pos > MIN_DOUBLE_MZ*/) { spec.push_back(p); } // NH3 loss if (y_NH3_loss || aa2 == 'Q' || aa2 == 'N' || aa2 == 'R' || aa2 == 'K') { y_NH3_loss = true; p.setPosition((y_pos + z * Constants::PROTON_MASS_U - nh3_mass) / (DoubleReal)z); p.setIntensity(0.1 / (DoubleReal)(z * z)); if (z == 1 /*|| y_pos > MIN_DOUBLE_MZ*/) { spec.push_back(p); } } } } } // if Q1 abundant loss of water -> pyroglutamic acid formation if (sequence[0] == 'Q' && prefix == 0 && suffix == 0) { /* for (PeakSpectrum::Iterator it = spec.begin(); it != spec.end(); ++it) { it->setIntensity(it->getIntensity() * 0.5); }*/ /* for (Size j = 0; j != max_isotope; ++j) { p.setPosition((precursor_weight + charge - 1 + j)/(DoubleReal)charge); p.setIntensity(isotope_distributions_[(Int)p.getPosition()[0]][j] * 0.1); spec.push_back(p); } */ } spec.sortByPosition(); return; }