/**
  @brief Runs the CID de novo identification on every spectrum of @p exp.

  For each spectrum of the map, in order, one PeptideIdentification is created,
  annotated with the meta values "RT" and (if available) "MZ", filled by
  getIdentification() and appended to @p pep_ids.

  @param pep_ids Output vector; one identification per spectrum is appended.
  @param exp     Spectra to identify.
*/
void CompNovoIdentificationCID::getIdentifications(vector<PeptideIdentification> & pep_ids, const PeakMap & exp)
{
  for (PeakMap::ConstIterator it = exp.begin(); it != exp.end(); ++it)
  {
    PeptideIdentification id;
    // TODO check if both CID and ETD is present;
    PeakSpectrum CID_spec(*it);

    id.setMetaValue("RT", it->getRT());
    // A spectrum may carry no precursor; dereferencing begin() of an empty
    // precursor list would be undefined behaviour, so "MZ" is only set when
    // a precursor exists.
    // NOTE(review): getIdentification() is still called for such spectra --
    // confirm that it tolerates a missing precursor.
    if (!it->getPrecursors().empty())
    {
      id.setMetaValue("MZ", it->getPrecursors().begin()->getMZ());
    }

    // the caches are reset before each spectrum is processed
    subspec_to_sequences_.clear();
    permute_cache_.clear();
    decomp_cache_.clear();

    getIdentification(id, CID_spec);
    pep_ids.push_back(id);
  }
}
/**
  @brief Tool entry point: identifies MS2 spectra with PILIS and writes an idXML file.

  Steps, in order:
  - load the spectra from the "in" mzML file,
  - read the PILIS model ("model_file") and override its parameters with the "model:*" options,
  - fetch candidate peptides from the suffix array ("peptide_db_file") for all precursor
    weights (charges 1 to 3),
  - run PILISIdentification on each MS level 2 spectrum (MS level 0 is treated as 2),
  - optionally rescore with PILISScoring (unless "scoring:do_not_use_evalue_scoring" is set),
  - truncate each hit list to "max_candidates" and store everything to "out".

  Spectra without precursor information are skipped with a warning, because the
  precursor m/z is required to select candidates.

  @return EXECUTION_OK on completion.
*/
ExitCodes main_(int, const char **)
{
  typedef vector<pair<pair<String, String>, String> > CandidateList;

  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------

  //input/output files
  String in(getStringOption_("in"));
  String out(getStringOption_("out"));

  //-------------------------------------------------------------
  // loading input
  //-------------------------------------------------------------

  RichPeakMap exp;
  MzMLFile f;
  f.setLogType(log_type_);
  f.load(in, exp);

  writeDebug_("Data set contains " + String(exp.size()) + " spectra", 1);

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  writeDebug_("Reading model file", 2);

  // create model and set the given options; the model lives on the stack so it
  // is released on every exit path (including exceptions). It is declared before
  // the PILISIdentification object that points to it and therefore outlives it.
  PILISModel model;
  model.readFromFile(getStringOption_("model_file"));
  Param model_param(model.getParameters());
  model_param.setValue("upper_mz", getDoubleOption_("model:upper_mz"));
  model_param.setValue("lower_mz", getDoubleOption_("model:lower_mz"));
  model_param.setValue("charge_directed_threshold", getDoubleOption_("model:charge_directed_threshold"));
  model_param.setValue("charge_remote_threshold", getDoubleOption_("model:charge_remote_threshold"));
  //model_param.setValue("min_main_ion_intensity", getDoubleOption_("model:min_main_ion_intensity"));
  //model_param.setValue("min_loss_ion_intensity", getDoubleOption_("model:min_loss_ion_intensity"));
  model_param.setValue("min_y_ion_intensity", getDoubleOption_("model:min_y_ion_intensity"));
  model_param.setValue("min_b_ion_intensity", getDoubleOption_("model:min_b_ion_intensity"));
  model_param.setValue("min_a_ion_intensity", getDoubleOption_("model:min_a_ion_intensity"));
  model_param.setValue("min_y_loss_intensity", getDoubleOption_("model:min_y_loss_intensity"));
  model_param.setValue("min_b_loss_intensity", getDoubleOption_("model:min_b_loss_intensity"));
  model_param.setValue("charge_loss_factor", getDoubleOption_("model:charge_loss_factor"));
  model_param.setValue("visible_model_depth", getIntOption_("model:visible_model_depth"));
  model_param.setValue("model_depth", getIntOption_("model:model_depth"));
  model_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
  model.setParameters(model_param);

  //exp.resize(50); // TODO

  UInt max_charge(3), min_charge(1);   // TODO

  // Collect the candidate weights of all spectra that have a precursor.
  // Spectra without precursor (e.g. survey scans) are skipped; indexing
  // getPrecursors()[0] on them would be undefined behaviour.
  vector<double> pre_weights;
  for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it)
  {
    if (it->getPrecursors().empty())
    {
      continue;
    }
    double pre_weight(it->getPrecursors()[0].getMZ());
    for (Size z = min_charge; z <= max_charge; ++z)
    {
      pre_weights.push_back((pre_weight * (double)z) - (double)z);
    }
  }

  sort(pre_weights.begin(), pre_weights.end());

  vector<CandidateList> candidates;
  {
    writeDebug_("Reading sequence db", 2);

    // create sequence db; scoped so that it is released as soon as the
    // candidates have been fetched, also if an exception is thrown
    SuffixArrayPeptideFinder sapf(getStringOption_("peptide_db_file"), "trypticCompressed");
    sapf.setTolerance(getDoubleOption_("precursor_mass_tolerance"));
    sapf.setNumberOfModifications(0);
    sapf.setUseTags(false);

    cerr << "Getting candidates from SA...";
    sapf.getCandidates(candidates, pre_weights);
    cerr << "done" << endl;
  }

  // index the candidate lists by the weight they were requested for
  map<double, CandidateList> sorted_candidates;
  for (Size c = 0; c != candidates.size() && c != pre_weights.size(); ++c)
  {
    sorted_candidates[pre_weights[c]] = candidates[c];
  }
  candidates.clear();

  // create ProteinIdentification and set the options
  PILISIdentification PILIS_id;

  PILIS_id.setModel(&model);

  Param id_param(PILIS_id.getParameters());
  id_param.setValue("precursor_mass_tolerance", getDoubleOption_("precursor_mass_tolerance"));
  id_param.setValue("max_candidates", getIntOption_("max_pre_candidates"));
  // disable evalue scoring, this is done separately to allow for a single id per spectrum
  id_param.setValue("use_evalue_scoring", 0);
  id_param.setValue("fixed_modifications", getStringOption_("fixed_modifications"));
  PILIS_id.setParameters(id_param);

  vector<PeptideIdentification> ids;

  // perform the ProteinIdentification of the given spectra
  UInt no(0);
  for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it, ++no)
  {
    if (it->getMSLevel() == 0)
    {
      writeLog_("Warning: MSLevel is 0, assuming MSLevel 2");
      it->setMSLevel(2);
    }

    if (it->getMSLevel() != 2)
    {
      continue;
    }

    if (it->getPrecursors().empty())
    {
      writeLog_("Warning: MS2 spectrum without precursor information, skipping");
      continue;
    }

    writeDebug_(String(no) + "/" + String(exp.size()), 1);
    PeptideIdentification id;

    // sequence -> charge of the candidates that pass the filters below
    map<String, UInt> cand;

    for (UInt z = min_charge; z <= max_charge; ++z)
    {
      // must be computed exactly like the weights collected above, as it is
      // used as (floating point) key into sorted_candidates
      double pre_weight = (it->getPrecursors()[0].getMZ() * (double)z) - (double)z;

      // find() instead of operator[] so that lookups do not insert empty entries
      map<double, CandidateList>::const_iterator found = sorted_candidates.find(pre_weight);
      if (found == sorted_candidates.end())
      {
        continue;
      }

      for (CandidateList::const_iterator cit = found->second.begin(); cit != found->second.end(); ++cit)
      {
        String seq = cit->first.second;
        if (seq.size() > 39)
        {
          continue;
        }

        // count K/R not followed by P, ignoring the last residue
        UInt num_cleavages_sites(0);
        for (Size k = 0; k + 1 < seq.size(); ++k)
        {
          if ((seq[k] == 'K' || seq[k] == 'R') && seq[k + 1] != 'P')
          {
            ++num_cleavages_sites;
          }
        }

        if (num_cleavages_sites > 1)
        {
          continue;
        }

        cand[seq] = z;
      }
    }

    cerr << "#cand=" << cand.size() << endl;
    PILIS_id.getIdentification(cand, id, *it);

    id.setMetaValue("RT", it->getRT());
    id.setMetaValue("MZ", it->getPrecursors()[0].getMZ());

    ids.push_back(id);

    if (!id.getHits().empty())
    {
      cerr << it->getPrecursors()[0].getMZ() << " " << AASequence(id.getHits().begin()->getSequence()).getAverageWeight() << endl;
      writeDebug_(id.getHits().begin()->getSequence().toString() + " (z=" + id.getHits().begin()->getCharge() + "), score=" + String(id.getHits().begin()->getScore()), 10);
    }
  }

  // perform the PILIS scoring to the spectra
  if (!getFlag_("scoring:do_not_use_evalue_scoring"))
  {
    PILISScoring scoring;
    Param scoring_param(scoring.getParameters());
    scoring_param.setValue("use_local_scoring", (int)getFlag_("scoring:use_local_scoring"));
    scoring_param.setValue("survival_function_bin_size", getIntOption_("scoring:survival_function_bin_size"));
    scoring_param.setValue("global_linear_fitting_threshold", getDoubleOption_("scoring:global_linear_fitting_threshold"));
    scoring_param.setValue("local_linear_fitting_threshold", getDoubleOption_("scoring:local_linear_fitting_threshold"));
    scoring.setParameters(scoring_param);

    scoring.getScores(ids);
  }

  // write the result to the IdentificationData structure for the storing
  UInt max_candidates = getIntOption_("max_candidates");
  for (Size i = 0; i != ids.size(); ++i)
  {
    if (ids[i].getHits().size() > max_candidates)
    {
      vector<PeptideHit> hits = ids[i].getHits();
      hits.resize(max_candidates);
      ids[i].setHits(hits);
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------

  DateTime now;
  now.now();
  String date_string;
  //now.get(date_string); // @todo Fix it (Andreas)
  String identifier("PILIS_" + date_string);

  // Walk the spectra with the same selection rule as the identification loop
  // (MS level 2 with precursor) so that ids[id_index] belongs to *it.
  Size id_index(0);
  for (RichPeakMap::ConstIterator it = exp.begin(); it != exp.end() && id_index < ids.size(); ++it)
  {
    if (it->getMSLevel() != 2 || it->getPrecursors().empty())
    {
      continue;
    }
    ids[id_index].setMetaValue("RT", it->getRT());
    ids[id_index].setMetaValue("MZ", it->getPrecursors()[0].getMZ());

    ids[id_index].setIdentifier(identifier);
    ids[id_index].setHigherScoreBetter(false);
    ++id_index;
  }

  // search parameters
  ProteinIdentification::SearchParameters search_parameters;
  search_parameters.db = getStringOption_("peptide_db_file");
  search_parameters.db_version = "";
  search_parameters.taxonomy = "";
  //search_parameters.charges = getStringOption_("charges");
  search_parameters.mass_type = ProteinIdentification::MONOISOTOPIC;
  vector<String> fixed_mods;
  getStringOption_("fixed_modifications").split(',', fixed_mods);
  search_parameters.fixed_modifications = fixed_mods;
  search_parameters.enzyme = ProteinIdentification::TRYPSIN;
  search_parameters.missed_cleavages = 1;
  search_parameters.peak_mass_tolerance = getDoubleOption_("peak_mass_tolerance");
  search_parameters.precursor_tolerance = getDoubleOption_("precursor_mass_tolerance");

  ProteinIdentification protein_identification;
  protein_identification.setDateTime(now);
  protein_identification.setSearchEngine("PILIS");
  protein_identification.setSearchEngineVersion("beta");
  protein_identification.setSearchParameters(search_parameters);
  protein_identification.setIdentifier(identifier);

  vector<ProteinIdentification> protein_identifications;
  protein_identifications.push_back(protein_identification);
  IdXMLFile().store(out, protein_identifications, ids);

  return EXECUTION_OK;
}
ExitCodes main_(int, const char **) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- StringList in_spec = getStringList_("in"); StringList out = getStringList_("out"); String in_lib = getStringOption_("lib"); String compare_function = getStringOption_("compare_function"); Int precursor_mass_multiplier = getIntOption_("round_precursor_to_integer"); Real precursor_mass_tolerance = getDoubleOption_("precursor_mass_tolerance"); //Int min_precursor_charge = getIntOption_("min_precursor_charge"); //Int max_precursor_charge = getIntOption_("max_precursor_charge"); Real remove_peaks_below_threshold = getDoubleOption_("filter:remove_peaks_below_threshold"); UInt min_peaks = getIntOption_("filter:min_peaks"); UInt max_peaks = getIntOption_("filter:max_peaks"); Int cut_peaks_below = getIntOption_("filter:cut_peaks_below"); StringList fixed_modifications = getStringList_("fixed_modifications"); StringList variable_modifications = getStringList_("variable_modifications"); Int top_hits = getIntOption_("top_hits"); if (top_hits < -1) { writeLog_("top_hits (should be >= -1 )"); return ILLEGAL_PARAMETERS; } //------------------------------------------------------------- // loading input //------------------------------------------------------------- if (out.size() != in_spec.size()) { writeLog_("out (should be as many as input files)"); return ILLEGAL_PARAMETERS; } time_t prog_time = time(NULL); MSPFile spectral_library; RichPeakMap query, library; //spectrum which will be identified MzMLFile spectra; spectra.setLogType(log_type_); time_t start_build_time = time(NULL); //------------------------------------------------------------- //building map for faster search //------------------------------------------------------------- //library containing already identified peptide spectra vector<PeptideIdentification> ids; spectral_library.load(in_lib, ids, library); map<Size, 
vector<PeakSpectrum> > MSLibrary; { RichPeakMap::iterator s; vector<PeptideIdentification>::iterator i; ModificationsDB * mdb = ModificationsDB::getInstance(); for (s = library.begin(), i = ids.begin(); s < library.end(); ++s, ++i) { DoubleReal precursor_MZ = (*s).getPrecursors()[0].getMZ(); Size MZ_multi = (Size)precursor_MZ * precursor_mass_multiplier; map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(MZ_multi); PeakSpectrum librar; bool variable_modifications_ok = true; bool fixed_modifications_ok = true; const AASequence & aaseq = i->getHits()[0].getSequence(); //variable fixed modifications if (!fixed_modifications.empty()) { for (Size i = 0; i < aaseq.size(); ++i) { const Residue & mod = aaseq.getResidue(i); for (Size s = 0; s < fixed_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(fixed_modifications[s]).getOrigin() && fixed_modifications[s] != mod.getModification()) { fixed_modifications_ok = false; break; } } } } //variable modifications if (aaseq.isModified() && (!variable_modifications.empty())) { for (Size i = 0; i < aaseq.size(); ++i) { if (aaseq.isModified(i)) { const Residue & mod = aaseq.getResidue(i); for (Size s = 0; s < variable_modifications.size(); ++s) { if (mod.getOneLetterCode() == mdb->getModification(variable_modifications[s]).getOrigin() && variable_modifications[s] != mod.getModification()) { variable_modifications_ok = false; break; } } } } } if (variable_modifications_ok && fixed_modifications_ok) { PeptideIdentification & translocate_pid = *i; librar.getPeptideIdentifications().push_back(translocate_pid); librar.setPrecursors(s->getPrecursors()); //library entry transformation for (UInt l = 0; l < s->size(); ++l) { Peak1D peak; if ((*s)[l].getIntensity() > remove_peaks_below_threshold) { const String & info = (*s)[l].getMetaValue("MSPPeakInfo"); if (info[0] == '?') { peak.setIntensity(sqrt(0.2 * (*s)[l].getIntensity())); } else { peak.setIntensity(sqrt((*s)[l].getIntensity())); } 
peak.setMZ((*s)[l].getMZ()); peak.setPosition((*s)[l].getPosition()); librar.push_back(peak); } } if (found != MSLibrary.end()) { found->second.push_back(librar); } else { vector<PeakSpectrum> tmp; tmp.push_back(librar); MSLibrary.insert(make_pair(MZ_multi, tmp)); } } } } time_t end_build_time = time(NULL); cout << "Time needed for preprocessing data: " << (end_build_time - start_build_time) << "\n"; //compare function PeakSpectrumCompareFunctor * comparor = Factory<PeakSpectrumCompareFunctor>::create(compare_function); //------------------------------------------------------------- // calculations //------------------------------------------------------------- DoubleReal score; StringList::iterator in, out_file; for (in = in_spec.begin(), out_file = out.begin(); in < in_spec.end(); ++in, ++out_file) { time_t start_time = time(NULL); spectra.load(*in, query); //Will hold valuable hits vector<PeptideIdentification> peptide_ids; vector<ProteinIdentification> protein_ids; // Write parameters to ProteinIdentifcation ProteinIdentification prot_id; //Parameters of identificaion prot_id.setIdentifier("test"); prot_id.setSearchEngineVersion("SpecLibSearcher"); prot_id.setDateTime(DateTime::now()); prot_id.setScoreType(compare_function); ProteinIdentification::SearchParameters searchparam; searchparam.precursor_tolerance = precursor_mass_tolerance; prot_id.setSearchParameters(searchparam); /***********SEARCH**********/ for (UInt j = 0; j < query.size(); ++j) { //Set identifier for each identifications PeptideIdentification pid; pid.setIdentifier("test"); pid.setScoreType(compare_function); ProteinHit pr_hit; pr_hit.setAccession(j); prot_id.insertHit(pr_hit); //RichPeak1D to Peak1D transformation for the compare function query PeakSpectrum quer; bool peak_ok = true; query[j].sortByIntensity(true); DoubleReal min_high_intensity = 0; if (query[j].empty() || query[j].getMSLevel() != 2) { continue; } if (query[j].getPrecursors().empty()) { writeLog_("Warning MS2 spectrum without 
precursor information"); continue; } min_high_intensity = (1 / cut_peaks_below) * query[j][0].getIntensity(); query[j].sortByPosition(); for (UInt k = 0; k < query[j].size() && k < max_peaks; ++k) { if (query[j][k].getIntensity() > remove_peaks_below_threshold && query[j][k].getIntensity() >= min_high_intensity) { Peak1D peak; peak.setIntensity(sqrt(query[j][k].getIntensity())); peak.setMZ(query[j][k].getMZ()); peak.setPosition(query[j][k].getPosition()); quer.push_back(peak); } } if (quer.size() >= min_peaks) { peak_ok = true; } else { peak_ok = false; } DoubleReal query_MZ = query[j].getPrecursors()[0].getMZ(); if (peak_ok) { bool charge_one = false; Int percent = (Int) Math::round((query[j].size() / 100.0) * 3.0); Int margin = (Int) Math::round((query[j].size() / 100.0) * 1.0); for (vector<RichPeak1D>::iterator peak = query[j].end() - 1; percent >= 0; --peak, --percent) { if (peak->getMZ() < query_MZ) { break; } } if (percent > margin) { charge_one = true; } Real min_MZ = (query_MZ - precursor_mass_tolerance) * precursor_mass_multiplier; Real max_MZ = (query_MZ + precursor_mass_tolerance) * precursor_mass_multiplier; for (Size mz = (Size)min_MZ; mz <= ((Size)max_MZ) + 1; ++mz) { map<Size, vector<PeakSpectrum> >::iterator found; found = MSLibrary.find(mz); if (found != MSLibrary.end()) { vector<PeakSpectrum> & library = found->second; for (Size i = 0; i < library.size(); ++i) { Real this_MZ = library[i].getPrecursors()[0].getMZ() * precursor_mass_multiplier; if (this_MZ >= min_MZ && max_MZ >= this_MZ && ((charge_one == true && library[i].getPeptideIdentifications()[0].getHits()[0].getCharge() == 1) || charge_one == false)) { PeptideHit hit = library[i].getPeptideIdentifications()[0].getHits()[0]; PeakSpectrum & librar = library[i]; //Special treatment for SpectraST score as it computes a score based on the whole library if (compare_function == "SpectraSTSimilarityScore") { SpectraSTSimilarityScore * sp = static_cast<SpectraSTSimilarityScore *>(comparor); 
BinnedSpectrum quer_bin = sp->transform(quer); BinnedSpectrum librar_bin = sp->transform(librar); score = (*sp)(quer, librar); //(*sp)(quer_bin,librar_bin); double dot_bias = sp->dot_bias(quer_bin, librar_bin, score); hit.setMetaValue("DOTBIAS", dot_bias); } else { if (compare_function == "CompareFouriertransform") { CompareFouriertransform * ft = static_cast<CompareFouriertransform *>(comparor); ft->transform(quer); ft->transform(librar); } score = (*comparor)(quer, librar); } DataValue RT(library[i].getRT()); DataValue MZ(library[i].getPrecursors()[0].getMZ()); hit.setMetaValue("RT", RT); hit.setMetaValue("MZ", MZ); hit.setScore(score); hit.addProteinAccession(pr_hit.getAccession()); pid.insertHit(hit); } } } } } pid.setHigherScoreBetter(true); pid.sort(); if (compare_function == "SpectraSTSimilarityScore") { if (!pid.empty() && !pid.getHits().empty()) { vector<PeptideHit> final_hits; final_hits.resize(pid.getHits().size()); SpectraSTSimilarityScore * sp = static_cast<SpectraSTSimilarityScore *>(comparor); Size runner_up = 1; for (; runner_up < pid.getHits().size(); ++runner_up) { if (pid.getHits()[0].getSequence().toUnmodifiedString() != pid.getHits()[runner_up].getSequence().toUnmodifiedString() || runner_up > 5) { break; } } double delta_D = sp->delta_D(pid.getHits()[0].getScore(), pid.getHits()[runner_up].getScore()); for (Size s = 0; s < pid.getHits().size(); ++s) { final_hits[s] = pid.getHits()[s]; final_hits[s].setMetaValue("delta D", delta_D); final_hits[s].setMetaValue("dot product", pid.getHits()[s].getScore()); final_hits[s].setScore(sp->compute_F(pid.getHits()[s].getScore(), delta_D, pid.getHits()[s].getMetaValue("DOTBIAS"))); //final_hits[s].removeMetaValue("DOTBIAS"); } pid.setHits(final_hits); pid.sort(); pid.setMetaValue("MZ", query[j].getPrecursors()[0].getMZ()); pid.setMetaValue("RT", query_MZ); } } if (top_hits != -1 && (UInt)top_hits < pid.getHits().size()) { vector<PeptideHit> hits; hits.resize(top_hits); for (Size i = 0; i < (UInt)top_hits; 
++i) { hits[i] = pid.getHits()[i]; } pid.setHits(hits); } peptide_ids.push_back(pid); } protein_ids.push_back(prot_id); //------------------------------------------------------------- // writing output //------------------------------------------------------------- IdXMLFile id_xml_file; id_xml_file.store(*out_file, protein_ids, peptide_ids); time_t end_time = time(NULL); cout << "Search time: " << difftime(end_time, start_time) << " seconds for " << *in << "\n"; } time_t end_time = time(NULL); cout << "Total time: " << difftime(end_time, prog_time) << " secconds\n"; return EXECUTION_OK; }