ExitCodes main_(int, const char **) { //------------------------------------------------------------- // parameter handling //------------------------------------------------------------- //input/output files String in(getStringOption_("in")); String out(getStringOption_("out")); //------------------------------------------------------------- // loading input //------------------------------------------------------------- RichPeakMap exp; MzMLFile f; f.setLogType(log_type_); f.load(in, exp); writeDebug_("Data set contains " + String(exp.size()) + " spectra", 1); //------------------------------------------------------------- // calculations //------------------------------------------------------------- writeDebug_("Reading model file", 2); // create model an set the given options PILISModel * model = new PILISModel(); model->readFromFile(getStringOption_("model_file")); Param model_param(model->getParameters()); model_param.setValue("upper_mz", getDoubleOption_("model:upper_mz")); model_param.setValue("lower_mz", getDoubleOption_("model:lower_mz")); model_param.setValue("charge_directed_threshold", getDoubleOption_("model:charge_directed_threshold")); model_param.setValue("charge_remote_threshold", getDoubleOption_("model:charge_remote_threshold")); //model_param.setValue("min_main_ion_intensity", getDoubleOption_("model:min_main_ion_intensity")); //model_param.setValue("min_loss_ion_intensity", getDoubleOption_("model:min_loss_ion_intensity")); model_param.setValue("min_y_ion_intensity", getDoubleOption_("model:min_y_ion_intensity")); model_param.setValue("min_b_ion_intensity", getDoubleOption_("model:min_b_ion_intensity")); model_param.setValue("min_a_ion_intensity", getDoubleOption_("model:min_a_ion_intensity")); model_param.setValue("min_y_loss_intensity", getDoubleOption_("model:min_y_loss_intensity")); model_param.setValue("min_b_loss_intensity", getDoubleOption_("model:min_b_loss_intensity")); model_param.setValue("charge_loss_factor", 
getDoubleOption_("model:charge_loss_factor")); model_param.setValue("visible_model_depth", getIntOption_("model:visible_model_depth")); model_param.setValue("model_depth", getIntOption_("model:model_depth")); model_param.setValue("fixed_modifications", getStringOption_("fixed_modifications")); model->setParameters(model_param); writeDebug_("Reading sequence db", 2); // create sequence db SuffixArrayPeptideFinder * sapf = new SuffixArrayPeptideFinder(getStringOption_("peptide_db_file"), "trypticCompressed"); sapf->setTolerance(getDoubleOption_("precursor_mass_tolerance")); sapf->setNumberOfModifications(0); sapf->setUseTags(false); //exp.resize(50); // TODO UInt max_charge(3), min_charge(1); // TODO vector<double> pre_weights; for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it) { double pre_weight(it->getPrecursors()[0].getMZ()); for (Size z = min_charge; z <= max_charge; ++z) { pre_weights.push_back((pre_weight * (double)z) - (double)z); } } sort(pre_weights.begin(), pre_weights.end()); cerr << "Getting candidates from SA..."; vector<vector<pair<pair<String, String>, String> > > candidates; sapf->getCandidates(candidates, pre_weights); cerr << "done" << endl; delete sapf; map<double, vector<pair<pair<String, String>, String> > > sorted_candidates; UInt count(0); for (Size count = 0; count != candidates.size(); ++count) { sorted_candidates[pre_weights[count]] = candidates[count]; } candidates.clear(); // create ProteinIdentification and set the options PILISIdentification PILIS_id; PILIS_id.setModel(model); Param id_param(PILIS_id.getParameters()); id_param.setValue("precursor_mass_tolerance", getDoubleOption_("precursor_mass_tolerance")); id_param.setValue("max_candidates", getIntOption_("max_pre_candidates")); // disable evalue scoring, this is done separately to allow for a single id per spectrum id_param.setValue("use_evalue_scoring", 0); id_param.setValue("fixed_modifications", getStringOption_("fixed_modifications")); 
PILIS_id.setParameters(id_param); vector<PeptideIdentification> ids; // perform the ProteinIdentification of the given spectra UInt no(0); for (RichPeakMap::Iterator it = exp.begin(); it != exp.end(); ++it, ++no) { if (it->getMSLevel() == 0) { writeLog_("Warning: MSLevel is 0, assuming MSLevel 2"); it->setMSLevel(2); } if (it->getMSLevel() == 2) { writeDebug_(String(no) + "/" + String(exp.size()), 1); PeptideIdentification id; map<String, UInt> cand; for (UInt z = min_charge; z <= max_charge; ++z) { double pre_weight = (it->getPrecursors()[0].getMZ() * (double)z) - (double)z; for (vector<pair<pair<String, String>, String> >::const_iterator cit = sorted_candidates[pre_weight].begin(); cit != sorted_candidates[pre_weight].end(); ++cit) { String seq = cit->first.second; if (seq.size() > 39) { continue; } UInt num_cleavages_sites(0); for (Size k = 0; k != seq.size(); ++k) { if (k != seq.size() - 1) { if ((seq[k] == 'K' || seq[k] == 'R') && seq[k + 1] != 'P') { ++num_cleavages_sites; } } } if (num_cleavages_sites > 1) { continue; } cand[seq] = z; } } cerr << "#cand=" << cand.size() << endl; PILIS_id.getIdentification(cand, id, *it); id.setMetaValue("RT", it->getRT()); id.setMetaValue("MZ", it->getPrecursors()[0].getMZ()); ids.push_back(id); if (!id.getHits().empty()) { cerr << it->getPrecursors()[0].getMZ() << " " << AASequence(id.getHits().begin()->getSequence()).getAverageWeight() << endl; writeDebug_(id.getHits().begin()->getSequence().toString() + " (z=" + id.getHits().begin()->getCharge() + "), score=" + String(id.getHits().begin()->getScore()), 10); } } } // perform the PILIS scoring to the spectra if (!getFlag_("scoring:do_not_use_evalue_scoring")) { PILISScoring scoring; Param scoring_param(scoring.getParameters()); scoring_param.setValue("use_local_scoring", (int)getFlag_("scoring:use_local_scoring")); scoring_param.setValue("survival_function_bin_size", getIntOption_("scoring:survival_function_bin_size")); 
scoring_param.setValue("global_linear_fitting_threshold", getDoubleOption_("scoring:global_linear_fitting_threshold")); scoring_param.setValue("local_linear_fitting_threshold", getDoubleOption_("scoring:local_linear_fitting_threshold")); scoring.setParameters(scoring_param); scoring.getScores(ids); } // write the result to the IdentificationData structure for the storing UInt max_candidates = getIntOption_("max_candidates"); for (Size i = 0; i != ids.size(); ++i) { if (ids[i].getHits().size() > max_candidates) { vector<PeptideHit> hits = ids[i].getHits(); hits.resize(max_candidates); ids[i].setHits(hits); } } delete model; //------------------------------------------------------------- // writing output //------------------------------------------------------------- DateTime now; now.now(); String date_string; //now.get(date_string); // @todo Fix it (Andreas) String identifier("PILIS_" + date_string); //UInt count(0); count = 0; for (RichPeakMap::ConstIterator it = exp.begin(); it != exp.end(); ++it) { if (it->getMSLevel() == 2) { ids[count].setMetaValue("RT", it->getRT()); ids[count].setMetaValue("MZ", it->getPrecursors()[0].getMZ()); ids[count].setIdentifier(identifier); ids[count++].setHigherScoreBetter(false); } } // search parameters ProteinIdentification::SearchParameters search_parameters; search_parameters.db = getStringOption_("peptide_db_file"); search_parameters.db_version = ""; search_parameters.taxonomy = ""; //search_parameters.charges = getStringOption_("charges"); search_parameters.mass_type = ProteinIdentification::MONOISOTOPIC; vector<String> fixed_mods; getStringOption_("fixed_modifications").split(',', fixed_mods); search_parameters.fixed_modifications = fixed_mods; search_parameters.enzyme = ProteinIdentification::TRYPSIN; search_parameters.missed_cleavages = 1; search_parameters.peak_mass_tolerance = getDoubleOption_("peak_mass_tolerance"); search_parameters.precursor_tolerance = getDoubleOption_("precursor_mass_tolerance"); 
ProteinIdentification protein_identification; protein_identification.setDateTime(now); protein_identification.setSearchEngine("PILIS"); protein_identification.setSearchEngineVersion("beta"); protein_identification.setSearchParameters(search_parameters); protein_identification.setIdentifier(identifier); vector<ProteinIdentification> protein_identifications; protein_identifications.push_back(protein_identification); IdXMLFile().store(out, protein_identifications, ids); return EXECUTION_OK; }
/**
  @brief Trains a PILIS model on identified spectra and optionally writes it to a file.

  Steps performed:
  - read the spectra from option "in" (MSP or mzML, decided by the type of the first file)
    and, if given, the identifications from option "id_in", which are mapped onto the spectra;
  - create the model from option "model_file", or initialize it from the
    "PILIS_parameters:" section of the tool parameters if no file is given;
  - take the first hit of the first peptide identification of every spectrum, drop it if
    its sequence is not compatible with the modelled modifications, is longer than
    "visible_model_depth", fails the optional score filter, or has a charge outside
    [min_charge, max_charge];
  - train the model on the remaining unique peptides, evaluate it, and write it to
    option "trained_model_file" if that option is not empty.

  @return INCOMPATIBLE_INPUT_DATA if "in" is empty or if "id_in" is given with a different
          number of files than "in"; EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char **)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------

  //input/output files
  StringList in(getStringList_("in"));
  StringList id_in(getStringList_("id_in"));
  String trained_model_file(getStringOption_("trained_model_file"));
  String model_file(getStringOption_("model_file"));
  bool score_filtering(getFlag_("score_filtering"));
  double score_threshold(getDoubleOption_("score_threshold"));
  Int min_charge(getIntOption_("min_charge"));
  Int max_charge(getIntOption_("max_charge"));

  if (in.empty())
  {
    writeLog_("For 'training' mode spectra and identifications are needed.");
    return INCOMPATIBLE_INPUT_DATA;
  }

  // ids are mapped file by file onto the spectra, so the counts have to match;
  // checked up front so that no file is loaded in vain
  if (!id_in.empty() && id_in.size() != in.size())
  {
    writeLog_("If in parameter contains mzML files and id_in contains idXML files, the number should be equal to allow mapping of the identification to the spectra");
    return INCOMPATIBLE_INPUT_DATA;
  }

  //bool duplicates_by_tic(getFlag_("duplicates_by_tic"));
  //bool base_model_from_file(getFlag_("base_model_from_file"));

  // create model, either read from a model file, or initialize with default parameters
  PILISModel model;
  if (model_file != "")
  {
    writeDebug_("Reading model from file '" + model_file + "'", 1);
    model.readFromFile(model_file);
  }
  else
  {
    writeDebug_("Initializing model", 1);
    model.setParameters(getParam_().copy("PILIS_parameters:", true));
    model.init();
  }

  Param pilis_param(model.getParameters());
  ModificationDefinitionsSet mod_set(pilis_param.getValue("fixed_modifications"), pilis_param.getValue("variable_modifications"));

  // read spectra files; 'in' is known to be non-empty at this point
  vector<RichPeakMap> exp;
  vector<vector<ProteinIdentification> > prot_ids;
  vector<vector<PeptideIdentification> > pep_ids;

  FileTypes::Type in_file_type = FileHandler().getType(in[0]);
  writeDebug_("File type of parameter 'in' estimated as '" + FileTypes::typeToName(in_file_type) + "'", 1);
  // TODO check all types
  if (in_file_type == FileTypes::MSP)
  {
    writeDebug_("Reading MSP file", 1);
    MSPFile f;
    exp.resize(in.size());
    pep_ids.resize(in.size());
    for (Size i = 0; i != in.size(); ++i)
    {
      f.load(in[i], pep_ids[i], exp[i]);
      // attach the j-th id to the j-th spectrum; bounded by both containers
      for (Size j = 0; j < exp[i].size() && j < pep_ids[i].size(); ++j)
      {
        exp[i][j].getPeptideIdentifications().push_back(pep_ids[i][j]);
      }
    }
  }
  else if (in_file_type == FileTypes::MZML)
  {
    MzMLFile f;
    f.setLogType(log_type_);

    exp.resize(in.size());
    for (Size i = 0; i != in.size(); ++i)
    {
      f.load(in[i], exp[i]);
    }
  }
  else
  {
    // previously this case passed silently and the model was trained on nothing
    writeLog_("Warning: file type '" + FileTypes::typeToName(in_file_type) + "' of parameter 'in' is not supported (MSP or mzML expected), no spectra were read");
  }

  if (!id_in.empty())
  {
    prot_ids.resize(id_in.size());
    pep_ids.resize(id_in.size());

    IdXMLFile f;
    for (Size i = 0; i != id_in.size(); ++i)
    {
      f.load(id_in[i], prot_ids[i], pep_ids[i]);
    }

    // map the ids to the spectra
    IDMapper id_mapper;
    for (Size i = 0; i != exp.size(); ++i)
    {
      id_mapper.annotate(exp[i], pep_ids[i], prot_ids[i]);
    }
  }

  // get the peptides and spectra
  const UInt visible_model_depth = (UInt)pilis_param.getValue("visible_model_depth");

  vector<PILISCrossValidation::Peptide> peptides;
  for (vector<RichPeakMap>::const_iterator map_it = exp.begin(); map_it != exp.end(); ++map_it)
  {
    for (RichPeakMap::ConstIterator spec_it = map_it->begin(); spec_it != map_it->end(); ++spec_it)
    {
      if (spec_it->getPeptideIdentifications().empty())
      {
        continue;
      }

      // only the first identification of a spectrum is considered
      const PeptideIdentification & first_id = *spec_it->getPeptideIdentifications().begin();
      if (first_id.getHits().size() == 0)
      {
        continue;
      }
      PeptideHit hit = *first_id.getHits().begin();

      // check whether the sequence contains a modification not modelled
      if (!mod_set.isCompatible(hit.getSequence()) || hit.getSequence().size() > visible_model_depth)
      {
        continue;
      }

      // drop hits on the wrong side of the threshold, depending on the score orientation
      if (score_filtering)
      {
        const bool higher_is_better = first_id.isHigherScoreBetter();
        if ((higher_is_better && hit.getScore() < score_threshold) ||
            (!higher_is_better && hit.getScore() > score_threshold))
        {
          continue;
        }
      }

      PILISCrossValidation::Peptide pep_struct;
      pep_struct.sequence = hit.getSequence();
      pep_struct.charge = hit.getCharge();

      // check charges before the spectrum and hit list are copied
      if (pep_struct.charge < min_charge || pep_struct.charge > max_charge)
      {
        continue;
      }

      pep_struct.spec = *spec_it;
      pep_struct.hits = first_id.getHits();

      peptides.push_back(pep_struct);
    }
  }

  getUniquePeptides(peptides);
  writeDebug_("Number of (unique) peptides for training: " + String(peptides.size()), 1);

  //model.writeToFile("pilis_tmp.dat");

  model.setParameters(pilis_param);
  for (vector<PILISCrossValidation::Peptide>::const_iterator it = peptides.begin(); it != peptides.end(); ++it)
  {
    model.train(it->spec, it->sequence, it->charge);
  }
  model.evaluate();

  if (trained_model_file != "")
  {
    model.writeToFile(trained_model_file);
  }

  return EXECUTION_OK;
}