/**
  @brief Tool entry point: loads a transformation, optionally re-fits and/or inverts it,
  stores it and/or applies it to an input data file.

  Reads the options 'in', 'out', 'trafo_in', 'trafo_out', the 'model:' parameter subtree and
  the flags 'invert' and 'store_original_rt'.

  @return ILLEGAL_PARAMETERS if neither 'out' nor 'trafo_out' is given, if only one of
          'in'/'out' is given, or if the type of the input file is not one of
          mzML/featureXML/consensusXML/idXML; EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------
  String in = getStringOption_("in");
  String out = getStringOption_("out");
  String trafo_in = getStringOption_("trafo_in");
  String trafo_out = getStringOption_("trafo_out");
  Param model_params = getParam_().copy("model:", true);
  String model_type = model_params.getValue("type");
  // keep only the sub-section that belongs to the selected model type
  model_params = model_params.copy(model_type + ":", true);

  ProgressLogger progresslogger;
  progresslogger.setLogType(log_type_);

  //-------------------------------------------------------------
  // check for valid input
  //-------------------------------------------------------------
  if (out.empty() && trafo_out.empty())
  {
    writeLog_("Error: Either a data or a transformation output file has to be provided (parameters 'out'/'trafo_out')");
    return ILLEGAL_PARAMETERS;
  }
  if (in.empty() != out.empty())
  {
    writeLog_("Error: Data input and output parameters ('in'/'out') must be used together");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // apply transformation
  //-------------------------------------------------------------
  TransformationXMLFile trafoxml;
  TransformationDescription trafo;
  trafoxml.load(trafo_in, trafo);
  // "none" means: use the transformation as loaded, without fitting a new model
  if (model_type != "none")
  {
    trafo.fitModel(model_type, model_params);
  }
  if (getFlag_("invert"))
  {
    trafo.invert();
  }
  if (!trafo_out.empty())
  {
    trafoxml.store(trafo_out, trafo);
  }
  if (!in.empty()) // load input
  {
    FileTypes::Type in_type = FileHandler::getType(in);
    if (in_type == FileTypes::MZML)
    {
      MzMLFile file;
      MSExperiment<> map;
      applyTransformation_(in, out, trafo, file, map);
    }
    else if (in_type == FileTypes::FEATUREXML)
    {
      FeatureXMLFile file;
      FeatureMap map;
      applyTransformation_(in, out, trafo, file, map);
    }
    else if (in_type == FileTypes::CONSENSUSXML)
    {
      ConsensusXMLFile file;
      ConsensusMap map;
      applyTransformation_(in, out, trafo, file, map);
    }
    else if (in_type == FileTypes::IDXML)
    {
      IdXMLFile file;
      vector<ProteinIdentification> proteins;
      vector<PeptideIdentification> peptides;
      file.load(in, proteins, peptides);
      bool store_original_rt = getFlag_("store_original_rt");
      MapAlignmentTransformer::transformRetentionTimes(peptides, trafo,
                                                       store_original_rt);
      // no "data processing" section in idXML
      file.store(out, proteins, peptides);
    }
    else
    {
      // Without this branch an unrecognized input type would be skipped silently and
      // the tool would report success although 'out' was never written.
      writeLog_(String("Error: Unsupported type of input file '") + in + "' - no data output was written");
      return ILLEGAL_PARAMETERS;
    }
  }

  return EXECUTION_OK;
}
NEW_TMP_FILE(filename) IdXMLFile().store(filename, protein_ids2, peptide_ids2, document_id2); FuzzyStringComparator fuzzy; fuzzy.setWhitelist(ListUtils::create<String>("<?xml-stylesheet")); fuzzy.setAcceptableAbsolute(0.0001); bool result = fuzzy.compareFiles(input_path, filename); TEST_EQUAL(result, true); END_SECTION START_SECTION([EXTRA] static bool isValid(const String& filename)) std::vector<ProteinIdentification> protein_ids, protein_ids2; std::vector<PeptideIdentification> peptide_ids, peptide_ids2; String filename; IdXMLFile f; //test if empty file is valid NEW_TMP_FILE(filename) f.store(filename, protein_ids2, peptide_ids2); TEST_EQUAL(f.isValid(filename, std::cerr),true); //test if full file is valid NEW_TMP_FILE(filename); String document_id; f.load(OPENMS_GET_TEST_DATA_PATH("IdXMLFile_whole.idXML"), protein_ids2, peptide_ids2, document_id); protein_ids2[0].setMetaValue("stringvalue",String("bla")); protein_ids2[0].setMetaValue("intvalue",4711); protein_ids2[0].setMetaValue("floatvalue",5.3); f.store(filename, protein_ids2, peptide_ids2); TEST_EQUAL(f.isValid(filename, std::cerr),true);
/**
  @brief Tool entry point: linearly rescales the "RT" meta value of all peptide
  identifications of an idXML file, based on two calibrant retention times.

  Each RT is mapped with
    (rt - input_1) / (input_2 - input_1) * (reference_2 - reference_1) + reference_1
  after each calibrant pair has been put into ascending order.

  @return ILLEGAL_PARAMETERS if a reference calibrant equals -1 (treated here as "not set")
          or if the two values of a calibrant pair are identical; EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------
  String in_file = getStringOption_("in");
  String out_file = getStringOption_("out");

  DoubleReal rt_calibrant_1_input = getDoubleOption_("calibrant_1_input");
  DoubleReal rt_calibrant_2_input = getDoubleOption_("calibrant_2_input");
  DoubleReal rt_calibrant_1_reference = getDoubleOption_("calibrant_1_reference");
  DoubleReal rt_calibrant_2_reference = getDoubleOption_("calibrant_2_reference");

  // The "not set" check must run before the equality checks: if both reference
  // values are still -1 they are also equal, and the user would otherwise be told
  // that the values "must not have the same value" instead of that they are missing.
  if (rt_calibrant_1_reference == -1 || rt_calibrant_2_reference == -1)
  {
    LOG_ERROR << "rt_calibrant_1_reference and rt_calibrant_2_reference must be set";
    return ILLEGAL_PARAMETERS;
  }
  // identical calibrants would lead to a division by zero (input) or a constant mapping (reference)
  if (rt_calibrant_1_input == rt_calibrant_2_input)
  {
    LOG_ERROR << "rt_calibrant_1_input and rt_calibrant_2_input must not have the same value";
    return ILLEGAL_PARAMETERS;
  }
  if (rt_calibrant_1_reference == rt_calibrant_2_reference)
  {
    LOG_ERROR << "rt_calibrant_1_reference and rt_calibrant_2_reference must not have the same value";
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // bring each calibrant pair into ascending order
  //-------------------------------------------------------------
  if (rt_calibrant_1_input > rt_calibrant_2_input)
  {
    DoubleReal temp = rt_calibrant_1_input;
    rt_calibrant_1_input = rt_calibrant_2_input;
    rt_calibrant_2_input = temp;
  }
  if (rt_calibrant_1_reference > rt_calibrant_2_reference)
  {
    DoubleReal temp = rt_calibrant_1_reference;
    rt_calibrant_1_reference = rt_calibrant_2_reference;
    rt_calibrant_2_reference = temp;
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------
  IdXMLFile file;
  vector<ProteinIdentification> protein_identifications;
  vector<PeptideIdentification> identifications;
  String document_id;
  file.load(in_file, protein_identifications, identifications, document_id);

  // both scale factors are loop-invariant
  const DoubleReal input_span = rt_calibrant_2_input - rt_calibrant_1_input;
  const DoubleReal reference_span = rt_calibrant_2_reference - rt_calibrant_1_reference;

  for (Size i = 0; i < identifications.size(); ++i)
  {
    // identifications without an "RT" meta value are left untouched
    if (identifications[i].metaValueExists("RT"))
    {
      DoubleReal temp_rt = identifications[i].getMetaValue("RT");
      temp_rt = (temp_rt - rt_calibrant_1_input) / input_span * reference_span + rt_calibrant_1_reference;
      identifications[i].setMetaValue("RT", temp_rt);
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  file.store(out_file, protein_identifications, identifications);

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parsing parameters //------------------------------------------------------------- String inputfile_name = getStringOption_("in"); String outputfile_name = getStringOption_("out"); Param fit_algorithm = getParam_().copy("fit_algorithm:", true); fit_algorithm.setValue("out_plot", getStringOption_("out_plot")); // re-assemble full param (was moved to top-level) bool split_charge = getFlag_("split_charge"); bool top_hits_only = getFlag_("top_hits_only"); double fdr_for_targets_smaller = getDoubleOption_("fdr_for_targets_smaller"); bool target_decoy_available = false; bool ignore_bad_data = getFlag_("ignore_bad_data"); bool prob_correct = getFlag_("prob_correct"); // Set fixed e-value threshold smallest_e_value_ = numeric_limits<double>::denorm_min(); //------------------------------------------------------------- // reading input //------------------------------------------------------------- IdXMLFile file; vector<ProteinIdentification> protein_ids; vector<PeptideIdentification> peptide_ids; file.load(inputfile_name, protein_ids, peptide_ids); vector<double> scores; vector<double> decoy; vector<double> target; set<Int> charges; PosteriorErrorProbabilityModel PEP_model; PEP_model.setParameters(fit_algorithm); StringList search_engines = ListUtils::create<String>("XTandem,OMSSA,MASCOT,SpectraST,MyriMatch,SimTandem,MSGFPlus,MS-GF+,Comet"); //------------------------------------------------------------- // calculations //------------------------------------------------------------- if (split_charge) { for (vector<PeptideIdentification>::iterator pep_it = peptide_ids.begin(); pep_it != peptide_ids.end(); ++pep_it) { vector<PeptideHit>& hits = pep_it->getHits(); for (std::vector<PeptideHit>::iterator hit_it = hits.begin(); hit_it != hits.end(); ++hit_it) { charges.insert(hit_it->getCharge()); } } if (charges.empty()) { throw Exception::ElementNotFound(__FILE__, __LINE__, 
OPENMS_PRETTY_FUNCTION, "no charges found!"); } } for (vector<PeptideIdentification>::iterator pep_it = peptide_ids.begin(); pep_it != peptide_ids.end(); ++pep_it) { if (!pep_it->getHits().empty()) { target_decoy_available = ((pep_it->getScoreType() == "q-value") && pep_it->getHits()[0].metaValueExists("target_decoy")); break; } } set<Int>::iterator charge_it = charges.begin(); // charges can be empty, no problem if split_charge is not set if (split_charge && charges.empty()) { throw Exception::Precondition(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "'split_charge' is set, but the list of charge states is empty"); } map<String, vector<vector<double> > > all_scores; char splitter = ','; // to split the engine from the charge state later on do { for (StringList::iterator engine_it = search_engines.begin(); engine_it != search_engines.end(); ++engine_it) { for (vector<ProteinIdentification>::iterator prot_it = protein_ids.begin(); prot_it != protein_ids.end(); ++prot_it) { String searchengine = prot_it->getSearchEngine(); if ((*engine_it == searchengine) || (*engine_it == searchengine.toUpper())) { for (vector<PeptideIdentification>::iterator pep_it = peptide_ids.begin(); pep_it != peptide_ids.end(); ++pep_it) { if (prot_it->getIdentifier() == pep_it->getIdentifier()) { vector<PeptideHit>& hits = pep_it->getHits(); if (top_hits_only) { pep_it->sort(); if (!hits.empty() && (!split_charge || hits[0].getCharge() == *charge_it)) { double score = getScore_(*engine_it, hits[0]); if (!boost::math::isnan(score)) // issue #740: ignore scores with 0 values, otherwise you will get the error "unable to fit data" { scores.push_back(score); if (target_decoy_available) { if (hits[0].getScore() < fdr_for_targets_smaller) { target.push_back(score); } else { decoy.push_back(score); } } } } } else { for (std::vector<PeptideHit>::iterator hit_it = hits.begin(); hit_it != hits.end(); ++hit_it) { if (!split_charge || (hit_it->getCharge() == *charge_it)) { double score = 
getScore_(*engine_it, *hit_it); if (!boost::math::isnan(score)) // issue #740: ignore scores with 0 values, otherwise you will get the error "unable to fit data" { scores.push_back(score); } } } } } } } } if (scores.size() > 2) { vector<vector<double> > tmp; tmp.push_back(scores); tmp.push_back(target); tmp.push_back(decoy); if (split_charge) { String engine_with_charge_state = *engine_it + String(splitter) + String(*charge_it); all_scores.insert(make_pair(engine_with_charge_state, tmp)); } else { all_scores.insert(make_pair(*engine_it, tmp)); } } scores.clear(); target.clear(); decoy.clear(); } if (split_charge) ++charge_it; } while (charge_it != charges.end()); if (all_scores.empty()) { writeLog_("No data collected. Check whether search engine is supported."); if (!ignore_bad_data) return INPUT_FILE_EMPTY; } String out_plot = fit_algorithm.getValue("out_plot").toString().trim(); for (map<String, vector<vector<double> > >::iterator score_it = all_scores.begin(); score_it != all_scores.end(); ++score_it) { vector<String> engine_info; score_it->first.split(splitter, engine_info); String engine = engine_info[0]; Int charge = -1; if (engine_info.size() == 2) { charge = engine_info[1].toInt(); } if (split_charge) { // only adapt plot output if plot is requested (this badly violates the output rules and needs to change!) // one way to fix this: plot charges into a single file (no renaming of output file needed) - but this requires major code restructuring if (!out_plot.empty()) fit_algorithm.setValue("out_plot", out_plot + "_charge_" + String(charge)); PEP_model.setParameters(fit_algorithm); } const bool return_value = PEP_model.fit(score_it->second[0]); if (!return_value) writeLog_("Unable to fit data. 
Algorithm did not run through for the following search engine: " + engine); if (!return_value && !ignore_bad_data) return UNEXPECTED_RESULT; if (return_value) { // plot target_decoy if (!out_plot.empty() && top_hits_only && target_decoy_available && (score_it->second[0].size() > 0)) { PEP_model.plotTargetDecoyEstimation(score_it->second[1], score_it->second[2]); //target, decoy } bool unable_to_fit_data = true; bool data_might_not_be_well_fit = true; for (vector<ProteinIdentification>::iterator prot_it = protein_ids.begin(); prot_it != protein_ids.end(); ++prot_it) { String searchengine = prot_it->getSearchEngine(); if ((engine == searchengine) || (engine == searchengine.toUpper())) { for (vector<PeptideIdentification>::iterator pep_it = peptide_ids.begin(); pep_it != peptide_ids.end(); ++pep_it) { if (prot_it->getIdentifier() == pep_it->getIdentifier()) { String score_type = pep_it->getScoreType() + "_score"; vector<PeptideHit> hits = pep_it->getHits(); for (std::vector<PeptideHit>::iterator hit_it = hits.begin(); hit_it != hits.end(); ++hit_it) { if (!split_charge || (hit_it->getCharge() == charge)) { double score; hit_it->setMetaValue(score_type, hit_it->getScore()); score = getScore_(engine, *hit_it); if (boost::math::isnan(score)) // issue #740: ignore scores with 0 values, otherwise you will get the error "unable to fit data" { score = 1.0; } else { score = PEP_model.computeProbability(score); if ((score > 0.0) && (score < 1.0)) unable_to_fit_data = false; // only if all it->second[0] are 0 or 1 unable_to_fit_data stays true if ((score > 0.2) && (score < 0.8)) data_might_not_be_well_fit = false; //same as above } hit_it->setScore(score); if (prob_correct) { hit_it->setScore(1.0 - score); } else { hit_it->setScore(score); } } } pep_it->setHits(hits); } if (prob_correct) { pep_it->setScoreType("Posterior Probability"); pep_it->setHigherScoreBetter(true); } else { pep_it->setScoreType("Posterior Error Probability"); pep_it->setHigherScoreBetter(false); } } } } 
if (unable_to_fit_data) writeLog_(String("Unable to fit data for search engine: ") + engine); if (unable_to_fit_data && !ignore_bad_data) return UNEXPECTED_RESULT; if (data_might_not_be_well_fit) writeLog_(String("Data might not be well fitted for search engine: ") + engine); } } //------------------------------------------------------------- // writing output //------------------------------------------------------------- file.store(outputfile_name, protein_ids, peptide_ids); return EXECUTION_OK; }
void mergePepXMLProtXML_(StringList filenames, vector<ProteinIdentification>& proteins, vector<PeptideIdentification>& peptides) { IdXMLFile idxml; idxml.load(filenames[0], proteins, peptides); vector<ProteinIdentification> pepxml_proteins, protxml_proteins; vector<PeptideIdentification> pepxml_peptides, protxml_peptides; if (proteins[0].getProteinGroups().empty()) // first idXML contains data from the pepXML { proteins.swap(pepxml_proteins); peptides.swap(pepxml_peptides); idxml.load(filenames[1], protxml_proteins, protxml_peptides); if (protxml_proteins[0].getProteinGroups().empty()) { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "None of the input files seems to be derived from a protXML file (information about protein groups is missing)."); } } else // first idXML contains data from the protXML { proteins.swap(protxml_proteins); peptides.swap(protxml_peptides); idxml.load(filenames[1], pepxml_proteins, pepxml_peptides); } if ((protxml_peptides.size() > 1) || (protxml_proteins.size() > 1)) { throw Exception::InvalidParameter(__FILE__, __LINE__, __PRETTY_FUNCTION__, "The idXML derived from a protXML file should contain only one 'ProteinIdentification' and one 'PeptideIdentification' instance."); } // peptide information comes from the pepXML (additional information in // the protXML - adapted peptide hit score, "is_unique", "is_contributing" // - is not transferred): peptides.swap(pepxml_peptides); // prepare scores and coverage values of protein hits from the protXML: map<String, pair<DoubleReal, DoubleReal> > hit_values; ProteinIdentification & protein = protxml_proteins[0]; for (vector<ProteinHit>::iterator hit_it = protein.getHits().begin(); hit_it != protein.getHits().end(); ++hit_it) { hit_values[hit_it->getAccession()] = make_pair(hit_it->getScore(), hit_it->getCoverage()); } // merge protein information: proteins.swap(pepxml_proteins); for (vector<ProteinIdentification>::iterator prot_it = proteins.begin(); prot_it != 
proteins.end(); ++prot_it) { prot_it->getProteinGroups() = protein.getProteinGroups(); prot_it->getIndistinguishableProteins() = protein.getIndistinguishableProteins(); // TODO: since a protXML file can integrate data from several protein // identification runs, the protein groups/indistinguishable proteins // that we write to one identification run could contain references to // proteins that are not observed in this run, but in others; also, some // protein hits without enough evidence may not occur in the protXML // (thus also not in the protein groups) - clean this up? prot_it->setScoreType(protein.getScoreType()); prot_it->setHigherScoreBetter(protein.isHigherScoreBetter()); prot_it->setSignificanceThreshold(protein.getSignificanceThreshold()); for (vector<ProteinHit>::iterator hit_it = prot_it->getHits().begin(); hit_it != prot_it->getHits().end(); ++hit_it) { map<String, pair<DoubleReal, DoubleReal> >::const_iterator pos = hit_values.find(hit_it->getAccession()); if (pos == hit_values.end()) { hit_it->setScore(-1); } else { hit_it->setScore(pos->second.first); hit_it->setCoverage(pos->second.second); } } } }
NEW_TMP_FILE(filename) IdXMLFile().store(filename, protein_ids2, peptide_ids2, document_id2); FuzzyStringComparator fuzzy; fuzzy.setWhitelist(StringList::create("<?xml-stylesheet")); fuzzy.setAcceptableAbsolute(0.0001); bool result = fuzzy.compareFiles(input_path, filename); TEST_EQUAL(result, true); END_SECTION START_SECTION([EXTRA] static bool isValid(const String& filename)) std::vector<ProteinIdentification> protein_ids, protein_ids2; std::vector<PeptideIdentification> peptide_ids, peptide_ids2; String filename; IdXMLFile f; //test if empty file is valid NEW_TMP_FILE(filename) f.store(filename, protein_ids2, peptide_ids2); TEST_EQUAL(f.isValid(filename),true); //test if full file is valid NEW_TMP_FILE(filename); String document_id; f.load(OPENMS_GET_TEST_DATA_PATH("IdXMLFile_whole.idXML"), protein_ids2, peptide_ids2, document_id); protein_ids2[0].setMetaValue("stringvalue",String("bla")); protein_ids2[0].setMetaValue("intvalue",4711); protein_ids2[0].setMetaValue("floatvalue",5.3); f.store(filename, protein_ids2, peptide_ids2); TEST_EQUAL(f.isValid(filename),true);
/**
  @brief Tool entry point: computes precursor and fragment mass errors of identified
  spectra, writes them to text files and tries to fit a Gaussian to their distribution.

  Identifications ('id_in') are mapped onto the spectra ('in'), the spectra are normalized,
  and for the first hit of every mapped identification the precursor m/z deviation and
  the deviations of the aligned theoretical b/y ions are collected. Errors are written only
  if 'precursor_out' / 'fragment_out' are given; optional gnuplot files are written next to them.

  @return ILLEGAL_PARAMETERS if the numbers of spectrum and identification files differ;
          EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char **)
{
  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------

  StringList id_in(getStringList_("id_in"));
  StringList in_raw(getStringList_("in"));
  Size number_of_bins((UInt)getIntOption_("number_of_bins"));
  bool precursor_error_ppm(getFlag_("precursor_error_ppm"));
  bool fragment_error_ppm(getFlag_("fragment_error_ppm"));
  bool generate_gnuplot_scripts(DataValue(getStringOption_("generate_gnuplot_scripts")).toBool());

  // files are paired by index below, so both lists need the same length
  if (in_raw.size() != id_in.size())
  {
    writeLog_("Number of spectrum files and identification files differs...");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------

  vector<vector<PeptideIdentification> > pep_ids;
  vector<vector<ProteinIdentification> > prot_ids;
  pep_ids.resize(id_in.size());
  prot_ids.resize(id_in.size());

  IdXMLFile idxmlfile;
  for (Size i = 0; i != id_in.size(); ++i)
  {
    String doc_id;
    idxmlfile.load(id_in[i], prot_ids[i], pep_ids[i], doc_id);
  }

  // read mzML files
  vector<RichPeakMap> maps_raw;
  maps_raw.resize(in_raw.size());

  MzMLFile mzml_file;
  for (Size i = 0; i != in_raw.size(); ++i)
  {
    mzml_file.load(in_raw[i], maps_raw[i]);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // mapping ids
  IDMapper mapper;
  for (Size i = 0; i != maps_raw.size(); ++i)
  {
    mapper.annotate(maps_raw[i], pep_ids[i], prot_ids[i]);
  }

  // normalize the spectra
  Normalizer normalizer;
  for (vector<RichPeakMap>::iterator it1 = maps_raw.begin(); it1 != maps_raw.end(); ++it1)
  {
    for (RichPeakMap::Iterator it2 = it1->begin(); it2 != it1->end(); ++it2)
    {
      normalizer.filterSpectrum(*it2);
    }
  }

  // generate precursor statistics
  vector<MassDifference> precursor_diffs;
  if (getStringOption_("precursor_out") != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            // only the first hit is evaluated; a reference avoids copying it
            const PeptideHit& hit = *it->getHits().begin();
            MassDifference md;
            Int charge = hit.getCharge();
            // a charge of 0 would lead to a division by zero below; fall back to 1
            if (charge == 0)
            {
              charge = 1;
            }
            md.exp_mz = it->getMZ();
            md.theo_mz = (hit.getSequence().getMonoWeight() + (double)charge * Constants::PROTON_MASS_U) / (double)charge;
            md.charge = charge;
            precursor_diffs.push_back(md);
          }
        }
      }
    }
  }

  // generate fragment ions statistics
  vector<MassDifference> fragment_diffs;
  TheoreticalSpectrumGenerator tsg;
  SpectrumAlignment sa;
  double fragment_mass_tolerance(getDoubleOption_("fragment_mass_tolerance"));
  Param sa_param(sa.getParameters());
  sa_param.setValue("tolerance", fragment_mass_tolerance);
  sa.setParameters(sa_param);

  if (getStringOption_("fragment_out") != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            const PeptideHit& hit = *it->getHits().begin();

            // theoretical y and b ions of the identified sequence
            RichPeakSpectrum theo_spec;
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::YIon);
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::BIon);

            // pairs of (index in theo_spec, index in experimental spectrum)
            vector<pair<Size, Size> > pairs;
            sa.getSpectrumAlignment(pairs, theo_spec, maps_raw[i][j]);
            //cerr << hit.getSequence() << " " << hit.getSequence().getSuffix(1).getFormula() << " " << hit.getSequence().getSuffix(1).getFormula().getMonoWeight() << endl;
            for (vector<pair<Size, Size> >::const_iterator pit = pairs.begin(); pit != pairs.end(); ++pit)
            {
              MassDifference md;
              md.exp_mz = maps_raw[i][j][pit->second].getMZ();
              md.theo_mz = theo_spec[pit->first].getMZ();
              //cerr.precision(15);
              //cerr << md.exp_mz << " " << md.theo_mz << " " << md.exp_mz - md.theo_mz << endl;
              md.intensity = maps_raw[i][j][pit->second].getIntensity();
              md.charge = hit.getCharge();
              fragment_diffs.push_back(md);
            }
          }
        }
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------

  String precursor_out_file(getStringOption_("precursor_out"));
  if (precursor_out_file != "")
  {
    vector<double> errors;
    ofstream precursor_out(precursor_out_file.c_str());
    // numeric_limits<double>::min() is the smallest POSITIVE value, so it must not be used
    // as the start value of a running maximum (all-negative errors would never update it)
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != precursor_diffs.size(); ++i)
    {
      double diff = getMassDifference(precursor_diffs[i].theo_mz, precursor_diffs[i].exp_mz, precursor_error_ppm);
      precursor_out << diff << "\n";
      errors.push_back(diff);

      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    precursor_out.close();

    if (errors.empty())
    {
      // without values neither a histogram range nor a median exists
      writeLog_("No precursor mass errors available - skipping histogram and Gaussian fit");
    }
    else
    {
      // fill histogram with the collected values
      double bin_size = (max_diff - min_diff) / (double)number_of_bins;
      Histogram<double, double> hist(min_diff, max_diff, bin_size);
      for (Size i = 0; i != errors.size(); ++i)
      {
        hist.inc(errors[i], 1.0);
      }

      writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

      // transform the histogram into a vector<DPosition<2> > for the fitting
      vector<DPosition<2> > values;
      for (Size i = 0; i != hist.size(); ++i)
      {
        DPosition<2> p;
        p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
        p.setY(hist[i]);
        values.push_back(p);
      }

      double mean = Math::mean(errors.begin(), errors.end());
      double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
      double sdv = Math::sd(errors.begin(), errors.end(), mean);
      sort(errors.begin(), errors.end());
      double median = errors[(Size)(errors.size() / 2.0)];

      writeDebug_("Precursor mean error: " + String(mean), 1);
      writeDebug_("Precursor abs. dev.:  " + String(abs_dev), 1);
      writeDebug_("Precursor std. dev.:  " + String(sdv), 1);
      writeDebug_("Precursor median error:  " + String(median), 1);

      // calculate histogram for gauss fitting
      GaussFitter gf;
      GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv/500.0);
      gf.setInitialParameters(init_param);

      try
      {
        gf.fit(values);

        // write gnuplot scripts
        if (generate_gnuplot_scripts)
        {
          ofstream out(String(precursor_out_file + "_gnuplot.dat").c_str());
          for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
          {
            out << it->getX() << " " << it->getY() << endl;
          }
          out.close();

          // NOTE(review): the script plots f(x), but no definition of f(x) is written here - confirm
          ofstream gpl_out(String(precursor_out_file + "_gnuplot.gpl").c_str());
          gpl_out << "set terminal png" << endl;
          gpl_out << "set output \"" << precursor_out_file << "_gnuplot.png\"" << endl;
          if (precursor_error_ppm)
          {
            gpl_out << "set xlabel \"error in ppm\"" << endl;
          }
          else
          {
            gpl_out << "set xlabel \"error in Da\"" << endl;
          }
          gpl_out << "set ylabel \"frequency\"" << endl;
          gpl_out << "plot '" << precursor_out_file << "_gnuplot.dat' title 'Precursor mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
          gpl_out.close();
        }
      }
      catch (const Exception::UnableToFit&)
      {
        writeLog_("Unable to fit a Gaussian distribution to the precursor mass errors");
      }
    }
  }

  String fragment_out_file(getStringOption_("fragment_out"));
  if (fragment_out_file != "")
  {
    vector<double> errors;
    ofstream fragment_out(fragment_out_file.c_str());
    // see above: the running maximum has to start at the most negative double
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != fragment_diffs.size(); ++i)
    {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      // "\n" instead of endl: same bytes in the file, but no flush per value
      fragment_out << diff << "\n";
      errors.push_back(diff);

      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    fragment_out.close();

    if (errors.empty())
    {
      // without values neither a histogram range nor a median exists
      writeLog_("No fragment mass errors available - skipping histogram and Gaussian fit");
    }
    else
    {
      // fill histogram with the collected values
      // here we use the intensities to scale the error
      // low intensity peaks are likely to be random matches
      double bin_size = (max_diff - min_diff) / (double)number_of_bins;
      Histogram<double, double> hist(min_diff, max_diff, bin_size);
      for (Size i = 0; i != fragment_diffs.size(); ++i)
      {
        double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
        hist.inc(diff, fragment_diffs[i].intensity);
      }

      writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

      // transform the histogram into a vector<DPosition<2> > for the fitting
      vector<DPosition<2> > values;
      for (Size i = 0; i != hist.size(); ++i)
      {
        DPosition<2> p;
        p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
        p.setY(hist[i]);
        values.push_back(p);
      }

      double mean = Math::mean(errors.begin(), errors.end());
      double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
      double sdv = Math::sd(errors.begin(), errors.end(), mean);
      sort(errors.begin(), errors.end());
      double median = errors[(Size)(errors.size() / 2.0)];

      writeDebug_("Fragment mean error:  " + String(mean), 1);
      writeDebug_("Fragment abs. dev.:   " + String(abs_dev), 1);
      writeDebug_("Fragment std. dev.:   " + String(sdv), 1);
      writeDebug_("Fragment median error:   " + String(median), 1);

      // calculate histogram for gauss fitting
      GaussFitter gf;
      GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv / 100.0);
      gf.setInitialParameters(init_param);

      try
      {
        gf.fit(values);

        // write gnuplot script
        if (generate_gnuplot_scripts)
        {
          ofstream out(String(fragment_out_file + "_gnuplot.dat").c_str());
          for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
          {
            out << it->getX() << " " << it->getY() << endl;
          }
          out.close();

          // NOTE(review): the script plots f(x), but no definition of f(x) is written here - confirm
          ofstream gpl_out(String(fragment_out_file + "_gnuplot.gpl").c_str());
          gpl_out << "set terminal png" << endl;
          gpl_out << "set output \"" << fragment_out_file << "_gnuplot.png\"" << endl;
          if (fragment_error_ppm)
          {
            gpl_out << "set xlabel \"error in ppm\"" << endl;
          }
          else
          {
            gpl_out << "set xlabel \"error in Da\"" << endl;
          }
          gpl_out << "set ylabel \"frequency\"" << endl;
          gpl_out << "plot '" << fragment_out_file << "_gnuplot.dat' title 'Fragment mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
          gpl_out.close();
        }
      }
      catch (const Exception::UnableToFit&)
      {
        writeLog_("Unable to fit a Gaussian distribution to the fragment mass errors");
      }
    }
  }

  return EXECUTION_OK;
}
/**
  @brief Tool entry point: trains a PILIS model on identified spectra.

  Spectra are read from 'in' (MSP or mzML, decided by the type of the first file),
  identifications optionally from 'id_in' (idXML) and mapped onto the spectra. For every
  spectrum the first hit of the first identification is used; peptides are skipped if they
  carry unmodelled modifications, are longer than 'visible_model_depth', fail the optional
  score filter or fall outside [min_charge, max_charge]. The model is trained on the remaining
  unique peptides and written to 'trained_model_file' if given.

  @return INCOMPATIBLE_INPUT_DATA if 'in' is empty or if the numbers of files in 'in' and
          'id_in' differ; EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char **)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------

  //input/output files
  StringList in(getStringList_("in"));
  StringList id_in(getStringList_("id_in"));
  String trained_model_file(getStringOption_("trained_model_file"));
  String model_file(getStringOption_("model_file"));
  bool score_filtering(getFlag_("score_filtering"));
  double score_threshold(getDoubleOption_("score_threshold"));
  Int min_charge(getIntOption_("min_charge"));
  Int max_charge(getIntOption_("max_charge"));

  if (in.empty())
  {
    writeLog_("For 'training' mode spectra and identifications are needed.");
    return INCOMPATIBLE_INPUT_DATA;
  }

  //bool duplicates_by_tic(getFlag_("duplicates_by_tic"));
  //bool base_model_from_file(getFlag_("base_model_from_file"));

  // create model, either read from a model file, or initialize with default parameters
  PILISModel model;
  if (model_file != "")
  {
    writeDebug_("Reading model from file '" + model_file + "'", 1);
    model.readFromFile(model_file);
  }
  else
  {
    writeDebug_("Initializing model", 1);
    model.setParameters(getParam_().copy("PILIS_parameters:", true));
    model.init();
  }

  Param pilis_param(model.getParameters());
  ModificationDefinitionsSet mod_set(pilis_param.getValue("fixed_modifications"), pilis_param.getValue("variable_modifications"));

  // read spectra files ('in' is known to be non-empty at this point, see check above)
  vector<RichPeakMap> exp;
  vector<vector<ProteinIdentification> > prot_ids;
  vector<vector<PeptideIdentification> > pep_ids;

  FileTypes::Type in_file_type = FileHandler().getType(in[0]);
  writeDebug_("File type of parameter 'in' estimated as '" + FileTypes::typeToName(in_file_type) + "'", 1);
  // TODO check all types
  if (in_file_type == FileTypes::MSP)
  {
    writeDebug_("Reading MSP file", 1);
    MSPFile f;
    exp.resize(in.size());
    pep_ids.resize(in.size());
    for (Size i = 0; i != in.size(); ++i)
    {
      f.load(in[i], pep_ids[i], exp[i]);
      // attach the j-th identification to the j-th spectrum
      // (presumably MSPFile::load yields one identification per spectrum - verify)
      for (Size j = 0; j != exp[i].size(); ++j)
      {
        exp[i][j].getPeptideIdentifications().push_back(pep_ids[i][j]);
      }
    }
  }

  if (in_file_type == FileTypes::MZML)
  {
    MzMLFile f;
    f.setLogType(log_type_);

    exp.resize(in.size());
    for (Size i = 0; i != in.size(); ++i)
    {
      f.load(in[i], exp[i]);
    }
  }

  if (!id_in.empty())
  {
    prot_ids.resize(id_in.size());
    pep_ids.resize(id_in.size());
    IdXMLFile f;
    for (Size i = 0; i != id_in.size(); ++i)
    {
      f.load(id_in[i], prot_ids[i], pep_ids[i]);
    }
  }

  if (!id_in.empty())
  {
    // the identifications are mapped to the spectra file by file, so the numbers must match
    if (id_in.size() != in.size())
    {
      writeLog_("If in parameter contains mzML files and id_in contains idXML files, the number should be equal to allow mapping of the identification to the spectra");
      return INCOMPATIBLE_INPUT_DATA;
    }

    // map the ids to the spectra
    IDMapper id_mapper;
    for (Size i = 0; i != exp.size(); ++i)
    {
      id_mapper.annotate(exp[i], pep_ids[i], prot_ids[i]);
    }
  }

  // get the peptides and spectra
  vector<PILISCrossValidation::Peptide> peptides;

  for (vector<RichPeakMap>::const_iterator it1 = exp.begin(); it1 != exp.end(); ++it1)
  {
    for (RichPeakMap::ConstIterator it2 = it1->begin(); it2 != it1->end(); ++it2)
    {
      if (it2->getPeptideIdentifications().empty())
      {
        continue;
      }

      // only the first identification of a spectrum is considered ...
      const PeptideIdentification& first_id = *it2->getPeptideIdentifications().begin();
      if (first_id.getHits().empty())
      {
        continue;
      }
      // ... and only its first hit
      const PeptideHit& hit = *first_id.getHits().begin();

      // check whether the sequence contains a modification not modelled
      if (!mod_set.isCompatible(hit.getSequence()) || hit.getSequence().size() > (UInt)pilis_param.getValue("visible_model_depth"))
      {
        continue;
      }

      // drop hits on the "worse" side of the threshold, depending on the score orientation
      if (score_filtering &&
          ((hit.getScore() < score_threshold && first_id.isHigherScoreBetter()) ||
           (hit.getScore() > score_threshold && !first_id.isHigherScoreBetter())))
      {
        continue;
      }

      // check charges before the (expensive) copy of the spectrum and the hit list
      if (hit.getCharge() < min_charge || hit.getCharge() > max_charge)
      {
        continue;
      }

      PILISCrossValidation::Peptide pep_struct;
      pep_struct.sequence = hit.getSequence();
      pep_struct.charge = hit.getCharge();
      pep_struct.spec = *it2;
      pep_struct.hits = first_id.getHits();

      peptides.push_back(pep_struct);
    }
  }

  getUniquePeptides(peptides);
  writeDebug_("Number of (unique) peptides for training: " + String(peptides.size()), 1);

  //model.writeToFile("pilis_tmp.dat");

  model.setParameters(pilis_param);
  for (vector<PILISCrossValidation::Peptide>::const_iterator it = peptides.begin(); it != peptides.end(); ++it)
  {
    model.train(it->spec, it->sequence, it->charge);
  }
  model.evaluate();

  if (trained_model_file != "")
  {
    model.writeToFile(trained_model_file);
  }

  return EXECUTION_OK;
}
/**
  @brief Runs PeptideIndexing on an idXML file against a FASTA database.

  The identifications are loaded from 'in', indexed against the database
  given by 'fasta' (looked up via File::findDatabase if the path itself is
  not readable) and stored to 'out'. If 'write_protein_sequence' is set,
  the coverage of every protein identification is computed before storing.

  @return ILLEGAL_PARAMETERS if the database cannot be located,
          INPUT_FILE_EMPTY if the indexer reports an empty database,
          UNEXPECTED_RESULT if the indexer reports an unexpected result,
          UNKNOWN_ERROR for any other indexer failure,
          EXECUTION_OK otherwise (including the case of empty peptide ids).
*/
ExitCodes main_(int, const char**) override
{
  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  String in = getStringOption_("in");
  String out = getStringOption_("out");

  // overlay the tool parameters onto the indexer defaults
  PeptideIndexing indexer;
  Param param = getParam_().copy("", true);
  Param param_pi = indexer.getParameters();
  param_pi.update(param, false, Log_debug); // suppress param. update message
  indexer.setParameters(param_pi);
  indexer.setLogType(this->log_type_);

  // resolve the database location; any lookup failure is reported as a usage error
  String db_name = getStringOption_("fasta");
  if (!File::readable(db_name))
  {
    try
    {
      db_name = File::findDatabase(db_name);
    }
    catch (...)
    {
      printUsage_();
      return ILLEGAL_PARAMETERS;
    }
  }

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  std::vector<ProteinIdentification> prot_ids;
  std::vector<PeptideIdentification> pep_ids;

  IdXMLFile idxmlfile;
  idxmlfile.setLogType(this->log_type_);
  idxmlfile.load(in, prot_ids, pep_ids);

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------
  FASTAContainer<TFI_File> proteins(db_name);
  const PeptideIndexing::ExitCodes indexer_exit = indexer.run(proteins, prot_ids, pep_ids);

  //-------------------------------------------------------------
  // calculate protein coverage
  //-------------------------------------------------------------
  if (param.getValue("write_protein_sequence").toBool())
  {
    for (ProteinIdentification& prot_id : prot_ids)
    {
      prot_id.computeCoverage(pep_ids);
    }
  }

  //-------------------------------------------------------------
  // writing output (done before the indexer result is evaluated)
  //-------------------------------------------------------------
  idxmlfile.store(out, prot_ids, pep_ids);

  // translate the indexer result into a tool exit code
  if (indexer_exit == PeptideIndexing::DATABASE_EMPTY)
  {
    return INPUT_FILE_EMPTY;
  }
  if (indexer_exit == PeptideIndexing::UNEXPECTED_RESULT)
  {
    return UNEXPECTED_RESULT;
  }
  const bool indexer_succeeded = (indexer_exit == PeptideIndexing::EXECUTION_OK)
                                 || (indexer_exit == PeptideIndexing::PEPTIDE_IDS_EMPTY);
  if (!indexer_succeeded)
  {
    return UNKNOWN_ERROR;
  }
  return EXECUTION_OK;
}
ExitCodes main_(int, const char**) { //------------------------------------------------------------- // parsing parameters //------------------------------------------------------------- String inputfile_name = getStringOption_("in"); String outputfile_name = getStringOption_("out"); smallest_e_value_ = getDoubleOption_("smallest_e_value"); Param fit_algorithm = getParam_().copy("fit_algorithm:", true); bool split_charge = getFlag_("split_charge"); bool top_hits_only = getFlag_("top_hits_only"); DoubleReal fdr_for_targets_smaller = getDoubleOption_("fdr_for_targets_smaller"); bool target_decoy_available = false; bool ignore_bad_data = getFlag_("ignore_bad_data"); bool prob_correct = getFlag_("prob_correct"); //------------------------------------------------------------- // reading input //------------------------------------------------------------- IdXMLFile file; vector<ProteinIdentification> protein_ids; vector<PeptideIdentification> peptide_ids; file.load(inputfile_name, protein_ids, peptide_ids); vector<double> scores; vector<double> decoy; vector<double> target; vector<Int> charges; PosteriorErrorProbabilityModel PEP_model; PEP_model.setParameters(fit_algorithm); StringList search_engines = ListUtils::create<String>("XTandem,OMSSA,MASCOT,SpectraST,MyriMatch,SimTandem"); //------------------------------------------------------------- // calculations //------------------------------------------------------------- if (split_charge) { for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it) { vector<PeptideHit> hits = it->getHits(); for (std::vector<PeptideHit>::iterator hit = hits.begin(); hit < hits.end(); ++hit) { if (charges.end() == find(charges.begin(), charges.end(), hit->getCharge())) { charges.push_back(hit->getCharge()); } } } if (charges.empty()) { throw Exception::ElementNotFound(__FILE__, __LINE__, __PRETTY_FUNCTION__, "no charges found!"); } } for (vector<PeptideIdentification>::iterator it = 
peptide_ids.begin(); it < peptide_ids.end(); ++it) { if (!it->getHits().empty()) { target_decoy_available = (it->getScoreType() == "q-value" && it->getHits()[0].getMetaValue("target_decoy") != DataValue::EMPTY); break; } } vector<Int>::iterator charge = charges.begin(); // charges can be empty, no problem if split_charge is not set if (split_charge && charges.empty()) { throw Exception::Precondition(__FILE__, __LINE__, __PRETTY_FUNCTION__, "split_charge is set and the list of charge states is empty but should not be!"); } map<String, vector<vector<double> > > all_scores; char splitter = ','; //to split the engine from the charge state later on do { for (StringList::iterator engine = search_engines.begin(); engine < search_engines.end(); ++engine) { for (vector<ProteinIdentification>::iterator prot_iter = protein_ids.begin(); prot_iter < protein_ids.end(); ++prot_iter) { String searchengine_toUpper = prot_iter->getSearchEngine(); searchengine_toUpper.toUpper(); if (*engine == prot_iter->getSearchEngine() || *engine == searchengine_toUpper) { for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it) { if (prot_iter->getIdentifier().compare(it->getIdentifier()) == 0) { vector<PeptideHit> hits = it->getHits(); if (top_hits_only) { if (!hits.empty() && (!split_charge || hits[0].getCharge() == *charge)) { scores.push_back(get_score_(*engine, hits[0])); if (target_decoy_available) { if (hits[0].getScore() < fdr_for_targets_smaller) { target.push_back(get_score_(*engine, hits[0])); } else { decoy.push_back(get_score_(*engine, hits[0])); } } } } else { for (std::vector<PeptideHit>::iterator hit = hits.begin(); hit < hits.end(); ++hit) { if (!split_charge || hit->getCharge() == *charge) { scores.push_back(get_score_(*engine, *hit)); } } } } } } } if (scores.size() > 2) { vector<vector<double> > tmp; tmp.push_back(scores); tmp.push_back(target); tmp.push_back(decoy); if (split_charge) { String engine_with_charge_state = *engine + 
String(splitter) + String(*charge); all_scores.insert(make_pair(engine_with_charge_state, tmp)); } else { all_scores.insert(make_pair(*engine, tmp)); } } scores.clear(); target.clear(); decoy.clear(); } if (split_charge) ++charge; } while (charge < charges.end()); if (all_scores.empty()) { writeLog_("No data collected. Check whether search engine is supported."); if (!ignore_bad_data) return INPUT_FILE_EMPTY; } for (map<String, vector<vector<double> > >::iterator it = all_scores.begin(); it != all_scores.end(); ++it) { vector<String> engine_info; it->first.split(splitter, engine_info); String engine = engine_info[0]; Int charge = -1; if (engine_info.size() == 2) { charge = engine_info[1].toInt(); } if (split_charge) { String output_name = fit_algorithm.getValue("output_name"); fit_algorithm.setValue("output_name", output_name + "_charge_" + String(charge), "...", ListUtils::create<String>("advanced,output file")); PEP_model.setParameters(fit_algorithm); } const bool return_value = PEP_model.fit(it->second[0]); if (!return_value) writeLog_("unable to fit data. 
Algorithm did not run through for the following search engine: " + engine); if (!return_value && !ignore_bad_data) return UNEXPECTED_RESULT; if (return_value) { //plot target_decoy if (target_decoy_available && it->second[0].size() > 0) { PEP_model.plotTargetDecoyEstimation(it->second[1], it->second[2]); //target, decoy } bool unable_to_fit_data = true; bool data_might_not_be_well_fit = true; for (vector<ProteinIdentification>::iterator prot_iter = protein_ids.begin(); prot_iter < protein_ids.end(); ++prot_iter) { String searchengine_toUpper = prot_iter->getSearchEngine(); searchengine_toUpper.toUpper(); if (engine == prot_iter->getSearchEngine() || engine == searchengine_toUpper) { for (vector<PeptideIdentification>::iterator it = peptide_ids.begin(); it < peptide_ids.end(); ++it) { if (prot_iter->getIdentifier().compare(it->getIdentifier()) == 0) { String score_type = it->getScoreType() + "_score"; vector<PeptideHit> hits = it->getHits(); for (std::vector<PeptideHit>::iterator hit = hits.begin(); hit < hits.end(); ++hit) { if (!split_charge || hit->getCharge() == charge) { DoubleReal score; hit->setMetaValue(score_type, hit->getScore()); score = PEP_model.computeProbability(get_score_(engine, *hit)); if (score > 0 && score < 1) unable_to_fit_data = false; //only if all it->second[0] are 0 or 1 unable_to_fit_data stays true if (score > 0.2 && score < 0.8) data_might_not_be_well_fit = false; //same as above hit->setScore(score); if (prob_correct) { hit->setScore(1 - score); } else { hit->setScore(score); } } } it->setHits(hits); } it->setScoreType("Posterior Error Probability"); it->setHigherScoreBetter(false); } } } if (unable_to_fit_data) writeLog_(String("unable to fit data for search engine: ") + engine); if (unable_to_fit_data && !ignore_bad_data) return UNEXPECTED_RESULT; if (data_might_not_be_well_fit) writeLog_(String("data might not be well fitted for search engine: ") + engine); } } //------------------------------------------------------------- // 
writing output //------------------------------------------------------------- file.store(outputfile_name, protein_ids, peptide_ids); return EXECUTION_OK; }