void registerOptionsAndFlags_()
{
  registerInputFile_("id", "<file>", "", "Protein/peptide identifications file");
  setValidFormats_("id", ListUtils::create<String>("mzid,idXML"));
  registerInputFile_("in", "<file>", "", "Feature map/consensus map file");
  setValidFormats_("in", ListUtils::create<String>("featureXML,consensusXML,mzq"));
  registerOutputFile_("out", "<file>", "", "Output file (the format depends on the input file format).");
  setValidFormats_("out", ListUtils::create<String>("featureXML,consensusXML,mzq"));
  addEmptyLine_();

  IDMapper mapper;
  Param p = mapper.getParameters();
  registerDoubleOption_("rt_tolerance", "<value>", p.getValue("rt_tolerance"),
                        "RT tolerance (in seconds) for the matching of peptide identifications and (consensus) features.\n"
                        "Tolerance is understood as 'plus or minus x', so the matching range increases by twice the given value.", false);
  setMinFloat_("rt_tolerance", 0.0);
  registerDoubleOption_("mz_tolerance", "<value>", p.getValue("mz_tolerance"),
                        "m/z tolerance (in ppm or Da) for the matching of peptide identifications and (consensus) features.\n"
                        "Tolerance is understood as 'plus or minus x', so the matching range increases by twice the given value.", false);
  setMinFloat_("mz_tolerance", 0.0);
  registerStringOption_("mz_measure", "<choice>", p.getEntry("mz_measure").valid_strings[0], "Unit of 'mz_tolerance'.", false);
  setValidStrings_("mz_measure", p.getEntry("mz_measure").valid_strings);
  registerStringOption_("mz_reference", "<choice>", p.getEntry("mz_reference").valid_strings[0],
                        "Source of m/z values for peptide identifications. If 'precursor', the precursor-m/z from the idXML is used. If 'peptide',\n"
                        "masses are computed from the sequences of peptide hits; in this case, an identification matches if any of its hits matches.\n"
                        "('peptide' should be used together with 'feature:use_centroid_mz' to avoid false-positive matches.)", false);
  setValidStrings_("mz_reference", p.getEntry("mz_reference").valid_strings);
  registerFlag_("ignore_charge", "For feature/consensus maps: Assign an ID independently of whether its charge state matches that of the (consensus) feature.");
  addEmptyLine_();

  registerTOPPSubsection_("feature", "Additional options for featureXML input");
  registerFlag_("feature:use_centroid_rt", "Use the RT coordinates of the feature centroids for matching, instead of the RT ranges of the features/mass traces.");
  registerFlag_("feature:use_centroid_mz",
                "Use the m/z coordinates of the feature centroids for matching, instead of the m/z ranges of the features/mass traces.\n"
                "(If you choose 'peptide' as 'mz_reference', you should usually set this flag to avoid false-positive matches.)");
  addEmptyLine_();

  registerTOPPSubsection_("consensus", "Additional options for consensusXML input");
  registerFlag_("consensus:use_subelements", "Match using RT and m/z of sub-features instead of consensus RT and m/z. A consensus feature matches if any of its sub-features matches.");
  registerFlag_("consensus:annotate_ids_with_subelements", "Store the map index of the sub-feature in the peptide ID.", true);
}
void InternalCalibration::calibrateMapGlobally(const FeatureMap<>& feature_map, FeatureMap<>& calibrated_feature_map,
                                               std::vector<PeptideIdentification>& ref_ids, String trafo_file_name)
{
  checkReferenceIds_(ref_ids);

  calibrated_feature_map = feature_map;
  // clear the ids
  for (Size f = 0; f < calibrated_feature_map.size(); ++f)
  {
    calibrated_feature_map[f].getPeptideIdentifications().clear();
  }

  // map the reference ids onto the features
  IDMapper mapper;
  Param param;
  param.setValue("rt_tolerance", (DoubleReal)param_.getValue("rt_tolerance"));
  param.setValue("mz_tolerance", param_.getValue("mz_tolerance"));
  param.setValue("mz_measure", param_.getValue("mz_tolerance_unit"));
  mapper.setParameters(param);
  std::vector<ProteinIdentification> vec;
  mapper.annotate(calibrated_feature_map, ref_ids, vec);

  // calibrate
  calibrateMapGlobally(calibrated_feature_map, calibrated_feature_map, trafo_file_name);

  // copy the old ids
  calibrated_feature_map.setUnassignedPeptideIdentifications(feature_map.getUnassignedPeptideIdentifications());
  for (Size f = 0; f < feature_map.size(); ++f)
  {
    calibrated_feature_map[f].getPeptideIdentifications().clear();
    if (!feature_map[f].getPeptideIdentifications().empty())
    {
      calibrated_feature_map[f].setPeptideIdentifications(feature_map[f].getPeptideIdentifications());
    }
  }
}
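/*
 * Illustrative usage sketch (not part of the library code above): one way the ID-based
 * calibrateMapGlobally() overload could be driven from files on disk. It assumes the
 * relevant OpenMS headers are included and that InternalCalibration can be used with its
 * default parameters; the file names are placeholders. The I/O calls mirror those used
 * elsewhere in this section (FeatureXMLFile, IdXMLFile).
 */
void calibrateFromFiles()
{
  FeatureMap<> features, calibrated;
  FeatureXMLFile().load("input.featureXML", features);                  // uncalibrated features

  std::vector<ProteinIdentification> ref_proteins;
  std::vector<PeptideIdentification> ref_peptides;
  IdXMLFile().load("reference_ids.idXML", ref_proteins, ref_peptides);  // reference identifications

  InternalCalibration calibration;
  calibration.calibrateMapGlobally(features, calibrated, ref_peptides, "calibration.trafoXML");

  FeatureXMLFile().store("calibrated.featureXML", calibrated);          // write the calibrated map
}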
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------
  String in_spectra = getStringOption_("in_spectra");
  String in_identifications = getStringOption_("in_identifications");
  String outfile = getStringOption_("model_output_file");
  Int precursor_charge = getIntOption_("precursor_charge");

  //-------------------------------------------------------------
  // init SvmTheoreticalSpectrumGeneratorTrainer
  //-------------------------------------------------------------
  SvmTheoreticalSpectrumGeneratorTrainer trainer;

  Param param = getParam_().copy("algorithm:", true);
  String write_files = getFlag_("write_training_files") ? "true" : "false";
  param.setValue("write_training_files", write_files);
  trainer.setParameters(param);

  //-------------------------------------------------------------
  // loading input
  //-------------------------------------------------------------
  PeakMap map;
  MzMLFile().load(in_spectra, map);

  std::vector<PeptideIdentification> pep_ids;
  std::vector<ProteinIdentification> prot_ids;
  String tmp_str;
  IdXMLFile().load(in_identifications, prot_ids, pep_ids, tmp_str);

  // map the identifications onto the spectra; very tight RT/m/z tolerances, since the
  // identifications are expected to stem from exactly these spectra
  IDMapper idmapper;
  Param par;
  par.setValue("rt_tolerance", 0.001);
  par.setValue("mz_tolerance", 0.001);
  idmapper.setParameters(par);
  idmapper.annotate(map, pep_ids, prot_ids);

  // generate vector of annotations (first hit of the first identification of each spectrum)
  std::vector<AASequence> annotations;
  PeakMap::iterator it;
  for (it = map.begin(); it != map.end(); ++it)
  {
    annotations.push_back(it->getPeptideIdentifications()[0].getHits()[0].getSequence());
  }

  trainer.trainModel(map, annotations, outfile, precursor_charge);

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**)
{
  // LOG_DEBUG << "Starting..." << endl;

  //----------------------------------------------------------------
  // load ids
  //----------------------------------------------------------------
  // LOG_DEBUG << "Loading idXML..." << endl;
  String id = getStringOption_("id");
  vector<ProteinIdentification> protein_ids;
  vector<PeptideIdentification> peptide_ids;
  FileTypes::Type in_type = FileHandler::getType(id);
  if (in_type == FileTypes::IDXML)
  {
    IdXMLFile().load(id, protein_ids, peptide_ids);
  }
  else if (in_type == FileTypes::MZIDENTML)
  {
    MzIdentMLFile().load(id, protein_ids, peptide_ids);
  }
  else
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "wrong id fileformat");
  }

  String in = getStringOption_("in");
  String out = getStringOption_("out");
  in_type = FileHandler::getType(in);

  //----------------------------------------------------------------
  // create mapper
  //----------------------------------------------------------------
  // LOG_DEBUG << "Creating mapper..." << endl;
  IDMapper mapper;
  Param p = mapper.getParameters();
  p.setValue("rt_tolerance", getDoubleOption_("rt_tolerance"));
  p.setValue("mz_tolerance", getDoubleOption_("mz_tolerance"));
  p.setValue("mz_measure", getStringOption_("mz_measure"));
  p.setValue("mz_reference", getStringOption_("mz_reference"));
  p.setValue("ignore_charge", getFlag_("ignore_charge") ? "true" : "false");
  mapper.setParameters(p);

  //----------------------------------------------------------------
  // consensusXML
  //----------------------------------------------------------------
  if (in_type == FileTypes::CONSENSUSXML)
  {
    // LOG_DEBUG << "Processing consensus map..." << endl;
    ConsensusXMLFile file;
    ConsensusMap map;
    file.load(in, map);

    bool measure_from_subelements = getFlag_("consensus:use_subelements");
    bool annotate_ids_with_subelements = getFlag_("consensus:annotate_ids_with_subelements");
    mapper.annotate(map, peptide_ids, protein_ids, measure_from_subelements, annotate_ids_with_subelements);

    // annotate output with data processing info
    addDataProcessing_(map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

    file.store(out, map);
  }

  //----------------------------------------------------------------
  // featureXML
  //----------------------------------------------------------------
  if (in_type == FileTypes::FEATUREXML)
  {
    // LOG_DEBUG << "Processing feature map..." << endl;
    FeatureMap map;
    FeatureXMLFile file;
    file.load(in, map);

    mapper.annotate(map, peptide_ids, protein_ids, getFlag_("feature:use_centroid_rt"), getFlag_("feature:use_centroid_mz"));

    // annotate output with data processing info
    addDataProcessing_(map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

    file.store(out, map);
  }

  //----------------------------------------------------------------
  // MzQuantML
  //----------------------------------------------------------------
  if (in_type == FileTypes::MZQUANTML)
  {
    // LOG_DEBUG << "Processing mzq ..." << endl;
    MSQuantifications msq;
    MzQuantMLFile file;
    file.load(in, msq);

    bool measure_from_subelements = getFlag_("consensus:use_subelements");
    for (std::vector<ConsensusMap>::iterator it = msq.getConsensusMaps().begin(); it != msq.getConsensusMaps().end(); ++it)
    {
      mapper.annotate(*it, peptide_ids, protein_ids, measure_from_subelements);

      // annotate output with data processing info
      addDataProcessing_(*it, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));
    }
    //~ writeDebug_(msq.getConsensusMaps().size(), 3);
    //~ writeDebug_(msq.getConsensusMaps().back().size(), 3);
    //~ writeDebug_(msq.getAnalysisSummary().quant_type_, 3);
    file.store(out, msq);
  }

  // LOG_DEBUG << "Done." << endl;
  return EXECUTION_OK;
}
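/*
 * Illustrative sketch (not part of the tool code above): the core library calls that the
 * featureXML branch of the tool wraps, usable directly from C++ without the TOPP framework.
 * It assumes the relevant OpenMS headers are included; the file names and tolerance values
 * are placeholders. The calls themselves (IdXMLFile::load, Param::setValue, IDMapper::annotate,
 * FeatureXMLFile::load/store) are the same ones used in main_() above.
 */
void annotateFeatureMap()
{
  std::vector<ProteinIdentification> protein_ids;
  std::vector<PeptideIdentification> peptide_ids;
  IdXMLFile().load("ids.idXML", protein_ids, peptide_ids);

  FeatureMap map;
  FeatureXMLFile file;
  file.load("features.featureXML", map);

  IDMapper mapper;
  Param p = mapper.getParameters();
  p.setValue("rt_tolerance", 5.0);   // +/- 5 s
  p.setValue("mz_tolerance", 20.0);  // +/- 20, interpreted according to 'mz_measure' (ppm or Da)
  mapper.setParameters(p);

  // use the centroid RT and m/z of each feature instead of its RT/m/z ranges
  mapper.annotate(map, peptide_ids, protein_ids, true, true);

  file.store("features_annotated.featureXML", map);
}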
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  StringList id_in(getStringList_("id_in"));
  StringList in_raw(getStringList_("in"));
  Size number_of_bins((UInt)getIntOption_("number_of_bins"));
  bool precursor_error_ppm(getFlag_("precursor_error_ppm"));
  bool fragment_error_ppm(getFlag_("fragment_error_ppm"));
  bool generate_gnuplot_scripts(DataValue(getStringOption_("generate_gnuplot_scripts")).toBool());
  if (in_raw.size() != id_in.size())
  {
    writeLog_("Number of spectrum files and identification files differs...");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  vector<vector<PeptideIdentification> > pep_ids;
  vector<vector<ProteinIdentification> > prot_ids;
  pep_ids.resize(id_in.size());
  prot_ids.resize(id_in.size());

  IdXMLFile idxmlfile;
  for (Size i = 0; i != id_in.size(); ++i)
  {
    String doc_id;
    idxmlfile.load(id_in[i], prot_ids[i], pep_ids[i], doc_id);
  }

  // read mzML files
  vector<RichPeakMap> maps_raw;
  maps_raw.resize(in_raw.size());

  MzMLFile mzml_file;
  for (Size i = 0; i != in_raw.size(); ++i)
  {
    mzml_file.load(in_raw[i], maps_raw[i]);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------
  // mapping ids
  IDMapper mapper;
  for (Size i = 0; i != maps_raw.size(); ++i)
  {
    mapper.annotate(maps_raw[i], pep_ids[i], prot_ids[i]);
  }

  // normalize the spectra
  Normalizer normalizer;
  for (vector<RichPeakMap>::iterator it1 = maps_raw.begin(); it1 != maps_raw.end(); ++it1)
  {
    for (RichPeakMap::Iterator it2 = it1->begin(); it2 != it1->end(); ++it2)
    {
      normalizer.filterSpectrum(*it2);
    }
  }

  // generate precursor statistics
  vector<MassDifference> precursor_diffs;
  if (getStringOption_("precursor_out") != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin();
             it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            PeptideHit hit = *it->getHits().begin();
            MassDifference md;
            Int charge = hit.getCharge();
            if (charge == 0)
            {
              charge = 1;
            }
            md.exp_mz = it->getMZ();
            md.theo_mz = (hit.getSequence().getMonoWeight() + (double)charge * Constants::PROTON_MASS_U) / (double)charge;
            md.charge = charge;
            precursor_diffs.push_back(md);
          }
        }
      }
    }
  }

  // generate fragment ion statistics
  vector<MassDifference> fragment_diffs;
  TheoreticalSpectrumGenerator tsg;
  SpectrumAlignment sa;
  double fragment_mass_tolerance(getDoubleOption_("fragment_mass_tolerance"));
  Param sa_param(sa.getParameters());
  sa_param.setValue("tolerance", fragment_mass_tolerance);
  sa.setParameters(sa_param);

  if (getStringOption_("fragment_out") != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin();
             it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            PeptideHit hit = *it->getHits().begin();
            RichPeakSpectrum theo_spec;
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::YIon);
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::BIon);

            vector<pair<Size, Size> > pairs;
            sa.getSpectrumAlignment(pairs, theo_spec, maps_raw[i][j]);
            //cerr << hit.getSequence() << " " << hit.getSequence().getSuffix(1).getFormula() << " " << hit.getSequence().getSuffix(1).getFormula().getMonoWeight() << endl;
            for (vector<pair<Size, Size> >::const_iterator pit = pairs.begin(); pit != pairs.end(); ++pit)
            {
              MassDifference md;
              md.exp_mz = maps_raw[i][j][pit->second].getMZ();
              md.theo_mz = theo_spec[pit->first].getMZ();
              //cerr.precision(15);
              //cerr << md.exp_mz << " " << md.theo_mz << " " << md.exp_mz - md.theo_mz << endl;
              md.intensity = maps_raw[i][j][pit->second].getIntensity();
              md.charge = hit.getCharge();
              fragment_diffs.push_back(md);
            }
          }
        }
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  String precursor_out_file(getStringOption_("precursor_out"));
  if (precursor_out_file != "")
  {
    vector<double> errors;
    ofstream precursor_out(precursor_out_file.c_str());
    // initialize max_diff with the lowest representable value
    // (numeric_limits<double>::min() is the smallest positive value, not the most negative one)
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != precursor_diffs.size(); ++i)
    {
      double diff = getMassDifference(precursor_diffs[i].theo_mz, precursor_diffs[i].exp_mz, precursor_error_ppm);
      precursor_out << diff << "\n";
      errors.push_back(diff);

      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    precursor_out.close();

    // fill histogram with the collected values
    double bin_size = (max_diff - min_diff) / (double)number_of_bins;
    Histogram<double, double> hist(min_diff, max_diff, bin_size);
    for (Size i = 0; i != errors.size(); ++i)
    {
      hist.inc(errors[i], 1.0);
    }

    writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

    // transform the histogram into a vector<DPosition<2> > for the fitting
    vector<DPosition<2> > values;
    for (Size i = 0; i != hist.size(); ++i)
    {
      DPosition<2> p;
      p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
      p.setY(hist[i]);
      values.push_back(p);
    }

    double mean = Math::mean(errors.begin(), errors.end());
    double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
    double sdv = Math::sd(errors.begin(), errors.end(), mean);
    sort(errors.begin(), errors.end());
    double median = errors[(Size)(errors.size() / 2.0)];

    writeDebug_("Precursor mean error: " + String(mean), 1);
    writeDebug_("Precursor abs. dev.: " + String(abs_dev), 1);
    writeDebug_("Precursor std. dev.: " + String(sdv), 1);
    writeDebug_("Precursor median error: " + String(median), 1);

    // calculate histogram for gauss fitting
    GaussFitter gf;
    GaussFitter::GaussFitResult init_param(hist.maxValue(), median, sdv / 500.0);
    gf.setInitialParameters(init_param);

    try
    {
      gf.fit(values);

      // write gnuplot scripts
      if (generate_gnuplot_scripts)
      {
        ofstream out(String(precursor_out_file + "_gnuplot.dat").c_str());
        for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
        {
          out << it->getX() << " " << it->getY() << endl;
        }
        out.close();

        ofstream gpl_out(String(precursor_out_file + "_gnuplot.gpl").c_str());
        gpl_out << "set terminal png" << endl;
        gpl_out << "set output \"" << precursor_out_file << "_gnuplot.png\"" << endl;
        if (precursor_error_ppm)
        {
          gpl_out << "set xlabel \"error in ppm\"" << endl;
        }
        else
        {
          gpl_out << "set xlabel \"error in Da\"" << endl;
        }
        gpl_out << "set ylabel \"frequency\"" << endl;
        gpl_out << "plot '" << precursor_out_file << "_gnuplot.dat' title 'Precursor mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
        gpl_out.close();
      }
    }
    catch (Exception::UnableToFit&)
    {
      writeLog_("Unable to fit a Gaussian distribution to the precursor mass errors");
    }
  }

  String fragment_out_file(getStringOption_("fragment_out"));
  if (fragment_out_file != "")
  {
    vector<double> errors;
    ofstream fragment_out(fragment_out_file.c_str());
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != fragment_diffs.size(); ++i)
    {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      fragment_out << diff << endl;
      errors.push_back(diff);

      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    fragment_out.close();

    // fill histogram with the collected values
    // here we use the intensities to scale the error:
    // low-intensity peaks are likely to be random matches
    double bin_size = (max_diff - min_diff) / (double)number_of_bins;
    Histogram<double, double> hist(min_diff, max_diff, bin_size);
    for (Size i = 0; i != fragment_diffs.size(); ++i)
    {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      hist.inc(diff, fragment_diffs[i].intensity);
    }

    writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

    // transform the histogram into a vector<DPosition<2> > for the fitting
    vector<DPosition<2> > values;
    for (Size i = 0; i != hist.size(); ++i)
    {
      DPosition<2> p;
      p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
      p.setY(hist[i]);
      values.push_back(p);
    }

    double mean = Math::mean(errors.begin(), errors.end());
    double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
    double sdv = Math::sd(errors.begin(), errors.end(), mean);
    sort(errors.begin(), errors.end());
    double median = errors[(Size)(errors.size() / 2.0)];

    writeDebug_("Fragment mean error: " + String(mean), 1);
    writeDebug_("Fragment abs. dev.: " + String(abs_dev), 1);
    writeDebug_("Fragment std. dev.: " + String(sdv), 1);
    writeDebug_("Fragment median error: " + String(median), 1);

    // calculate histogram for gauss fitting
    GaussFitter gf;
    GaussFitter::GaussFitResult init_param(hist.maxValue(), median, sdv / 100.0);
    gf.setInitialParameters(init_param);

    try
    {
      gf.fit(values);

      // write gnuplot script
      if (generate_gnuplot_scripts)
      {
        ofstream out(String(fragment_out_file + "_gnuplot.dat").c_str());
        for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
        {
          out << it->getX() << " " << it->getY() << endl;
        }
        out.close();

        ofstream gpl_out(String(fragment_out_file + "_gnuplot.gpl").c_str());
        gpl_out << "set terminal png" << endl;
        gpl_out << "set output \"" << fragment_out_file << "_gnuplot.png\"" << endl;
        if (fragment_error_ppm)
        {
          gpl_out << "set xlabel \"error in ppm\"" << endl;
        }
        else
        {
          gpl_out << "set xlabel \"error in Da\"" << endl;
        }
        gpl_out << "set ylabel \"frequency\"" << endl;
        gpl_out << "plot '" << fragment_out_file << "_gnuplot.dat' title 'Fragment mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
        gpl_out.close();
      }
    }
    catch (Exception::UnableToFit&)
    {
      writeLog_("Unable to fit a Gaussian distribution to the fragment mass errors");
    }
  }

  return EXECUTION_OK;
}