void InternalCalibration::calibrateMapGlobally(const FeatureMap<> & feature_map, FeatureMap<> & calibrated_feature_map, std::vector<PeptideIdentification> & ref_ids, String trafo_file_name) { checkReferenceIds_(ref_ids); calibrated_feature_map = feature_map; // clear the ids for (Size f = 0; f < calibrated_feature_map.size(); ++f) { calibrated_feature_map[f].getPeptideIdentifications().clear(); } // map the reference ids onto the features IDMapper mapper; Param param; param.setValue("rt_tolerance", (DoubleReal)param_.getValue("rt_tolerance")); param.setValue("mz_tolerance", param_.getValue("mz_tolerance")); param.setValue("mz_measure", param_.getValue("mz_tolerance_unit")); mapper.setParameters(param); std::vector<ProteinIdentification> vec; mapper.annotate(calibrated_feature_map, ref_ids, vec); // calibrate calibrateMapGlobally(calibrated_feature_map, calibrated_feature_map, trafo_file_name); // copy the old ids calibrated_feature_map.setUnassignedPeptideIdentifications(feature_map.getUnassignedPeptideIdentifications()); for (Size f = 0; f < feature_map.size(); ++f) { calibrated_feature_map[f].getPeptideIdentifications().clear(); if (!feature_map[f].getPeptideIdentifications().empty()) { calibrated_feature_map[f].setPeptideIdentifications(feature_map[f].getPeptideIdentifications()); } } }
/**
  @brief Tool entry point: trains a model with SvmTheoreticalSpectrumGeneratorTrainer.

  Loads the spectra ("in_spectra", mzML) and identifications ("in_identifications", idXML),
  maps the identifications onto the spectra with an IDMapper using tolerances of 0.001 for
  both RT and m/z, collects the sequence of the first hit of the first identification of
  every spectrum, and passes spectra and sequences to the trainer.

  @return ILLEGAL_PARAMETERS if a spectrum has no identification with at least one hit
          (the trainer is given exactly one annotation per spectrum), EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char **)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------
  String in_spectra = getStringOption_("in_spectra");
  String in_identifications = getStringOption_("in_identifications");
  String outfile = getStringOption_("model_output_file");
  Int precursor_charge = getIntOption_("precursor_charge");

  //-------------------------------------------------------------
  // init SvmTheoreticalSpectrumGeneratorTrainer
  //-------------------------------------------------------------
  SvmTheoreticalSpectrumGeneratorTrainer trainer;

  Param param = getParam_().copy("algorithm:", true);
  String write_files = getFlag_("write_training_files") ? "true" : "false";
  param.setValue("write_training_files", write_files);
  trainer.setParameters(param);

  //-------------------------------------------------------------
  // loading input
  //-------------------------------------------------------------
  PeakMap map;
  MzMLFile().load(in_spectra, map);

  std::vector<PeptideIdentification> pep_ids;
  std::vector<ProteinIdentification> prot_ids;
  String tmp_str;
  IdXMLFile().load(in_identifications, prot_ids, pep_ids, tmp_str);

  IDMapper idmapper;
  Param par;
  par.setValue("rt_tolerance", 0.001);
  par.setValue("mz_tolerance", 0.001);
  idmapper.setParameters(par);
  idmapper.annotate(map, pep_ids, prot_ids);

  // generate vector of annotations: one sequence per spectrum, in spectrum order
  std::vector<AASequence> annotations;
  Size spectrum_index = 0;
  for (PeakMap::iterator it = map.begin(); it != map.end(); ++it, ++spectrum_index)
  {
    // Spectra that received no identification (or an identification without hits)
    // cannot be annotated; indexing [0] on them would read out of bounds.
    // NOTE(review): writeLog_ and ILLEGAL_PARAMETERS are assumed to be provided by the
    // tool's base class, as in the other tools of this code base -- confirm.
    if (it->getPeptideIdentifications().empty() || it->getPeptideIdentifications()[0].getHits().empty())
    {
      writeLog_("Error: spectrum " + String(spectrum_index) + " of '" + in_spectra + "' has no peptide identification with at least one hit; every spectrum needs an annotation for training.");
      return ILLEGAL_PARAMETERS;
    }
    annotations.push_back(it->getPeptideIdentifications()[0].getHits()[0].getSequence());
  }

  trainer.trainModel(map, annotations, outfile, precursor_charge);
  return EXECUTION_OK;
}
/**
  @brief Tool entry point: maps peptide/protein identifications onto a quantitation file.

  The identifications are read from the "id" option (idXML or mzIdentML; any other type
  raises Exception::IllegalArgument). An IDMapper is configured from the tool options and
  applied to the file given by "in", whose type decides the processing:
  consensusXML, featureXML or mzQuantML. The annotated data is stored to "out".

  For any other input type nothing is annotated or written, and EXECUTION_OK is still returned.

  @return EXECUTION_OK
*/
ExitCodes main_(int, const char**)
{
  //----------------------------------------------------------------
  // load ids
  //----------------------------------------------------------------
  String id_file = getStringOption_("id");
  vector<ProteinIdentification> protein_ids;
  vector<PeptideIdentification> peptide_ids;

  const FileTypes::Type id_type = FileHandler::getType(id_file);
  if (id_type == FileTypes::MZIDENTML)
  {
    MzIdentMLFile().load(id_file, protein_ids, peptide_ids);
  }
  else if (id_type == FileTypes::IDXML)
  {
    IdXMLFile().load(id_file, protein_ids, peptide_ids);
  }
  else
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "wrong id fileformat");
  }

  String input_file = getStringOption_("in");
  String output_file = getStringOption_("out");
  const FileTypes::Type input_type = FileHandler::getType(input_file);

  //----------------------------------------------------------------
  // create mapper (defaults overridden by the tool options)
  //----------------------------------------------------------------
  IDMapper mapper;
  Param mapper_param = mapper.getParameters();
  mapper_param.setValue("rt_tolerance", getDoubleOption_("rt_tolerance"));
  mapper_param.setValue("mz_tolerance", getDoubleOption_("mz_tolerance"));
  mapper_param.setValue("mz_measure", getStringOption_("mz_measure"));
  mapper_param.setValue("mz_reference", getStringOption_("mz_reference"));
  const char* ignore_charge_value = getFlag_("ignore_charge") ? "true" : "false";
  mapper_param.setValue("ignore_charge", ignore_charge_value);
  mapper.setParameters(mapper_param);

  // the input type never changes below, so exactly one of the branches can apply
  if (input_type == FileTypes::CONSENSUSXML)
  {
    //----------------------------------------------------------------
    // consensusXML
    //----------------------------------------------------------------
    ConsensusXMLFile consensus_file;
    ConsensusMap consensus_map;
    consensus_file.load(input_file, consensus_map);

    bool use_subelements = getFlag_("consensus:use_subelements");
    bool annotate_with_subelements = getFlag_("consensus:annotate_ids_with_subelements");
    mapper.annotate(consensus_map, peptide_ids, protein_ids, use_subelements, annotate_with_subelements);

    // annotate output with data processing info
    addDataProcessing_(consensus_map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

    consensus_file.store(output_file, consensus_map);
  }
  else if (input_type == FileTypes::FEATUREXML)
  {
    //----------------------------------------------------------------
    // featureXML
    //----------------------------------------------------------------
    FeatureMap feature_map;
    FeatureXMLFile feature_file;
    feature_file.load(input_file, feature_map);

    mapper.annotate(feature_map, peptide_ids, protein_ids,
                    getFlag_("feature:use_centroid_rt"),
                    getFlag_("feature:use_centroid_mz"));

    // annotate output with data processing info
    addDataProcessing_(feature_map, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));

    feature_file.store(output_file, feature_map);
  }
  else if (input_type == FileTypes::MZQUANTML)
  {
    //----------------------------------------------------------------
    // MzQuantML
    //----------------------------------------------------------------
    MSQuantifications quantifications;
    MzQuantMLFile quant_file;
    quant_file.load(input_file, quantifications);

    bool use_subelements = getFlag_("consensus:use_subelements");
    // every consensus map of the document is annotated separately
    std::vector<ConsensusMap>::iterator cm_it = quantifications.getConsensusMaps().begin();
    while (cm_it != quantifications.getConsensusMaps().end())
    {
      mapper.annotate(*cm_it, peptide_ids, protein_ids, use_subelements);

      // annotate output with data processing info
      addDataProcessing_(*cm_it, getProcessingInfo_(DataProcessing::IDENTIFICATION_MAPPING));
      ++cm_it;
    }

    quant_file.store(output_file, quantifications);
  }

  return EXECUTION_OK;
}
/**
  @brief Tool entry point: computes precursor and fragment mass error statistics.

  Identification files ("id_in", idXML) and spectrum files ("in", mzML) are read pairwise,
  the identifications are mapped onto the spectra with an IDMapper and the spectra are
  normalized. For the first hit of every identification attached to a spectrum

  - the precursor m/z is compared to the theoretical m/z of the hit (if "precursor_out" is set),
  - the peaks of the spectrum are aligned to theoretical b/y ions (if "fragment_out" is set).

  For each requested output the raw errors (Da or ppm) are written one per line, debug
  statistics (mean, abs. dev., std. dev., median) are reported, a histogram with
  "number_of_bins" bins is built and a Gaussian is fitted to it. Optionally gnuplot
  data/script files are written next to the output file.

  @return ILLEGAL_PARAMETERS if the numbers of spectrum and identification files differ,
          EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char **)
{
  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  StringList id_in(getStringList_("id_in"));
  StringList in_raw(getStringList_("in"));
  Size number_of_bins(static_cast<UInt>(getIntOption_("number_of_bins")));
  bool precursor_error_ppm(getFlag_("precursor_error_ppm"));
  bool fragment_error_ppm(getFlag_("fragment_error_ppm"));
  bool generate_gnuplot_scripts(DataValue(getStringOption_("generate_gnuplot_scripts")).toBool());
  // an empty file name disables the corresponding statistics
  String precursor_out_file(getStringOption_("precursor_out"));
  String fragment_out_file(getStringOption_("fragment_out"));

  if (in_raw.size() != id_in.size())
  {
    writeLog_("Number of spectrum files and identification files differs...");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  vector<vector<PeptideIdentification> > pep_ids;
  vector<vector<ProteinIdentification> > prot_ids;
  pep_ids.resize(id_in.size());
  prot_ids.resize(id_in.size());

  IdXMLFile idxmlfile;
  for (Size i = 0; i != id_in.size(); ++i)
  {
    String doc_id;
    idxmlfile.load(id_in[i], prot_ids[i], pep_ids[i], doc_id);
  }

  // read mzML files
  vector<RichPeakMap> maps_raw;
  maps_raw.resize(in_raw.size());

  MzMLFile mzml_file;
  for (Size i = 0; i != in_raw.size(); ++i)
  {
    mzml_file.load(in_raw[i], maps_raw[i]);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // mapping ids (the i-th id file belongs to the i-th spectrum file)
  IDMapper mapper;
  for (Size i = 0; i != maps_raw.size(); ++i)
  {
    mapper.annotate(maps_raw[i], pep_ids[i], prot_ids[i]);
  }

  // normalize the spectra
  Normalizer normalizer;
  for (vector<RichPeakMap>::iterator it1 = maps_raw.begin(); it1 != maps_raw.end(); ++it1)
  {
    for (RichPeakMap::Iterator it2 = it1->begin(); it2 != it1->end(); ++it2)
    {
      normalizer.filterSpectrum(*it2);
    }
  }

  // generate precursor statistics
  vector<MassDifference> precursor_diffs;
  if (precursor_out_file != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            // only the first hit of an identification is evaluated
            PeptideHit hit = *it->getHits().begin();
            MassDifference md;
            Int charge = hit.getCharge();
            if (charge == 0)
            {
              // a charge of 0 is treated as singly charged to avoid dividing by zero below
              charge = 1;
            }
            md.exp_mz = it->getMZ();
            md.theo_mz = (hit.getSequence().getMonoWeight() + static_cast<double>(charge) * Constants::PROTON_MASS_U) / static_cast<double>(charge);
            md.charge = charge;
            precursor_diffs.push_back(md);
          }
        }
      }
    }
  }

  // generate fragment ions statistics
  vector<MassDifference> fragment_diffs;
  TheoreticalSpectrumGenerator tsg;
  SpectrumAlignment sa;
  double fragment_mass_tolerance(getDoubleOption_("fragment_mass_tolerance"));
  Param sa_param(sa.getParameters());
  sa_param.setValue("tolerance", fragment_mass_tolerance);
  sa.setParameters(sa_param);

  if (fragment_out_file != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            // only the first hit of an identification is evaluated
            PeptideHit hit = *it->getHits().begin();

            RichPeakSpectrum theo_spec;
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::YIon);
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::BIon);

            // pairs of (index into theo_spec, index into the experimental spectrum)
            vector<pair<Size, Size> > pairs;
            sa.getSpectrumAlignment(pairs, theo_spec, maps_raw[i][j]);
            for (vector<pair<Size, Size> >::const_iterator pit = pairs.begin(); pit != pairs.end(); ++pit)
            {
              MassDifference md;
              md.exp_mz = maps_raw[i][j][pit->second].getMZ();
              md.theo_mz = theo_spec[pit->first].getMZ();
              md.intensity = maps_raw[i][j][pit->second].getIntensity();
              md.charge = hit.getCharge();
              fragment_diffs.push_back(md);
            }
          }
        }
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  if (precursor_out_file != "")
  {
    vector<double> errors;
    ofstream precursor_out(precursor_out_file.c_str());
    // numeric_limits<double>::min() is the smallest POSITIVE double, so the maximum
    // has to be seeded with -max() to be correct when all errors are negative
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != precursor_diffs.size(); ++i)
    {
      double diff = getMassDifference(precursor_diffs[i].theo_mz, precursor_diffs[i].exp_mz, precursor_error_ppm);
      precursor_out << diff << "\n";
      errors.push_back(diff);

      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    precursor_out.close();

    if (errors.empty())
    {
      // without any value there is no range, no median and nothing to fit
      writeLog_("No precursor mass differences found, skipping precursor statistics and Gaussian fit");
    }
    else
    {
      // fill histogram with the collected values
      // NOTE(review): if all errors are identical bin_size is 0; how Histogram
      // handles that is not visible here -- confirm.
      double bin_size = (max_diff - min_diff) / static_cast<double>(number_of_bins);

      Histogram<double, double> hist(min_diff, max_diff, bin_size);
      for (Size i = 0; i != errors.size(); ++i)
      {
        hist.inc(errors[i], 1.0);
      }

      writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

      // transform the histogram into a vector<DPosition<2> > for the fitting
      vector<DPosition<2> > values;
      for (Size i = 0; i != hist.size(); ++i)
      {
        DPosition<2> p;
        p.setX(static_cast<double>(i) / static_cast<double>(number_of_bins) * (max_diff - min_diff) + min_diff);
        p.setY(hist[i]);
        values.push_back(p);
      }

      double mean = Math::mean(errors.begin(), errors.end());
      double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
      double sdv = Math::sd(errors.begin(), errors.end(), mean);
      sort(errors.begin(), errors.end());
      double median = errors[errors.size() / 2];

      writeDebug_("Precursor mean error: " + String(mean), 1);
      writeDebug_("Precursor abs. dev.:  " + String(abs_dev), 1);
      writeDebug_("Precursor std. dev.:  " + String(sdv), 1);
      writeDebug_("Precursor median error:  " + String(median), 1);

      // calculate histogram for gauss fitting
      GaussFitter gf;
      GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv/500.0);
      gf.setInitialParameters(init_param);

      try
      {
        gf.fit(values);

        // write gnuplot scripts
        if (generate_gnuplot_scripts)
        {
          ofstream out(String(precursor_out_file + "_gnuplot.dat").c_str());
          for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
          {
            out << it->getX() << " " << it->getY() << "\n";
          }
          out.close();

          // NOTE(review): the script plots f(x), but no definition of f(x) is written
          // and the result of the fit is not used -- confirm whether this is intended.
          ofstream gpl_out(String(precursor_out_file + "_gnuplot.gpl").c_str());
          gpl_out << "set terminal png" << endl;
          gpl_out << "set output \"" << precursor_out_file  << "_gnuplot.png\"" << endl;
          if (precursor_error_ppm)
          {
            gpl_out << "set xlabel \"error in ppm\"" << endl;
          }
          else
          {
            gpl_out << "set xlabel \"error in Da\"" << endl;
          }
          gpl_out << "set ylabel \"frequency\"" << endl;
          gpl_out << "plot '" << precursor_out_file << "_gnuplot.dat' title 'Precursor mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
          gpl_out.close();
        }
      }
      catch (const Exception::UnableToFit &)
      {
        writeLog_("Unable to fit a Gaussian distribution to the precursor mass errors");
      }
    }
  }

  if (fragment_out_file != "")
  {
    vector<double> errors;
    ofstream fragment_out(fragment_out_file.c_str());
    // see above: seed the maximum with -max(), not with the positive min()
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != fragment_diffs.size(); ++i)
    {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      fragment_out << diff << "\n";
      errors.push_back(diff);

      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    fragment_out.close();

    if (errors.empty())
    {
      // without any value there is no range, no median and nothing to fit
      writeLog_("No fragment mass differences found, skipping fragment statistics and Gaussian fit");
    }
    else
    {
      // fill histogram with the collected values
      // here we use the intensities to scale the error
      // low intensity peaks are likely to be random matches
      // NOTE(review): if all errors are identical bin_size is 0; how Histogram
      // handles that is not visible here -- confirm.
      double bin_size = (max_diff - min_diff) / static_cast<double>(number_of_bins);

      Histogram<double, double> hist(min_diff, max_diff, bin_size);
      for (Size i = 0; i != fragment_diffs.size(); ++i)
      {
        // errors[i] still holds the error of fragment_diffs[i] (errors is sorted only later)
        hist.inc(errors[i], fragment_diffs[i].intensity);
      }

      writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

      // transform the histogram into a vector<DPosition<2> > for the fitting
      vector<DPosition<2> > values;
      for (Size i = 0; i != hist.size(); ++i)
      {
        DPosition<2> p;
        p.setX(static_cast<double>(i) / static_cast<double>(number_of_bins) * (max_diff - min_diff) + min_diff);
        p.setY(hist[i]);
        values.push_back(p);
      }

      double mean = Math::mean(errors.begin(), errors.end());
      double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
      double sdv = Math::sd(errors.begin(), errors.end(), mean);
      sort(errors.begin(), errors.end());
      double median = errors[errors.size() / 2];

      writeDebug_("Fragment mean error:  " + String(mean), 1);
      writeDebug_("Fragment abs. dev.:   " + String(abs_dev), 1);
      writeDebug_("Fragment std. dev.:   " + String(sdv), 1);
      writeDebug_("Fragment median error:   " + String(median), 1);

      // calculate histogram for gauss fitting
      GaussFitter gf;
      GaussFitter::GaussFitResult init_param (hist.maxValue(), median, sdv / 100.0);
      gf.setInitialParameters(init_param);

      try
      {
        gf.fit(values);

        // write gnuplot script
        if (generate_gnuplot_scripts)
        {
          ofstream out(String(fragment_out_file + "_gnuplot.dat").c_str());
          for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
          {
            out << it->getX() << " " << it->getY() << "\n";
          }
          out.close();

          // NOTE(review): the script plots f(x), but no definition of f(x) is written
          // and the result of the fit is not used -- confirm whether this is intended.
          ofstream gpl_out(String(fragment_out_file + "_gnuplot.gpl").c_str());
          gpl_out << "set terminal png" << endl;
          gpl_out << "set output \"" << fragment_out_file  << "_gnuplot.png\"" << endl;
          if (fragment_error_ppm)
          {
            gpl_out << "set xlabel \"error in ppm\"" << endl;
          }
          else
          {
            gpl_out << "set xlabel \"error in Da\"" << endl;
          }
          gpl_out << "set ylabel \"frequency\"" << endl;
          gpl_out << "plot '" << fragment_out_file << "_gnuplot.dat' title 'Fragment mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
          gpl_out.close();
        }
      }
      catch (const Exception::UnableToFit &)
      {
        writeLog_("Unable to fit a Gaussian distribution to the fragment mass errors");
      }
    }
  }

  return EXECUTION_OK;
}