ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  StringList id_in(getStringList_("id_in"));
  StringList in_raw(getStringList_("in"));
  Size number_of_bins((UInt)getIntOption_("number_of_bins"));
  bool precursor_error_ppm(getFlag_("precursor_error_ppm"));
  bool fragment_error_ppm(getFlag_("fragment_error_ppm"));
  bool generate_gnuplot_scripts(DataValue(getStringOption_("generate_gnuplot_scripts")).toBool());
  if (in_raw.size() != id_in.size())
  {
    writeLog_("Number of spectrum files and identification files differs...");
    return ILLEGAL_PARAMETERS;
  }

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  vector<vector<PeptideIdentification> > pep_ids;
  vector<vector<ProteinIdentification> > prot_ids;
  pep_ids.resize(id_in.size());
  prot_ids.resize(id_in.size());

  IdXMLFile idxmlfile;
  for (Size i = 0; i != id_in.size(); ++i)
  {
    String doc_id;
    idxmlfile.load(id_in[i], prot_ids[i], pep_ids[i], doc_id);
  }

  // read mzML files
  vector<RichPeakMap> maps_raw;
  maps_raw.resize(in_raw.size());

  MzMLFile mzml_file;
  for (Size i = 0; i != in_raw.size(); ++i)
  {
    mzml_file.load(in_raw[i], maps_raw[i]);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------
  // map the identifications onto the spectra
  IDMapper mapper;
  for (Size i = 0; i != maps_raw.size(); ++i)
  {
    mapper.annotate(maps_raw[i], pep_ids[i], prot_ids[i]);
  }

  // normalize the spectra
  Normalizer normalizer;
  for (vector<RichPeakMap>::iterator it1 = maps_raw.begin(); it1 != maps_raw.end(); ++it1)
  {
    for (RichPeakMap::Iterator it2 = it1->begin(); it2 != it1->end(); ++it2)
    {
      normalizer.filterSpectrum(*it2);
    }
  }

  // generate precursor statistics
  vector<MassDifference> precursor_diffs;
  if (getStringOption_("precursor_out") != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            // compare the measured precursor m/z with the theoretical m/z of the top hit
            PeptideHit hit = *it->getHits().begin();
            MassDifference md;
            Int charge = hit.getCharge();
            if (charge == 0)
            {
              charge = 1;
            }
            md.exp_mz = it->getMZ();
            md.theo_mz = (hit.getSequence().getMonoWeight() + (double)charge * Constants::PROTON_MASS_U) / (double)charge;
            md.charge = charge;
            precursor_diffs.push_back(md);
          }
        }
      }
    }
  }

  // generate fragment ion statistics
  vector<MassDifference> fragment_diffs;
  TheoreticalSpectrumGenerator tsg;
  SpectrumAlignment sa;
  double fragment_mass_tolerance(getDoubleOption_("fragment_mass_tolerance"));
  Param sa_param(sa.getParameters());
  sa_param.setValue("tolerance", fragment_mass_tolerance);
  sa.setParameters(sa_param);

  if (getStringOption_("fragment_out") != "")
  {
    for (Size i = 0; i != maps_raw.size(); ++i)
    {
      for (Size j = 0; j != maps_raw[i].size(); ++j)
      {
        if (maps_raw[i][j].getPeptideIdentifications().empty())
        {
          continue;
        }
        for (vector<PeptideIdentification>::const_iterator it = maps_raw[i][j].getPeptideIdentifications().begin(); it != maps_raw[i][j].getPeptideIdentifications().end(); ++it)
        {
          if (it->getHits().size() > 0)
          {
            // align the measured spectrum against the theoretical y- and b-ion spectrum of the top hit
            PeptideHit hit = *it->getHits().begin();
            RichPeakSpectrum theo_spec;
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::YIon);
            tsg.addPeaks(theo_spec, hit.getSequence(), Residue::BIon);
            vector<pair<Size, Size> > pairs;
            sa.getSpectrumAlignment(pairs, theo_spec, maps_raw[i][j]);
            for (vector<pair<Size, Size> >::const_iterator pit = pairs.begin(); pit != pairs.end(); ++pit)
            {
              MassDifference md;
              md.exp_mz = maps_raw[i][j][pit->second].getMZ();
              md.theo_mz = theo_spec[pit->first].getMZ();
              md.intensity = maps_raw[i][j][pit->second].getIntensity();
              md.charge = hit.getCharge();
              fragment_diffs.push_back(md);
            }
          }
        }
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  String precursor_out_file(getStringOption_("precursor_out"));
  if (precursor_out_file != "")
  {
    vector<double> errors;
    ofstream precursor_out(precursor_out_file.c_str());
    // track the range of observed errors
    // (note: numeric_limits<double>::min() is the smallest *positive* double,
    //  so the maximum must be initialized with -max() instead)
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != precursor_diffs.size(); ++i)
    {
      double diff = getMassDifference(precursor_diffs[i].theo_mz, precursor_diffs[i].exp_mz, precursor_error_ppm);
      precursor_out << diff << "\n";
      errors.push_back(diff);
      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    precursor_out.close();

    // fill histogram with the collected values
    double bin_size = (max_diff - min_diff) / (double)number_of_bins;
    Histogram<double, double> hist(min_diff, max_diff, bin_size);
    for (Size i = 0; i != errors.size(); ++i)
    {
      hist.inc(errors[i], 1.0);
    }

    writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

    // transform the histogram into a vector<DPosition<2> > for the fitting
    vector<DPosition<2> > values;
    for (Size i = 0; i != hist.size(); ++i)
    {
      DPosition<2> p;
      p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
      p.setY(hist[i]);
      values.push_back(p);
    }

    double mean = Math::mean(errors.begin(), errors.end());
    double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
    double sdv = Math::sd(errors.begin(), errors.end(), mean);
    sort(errors.begin(), errors.end());
    double median = errors[(Size)(errors.size() / 2.0)];

    writeDebug_("Precursor mean error: " + String(mean), 1);
    writeDebug_("Precursor abs. dev.: " + String(abs_dev), 1);
    writeDebug_("Precursor std. dev.: " + String(sdv), 1);
    writeDebug_("Precursor median error: " + String(median), 1);

    // fit a Gaussian to the error histogram
    GaussFitter gf;
    GaussFitter::GaussFitResult init_param(hist.maxValue(), median, sdv / 500.0);
    gf.setInitialParameters(init_param);

    try
    {
      GaussFitter::GaussFitResult fit_result = gf.fit(values);

      // write gnuplot scripts
      if (generate_gnuplot_scripts)
      {
        ofstream out(String(precursor_out_file + "_gnuplot.dat").c_str());
        for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
        {
          out << it->getX() << " " << it->getY() << endl;
        }
        out.close();

        ofstream gpl_out(String(precursor_out_file + "_gnuplot.gpl").c_str());
        gpl_out << "set terminal png" << endl;
        gpl_out << "set output \"" << precursor_out_file << "_gnuplot.png\"" << endl;
        if (precursor_error_ppm)
        {
          gpl_out << "set xlabel \"error in ppm\"" << endl;
        }
        else
        {
          gpl_out << "set xlabel \"error in Da\"" << endl;
        }
        gpl_out << "set ylabel \"frequency\"" << endl;
        // define f(x) from the fitted parameters; without this the plot command below would fail
        gpl_out << "f(x)=" << fit_result.A << "*exp(-(x-" << fit_result.x0 << ")**2/(2*" << fit_result.sigma << "**2))" << endl;
        gpl_out << "plot '" << precursor_out_file << "_gnuplot.dat' title 'Precursor mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
        gpl_out.close();
      }
    }
    catch (Exception::UnableToFit&)
    {
      writeLog_("Unable to fit a Gaussian distribution to the precursor mass errors");
    }
  }

  String fragment_out_file(getStringOption_("fragment_out"));
  if (fragment_out_file != "")
  {
    vector<double> errors;
    ofstream fragment_out(fragment_out_file.c_str());
    // see note above: initialize the maximum with -max(), not min()
    double min_diff(numeric_limits<double>::max()), max_diff(-numeric_limits<double>::max());
    for (Size i = 0; i != fragment_diffs.size(); ++i)
    {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      fragment_out << diff << endl;
      errors.push_back(diff);
      if (diff > max_diff)
      {
        max_diff = diff;
      }
      if (diff < min_diff)
      {
        min_diff = diff;
      }
    }
    fragment_out.close();

    // fill histogram with the collected values;
    // here the intensities are used to weight the errors,
    // since low-intensity peaks are likely to be random matches
    double bin_size = (max_diff - min_diff) / (double)number_of_bins;
    Histogram<double, double> hist(min_diff, max_diff, bin_size);
    for (Size i = 0; i != fragment_diffs.size(); ++i)
    {
      double diff = getMassDifference(fragment_diffs[i].theo_mz, fragment_diffs[i].exp_mz, fragment_error_ppm);
      hist.inc(diff, fragment_diffs[i].intensity);
    }

    writeDebug_("min_diff=" + String(min_diff) + ", max_diff=" + String(max_diff) + ", number_of_bins=" + String(number_of_bins), 1);

    // transform the histogram into a vector<DPosition<2> > for the fitting
    vector<DPosition<2> > values;
    for (Size i = 0; i != hist.size(); ++i)
    {
      DPosition<2> p;
      p.setX((double)i / (double)number_of_bins * (max_diff - min_diff) + min_diff);
      p.setY(hist[i]);
      values.push_back(p);
    }

    double mean = Math::mean(errors.begin(), errors.end());
    double abs_dev = Math::absdev(errors.begin(), errors.end(), mean);
    double sdv = Math::sd(errors.begin(), errors.end(), mean);
    sort(errors.begin(), errors.end());
    double median = errors[(Size)(errors.size() / 2.0)];

    writeDebug_("Fragment mean error: " + String(mean), 1);
    writeDebug_("Fragment abs. dev.: " + String(abs_dev), 1);
    writeDebug_("Fragment std. dev.: " + String(sdv), 1);
    writeDebug_("Fragment median error: " + String(median), 1);

    // fit a Gaussian to the error histogram
    GaussFitter gf;
    GaussFitter::GaussFitResult init_param(hist.maxValue(), median, sdv / 100.0);
    gf.setInitialParameters(init_param);

    try
    {
      GaussFitter::GaussFitResult fit_result = gf.fit(values);

      // write gnuplot scripts
      if (generate_gnuplot_scripts)
      {
        ofstream out(String(fragment_out_file + "_gnuplot.dat").c_str());
        for (vector<DPosition<2> >::const_iterator it = values.begin(); it != values.end(); ++it)
        {
          out << it->getX() << " " << it->getY() << endl;
        }
        out.close();

        ofstream gpl_out(String(fragment_out_file + "_gnuplot.gpl").c_str());
        gpl_out << "set terminal png" << endl;
        gpl_out << "set output \"" << fragment_out_file << "_gnuplot.png\"" << endl;
        if (fragment_error_ppm)
        {
          gpl_out << "set xlabel \"error in ppm\"" << endl;
        }
        else
        {
          gpl_out << "set xlabel \"error in Da\"" << endl;
        }
        gpl_out << "set ylabel \"frequency\"" << endl;
        // define f(x) from the fitted parameters; without this the plot command below would fail
        gpl_out << "f(x)=" << fit_result.A << "*exp(-(x-" << fit_result.x0 << ")**2/(2*" << fit_result.sigma << "**2))" << endl;
        gpl_out << "plot '" << fragment_out_file << "_gnuplot.dat' title 'Fragment mass error distribution' w boxes, f(x) w lp title 'Gaussian fit of the error distribution'" << endl;
        gpl_out.close();
      }
    }
    catch (Exception::UnableToFit&)
    {
      writeLog_("Unable to fit a Gaussian distribution to the fragment mass errors");
    }
  }

  return EXECUTION_OK;
}
void LabeledPairFinder::run(const vector<ConsensusMap>& input_maps, ConsensusMap& result_map)
{
  if (input_maps.size() != 1)
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "exactly one input map required");
  if (result_map.getFileDescriptions().size() != 2)
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "two file descriptions required");
  if (result_map.getFileDescriptions().begin()->second.filename != result_map.getFileDescriptions().rbegin()->second.filename)
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the two file descriptions have to contain the same file name");
  checkIds_(input_maps);

  // look up the light and heavy index
  Size light_index = numeric_limits<Size>::max();
  Size heavy_index = numeric_limits<Size>::max();
  for (ConsensusMap::FileDescriptions::const_iterator it = result_map.getFileDescriptions().begin(); it != result_map.getFileDescriptions().end(); ++it)
  {
    if (it->second.label == "heavy")
    {
      heavy_index = it->first;
    }
    else if (it->second.label == "light")
    {
      light_index = it->first;
    }
  }
  if (light_index == numeric_limits<Size>::max() || heavy_index == numeric_limits<Size>::max())
  {
    throw Exception::IllegalArgument(__FILE__, __LINE__, __PRETTY_FUNCTION__, "the input maps have to be labeled 'light' and 'heavy'");
  }

  result_map.clear(false);

  // sort consensus features by RT (and MZ) to speed up searching afterwards
  typedef ConstRefVector<ConsensusMap> RefMap;
  RefMap model_ref(input_maps[0].begin(), input_maps[0].end());
  model_ref.sortByPosition();

  // calculate matches
  ConsensusMap matches;

  // settings
  double rt_pair_dist = param_.getValue("rt_pair_dist");
  double rt_dev_low = param_.getValue("rt_dev_low");
  double rt_dev_high = param_.getValue("rt_dev_high");
  double mz_dev = param_.getValue("mz_dev");
  DoubleList mz_pair_dists = param_.getValue("mz_pair_dists");
  bool mrm = param_.getValue("mrm").toBool();

  // estimate RT parameters
  if (param_.getValue("rt_estimate") == "true")
  {
    // find all possible RT distances of features with the same charge and a good m/z distance
    vector<double> dists;
    dists.reserve(model_ref.size());
    for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
    {
      for (RefMap::const_iterator it2 = model_ref.begin(); it2 != model_ref.end(); ++it2)
      {
        for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
        {
          double mz_pair_dist = *dist_it;
          if (it2->getCharge() == it->getCharge()
             && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
             && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev)
          {
            dists.push_back(it2->getRT() - it->getRT());
          }
        }
      }
    }

    if (dists.empty())
    {
      cout << "Warning: Could not find pairs for RT distance estimation. The manual settings are used!" << endl;
    }
    else
    {
      if (dists.size() < 50)
      {
        cout << "Warning: Found only " << dists.size() << " pairs. The estimated shift and std deviation are probably not reliable!" << endl;
      }

      //--------------------------- estimate initial parameters of fit ---------------------------
      GaussFitter::GaussFitResult result(-1, -1, -1);
      // first estimate of the optimal shift: median of the distances
      sort(dists.begin(), dists.end());
      Size median_index = dists.size() / 2;
      result.x0 = dists[median_index];

      // create histogram of distances;
      // consider only up to max_pairs distances, centered around the optimal shift
      Size max_pairs = model_ref.size() / 2;
      Size start_index = (Size) max((SignedSize)0, (SignedSize)(median_index - max_pairs / 2));
      Size end_index = (Size) min((SignedSize)(dists.size() - 1), (SignedSize)(median_index + max_pairs / 2));
      double start_value = dists[start_index];
      double end_value = dists[end_index];
      double bin_step = fabs(end_value - start_value) / 99.999; // ensure that we have 100 bins
      Math::Histogram<> hist(start_value, end_value, bin_step);
      for (Size i = start_index; i <= end_index; ++i)
      {
        hist.inc(dists[i]);
      }
      dists.clear();

      // determine median of bins (uniform background distribution)
      vector<Size> bins(hist.begin(), hist.end());
      sort(bins.begin(), bins.end());
      Size bin_median = bins[bins.size() / 2];
      bins.clear();

      // estimate scale A: maximum of the histogram above the background
      Size max_value = hist.maxValue();
      result.A = max_value - bin_median;

      // overwrite estimate of x0 with the position of the highest bin
      for (Size i = 0; i < hist.size(); ++i)
      {
        if (hist[i] == max_value)
        {
          result.x0 = hist.centerOfBin(i);
          break;
        }
      }

      // estimate sigma: first position where the count drops to or below the median count of the histogram
      double pos = result.x0;
      while (pos > start_value && hist.binValue(pos) > bin_median)
      {
        pos -= bin_step;
      }
      double sigma_low = result.x0 - pos;
      pos = result.x0;
      while (pos < end_value && hist.binValue(pos) > bin_median)
      {
        pos += bin_step;
      }
      double sigma_high = pos - result.x0;
      result.sigma = (sigma_high + sigma_low) / 6.0;

      //--------------------------- do gauss fit ---------------------------
      vector<DPosition<2> > points(hist.size());
      for (Size i = 0; i < hist.size(); ++i)
      {
        points[i][0] = hist.centerOfBin(i);
        points[i][1] = hist[i]; // counts are unsigned, no clamping needed
      }
      GaussFitter fitter;
      fitter.setInitialParameters(result);
      result = fitter.fit(points);
      cout << "estimated optimal RT distance: " << result.x0 << endl;
      cout << "estimated allowed deviation: " << fabs(result.sigma) * 3.0 << endl;
      rt_pair_dist = result.x0;
      rt_dev_low = fabs(result.sigma) * 3.0;
      rt_dev_high = fabs(result.sigma) * 3.0;
    }
  }

  // check each feature for a partner at the expected RT and m/z distance
  for (RefMap::const_iterator it = model_ref.begin(); it != model_ref.end(); ++it)
  {
    for (DoubleList::const_iterator dist_it = mz_pair_dists.begin(); dist_it != mz_pair_dists.end(); ++dist_it)
    {
      double mz_pair_dist = *dist_it;
      RefMap::const_iterator it2 = lower_bound(model_ref.begin(), model_ref.end(), it->getRT() + rt_pair_dist - rt_dev_low, ConsensusFeature::RTLess());
      while (it2 != model_ref.end() && it2->getRT() <= it->getRT() + rt_pair_dist + rt_dev_high)
      {
        // in MRM mode, we need to compare the precursor mass difference and the fragment mass difference; the charge remains the same
        double prec_mz_diff(0);
        if (mrm)
        {
          prec_mz_diff = fabs((double)it2->getMetaValue("MZ") - (double)it->getMetaValue("MZ"));
          if (it->getCharge() != 0)
          {
            prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist / it->getCharge());
          }
          else
          {
            prec_mz_diff = fabs(prec_mz_diff - mz_pair_dist);
          }
        }

        bool mrm_correct_dist(false);
        double frag_mz_diff = fabs(it->getMZ() - it2->getMZ());
        if (mrm && it2->getCharge() == it->getCharge() && prec_mz_diff < mz_dev
           && (frag_mz_diff < mz_dev || fabs(frag_mz_diff - mz_pair_dist) < mz_dev))
        {
          mrm_correct_dist = true;
        }

        if ((mrm && mrm_correct_dist)
           || (!mrm && it2->getCharge() == it->getCharge()
              && it2->getMZ() >= it->getMZ() + mz_pair_dist / it->getCharge() - mz_dev
              && it2->getMZ() <= it->getMZ() + mz_pair_dist / it->getCharge() + mz_dev))
        {
          double score = sqrt(PValue_(it2->getMZ() - it->getMZ(), mz_pair_dist / it->getCharge(), mz_dev, mz_dev)
                            * PValue_(it2->getRT() - it->getRT(), rt_pair_dist, rt_dev_low, rt_dev_high));
          // Note: we used to copy the id from the light feature here, but that strategy does not generalize to more than two labels.
          // We might want to report consensus features where the light one is missing but more than one heavier variant was found.
          // Also, the old strategy is inconsistent with what was done in the unlabeled case. Thus we now assign a new unique id here.
          matches.push_back(ConsensusFeature());
          matches.back().setUniqueId();
          matches.back().insert(light_index, *it);
          matches.back().clearMetaInfo();
          matches.back().insert(heavy_index, *it2);
          matches.back().setQuality(score);
          matches.back().setCharge(it->getCharge());
          matches.back().computeMonoisotopicConsensus();
        }
        ++it2;
      }
    }
  }

  // compute best pairs:
  // - sort matches by quality
  // - take highest-quality matches first (greedy) and mark the involved features as used
  set<Size> used_features;
  matches.sortByQuality(true);
  for (ConsensusMap::const_iterator match = matches.begin(); match != matches.end(); ++match)
  {
    // check that neither feature is used yet
    if (used_features.find(match->begin()->getUniqueId()) == used_features.end()
       && used_features.find(match->rbegin()->getUniqueId()) == used_features.end())
    {
      // if unused, add the match to the final set of elements
      result_map.push_back(*match);
      used_features.insert(match->begin()->getUniqueId());
      used_features.insert(match->rbegin()->getUniqueId());
    }
  }

  // add protein identifications to the result map
  for (Size i = 0; i < input_maps.size(); ++i)
  {
    result_map.getProteinIdentifications().insert(result_map.getProteinIdentifications().end(), input_maps[i].getProteinIdentifications().begin(), input_maps[i].getProteinIdentifications().end());
  }

  // add unassigned peptide identifications to the result map
  for (Size i = 0; i < input_maps.size(); ++i)
  {
    result_map.getUnassignedPeptideIdentifications().insert(result_map.getUnassignedPeptideIdentifications().end(), input_maps[i].getUnassignedPeptideIdentifications().begin(), input_maps[i].getUnassignedPeptideIdentifications().end());
  }

  // sorting by m/z is very useful for checking the results, and the order has no real meaning anyway
  result_map.sortByMZ();
}
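// --- illustration only, not part of the original file -----------------------
// PValue_() is used above to score candidate pairs but is not shown in this
// excerpt. A minimal sketch under the assumption that it maps the deviation of
// a measurement from its expected value to [0, 1] via a Gaussian whose width
// is derived from the allowed deviations (interpreted as 3 sigma), with
// separate widths below and above the expected value:
double PValue_(double measurement, double expected, double dev_low, double dev_high)
{
  const double dev = (measurement < expected) ? dev_low : dev_high;
  if (dev <= 0.0)
  {
    return 0.0; // no tolerance means no support for any deviation
  }
  const double sigma = dev / 3.0;               // allowed deviation ~ 3 sigma
  const double z = (measurement - expected) / sigma;
  return exp(-0.5 * z * z);                     // 1 at the expected distance, falling off smoothly
}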