void EDTAFile::store(const String& filename, const FeatureMap& map) const
{
  TextFile tf;
  tf.addLine("RT\tm/z\tintensity\tcharge");
  for (Size i = 0; i < map.size(); ++i)
  {
    const Feature& f = map[i];
    tf.addLine(String(f.getRT()) + "\t" + f.getMZ() + "\t" + f.getIntensity() + "\t" + f.getCharge());
  }
  tf.store(filename);
}
void EDTAFile::store(const String& filename, const ConsensusMap& map) const
{
  TextFile tf;

  // search for the maximum number of sub-features (since this determines the number of columns)
  Size max_sub(0);
  for (Size i = 0; i < map.size(); ++i)
  {
    max_sub = std::max(max_sub, map[i].getFeatures().size());
  }

  // write header
  String header("RT\tm/z\tintensity\tcharge");
  for (Size i = 1; i <= max_sub; ++i)
  {
    header += "\tRT" + String(i) + "\tm/z" + String(i) + "\tintensity" + String(i) + "\tcharge" + String(i);
  }
  tf.addLine(header);

  for (Size i = 0; i < map.size(); ++i)
  {
    const ConsensusFeature& f = map[i];
    // consensus
    String entry = String(f.getRT()) + "\t" + f.getMZ() + "\t" + f.getIntensity() + "\t" + f.getCharge();
    // sub-features
    const ConsensusFeature::HandleSetType& handle = f.getFeatures();
    for (ConsensusFeature::HandleSetType::const_iterator it = handle.begin(); it != handle.end(); ++it)
    {
      entry += String("\t") + it->getRT() + "\t" + it->getMZ() + "\t" + it->getIntensity() + "\t" + it->getCharge();
    }
    // pad missing sub-features with NA
    for (Size j = handle.size(); j < max_sub; ++j)
    {
      entry += "\tNA\tNA\tNA\tNA";
    }
    tf.addLine(entry);
  }
  tf.store(filename);
}
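// Usage sketch (not part of the original source; file names are assumptions):
// exporting feature data through the two store() overloads above. The
// ConsensusMap variant writes four extra columns per sub-feature, NA-padded.
#include <OpenMS/FORMAT/ConsensusXMLFile.h>
#include <OpenMS/FORMAT/EDTAFile.h>
#include <OpenMS/FORMAT/FeatureXMLFile.h>

void exportToEDTA_()
{
  using namespace OpenMS;

  FeatureMap fm;
  FeatureXMLFile().load("features.featureXML", fm);
  EDTAFile edta;
  edta.store("features.edta", fm); // 4 columns: RT, m/z, intensity, charge

  ConsensusMap cm;
  ConsensusXMLFile().load("features.consensusXML", cm);
  edta.store("features_consensus.edta", cm); // consensus columns + per-sub-feature columns
}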
void writeTermTree_(const String& accession, const ControlledVocabulary& cv, TextFile& file, UInt indent)
{
  const ControlledVocabulary::CVTerm& term = cv.getTerm(accession);
  for (set<String>::const_iterator it = term.children.begin(); it != term.children.end(); ++it)
  {
    const ControlledVocabulary::CVTerm& child_term = cv.getTerm(*it);
    String subterm_line;
    for (Size i = 0; i < 4 * indent; ++i) subterm_line += " ";
    String description = child_term.description;
    if (child_term.synonyms.size() != 0)
    {
      description += String(" -- Synonyms: '") + ListUtils::concatenate(child_term.synonyms, ", ") + "'";
    }
    subterm_line += "- <span title=\"" + description + "\">" + child_term.id + " ! " + child_term.name + "</span>";
    StringList tags;
    if (child_term.obsolete)
    {
      tags.push_back("<font color=darkred>obsolete</font>");
    }
    if (child_term.xref_type != ControlledVocabulary::CVTerm::NONE)
    {
      tags.push_back("value-type=" + ControlledVocabulary::CVTerm::getXRefTypeName(child_term.xref_type));
    }
    if (child_term.units.size() > 0)
    {
      StringList units;
      for (set<String>::const_iterator u_it = child_term.units.begin(); u_it != child_term.units.end(); ++u_it)
      {
        units.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
      }
      tags.push_back(String("units=") + ListUtils::concatenate(units, ","));
    }
    if (child_term.xref_binary.size() > 0)
    {
      StringList types;
      for (StringList::const_iterator u_it = child_term.xref_binary.begin(); u_it != child_term.xref_binary.end(); ++u_it)
      {
        types.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
      }
      tags.push_back(String("binary-array-types=") + ListUtils::concatenate(types, ","));
    }
    if (tags.size() != 0)
    {
      subterm_line += String("<FONT color=\"grey\"> (") + ListUtils::concatenate(tags, ", ") + ")</FONT>";
    }
    file.addLine(subterm_line + "<BR>");
    // recurse into the grandchildren, one indent level deeper
    writeTermTree_(child_term.id, cv, file, indent + 1);
  }
}
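// Hypothetical driver (not part of the original source; assumes writeTermTree_
// is visible in this translation unit and that a PSI-MS OBO file is at hand):
// dump the HTML subtree of one term, e.g. 'instrument model' (MS:1000031).
// Each child is emitted as a line of the form
//   - <span title="...">MS:xxxxxxx ! name</span><BR>
// optionally followed by grey-parenthesized tags.
void dumpInstrumentModelTree_()
{
  ControlledVocabulary cv;
  cv.loadFromOBO("MS", "psi-ms.obo"); // assumed path to the PSI-MS OBO file
  TextFile html;
  writeTermTree_("MS:1000031", cv, html, 1);
  html.store("terms.html");
}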
ExitCodes main_(int, const char**)
{
  //-------------------------------------------------------------
  // parameter handling
  //-------------------------------------------------------------
  StringList in = getStringList_("in");
  String edta = getStringOption_("pos");
  String out = getStringOption_("out");
  String out_sep = getStringOption_("out_separator");
  String out_TIC_debug = getStringOption_("auto_rt:out_debug_TIC");
  StringList in_header = getStringList_("in_header");

  // number of out_debug_TIC files and input files must be identical
  /*if (out_TIC_debug.size() > 0 && in.size() != out_TIC_debug.size())
  {
    LOG_FATAL_ERROR << "Error: number of input file 'in' and auto_rt:out_debug_TIC files must be identical!" << std::endl;
    return ILLEGAL_PARAMETERS;
  }*/

  // number of header files and input files must be identical
  if (in_header.size() > 0 && in.size() != in_header.size())
  {
    LOG_FATAL_ERROR << "Error: number of input file 'in' and 'in_header' files must be identical!" << std::endl;
    return ILLEGAL_PARAMETERS;
  }

  if (!getFlag_("auto_rt:enabled") && !out_TIC_debug.empty())
  {
    LOG_FATAL_ERROR << "Error: TIC output file requested, but auto_rt is not enabled! Either do not request the file or switch on 'auto_rt:enabled'." << std::endl;
    return ILLEGAL_PARAMETERS;
  }

  double rttol = getDoubleOption_("rt_tol");
  double mztol = getDoubleOption_("mz_tol");
  Size rt_collect = getIntOption_("rt_collect");

  //-------------------------------------------------------------
  // loading input
  //-------------------------------------------------------------
  MzMLFile mzml_file;
  mzml_file.setLogType(log_type_);
  MSExperiment<Peak1D> exp;

  EDTAFile ed;
  ConsensusMap cm;
  ed.load(edta, cm);

  StringList tf_single_header0, tf_single_header1, tf_single_header2; // header content, for each column

  std::vector<String> vec_single; // one line for each compound, multiple columns per experiment
  vec_single.resize(cm.size());

  for (Size fi = 0; fi < in.size(); ++fi)
  {
    // load raw data
    mzml_file.load(in[fi], exp);
    exp.sortSpectra(true);

    if (exp.empty())
    {
      LOG_WARN << "The given file does not contain any conventional peak data, but might"
                  " contain chromatograms. This tool currently cannot handle them, sorry." << std::endl;
      return INCOMPATIBLE_INPUT_DATA;
    }

    // try to detect RT peaks (only for the first input file -- all others should align!)
    // cm.size() might change in here...
    if (getFlag_("auto_rt:enabled") && fi == 0)
    {
      ConsensusMap cm_local = cm; // we might have different RT peaks for each map if 'auto_rt' is enabled
      cm.clear(false); // reset global list (about to be filled)

      // compute TIC
      MSChromatogram<> tic = exp.getTIC();
      MSSpectrum<> tics, tic_gf, tics_pp, tics_sn;
      for (Size ic = 0; ic < tic.size(); ++ic)
      {
        // rewrite chromatogram to MSSpectrum (GaussFilter requires it);
        // the m/z dimension is abused to hold the RT values
        Peak1D peak;
        peak.setMZ(tic[ic].getRT());
        peak.setIntensity(tic[ic].getIntensity());
        tics.push_back(peak);
      }

      // smooth (no PP_CWT here due to efficiency reasons -- large FWHM take longer!)
      double fwhm = getDoubleOption_("auto_rt:FHWM");
      GaussFilter gf;
      Param p = gf.getParameters();
      p.setValue("gaussian_width", fwhm * 2); // wider than FWHM, just to be sure we have a fully smoothed peak. Merging two peaks is unlikely
      p.setValue("use_ppm_tolerance", "false");
      gf.setParameters(p);
      tic_gf = tics;
      gf.filter(tic_gf);

      // pick peaks
      PeakPickerHiRes pp;
      p = pp.getParameters();
      p.setValue("signal_to_noise", getDoubleOption_("auto_rt:SNThreshold"));
      pp.setParameters(p);
      pp.pick(tic_gf, tics_pp);

      if (tics_pp.size())
      {
        LOG_INFO << "Found " << tics_pp.size() << " auto-rt peaks at: ";
        for (Size ipp = 0; ipp != tics_pp.size(); ++ipp) LOG_INFO << " " << tics_pp[ipp].getMZ();
      }
      else
      {
        LOG_INFO << "Found no auto-rt peaks. Change threshold parameters!";
      }
      LOG_INFO << std::endl;

      if (!out_TIC_debug.empty()) // if debug file was given
      {
        // store intermediate steps for debug
        MSExperiment<> out_debug;
        out_debug.addChromatogram(toChromatogram(tics));
        out_debug.addChromatogram(toChromatogram(tic_gf));

        SignalToNoiseEstimatorMedian<MSSpectrum<> > snt;
        snt.init(tics);
        for (Size is = 0; is < tics.size(); ++is)
        {
          Peak1D peak;
          peak.setMZ(tics[is].getMZ()); // RT is stored in the m/z dimension (see TIC conversion above)
          peak.setIntensity(snt.getSignalToNoise(tics[is]));
          tics_sn.push_back(peak);
        }
        out_debug.addChromatogram(toChromatogram(tics_sn));
        out_debug.addChromatogram(toChromatogram(tics_pp));

        // get rid of "native-id" missing warning
        for (Size id = 0; id < out_debug.size(); ++id) out_debug[id].setNativeID(String("spectrum=") + id);

        mzml_file.store(out_TIC_debug, out_debug);
        LOG_DEBUG << "Storing debug AUTO-RT: " << out_TIC_debug << std::endl;
      }

      // add target EICs: for each m/z with no/negative RT, add all combinations of that m/z with auto-RTs;
      // duplicate m/z entries will be ignored!
      // all other lines with positive RT values are copied unaffected
      std::set<double> mz_doubles; // do not allow duplicate m/z values
      for (ConsensusMap::Iterator cit = cm_local.begin(); cit != cm_local.end(); ++cit)
      {
        if (cit->getRT() < 0)
        {
          if (mz_doubles.find(cit->getMZ()) == mz_doubles.end())
          {
            mz_doubles.insert(cit->getMZ());
          }
          else
          {
            LOG_INFO << "Found duplicate m/z entry (" << cit->getMZ() << ") for auto-rt. Skipping ..." << std::endl;
            continue;
          }

          for (MSSpectrum<>::ConstIterator itp = tics_pp.begin(); itp != tics_pp.end(); ++itp)
          {
            ConsensusFeature f = *cit;
            f.setRT(itp->getMZ()); // picked 'm/z' values are actually RTs (see TIC conversion above)
            cm.push_back(f);
          }
        }
        else
        {
          // default feature with no auto-rt
          LOG_INFO << "copying feature with RT " << cit->getRT() << std::endl;
          cm.push_back(*cit);
        }
      }

      // resize, since we have more positions now
      vec_single.resize(cm.size());
    }

    // search for each EIC and add up
    Int not_found(0);

    String description;
    if (fi < in_header.size())
    {
      HeaderInfo info(in_header[fi]);
      description = info.header_description;
    }

    if (fi == 0)
    {
      // two additional columns for the first file (theoretical RT and m/z)
      tf_single_header0 << "" << "";
      tf_single_header1 << "" << "";
      tf_single_header2 << "RT" << "mz";
    }

    // 5 entries for each input file
    tf_single_header0 << File::basename(in[fi]) << "" << "" << "" << "";
    tf_single_header1 << description << "" << "" << "" << "";
    tf_single_header2 << "RTobs" << "dRT" << "mzobs" << "dppm" << "intensity";

    for (Size i = 0; i < cm.size(); ++i)
    {
      //std::cerr << "Rt" << cm[i].getRT() << " mz: " << cm[i].getMZ() << " R " << cm[i].getMetaValue("rank") << "\n";
      double mz_da = mztol * cm[i].getMZ() / 1e6; // mz tolerance in Dalton
      MSExperiment<>::ConstAreaIterator it = exp.areaBeginConst(cm[i].getRT() - rttol / 2,
                                                                cm[i].getRT() + rttol / 2,
                                                                cm[i].getMZ() - mz_da,
                                                                cm[i].getMZ() + mz_da);
      Peak2D max_peak;
      max_peak.setIntensity(0);
      max_peak.setRT(cm[i].getRT());
      max_peak.setMZ(cm[i].getMZ());
      for (; it != exp.areaEndConst(); ++it)
      {
        if (max_peak.getIntensity() < it->getIntensity())
        {
          max_peak.setIntensity(it->getIntensity());
          max_peak.setRT(it.getRT());
          max_peak.setMZ(it->getMZ());
        }
      }
      double ppm = 0; // observed m/z offset

      if (max_peak.getIntensity() == 0)
      {
        ++not_found;
      }
      else
      {
        // take median for m/z found
        std::vector<double> mz;
        MSExperiment<>::Iterator itm = exp.RTBegin(max_peak.getRT());
        SignedSize low = std::min<SignedSize>(std::distance(exp.begin(), itm), rt_collect);
        SignedSize high = std::min<SignedSize>(std::distance(itm, exp.end()) - 1, rt_collect);
        MSExperiment<>::AreaIterator itt = exp.areaBegin((itm - low)->getRT() - 0.01,
                                                         (itm + high)->getRT() + 0.01,
                                                         cm[i].getMZ() - mz_da,
                                                         cm[i].getMZ() + mz_da);
        for (; itt != exp.areaEnd(); ++itt)
        {
          mz.push_back(itt->getMZ());
          //std::cerr << "ppm: " << itt.getRT() << " " << itt->getMZ() << " " << itt->getIntensity() << std::endl;
        }

        if ((SignedSize)mz.size() > (low + high + 1)) LOG_WARN << "Compound " << i << " has overlapping peaks [" << mz.size() << "/" << low + high + 1 << "]" << std::endl;

        if (!mz.empty())
        {
          double avg_mz = std::accumulate(mz.begin(), mz.end(), 0.0) / double(mz.size());
          //std::cerr << "avg: " << avg_mz << "\n";
          ppm = (avg_mz - cm[i].getMZ()) / cm[i].getMZ() * 1e6;
        }
      }

      // appending the second column set requires a separator
      String append_sep = (fi == 0 ? "" : out_sep);

      vec_single[i] += append_sep; // new line
      if (fi == 0)
      {
        vec_single[i] += String(cm[i].getRT()) + out_sep + String(cm[i].getMZ()) + out_sep;
      }
      vec_single[i] += String(max_peak.getRT()) + out_sep
                       + String(max_peak.getRT() - cm[i].getRT()) + out_sep
                       + String(max_peak.getMZ()) + out_sep
                       + String(ppm) + out_sep
                       + String(max_peak.getIntensity());
    }

    if (not_found) LOG_INFO << "Missing peaks for " << not_found << " compounds in file '" << in[fi] << "'.\n";
  }

  //-------------------------------------------------------------
  // create header
  //-------------------------------------------------------------
  vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header2, out_sep));
  vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header1, out_sep));
  vec_single.insert(vec_single.begin(), ListUtils::concatenate(tf_single_header0, out_sep));

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  TextFile tf;
  for (std::vector<String>::iterator v_it = vec_single.begin(); v_it != vec_single.end(); ++v_it)
  {
    tf.addLine(*v_it);
  }
  tf.store(out);

  return EXECUTION_OK;
}
ExitCodes main_(int, const char**) override
{
  // load data
  FeatureMap features_in, features_truth;
  FeatureXMLFile().load(getStringOption_("in"), features_in);
  features_in.sortByPosition();
  FeatureXMLFile().load(getStringOption_("truth"), features_truth);
  features_truth.sortByPosition();
  FeatureMap abort_reasons;
  if (getStringOption_("abort_reasons") != "")
  {
    FeatureXMLFile().load(getStringOption_("abort_reasons"), abort_reasons);
  }
  double mz_tol = getDoubleOption_("mz_tol");
  writeDebug_(String("Final MZ tolerance: ") + mz_tol, 1);

  // determine average RT tolerance:
  // median feature RT span times given factor
  vector<double> rt_spans;
  for (Size t = 0; t < features_in.size(); ++t)
  {
    if (features_in[t].getConvexHulls().size() != 0)
    {
      rt_spans.push_back(features_in[t].getConvexHull().getBoundingBox().width());
    }
  }
  // feature convex hulls are available => relative RT span
  double rt_tol = getDoubleOption_("rt_tol_abs");
  if (rt_tol < 0.0)
  {
    if (!rt_spans.empty())
    {
      sort(rt_spans.begin(), rt_spans.end());
      rt_tol = getDoubleOption_("rt_tol") * rt_spans[rt_spans.size() / 2];
    }
    else if (features_in.empty())
    {
      // do nothing; rt_tol does not really matter, as we will not find a match anyway,
      // but we want to have the stats at the end, so we do not abort
    }
    else
    {
      writeLog_("Error: Input features do not have convex hulls. You have to set 'rt_tol_abs'!");
      return ILLEGAL_PARAMETERS;
    }
  }
  writeDebug_(String("Final RT tolerance: ") + rt_tol, 1);

  // general statistics
  std::vector<double> ints_t;
  std::vector<double> ints_i;
  std::vector<double> ints_found;
  std::vector<double> ints_missed;
  Map<String, UInt> abort_strings;

  for (Size m = 0; m < features_truth.size(); ++m)
  {
    Feature& f_t = features_truth[m];
    UInt match_count = 0;
    bool correct_charge = false;
    bool exact_centroid_match = false;
    Size last_match_index = features_in.size() + 1;
    for (Size a = 0; a < features_in.size(); ++a)
    {
      const Feature& f_i = features_in[a];
      // RT match
      if (fabs(f_i.getRT() - f_t.getRT()) < rt_tol)
      {
        double charge_mz_tol = mz_tol / f_t.getCharge();
        // exact m/z match
        if (fabs(f_i.getMZ() - f_t.getMZ()) < charge_mz_tol)
        {
          ++match_count;
          exact_centroid_match = true;
          if (f_i.getCharge() == f_t.getCharge()) correct_charge = true;
          last_match_index = a;
        }
        // centroid is one trace off, but still contained in the convex hull
        else if (f_i.getConvexHull().getBoundingBox().encloses(f_t.getPosition())
                 && (fabs(f_i.getMZ() + 1.0 / f_t.getCharge() - f_t.getMZ()) < charge_mz_tol
                     || fabs(f_i.getMZ() - 1.0 / f_t.getCharge() - f_t.getMZ()) < charge_mz_tol))
        {
          ++match_count;
          last_match_index = a;
          if (f_i.getCharge() == f_t.getCharge()) correct_charge = true;
        }
      }
    }

    f_t.setMetaValue("matches", match_count);
    if (match_count == 1)
    {
      // flag matched feature with additional information
      if (correct_charge)
      {
        f_t.setMetaValue("correct_charge", String("true"));
        f_t.setMetaValue("intensity_ratio", features_in[last_match_index].getIntensity() / f_t.getIntensity());
        features_in[last_match_index].setMetaValue("correct_hit", "true"); // flag the feature for ROC curve
      }
      else
      {
        f_t.setMetaValue("correct_charge", String("false"));
      }
      if (exact_centroid_match)
      {
        f_t.setMetaValue("exact_centroid_match", String("true"));
      }
      else
      {
        f_t.setMetaValue("exact_centroid_match", String("false"));
      }
    }

    // evaluation of correct features only
    if (match_count == 1 && correct_charge)
    {
      ints_t.push_back(f_t.getIntensity());
      ints_i.push_back(features_in[last_match_index].getIntensity());
      ints_found.push_back(f_t.getIntensity());
    }
    else
    {
      ints_missed.push_back(f_t.getIntensity());

      // look up the abort reason of the nearest seed
      double best_score_ab = 0;
      String reason = "";
      for (Size b = 0; b < abort_reasons.size(); ++b)
      {
        const Feature& f_ab = abort_reasons[b];
        if (fabs(f_ab.getRT() - f_t.getRT()) <= rt_tol && fabs(f_ab.getMZ() - f_t.getMZ()) <= mz_tol)
        {
          double score = (1.0 - fabs(f_ab.getMZ() - f_t.getMZ()) / mz_tol) * (1.0 - fabs(f_ab.getRT() - f_t.getRT()) / rt_tol);
          if (score > best_score_ab)
          {
            best_score_ab = score;
            reason = f_ab.getMetaValue("abort_reason");
          }
        }
      }
      if (reason == "")
      {
        reason = "No seed found";
      }
      if (abort_strings.has(reason))
      {
        abort_strings[reason]++;
      }
      else
      {
        abort_strings[reason] = 1;
      }
    }
  }

  //------------------------ general statistics ------------------------
  cout << endl;
  cout << "general information:" << endl;
  cout << "====================" << endl;
  cout << "input features: " << features_in.size() << endl;
  cout << "truth features: " << features_truth.size() << endl;

  //------------------------ matches ------------------------
  cout << endl;
  cout << "feature matching statistics:" << endl;
  cout << "============================" << endl;
  Size no_match = count(features_truth, "matches", "0");
  cout << "no match: " << no_match << percentage(no_match, features_truth.size()) << endl;
  Size one_match = count(features_truth, "matches", "1");
  cout << "one match: " << one_match << percentage(one_match, features_truth.size()) << endl;
  Size charge_match = count(features_truth, "correct_charge", "true");
  cout << " - correct charge: " << charge_match << percentage(charge_match, features_truth.size()) << endl;
  Size centroid_match = count(features_truth, "exact_centroid_match", "true");
  cout << " - exact centroid match: " << centroid_match << percentage(centroid_match, features_truth.size()) << endl;
  Size multi_match = features_truth.size() - count(features_truth, "matches", "0") - count(features_truth, "matches", "1");
  cout << "multiple matches: " << multi_match << percentage(multi_match, features_truth.size()) << endl;
  Size incorrect_match = multi_match + one_match - charge_match;
  cout << "incorrect matches: " << incorrect_match << percentage(incorrect_match, features_truth.size()) << endl;
  if (abort_reasons.size())
  {
    cout << "reasons for unmatched features:" << endl;
    for (Map<String, UInt>::iterator it = abort_strings.begin(); it != abort_strings.end(); ++it)
    {
      cout << " - " << String(it->second).fillLeft(' ', 4) << ": " << it->first << endl;
    }
  }

  //------------------------ intensity ------------------------
  cout << endl;
  cout << "intensity statistics:" << endl;
  cout << "=====================" << endl;
  if (ints_i.empty())
  {
    cout << "correlation of found features: nan" << endl;
  }
  else
  {
    cout << "correlation of found features: " << pearsonCorrelationCoefficient(ints_i.begin(), ints_i.end(), ints_t.begin(), ints_t.end()) << endl;
  }
  if (ints_found.empty())
  {
    cout << "intensity distribution of found: 0.0 0.0 0.0 0.0 0.0" << endl;
  }
  else
  {
    cout << "intensity distribution of found: " << fiveNumbers(ints_found, 1) << endl;
  }
  if (ints_missed.empty())
  {
    cout << "intensity distribution of missed: 0.0 0.0 0.0 0.0 0.0" << endl;
  }
  else
  {
    cout << "intensity distribution of missed: " << fiveNumbers(ints_missed, 1) << endl;
  }

  //------------------------ charges ------------------------
  cout << endl;
  cout << "charge matches statistics:" << endl;
  cout << "===========================" << endl;
  Map<UInt, UInt> present_charges, found_charges;
  for (Size i = 0; i < features_truth.size(); ++i)
  {
    UInt charge = features_truth[i].getCharge();
    present_charges[charge]++;
    if (features_truth[i].getMetaValue("correct_charge").toString() == "true")
    {
      found_charges[charge]++;
    }
  }
  for (Map<UInt, UInt>::const_iterator it = present_charges.begin(); it != present_charges.end(); ++it)
  {
    cout << "charge " << it->first << ": " << found_charges[it->first] << "/" << it->second << percentage(found_charges[it->first], it->second) << endl;
  }

  // write output
  if (getStringOption_("out") != "")
  {
    FeatureXMLFile().store(getStringOption_("out"), features_truth);
  }

  // ROC curve
  if (getStringOption_("out_roc") != "")
  {
    TextFile tf;
    tf.addLine("false\tcorrect\tFDR\tTPR");

    features_in.sortByIntensity(true);
    UInt f_correct = 0;
    UInt f_false = 0;
    double found = features_in.size();
    double correct = features_truth.size();
    for (Size i = 0; i < features_in.size(); ++i)
    {
      if (features_in[i].metaValueExists("correct_hit"))
      {
        ++f_correct;
      }
      else
      {
        ++f_false;
      }
      tf.addLine(String(f_false) + "\t" + f_correct + "\t" + String::number(f_false / found, 3) + "\t" + String::number(f_correct / correct, 3));
    }
    tf.store(getStringOption_("out_roc"));
  }

  return EXECUTION_OK;
}
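#include <cmath>

// Sketch of the m/z matching rule above (helper name is mine, not from the
// tool): the tolerance is scaled by charge, and a centroid sitting exactly one
// isotope trace (1/z) away also counts as a match (in the tool, that branch
// additionally requires the truth position to lie inside the feature's convex
// hull). For charge 2 and mz_tol 0.25: observed 500.50 vs. truth 500.00
// matches (one trace off), while observed 500.30 does not.
bool traceMatch_(double observed, double truth, int z, double mz_tol)
{
  double tol = mz_tol / z; // charge-scaled tolerance
  return std::fabs(observed - truth) < tol
      || std::fabs(observed + 1.0 / z - truth) < tol
      || std::fabs(observed - 1.0 / z - truth) < tol;
}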
ExitCodes main_(int, const char**)
{
  //----------------------------------------------------------------
  // load data
  //----------------------------------------------------------------
  StringList in_list = getStringList_("in");
  String out = getStringOption_("out");
  String out_csv = getStringOption_("out_csv");
  String format = getStringOption_("out_type");

  if (out.empty() && out_csv.empty())
  {
    LOG_ERROR << "Neither 'out' nor 'out_csv' were provided. Please assign at least one of them." << std::endl;
    return ILLEGAL_PARAMETERS;
  }

  if (!out.empty() && format == "") // get format from filename
  {
    try
    {
      format = out.suffix('.');
    }
    catch (Exception::ElementNotFound& /*e*/)
    {
      format = "nosuffix";
    }
    // check if format is valid:
    if (!ListUtils::contains(out_formats_, format.toLower()))
    {
      LOG_ERROR << "No explicit image output format was provided via 'out_type', and the suffix ('" << format << "') does not resemble a valid type. Please fix one of them." << std::endl;
      return ILLEGAL_PARAMETERS;
    }
  }

  double q_min = getDoubleOption_("q_min");
  double q_max = getDoubleOption_("q_max");
  if (q_min >= q_max)
  {
    LOG_ERROR << "The parameter 'q_min' must be smaller than 'q_max'. Quitting..." << std::endl;
    return ILLEGAL_PARAMETERS;
  }

  IDEvaluationBase* mw = new IDEvaluationBase();
  Param alg_param = mw->getParameters();
  alg_param.insert("", getParam_().copy("algorithm:", true));
  mw->setParameters(alg_param);

  if (!mw->loadFiles(in_list))
  {
    LOG_ERROR << "Tool failed. See above." << std::endl;
    return INCOMPATIBLE_INPUT_DATA;
  }
  mw->setVisibleArea(q_min, q_max);

  if (!out.empty()) // save as image and exit
  {
    String error;
    bool r = mw->exportAsImage(out.toQString(), error, format.toQString());
    if (r)
    {
      return EXECUTION_OK;
    }
    else
    {
      LOG_ERROR << error << std::endl;
      return ILLEGAL_PARAMETERS;
    }
  }

  if (!out_csv.empty())
  {
    TextFile tf;
    for (Size i = 0; i < mw->getPoints().size(); ++i)
    {
      MSSpectrum s = mw->getPoints()[i];
      StringList sl1; // m/z values
      StringList sl2; // intensities
      for (Size j = 0; j < s.size(); ++j)
      {
        sl1.push_back(s[j].getMZ());
        sl2.push_back(s[j].getIntensity());
      }
      tf.addLine(String("# ") + String(s.getMetaValue("search_engine")));
      tf.addLine(ListUtils::concatenate(sl1, ","));
      tf.addLine(ListUtils::concatenate(sl2, ","));
    }
    tf.store(out_csv);
  }

  delete mw;
  return EXECUTION_OK;
}
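// Illustration of the 'out_csv' layout (numbers are made up, not from the
// source): each loaded input contributes three lines -- a search-engine
// marker, the points' comma-separated m/z values, and their intensities:
//
//   # OMSSA
//   0.001,0.01,0.05
//   520,1033,1205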
void IBSpectraFile::store(const String& filename, const ConsensusMap& cm)
{
  // typedefs for shorter code
  typedef std::vector<ProteinHit>::iterator ProtHitIt;

  // general settings .. do we need to expose these?
  // ----------------------------------------------------------------------
  /// Allow also non-unique peptides to be exported
  bool allow_non_unique = true;
  /// Intensities below this value will be set to 0.0 to avoid numerical problems when quantifying
  double intensity_threshold = 0.00001;
  // ----------------------------------------------------------------------

  // guess experiment type
  boost::shared_ptr<IsobaricQuantitationMethod> quantMethod = guessExperimentType_(cm);

  // we need the protein identifications to reference the protein names
  ProteinIdentification protIdent;
  bool has_proteinIdentifications = false;
  if (cm.getProteinIdentifications().size() > 0)
  {
    protIdent = cm.getProteinIdentifications()[0];
    has_proteinIdentifications = true;
  }

  // start the file by adding the tsv header
  TextFile textFile;
  textFile.addLine(ListUtils::concatenate(constructHeader_(*quantMethod), "\t"));

  for (ConsensusMap::ConstIterator cm_iter = cm.begin(); cm_iter != cm.end(); ++cm_iter)
  {
    const ConsensusFeature& cFeature = *cm_iter;
    std::vector<IdCSV> entries;

    // 1st: we extract the identification information from the consensus feature
    if (cFeature.getPeptideIdentifications().size() == 0 || !has_proteinIdentifications)
    {
      // we store unidentified hits anyway, because the iTRAQ quant is still helpful for normalization
      entries.push_back(IdCSV());
    }
    else
    {
      // protein name:
      const PeptideHit& peptide_hit = cFeature.getPeptideIdentifications()[0].getHits()[0];
      std::set<String> protein_accessions = peptide_hit.extractProteinAccessions();
      if (protein_accessions.size() != 1)
      {
        if (!allow_non_unique) continue; // we only want unique peptides
      }

      for (std::set<String>::const_iterator prot_ac = protein_accessions.begin(); prot_ac != protein_accessions.end(); ++prot_ac)
      {
        IdCSV entry;
        entry.charge = peptide_hit.getCharge();
        entry.peptide = peptide_hit.getSequence().toUnmodifiedString();
        entry.theo_mass = peptide_hit.getSequence().getMonoWeight(Residue::Full, peptide_hit.getCharge());

        // write modifications
        entry.modif = getModifString_(peptide_hit.getSequence());

        ProtHitIt proteinHit = protIdent.findHit(*prot_ac);
        if (proteinHit == protIdent.getHits().end())
        {
          std::cerr << "Protein referenced in peptide not found...\n";
          continue; // protein not found
        }

        entry.accession = proteinHit->getAccession();
        entries.push_back(entry);
      }
    }

    // 2nd: we add the quantitative information of the channels

    // .. skip features with 0 intensity
    if (cFeature.getIntensity() == 0)
    {
      continue;
    }

    for (std::vector<IdCSV>::iterator entry = entries.begin(); entry != entries.end(); ++entry)
    {
      // set parent intensity
      entry->parent_intens = cFeature.getIntensity();
      entry->retention_time = cFeature.getRT();
      entry->spectrum = cFeature.getUniqueId();
      entry->exp_mass = cFeature.getMZ();

      // create output line
      StringList currentLine;

      // add entry to currentLine
      entry->toStringList(currentLine);

      // extract channel intensities and positions
      std::map<Int, double> intensityMap;
      ConsensusFeature::HandleSetType features = cFeature.getFeatures();

      for (ConsensusFeature::HandleSetType::const_iterator fIt = features.begin(); fIt != features.end(); ++fIt)
      {
        intensityMap[Int(fIt->getMZ())] = (fIt->getIntensity() > intensity_threshold ? fIt->getIntensity() : 0.0);
      }
      for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
           it != quantMethod->getChannelInformation().end();
           ++it)
      {
        currentLine.push_back(String(it->center));
      }
      for (IsobaricQuantitationMethod::IsobaricChannelList::const_iterator it = quantMethod->getChannelInformation().begin();
           it != quantMethod->getChannelInformation().end();
           ++it)
      {
        currentLine.push_back(String(intensityMap[int(it->center)]));
      }

      textFile.addLine(ListUtils::concatenate(currentLine, "\t"));
    }
  }

  // write to file
  textFile.store(filename);
}
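// Usage sketch (not part of the original source; file names are assumptions):
// export an isobaric (e.g. iTRAQ) ConsensusMap as an ibspectra-style TSV.
// Note that guessExperimentType_() must be able to derive a supported
// quantitation method from the map's channel structure.
#include <OpenMS/FORMAT/ConsensusXMLFile.h>
#include <OpenMS/FORMAT/IBSpectraFile.h>

void exportIBSpectra_()
{
  using namespace OpenMS;
  ConsensusMap cm;
  ConsensusXMLFile().load("itraq.consensusXML", cm);
  IBSpectraFile().store("itraq.ibspectra.csv", cm);
}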
ExitCodes main_(int, const char**)
{
  StringList cv_files = getStringList_("cv_files");
  StringList cv_names = getStringList_("cv_names");
  if (cv_files.size() != cv_names.size())
  {
    cerr << "Error: You have to specify an identifier for each CV file. Aborting!" << endl;
    return ILLEGAL_PARAMETERS;
  }

  // load cv terms
  ControlledVocabulary cv;
  for (Size i = 0; i < cv_files.size(); ++i)
  {
    cv.loadFromOBO(cv_names[i], cv_files[i]);
  }
  Map<String, ControlledVocabulary::CVTerm> terms = cv.getTerms();

  // load mappings from mapping file
  String mapping_file = getStringOption_("mapping_file");
  CVMappings mappings;
  CVMappingFile().load(mapping_file, mappings);

  // store HTML version of mapping and CV
  if (getStringOption_("html") != "")
  {
    TextFile file;
    file.addLine("<HTML>");
    file.addLine("  <HEAD>");
    file.addLine("    <TITLE>CV mapping file</TITLE>");
    file.addLine("    <SCRIPT language=javascript type='text/javascript'>");
    file.addLine("      function toggleDiv(layer_ref,force_state)");
    file.addLine("      {");
    file.addLine("        if (document.getElementById(layer_ref).style.display=='none' || force_state=='true')");
    file.addLine("        {");
    file.addLine("          document.getElementById(layer_ref).style.display = 'block';");
    file.addLine("        }");
    file.addLine("        else if (document.getElementById(layer_ref).style.display=='block' || force_state=='false')");
    file.addLine("        {");
    file.addLine("          document.getElementById(layer_ref).style.display = 'none';");
    file.addLine("        }");
    file.addLine("      }");
    file.addLine("    </SCRIPT>");
    file.addLine("  </HEAD>");
    file.addLine("  <BODY>");

    // count the number of terms and add buttons to expand/collapse all terms
    Int term_count = 0;
    for (vector<CVMappingRule>::const_iterator it = mappings.getMappingRules().begin(); it != mappings.getMappingRules().end(); ++it)
    {
      for (vector<CVMappingTerm>::const_iterator tit = it->getCVTerms().begin(); tit != it->getCVTerms().end(); ++tit)
      {
        ++term_count;
      }
    }
    String expand_all = "    <a href=\"javascript:toggleDiv('div0','true')";
    String collapse_all = "    <a href=\"javascript:toggleDiv('div0','false')";
    for (Int i = 1; i < term_count; ++i)
    {
      expand_all += String(";toggleDiv('div") + i + "','true')";
      collapse_all += String(";toggleDiv('div") + i + "','false')";
    }
    file.addLine(expand_all + "\">Expand all</a><BR>");
    file.addLine(collapse_all + "\">Collapse all</a>");
    file.addLine("    <TABLE width=100% border=0>");

    term_count = -1;
    for (vector<CVMappingRule>::const_iterator it = mappings.getMappingRules().begin(); it != mappings.getMappingRules().end(); ++it)
    {
      // create rule line
      file.addLine("      <TR><TD colspan=\"2\"><HR></TD></TR>");
      file.addLine(String("      <TR><TD>Identifier:</TD><TD><B>") + it->getIdentifier() + "</B></TD></TR>");
      file.addLine(String("      <TR><TD>Element:</TD><TD><B>") + it->getElementPath() + "</B></TD></TR>");
      if (it->getRequirementLevel() == CVMappingRule::MUST)
      {
        file.addLine("      <TR><TD>Requirement level:</TD><TD><FONT color=\"red\">MUST</FONT></TD></TR>");
      }
      else if (it->getRequirementLevel() == CVMappingRule::SHOULD)
      {
        file.addLine("      <TR><TD>Requirement level:</TD><TD><FONT color=\"orange\">SHOULD</FONT></TD></TR>");
      }
      else if (it->getRequirementLevel() == CVMappingRule::MAY)
      {
        file.addLine("      <TR><TD>Requirement level:</TD><TD><FONT color=\"green\">MAY</FONT></TD></TR>");
      }
      if (it->getCombinationsLogic() == CVMappingRule::AND)
      {
        file.addLine("      <TR><TD>Combination logic:</TD><TD><FONT color=\"red\">AND</FONT></TD></TR>");
      }
      else if (it->getCombinationsLogic() == CVMappingRule::XOR)
      {
        file.addLine("      <TR><TD>Combination logic:</TD><TD><FONT color=\"orange\">XOR</FONT></TD></TR>");
      }
      else if (it->getCombinationsLogic() == CVMappingRule::OR)
      {
        file.addLine("      <TR><TD>Combination logic:</TD><TD><FONT color=\"green\">OR</FONT></TD></TR>");
      }

      // create table with terms
      for (vector<CVMappingTerm>::const_iterator tit = it->getCVTerms().begin(); tit != it->getCVTerms().end(); ++tit)
      {
        // create term line
        String term_line = String("      <TR><TD valign=\"top\">Term:</TD><TD>");
        if (tit->getAllowChildren())
        {
          ++term_count;
          term_line += String("<a href=\"javascript:toggleDiv('div") + term_count + "','')\" style=\"text-decoration:none\" >+</a> ";
        }
        else
        {
          term_line += String(" ");
        }

        // add term accession, name and description (as popup)
        if (cv.exists(tit->getAccession()))
        {
          const ControlledVocabulary::CVTerm& child_term = cv.getTerm(tit->getAccession());
          String description = child_term.description;
          if (child_term.synonyms.size() != 0)
          {
            description += String(" -- Synonyms: '") + ListUtils::concatenate(child_term.synonyms, ", ") + "'";
          }
          term_line += "<span title=\"" + description + "\">";
        }
        term_line += tit->getAccession() + " ! " + tit->getTermName();
        if (cv.exists(tit->getAccession()))
        {
          term_line += "</span>";
          // check if term accession and term name correspond to the CV
          const ControlledVocabulary::CVTerm& main_term = cv.getTerm(tit->getAccession());
          if (main_term.name != tit->getTermName())
          {
            cerr << "Warning: Accession '" << tit->getAccession() << "' and name '" << tit->getTermName() << "' do not match. Name should be '" << main_term.name << "'." << endl;
          }
        }

        // tags
        StringList tags;
        if (!tit->getUseTerm())
        {
          tags.push_back("children only");
        }
        if (tit->getIsRepeatable())
        {
          tags.push_back("repeatable");
        }
        if (cv.exists(tit->getAccession()))
        {
          const ControlledVocabulary::CVTerm& term = cv.getTerm(tit->getAccession());
          if (term.obsolete)
          {
            tags.push_back("<font color=darkred>obsolete</font>");
          }
          if (term.xref_type != ControlledVocabulary::CVTerm::NONE)
          {
            tags.push_back("value-type=" + ControlledVocabulary::CVTerm::getXRefTypeName(term.xref_type));
          }
          if (term.units.size() > 0)
          {
            StringList units;
            for (set<String>::const_iterator u_it = term.units.begin(); u_it != term.units.end(); ++u_it)
            {
              units.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
            }
            tags.push_back(String("units=") + ListUtils::concatenate(units, ","));
          }
          if (term.xref_binary.size() > 0)
          {
            StringList types;
            for (StringList::const_iterator u_it = term.xref_binary.begin(); u_it != term.xref_binary.end(); ++u_it)
            {
              types.push_back(*u_it + "!" + cv.getTerm(*u_it).name);
            }
            tags.push_back(String("binary-array-types=") + ListUtils::concatenate(types, ","));
          }
        }
        if (tags.size() != 0)
        {
          term_line += String("<FONT color=\"grey\"> (") + ListUtils::concatenate(tags, ", ") + ")</FONT>";
        }
        file.addLine(term_line);

        // check whether we need the whole tree, or just the term itself
        if (tit->getAllowChildren())
        {
          file.addLine(String("      <div id=\"div") + term_count + "\" style=\"display: none\">");
          if (cv.exists(tit->getAccession()))
          {
            writeTermTree_(tit->getAccession(), cv, file, 1);
            //BEGIN - THIS IS NEEDED FOR WRITING PARSERS ONLY
            /*
            set<String> allowed_terms;
            cv.getAllChildTerms(allowed_terms, tit->getAccession());
            for (set<String>::const_iterator atit = allowed_terms.begin(); atit != allowed_terms.end(); ++atit)
            {
              const ControlledVocabulary::CVTerm& child_term = cv.getTerm(*atit);
              String parser_string = String("os << \"<cvParam cvRef=\\\"MS\\\" accession=\\\"") + child_term.id + "\\\" name=\\\"" + child_term.name + "\\\"";
              for (Size i = 0; i < child_term.unparsed.size(); ++i)
              {
                //TODO: this does not work anymore. The type is now stored as a member
                if (child_term.unparsed[i].hasSubstring("value-type:xsd\\:int")
                    || child_term.unparsed[i].hasSubstring("value-type:xsd\\:float")
                    || child_term.unparsed[i].hasSubstring("value-type:xsd\\:string"))
                {
                  parser_string += " value=\\\"\" << << \"\\\"";
                }
              }
              parser_string += "/>\\n\";<BR>";
              file.push_back(parser_string);
            }
            */
          }
          else
          {
            file.addLine("    - Missing terms, CV not loaded...");
            cerr << "Warning: no child terms for " << tit->getAccession() << " found!" << endl;
          }
          file.addLine("      </div>");
          file.addLine("    </TD></TD></TR>");
        }
      }
    }
    file.addLine("    </TABLE>");
    file.addLine("  </BODY>");
    file.addLine("</HTML>");
    file.store(getStringOption_("html"));
    return EXECUTION_OK;
  }

  // iterate over all mapping rules and store the mentioned terms
  StringList ignore_namespaces = getStringList_("ignore_cv");
  set<String> ignore_cv_list;
  for (StringList::const_iterator it = ignore_namespaces.begin(); it != ignore_namespaces.end(); ++it)
  {
    ignore_cv_list.insert(*it);
  }
  set<String> used_terms;
  for (vector<CVMappingRule>::const_iterator it = mappings.getMappingRules().begin(); it != mappings.getMappingRules().end(); ++it)
  {
    set<String> allowed_terms;
    // iterate over all allowed terms
    for (vector<CVMappingTerm>::const_iterator tit = it->getCVTerms().begin(); tit != it->getCVTerms().end(); ++tit)
    {
      // check whether the term itself is allowed, or only its children
      if (tit->getUseTerm())
      {
        allowed_terms.insert(tit->getAccession());
      }

      // check whether we need the whole tree, or just the term itself
      if (tit->getAllowChildren())
      {
        // check whether we want to ignore this term
        if (!(tit->getAccession().has(':') && ignore_cv_list.find(tit->getAccession().prefix(':')) != ignore_cv_list.end()))
        {
          cv.getAllChildTerms(allowed_terms, tit->getAccession());
        }

        // also add the term itself to the used_terms, because all the children are allowed
        used_terms.insert(tit->getAccession());
      }
    }

    // print the allowed terms for the rule
    cout << "MappingRule: id=" << it->getIdentifier() << ", elementPath=" << it->getElementPath() << ", #terms=" << it->getCVTerms().size() << endl;
    for (set<String>::const_iterator ait = allowed_terms.begin(); ait != allowed_terms.end(); ++ait)
    {
      cout << *ait << " " << terms[*ait].name << endl;
    }
    used_terms.insert(allowed_terms.begin(), allowed_terms.end());
  }

  // find unused terms, which CANNOT be used in the XML due to the mapping file
  set<String> unused_terms;
  for (Map<String, ControlledVocabulary::CVTerm>::ConstIterator it = terms.begin(); it != terms.end(); ++it)
  {
    if (used_terms.find(it->first) == used_terms.end())
    {
      unused_terms.insert(it->first);
    }
  }

  cout << "\n\nCVTerms which are unused in the mapping file and therefore MUST NOT be used in an instance document" << endl;
  for (set<String>::const_iterator it = unused_terms.begin(); it != unused_terms.end(); ++it)
  {
    cout << *it << " " << terms[*it].name;
    // print also parent names
    for (set<String>::const_iterator pit = terms[*it].parents.begin(); pit != terms[*it].parents.end(); ++pit)
    {
      cout << " " << terms[*pit].id << " " << terms[*pit].name;
    }
    cout << endl;
  }

  return EXECUTION_OK;
}
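// Hypothetical invocation (tool and file names are assumptions; the flags
// -cv_files, -cv_names, -mapping_file, -html, -ignore_cv come from the option
// handling above). Passing -html switches the tool to HTML export; without it,
// the allowed/unused terms are printed to stdout:
//
//   CVInspector -cv_files psi-ms.obo unit.obo -cv_names MS UO \
//               -mapping_file mapping.xml -html cv_mapping.html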