// Prepares "lookup" for resolving Mascot spectrum titles against the spectra
// of "exp".
//
// lookup:     look-up object that receives the spectra and reference formats
// exp:        experiment whose spectra are registered with the look-up
// scan_regex: user-defined reference format; if empty, a set of built-in
//             default formats is registered instead
void MascotXMLFile::initializeLookup(SpectrumMetaDataLookup& lookup, const MSExperiment<>& exp, const String& scan_regex)
{
  // load spectra and extract scan numbers from the native IDs
  // (expected format: "... scan=#"):
  lookup.readSpectra(exp.getSpectra());

  // a user-defined format replaces all of the defaults
  if (!scan_regex.empty())
  {
    lookup.addReferenceFormat(scan_regex);
    return;
  }

  // scan-number based formats are only registered if raw data was given,
  // i.e. if a spectrum look-up is possible at all
  const bool have_spectra = !lookup.empty();
  if (have_spectra)
  {
    // titles carrying a scan number, e.g.:
    // - Mascot 2.3 (?):
    //   <pep_scan_title>scan=818</pep_scan_title> -> 818
    // - ProteomeDiscoverer/Mascot 2.3 or 2.4:
    //   <pep_scan_title>Spectrum136 scans:712,</pep_scan_title> -> 712
    // - other variants:
    //   <pep_scan_title>Spectrum3411 scans: 2975,</pep_scan_title> -> 2975
    //   <...>File773 Spectrum198145 scans: 6094</...> -> 6094
    //   <...>6860: Scan 10668 (rt=5380.57)</...> -> 10668
    //   <pep_scan_title>Scan Number: 1460</pep_scan_title> -> 1460
    lookup.addReferenceFormat("[Ss]can( [Nn]umber)?s?[=:]? *(?<SCAN>\\d+)");

    // titles produced with .dta input to Mascot, e.g.:
    //   <...>/path/to/FTAC05_13.673.673.2.dta</...> -> 673
    lookup.addReferenceFormat("\\.(?<SCAN>\\d+)\\.\\d+\\.(?<CHARGE>\\d+)(\\.dta)?");
  }

  // titles containing m/z and RT instead of a scan number (always registered
  // in default mode, with or without raw data), e.g.:
  //   <...>575.848571777344_5018.0811_controllerType=0 controllerNumber=1 scan=11515_EcoliMS2small</...>
  lookup.addReferenceFormat("^(?<MZ>\\d+(\\.\\d+)?)_(?<RT>\\d+(\\.\\d+)?)");
}
void MascotXMLFile::load(const String& filename, ProteinIdentification& protein_identification, vector<PeptideIdentification>& id_data, map<String, vector<AASequence> >& peptides, const SpectrumMetaDataLookup& lookup) { //clear protein_identification = ProteinIdentification(); id_data.clear(); Internal::MascotXMLHandler handler(protein_identification, id_data, filename, peptides, lookup); parse_(filename, &handler); // since the Mascot XML can contain "peptides" without sequences, // the identifications without any real peptide hit are removed vector<PeptideIdentification> filtered_hits; filtered_hits.reserve(id_data.size()); Size missing_sequence = 0; // counter for (vector<PeptideIdentification>::iterator id_it = id_data.begin(); id_it != id_data.end(); ++id_it) { const vector<PeptideHit>& peptide_hits = id_it->getHits(); if (!peptide_hits.empty() && (peptide_hits.size() > 1 || !peptide_hits[0].getSequence().empty())) { filtered_hits.push_back(*id_it); } else if (!id_it->empty()) ++missing_sequence; } if (missing_sequence) { LOG_WARN << "Warning: Removed " << missing_sequence << " peptide identifications without sequence." << endl; } id_data.swap(filtered_hits); // check if we have (some) RT information: Size no_rt_count = 0; for (vector<PeptideIdentification>::iterator id_it = id_data.begin(); id_it != id_data.end(); ++id_it) { if (!id_it->hasRT()) ++no_rt_count; } if (no_rt_count) { LOG_WARN << "Warning: " << no_rt_count << " (of " << id_data.size() << ") peptide identifications have no retention time value." << endl; } // if we have a mapping, but couldn't find any RT values, that's an error: if (!lookup.empty() && (no_rt_count == id_data.size())) { throw Exception::MissingInformation( __FILE__, __LINE__, __PRETTY_FUNCTION__, "No retention time information for peptide identifications found"); } // argh! 
Mascot 2.2 tends to repeat the first hit (yes it appears twice), // so we delete one of them for (vector<PeptideIdentification>::iterator it = id_data.begin(); it != id_data.end(); ++it) { vector<PeptideHit> peptide_hits = it->getHits(); // check if equal, except for rank if (peptide_hits.size() > 1 && peptide_hits[0].getScore() == peptide_hits[1].getScore() && peptide_hits[0].getSequence() == peptide_hits[1].getSequence() && peptide_hits[0].getCharge() == peptide_hits[1].getCharge()) { // erase first hit peptide_hits.erase(peptide_hits.begin() + 1); it->setHits(peptide_hits); } } }
spectrum.getPrecursors().push_back(prec); spectra.push_back(spectrum); spectrum.setNativeID("spectrum=2"); spectrum.setRT(3.0); spectrum.setMSLevel(2); prec.setMZ(500.0); prec.setCharge(3); spectrum.getPrecursors()[0] = prec; spectra.push_back(spectrum); SpectrumMetaDataLookup lookup; START_SECTION((template <typename SpectrumContainer> void readSpectra(const SpectrumContainer&, const String&, bool))) { lookup.readSpectra(spectra, SpectrumLookup::default_scan_regexp, true); TEST_EQUAL(lookup.empty(), false); } END_SECTION START_SECTION((void getSpectrumMetaData(Size, SpectrumMetaData&) const)) { SpectrumMetaDataLookup::SpectrumMetaData meta; lookup.getSpectrumMetaData(0, meta); TEST_EQUAL(meta.rt, 1.0); TEST_EQUAL(meta.ms_level, 1); TEST_EQUAL(meta.native_id, "spectrum=0"); TEST_EQUAL(meta.scan_number, 0); lookup.getSpectrumMetaData(1, meta); TEST_EQUAL(meta.rt, 2.0); TEST_EQUAL(meta.precursor_rt, 1.0);