예제 #1
0
// Prepares `lookup` for resolving Mascot spectrum titles to spectrum meta data.
//
// The spectra of `exp` are read into `lookup` first (scan numbers are taken
// from the native IDs, expected format: "... scan=#"). Afterwards the
// reference formats (regular expressions with named groups) are registered:
//  - if `scan_regex` is non-empty, it is the only format registered;
//  - otherwise a set of built-in formats is registered, in the order below.
void MascotXMLFile::initializeLookup(SpectrumMetaDataLookup& lookup, const MSExperiment<>& exp, const String& scan_regex)
{
    lookup.readSpectra(exp.getSpectra());

    // A user-supplied expression replaces all built-in formats.
    if (!scan_regex.empty())
    {
        lookup.addReferenceFormat(scan_regex);
        return;
    }

    // Scan-number based formats are only registered when raw data was given,
    // i.e. when the look-up actually contains spectra.
    const bool spectra_available = !lookup.empty();
    if (spectra_available)
    {
        // Titles carrying an explicit scan number, e.g.:
        //   Mascot 2.3 (?):
        //     <pep_scan_title>scan=818</pep_scan_title>                    -> 818
        //   ProteomeDiscoverer/Mascot 2.3 or 2.4:
        //     <pep_scan_title>Spectrum136 scans:712,</pep_scan_title>      -> 712
        //   other variants:
        //     <pep_scan_title>Spectrum3411 scans: 2975,</pep_scan_title>   -> 2975
        //     <...>File773 Spectrum198145 scans: 6094</...>                -> 6094
        //     <...>6860: Scan 10668 (rt=5380.57)</...>                     -> 10668
        //     <pep_scan_title>Scan Number: 1460</pep_scan_title>           -> 1460
        lookup.addReferenceFormat("[Ss]can( [Nn]umber)?s?[=:]? *(?<SCAN>\\d+)");

        // Titles derived from .dta input files, e.g.:
        //   <...>/path/to/FTAC05_13.673.673.2.dta</...>                    -> 673
        lookup.addReferenceFormat("\\.(?<SCAN>\\d+)\\.\\d+\\.(?<CHARGE>\\d+)(\\.dta)?");
    }

    // Titles that start with m/z and RT instead of a scan number, e.g.:
    //   <...>575.848571777344_5018.0811_controllerType=0 controllerNumber=1 scan=11515_EcoliMS2small</...>
    // This format is registered regardless of whether spectra are available.
    lookup.addReferenceFormat("^(?<MZ>\\d+(\\.\\d+)?)_(?<RT>\\d+(\\.\\d+)?)");
}
예제 #2
0
// Loads a Mascot XML file into protein and peptide identifications.
//
// @param filename                Path of the Mascot XML file to parse.
// @param protein_identification  Output; reset to a default-constructed object
//                                before parsing, then filled by the handler.
// @param id_data                 Output; cleared before parsing. After parsing,
//                                identifications without a usable peptide hit
//                                are removed and a duplicated leading hit is
//                                collapsed (see below).
// @param peptides                Passed through to the XML handler unchanged.
// @param lookup                  Spectrum meta data look-up, passed through to
//                                the XML handler; if it is non-empty, retention
//                                times are expected on the identifications.
//
// @throws Exception::MissingInformation if `lookup` is non-empty, at least one
//         peptide identification remains after filtering, and none of them
//         has a retention time.
void MascotXMLFile::load(const String& filename,
                         ProteinIdentification& protein_identification,
                         vector<PeptideIdentification>& id_data,
                         map<String, vector<AASequence> >& peptides,
                         const SpectrumMetaDataLookup& lookup)
{
    // start from a clean state
    protein_identification = ProteinIdentification();
    id_data.clear();

    Internal::MascotXMLHandler handler(protein_identification, id_data,
                                       filename, peptides, lookup);
    parse_(filename, &handler);

    // since the Mascot XML can contain "peptides" without sequences,
    // the identifications without any real peptide hit are removed
    vector<PeptideIdentification> filtered_hits;
    filtered_hits.reserve(id_data.size());
    Size missing_sequence = 0; // number of non-empty identifications dropped

    for (vector<PeptideIdentification>::iterator id_it = id_data.begin();
            id_it != id_data.end(); ++id_it)
    {
        const vector<PeptideHit>& peptide_hits = id_it->getHits();
        // keep the identification if it has several hits, or a single hit
        // that actually carries a sequence
        if (!peptide_hits.empty() &&
                (peptide_hits.size() > 1 || !peptide_hits[0].getSequence().empty()))
        {
            filtered_hits.push_back(*id_it);
        }
        else if (!id_it->empty())
        {
            ++missing_sequence;
        }
    }
    if (missing_sequence)
    {
        LOG_WARN << "Warning: Removed " << missing_sequence
                 << " peptide identifications without sequence." << endl;
    }
    id_data.swap(filtered_hits);

    // check if we have (some) RT information:
    Size no_rt_count = 0;
    for (vector<PeptideIdentification>::iterator id_it = id_data.begin();
            id_it != id_data.end(); ++id_it)
    {
        if (!id_it->hasRT()) ++no_rt_count;
    }
    if (no_rt_count)
    {
        LOG_WARN << "Warning: " << no_rt_count << " (of " << id_data.size()
                 << ") peptide identifications have no retention time value."
                 << endl;
    }
    // if we have a mapping, but couldn't find any RT values, that's an error.
    // An empty result is not an error: without the emptiness check the
    // comparison "0 == 0" would raise a misleading exception for files that
    // simply contain no usable peptide identifications.
    if (!lookup.empty() && !id_data.empty() && (no_rt_count == id_data.size()))
    {
        throw Exception::MissingInformation(
            __FILE__, __LINE__, __PRETTY_FUNCTION__,
            "No retention time information for peptide identifications found");
    }

    // argh! Mascot 2.2 tends to repeat the first hit (yes it appears twice),
    // so we delete one of them
    for (vector<PeptideIdentification>::iterator it = id_data.begin();
            it != id_data.end(); ++it)
    {
        // inspect through a reference; the hits are only copied when a
        // duplicate actually has to be removed
        const vector<PeptideHit>& peptide_hits = it->getHits();
        // check if the first two hits are equal, except for rank
        if (peptide_hits.size() > 1 &&
                peptide_hits[0].getScore() == peptide_hits[1].getScore() &&
                peptide_hits[0].getSequence() == peptide_hits[1].getSequence() &&
                peptide_hits[0].getCharge() == peptide_hits[1].getCharge())
        {
            // drop the repeated (second) entry and keep the first one
            vector<PeptideHit> deduplicated(peptide_hits);
            deduplicated.erase(deduplicated.begin() + 1);
            it->setHits(deduplicated);
        }
    }
}
// Attach the current precursor to the spectrum and append the spectrum to the
// test container.
spectrum.getPrecursors().push_back(prec);
spectra.push_back(spectrum);
// Reuse the same spectrum object for the next entry: native ID "spectrum=2",
// RT 3.0, MS level 2.
spectrum.setNativeID("spectrum=2");
spectrum.setRT(3.0);
spectrum.setMSLevel(2);
// Update the precursor (m/z 500.0, charge 3) and overwrite the precursor that
// is already stored at index 0 instead of adding a second one.
prec.setMZ(500.0);
prec.setCharge(3);
spectrum.getPrecursors()[0] = prec;
spectra.push_back(spectrum);

// Look-up object shared by the test sections that follow.
SpectrumMetaDataLookup lookup;

// Test section for readSpectra(): reads the `spectra` container into the
// shared `lookup` object and checks that the look-up is no longer empty.
START_SECTION((template <typename SpectrumContainer> void readSpectra(const SpectrumContainer&, const String&, bool)))
{
  // Uses the default scan regular expression of SpectrumLookup.
  // NOTE(review): the meaning of the trailing `true` flag is not visible
  // here - confirm against the readSpectra() declaration.
  lookup.readSpectra(spectra, SpectrumLookup::default_scan_regexp, true);
  TEST_EQUAL(lookup.empty(), false);
}
END_SECTION

START_SECTION((void getSpectrumMetaData(Size, SpectrumMetaData&) const))
{
  SpectrumMetaDataLookup::SpectrumMetaData meta;
  lookup.getSpectrumMetaData(0, meta);
  TEST_EQUAL(meta.rt, 1.0);
  TEST_EQUAL(meta.ms_level, 1);
  TEST_EQUAL(meta.native_id, "spectrum=0");
  TEST_EQUAL(meta.scan_number, 0);

  lookup.getSpectrumMetaData(1, meta);
  TEST_EQUAL(meta.rt, 2.0);
  TEST_EQUAL(meta.precursor_rt, 1.0);