Ejemplo n.º 1
0
  void ProtXMLFile::load(const String& filename, ProteinIdentification& protein_ids, PeptideIdentification& peptide_ids)
  {
    //Filename for error messages in XMLHandler
    file_ = filename;

    resetMembers_();

    // reset incoming data
    protein_ids = ProteinIdentification();
    peptide_ids = PeptideIdentification();

    // remember data link while parsing
    prot_id_ = &protein_ids;
    pep_id_ = &peptide_ids;

    parse_(filename, this);
  }
Ejemplo n.º 2
0
void MascotXMLFile::load(const String& filename,
                         ProteinIdentification& protein_identification,
                         vector<PeptideIdentification>& id_data,
                         map<String, vector<AASequence> >& peptides,
                         const SpectrumMetaDataLookup& lookup)
{
    //clear
    protein_identification = ProteinIdentification();
    id_data.clear();

    Internal::MascotXMLHandler handler(protein_identification, id_data,
                                       filename, peptides, lookup);
    parse_(filename, &handler);

    // since the Mascot XML can contain "peptides" without sequences,
    // the identifications without any real peptide hit are removed
    vector<PeptideIdentification> filtered_hits;
    filtered_hits.reserve(id_data.size());
    Size missing_sequence = 0; // counter

    for (vector<PeptideIdentification>::iterator id_it = id_data.begin();
            id_it != id_data.end(); ++id_it)
    {
        const vector<PeptideHit>& peptide_hits = id_it->getHits();
        if (!peptide_hits.empty() &&
                (peptide_hits.size() > 1 || !peptide_hits[0].getSequence().empty()))
        {
            filtered_hits.push_back(*id_it);
        }
        else if (!id_it->empty()) ++missing_sequence;
    }
    if (missing_sequence)
    {
        LOG_WARN << "Warning: Removed " << missing_sequence
                 << " peptide identifications without sequence." << endl;
    }
    id_data.swap(filtered_hits);

    // check if we have (some) RT information:
    Size no_rt_count = 0;
    for (vector<PeptideIdentification>::iterator id_it = id_data.begin();
            id_it != id_data.end(); ++id_it)
    {
        if (!id_it->hasRT()) ++no_rt_count;
    }
    if (no_rt_count)
    {
        LOG_WARN << "Warning: " << no_rt_count << " (of " << id_data.size()
                 << ") peptide identifications have no retention time value."
                 << endl;
    }
    // if we have a mapping, but couldn't find any RT values, that's an error:
    if (!lookup.empty() && (no_rt_count == id_data.size()))
    {
        throw Exception::MissingInformation(
            __FILE__, __LINE__, __PRETTY_FUNCTION__,
            "No retention time information for peptide identifications found");
    }

    // argh! Mascot 2.2 tends to repeat the first hit (yes it appears twice),
    // so we delete one of them
    for (vector<PeptideIdentification>::iterator it = id_data.begin();
            it != id_data.end(); ++it)
    {
        vector<PeptideHit> peptide_hits = it->getHits();
        // check if equal, except for rank
        if (peptide_hits.size() > 1 &&
                peptide_hits[0].getScore() == peptide_hits[1].getScore() &&
                peptide_hits[0].getSequence() == peptide_hits[1].getSequence() &&
                peptide_hits[0].getCharge() == peptide_hits[1].getCharge())
        {
            // erase first hit
            peptide_hits.erase(peptide_hits.begin() + 1);
            it->setHits(peptide_hits);
        }
    }
}
Ejemplo n.º 3
0
  void
  PepNovoOutfile::load(
    const std::string & result_filename,
    vector<PeptideIdentification> & peptide_identifications,
    ProteinIdentification & protein_identification,
    const double & score_threshold,
    const IndexPosMappingType & index_to_precursor,
    const map<String, String> & pnovo_modkey_to_mod_id
    )
  {
    // generally used variables
    StringList substrings;
    map<String, Int> columns;
    PeptideHit peptide_hit;

    String
      line,
      score_type = "PepNovo",
      version = "unknown",
      identifier,
      filename,
      sequence,
      sequence_with_mods;

    DateTime datetime = DateTime::now();     // there's no date given from PepNovo
    protein_identification.setDateTime(datetime);

    peptide_identifications.clear();
    PeptideIdentification peptide_identification;
    protein_identification = ProteinIdentification();

    // open the result
    ifstream result_file(result_filename.c_str());
    if (!result_file)
    {
      throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, result_filename);
    }

    Size line_number(0);     // used to report in which line an error occurred
    Size id_count(0);        // number of IDs seen (not necessarily the ones finally returned)

    getSearchEngineAndVersion(result_filename, protein_identification);
    //if information could not be retrieved from the outfile use defaults
    if (protein_identification.getSearchEngineVersion().empty())
    {
      protein_identification.setSearchEngine("PepNovo");
      protein_identification.setSearchEngineVersion(version);
    }
    identifier = protein_identification.getSearchEngine() + "_" + datetime.getDate();
    protein_identification.setIdentifier(identifier);

    map<String, String> mod_mask_map;
    const vector<String> & mods = protein_identification.getSearchParameters().variable_modifications;
    for (vector<String>::const_iterator mod_it = mods.begin(); mod_it != mods.end(); ++mod_it)
    {
      if (mod_it->empty())
        continue;
      //cout<<*mod_it<<endl;
      if (pnovo_modkey_to_mod_id.find(*mod_it) != pnovo_modkey_to_mod_id.end())
      {
        //cout<<keys_to_id.find(*mod_it)->second<<endl;
        ResidueModification tmp_mod = ModificationsDB::getInstance()->getModification(pnovo_modkey_to_mod_id.find(*mod_it)->second);
        if (mod_it->prefix(1) == "^" || mod_it->prefix(1) == "$")
        {
          mod_mask_map[*mod_it] = "(" + tmp_mod.getId() + ")";
        }
        else
        {
          mod_mask_map[*mod_it] = String(tmp_mod.getOrigin()) + "(" + tmp_mod.getId() + ")";
        }
      }
      else
      {
        if (mod_it->prefix(1) != "^" && mod_it->prefix(1) != "$")
        {
          mod_mask_map[*mod_it] = mod_it->prefix(1) + "[" + mod_it->substr(1) + "]";
          //cout<<mod_mask_map[*mod_it]<<endl;
        }
        else
        {
          mod_mask_map[*mod_it] = "[" + *mod_it + "]";
          //cout<<mod_mask_map[*mod_it]<<endl;
        }
      }
    }


    Size index;
    while (getline(result_file, line))
    {
      if (!line.empty() && (line[line.length() - 1] < 33)) line.resize(line.length() - 1); // remove weird EOL character
      line.trim();
      ++line_number;
      if (line.hasPrefix(">> "))         // >> 1 /home/shared/pepnovo/4611_raw_ms2_picked.mzXML.1001.2.dta
      {
        ++id_count;
        if (!peptide_identification.empty() && !peptide_identification.getHits().empty())
        {
          peptide_identifications.push_back(peptide_identification);
        }

        line.split(' ', substrings);
        //String index = File::basename(line.substr(line.find(' ', strlen(">> ")) + 1));
        if (substrings.size() < 3)
        {
          throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Not enough columns (spectrum Id) in file in line " + String(line_number) + String(" (should be 2 or more)!"), result_filename);
        }

        try
        {
          index = substrings[2].trim().toInt();
        }
        catch (...)
        {
          throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Expected an index number in line " + String(line_number) + String(" at position 2 (line was: '" + line + "')!"), result_filename);
        }

        //cout<<"INDEX: "<<index<<endl;
        peptide_identification = PeptideIdentification();
        bool success = false;
        if (index_to_precursor.size()>0)
        {
          if (index_to_precursor.find(index) != index_to_precursor.end())
          {
            peptide_identification.setRT(index_to_precursor.find(index)->second.first);
            peptide_identification.setMZ(index_to_precursor.find(index)->second.second);
            success = true;
          }
          else throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Index '" + String(index) + String("' in line '" + line + "' not found in index table (line was: '" + line + "')!"), result_filename);
        }

        if (!success)
        { // try to reconstruct from title entry (usually sensible when MGF is supplied to PepNovo)
          try
          {
            if (substrings.size() >= 4)
            {
              StringList parts = ListUtils::create<String>(substrings[3], '_');
              if (parts.size() >= 2)
              {
                peptide_identification.setRT(parts[1].toDouble());
                peptide_identification.setMZ(parts[0].toDouble());
                success = true;
              }
            }
          }
          catch (...)
          {

          }
          if (!success) throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Precursor could not be reconstructed from title '" + substrings[3] + String("' in line '" + line + "' (line was: '" + line + "')!"), result_filename);
        }
        peptide_identification.setSignificanceThreshold(score_threshold);
        peptide_identification.setScoreType(score_type);
        peptide_identification.setIdentifier(identifier);
      }
      else if (line.hasPrefix("#Index"))         // #Index  Prob    Score   N-mass  C-Mass  [M+H]   Charge  Sequence
      {
        if (columns.empty())           // map the column names to their column number
        {
          line.split('\t', substrings);
          for (vector<String>::const_iterator s_i = substrings.begin(); s_i != substrings.end(); ++s_i)
          {
            if ((*s_i) == "#Index")
              columns["Index"] = s_i - substrings.begin();
            else if ((*s_i) == "RnkScr")
              columns["RnkScr"] = s_i - substrings.begin();
            else if ((*s_i) == "PnvScr")
              columns["PnvScr"] = s_i - substrings.begin();
            else if ((*s_i) == "N-Gap")
              columns["N-Gap"] = s_i - substrings.begin();
            else if ((*s_i) == "C-Gap")
              columns["C-Gap"] = s_i - substrings.begin();
            else if ((*s_i) == "[M+H]")
              columns["[M+H]"] = s_i - substrings.begin();
            else if ((*s_i) == "Charge")
              columns["Charge"] = s_i - substrings.begin();
            else if ((*s_i) == "Sequence")
              columns["Sequence"] = s_i - substrings.begin();
          }

          if (columns.size() != 8)
          {
            result_file.close();
            result_file.clear();
            throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Not enough columns in file in line " + String(line_number) + String(" (should be 8)!"), result_filename);
          }
        }
        while (getline(result_file, line))
        {
          ++line_number;
          if (!line.empty() && (line[line.length() - 1] < 33))
            line.resize(line.length() - 1);
          line.trim();

          if (line.empty())
            break;

          line.split('\t', substrings);
          if (!substrings.empty())
          {
            if (substrings.size() != 8)
            {
              result_file.close();
              result_file.clear();
              throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Not enough columns in file in line " + String(line_number) + String(" (should be 8)!"), result_filename);
            }
            if (substrings[columns["RnkScr"]].toFloat() >= score_threshold)
            {
              peptide_hit = PeptideHit();
              peptide_hit.setCharge(substrings[columns["Charge"]].toInt());
              peptide_hit.setRank(substrings[columns["Index"]].toInt() + 1);
              peptide_hit.setScore(substrings[columns["RnkScr"]].toFloat());
              peptide_hit.setMetaValue("PnvScr", substrings[columns["PnvScr"]].toFloat());
              peptide_hit.setMetaValue("N-Gap", substrings[columns["N-Gap"]].toFloat());
              peptide_hit.setMetaValue("C-Gap", substrings[columns["C-Gap"]].toFloat());
              peptide_hit.setMetaValue("MZ", substrings[columns["[M+H]"]].toFloat());
              sequence = substrings[columns["Sequence"]];


              for (map<String, String>::iterator mask_it = mod_mask_map.begin(); mask_it != mod_mask_map.end(); ++mask_it)
              {
                if (mask_it->first.hasPrefix("^") && sequence.hasSubstring(mask_it->first))
                {
                  sequence.substitute(mask_it->first, "");
                  sequence = mask_it->second + sequence;
                }
                //cout<<mask_it->first<<" "<<mask_it->second<<endl;
                sequence.substitute(mask_it->first, mask_it->second);
              }
              peptide_hit.setSequence(AASequence::fromString(sequence));
              peptide_identification.insertHit(peptide_hit);
            }
          }
        }
      }
    }
    if (!peptide_identifications.empty() || !peptide_identification.getHits().empty())
    {
      peptide_identifications.push_back(peptide_identification);
    }

    result_file.close();
    result_file.clear();

    LOG_INFO << "Parsed " << id_count << " ids, retained " << peptide_identifications.size() << "." << std::endl;

  }