Esempio n. 1
0
  void ProtXMLFile::load(const String& filename, ProteinIdentification& protein_ids, PeptideIdentification& peptide_ids)
  {
    //Filename for error messages in XMLHandler
    file_ = filename;

    resetMembers_();

    // reset incoming data
    protein_ids = ProteinIdentification();
    peptide_ids = PeptideIdentification();

    // remember data link while parsing
    prot_id_ = &protein_ids;
    pep_id_ = &peptide_ids;

    parse_(filename, this);
  }
Esempio n. 2
0
  void
  PepNovoOutfile::load(
    const std::string & result_filename,
    vector<PeptideIdentification> & peptide_identifications,
    ProteinIdentification & protein_identification,
    const double & score_threshold,
    const IndexPosMappingType & index_to_precursor,
    const map<String, String> & pnovo_modkey_to_mod_id
    )
  {
    // generally used variables
    StringList substrings;
    map<String, Int> columns;
    PeptideHit peptide_hit;

    String
      line,
      score_type = "PepNovo",
      version = "unknown",
      identifier,
      filename,
      sequence,
      sequence_with_mods;

    DateTime datetime = DateTime::now();     // there's no date given from PepNovo
    protein_identification.setDateTime(datetime);

    peptide_identifications.clear();
    PeptideIdentification peptide_identification;
    protein_identification = ProteinIdentification();

    // open the result
    ifstream result_file(result_filename.c_str());
    if (!result_file)
    {
      throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, result_filename);
    }

    Size line_number(0);     // used to report in which line an error occurred
    Size id_count(0);        // number of IDs seen (not necessarily the ones finally returned)

    getSearchEngineAndVersion(result_filename, protein_identification);
    //if information could not be retrieved from the outfile use defaults
    if (protein_identification.getSearchEngineVersion().empty())
    {
      protein_identification.setSearchEngine("PepNovo");
      protein_identification.setSearchEngineVersion(version);
    }
    identifier = protein_identification.getSearchEngine() + "_" + datetime.getDate();
    protein_identification.setIdentifier(identifier);

    map<String, String> mod_mask_map;
    const vector<String> & mods = protein_identification.getSearchParameters().variable_modifications;
    for (vector<String>::const_iterator mod_it = mods.begin(); mod_it != mods.end(); ++mod_it)
    {
      if (mod_it->empty())
        continue;
      //cout<<*mod_it<<endl;
      if (pnovo_modkey_to_mod_id.find(*mod_it) != pnovo_modkey_to_mod_id.end())
      {
        //cout<<keys_to_id.find(*mod_it)->second<<endl;
        ResidueModification tmp_mod = ModificationsDB::getInstance()->getModification(pnovo_modkey_to_mod_id.find(*mod_it)->second);
        if (mod_it->prefix(1) == "^" || mod_it->prefix(1) == "$")
        {
          mod_mask_map[*mod_it] = "(" + tmp_mod.getId() + ")";
        }
        else
        {
          mod_mask_map[*mod_it] = String(tmp_mod.getOrigin()) + "(" + tmp_mod.getId() + ")";
        }
      }
      else
      {
        if (mod_it->prefix(1) != "^" && mod_it->prefix(1) != "$")
        {
          mod_mask_map[*mod_it] = mod_it->prefix(1) + "[" + mod_it->substr(1) + "]";
          //cout<<mod_mask_map[*mod_it]<<endl;
        }
        else
        {
          mod_mask_map[*mod_it] = "[" + *mod_it + "]";
          //cout<<mod_mask_map[*mod_it]<<endl;
        }
      }
    }


    Size index;
    while (getline(result_file, line))
    {
      if (!line.empty() && (line[line.length() - 1] < 33)) line.resize(line.length() - 1); // remove weird EOL character
      line.trim();
      ++line_number;
      if (line.hasPrefix(">> "))         // >> 1 /home/shared/pepnovo/4611_raw_ms2_picked.mzXML.1001.2.dta
      {
        ++id_count;
        if (!peptide_identification.empty() && !peptide_identification.getHits().empty())
        {
          peptide_identifications.push_back(peptide_identification);
        }

        line.split(' ', substrings);
        //String index = File::basename(line.substr(line.find(' ', strlen(">> ")) + 1));
        if (substrings.size() < 3)
        {
          throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Not enough columns (spectrum Id) in file in line " + String(line_number) + String(" (should be 2 or more)!"), result_filename);
        }

        try
        {
          index = substrings[2].trim().toInt();
        }
        catch (...)
        {
          throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Expected an index number in line " + String(line_number) + String(" at position 2 (line was: '" + line + "')!"), result_filename);
        }

        //cout<<"INDEX: "<<index<<endl;
        peptide_identification = PeptideIdentification();
        bool success = false;
        if (index_to_precursor.size()>0)
        {
          if (index_to_precursor.find(index) != index_to_precursor.end())
          {
            peptide_identification.setRT(index_to_precursor.find(index)->second.first);
            peptide_identification.setMZ(index_to_precursor.find(index)->second.second);
            success = true;
          }
          else throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Index '" + String(index) + String("' in line '" + line + "' not found in index table (line was: '" + line + "')!"), result_filename);
        }

        if (!success)
        { // try to reconstruct from title entry (usually sensible when MGF is supplied to PepNovo)
          try
          {
            if (substrings.size() >= 4)
            {
              StringList parts = ListUtils::create<String>(substrings[3], '_');
              if (parts.size() >= 2)
              {
                peptide_identification.setRT(parts[1].toDouble());
                peptide_identification.setMZ(parts[0].toDouble());
                success = true;
              }
            }
          }
          catch (...)
          {

          }
          if (!success) throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Precursor could not be reconstructed from title '" + substrings[3] + String("' in line '" + line + "' (line was: '" + line + "')!"), result_filename);
        }
        peptide_identification.setSignificanceThreshold(score_threshold);
        peptide_identification.setScoreType(score_type);
        peptide_identification.setIdentifier(identifier);
      }
      else if (line.hasPrefix("#Index"))         // #Index  Prob    Score   N-mass  C-Mass  [M+H]   Charge  Sequence
      {
        if (columns.empty())           // map the column names to their column number
        {
          line.split('\t', substrings);
          for (vector<String>::const_iterator s_i = substrings.begin(); s_i != substrings.end(); ++s_i)
          {
            if ((*s_i) == "#Index")
              columns["Index"] = s_i - substrings.begin();
            else if ((*s_i) == "RnkScr")
              columns["RnkScr"] = s_i - substrings.begin();
            else if ((*s_i) == "PnvScr")
              columns["PnvScr"] = s_i - substrings.begin();
            else if ((*s_i) == "N-Gap")
              columns["N-Gap"] = s_i - substrings.begin();
            else if ((*s_i) == "C-Gap")
              columns["C-Gap"] = s_i - substrings.begin();
            else if ((*s_i) == "[M+H]")
              columns["[M+H]"] = s_i - substrings.begin();
            else if ((*s_i) == "Charge")
              columns["Charge"] = s_i - substrings.begin();
            else if ((*s_i) == "Sequence")
              columns["Sequence"] = s_i - substrings.begin();
          }

          if (columns.size() != 8)
          {
            result_file.close();
            result_file.clear();
            throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Not enough columns in file in line " + String(line_number) + String(" (should be 8)!"), result_filename);
          }
        }
        while (getline(result_file, line))
        {
          ++line_number;
          if (!line.empty() && (line[line.length() - 1] < 33))
            line.resize(line.length() - 1);
          line.trim();

          if (line.empty())
            break;

          line.split('\t', substrings);
          if (!substrings.empty())
          {
            if (substrings.size() != 8)
            {
              result_file.close();
              result_file.clear();
              throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Not enough columns in file in line " + String(line_number) + String(" (should be 8)!"), result_filename);
            }
            if (substrings[columns["RnkScr"]].toFloat() >= score_threshold)
            {
              peptide_hit = PeptideHit();
              peptide_hit.setCharge(substrings[columns["Charge"]].toInt());
              peptide_hit.setRank(substrings[columns["Index"]].toInt() + 1);
              peptide_hit.setScore(substrings[columns["RnkScr"]].toFloat());
              peptide_hit.setMetaValue("PnvScr", substrings[columns["PnvScr"]].toFloat());
              peptide_hit.setMetaValue("N-Gap", substrings[columns["N-Gap"]].toFloat());
              peptide_hit.setMetaValue("C-Gap", substrings[columns["C-Gap"]].toFloat());
              peptide_hit.setMetaValue("MZ", substrings[columns["[M+H]"]].toFloat());
              sequence = substrings[columns["Sequence"]];


              for (map<String, String>::iterator mask_it = mod_mask_map.begin(); mask_it != mod_mask_map.end(); ++mask_it)
              {
                if (mask_it->first.hasPrefix("^") && sequence.hasSubstring(mask_it->first))
                {
                  sequence.substitute(mask_it->first, "");
                  sequence = mask_it->second + sequence;
                }
                //cout<<mask_it->first<<" "<<mask_it->second<<endl;
                sequence.substitute(mask_it->first, mask_it->second);
              }
              peptide_hit.setSequence(AASequence::fromString(sequence));
              peptide_identification.insertHit(peptide_hit);
            }
          }
        }
      }
    }
    if (!peptide_identifications.empty() || !peptide_identification.getHits().empty())
    {
      peptide_identifications.push_back(peptide_identification);
    }

    result_file.close();
    result_file.clear();

    LOG_INFO << "Parsed " << id_count << " ids, retained " << peptide_identifications.size() << "." << std::endl;

  }