Esempio n. 1
0
  ExitCodes main_(int, const char**) override
  {
    vector<ProteinIdentification> protein_identifications;

    vector<PeptideIdentification> identifications;
    PeptideIdentification peptide_identification;
    DateTime date_time = DateTime::now();
    String date_time_string = date_time.get();
    peptide_identification.setIdentifier("In-silico_digestion" + date_time_string);

    ProteinIdentification protein_identification;

    protein_identifications.push_back(ProteinIdentification());
    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------
    String inputfile_name = getStringOption_("in");
    String outputfile_name = getStringOption_("out");

    FASTAID FASTA_ID = getStringOption_("FASTA:ID") == "parent" ? PARENT : (getStringOption_("FASTA:ID") == "number" ? NUMBER : BOTH);
    bool keep_FASTA_desc = (getStringOption_("FASTA:description") == "keep");

    // output file type
    FileHandler fh;
    FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));

    if (out_type == FileTypes::UNKNOWN)
    {
      out_type = fh.getTypeByFileName(outputfile_name);
      writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
    }

    if (out_type == FileTypes::UNKNOWN)
    {
      LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
      return PARSE_ERROR;
    }

    Size min_size = getIntOption_("min_length");
    Size max_size = getIntOption_("max_length");
    Size missed_cleavages = getIntOption_("missed_cleavages");


    bool has_FASTA_output = (out_type == FileTypes::FASTA);

    //-------------------------------------------------------------
    // reading input
    //-------------------------------------------------------------
    FASTAFile ff;
    ff.readStart(inputfile_name);
    if (has_FASTA_output) ff.writeStart(outputfile_name);

    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------

    // This should be updated if more cleavage enzymes are available
    ProteinIdentification::SearchParameters search_parameters;
    String enzyme = getStringOption_("enzyme");
    ProteaseDigestion digestor;
    digestor.setEnzyme(enzyme);
    digestor.setMissedCleavages(missed_cleavages);
    search_parameters.digestion_enzyme = *ProteaseDB::getInstance()->getEnzyme(enzyme);

    PeptideHit temp_peptide_hit;
    PeptideEvidence temp_pe;

    protein_identifications[0].setSearchParameters(search_parameters);
    protein_identifications[0].setDateTime(date_time);
    protein_identifications[0].setSearchEngine("In-silico digestion");
    protein_identifications[0].setIdentifier("In-silico_digestion" + date_time_string);

    Size dropped_by_length(0); // stats for removing candidates
    Size fasta_out_count(0);

    FASTAFile::FASTAEntry fe;
    while (ff.readNext(fe))
    {
      if (!has_FASTA_output)
      {
        ProteinHit temp_protein_hit;
        temp_protein_hit.setSequence(fe.sequence);
        temp_protein_hit.setAccession(fe.identifier);
        protein_identifications[0].insertHit(temp_protein_hit);
        temp_pe.setProteinAccession(fe.identifier);
        temp_peptide_hit.setPeptideEvidences(vector<PeptideEvidence>(1, temp_pe));
      }

      vector<AASequence> current_digest;
      if (enzyme == "none")
      {
        current_digest.push_back(AASequence::fromString(fe.sequence));
      }
      else
      {
        dropped_by_length += digestor.digest(AASequence::fromString(fe.sequence), current_digest, min_size, max_size);
      }

      String id = fe.identifier;
      for (auto const& s : current_digest)
      {
        if (!has_FASTA_output)
        {
          temp_peptide_hit.setSequence(s);
          peptide_identification.insertHit(temp_peptide_hit);
          identifications.push_back(peptide_identification);
          peptide_identification.setHits(std::vector<PeptideHit>()); // clear
        }
        else // for FASTA file output
        {
          ++fasta_out_count;
          switch (FASTA_ID)
          {
            case PARENT: break;
            case NUMBER: id = String(fasta_out_count); break;
            case BOTH: id = fe.identifier + "_" + String(fasta_out_count); break;
          }
          ff.writeNext(FASTAFile::FASTAEntry(id, keep_FASTA_desc ? fe.description : "", s.toString()));
        }
      }
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    if (has_FASTA_output)
    {
      ff.writeEnd();
    }
    else
    {
      IdXMLFile().store(outputfile_name,
                        protein_identifications,
                        identifications);
    }

    Size pep_remaining_count = (has_FASTA_output ? fasta_out_count : identifications.size());
    LOG_INFO << "Statistics:\n"
             << "  file:                                    " << inputfile_name << "\n"
             << "  total #peptides after digestion:         " << pep_remaining_count + dropped_by_length << "\n"
             << "  removed #peptides (length restrictions): " << dropped_by_length << "\n"
             << "  remaining #peptides:                     " << pep_remaining_count << std::endl;

    return EXECUTION_OK;
  }
Esempio n. 2
0
  ExitCodes main_(int, const char**) override
  {
    vector<ProteinIdentification> protein_identifications;

    vector<PeptideIdentification> identifications;
    PeptideIdentification peptide_identification;
    DateTime date_time = DateTime::now();
    String date_time_string = date_time.get();
    peptide_identification.setIdentifier("In-silico_digestion" + date_time_string);

    ProteinIdentification protein_identification;

    protein_identifications.push_back(ProteinIdentification());
    //-------------------------------------------------------------
    // parsing parameters
    //-------------------------------------------------------------
    String inputfile_name = getStringOption_("in");
    String outputfile_name = getStringOption_("out");

    // output file type
    FileHandler fh;
    FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));

    if (out_type == FileTypes::UNKNOWN)
    {
      out_type = fh.getTypeByFileName(outputfile_name);
      writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
    }

    if (out_type == FileTypes::UNKNOWN)
    {
      LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
      return PARSE_ERROR;
    }

    Size min_size = getIntOption_("min_length");
    Size max_size = getIntOption_("max_length");
    Size missed_cleavages = getIntOption_("missed_cleavages");


    bool has_FASTA_output = (out_type == FileTypes::FASTA);

    //-------------------------------------------------------------
    // reading input
    //-------------------------------------------------------------
    std::vector<FASTAFile::FASTAEntry> protein_data;
    FASTAFile().load(inputfile_name, protein_data);
    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------

    // This should be updated if more cleavage enzymes are available
    ProteinIdentification::SearchParameters search_parameters;
    String enzyme = getStringOption_("enzyme");
    ProteaseDigestion digestor;
    digestor.setEnzyme(enzyme);
    digestor.setMissedCleavages(missed_cleavages);
    search_parameters.digestion_enzyme = *ProteaseDB::getInstance()->getEnzyme(enzyme);

    PeptideHit temp_peptide_hit;
    PeptideEvidence temp_pe;

    protein_identifications[0].setSearchParameters(search_parameters);
    protein_identifications[0].setDateTime(date_time);
    protein_identifications[0].setSearchEngine("In-silico digestion");
    protein_identifications[0].setIdentifier("In-silico_digestion" + date_time_string);

    std::vector<FASTAFile::FASTAEntry> all_peptides;

    Size dropped_bylength(0); // stats for removing candidates

    for (Size i = 0; i < protein_data.size(); ++i)
    {
      if (!has_FASTA_output)
      {
        ProteinHit temp_protein_hit;
        temp_protein_hit.setSequence(protein_data[i].sequence);
        temp_protein_hit.setAccession(protein_data[i].identifier);
        protein_identifications[0].insertHit(temp_protein_hit);
        temp_pe.setProteinAccession(protein_data[i].identifier);
        temp_peptide_hit.setPeptideEvidences(vector<PeptideEvidence>(1, temp_pe));
      }

      vector<AASequence> temp_peptides;
      if (enzyme == "none")
      {
        temp_peptides.push_back(AASequence::fromString(protein_data[i].sequence));
      }
      else
      {
        vector<AASequence> current_digest;
        digestor.digest(AASequence::fromString(protein_data[i].sequence), current_digest);

        // keep peptides that match length restrictions (and count those that don't match)
        std::copy_if(current_digest.begin(), current_digest.end(), std::back_inserter(temp_peptides), 
          [&dropped_bylength, &min_size, &max_size](const AASequence& s) -> bool
          {
            bool valid_length = (s.size() >= min_size && s.size() <= max_size);
            if (!valid_length)
            {
              ++dropped_bylength;
              return false;
            }
            
            return true;
          });
      }

      for (auto s : temp_peptides)
      {
        if (!has_FASTA_output)
        {
          temp_peptide_hit.setSequence(s);
          peptide_identification.insertHit(temp_peptide_hit);
          identifications.push_back(peptide_identification);
          peptide_identification.setHits(std::vector<PeptideHit>()); // clear
        }
        else // for FASTA file output
        {
          FASTAFile::FASTAEntry pep(protein_data[i].identifier, protein_data[i].description, s.toString());
          all_peptides.push_back(pep);
        }
      }
    }

    //-------------------------------------------------------------
    // writing output
    //-------------------------------------------------------------

    if (has_FASTA_output)
    {
      FASTAFile().store(outputfile_name, all_peptides);
    }
    else
    {
      IdXMLFile().store(outputfile_name,
                        protein_identifications,
                        identifications);
    }

    Size pep_remaining_count = (has_FASTA_output ? all_peptides.size() : identifications.size());
    LOG_INFO << "Statistics:\n"
             << "  total #peptides after digestion:         " << pep_remaining_count + dropped_bylength << "\n"
             << "  removed #peptides (length restrictions): " << dropped_bylength << "\n"
             << "  remaining #peptides:                     " << pep_remaining_count << std::endl;

    return EXECUTION_OK;
  }