/**
  @brief Tool entry point: digests every protein of the input FASTA file and writes the resulting peptides.

  Proteins are read one at a time from the file given by option "in". Each
  sequence is cleaved with the enzyme named by option "enzyme" (or passed
  through unchanged when the enzyme is "none"). Depending on the detected
  output type the peptides are either written directly as FASTA entries or
  collected as peptide identifications and stored via IdXMLFile.

  @return PARSE_ERROR if the output file type cannot be determined,
          EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char**) override
{
  // A single identifier ties the protein run and all peptide identifications together.
  DateTime date_time = DateTime::now();
  const String run_identifier = "In-silico_digestion" + date_time.get();

  std::vector<ProteinIdentification> protein_identifications(1);
  ProteinIdentification& search_run = protein_identifications.front();
  std::vector<PeptideIdentification> identifications;
  PeptideIdentification peptide_identification;
  peptide_identification.setIdentifier(run_identifier);

  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  const String inputfile_name = getStringOption_("in");
  const String outputfile_name = getStringOption_("out");

  // "parent" and "number" select the respective naming scheme; any other value means both.
  const String fasta_id_option = getStringOption_("FASTA:ID");
  FASTAID FASTA_ID = BOTH;
  if (fasta_id_option == "parent")
  {
    FASTA_ID = PARENT;
  }
  else if (fasta_id_option == "number")
  {
    FASTA_ID = NUMBER;
  }
  const bool keep_FASTA_desc = (getStringOption_("FASTA:description") == "keep");

  // output file type: explicit option first, file name as fallback
  FileHandler fh;
  FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));
  if (out_type == FileTypes::UNKNOWN)
  {
    out_type = fh.getTypeByFileName(outputfile_name);
    writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
  }
  if (out_type == FileTypes::UNKNOWN)
  {
    LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
    return PARSE_ERROR;
  }

  // NOTE(review): the integer options are stored in unsigned Size variables;
  // presumably the parameter registration forbids negative values - confirm.
  const Size min_size = getIntOption_("min_length");
  const Size max_size = getIntOption_("max_length");
  const Size missed_cleavages = getIntOption_("missed_cleavages");

  const bool has_FASTA_output = (out_type == FileTypes::FASTA);

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  FASTAFile ff;
  ff.readStart(inputfile_name);
  if (has_FASTA_output)
  {
    ff.writeStart(outputfile_name);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // This should be updated if more cleavage enzymes are available
  const String enzyme = getStringOption_("enzyme");
  ProteaseDigestion digestor;
  digestor.setEnzyme(enzyme);
  digestor.setMissedCleavages(missed_cleavages);

  ProteinIdentification::SearchParameters search_parameters;
  search_parameters.digestion_enzyme = *ProteaseDB::getInstance()->getEnzyme(enzyme);

  search_run.setSearchParameters(search_parameters);
  search_run.setDateTime(date_time);
  search_run.setSearchEngine("In-silico digestion");
  search_run.setIdentifier(run_identifier);

  // Reused for every peptide; only the sequence and the evidence change.
  PeptideHit temp_peptide_hit;
  PeptideEvidence temp_pe;

  Size dropped_by_length(0); // stats for removing candidates
  Size fasta_out_count(0);   // running number of peptides written as FASTA

  FASTAFile::FASTAEntry fe;
  while (ff.readNext(fe))
  {
    if (!has_FASTA_output)
    {
      // Register the parent protein and link subsequent peptide hits to it.
      ProteinHit temp_protein_hit;
      temp_protein_hit.setSequence(fe.sequence);
      temp_protein_hit.setAccession(fe.identifier);
      search_run.insertHit(temp_protein_hit);

      temp_pe.setProteinAccession(fe.identifier);
      temp_peptide_hit.setPeptideEvidences(std::vector<PeptideEvidence>(1, temp_pe));
    }

    std::vector<AASequence> current_digest;
    if (enzyme == "none")
    {
      // NOTE(review): the length restrictions are not applied in this branch.
      current_digest.push_back(AASequence::fromString(fe.sequence));
    }
    else
    {
      dropped_by_length += digestor.digest(AASequence::fromString(fe.sequence), current_digest, min_size, max_size);
    }

    for (const AASequence& peptide : current_digest)
    {
      if (!has_FASTA_output)
      {
        // One identification per peptide, each holding exactly one hit.
        temp_peptide_hit.setSequence(peptide);
        peptide_identification.insertHit(temp_peptide_hit);
        identifications.push_back(peptide_identification);
        peptide_identification.setHits(std::vector<PeptideHit>()); // clear
      }
      else // for FASTA file output
      {
        ++fasta_out_count;
        String id = fe.identifier; // PARENT naming keeps the protein identifier
        switch (FASTA_ID)
        {
          case PARENT:
            break;

          case NUMBER:
            id = String(fasta_out_count);
            break;

          case BOTH:
            id = fe.identifier + "_" + String(fasta_out_count);
            break;
        }
        ff.writeNext(FASTAFile::FASTAEntry(id, keep_FASTA_desc ? fe.description : "", peptide.toString()));
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  if (has_FASTA_output)
  {
    ff.writeEnd();
  }
  else
  {
    IdXMLFile().store(outputfile_name, protein_identifications, identifications);
  }

  const Size pep_remaining_count = (has_FASTA_output ? fasta_out_count : identifications.size());
  LOG_INFO << "Statistics:\n"
           << " file: " << inputfile_name << "\n"
           << " total #peptides after digestion: " << pep_remaining_count + dropped_by_length << "\n"
           << " removed #peptides (length restrictions): " << dropped_by_length << "\n"
           << " remaining #peptides: " << pep_remaining_count << std::endl;

  return EXECUTION_OK;
}
/**
  @brief Tool entry point: digests all proteins of the input FASTA file and stores the resulting peptides.

  The complete FASTA file given by option "in" is loaded first. Each sequence
  is cleaved with the enzyme named by option "enzyme" (or passed through
  unchanged when the enzyme is "none"); peptides outside
  [min_length, max_length] are discarded and counted. Depending on the
  detected output type the peptides are stored either as FASTA entries, which
  carry the identifier and description of their parent protein, or as peptide
  identifications via IdXMLFile.

  @return PARSE_ERROR if the output file type cannot be determined,
          EXECUTION_OK otherwise.
*/
ExitCodes main_(int, const char**) override
{
  // A single identifier ties the protein run and all peptide identifications together.
  DateTime date_time = DateTime::now();
  const String run_identifier = "In-silico_digestion" + date_time.get();

  std::vector<ProteinIdentification> protein_identifications(1);
  ProteinIdentification& search_run = protein_identifications.front();
  std::vector<PeptideIdentification> identifications;
  PeptideIdentification peptide_identification;
  peptide_identification.setIdentifier(run_identifier);

  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  const String inputfile_name = getStringOption_("in");
  const String outputfile_name = getStringOption_("out");

  // output file type: explicit option first, file name as fallback
  FileHandler fh;
  FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));
  if (out_type == FileTypes::UNKNOWN)
  {
    out_type = fh.getTypeByFileName(outputfile_name);
    writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
  }
  if (out_type == FileTypes::UNKNOWN)
  {
    LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
    return PARSE_ERROR;
  }

  // NOTE(review): the integer options are stored in unsigned Size variables;
  // presumably the parameter registration forbids negative values - confirm.
  const Size min_size = getIntOption_("min_length");
  const Size max_size = getIntOption_("max_length");
  const Size missed_cleavages = getIntOption_("missed_cleavages");

  const bool has_FASTA_output = (out_type == FileTypes::FASTA);

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  std::vector<FASTAFile::FASTAEntry> protein_data;
  FASTAFile().load(inputfile_name, protein_data);

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // This should be updated if more cleavage enzymes are available
  const String enzyme = getStringOption_("enzyme");
  ProteaseDigestion digestor;
  digestor.setEnzyme(enzyme);
  digestor.setMissedCleavages(missed_cleavages);

  ProteinIdentification::SearchParameters search_parameters;
  search_parameters.digestion_enzyme = *ProteaseDB::getInstance()->getEnzyme(enzyme);

  search_run.setSearchParameters(search_parameters);
  search_run.setDateTime(date_time);
  search_run.setSearchEngine("In-silico digestion");
  search_run.setIdentifier(run_identifier);

  // Reused for every peptide; only the sequence and the evidence change.
  PeptideHit temp_peptide_hit;
  PeptideEvidence temp_pe;

  std::vector<FASTAFile::FASTAEntry> all_peptides;
  Size dropped_by_length(0); // stats for removing candidates

  for (const FASTAFile::FASTAEntry& protein : protein_data)
  {
    if (!has_FASTA_output)
    {
      // Register the parent protein and link subsequent peptide hits to it.
      ProteinHit temp_protein_hit;
      temp_protein_hit.setSequence(protein.sequence);
      temp_protein_hit.setAccession(protein.identifier);
      search_run.insertHit(temp_protein_hit);

      temp_pe.setProteinAccession(protein.identifier);
      temp_peptide_hit.setPeptideEvidences(std::vector<PeptideEvidence>(1, temp_pe));
    }

    std::vector<AASequence> temp_peptides;
    if (enzyme == "none")
    {
      // NOTE(review): the length restrictions are not applied in this branch.
      temp_peptides.push_back(AASequence::fromString(protein.sequence));
    }
    else
    {
      std::vector<AASequence> current_digest;
      digestor.digest(AASequence::fromString(protein.sequence), current_digest);

      // keep peptides that match length restrictions (and count those that don't match)
      for (const AASequence& candidate : current_digest)
      {
        if (candidate.size() >= min_size && candidate.size() <= max_size)
        {
          temp_peptides.push_back(candidate);
        }
        else
        {
          ++dropped_by_length;
        }
      }
    }

    // Bind by const reference: copying an AASequence per iteration is unnecessary.
    for (const AASequence& peptide : temp_peptides)
    {
      if (!has_FASTA_output)
      {
        // One identification per peptide, each holding exactly one hit.
        temp_peptide_hit.setSequence(peptide);
        peptide_identification.insertHit(temp_peptide_hit);
        identifications.push_back(peptide_identification);
        peptide_identification.setHits(std::vector<PeptideHit>()); // clear
      }
      else // for FASTA file output
      {
        all_peptides.emplace_back(protein.identifier, protein.description, peptide.toString());
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------
  if (has_FASTA_output)
  {
    FASTAFile().store(outputfile_name, all_peptides);
  }
  else
  {
    IdXMLFile().store(outputfile_name, protein_identifications, identifications);
  }

  const Size pep_remaining_count = (has_FASTA_output ? all_peptides.size() : identifications.size());
  LOG_INFO << "Statistics:\n"
           << " total #peptides after digestion: " << pep_remaining_count + dropped_by_length << "\n"
           << " removed #peptides (length restrictions): " << dropped_by_length << "\n"
           << " remaining #peptides: " << pep_remaining_count << std::endl;

  return EXECUTION_OK;
}