/// @brief Digests every protein of the input FASTA file into peptides and writes the result.
///
/// Depending on the output file type (option 'out_type', or derived from the
/// name given in 'out'), the peptides are written either as FASTA entries or
/// as peptide identifications via IdXMLFile.
///
/// @return PARSE_ERROR if the output file type cannot be determined,
///         EXECUTION_OK otherwise.
ExitCodes main_(int, const char**) override
{
  vector<ProteinIdentification> protein_identifications;
  vector<PeptideIdentification> identifications;
  PeptideIdentification peptide_identification;

  // Protein and peptide identifications share one identifier, built once from the current time.
  const DateTime date_time = DateTime::now();
  const String date_time_string = date_time.get();
  const String run_identifier("In-silico_digestion" + date_time_string);
  peptide_identification.setIdentifier(run_identifier);

  protein_identifications.push_back(ProteinIdentification());

  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  const String inputfile_name = getStringOption_("in");
  const String outputfile_name = getStringOption_("out");

  // Read the option once; any value other than "parent" or "number" selects BOTH.
  const String fasta_id_option = getStringOption_("FASTA:ID");
  const FASTAID FASTA_ID = (fasta_id_option == "parent") ? PARENT
                           : ((fasta_id_option == "number") ? NUMBER : BOTH);
  const bool keep_FASTA_desc = (getStringOption_("FASTA:description") == "keep");

  // output file type: explicit option first, file name as fallback
  FileHandler fh;
  FileTypes::Type out_type = FileTypes::nameToType(getStringOption_("out_type"));

  if (out_type == FileTypes::UNKNOWN)
  {
    out_type = fh.getTypeByFileName(outputfile_name);
    writeDebug_(String("Output file type: ") + FileTypes::typeToName(out_type), 2);
  }

  if (out_type == FileTypes::UNKNOWN)
  {
    LOG_ERROR << ("Error: Could not determine output file type!") << std::endl;
    return PARSE_ERROR;
  }

  const Size min_size = getIntOption_("min_length");
  const Size max_size = getIntOption_("max_length");
  const Size missed_cleavages = getIntOption_("missed_cleavages");

  const bool has_FASTA_output = (out_type == FileTypes::FASTA);

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------
  // The same FASTAFile object streams the input and, for FASTA output, the result.
  FASTAFile ff;
  ff.readStart(inputfile_name);
  if (has_FASTA_output)
  {
    ff.writeStart(outputfile_name);
  }

  //-------------------------------------------------------------
  // calculations
  //-------------------------------------------------------------

  // This should be updated if more cleavage enzymes are available
  ProteinIdentification::SearchParameters search_parameters;
  const String enzyme = getStringOption_("enzyme");
  ProteaseDigestion digestor;
  digestor.setEnzyme(enzyme);
  digestor.setMissedCleavages(missed_cleavages);
  search_parameters.digestion_enzyme = *ProteaseDB::getInstance()->getEnzyme(enzyme);

  PeptideHit temp_peptide_hit;
  PeptideEvidence temp_pe;

  protein_identifications[0].setSearchParameters(search_parameters);
  protein_identifications[0].setDateTime(date_time);
  protein_identifications[0].setSearchEngine("In-silico digestion");
  protein_identifications[0].setIdentifier(run_identifier);

  Size dropped_by_length(0); // stats for removing candidates
  Size fasta_out_count(0);

  FASTAFile::FASTAEntry fe;
  while (ff.readNext(fe))
  {
    if (!has_FASTA_output)
    {
      // Register the parent protein and link all of its peptides to it.
      ProteinHit temp_protein_hit;
      temp_protein_hit.setSequence(fe.sequence);
      temp_protein_hit.setAccession(fe.identifier);
      protein_identifications[0].insertHit(temp_protein_hit);
      temp_pe.setProteinAccession(fe.identifier);
      temp_peptide_hit.setPeptideEvidences(vector<PeptideEvidence>(1, temp_pe));
    }

    vector<AASequence> current_digest;
    if (enzyme == "none")
    {
      // No digestion: the whole protein is passed on; the length limits are not applied here.
      current_digest.push_back(AASequence::fromString(fe.sequence));
    }
    else
    {
      dropped_by_length += digestor.digest(AASequence::fromString(fe.sequence), current_digest, min_size, max_size);
    }

    String id = fe.identifier;
    for (auto const& s : current_digest)
    {
      if (!has_FASTA_output)
      {
        // One PeptideIdentification per peptide; the shared object is reset after each copy.
        temp_peptide_hit.setSequence(s);
        peptide_identification.insertHit(temp_peptide_hit);
        identifications.push_back(peptide_identification);
        peptide_identification.setHits(std::vector<PeptideHit>()); // clear
      }
      else // for FASTA file output
      {
        ++fasta_out_count;
        switch (FASTA_ID)
        {
          case PARENT: // keep the parent protein's identifier
            break;
          case NUMBER: // running peptide number only
            id = String(fasta_out_count);
            break;
          case BOTH: // parent identifier plus running peptide number
            id = fe.identifier + "_" + String(fasta_out_count);
            break;
        }
        ff.writeNext(FASTAFile::FASTAEntry(id, keep_FASTA_desc ? fe.description : "", s.toString()));
      }
    }
  }

  //-------------------------------------------------------------
  // writing output
  //-------------------------------------------------------------

  if (has_FASTA_output)
  {
    ff.writeEnd();
  }
  else
  {
    IdXMLFile().store(outputfile_name, protein_identifications, identifications);
  }

  const Size pep_remaining_count = (has_FASTA_output ? fasta_out_count : identifications.size());
  LOG_INFO << "Statistics:\n"
           << "  file:                                    " << inputfile_name << "\n"
           << "  total #peptides after digestion:         " << pep_remaining_count + dropped_by_length << "\n"
           << "  removed #peptides (length restrictions): " << dropped_by_length << "\n"
           << "  remaining #peptides:                     " << pep_remaining_count << std::endl;

  return EXECUTION_OK;
}
/// @brief Writes a FASTA database containing decoy versions of all input proteins.
///
/// Every protein read from the files in 'in' is turned into a decoy by either
/// shuffling (method == "shuffle") or reversing its sequence; the decoy
/// identifier is obtained from getIdentifier_(). Unless the flag 'only_decoy'
/// is set, the unmodified protein is written in front of its decoy.
///
/// @return EXECUTION_OK
ExitCodes main_(int, const char **)
{
  //-------------------------------------------------------------
  // parsing parameters
  //-------------------------------------------------------------
  StringList in(getStringList_("in"));
  String out(getStringOption_("out"));
  bool append = (!getFlag_("only_decoy"));
  bool shuffle = (getStringOption_("method") == "shuffle");
  String decoy_string(getStringOption_("decoy_string"));
  bool decoy_string_position_prefix = (String(getStringOption_("decoy_string_position")) == "prefix");

  //-------------------------------------------------------------
  // reading input
  //-------------------------------------------------------------

  if (in.size() == 1)
  {
    LOG_WARN << "Warning: Only one FASTA input file was provided, which might not contain contaminants. You probably want to have them! Just add the contaminant file to the input file list 'in'." << endl;
  }

  set<String> identifiers; // spot duplicate identifiers  // std::unordered_set<string> has slightly more RAM, but slightly less CPU

  FASTAFile f;
  f.writeStart(out);
  FASTAFile::FASTAEntry protein;

  if (shuffle)
  {
    // Seed exactly once. Re-seeding with time(0) for every protein restarts the
    // same rand() sequence for all proteins handled within the same second, so
    // proteins of equal length would all receive the identical permutation.
    srand(time(0));
  }

  for (Size i = 0; i < in.size(); ++i)
  {
    f.readStart(in[i]);

    //-------------------------------------------------------------
    // calculations
    //-------------------------------------------------------------
    while (f.readNext(protein))
    {
      // insert() reports whether the identifier was already known (single lookup)
      if (!identifiers.insert(protein.identifier).second)
      {
        LOG_WARN << "DecoyDatabase: Warning, identifier '" << protein.identifier << "' occurs more than once!" << endl;
      }

      if (append)
      {
        f.writeNext(protein);
      }

      // identifier
      protein.identifier = getIdentifier_(protein.identifier, decoy_string, decoy_string_position_prefix);

      // sequence
      if (shuffle)
      {
        // Draw residues without replacement from the first x (unconsumed) positions.
        // Note: 'temp' only collects the drawn residues; the in-place rearranged
        // 'protein.sequence' is what gets written below.
        String temp;
        Size x = protein.sequence.size();
        while (x != 0)
        {
          Size y = rand() % x;
          temp += protein.sequence[y];
          --x;
          protein.sequence[y] = protein.sequence[x]; // overwrite consumed position with last position (about to go out of scope for next dice roll)
        }
      }
      else // reverse
      {
        protein.sequence.reverse();
      }

      //-------------------------------------------------------------
      // writing output
      //-------------------------------------------------------------
      f.writeNext(protein);

    } // next protein
  } // input files

  // finalize the output stream explicitly instead of relying on destruction of 'f'
  f.writeEnd();

  return EXECUTION_OK;
}