PeptideIndexing::PeptideIndexing() : DefaultParamHandler("PeptideIndexing") { defaults_.setValue("decoy_string", "", "String that was appended (or prefixed - see 'decoy_string_position' flag below) to the accessions in the protein database to indicate decoy proteins. If empty (default), it's determined automatically (checking for common terms, both as prefix and suffix)."); defaults_.setValue("decoy_string_position", "prefix", "Is the 'decoy_string' prepended (prefix) or appended (suffix) to the protein accession? (ignored if decoy_string is empty)"); defaults_.setValidStrings("decoy_string_position", ListUtils::create<String>("prefix,suffix")); defaults_.setValue("missing_decoy_action", "error", "Action to take if NO peptide was assigned to a decoy protein (which indicates wrong database or decoy string): 'error' (exit with error, no output), 'warn' (exit with success, warning message), 'silent' (no action is taken, not even a warning)"); defaults_.setValidStrings("missing_decoy_action", ListUtils::create<String>("error,warn,silent")); defaults_.setValue("enzyme:name", "Trypsin", "Enzyme which determines valid cleavage sites - e.g. trypsin cleaves after lysine (K) or arginine (R), but not before proline (P)."); StringList enzymes; ProteaseDB::getInstance()->getAllNames(enzymes); defaults_.setValidStrings("enzyme:name", enzymes); defaults_.setValue("enzyme:specificity", EnzymaticDigestion::NamesOfSpecificity[0], "Specificity of the enzyme." "\n '" + EnzymaticDigestion::NamesOfSpecificity[0] + "': both internal cleavage sites must match." "\n '" + EnzymaticDigestion::NamesOfSpecificity[1] + "': one of two internal cleavage sites must match." "\n '" + EnzymaticDigestion::NamesOfSpecificity[2] + "': allow all peptide hits no matter their context. Therefore, the enzyme chosen does not play a role here"); StringList spec; spec.assign(EnzymaticDigestion::NamesOfSpecificity, EnzymaticDigestion::NamesOfSpecificity + EnzymaticDigestion::SIZE_OF_SPECIFICITY); defaults_.setValidStrings("enzyme:specificity", spec); defaults_.setValue("write_protein_sequence", "false", "If set, the protein sequences are stored as well."); defaults_.setValidStrings("write_protein_sequence", ListUtils::create<String>("true,false")); defaults_.setValue("write_protein_description", "false", "If set, the protein description is stored as well."); defaults_.setValidStrings("write_protein_description", ListUtils::create<String>("true,false")); defaults_.setValue("keep_unreferenced_proteins", "false", "If set, protein hits which are not referenced by any peptide are kept."); defaults_.setValidStrings("keep_unreferenced_proteins", ListUtils::create<String>("true,false")); defaults_.setValue("allow_unmatched", "false", "If set, unmatched peptide sequences are allowed. By default (i.e. if this flag is not set) the program terminates with an error on unmatched peptides."); defaults_.setValidStrings("allow_unmatched", ListUtils::create<String>("true,false")); defaults_.setValue("aaa_max", 3, "Maximal number of ambiguous amino acids (AAAs) allowed when matching to a protein database with AAAs. AAAs are B, J, Z and X!"); defaults_.setMinInt("aaa_max", 0); defaults_.setMaxInt("aaa_max", 10); defaults_.setValue("mismatches_max", 0, "Maximal number of mismatched (mm) amino acids allowed when matching to a protein database." " The required runtime is exponential in the number of mm's; apply with care." " MM's are allowed in addition to AAA's."); defaults_.setMinInt("mismatches_max", 0); defaults_.setMaxInt("mismatches_max", 10); defaults_.setValue("IL_equivalent", "false", "Treat the isobaric amino acids isoleucine ('I') and leucine ('L') as equivalent (indistinguishable). Also occurences of 'J' will be treated as 'I' thus avoiding ambiguous matching."); defaults_.setValidStrings("IL_equivalent", ListUtils::create<String>("true,false")); defaultsToParam_(); }
void registerOptionsAndFlags_() { vector<String> all_mods; StringList all_enzymes; StringList specificity; ModificationsDB::getInstance()->getAllSearchModifications(all_mods); EnzymesDB::getInstance()->getAllNames(all_enzymes); specificity.assign(EnzymaticDigestion::NamesOfSpecificity, EnzymaticDigestion::NamesOfSpecificity + EnzymaticDigestion::SIZE_OF_SPECIFICITY); registerInputFile_("in", "<file>", "", "input file "); setValidFormats_("in", ListUtils::create<String>("idXML")); registerOutputFile_("out", "<file>", "", "output file "); setValidFormats_("out", ListUtils::create<String>("idXML")); registerTOPPSubsection_("precursor", "Filtering by precursor RT or m/z"); registerStringOption_("precursor:rt", "[min]:[max]", ":", "Retention time range to extract.", false); registerStringOption_("precursor:mz", "[min]:[max]", ":", "Mass-to-charge range to extract.", false); registerTOPPSubsection_("score", "Filtering by peptide/protein score."); registerDoubleOption_("score:pep", "<score>", 0, "The score which should be reached by a peptide hit to be kept.", false); registerDoubleOption_("score:prot", "<score>", 0, "The score which should be reached by a protein hit to be kept. Use in combination with 'delete_unreferenced_peptide_hits' to remove affected peptides.", false); registerTOPPSubsection_("thresh", "Filtering by significance threshold"); registerDoubleOption_("thresh:pep", "<fraction>", 0.0, "Keep a peptide hit only if its score is above this fraction of the peptide significance threshold.", false, true); registerDoubleOption_("thresh:prot", "<fraction>", 0.0, "Keep a protein hit only if its score is above this fraction of the protein significance threshold. Use in combination with 'delete_unreferenced_peptide_hits' to remove affected peptides.", false, true); registerTOPPSubsection_("whitelist", "Filtering by whitelisting (only peptides/proteins from a given set can pass)"); registerInputFile_("whitelist:proteins", "<file>", "", "Filename of a FASTA file containing protein sequences.\n" "All peptides that are not referencing a protein in this file are removed.\n" "All proteins whose accessions are not present in this file are removed.", false); setValidFormats_("whitelist:proteins", ListUtils::create<String>("fasta")); registerStringList_("whitelist:protein_accessions", "<accessions>", vector<String>(), "All peptides that do not reference at least one of the provided protein accession are removed.\nOnly proteins of the provided list are retained.", false); registerInputFile_("whitelist:peptides", "<file>", "", "Only peptides with the same sequence and modification assignment as any peptide in this file are kept. Use with 'whitelist:ignore_modifications' to only compare by sequence.\n", false); setValidFormats_("whitelist:peptides", ListUtils::create<String>("idXML")); registerFlag_("whitelist:ignore_modifications", "Compare whitelisted peptides by sequence only.", false); registerStringList_("whitelist:modifications", "<selection>", vector<String>(), "Keep only peptides with sequences that contain (any of) the selected modification(s)", false); setValidStrings_("whitelist:modifications", all_mods); registerTOPPSubsection_("blacklist", "Filtering by blacklisting (only peptides/proteins NOT present in a given set can pass)"); registerInputFile_("blacklist:proteins", "<file>", "", "Filename of a FASTA file containing protein sequences.\n" "All peptides that are referencing a protein in this file are removed.\n" "All proteins whose accessions are present in this file are removed.", false); setValidFormats_("blacklist:proteins", ListUtils::create<String>("fasta")); registerStringList_("blacklist:protein_accessions", "<accessions>", vector<String>(), "All peptides that reference at least one of the provided protein accession are removed.\nOnly proteins not in the provided list are retained.", false); registerInputFile_("blacklist:peptides", "<file>", "", "Peptides with the same sequence and modification assignment as any peptide in this file are filtered out. Use with 'blacklist:ignore_modifications' to only compare by sequence.\n", false); setValidFormats_("blacklist:peptides", ListUtils::create<String>("idXML")); registerFlag_("blacklist:ignore_modifications", "Compare blacklisted peptides by sequence only.", false); registerStringList_("blacklist:modifications", "<selection>", vector<String>(), "Remove all peptides with sequences that contain (any of) the selected modification(s)", false); setValidStrings_("blacklist:modifications", all_mods); registerTOPPSubsection_("digest", "Perform protein digestion and filter peptides based on digestion products"); registerInputFile_("digest:fasta", "<file>", "", "Input sequence database in FASTA format", false); setValidFormats_("digest:fasta", ListUtils::create<String>("fasta")); registerStringOption_("digest:enzyme", "<enzyme>", "Trypsin", "Specify the digestion enzyme",false); setValidStrings_("digest:enzyme", all_enzymes); registerStringOption_("digest:specificity", "<specificity>", specificity[EnzymaticDigestion::SPEC_FULL], "Specificity of the filter", false); setValidStrings_("digest:specificity", specificity); registerIntOption_("digest:missed_cleavages", "<integer>", -1, "filter peptide evidences that have more than the specified missed_cleavages\n" "By default missed cleavages are ignored", false); setMinInt_("digest:missed_cleavages", -1); registerFlag_("digest:methionine_cleavage", "Allow methionine cleavage at the protein start", false); registerTOPPSubsection_("rt", "Filtering by RT predicted by 'RTPredict'"); registerDoubleOption_("rt:p_value", "<float>", 0.0, "Retention time filtering by the p-value predicted by RTPredict.", false, true); registerDoubleOption_("rt:p_value_1st_dim", "<float>", 0.0, "Retention time filtering by the p-value predicted by RTPredict for first dimension.", false, true); setMinFloat_("rt:p_value", 0); setMaxFloat_("rt:p_value", 1); setMinFloat_("rt:p_value_1st_dim", 0); setMaxFloat_("rt:p_value_1st_dim", 1); registerTOPPSubsection_("mz", "Filtering by mass error"); registerDoubleOption_("mz:error", "<float>", -1, "Filtering by deviation to theoretical mass (disabled for negative values).", false); registerStringOption_("mz:unit", "<String>", "ppm", "Absolute or relative error.", false); setValidStrings_("mz:unit", ListUtils::create<String>("Da,ppm")); registerTOPPSubsection_("best", "Filtering best hits per spectrum (for peptides) or from proteins"); registerIntOption_("best:n_peptide_hits", "<integer>", 0, "Keep only the 'n' highest scoring peptide hits per spectrum (for n > 0).", false); setMinInt_("best:n_peptide_hits", 0); registerIntOption_("best:n_protein_hits", "<integer>", 0, "Keep only the 'n' highest scoring protein hits (for n > 0).", false); setMinInt_("best:n_protein_hits", 0); registerFlag_("best:strict", "Keep only the highest scoring peptide hit.\n" "Similar to n_peptide_hits=1, but if there are ties between two or more highest scoring hits, none are kept."); registerStringOption_("best:n_to_m_peptide_hits", "[min]:[max]", ":", "Peptide hit rank range to extracts", false, true); registerStringOption_("length", "[min]:[max]", ":", "Keep only peptide hits with a sequence length in this range.", false); registerStringOption_("charge", "[min]:[max]", ":", "Keep only peptide hits with charge states in this range.", false); registerFlag_("var_mods", "Keep only peptide hits with variable modifications (as defined in the 'SearchParameters' section of the input file).", false); registerFlag_("unique", "If a peptide hit occurs more than once per peptide ID, only one instance is kept."); registerFlag_("unique_per_protein", "Only peptides matching exactly one protein are kept. Remember that isoforms count as different proteins!"); registerFlag_("keep_unreferenced_protein_hits", "Proteins not referenced by a peptide are retained in the IDs."); registerFlag_("remove_decoys", "Remove proteins according to the information in the user parameters. Usually used in combination with 'delete_unreferenced_peptide_hits'."); registerFlag_("delete_unreferenced_peptide_hits", "Peptides not referenced by any protein are deleted in the IDs. Usually used in combination with 'score:prot' or 'thresh:prot'."); }