void EnzymaticDigestion::digest(const AASequence& protein, vector<AASequence>& output) const { // initialization output.clear(); // naive cleavage sites Size missed_cleavages = missed_cleavages_; std::vector<Size> pep_positions = tokenize_(protein.toUnmodifiedString()); Size count = pep_positions.size(); Size begin = pep_positions[0]; for (Size i = 1; i < count; ++i) { output.push_back(protein.getSubsequence(begin, pep_positions[i] - begin)); begin = pep_positions[i]; } output.push_back(protein.getSubsequence(begin, protein.size() - begin)); // missed cleavages if (pep_positions.size() > 0 && missed_cleavages_ != 0) // there is at least one cleavage site! { // generate fragments with missed cleavages for (Size i = 1; ((i <= missed_cleavages) && (count > i)); ++i) { begin = pep_positions[0]; for (Size j = 1; j < count - i; ++j) { output.push_back(protein.getSubsequence(begin, pep_positions[j + i] - begin)); begin = pep_positions[j]; } output.push_back(protein.getSubsequence(begin, protein.size() - begin)); } } }
// Counts the digestion products of @p protein without generating them.
//
// With n positions reported by tokenize_(), the result is n plus (n - k) for every
// number of missed cleavages k = 1, 2, ... that is both <= missed_cleavages_ and < n.
Size EnzymaticDigestion::peptideCount(const AASequence& protein)
{
  const std::vector<Size> fragment_starts = tokenize_(protein.toUnmodifiedString());
  const Size n_fragments = fragment_starts.size();

  // fully cleaved products
  Size total = n_fragments;

  // each additional missed cleavage merges neighbours, giving one product less per round
  for (Size skipped = 1; skipped < n_fragments && skipped <= missed_cleavages_; ++skipped)
  {
    total += n_fragments - skipped;
  }
  return total;
}
// Digests @p protein and appends every product whose length lies in
// [min_length, max_length] to @p output (which is cleared first).
//
// @param protein     sequence to digest
// @param output      receives the accepted products
// @param min_length  minimal accepted product length
// @param max_length  maximal accepted product length; 0 or anything larger than the
//                    protein is replaced by the protein length
// @return number of products that were discarded because of their length
Size ProteaseDigestion::digest(const AASequence& protein, vector<AASequence>& output, Size min_length, Size max_length) const
{
  output.clear();

  // switch the upper length filter off by clamping it to the protein length
  if (max_length == 0 || max_length > protein.size())
  {
    max_length = protein.size();
  }

  // unspecific cleavage: any number of sites may be skipped
  Size max_skipped = missed_cleavages_;
  if (enzyme_->getName() == UnspecificCleavage)
  {
    max_skipped = std::numeric_limits<Size>::max();
  }

  // product boundaries: 0, x1, ..., xn, followed by the protein end
  std::vector<int> boundaries = tokenize_(protein.toUnmodifiedString());
  boundaries.push_back(protein.size());
  const Size n_boundaries = boundaries.size();

  Size rejected = 0;

  // Round 0 yields the fully cleaved products; round k yields products spanning k missed
  // cleavage sites. Rounds are emitted in increasing k.
  for (Size skipped = 0; skipped <= max_skipped; ++skipped)
  {
    // beyond round 0 there must be at least one real cleavage site left to skip
    if (skipped > 0 && skipped >= n_boundaries - 1)
    {
      break;
    }

    Size start = boundaries[0];
    for (Size next = 1; next < n_boundaries - skipped; ++next)
    {
      const Size length = boundaries[next + skipped] - start;
      if (length < min_length || length > max_length)
      {
        ++rejected;
      }
      else
      {
        output.push_back(protein.getSubsequence(start, length));
      }
      start = boundaries[next];
    }
  }

  return rejected;
}
// Counts the digestion products of @p protein without generating them.
//
// For unspecific cleavage every cutting position may be skipped, which gives
// (n + 1) choose 2 products for a protein of length n. Otherwise, with c positions
// reported by tokenize_(), the result is c plus (c - k) for every number of missed
// cleavages k = 1, 2, ... that is both <= missed_cleavages_ and < c.
Size ProteaseDigestion::peptideCount(const AASequence& protein)
{
  if (enzyme_->getName() == UnspecificCleavage)
  {
    return (protein.size() + 1) * protein.size() / 2;
  }

  const std::vector<int> fragment_starts = tokenize_(protein.toUnmodifiedString());
  const Size n_fragments = fragment_starts.size();

  // fully cleaved products
  Size total = n_fragments;

  // every further missed cleavage contributes one product less than the round before
  for (Size skipped = 1; skipped < n_fragments && skipped <= missed_cleavages_; ++skipped)
  {
    total += n_fragments - skipped;
  }
  return total;
}
void EnzymaticDigestion::digestUnmodifiedString(const StringView sequence, std::vector<StringView>& output, Size min_length, Size max_length) const { // initialization output.clear(); // naive cleavage sites std::vector<Size> pep_positions = tokenize_(sequence.getString()); Size count = pep_positions.size(); // disable max length filter by setting to maximum length if (max_length == 0) { max_length = sequence.size(); } // no cleavage sites? return full string if (count == 0) { if (sequence.size() >= min_length && sequence.size() <= max_length) { output.push_back(sequence); } return; } for (Size i = 1; i != count; ++i) { // add if cleavage product larger then min length Size l = pep_positions[i] - pep_positions[i - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[i - 1], pep_positions[i] - 1)); } } // add last cleavage product (need to add because end is not a cleavage site) if larger then min length Size l = sequence.size() - pep_positions[count - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[count - 1], sequence.size() - 1)); } // generate fragments with missed cleavages for (Size i = 1; ((i <= missed_cleavages_) && (i < count)); ++i) { for (Size j = 1; j < count - i; ++j) { Size l = pep_positions[j + i] - pep_positions[j - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[j - 1], pep_positions[j + i] - 1)); } } // add last cleavage product (need to add because end is not a cleavage site) Size l = sequence.size() - pep_positions[count - i - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[count - i - 1], sequence.size() - 1 )); } } }
bool EnzymaticDigestion::isValidProduct(const AASequence& protein, Size pep_pos, Size pep_length, bool methionine_cleavage, bool ignore_missed_cleavages) const { if (pep_pos >= protein.size()) { LOG_WARN << "Error: start of peptide (" << pep_pos << ") is beyond end of protein '" << protein.toString() << "'!" << endl; return false; } else if (pep_pos + pep_length > protein.size()) { LOG_WARN << "Error: end of peptide (" << (pep_pos + pep_length) << ") is beyond end of protein '" << protein.toString() << "'!" << endl; return false; } else if (pep_length == 0 || protein.size() == 0) { LOG_WARN << "Error: peptide or protein must not be empty!" << endl; return false; } if (specificity_ == SPEC_NONE) { return true; // we don't care about terminal ends } else // either SPEC_SEMI or SPEC_FULL { bool spec_c = false, spec_n = false; std::vector<Size> pep_positions = tokenize_(protein.toUnmodifiedString()); // initialize start and end std::vector<Size>::const_iterator begin_pos, end_pos; begin_pos = end_pos = pep_positions.end(); // test each end if (pep_pos == 0 || (begin_pos = std::find(pep_positions.begin(), pep_positions.end(), pep_pos)) != pep_positions.end()) { spec_n = true; } // if allow methionine cleavage at the protein start position if (pep_pos == 1 && methionine_cleavage && protein.getResidue((Size)0).getOneLetterCode() == "M") { // methionine_cleavage:consider the first product for begin_pos begin_pos = pep_positions.begin(); spec_n = true; } if (pep_pos + pep_length == protein.size() || (end_pos = std::find(pep_positions.begin(), pep_positions.end(), pep_pos + pep_length)) != pep_positions.end()) { spec_c = true; } if (spec_n && spec_c) { if (ignore_missed_cleavages) { return true; } Size offset = std::distance(begin_pos, end_pos); if (pep_pos + pep_length == protein.size()) { return (pep_positions.size() <= getMissedCleavages() + 1); } else if (offset > getMissedCleavages() + 1) { return false; } else if (offset == 0) { // This corner case needs to be checked 
when peptide is at the start and the end of the protein. // We check with the total number of cleavages. return (pep_positions.size() >= getMissedCleavages() + 1); } else { return true; } } else if ((specificity_ == SPEC_SEMI) && (spec_n || spec_c)) { return true; // one only for SEMI } else { return false; } } }