// TODO US-ASCII support only, no UTF-8 support // While UTF-8 might work in some cases, we do not guarantee full functionality template <typename StringView> inline auto decompose(const StringView &lhs, const StringView &rhs) { auto const lcs = longest_common_substring(lhs, rhs); // trim spaces, transform to lower const auto trim = [](StringView view) { // we compare suffixes based on this value, it might break UTF chars, but as long as we are // consistent in handling, we do not create bad results std::string str = boost::to_lower_copy(view.to_string()); auto front = str.find_first_not_of(" "); if (front == std::string::npos) return str; auto back = str.find_last_not_of(" "); return str.substr(front, back - front + 1); }; if (lcs.empty()) { return std::make_tuple(trim(lhs), trim(rhs), std::string(), std::string()); } // find the common substring in both auto lhs_pos = lhs.find(lcs); auto rhs_pos = rhs.find(lcs); BOOST_ASSERT(lhs_pos + lcs.size() <= lhs.size()); BOOST_ASSERT(rhs_pos + lcs.size() <= rhs.size()); // prefixes auto lhs_prefix = (lhs_pos > 0) ? lhs.substr(0, lhs_pos) : StringView(); auto rhs_prefix = (rhs_pos > 0) ? rhs.substr(0, rhs_pos) : StringView(); // suffices auto lhs_suffix = lhs.substr(lhs_pos + lcs.size()); auto rhs_suffix = rhs.substr(rhs_pos + lcs.size()); return std::make_tuple(trim(lhs_prefix), trim(lhs_suffix), trim(rhs_prefix), trim(rhs_suffix)); }
void EnzymaticDigestion::digestUnmodifiedString(const StringView sequence, std::vector<StringView>& output, Size min_length, Size max_length) const { // initialization output.clear(); // naive cleavage sites std::vector<Size> pep_positions = tokenize_(sequence.getString()); Size count = pep_positions.size(); // disable max length filter by setting to maximum length if (max_length == 0) { max_length = sequence.size(); } // no cleavage sites? return full string if (count == 0) { if (sequence.size() >= min_length && sequence.size() <= max_length) { output.push_back(sequence); } return; } for (Size i = 1; i != count; ++i) { // add if cleavage product larger then min length Size l = pep_positions[i] - pep_positions[i - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[i - 1], pep_positions[i] - 1)); } } // add last cleavage product (need to add because end is not a cleavage site) if larger then min length Size l = sequence.size() - pep_positions[count - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[count - 1], sequence.size() - 1)); } // generate fragments with missed cleavages for (Size i = 1; ((i <= missed_cleavages_) && (i < count)); ++i) { for (Size j = 1; j < count - i; ++j) { Size l = pep_positions[j + i] - pep_positions[j - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[j - 1], pep_positions[j + i] - 1)); } } // add last cleavage product (need to add because end is not a cleavage site) Size l = sequence.size() - pep_positions[count - i - 1]; if (l >= min_length && l <= max_length) { output.push_back(sequence.substr(pep_positions[count - i - 1], sequence.size() - 1 )); } } }