// Splits two strings around their longest common substring.
//
// Returns a std::tuple of four std::string values, each lower-cased and stripped of leading and
// trailing spaces:
//   (lhs_prefix, lhs_suffix, rhs_prefix, rhs_suffix)
// where prefix/suffix are the parts of the respective input before/after the first occurrence
// of the longest common substring.
//
// If the inputs share no common substring, the result is (trim(lhs), trim(rhs), "", "").
//
// TODO US-ASCII support only, no UTF-8 support
// While UTF-8 might work in some cases, we do not guarantee full functionality
template <typename StringView> inline auto decompose(const StringView &lhs, const StringView &rhs)
{
    auto const lcs = longest_common_substring(lhs, rhs);

    // trim spaces, transform to lower
    const auto trim = [](StringView view) {
        // we compare suffixes based on this value, it might break UTF chars, but as long as we are
        // consistent in handling, we do not create bad results
        std::string str = boost::to_lower_copy(view.to_string());
        auto front = str.find_first_not_of(' ');

        // empty or made up of spaces only: nothing survives trimming
        if (front == std::string::npos)
            return std::string();

        auto back = str.find_last_not_of(' ');
        return str.substr(front, back - front + 1);
    };

    if (lcs.empty())
    {
        // NOTE(review): here the whole rhs ends up in the second tuple slot, which holds the
        // lhs suffix in the regular case below. Left unchanged since callers may rely on it;
        // confirm this layout is intended.
        return std::make_tuple(trim(lhs), trim(rhs), std::string(), std::string());
    }

    // find the common substring in both
    auto lhs_pos = lhs.find(lcs);
    auto rhs_pos = rhs.find(lcs);

    // the common substring has to fit completely into both inputs
    BOOST_ASSERT(lhs_pos + lcs.size() <= lhs.size());
    BOOST_ASSERT(rhs_pos + lcs.size() <= rhs.size());

    // prefixes: everything in front of the common substring (empty view if it starts at 0)
    auto lhs_prefix = (lhs_pos > 0) ? lhs.substr(0, lhs_pos) : StringView();
    auto rhs_prefix = (rhs_pos > 0) ? rhs.substr(0, rhs_pos) : StringView();

    // suffixes: everything behind the common substring
    auto lhs_suffix = lhs.substr(lhs_pos + lcs.size());
    auto rhs_suffix = rhs.substr(rhs_pos + lcs.size());

    return std::make_tuple(trim(lhs_prefix), trim(lhs_suffix), trim(rhs_prefix), trim(rhs_suffix));
}
// Example #2
// 0
  void EnzymaticDigestion::digestUnmodifiedString(const StringView sequence, std::vector<StringView>& output, Size min_length, Size max_length) const
  {
    // initialization
    output.clear();

    // naive cleavage sites
    std::vector<Size> pep_positions = tokenize_(sequence.getString());
    Size count = pep_positions.size();

    // disable max length filter by setting to maximum length
    if (max_length == 0)
    {
      max_length = sequence.size();
    }

    // no cleavage sites? return full string
    if (count == 0) 
    {
      if (sequence.size() >= min_length && sequence.size() <= max_length)
      {
        output.push_back(sequence);
      }
      return;
    }

    for (Size i = 1; i != count; ++i)
    {
      // add if cleavage product larger then min length
      Size l = pep_positions[i] - pep_positions[i - 1];
      if (l >= min_length && l <= max_length)
      {
        output.push_back(sequence.substr(pep_positions[i - 1], pep_positions[i] - 1));
      }
    }

    // add last cleavage product (need to add because end is not a cleavage site) if larger then min length
    Size l = sequence.size() - pep_positions[count - 1];
    if (l >= min_length && l <= max_length)
    {
      output.push_back(sequence.substr(pep_positions[count - 1], sequence.size() - 1));
    }

    // generate fragments with missed cleavages
    for (Size i = 1; ((i <= missed_cleavages_) && (i < count)); ++i)
    {
      for (Size j = 1; j < count - i; ++j)
      {
        Size l = pep_positions[j + i] - pep_positions[j - 1];
        if (l >= min_length && l <= max_length)
        {
          output.push_back(sequence.substr(pep_positions[j - 1], pep_positions[j + i] - 1));
        }
      }

      // add last cleavage product (need to add because end is not a cleavage site)
      Size l = sequence.size() - pep_positions[count - i - 1];
      if (l >= min_length && l <= max_length)
      {
        output.push_back(sequence.substr(pep_positions[count - i - 1], sequence.size() - 1 ));
      }
    }
  }