// Returns the index in `haystack` of the first byte that also appears in
// `needles`, or std::string::npos if there is none. Dispatches between
// several SSE4.2 strategies based on the sizes involved.
size_t qfind_first_byte_of_sse42(
    const StringPieceLite haystack, const StringPieceLite needles) {
  if (UNLIKELY(needles.empty() || haystack.empty())) {
    // Nothing to search, or nothing to search for.
    return std::string::npos;
  } else if (needles.size() <= 16) {
    // we can save some unnecessary load instructions by optimizing for
    // the common case of needles.size() <= 16
    return qfind_first_byte_of_needles16(haystack, needles);
  }

  // A 16-byte SSE load starting at haystack.data() reads bytes
  // [data, data+15], which may extend past haystack.end() when the haystack
  // is shorter than 16 bytes. That over-read is only safe if it cannot cross
  // into a different (potentially unmapped) page.
  // NOTE(review): qfind_first_byte_of_needles16 performs the analogous check
  // with haystack.data() + 15 (the last byte actually loaded); the + 16 here
  // is one byte more conservative — confirm which bound is intended.
  if (haystack.size() < 16 &&
      page_for(haystack.end() - 1) != page_for(haystack.data() + 16)) {
    // We can't safely SSE-load haystack. Use a different approach.
    if (haystack.size() <= 2) {
      // Tiny haystack: a plain scalar scan is cheapest.
      return qfind_first_byte_of_std(haystack, needles);
    }
    return qfind_first_byte_of_byteset(haystack, needles);
  }

  // Scan the first block with an unaligned load (haystack.data() has no
  // alignment guarantee).
  auto ret = scanHaystackBlock<false>(haystack, needles, 0);
  if (ret != std::string::npos) {
    return ret;
  }

  // Continue from the next 16-byte-aligned offset within the haystack so all
  // subsequent blocks can use aligned loads. The first aligned block may
  // overlap bytes already scanned above; that is harmless since we return on
  // the first match in haystack order.
  size_t i = nextAlignedIndex(haystack.data());
  for (; i < haystack.size(); i += 16) {
    ret = scanHaystackBlock<true>(haystack, needles, i);
    if (ret != std::string::npos) {
      return ret;
    }
  }

  return std::string::npos;
}
// Scans a single 16-byte block of `haystack`, beginning at `blockStartIdx`,
// for the first byte that appears anywhere in `needles`; returns the absolute
// index of the match within `haystack`, or std::string::npos if this block
// contains no match. Used only when needles.size() > 16, so the needle set is
// itself processed in 16-byte chunks.
// NOTE(review): references HAYSTACK_ALIGNED, presumably a
// `template <bool HAYSTACK_ALIGNED>` parameter declared just above this
// definition (outside this view) — when true, blockStartIdx is 16-byte
// aligned relative to haystack.data(); confirm against the full file.
size_t scanHaystackBlock(
    const StringPieceLite haystack,
    const StringPieceLite needles,
    uint64_t blockStartIdx) {
  DCHECK_GT(needles.size(), 16u); // should handled by *needles16() method
  // Either a full 16 bytes of haystack remain, or the (partial) over-read
  // stays within one page that contains valid haystack data.
  DCHECK(
      blockStartIdx + 16 <= haystack.size() ||
      (page_for(haystack.data() + blockStartIdx) ==
       page_for(haystack.data() + blockStartIdx + 15)));

  __m128i arr1;
  if (HAYSTACK_ALIGNED) {
    // Aligned load: caller guarantees 16-byte alignment of this block.
    arr1 = _mm_load_si128(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  } else {
    arr1 = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  }

  // This load is safe because needles.size() >= 16
  auto arr2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needles.data()));
  // PCMPESTRI with mode 0 ("equal any", index of least significant match):
  // returns the offset within arr1 of the first haystack byte matching any of
  // the first 16 needle bytes, or 16 if none match.
  size_t b = _mm_cmpestri(
      arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);

  // Process the remaining needle bytes in aligned 16-byte chunks, keeping the
  // smallest (leftmost-in-haystack) match offset found. Chunks may overlap
  // the first 16 needle bytes; that only re-tests needles, which is harmless.
  size_t j = nextAlignedIndex(needles.data());
  for (; j < needles.size(); j += 16) {
    arr2 = _mm_load_si128(reinterpret_cast<const __m128i*>(needles.data() + j));

    auto index = _mm_cmpestri(
        arr2,
        int(needles.size() - j),
        arr1,
        int(haystack.size() - blockStartIdx),
        0);
    b = std::min<size_t>(index, b);
  }

  if (b < 16) {
    // b == 16 means "no match in this block" (PCMPESTRI's sentinel).
    return blockStartIdx + b;
  }
  return std::string::npos;
}
// helper method for case where needles.size() <= 16 size_t qfind_first_byte_of_needles16(const StringPieceLite haystack, const StringPieceLite needles) { DCHECK_GT(haystack.size(), 0u); DCHECK_GT(needles.size(), 0u); DCHECK_LE(needles.size(), 16u); if ((needles.size() <= 2 && haystack.size() >= 256) || // must bail if we can't even SSE-load a single segment of haystack (haystack.size() < 16 && page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) || // can't load needles into SSE register if it could cross page boundary page_for(needles.end() - 1) != page_for(needles.data() + 15)) { return detail::qfind_first_byte_of_nosse(haystack, needles); } auto arr2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(needles.data())); // do an unaligned load for first block of haystack auto arr1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(haystack.data())); auto index = _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0); if (index < 16) { return index; } // Now, we can do aligned loads hereafter... size_t i = nextAlignedIndex(haystack.data()); for (; i < haystack.size(); i+= 16) { arr1 = _mm_load_si128(reinterpret_cast<const __m128i*>(haystack.data() + i)); index = _mm_cmpestri( arr2, int(needles.size()), arr1, int(haystack.size() - i), 0); if (index < 16) { return i + index; } } return std::string::npos; }