size_t qfind_first_byte_of_sse42(const StringPiece& haystack, const StringPiece& needles) { if (UNLIKELY(needles.empty() || haystack.empty())) { return StringPiece::npos; } else if (needles.size() <= 16) { // we can save some unnecessary load instructions by optimizing for // the common case of needles.size() <= 16 return qfind_first_byte_of_needles16(haystack, needles); } if (haystack.size() < 16 && PAGE_FOR(haystack.end() - 1) != PAGE_FOR(haystack.data() + 16)) { // We can't safely SSE-load haystack. Use a different approach. if (haystack.size() <= 2) { return qfind_first_of(haystack, needles, asciiCaseSensitive); } return qfind_first_byte_of_byteset(haystack, needles); } auto ret = scanHaystackBlock<false>(haystack, needles, 0); if (ret != StringPiece::npos) { return ret; } size_t i = nextAlignedIndex(haystack.data()); for (; i < haystack.size(); i += 16) { auto ret = scanHaystackBlock<true>(haystack, needles, i); if (ret != StringPiece::npos) { return ret; } } return StringPiece::npos; }
// Scans a single 16-byte block of the haystack, starting at blockStartIdx,
// against every needle character. Returns the absolute index (within the
// haystack) of the first match inside this block, or std::string::npos if
// the block contains no needle byte.
//
// HAYSTACK_ALIGNED is presumably a bool template parameter declared outside
// this view (the caller instantiates scanHaystackBlock<false>/<true>); it
// selects an aligned vs. unaligned 16-byte load of the haystack block.
size_t scanHaystackBlock(
    const StringPieceLite haystack,
    const StringPieceLite needles,
    uint64_t blockStartIdx) {
  // needles.size() <= 16 should be handled by the *needles16() method.
  DCHECK_GT(needles.size(), 16u);
  // The 16-byte haystack load below must not fault: either the block lies
  // fully inside the haystack, or the entire 16-byte window stays on one
  // page (so reading past the end only touches mapped memory).
  DCHECK(
      blockStartIdx + 16 <= haystack.size() ||
      (page_for(haystack.data() + blockStartIdx) ==
       page_for(haystack.data() + blockStartIdx + 15)));

  __m128i arr1;
  if (HAYSTACK_ALIGNED) {
    arr1 = _mm_load_si128(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  } else {
    arr1 = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(haystack.data() + blockStartIdx));
  }

  // This load is safe because needles.size() > 16 (DCHECK above).
  auto arr2 = _mm_loadu_si128(
      reinterpret_cast<const __m128i*>(needles.data()));
  // PCMPESTRI mode 0 (unsigned bytes, equal-any, least-significant index):
  // b is the offset of the first haystack byte matching any of the first
  // 16 needles, or 16 if this block has no match against them.
  size_t b = _mm_cmpestri(
      arr2, 16, arr1, int(haystack.size() - blockStartIdx), 0);

  // Process the remaining needles 16 at a time with aligned loads. The
  // unaligned prefix was covered by the load above: nextAlignedIndex of
  // needles.data() is at most 16, so no needle byte is skipped.
  size_t j = nextAlignedIndex(needles.data());
  for (; j < needles.size(); j += 16) {
    arr2 = _mm_load_si128(
        reinterpret_cast<const __m128i*>(needles.data() + j));
    auto index = _mm_cmpestri(
        arr2,
        int(needles.size() - j),
        arr1,
        int(haystack.size() - blockStartIdx),
        0);
    // Keep the earliest match position across all needle chunks.
    b = std::min<size_t>(index, b);
  }

  if (b < 16) {
    return blockStartIdx + b;
  }
  return std::string::npos;
}
// helper method for case where needles.size() <= 16 size_t qfind_first_byte_of_needles16(const StringPieceLite haystack, const StringPieceLite needles) { DCHECK_GT(haystack.size(), 0u); DCHECK_GT(needles.size(), 0u); DCHECK_LE(needles.size(), 16u); if ((needles.size() <= 2 && haystack.size() >= 256) || // must bail if we can't even SSE-load a single segment of haystack (haystack.size() < 16 && page_for(haystack.end() - 1) != page_for(haystack.data() + 15)) || // can't load needles into SSE register if it could cross page boundary page_for(needles.end() - 1) != page_for(needles.data() + 15)) { return detail::qfind_first_byte_of_nosse(haystack, needles); } auto arr2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(needles.data())); // do an unaligned load for first block of haystack auto arr1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>(haystack.data())); auto index = _mm_cmpestri(arr2, int(needles.size()), arr1, int(haystack.size()), 0); if (index < 16) { return index; } // Now, we can do aligned loads hereafter... size_t i = nextAlignedIndex(haystack.data()); for (; i < haystack.size(); i+= 16) { arr1 = _mm_load_si128(reinterpret_cast<const __m128i*>(haystack.data() + i)); index = _mm_cmpestri( arr2, int(needles.size()), arr1, int(haystack.size() - i), 0); if (index < 16) { return i + index; } } return std::string::npos; }
inline size_t scanHaystackBlock(const StringPiece& haystack, const StringPiece& needles, int64_t blockStartIdx) { DCHECK_GT(needles.size(), 16); // should handled by *needles16() method DCHECK(blockStartIdx + 16 <= haystack.size() || (PAGE_FOR(haystack.data() + blockStartIdx) == PAGE_FOR(haystack.data() + blockStartIdx + 15))); __v16qi arr1; if (HAYSTACK_ALIGNED) { void* ptr1 = __builtin_assume_aligned(haystack.data() + blockStartIdx, 16); arr1 = *reinterpret_cast<const __v16qi*>(ptr1); } else { arr1 = __builtin_ia32_loaddqu(haystack.data() + blockStartIdx); } // This load is safe because needles.size() >= 16 auto arr2 = __builtin_ia32_loaddqu(needles.data()); size_t b = __builtin_ia32_pcmpestri128( arr2, 16, arr1, haystack.size() - blockStartIdx, 0); size_t j = nextAlignedIndex(needles.data()); for (; j < needles.size(); j += 16) { void* ptr2 = __builtin_assume_aligned(needles.data() + j, 16); arr2 = *reinterpret_cast<const __v16qi*>(ptr2); auto index = __builtin_ia32_pcmpestri128( arr2, needles.size() - j, arr1, haystack.size() - blockStartIdx, 0); b = std::min<size_t>(index, b); } if (b < 16) { return blockStartIdx + b; } return StringPiece::npos; }