// Computes and returns the dot product of the n-vectors u and v. // Uses Intel SSE intrinsics to access the SIMD instruction set. static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) { int max_offset = n - 8; int offset = 0; // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit // values, extending to 16 bit, multiplying to make 32 bit results. int32_t result = 0; if (offset <= max_offset) { offset = 8; __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u)); __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v)); __m128i sum = _mm_cvtepi8_epi16(packed1); packed2 = _mm_cvtepi8_epi16(packed2); // The magic _mm_add_epi16 is perfect here. It multiplies 8 pairs of 16 bit // ints to make 32 bit results, which are then horizontally added in pairs // to make 4 32 bit results that still fit in a 128 bit register. sum = _mm_madd_epi16(sum, packed2); while (offset <= max_offset) { packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset)); packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset)); offset += 8; packed1 = _mm_cvtepi8_epi16(packed1); packed2 = _mm_cvtepi8_epi16(packed2); packed1 = _mm_madd_epi16(packed1, packed2); sum = _mm_add_epi32(sum, packed1); } // Sum the 4 packed 32 bit sums and extract the low result. sum = _mm_hadd_epi32(sum, sum); sum = _mm_hadd_epi32(sum, sum); result = _mm_cvtsi128_si32(sum); } while (offset < n) { result += u[offset] * v[offset]; ++offset; } return result; }
__m128i kvz_eight_tap_filter_x4_and_flip_16bit(__m128i *data0, __m128i *data1, __m128i *data2, __m128i *data3, __m128i *filter) { __m128i a, b, c, d; __m128i fir = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(filter))); a = _mm_madd_epi16(*data0, fir); b = _mm_madd_epi16(*data1, fir); a = _mm_hadd_epi32(a, b); c = _mm_madd_epi16(*data2, fir); d = _mm_madd_epi16(*data3, fir); c = _mm_hadd_epi32(c, d); a = _mm_hadd_epi32(a, c); return a; }
// Codegen test for _mm_cvtepi8_epi16: the function simply forwards its
// argument to the intrinsic. The directive comments inside the body are
// matched against compiler output and must be kept verbatim; they expect a
// call to the llvm.x86.sse41.pmovsxbw intrinsic in the IR and a pmovsxbw
// instruction operating on xmm registers in the assembly.
__m128i test_mm_cvtepi8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi8_epi16
  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> {{.*}})
  // CHECK-ASM: pmovsxbw %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepi8_epi16(a);
}
// Codegen test for _mm_cvtepi8_epi16: the function simply forwards its
// argument to the intrinsic. The directive comments inside the body are
// matched against compiler output and must be kept verbatim; they expect
// generic IR -- a shufflevector selecting lanes 0..7 of the <16 x i8> input,
// followed by a sext from <8 x i8> to <8 x i16>.
// NOTE(review): this file contains another definition with the same function
// name; presumably the two are alternative revisions of the same test --
// confirm they are never compiled in one translation unit.
__m128i test_mm_cvtepi8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi8_epi16
  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: sext <8 x i8> {{.*}} to <8 x i16>
  return _mm_cvtepi8_epi16(a);
}