// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
static int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
  int max_offset = n - 8;
  int offset = 0;
  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
  // values, extending to 16 bit, multiplying to make 32 bit results.
  int32_t result = 0;
  if (offset <= max_offset) {
    offset = 8;
    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
    __m128i sum = _mm_cvtepi8_epi16(packed1);
    packed2 = _mm_cvtepi8_epi16(packed2);
    // The magic _mm_add_epi16 is perfect here. It multiplies 8 pairs of 16 bit
    // ints to make 32 bit results, which are then horizontally added in pairs
    // to make 4 32 bit results that still fit in a 128 bit register.
    sum = _mm_madd_epi16(sum, packed2);
    while (offset <= max_offset) {
      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
      offset += 8;
      packed1 = _mm_cvtepi8_epi16(packed1);
      packed2 = _mm_cvtepi8_epi16(packed2);
      packed1 = _mm_madd_epi16(packed1, packed2);
      sum = _mm_add_epi32(sum, packed1);
    }
    // Sum the 4 packed 32 bit sums and extract the low result.
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    result = _mm_cvtsi128_si32(sum);
  }
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}
Example #2
0
__m128i kvz_eight_tap_filter_x4_and_flip_16bit(__m128i *data0, __m128i *data1, __m128i *data2, __m128i *data3, __m128i *filter)
{
  __m128i a, b, c, d;
  __m128i fir = _mm_cvtepi8_epi16(_mm_loadu_si128((__m128i*)(filter)));

  a = _mm_madd_epi16(*data0, fir);
  b = _mm_madd_epi16(*data1, fir);
  a = _mm_hadd_epi32(a, b);

  c = _mm_madd_epi16(*data2, fir);
  d = _mm_madd_epi16(*data3, fir);
  c = _mm_hadd_epi32(c, d);

  a = _mm_hadd_epi32(a, c);

  return a;
}
Example #3
0
// Thin wrapper that returns _mm_cvtepi8_epi16(a) unchanged, giving the
// intrinsic a named function of its own.
// NOTE(review): the annotations in the body look like LLVM FileCheck
// directives matched against compiler output for this function; they expect
// a call to the llvm.x86.sse41.pmovsxbw intrinsic in the IR and a pmovsxbw
// instruction in the assembly. Presumably they must stay byte-for-byte
// intact; confirm against the test driver.
__m128i test_mm_cvtepi8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi8_epi16
  // CHECK: call <8 x i16> @llvm.x86.sse41.pmovsxbw(<16 x i8> {{.*}})
  // CHECK-ASM: pmovsxbw %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepi8_epi16(a);
}
Example #4
0
// Thin wrapper that returns _mm_cvtepi8_epi16(a) unchanged, giving the
// intrinsic a named function of its own.
// NOTE(review): the annotations in the body look like LLVM FileCheck
// directives matched against generated IR; they expect a shufflevector that
// selects elements 0-7 of the 16 x i8 input, followed by a sign extension of
// those 8 x i8 values to 8 x i16. Presumably they must stay byte-for-byte
// intact; confirm against the test driver.
__m128i test_mm_cvtepi8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepi8_epi16
  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  // CHECK: sext <8 x i8> {{.*}} to <8 x i16>
  return _mm_cvtepi8_epi16(a);
}