// Fetches eight reference pixels in one AVX2 gather, one per motion vector.
//
// Eight consecutive 16-bit entries are read from VXFull[w..w+7] and
// VYFull[w..w+7]. Each is multiplied by dwords_time256 and arithmetically
// shifted right by 8; the scaled vertical component is then multiplied by
// dwords_ref_pitch and summed with the scaled horizontal component and
// dwords_hoffsets to form a per-lane element index into pref.
//
// Returns eight 32-bit lanes, each holding one pixel in its low
// sizeof(PixelType) * 8 bits with the remaining bits cleared.
static FORCE_INLINE __m256i lookup_AVX2(const int16_t *VXFull, const int16_t *VYFull, const PixelType *pref, int w, const __m256i &dwords_time256, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    const __m128i packed_vx = _mm_loadu_si128((const __m128i *)(VXFull + w));
    const __m128i packed_vy = _mm_loadu_si128((const __m128i *)(VYFull + w));

    // Zero-extension leaves the upper 16-bit half of every dword at 0, so the
    // pairwise multiply-add below only ever contributes the product of the low
    // (signed 16-bit) halves.
    const __m256i wide_vx = _mm256_cvtepu16_epi32(packed_vx);
    const __m256i wide_vy = _mm256_cvtepu16_epi32(packed_vy);

    const __m256i scaled_vx = _mm256_srai_epi32(_mm256_madd_epi16(wide_vx, dwords_time256), 8);
    const __m256i scaled_vy = _mm256_srai_epi32(_mm256_madd_epi16(wide_vy, dwords_time256), 8);

    // NOTE(review): madd_epi16 works on 16-bit halves, so this presumably
    // relies on both the scaled vy and the pitch fitting in 16 signed bits -
    // confirm against the caller.
    const __m256i row_offsets = _mm256_madd_epi16(scaled_vy, dwords_ref_pitch);
    const __m256i offsets = _mm256_add_epi32(_mm256_add_epi32(row_offsets, scaled_vx), dwords_hoffsets);

    // Every lane loads a full 32 bits, i.e. up to three bytes past the pixel
    // that is actually wanted. That is fine because pref is always padded,
    // unless the user picks a horizontal padding of 0.
    const __m256i raw_dwords = _mm256_i32gather_epi32((const int *)pref, offsets, sizeof(PixelType));

    // Discard the neighbouring bytes that came along with each pixel.
    const __m256i pixel_mask = _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1);
    return _mm256_and_si256(raw_dwords, pixel_mask);
}
Beispiel #2
0
// Codegen test wrapper: forwards its argument to _mm256_cvtepu16_epi32 and
// returns the result unchanged. The FileCheck directive inside the body
// expects the compiler output to reference the llvm.x86.avx2.pmovzxwd
// intrinsic.
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
  // CHECK: @llvm.x86.avx2.pmovzxwd
  return _mm256_cvtepu16_epi32(a);
}
// Codegen test wrapper: forwards its argument to _mm256_cvtepu16_epi32 and
// returns the result unchanged. The FileCheck directives inside the body
// first anchor on this function's label in the compiler output and then
// expect a zero-extension from <8 x i16> to <8 x i32>.
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm256_cvtepu16_epi32
  // CHECK: zext <8 x i16> {{.*}} to <8 x i32>
  return _mm256_cvtepu16_epi32(a);
}