// Gathers eight reference pixels displaced by time-scaled motion vectors.
//
// Reads eight 16-bit vector components from VXFull[w..w+7] and VYFull[w..w+7],
// multiplies each by dwords_time256 and arithmetic-shifts the product right by 8,
// then builds the per-lane element offset
//     vy * ref_pitch + vx + hoffset
// and gathers from pref at those offsets (scaled by sizeof(PixelType)).
//
// Returns eight 32-bit lanes, each holding one gathered pixel with every bit
// above sizeof(PixelType) * 8 cleared.
//
// NOTE(review): PixelType is not declared in this block; presumably it is a
// template parameter or typedef from the enclosing scope - confirm.
static FORCE_INLINE __m256i lookup_AVX2(const int16_t *VXFull, const int16_t *VYFull, const PixelType *pref, int w, const __m256i &dwords_time256, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    // Zero-extension is deliberate: _mm256_madd_epi16 reinterprets each 32-bit
    // lane as a pair of signed 16-bit values, so the low half keeps its sign
    // and the zeroed high half contributes nothing to the sum.
    __m256i vx = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)&VXFull[w]));
    __m256i vy = _mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)&VYFull[w]));

    vx = _mm256_madd_epi16(vx, dwords_time256);
    vx = _mm256_srai_epi32(vx, 8);

    vy = _mm256_madd_epi16(vy, dwords_time256);
    vy = _mm256_srai_epi32(vy, 8);

    // Row offset. This also goes through the 16-bit multiply-add, so it
    // presumably relies on vy and the pitch both fitting in a signed 16-bit
    // value - verify against the callers.
    __m256i addr = _mm256_madd_epi16(vy, dwords_ref_pitch);
    addr = _mm256_add_epi32(addr, vx);
    addr = _mm256_add_epi32(addr, dwords_hoffsets);

    // It's okay to read two or three bytes more than needed. pref is always padded,
    // unless the user chooses a horizontal padding of 0, which would be stupid.
    __m256i gathered = _mm256_i32gather_epi32((const int *)pref, addr, sizeof(PixelType));

    // Each gathered dword carries neighbouring bytes; keep only the pixel itself.
    gathered = _mm256_and_si256(gathered, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    return gathered;
}
// Codegen test wrapper: forwards `a` to _mm256_cvtepu16_epi32 and returns the
// result unchanged. The directive inside the body names the pattern expected
// in the emitted IR (the AVX2 pmovzxwd intrinsic).
// NOTE(review): a second function with this exact name exists in this file -
// confirm which copy is meant to stay.
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
  // CHECK: @llvm.x86.avx2.pmovzxwd
  return _mm256_cvtepu16_epi32(a);
}
// Codegen test wrapper: forwards `a` to _mm256_cvtepu16_epi32 and returns the
// result unchanged. The directives inside the body first anchor on this
// function's name, then expect a zero-extension from <8 x i16> to <8 x i32>
// in the emitted IR.
// NOTE(review): a second function with this exact name exists in this file -
// confirm which copy is meant to stay.
__m256i test_mm256_cvtepu16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm256_cvtepu16_epi32
  // CHECK: zext <8 x i16> {{.*}} to <8 x i32>
  return _mm256_cvtepu16_epi32(a);
}