static FORCE_INLINE __m256i lookup_double_AVX2(const int16_t *VXFull, const int16_t *VYFull, const PixelType *pref, int w, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    // Sign-extend eight int16 motion vector components to int32 and halve them.
    __m256i vx = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)&VXFull[w]));
    vx = _mm256_srai_epi32(vx, 1);
    __m256i vy = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)&VYFull[w]));
    vy = _mm256_srai_epi32(vy, 1);

    // madd_epi16 stands in for the slower mullo_epi32: with the pitch stored in
    // the low 16 bits of each dword (high 16 bits zero) and vy sign-extended
    // from 16 bits, each lane comes out as vy * ref_pitch.
    __m256i addr = _mm256_madd_epi16(vy, dwords_ref_pitch);
    addr = _mm256_add_epi32(addr, vx);
    addr = _mm256_add_epi32(addr, dwords_hoffsets);

    // It's okay to read two or three bytes more than needed. pref is always
    // padded, unless the user chooses a horizontal padding of 0, which would
    // be stupid.
    __m256i gathered = _mm256_i32gather_epi32((const int *)pref, addr, sizeof(PixelType));
    // Each 32-bit gather element drags in neighbouring bytes; mask down to a
    // single PixelType.
    gathered = _mm256_and_si256(gathered, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    return gathered;
}
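// Why _mm256_madd_epi16 works as the row-offset multiply above: with the
// multiplier in the low 16 bits of each dword (high 16 bits zero) and vy
// sign-extended from 16 bits, the pairwise multiply-add collapses to a full
// 32-bit product per lane. A standalone demonstration (hypothetical snippet,
// not from the original source); requires 0 <= pitch < 32768.
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    __m256i vy = _mm256_setr_epi32(-3, -2, -1, 0, 1, 2, 3, 4); // sign-extended int16 values
    __m256i pitch = _mm256_set1_epi32(1024);                   // pitch in low 16 bits, high 16 bits zero
    // lo16(vy) * lo16(pitch) + hi16(vy) * hi16(pitch); the second term is
    // always zero, and lo16(vy) is the original int16, so this is vy * 1024.
    __m256i prod = _mm256_madd_epi16(vy, pitch);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, prod);
    for (int i = 0; i < 8; i++)
        printf("%d\n", out[i]); // prints -3072 -2048 -1024 0 1024 2048 3072 4096
    return 0;
}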
void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) {
  __m128i src16[8];
  __m256i src32[8];

  // Load eight rows of eight 16-bit residuals.
  src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
  src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
  src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));

  // Sign-extend each row to 32 bits so the butterflies cannot overflow on
  // high-bitdepth input.
  src32[0] = _mm256_cvtepi16_epi32(src16[0]);
  src32[1] = _mm256_cvtepi16_epi32(src16[1]);
  src32[2] = _mm256_cvtepi16_epi32(src16[2]);
  src32[3] = _mm256_cvtepi16_epi32(src16[3]);
  src32[4] = _mm256_cvtepi16_epi32(src16[4]);
  src32[5] = _mm256_cvtepi16_epi32(src16[5]);
  src32[6] = _mm256_cvtepi16_epi32(src16[6]);
  src32[7] = _mm256_cvtepi16_epi32(src16[7]);

  // Two butterfly passes complete the 2-D 8x8 Hadamard transform.
  highbd_hadamard_col8_avx2(src32, 0);
  highbd_hadamard_col8_avx2(src32, 1);

  // Write out the eight transformed rows.
  _mm256_storeu_si256((__m256i *)coeff, src32[0]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[1]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[2]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[3]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[4]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[5]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[6]);
  coeff += 8;
  _mm256_storeu_si256((__m256i *)coeff, src32[7]);
}
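// Conceptual scalar reference for one pass of the transform above: each call
// to highbd_hadamard_col8_avx2 (not shown here) applies an 8-point
// Walsh-Hadamard butterfly network. Illustrative sketch only, assuming
// natural (Hadamard) output order; the libvpx helper processes eight lanes
// at once and orders its butterflies differently.
#include <stdint.h>

static void hadamard8_1d(int32_t v[8]) {
    // Three stages of sum/difference butterflies with doubling stride.
    for (int span = 1; span < 8; span <<= 1) {
        for (int i = 0; i < 8; i += span << 1) {
            for (int j = i; j < i + span; j++) {
                int32_t a = v[j];
                int32_t b = v[j + span];
                v[j] = a + b;
                v[j + span] = a - b;
            }
        }
    }
}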
__m256i test_mm256_cvtepi16_epi32(__m128i a) {
  // CHECK: @llvm.x86.avx2.pmovsxwd
  return _mm256_cvtepi16_epi32(a);
}
__m256i test_mm256_cvtepi16_epi32(__m128i a) {
  // CHECK-LABEL: test_mm256_cvtepi16_epi32
  // CHECK: sext <8 x i16> %{{.*}} to <8 x i32>
  return _mm256_cvtepi16_epi32(a);
}
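// The two tests pin down the same lowering from different clang eras: the
// x86-specific pmovsxwd intrinsic call versus the generic sext of <8 x i16>
// to <8 x i32>. A quick runtime check of that sign-extension behavior
// (hypothetical snippet, not part of the test file):
#include <immintrin.h>
#include <stdio.h>

int main(void) {
    short in[8] = { -32768, -1, 0, 1, 2, 3, 4, 32767 };
    int out[8];
    _mm256_storeu_si256((__m256i *)out,
                        _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)in)));
    for (int i = 0; i < 8; i++)
        printf("%d\n", out[i]); // prints -32768 -1 0 1 2 3 4 32767: signs preserved
    return 0;
}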
static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
  // Widen the low and high eight 16-bit partial sums to 32 bits and fold them
  // together, so further accumulation cannot overflow 16 bits.
  const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
  const __m256i sum_hi = _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
  return _mm256_add_epi32(sum_lo, sum_hi);
}
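// The folded result still holds eight 32-bit lanes; callers typically finish
// with a horizontal reduction to a scalar. One way to write that reduction
// (hsum_epi32_avx2 is a hypothetical helper, not part of the original source):
#include <immintrin.h>
#include <stdint.h>

static int32_t hsum_epi32_avx2(__m256i v) {
    // Add the high 128-bit half onto the low half, then reduce 4 -> 2 -> 1.
    __m128i s = _mm_add_epi32(_mm256_castsi256_si128(v),
                              _mm256_extractf128_si256(v, 1));
    s = _mm_add_epi32(s, _mm_srli_si128(s, 8));
    s = _mm_add_epi32(s, _mm_srli_si128(s, 4));
    return _mm_cvtsi128_si32(s);
}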