static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); assert(width >= 8); assert(IS_POWER_OF_TWO(width)); do { const __m128i v_p1_w = xx_loadl_64(pre + n + 4); const __m128i v_m1_d = xx_load_128(mask + n + 4); const __m128i v_w1_d = xx_load_128(wsrc + n + 4); const __m128i v_p0_w = xx_loadl_64(pre + n); const __m128i v_m0_d = xx_load_128(mask + n); const __m128i v_w0_d = xx_load_128(wsrc + n); const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); // Rounded absolute difference const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); n += 8; if (n % width == 0) pre += pre_step; } while (n < width * height); return xx_hsum_epi32_si32(v_sad_d); }
static INLINE void build_compound_diffwtd_mask_d16_inv_avx2( uint8_t *mask, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, int shift) { const int mask_base = 38; const __m256i _r = _mm256_set1_epi16((1 << shift) >> 1); const __m256i y38 = _mm256_set1_epi16(mask_base); const __m256i y64 = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); int i = 0; if (w == 4) { do { const __m128i s0A = xx_loadl_64(src0); const __m128i s0B = xx_loadl_64(src0 + src0_stride); const __m128i s0C = xx_loadl_64(src0 + src0_stride * 2); const __m128i s0D = xx_loadl_64(src0 + src0_stride * 3); const __m128i s1A = xx_loadl_64(src1); const __m128i s1B = xx_loadl_64(src1 + src1_stride); const __m128i s1C = xx_loadl_64(src1 + src1_stride * 2); const __m128i s1D = xx_loadl_64(src1 + src1_stride * 3); const __m256i s0 = yy_set_m128i(_mm_unpacklo_epi64(s0C, s0D), _mm_unpacklo_epi64(s0A, s0B)); const __m256i s1 = yy_set_m128i(_mm_unpacklo_epi64(s1C, s1D), _mm_unpacklo_epi64(s1A, s1B)); const __m256i m16 = calc_mask_d16_inv_avx2(&s0, &s1, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); xx_storeu_128(mask, _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8))); src0 += src0_stride << 2; src1 += src1_stride << 2; mask += 16; i += 4; } while (i < h); } else if (w == 8) { do { const __m256i s0AB = yy_loadu2_128(src0 + src0_stride, src0); const __m256i s0CD = yy_loadu2_128(src0 + src0_stride * 3, src0 + src0_stride * 2); const __m256i s1AB = yy_loadu2_128(src1 + src1_stride, src1); const __m256i s1CD = yy_loadu2_128(src1 + src1_stride * 3, src1 + src1_stride * 2); const __m256i m16AB = calc_mask_d16_inv_avx2(&s0AB, &s1AB, &_r, &y38, &y64, shift); const __m256i m16CD = calc_mask_d16_inv_avx2(&s0CD, &s1CD, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16AB, m16CD); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride << 2; src1 += src1_stride << 2; mask += 32; i += 4; } while (i < h); } else if (w == 16) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + src0_stride); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + src1_stride); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16A, m16B); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride << 1; src1 += src1_stride << 1; mask += 32; i += 2; } while (i < h); } else if (w == 32) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m8 = _mm256_packus_epi16(m16A, m16B); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 32; i += 1; } while (i < h); } else if (w == 64) { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s0C = yy_loadu_256(src0 + 32); const __m256i s0D = yy_loadu_256(src0 + 48); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i s1C = yy_loadu_256(src1 + 32); const __m256i s1D = yy_loadu_256(src1 + 48); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m16C = calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); const __m256i m16D = calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 64; i += 1; } while (i < h); } else { do { const __m256i s0A = yy_loadu_256(src0); const __m256i s0B = yy_loadu_256(src0 + 16); const __m256i s0C = yy_loadu_256(src0 + 32); const __m256i s0D = yy_loadu_256(src0 + 48); const __m256i s0E = yy_loadu_256(src0 + 64); const __m256i s0F = yy_loadu_256(src0 + 80); const __m256i s0G = yy_loadu_256(src0 + 96); const __m256i s0H = yy_loadu_256(src0 + 112); const __m256i s1A = yy_loadu_256(src1); const __m256i s1B = yy_loadu_256(src1 + 16); const __m256i s1C = yy_loadu_256(src1 + 32); const __m256i s1D = yy_loadu_256(src1 + 48); const __m256i s1E = yy_loadu_256(src1 + 64); const __m256i s1F = yy_loadu_256(src1 + 80); const __m256i s1G = yy_loadu_256(src1 + 96); const __m256i s1H = yy_loadu_256(src1 + 112); const __m256i m16A = calc_mask_d16_inv_avx2(&s0A, &s1A, &_r, &y38, &y64, shift); const __m256i m16B = calc_mask_d16_inv_avx2(&s0B, &s1B, &_r, &y38, &y64, shift); const __m256i m16C = calc_mask_d16_inv_avx2(&s0C, &s1C, &_r, &y38, &y64, shift); const __m256i m16D = calc_mask_d16_inv_avx2(&s0D, &s1D, &_r, &y38, &y64, shift); const __m256i m16E = calc_mask_d16_inv_avx2(&s0E, &s1E, &_r, &y38, &y64, shift); const __m256i m16F = calc_mask_d16_inv_avx2(&s0F, &s1F, &_r, &y38, &y64, shift); const __m256i m16G = calc_mask_d16_inv_avx2(&s0G, &s1G, &_r, &y38, &y64, shift); const __m256i m16H = calc_mask_d16_inv_avx2(&s0H, &s1H, &_r, &y38, &y64, shift); const __m256i m8AB = _mm256_packus_epi16(m16A, m16B); const __m256i m8CD = _mm256_packus_epi16(m16C, m16D); const __m256i m8EF = _mm256_packus_epi16(m16E, m16F); const __m256i m8GH = _mm256_packus_epi16(m16G, m16H); yy_storeu_256(mask, _mm256_permute4x64_epi64(m8AB, 0xd8)); yy_storeu_256(mask + 32, _mm256_permute4x64_epi64(m8CD, 0xd8)); yy_storeu_256(mask + 64, _mm256_permute4x64_epi64(m8EF, 0xd8)); yy_storeu_256(mask + 96, _mm256_permute4x64_epi64(m8GH, 0xd8)); src0 += src0_stride; src1 += src1_stride; mask += 128; i += 1; } while (i < h); } }
void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const uint8_t *src0, int stride0, const uint8_t *src1, int stride1, int h, int w) { const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0; const __m256i y_mask_base = _mm256_set1_epi16(38 - mb); int i = 0; if (4 == w) { do { const __m128i s0A = xx_loadl_32(src0); const __m128i s0B = xx_loadl_32(src0 + stride0); const __m128i s0C = xx_loadl_32(src0 + stride0 * 2); const __m128i s0D = xx_loadl_32(src0 + stride0 * 3); const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B); const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D); const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD); const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD); const __m128i s1A = xx_loadl_32(src1); const __m128i s1B = xx_loadl_32(src1 + stride1); const __m128i s1C = xx_loadl_32(src1 + stride1 * 2); const __m128i s1D = xx_loadl_32(src1 + stride1 * 3); const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B); const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D); const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD); const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD); const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w); const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256()); const __m128i x_m8 = _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8)); xx_storeu_128(mask, x_m8); src0 += (stride0 << 2); src1 += (stride1 << 2); mask += 16; i += 4; } while (i < h); } else if (8 == w) { do { const __m128i s0A = xx_loadl_64(src0); const __m128i s0B = xx_loadl_64(src0 + stride0); const __m128i s0C = xx_loadl_64(src0 + stride0 * 2); const __m128i s0D = xx_loadl_64(src0 + stride0 * 3); const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C)); const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D)); const __m128i s1A = xx_loadl_64(src1); const __m128i s1B = xx_loadl_64(src1 + stride1); const __m128i s1C = xx_loadl_64(src1 + stride1 * 2); const __m128i s1D = xx_loadl_64(src1 + stride1 * 3); const __m256i s1AB_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C)); const __m256i s1CD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D)); const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AB_w); const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1CD_w); const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD); yy_storeu_256(mask, m8); src0 += stride0 << 2; src1 += stride1 << 2; mask += 32; i += 4; } while (i < h); } else if (16 == w) { do { const __m128i s0A = xx_load_128(src0); const __m128i s0B = xx_load_128(src0 + stride0); const __m128i s1A = xx_load_128(src1); const __m128i s1B = xx_load_128(src1 + stride1); const __m256i s0AL = _mm256_cvtepu8_epi16(s0A); const __m256i s0BL = _mm256_cvtepu8_epi16(s0B); const __m256i s1AL = _mm256_cvtepu8_epi16(s1A); const __m256i s1BL = _mm256_cvtepu8_epi16(s1B); const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL); const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL); const __m256i m8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8); yy_storeu_256(mask, m8); src0 += stride0 << 1; src1 += stride1 << 1; mask += 32; i += 2; } while (i < h); } else { do { int j = 0; do { const __m256i s0 = yy_loadu_256(src0 + j); const __m256i s1 = yy_loadu_256(src1 + j); const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0)); const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1)); const __m256i s0H = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1)); const __m256i s1H = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1)); const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L); const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H); const __m256i m8 = _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8); yy_storeu_256(mask + j, m8); j += 32; } while (j < w); src0 += stride0; src1 += stride1; mask += w; i += 1; } while (i < h); } }