static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int height) { const int pre_step = pre_stride - 4; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); do { const __m128i v_p_b = xx_loadl_32(pre + n); const __m128i v_m_d = xx_load_128(mask + n); const __m128i v_w_d = xx_load_128(wsrc + n); const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d); const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d); const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d); // Rounded absolute difference const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12); v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d); n += 4; if (n % 4 == 0) pre += pre_step; } while (n < 4 * height); return xx_hsum_epi32_si32(v_sad_d); }
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8, const int pre_stride, const int32_t *wsrc, const int32_t *mask, const int width, const int height) { const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8); const int pre_step = pre_stride - width; int n = 0; __m128i v_sad_d = _mm_setzero_si128(); assert(width >= 8); assert(IS_POWER_OF_TWO(width)); do { const __m128i v_p1_w = xx_loadl_64(pre + n + 4); const __m128i v_m1_d = xx_load_128(mask + n + 4); const __m128i v_w1_d = xx_load_128(wsrc + n + 4); const __m128i v_p0_w = xx_loadl_64(pre + n); const __m128i v_m0_d = xx_load_128(mask + n); const __m128i v_w0_d = xx_load_128(wsrc + n); const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w); const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w); // Values in both pre and mask fit in 15 bits, and are packed at 32 bit // boundaries. We use pmaddwd, as it has lower latency on Haswell // than pmulld but produces the same result with these inputs. const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d); const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d); const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d); const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d); const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d); const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d); // Rounded absolute difference const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12); const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12); v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d); v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d); n += 8; if (n % width == 0) pre += pre_step; } while (n < width * height); return xx_hsum_epi32_si32(v_sad_d); }
/**
 * Sum of squares of n int16 values, where n is a multiple of 64 (asserted).
 *
 * The input is consumed in chunks of 64 values.  Within a chunk the squares
 * are pair-summed by pmaddwd and accumulated in 32-bit lanes; the four lanes
 * are then zero-extended and added to two 64-bit accumulators (one for the
 * even lanes, one for the odd lanes).  Returns the grand total; an n of 0
 * yields 0 because the loop body is never entered.
 */
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
  // Keeps the low 32 bits of every 64-bit lane.
  const __m128i v_lo32_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  const int16_t *const stop = src + n;
  __m128i v_even_q = _mm_setzero_si128();
  __m128i v_odd_q = _mm_setzero_si128();

  assert(n % 64 == 0);

  for (; src < stop; src += 64) {
    __m128i v_chunk_d = _mm_setzero_si128();
    int k;

    // Eight vectors of eight values each make up one chunk of 64.
    for (k = 0; k < 64; k += 8) {
      const __m128i v_x_w = xx_load_128(src + k);
      v_chunk_d = _mm_add_epi32(v_chunk_d, _mm_madd_epi16(v_x_w, v_x_w));
    }

    // Widen: even 32-bit lanes by masking, odd lanes by a logical shift.
    v_even_q =
        _mm_add_epi64(v_even_q, _mm_and_si128(v_chunk_d, v_lo32_mask_q));
    v_odd_q = _mm_add_epi64(v_odd_q, _mm_srli_epi64(v_chunk_d, 32));
  }

  // Fold the two accumulators, then the two 64-bit lanes, into lane 0.
  v_even_q = _mm_add_epi64(v_even_q, v_odd_q);
  v_even_q = _mm_add_epi64(v_even_q, _mm_srli_si128(v_even_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_even_q);
#else
  {
    // No 64-bit register move in this configuration: go through memory.
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_even_q);
    return tmp;
  }
#endif
}
/** * See av1_wedge_sign_from_residuals_c */ int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { int64_t acc; __m128i v_sign_d; __m128i v_acc0_d = _mm_setzero_si128(); __m128i v_acc1_d = _mm_setzero_si128(); __m128i v_acc_q; // Input size limited to 8192 by the use of 32 bit accumulators and m // being between [0, 64]. Overflow might happen at larger sizes, // though it is practically impossible on real video input. assert(N < 8192); assert(N % 64 == 0); do { const __m128i v_m01_b = xx_load_128(m); const __m128i v_m23_b = xx_load_128(m + 16); const __m128i v_m45_b = xx_load_128(m + 32); const __m128i v_m67_b = xx_load_128(m + 48); const __m128i v_d0_w = xx_load_128(ds); const __m128i v_d1_w = xx_load_128(ds + 8); const __m128i v_d2_w = xx_load_128(ds + 16); const __m128i v_d3_w = xx_load_128(ds + 24); const __m128i v_d4_w = xx_load_128(ds + 32); const __m128i v_d5_w = xx_load_128(ds + 40); const __m128i v_d6_w = xx_load_128(ds + 48); const __m128i v_d7_w = xx_load_128(ds + 56); const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, 
v_m6_w); const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); ds += 64; m += 64; N -= 64; } while (N); v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); #if ARCH_X86_64 acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q); #else xx_storel_64(&acc, v_acc_q); #endif return acc > limit; }
/** * See av1_wedge_sse_from_residuals_c */ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N) { int n = -N; int n8 = n + 8; uint64_t csse; const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); __m128i v_acc0_q = _mm_setzero_si128(); assert(N % 64 == 0); r1 += N; d += N; m += N; do { const __m128i v_r0_w = xx_load_128(r1 + n); const __m128i v_r1_w = xx_load_128(r1 + n8); const __m128i v_d0_w = xx_load_128(d + n); const __m128i v_d1_w = xx_load_128(d + n8); const __m128i v_m01_b = xx_load_128(m + n); const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w); const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w); const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w); const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w); const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w); const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w); const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w); const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w); const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w); const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w); const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w); const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w); const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d); const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d); const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w); const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w); const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q), _mm_srli_epi64(v_sq0_d, 32)); const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q), _mm_srli_epi64(v_sq1_d, 32)); v_acc0_q = _mm_add_epi64(v_acc0_q, 
v_sum0_q); v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q); n8 += 16; n += 16; } while (n); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); #if ARCH_X86_64 csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q); #else xx_storel_64(&csse, v_acc0_q); #endif return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); }
/** * av1_wedge_compute_delta_squares_c */ void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a, const int16_t *b, int N) { const __m128i v_neg_w = _mm_set_epi16(0xffff, 0, 0xffff, 0, 0xffff, 0, 0xffff, 0); assert(N % 64 == 0); do { const __m128i v_a0_w = xx_load_128(a); const __m128i v_b0_w = xx_load_128(b); const __m128i v_a1_w = xx_load_128(a + 8); const __m128i v_b1_w = xx_load_128(b + 8); const __m128i v_a2_w = xx_load_128(a + 16); const __m128i v_b2_w = xx_load_128(b + 16); const __m128i v_a3_w = xx_load_128(a + 24); const __m128i v_b3_w = xx_load_128(b + 24); const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w); const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w); const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w); const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w); const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w); const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w); const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w); const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w); // Negate top word of pairs const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w); const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w); const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w); const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w); const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w); const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w); const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w); const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w); const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w); const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w); const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w); const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w); const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w); const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w); const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w); const 
__m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w); const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w); const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w); const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w); const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w); xx_store_128(d, v_r0_w); xx_store_128(d + 8, v_r1_w); xx_store_128(d + 16, v_r2_w); xx_store_128(d + 24, v_r3_w); a += 32; b += 32; d += 32; N -= 32; } while (N); }
/**
 * Builds an 8-bit compound mask from two 8-bit sources.
 *
 * The source bytes are widened to 16 bits, passed to calc_mask_avx2 (defined
 * elsewhere) together with a base value of 38 - mb, and the 16-bit results
 * are narrowed back to bytes with unsigned saturation.  mb is
 * AOM_BLEND_A64_MAX_ALPHA when mask_type is DIFFWTD_38_INV and 0 otherwise.
 *
 * mask     Output; rows are stored back to back (row pitch equals w).
 * src0/1   Source rows, stride0 / stride1 bytes apart.
 * h, w     Block height and width.  Widths 4 and 8 are processed 4 rows per
 *          iteration, width 16 is processed 2 rows per iteration, and every
 *          other width is processed one row at a time in 32-byte chunks.
 *          Each loop body runs at least once.
 */
void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
                                          DIFFWTD_MASK_TYPE mask_type,
                                          const uint8_t *src0, int stride0,
                                          const uint8_t *src1, int stride1,
                                          int h, int w) {
  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
  const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
  int row = 0;

  switch (w) {
    case 4:
      do {
        // Gather four 4-byte rows of each source into one 16-byte vector.
        const __m128i a0 = xx_loadl_32(src0);
        const __m128i a1 = xx_loadl_32(src0 + stride0);
        const __m128i a2 = xx_loadl_32(src0 + stride0 * 2);
        const __m128i a3 = xx_loadl_32(src0 + stride0 * 3);
        const __m128i a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(a0, a1),
                                                 _mm_unpacklo_epi32(a2, a3));
        const __m256i a_w = _mm256_cvtepu8_epi16(a0123);

        const __m128i b0 = xx_loadl_32(src1);
        const __m128i b1 = xx_loadl_32(src1 + stride1);
        const __m128i b2 = xx_loadl_32(src1 + stride1 * 2);
        const __m128i b3 = xx_loadl_32(src1 + stride1 * 3);
        const __m128i b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(b0, b1),
                                                 _mm_unpacklo_epi32(b2, b3));
        const __m256i b_w = _mm256_cvtepu8_epi16(b0123);

        const __m256i m_w = calc_mask_avx2(y_mask_base, a_w, b_w);
        // packus works per 128-bit half; the permute brings the two useful
        // 8-byte groups together in the low 128 bits.
        const __m256i m_b = _mm256_packus_epi16(m_w, _mm256_setzero_si256());
        const __m128i out =
            _mm256_castsi256_si128(_mm256_permute4x64_epi64(m_b, 0xd8));
        xx_storeu_128(mask, out);

        src0 += (stride0 << 2);
        src1 += (stride1 << 2);
        mask += 16;
        row += 4;
      } while (row < h);
      break;

    case 8:
      do {
        const __m128i a0 = xx_loadl_64(src0);
        const __m128i a1 = xx_loadl_64(src0 + stride0);
        const __m128i a2 = xx_loadl_64(src0 + stride0 * 2);
        const __m128i a3 = xx_loadl_64(src0 + stride0 * 3);
        const __m128i b0 = xx_loadl_64(src1);
        const __m128i b1 = xx_loadl_64(src1 + stride1);
        const __m128i b2 = xx_loadl_64(src1 + stride1 * 2);
        const __m128i b3 = xx_loadl_64(src1 + stride1 * 3);

        // Pair rows (0, 2) and (1, 3) so that the per-half packus below
        // emits rows 0, 1, 2, 3 in order without a permute.
        const __m256i a02_w =
            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(a0, a2));
        const __m256i a13_w =
            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(a1, a3));
        const __m256i b02_w =
            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(b0, b2));
        const __m256i b13_w =
            _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(b1, b3));

        const __m256i m02_w = calc_mask_avx2(y_mask_base, a02_w, b02_w);
        const __m256i m13_w = calc_mask_avx2(y_mask_base, a13_w, b13_w);
        yy_storeu_256(mask, _mm256_packus_epi16(m02_w, m13_w));

        src0 += stride0 << 2;
        src1 += stride1 << 2;
        mask += 32;
        row += 4;
      } while (row < h);
      break;

    case 16:
      do {
        const __m128i a0 = xx_load_128(src0);
        const __m128i a1 = xx_load_128(src0 + stride0);
        const __m128i b0 = xx_load_128(src1);
        const __m128i b1 = xx_load_128(src1 + stride1);

        const __m256i a0_w = _mm256_cvtepu8_epi16(a0);
        const __m256i a1_w = _mm256_cvtepu8_epi16(a1);
        const __m256i b0_w = _mm256_cvtepu8_epi16(b0);
        const __m256i b1_w = _mm256_cvtepu8_epi16(b1);

        const __m256i m0_w = calc_mask_avx2(y_mask_base, a0_w, b0_w);
        const __m256i m1_w = calc_mask_avx2(y_mask_base, a1_w, b1_w);

        // Undo the per-half interleaving introduced by packus.
        const __m256i m_b =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(m0_w, m1_w), 0xd8);
        yy_storeu_256(mask, m_b);

        src0 += stride0 << 1;
        src1 += stride1 << 1;
        mask += 32;
        row += 2;
      } while (row < h);
      break;

    default:
      do {
        int col = 0;

        // One row at a time, 32 source bytes per step.
        do {
          const __m256i a = yy_loadu_256(src0 + col);
          const __m256i b = yy_loadu_256(src1 + col);

          const __m256i a_lo_w =
              _mm256_cvtepu8_epi16(_mm256_castsi256_si128(a));
          const __m256i b_lo_w =
              _mm256_cvtepu8_epi16(_mm256_castsi256_si128(b));
          const __m256i a_hi_w =
              _mm256_cvtepu8_epi16(_mm256_extracti128_si256(a, 1));
          const __m256i b_hi_w =
              _mm256_cvtepu8_epi16(_mm256_extracti128_si256(b, 1));

          const __m256i m_lo_w = calc_mask_avx2(y_mask_base, a_lo_w, b_lo_w);
          const __m256i m_hi_w = calc_mask_avx2(y_mask_base, a_hi_w, b_hi_w);

          // Undo the per-half interleaving introduced by packus.
          const __m256i m_b = _mm256_permute4x64_epi64(
              _mm256_packus_epi16(m_lo_w, m_hi_w), 0xd8);
          yy_storeu_256(mask + col, m_b);

          col += 32;
        } while (col < w);

        src0 += stride0;
        src1 += stride1;
        mask += w;
        row += 1;
      } while (row < h);
      break;
  }
}