Example #1
// input and output are int8_t
static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
                                       const __m128i* const fl) {
    const __m128i k3 = _mm_set1_epi8(3);
    const __m128i k4 = _mm_set1_epi8(4);
    __m128i v3 = _mm_adds_epi8(*fl, k3);
    __m128i v4 = _mm_adds_epi8(*fl, k4);

    SignedShift8b(&v4);                  // v4 >> 3
    SignedShift8b(&v3);                  // v3 >> 3
    *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
    *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
}
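The same update written per pixel may make the saturating arithmetic easier to follow. A minimal scalar sketch (not from libwebp; clamp8s models _mm_adds_epi8/_mm_subs_epi8, and the shift stands in for the per-byte arithmetic >> 3 that SignedShift8b performs, per the comments above):

#include <stdint.h>

static int8_t clamp8s(int v) {                        // saturate to the int8_t range
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

static void DoSimpleFilter_scalar(int8_t* p0, int8_t* q0, int8_t fl) {
  const int8_t v3 = (int8_t)(clamp8s(fl + 3) >> 3);   // (fl + 3) >> 3, arithmetic shift assumed
  const int8_t v4 = (int8_t)(clamp8s(fl + 4) >> 3);   // (fl + 4) >> 3
  *q0 = clamp8s(*q0 - v4);                            // q0 -= v4
  *p0 = clamp8s(*p0 + v3);                            // p0 += v3
}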
Example #2
// input pixels are int8_t
static WEBP_INLINE void GetBaseDelta(const __m128i* const p1,
                                     const __m128i* const p0,
                                     const __m128i* const q0,
                                     const __m128i* const q1,
                                     __m128i* const delta) {
    // beware of addition order, for saturation!
    const __m128i p1_q1 = _mm_subs_epi8(*p1, *q1);   // p1 - q1
    const __m128i q0_p0 = _mm_subs_epi8(*q0, *p0);   // q0 - p0
    const __m128i s1 = _mm_adds_epi8(p1_q1, q0_p0);  // p1 - q1 + 1 * (q0 - p0)
    const __m128i s2 = _mm_adds_epi8(q0_p0, s1);     // p1 - q1 + 2 * (q0 - p0)
    const __m128i s3 = _mm_adds_epi8(q0_p0, s2);     // p1 - q1 + 3 * (q0 - p0)
    *delta = s3;
}
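The "beware of addition order" note matters because saturating addition is not associative, so 3 * (q0 - p0) must not be formed first. A one-lane worked example (illustrative numbers, not from libwebp):

// With p1 - q1 = -100 and q0 - p0 = 60 (+sat is the _mm_adds_epi8 behaviour):
//   order used above:  ((-100 +sat 60) +sat 60) +sat 60  ->  -40, 20, 80   (exact result)
//   3*(q0-p0) first:   60 +sat 60 +sat 60 = 127 (saturated, not 180)
//                      -100 +sat 127 = 27                                  (wrong)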
Example #3
// Denoise a 16x1 vector with a weaker filter.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  // Clamp absolute difference to delta to get the adjustment.
  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  // Restore the sign and get positive and negative adjustments.
  __m128i padj, nadj;
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Accumulate the adjustments.
  acc_diff = _mm_subs_epi8(acc_diff, padj);
  acc_diff = _mm_adds_epi8(acc_diff, nadj);
  return acc_diff;
}
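Per pixel, this weaker filter nudges the running average back toward the source by at most k_delta, based on the sign of (mc_running_avg_y - sig). A scalar sketch of the same selection (illustrative names, not from libvpx):

#include <stdint.h>

static uint8_t clamp8u(int v) { return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); }

// One lane of the adjustment above: adj = min(|mc_avg - sig|, delta), applied
// against the sign of (mc_avg - sig) so the running average moves toward sig.
static uint8_t denoiser_adj_scalar(uint8_t sig, uint8_t mc_avg, uint8_t avg,
                                   uint8_t delta) {
  const int diff = (int)mc_avg - (int)sig;
  const int absdiff = (diff >= 0) ? diff : -diff;
  const int adj = (absdiff < delta) ? absdiff : delta;
  return (diff > 0) ? clamp8u(avg - adj) : clamp8u(avg + adj);
}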
Example #4
__m128i test_mm_adds_epi8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_adds_epi8
  // DAG: call <16 x i8> @llvm.x86.sse2.padds.b
  //
  // ASM-LABEL: test_mm_adds_epi8
  // ASM: paddsb
  return _mm_adds_epi8(A, B);
}
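Per lane, _mm_adds_epi8 (PADDSB, as the ASM check above confirms) is a signed 8-bit add clamped to [-128, 127]. A scalar reference model, for illustration only:

#include <stdint.h>

static int8_t paddsb_lane(int8_t a, int8_t b) {
  const int sum = (int)a + (int)b;      // widen, then clamp to the int8_t range
  if (sum > 127) return 127;
  if (sum < -128) return -128;
  return (int8_t)sum;
}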
Example #5
    SIMDValue SIMDInt8x16Operation::OpAddSaturate(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        x86Result.m128i_value = _mm_adds_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // a + b

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #6
// Updates values of 2 pixels at MB edge during complex filtering.
// Update operations:
// q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)]
// Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip).
static WEBP_INLINE void Update2Pixels(__m128i* const pi, __m128i* const qi,
                                      const __m128i* const a0_lo,
                                      const __m128i* const a0_hi) {
    const __m128i a1_lo = _mm_srai_epi16(*a0_lo, 7);
    const __m128i a1_hi = _mm_srai_epi16(*a0_hi, 7);
    const __m128i delta = _mm_packs_epi16(a1_lo, a1_hi);
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    *pi = _mm_adds_epi8(*pi, delta);
    *qi = _mm_subs_epi8(*qi, delta);
    FLIP_SIGN_BIT2(*pi, *qi);
}
Example #7
__m64 _m_paddsb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_adds_epi8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example #8
static void adddiff_sse2_t(Byte *pDst, ptrdiff_t dst_pitch, const Byte *pSrc, ptrdiff_t src_pitch, int width, int height)
{
    int mod32_width = (width / 32) * 32;
    auto pDst2 = pDst;
    auto pSrc2 = pSrc;
    auto v128 = _mm_set1_epi32(0x80808080);

    for ( int j = 0; j < height; ++j ) {
        for ( int i = 0; i < mod32_width; i+=32 ) {
            _mm_prefetch(reinterpret_cast<const char*>(pDst)+i+128, _MM_HINT_T0);
            _mm_prefetch(reinterpret_cast<const char*>(pSrc)+i+128, _MM_HINT_T0);

            auto dst = simd_load_si128<mem_mode>(pDst+i);
            auto dst2 = simd_load_si128<mem_mode>(pDst+i+16);
            auto src = simd_load_si128<mem_mode>(pSrc+i);
            auto src2 = simd_load_si128<mem_mode>(pSrc+i+16);

            auto dstsub = _mm_sub_epi8(dst, v128);
            auto dstsub2 = _mm_sub_epi8(dst2, v128);

            auto srcsub = _mm_sub_epi8(src, v128);
            auto srcsub2 = _mm_sub_epi8(src2, v128);

            auto added = _mm_adds_epi8(dstsub, srcsub);
            auto added2 = _mm_adds_epi8(dstsub2, srcsub2);

            auto result = _mm_add_epi8(added, v128);
            auto result2 = _mm_add_epi8(added2, v128);

            simd_store_si128<mem_mode>(pDst+i, result);
            simd_store_si128<mem_mode>(pDst+i+16, result2);
        }
        pDst += dst_pitch;
        pSrc += src_pitch;
    }

    if (width > mod32_width) {
        adddiff_c(pDst2 + mod32_width, dst_pitch, pSrc2 + mod32_width, src_pitch, width - mod32_width, height);
    }
}
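The SIMD loop only covers mod32_width bytes per row; the remaining columns go through adddiff_c, which is not included in this excerpt. A hypothetical scalar equivalent, consistent with the bias-and-saturate scheme above (offset by 128, saturating signed add, offset back), would look roughly like this:

// Hypothetical scalar tail consistent with the SIMD path above (the real
// adddiff_c is defined elsewhere).  Byte is the same typedef as in the SIMD
// version (assumed to be uint8_t).  Net effect: dst = clamp(dst + src - 128, 0, 255).
static void adddiff_c_sketch(Byte *pDst, ptrdiff_t dst_pitch,
                             const Byte *pSrc, ptrdiff_t src_pitch,
                             int width, int height) {
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; ++x) {
            int v = (pDst[x] - 128) + (pSrc[x] - 128);          // signed, bias removed
            if (v < -128) v = -128; else if (v > 127) v = 127;  // _mm_adds_epi8
            pDst[x] = (Byte)(v + 128);                          // bias restored
        }
        pDst += dst_pitch;
        pSrc += src_pitch;
    }
}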
Example #9
// Denoise a 16x1 vector.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed bytes.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;

  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);

  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);

  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
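The mask arithmetic above reduces, per pixel, to picking one of four adjustment levels from the clamped absolute difference. A scalar sketch of that selection (illustrative; assumes the 4/8/16 thresholds and the l3/l32/l21 values that the denoiser callers below set up):

#include <stdint.h>

// absdiff is already clamped to 16 (clamped_absdiff above); l3 is 6 or 7.
static uint8_t denoiser_level_adj(uint8_t absdiff, uint8_t l3) {
  if (absdiff < 4)  return absdiff;   // level 0: adjust by the difference itself
  if (absdiff < 8)  return l3 - 3;    // level 1: l3 - (l32 + l21)
  if (absdiff < 16) return l3 - 2;    // level 2: l3 - l32
  return l3;                          // level 3
}
// The result is added to v_sig when mc_running_avg_y > sig and subtracted when
// it is smaller, which is exactly what the padj/nadj masking implements.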
Example #10
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
                                  __m128i* q0, __m128i* q1,
                                  const __m128i* mask, int hev_thresh) {
    __m128i not_hev;
    __m128i t1, t2, t3;
    const __m128i sign_bit = _mm_set1_epi8(0x80);

    // compute hev mask
    GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
    t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
    t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
    t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

    // Do +4 side
    t2 = _mm_set1_epi8(4);
    t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 4
    SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
    t3 = t2;                           // save t2
    *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2

    // Now do +3 side
    t2 = _mm_set1_epi8(3);
    t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
    SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
    *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2

    t2 = _mm_set1_epi8(1);
    t3 = _mm_adds_epi8(t3, t2);
    SIGNED_SHIFT_N(t3, 1);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4

    t3 = _mm_and_si128(not_hev, t3);   // if !hev
    *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
    *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3

    // unoffset
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
}
Example #11
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    const __m128i k64 = _mm_set1_epi8(0x40);
    const __m128i zero = _mm_setzero_si128();
    __m128i not_hev;
    __m128i t1, t2, t3;

    // compute hev mask
    GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
    t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
    t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
    t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

    t2 = _mm_set1_epi8(3);
    t3 = _mm_set1_epi8(4);
    t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
    t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
    SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
    SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
    *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
    *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
    FLIP_SIGN_BIT2(*p0, *q0);

    // this is equivalent to signed (a + 1) >> 1 calculation
    t2 = _mm_add_epi8(t3, sign_bit);
    t3 = _mm_avg_epu8(t2, zero);
    t3 = _mm_sub_epi8(t3, k64);

    t3 = _mm_and_si128(not_hev, t3);   // if !hev
    *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
    *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
    FLIP_SIGN_BIT2(*p1, *q1);
}
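The three lines after the first FLIP_SIGN_BIT2 compute a signed (t3 + 1) >> 1 without a byte-wide arithmetic shift: with u = s + 128, _mm_avg_epu8(u, 0) = (u + 1) >> 1 = ((s + 1) >> 1) + 64, so subtracting k64 recovers the signed rounded half. A one-lane scalar model (illustrative):

#include <stdint.h>

static int8_t signed_half_round(int8_t s) {
  const uint8_t u = (uint8_t)(s + 0x80);        // t2 = _mm_add_epi8(t3, sign_bit)
  const uint8_t h = (uint8_t)((u + 1) >> 1);    // t3 = _mm_avg_epu8(t2, zero)
  return (int8_t)(h - 0x40);                    // t3 = _mm_sub_epi8(t3, k64)
}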
Example #12
FORCE_INLINE
static inline void fast_scan_1(const std::uint8_t* partition, const unsigned* labels,
		const float* dists, const __m128i (&min4)[4], __m128i (&ft4)[4][16],
		const float qmin, const float qmax, binheap* bh, unsigned scan_pqcode_count) {
	const unsigned simd_pqcode_count = 16;
	const int comp_block_size = 16;
	const unsigned simd_block_size = simd_pqcode_count * (4 * 1 + 4 * 0.5);
	const group_header* hdr;

	float bh_bound = qmax;
	__m128i bh_bound_quant = _mm_set1_epi8(Q127(bh_bound, qmin, bh_bound)); // CHK. Is 127

	for (;;) {
		// Parse group header
		hdr = reinterpret_cast<const group_header*>(partition);
		// Check if last group (All bits of size set to 1)
		if (hdr->size == std::numeric_limits<decltype(hdr->size)>::max()) {
			return;
		}
		partition += sizeof(*hdr);
		unsigned simd_block_count = (static_cast<unsigned>(hdr->size)
				+ simd_pqcode_count - 1) / simd_pqcode_count;
		// Load tables
		__m128i ft4_group[4];
		ft4_group[0] = ft4[0][hdr->values[0] >> 4];
		ft4_group[1] = ft4[1][hdr->values[1] >> 4];
		ft4_group[2] = ft4[2][hdr->values[2] >> 4];
		ft4_group[3] = ft4[3][hdr->values[3] >> 4];
		// Scan SIMD Blocks
		while (simd_block_count--) {
			const __m128i low_bits_mask = _mm_set_epi64x(0x0f0f0f0f0f0f0f0f,
					0x0f0f0f0f0f0f0f0f);

			// Component 0
			const __m128i comps_0 = _mm_loadu_si128(
					reinterpret_cast<const __m128i *>(partition));
			const __m128i masked_comps_0 = _mm_and_si128(comps_0, low_bits_mask);
			__m128i candidates = _mm_shuffle_epi8(min4[0], masked_comps_0);
			// Components 1..3
			for (int comp_i = 1; comp_i < 4; ++comp_i) {
				const __m128i comps = _mm_loadu_si128(
						reinterpret_cast<const __m128i *>(partition
								+ comp_i * comp_block_size));
				const __m128i masked_comps = _mm_and_si128(comps, low_bits_mask);
				const __m128i partial = _mm_shuffle_epi8(min4[comp_i], masked_comps);
				candidates = _mm_adds_epi8(candidates, partial);
			}
			// Components 4-5
			__m128i comps_45 = _mm_loadu_si128(
					reinterpret_cast<const __m128i *>(partition
							+ 4 * comp_block_size));
			const __m128i masked_comps_4 = _mm_and_si128(comps_45, low_bits_mask);
			const __m128i partial_4 = _mm_shuffle_epi8(ft4_group[0], masked_comps_4);
			candidates = _mm_adds_epi8(candidates, partial_4);

			comps_45 = _mm_srli_epi64(comps_45, 4);
			const __m128i masked_comps_5 = _mm_and_si128(comps_45, low_bits_mask);
			const __m128i partial_5 = _mm_shuffle_epi8(ft4_group[1], masked_comps_5);
			candidates = _mm_adds_epi8(candidates, partial_5);

			// Components 6-7
			__m128i comps_67 = _mm_loadu_si128(
					reinterpret_cast<const __m128i *>(partition
							+ 5 * comp_block_size));
			const __m128i masked_comps_6 = _mm_and_si128(comps_67, low_bits_mask);
			const __m128i partial_6 = _mm_shuffle_epi8(ft4_group[2], masked_comps_6);
			candidates = _mm_adds_epi8(candidates, partial_6);

			const __m128i comps_7 = _mm_srli_epi64(comps_67, 4);
			const __m128i masked_comp_7 = _mm_and_si128(comps_7, low_bits_mask);
			const __m128i partial_7 = _mm_shuffle_epi8(ft4_group[3], masked_comp_7);
			candidates = _mm_adds_epi8(candidates, partial_7);

			// Compare
			const __m128i compare = _mm_cmplt_epi8(candidates, bh_bound_quant);
			int cmp = _mm_movemask_epi8(compare);
			//std::uint64_t cmp_low = (_mm_cvtsi128_si64(compare));
			//std::uint64_t cmp_high = (_mm_extract_epi64(compare, 1));

			// Compute current block size
			int current_block_actual_size = 0;
			if(simd_block_count == 0) {
				current_block_actual_size = hdr->size % simd_pqcode_count;
				if(current_block_actual_size == 0) {
					current_block_actual_size = simd_pqcode_count;
				} else {
					/*__m128i mask;
					compute_simd_mask(current_block_actual_size, mask);
					compare = _mm_and_si128(compare, mask);*/
					/*
					std::uint64_t low_mask;
					std::uint64_t high_mask;
					compute_high_low_mask(current_block_actual_size, low_mask, high_mask);
					cmp_low = cmp_low & low_mask;
					cmp_high = cmp_high & high_mask;
					*/
					cmp = cmp & BITMASK(current_block_actual_size);
				}
			} else {
				current_block_actual_size = simd_pqcode_count;
			}

			if(cmp) {
				// Check low quadword
				const std::uint8_t cmp_low = cmp & 0xff;
				if (cmp_low) {
					/*const std::uint64_t low_possible_positions = 0x0706050403020100;
					const std::uint64_t match_positions = _pext_u64(
							low_possible_positions, cmp_low);*/
					const int match_count = _popcnt32(cmp_low);
					std::uint64_t match_pos = masktable[cmp_low];


					for (int i = 0; i < match_count; ++i) {
						const std::uint8_t pos = match_pos & 0xff;
						match_pos >>= 8;
						const float candidate = scan_pqcode_in_simd_block_1(pos,
								partition, hdr->values, dists);
						if (candidate < bh_bound) {
							bh->push(labels[scan_pqcode_count + pos],
									candidate);
							bh_bound = bh->max();
							bh_bound_quant = _mm_set1_epi8(
									Q127(bh_bound, qmin, qmax));
						}
					}
				}

				// Check high quadword
				const std::uint8_t cmp_high = (cmp >> 8);
				if (cmp_high) {
					/*const std::uint64_t high_possible_positions = 0x0f0e0d0c0b0a0908;
					const std::uint64_t match_positions = _pext_u64(
							high_possible_positions, cmp_high);*/
					const int match_count = _popcnt32(cmp_high);
					std::uint64_t match_pos = masktable[cmp_high] + 0x0808080808080808;

					for (int i = 0; i < match_count; ++i) {
						const std::uint8_t pos = match_pos & 0xff;
						match_pos >>= 8;
						const float candidate = scan_pqcode_in_simd_block_1(pos,
								partition, hdr->values, dists);
						if (candidate < bh_bound) {
							bh->push(labels[scan_pqcode_count + pos],
									candidate);
							bh_bound = bh->max();
							bh_bound_quant = _mm_set1_epi8(
									Q127(bh_bound, qmin, qmax));
						}
					}
				}
			}

			partition += simd_block_size;
			scan_pqcode_count += current_block_actual_size;
		}
	}
}
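masktable is not part of this excerpt; from the way match_pos is consumed (one byte per set bit, lowest bit first, with 0x0808080808080808 added to bias the high-quadword positions by 8), each entry presumably packs the bit positions of an 8-bit movemask into consecutive bytes. A sketch of how a compatible table could be generated (an assumption, not the project's actual table):

#include <stdint.h>

static uint64_t masktable_sketch[256];

static void build_masktable(void) {
	for (int m = 0; m < 256; ++m) {
		uint64_t packed = 0;
		int out = 0;
		for (int bit = 0; bit < 8; ++bit) {
			if (m & (1 << bit)) {                     // bit 'bit' of the movemask is set
				packed |= (uint64_t)bit << (8 * out); // store its position in byte 'out'
				++out;
			}
		}
		masktable_sketch[m] = packed;
	}
}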
Example #13
static void mb_lpf_horizontal_edge_w_avx2_16(unsigned char *s, int p,
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i p7, p6, p5;
  __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
  __m128i q5, q6, q7;
  __m256i p256_7, q256_7, p256_6, q256_6, p256_5, q256_5, p256_4, q256_4,
      p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  p256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
  p256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
  p256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
  p256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
  p256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
  q256_0 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
  q256_1 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
  q256_2 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
  q256_3 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
  q256_4 =
      _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 4 * p)));

  p4 = _mm256_castsi256_si128(p256_4);
  p3 = _mm256_castsi256_si128(p256_3);
  p2 = _mm256_castsi256_si128(p256_2);
  p1 = _mm256_castsi256_si128(p256_1);
  p0 = _mm256_castsi256_si128(p256_0);
  q0 = _mm256_castsi256_si128(q256_0);
  q1 = _mm256_castsi256_si128(q256_1);
  q2 = _mm256_castsi256_si128(q256_2);
  q3 = _mm256_castsi256_si128(q256_3);
  q4 = _mm256_castsi256_si128(q256_4);

  {
    const __m128i abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
    const __m128i abs_q1q0 =
        _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
    const __m128i fe = _mm_set1_epi8(0xfe);
    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    __m128i abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
    __m128i abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
    __m128i work;
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(flat, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
        _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
    mask = _mm_max_epu8(work, mask);
    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
        _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i te0 = _mm_set1_epi8(0xe0);
    const __m128i t1f = _mm_set1_epi8(0x1f);
    const __m128i t1 = _mm_set1_epi8(0x1);
    const __m128i t7f = _mm_set1_epi8(0x7f);

    __m128i ps1 = _mm_xor_si128(p1, t80);
    __m128i ps0 = _mm_xor_si128(p0, t80);
    __m128i qs0 = _mm_xor_si128(q0, t80);
    __m128i qs1 = _mm_xor_si128(q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_p6, flat2_p5, flat2_p4, flat2_p3, flat2_p2, flat2_p1,
        flat2_p0, flat2_q0, flat2_q1, flat2_q2, flat2_q3, flat2_q4, flat2_q5,
        flat2_q6, flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;

    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    /* Filter1 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter1);
    filter1 = _mm_srli_epi16(filter1, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter1 = _mm_and_si128(filter1, t1f);
    filter1 = _mm_or_si128(filter1, work_a);
    qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);

    /* Filter2 >> 3 */
    work_a = _mm_cmpgt_epi8(zero, filter2);
    filter2 = _mm_srli_epi16(filter2, 3);
    work_a = _mm_and_si128(work_a, te0);
    filter2 = _mm_and_si128(filter2, t1f);
    filter2 = _mm_or_si128(filter2, work_a);
    ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi8(filter1, t1);
    work_a = _mm_cmpgt_epi8(zero, filt);
    filt = _mm_srli_epi16(filt, 1);
    work_a = _mm_and_si128(work_a, t80);
    filt = _mm_and_si128(filt, t7f);
    filt = _mm_or_si128(filt, work_a);
    filt = _mm_andnot_si128(hev, filt);
    ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
    qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
          _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
          _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
      flat = _mm_max_epu8(work, flat);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p4, p0), _mm_subs_epu8(p0, p4)),
          _mm_or_si128(_mm_subs_epu8(q4, q0), _mm_subs_epu8(q0, q4)));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      p256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
      q256_5 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
      p5 = _mm256_castsi256_si128(p256_5);
      q5 = _mm256_castsi256_si128(q256_5);
      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p5, p0), _mm_subs_epu8(p0, p5)),
          _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5)));

      flat2 = _mm_max_epu8(work, flat2);
      p256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
      q256_6 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
      p6 = _mm256_castsi256_si128(p256_6);
      q6 = _mm256_castsi256_si128(q256_6);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p6, p0), _mm_subs_epu8(p0, p6)),
          _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6)));

      flat2 = _mm_max_epu8(work, flat2);

      p256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s - 8 * p)));
      q256_7 = _mm256_castpd_si256(
          _mm256_broadcast_pd((__m128d const *)(s + 7 * p)));
      p7 = _mm256_castsi256_si128(p256_7);
      q7 = _mm256_castsi256_si128(q256_7);
      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(p7, p0), _mm_subs_epu8(p0, p7)),
          _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m256i eight = _mm256_set1_epi16(8);
      const __m256i four = _mm256_set1_epi16(4);
      __m256i pixelFilter_p, pixelFilter_q, pixetFilter_p2p1p0,
          pixetFilter_q2q1q0, sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      const __m256i filter =
          _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
      p256_7 = _mm256_shuffle_epi8(p256_7, filter);
      p256_6 = _mm256_shuffle_epi8(p256_6, filter);
      p256_5 = _mm256_shuffle_epi8(p256_5, filter);
      p256_4 = _mm256_shuffle_epi8(p256_4, filter);
      p256_3 = _mm256_shuffle_epi8(p256_3, filter);
      p256_2 = _mm256_shuffle_epi8(p256_2, filter);
      p256_1 = _mm256_shuffle_epi8(p256_1, filter);
      p256_0 = _mm256_shuffle_epi8(p256_0, filter);
      q256_0 = _mm256_shuffle_epi8(q256_0, filter);
      q256_1 = _mm256_shuffle_epi8(q256_1, filter);
      q256_2 = _mm256_shuffle_epi8(q256_2, filter);
      q256_3 = _mm256_shuffle_epi8(q256_3, filter);
      q256_4 = _mm256_shuffle_epi8(q256_4, filter);
      q256_5 = _mm256_shuffle_epi8(q256_5, filter);
      q256_6 = _mm256_shuffle_epi8(q256_6, filter);
      q256_7 = _mm256_shuffle_epi8(q256_7, filter);

      pixelFilter_p = _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_5),
                                       _mm256_add_epi16(p256_4, p256_3));
      pixelFilter_q = _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_5),
                                       _mm256_add_epi16(q256_4, q256_3));

      pixetFilter_p2p1p0 =
          _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
      pixelFilter_p = _mm256_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 =
          _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
      pixelFilter_q = _mm256_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);

      pixelFilter_p = _mm256_add_epi16(
          eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));

      pixetFilter_p2p1p0 = _mm256_add_epi16(
          four, _mm256_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(p256_7, p256_0)), 4);

      flat2_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(q256_7, q256_0)), 4);

      flat2_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(p256_3, p256_0)),
                            3);

      flat_p0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(q256_3, q256_0)),
                            3);

      flat_q0 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(p256_7, p256_7);

      sum_q7 = _mm256_add_epi16(q256_7, q256_7);

      sum_p3 = _mm256_add_epi16(p256_3, p256_3);

      sum_q3 = _mm256_add_epi16(q256_3, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_p, p256_6);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_6);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_1)), 4);

      flat2_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_1)), 4);

      flat2_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_p2p1p0, p256_2);

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_2);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_1)),
                            3);

      flat_p1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_1)),
                            3);

      flat_q1 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      sum_p3 = _mm256_add_epi16(sum_p3, p256_3);

      sum_q3 = _mm256_add_epi16(sum_q3, q256_3);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_5);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_5);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_2)), 4);

      flat2_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_2)), 4);

      flat2_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      pixetFilter_p2p1p0 = _mm256_sub_epi16(pixetFilter_p2p1p0, q256_1);

      pixetFilter_q2q1q0 = _mm256_sub_epi16(pixetFilter_q2q1q0, p256_1);

      res_p =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0,
                                             _mm256_add_epi16(sum_p3, p256_2)),
                            3);

      flat_p2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q =
          _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0,
                                             _mm256_add_epi16(sum_q3, q256_2)),
                            3);

      flat_q2 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_4);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_4);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_3)), 4);

      flat2_p3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_3)), 4);

      flat2_q3 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_3);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_3);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_4)), 4);

      flat2_p4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_4)), 4);

      flat2_q4 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_2);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_2);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_5)), 4);

      flat2_p5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_5)), 4);

      flat2_q5 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));

      sum_p7 = _mm256_add_epi16(sum_p7, p256_7);

      sum_q7 = _mm256_add_epi16(sum_q7, q256_7);

      pixelFilter_p = _mm256_sub_epi16(pixelFilter_p, q256_1);

      pixelFilter_q = _mm256_sub_epi16(pixelFilter_q, p256_1);

      res_p = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_p, _mm256_add_epi16(sum_p7, p256_6)), 4);

      flat2_p6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));

      res_q = _mm256_srli_epi16(
          _mm256_add_epi16(pixelFilter_q, _mm256_add_epi16(sum_q7, q256_6)), 4);

      flat2_q6 = _mm256_castsi256_si128(
          _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
    }

    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    p2 = _mm_andnot_si128(flat, p2);
    flat_p2 = _mm_and_si128(flat, flat_p2);
    p2 = _mm_or_si128(flat_p2, p2);

    p1 = _mm_andnot_si128(flat, ps1);
    flat_p1 = _mm_and_si128(flat, flat_p1);
    p1 = _mm_or_si128(flat_p1, p1);

    p0 = _mm_andnot_si128(flat, ps0);
    flat_p0 = _mm_and_si128(flat, flat_p0);
    p0 = _mm_or_si128(flat_p0, p0);

    q0 = _mm_andnot_si128(flat, qs0);
    flat_q0 = _mm_and_si128(flat, flat_q0);
    q0 = _mm_or_si128(flat_q0, q0);

    q1 = _mm_andnot_si128(flat, qs1);
    flat_q1 = _mm_and_si128(flat, flat_q1);
    q1 = _mm_or_si128(flat_q1, q1);

    q2 = _mm_andnot_si128(flat, q2);
    flat_q2 = _mm_and_si128(flat, flat_q2);
    q2 = _mm_or_si128(flat_q2, q2);

    p6 = _mm_andnot_si128(flat2, p6);
    flat2_p6 = _mm_and_si128(flat2, flat2_p6);
    p6 = _mm_or_si128(flat2_p6, p6);
    _mm_storeu_si128((__m128i *)(s - 7 * p), p6);

    p5 = _mm_andnot_si128(flat2, p5);
    flat2_p5 = _mm_and_si128(flat2, flat2_p5);
    p5 = _mm_or_si128(flat2_p5, p5);
    _mm_storeu_si128((__m128i *)(s - 6 * p), p5);

    p4 = _mm_andnot_si128(flat2, p4);
    flat2_p4 = _mm_and_si128(flat2, flat2_p4);
    p4 = _mm_or_si128(flat2_p4, p4);
    _mm_storeu_si128((__m128i *)(s - 5 * p), p4);

    p3 = _mm_andnot_si128(flat2, p3);
    flat2_p3 = _mm_and_si128(flat2, flat2_p3);
    p3 = _mm_or_si128(flat2_p3, p3);
    _mm_storeu_si128((__m128i *)(s - 4 * p), p3);

    p2 = _mm_andnot_si128(flat2, p2);
    flat2_p2 = _mm_and_si128(flat2, flat2_p2);
    p2 = _mm_or_si128(flat2_p2, p2);
    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);

    p1 = _mm_andnot_si128(flat2, p1);
    flat2_p1 = _mm_and_si128(flat2, flat2_p1);
    p1 = _mm_or_si128(flat2_p1, p1);
    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);

    p0 = _mm_andnot_si128(flat2, p0);
    flat2_p0 = _mm_and_si128(flat2, flat2_p0);
    p0 = _mm_or_si128(flat2_p0, p0);
    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);

    q0 = _mm_andnot_si128(flat2, q0);
    flat2_q0 = _mm_and_si128(flat2, flat2_q0);
    q0 = _mm_or_si128(flat2_q0, q0);
    _mm_storeu_si128((__m128i *)(s - 0 * p), q0);

    q1 = _mm_andnot_si128(flat2, q1);
    flat2_q1 = _mm_and_si128(flat2, flat2_q1);
    q1 = _mm_or_si128(flat2_q1, q1);
    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);

    q2 = _mm_andnot_si128(flat2, q2);
    flat2_q2 = _mm_and_si128(flat2, flat2_q2);
    q2 = _mm_or_si128(flat2_q2, q2);
    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);

    q3 = _mm_andnot_si128(flat2, q3);
    flat2_q3 = _mm_and_si128(flat2, flat2_q3);
    q3 = _mm_or_si128(flat2_q3, q3);
    _mm_storeu_si128((__m128i *)(s + 3 * p), q3);

    q4 = _mm_andnot_si128(flat2, q4);
    flat2_q4 = _mm_and_si128(flat2, flat2_q4);
    q4 = _mm_or_si128(flat2_q4, q4);
    _mm_storeu_si128((__m128i *)(s + 4 * p), q4);

    q5 = _mm_andnot_si128(flat2, q5);
    flat2_q5 = _mm_and_si128(flat2, flat2_q5);
    q5 = _mm_or_si128(flat2_q5, q5);
    _mm_storeu_si128((__m128i *)(s + 5 * p), q5);

    q6 = _mm_andnot_si128(flat2, q6);
    flat2_q6 = _mm_and_si128(flat2, flat2_q6);
    q6 = _mm_or_si128(flat2_q6, q6);
    _mm_storeu_si128((__m128i *)(s + 6 * p), q6);
  }
}
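The "/* Filter1 >> 3 */" block works around the lack of a byte-wide arithmetic shift in SSE2: the 16-bit logical shift by 3 lets bits leak across byte boundaries, so the t1f (0x1f) mask discards them and the te0 (0xe0) mask re-inserts the sign bits for lanes that were negative. One lane in scalar form (illustrative):

#include <stdint.h>

static int8_t sra3_byte(int8_t v) {
  // After _mm_srli_epi16 the top 3 bits of each byte may hold a neighbour's
  // bits, hence the explicit 0x1f mask (filter & t1f).
  const uint8_t logical = (uint8_t)(((uint8_t)v >> 3) & 0x1f);
  const uint8_t signext = (v < 0) ? 0xe0 : 0x00;   // work_a = cmpgt(zero, x) & te0
  return (int8_t)(logical | signext);              // equals the arithmetic v >> 3
}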
Example #14
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
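This 8-pixel variant replaces the mask-based shift of the previous example with widening: _mm_unpacklo_epi8(zero, x) places each filter byte in the high half of a 16-bit lane, _mm_srai_epi16(x, 0xB) then shifts arithmetically by 8 + 3, and _mm_packs_epi16 narrows the +filter2 and -filter1 halves back so a single _mm_adds_epi8 updates the packed q0/p0 register. One lane in scalar form (illustrative):

#include <stdint.h>

static int8_t sra3_via_epi16(int8_t v) {
  const int16_t widened = (int16_t)((uint16_t)((uint8_t)v) << 8);  // _mm_unpacklo_epi8(zero, x)
  return (int8_t)(widened >> 11);                                  // _mm_srai_epi16(x, 0xB)
}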
Example #15
int vp8_denoiser_filter_sse2(YV12_BUFFER_CONFIG *mc_running_avg,
                             YV12_BUFFER_CONFIG *running_avg,
                             MACROBLOCK *signal, unsigned int motion_magnitude,
                             int y_offset, int uv_offset)
{
    unsigned char *sig = signal->thismb;
    int sig_stride = 16;
    unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
    int mc_avg_y_stride = mc_running_avg->y_stride;
    unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
    int avg_y_stride = running_avg->y_stride;
    int r;
    (void)uv_offset;
    __m128i acc_diff = _mm_setzero_si128();
    const __m128i k_0 = _mm_setzero_si128();
    const __m128i k_4 = _mm_set1_epi8(4);
    const __m128i k_8 = _mm_set1_epi8(8);
    const __m128i k_16 = _mm_set1_epi8(16);
    /* Modify each level's adjustment according to motion_magnitude. */
    const __m128i l3 = _mm_set1_epi8(
                      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 : 6);
    /* Difference between level 3 and level 2 is 2. */
    const __m128i l32 = _mm_set1_epi8(2);
    /* Difference between level 2 and level 1 is 1. */
    const __m128i l21 = _mm_set1_epi8(1);

    for (r = 0; r < 16; ++r)
    {
        /* Calculate differences */
        const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
        const __m128i v_mc_running_avg_y = _mm_loadu_si128(
                                           (__m128i *)(&mc_running_avg_y[0]));
        __m128i v_running_avg_y;
        const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
        const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
        /* Obtain the sign. FF if diff is negative. */
        const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
        /* Clamp absolute difference to 16 to be used to get mask. Doing this
         * allows us to use _mm_cmpgt_epi8, which operates on signed bytes. */
        const __m128i clamped_absdiff = _mm_min_epu8(
                                        _mm_or_si128(pdiff, ndiff), k_16);
        /* Get masks for l2 l1 and l0 adjustments */
        const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
        const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
        const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
        /* Get adjustments for l2, l1, and l0 */
        __m128i adj2 = _mm_and_si128(mask2, l32);
        const __m128i adj1 = _mm_and_si128(mask1, l21);
        const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
        __m128i adj,  padj, nadj;

        /* Combine the adjustments and get absolute adjustments. */
        adj2 = _mm_add_epi8(adj2, adj1);
        adj = _mm_sub_epi8(l3, adj2);
        adj = _mm_andnot_si128(mask0, adj);
        adj = _mm_or_si128(adj, adj0);

        /* Restore the sign and get positive and negative adjustments. */
        padj = _mm_andnot_si128(diff_sign, adj);
        nadj = _mm_and_si128(diff_sign, adj);

        /* Calculate filtered value. */
        v_running_avg_y = _mm_adds_epu8(v_sig, padj);
        v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
        _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

        /* Adjustments <=7, and each element in acc_diff can fit in signed
         * char.
         */
        acc_diff = _mm_adds_epi8(acc_diff, padj);
        acc_diff = _mm_subs_epi8(acc_diff, nadj);

        /* Update pointers for next iteration. */
        sig += sig_stride;
        mc_running_avg_y += mc_avg_y_stride;
        running_avg_y += avg_y_stride;
    }

    {
        /* Compute the sum of all pixel differences of this MB. */
        union sum_union s;
        int sum_diff = 0;
        s.v = acc_diff;
        sum_diff = s.e[0] + s.e[1] + s.e[2] + s.e[3] + s.e[4] + s.e[5]
                 + s.e[6] + s.e[7] + s.e[8] + s.e[9] + s.e[10] + s.e[11]
                 + s.e[12] + s.e[13] + s.e[14] + s.e[15];

        if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
        {
            return COPY_BLOCK;
        }
    }

    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
                      signal->thismb, sig_stride);
    return FILTER_BLOCK;
}
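union sum_union is referenced but not defined in this excerpt; it is presumably declared alongside this function as a view of the accumulator as sixteen signed chars. A compatible definition (a sketch, not copied from the source file):

/* View of the 128-bit accumulator as its 16 signed byte lanes, so the
 * per-pixel differences accumulated with _mm_adds_epi8/_mm_subs_epi8 can be
 * summed in scalar code. */
union sum_union {
    __m128i v;
    signed char e[16];
};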
Example #16
int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
                             int mc_avg_y_stride, unsigned char *running_avg_y,
                             int avg_y_stride, unsigned char *sig,
                             int sig_stride, unsigned int motion_magnitude,
                             int increase_denoising) {
  unsigned char *running_avg_y_start = running_avg_y;
  unsigned char *sig_start = sig;
  unsigned int sum_diff_thresh;
  int r;
  int shift_inc =
      (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
          ? 1
          : 0;
  __m128i acc_diff = _mm_setzero_si128();
  const __m128i k_0 = _mm_setzero_si128();
  const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
  const __m128i k_8 = _mm_set1_epi8(8);
  const __m128i k_16 = _mm_set1_epi8(16);
  /* Modify each level's adjustment according to motion_magnitude. */
  const __m128i l3 = _mm_set1_epi8(
      (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
  /* Difference between level 3 and level 2 is 2. */
  const __m128i l32 = _mm_set1_epi8(2);
  /* Difference between level 2 and level 1 is 1. */
  const __m128i l21 = _mm_set1_epi8(1);

  for (r = 0; r < 16; ++r) {
    /* Calculate differences */
    const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
    const __m128i v_mc_running_avg_y =
        _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
    __m128i v_running_avg_y;
    const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
    const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
    /* Obtain the sign. FF if diff is negative. */
    const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
    /* Clamp absolute difference to 16 to be used to get mask. Doing this
     * allows us to use _mm_cmpgt_epi8, which operates on signed bytes. */
    const __m128i clamped_absdiff =
        _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
    /* Get masks for l2 l1 and l0 adjustments */
    const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
    const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
    const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
    /* Get adjustments for l2, l1, and l0 */
    __m128i adj2 = _mm_and_si128(mask2, l32);
    const __m128i adj1 = _mm_and_si128(mask1, l21);
    const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
    __m128i adj, padj, nadj;

    /* Combine the adjustments and get absolute adjustments. */
    adj2 = _mm_add_epi8(adj2, adj1);
    adj = _mm_sub_epi8(l3, adj2);
    adj = _mm_andnot_si128(mask0, adj);
    adj = _mm_or_si128(adj, adj0);

    /* Restore the sign and get positive and negative adjustments. */
    padj = _mm_andnot_si128(diff_sign, adj);
    nadj = _mm_and_si128(diff_sign, adj);

    /* Calculate filtered value. */
    v_running_avg_y = _mm_adds_epu8(v_sig, padj);
    v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
    _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

    /* Adjustments <=7, and each element in acc_diff can fit in signed
     * char.
     */
    acc_diff = _mm_adds_epi8(acc_diff, padj);
    acc_diff = _mm_subs_epi8(acc_diff, nadj);

    /* Update pointers for next iteration. */
    sig += sig_stride;
    mc_running_avg_y += mc_avg_y_stride;
    running_avg_y += avg_y_stride;
  }

  {
    /* Compute the sum of all pixel differences of this MB. */
    unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
    sum_diff_thresh = SUM_DIFF_THRESHOLD;
    if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
    if (abs_sum_diff > sum_diff_thresh) {
      // Before returning to copy the block (i.e., apply no denoising),
      // check if we can still apply some (weaker) temporal filtering to
      // this block, that would otherwise not be denoised at all. Simplest
      // is to apply an additional adjustment to running_avg_y to bring it
      // closer to sig. The adjustment is capped by a maximum delta, and
      // chosen such that in most cases the resulting sum_diff will be
      // within the acceptable range given by sum_diff_thresh.

      // The delta is set by the excess of absolute pixel diff over the
      // threshold.
      int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
      // Only apply the adjustment for max delta up to 3.
      if (delta < 4) {
        const __m128i k_delta = _mm_set1_epi8(delta);
        sig -= sig_stride * 16;
        mc_running_avg_y -= mc_avg_y_stride * 16;
        running_avg_y -= avg_y_stride * 16;
        for (r = 0; r < 16; ++r) {
          __m128i v_running_avg_y =
              _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
          // Calculate differences.
          const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
          const __m128i v_mc_running_avg_y =
              _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
          const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
          const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
          // Obtain the sign. FF if diff is negative.
          const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
          // Clamp absolute difference to delta to get the adjustment.
          const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
          // Restore the sign and get positive and negative adjustments.
          __m128i padj, nadj;
          padj = _mm_andnot_si128(diff_sign, adj);
          nadj = _mm_and_si128(diff_sign, adj);
          // Calculate filtered value.
          v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
          v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
          _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

          // Accumulate the adjustments.
          acc_diff = _mm_subs_epi8(acc_diff, padj);
          acc_diff = _mm_adds_epi8(acc_diff, nadj);

          // Update pointers for next iteration.
          sig += sig_stride;
          mc_running_avg_y += mc_avg_y_stride;
          running_avg_y += avg_y_stride;
        }
        abs_sum_diff = abs_sum_diff_16x1(acc_diff);
        if (abs_sum_diff > sum_diff_thresh) {
          return COPY_BLOCK;
        }
      } else {
        return COPY_BLOCK;
      }
    }
  }