Example #1
//FINL 
int16 __ext_v_sum_int16(int16* x, int len)
{
	__m128i msum = _mm_setzero_si128();

#ifdef SORA_PLATFORM
	__int16 ret;
#else
	int16_t ret;
#endif

	const int wlen = 8;

	for (int i = 0; i < len / wlen; i++)
	{
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		msum = _mm_add_epi16(msum, mx);
	}

	__m128i mout = msum;


	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi16(mout, msum);

	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi16(mout, msum);

	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi16(mout, msum);


	unsigned int temp = _mm_cvtsi128_si32(mout);
	ret = temp;
	
	ret += (temp >> 16);

	for (int i = (len / wlen) * wlen; i < len; i++)
	{
		ret += x[i];
	}

	return ret;

}
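A minimal scalar reference (hypothetical, not from the original source) that the vectorized sum above must agree with: both wrap modulo 2^16 on overflow, so the SIMD shuffles and the final high/low fold reduce to a plain wrapping sum.

static int16_t v_sum_int16_ref(const int16_t *x, int len)
{
	int16_t ret = 0;
	for (int i = 0; i < len; i++)
		ret = (int16_t)(ret + x[i]);  /* wraps exactly like _mm_add_epi16 */
	return ret;
}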
Example #2
__m64 interpolvline_2(unsigned char* image, int PicWidthInPix){

	__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;

	__m64 ret;

	xmm7 = _mm_setzero_si128();

	xmm0 = _mm_movpi64_epi64(*((__m64*)(image - 2*PicWidthInPix)));
	xmm0 = _mm_unpacklo_epi8(xmm0,xmm7);
	xmm1 = _mm_movpi64_epi64(*((__m64*)(image - 1*PicWidthInPix)));
	xmm1 = _mm_unpacklo_epi8(xmm1,xmm7);
	xmm2 = _mm_movpi64_epi64(*((__m64*)(image - 0*PicWidthInPix)));
	xmm2 = _mm_unpacklo_epi8(xmm2,xmm7);
	xmm3 = _mm_movpi64_epi64(*((__m64*)(image + 1*PicWidthInPix)));
	xmm3 = _mm_unpacklo_epi8(xmm3,xmm7);
	xmm4 = _mm_movpi64_epi64(*((__m64*)(image + 2*PicWidthInPix)));
	xmm4 = _mm_unpacklo_epi8(xmm4,xmm7);
	xmm5 = _mm_movpi64_epi64(*((__m64*)(image + 3*PicWidthInPix)));
	xmm5 = _mm_unpacklo_epi8(xmm5,xmm7);

// filter on 8 values
	xmm6 = _mm_add_epi16(xmm2,xmm3);
	xmm6 = _mm_slli_epi16(xmm6,2);
	xmm6 = _mm_sub_epi16(xmm6,xmm1);
	xmm6 = _mm_sub_epi16(xmm6,xmm4);

	xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
	xmm6 = _mm_mullo_epi16(xmm6,xmm1);
	xmm6 = _mm_add_epi16(xmm6,xmm0);
	xmm6 = _mm_add_epi16(xmm6,xmm5);
	xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
	xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
	xmm6 = _mm_srli_epi16(xmm6,5);

	xmm6 = _mm_packus_epi16(xmm6,xmm7);
	
	ret = _mm_movepi64_pi64(xmm6);

	_mm_empty(); 

	return(ret);
}
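For comparison, a hedged scalar sketch of the same vertical half-pel interpolation (H.264's 6-tap kernel {1, -5, 20, 20, -5, 1} with rounding by 16, shift by 5, and a clamp to [0, 255]); p points at the first '20'-weighted row:

static unsigned char interpol_vline_ref(const unsigned char *p, int stride)
{
	int v = p[-2 * stride] - 5 * p[-1 * stride] + 20 * p[0]
	      + 20 * p[stride] - 5 * p[2 * stride] + p[3 * stride] + 16;
	if (v < 0) v = 0;                 /* matches _mm_max_epi16(xmm6, zero) */
	v >>= 5;                          /* matches _mm_srli_epi16(xmm6, 5) */
	return (unsigned char)(v > 255 ? 255 : v);  /* _mm_packus_epi16 saturation */
}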
Example #3
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of box-filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i twos = _mm_set1_epi8(2);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      // Note: _mm_loadh_epi32/_mm_storeh_epi32 are not standard intrinsics;
      // they are 4-byte load/store helpers defined elsewhere in this source.
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storel_epi64(pred_buf_m128i, sum);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeu_si128(pred_buf_m128i, sum);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        top_1 = _mm_maddubs_epi16(top_1, twos);
        bot_1 = _mm_maddubs_epi16(bot_1, twos);
        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
      }
    }
    input += luma_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
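A hedged scalar equivalent of the subsampling step, assuming `CFL_BUF_LINE` is the prediction-buffer row stride in 16-bit elements (the `_I128` variant above is the same stride in 128-bit units): each output is the 2x2 box sum doubled, i.e. the 2x2 average in Q3.

static void cfl_luma_subsampling_420_lbd_ref(const uint8_t *input, int input_stride,
                                             uint16_t *pred_buf_q3, int width,
                                             int height) {
  for (int j = 0; j < height >> 1; j++) {
    for (int i = 0; i < width >> 1; i++) {
      const uint8_t *in = input + 2 * j * input_stride + 2 * i;
      // 2x2 box sum times 2 == (average of 4 pixels) << 3, i.e. Q3.
      pred_buf_q3[j * CFL_BUF_LINE + i] =
          (uint16_t)(2 * (in[0] + in[1] + in[input_stride] + in[input_stride + 1]));
    }
  }
}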
Example #4
    SIMDValue SIMDInt16x8Operation::OpAdd(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        x86Result.m128i_value = _mm_add_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // a + b

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #5
OD_SIMD_INLINE void od_mc_butterfly_2x2_16x8(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3) {
  __m128i a;
  __m128i b;
  __m128i c;
  __m128i d;
  /*a = t0 + t1, c = (t0 + t1) - (t1 + t1) = t0 - t1
    b = t2 + t3, d = (t2 + t3) - (t3 + t3) = t2 - t3*/
  a = _mm_add_epi16(*t0, *t1);
  c = _mm_add_epi16(*t1, *t1);
  c = _mm_sub_epi16(a, c);
  b = _mm_add_epi16(*t2, *t3);
  d = _mm_add_epi16(*t3, *t3);
  d = _mm_sub_epi16(b, d);
  *t0 = a;
  *t1 = b;
  *t2 = c;
  *t3 = d;
}
Example #6
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
static WEBP_INLINE void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0,
                                  __m128i* q0, __m128i* q1, __m128i *q2,
                                  const __m128i* mask, int hev_thresh) {
    __m128i a, not_hev;
    const __m128i sign_bit = _mm_set1_epi8(0x80);

    // compute hev mask
    GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
    FLIP_SIGN_BIT2(*p2, *q2);

    GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);

    {   // do simple filter on pixels with hev
        const __m128i m = _mm_andnot_si128(not_hev, *mask);
        const __m128i f = _mm_and_si128(a, m);
        DO_SIMPLE_FILTER(*p0, *q0, f);
    }
    {   // do strong filter on pixels with not hev
        const __m128i zero = _mm_setzero_si128();
        const __m128i nine = _mm_set1_epi16(0x0900);
        const __m128i sixty_three = _mm_set1_epi16(63);

        const __m128i m = _mm_and_si128(not_hev, *mask);
        const __m128i f = _mm_and_si128(a, m);
        const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
        const __m128i f_hi = _mm_unpackhi_epi8(zero, f);

        const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine);   // Filter (lo) * 9
        const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine);   // Filter (hi) * 9
        const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo);  // Filter (lo) * 18
        const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi);  // Filter (hi) * 18

        const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three);  // Filter * 9 + 63
        const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three);  // Filter * 9 + 63

        const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three);  // F... * 18 + 63
        const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three);  // F... * 18 + 63

        const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo);  // Filter * 27 + 63
        const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi);  // Filter * 27 + 63

        UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
        UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
        UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
    }

    // unoffset
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
    FLIP_SIGN_BIT2(*p2, *q2);
}
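The constant 0x0900 is 9 << 8. Since `_mm_unpacklo_epi8(zero, f)` places each signed filter byte in the high half of a 16-bit lane (value f * 256), `_mm_mulhi_epi16(f_lo, nine)` returns `(f * 256 * 9 * 256) >> 16 = 9 * f` exactly, which is why the comments can label the products `Filter * 9` and `Filter * 18`.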
Example #7
__m64 interpolhline_2(unsigned char* image){

	__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
	unsigned char* imagetmp = image - 2;
	__m64 ret;

	xmm7 = _mm_setzero_si128();
	xmm6 =  _mm_loadu_si128(((__m128i*)imagetmp));


	xmm0 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm1 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm2 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm3 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm5 = _mm_unpacklo_epi8(xmm6,xmm7);

// filter on 8 values
	xmm6 = _mm_add_epi16(xmm2,xmm3);
	xmm6 = _mm_slli_epi16(xmm6,2);
	xmm6 = _mm_sub_epi16(xmm6,xmm1);
	xmm6 = _mm_sub_epi16(xmm6,xmm4);

	xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
	xmm6 = _mm_mullo_epi16(xmm6,xmm1);
	xmm6 = _mm_add_epi16(xmm6,xmm0);
	xmm6 = _mm_add_epi16(xmm6,xmm5);
	xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
	xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
	xmm6 = _mm_srli_epi16(xmm6,5);

	xmm6 = _mm_packus_epi16(xmm6,xmm7);
	
	ret = _mm_movepi64_pi64(xmm6);
	_mm_empty(); 

	return(ret);
}
Example #8
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{

  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);


  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
Example #9
static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}
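Per-channel scalar view of the same computation (a hedged sketch, assuming `Average2_uint32_16_SSE2` yields the truncating 16-bit-lane average `(a + b) >> 1`, as its use here implies):

static uint32_t Average4_ref(uint32_t a0, uint32_t a1, uint32_t a2, uint32_t a3) {
  uint32_t out = 0;
  for (int i = 0; i < 32; i += 8) {
    const uint32_t avg1 = (((a0 >> i) & 0xff) + ((a1 >> i) & 0xff)) >> 1;
    const uint32_t avg2 = (((a2 >> i) & 0xff) + ((a3 >> i) & 0xff)) >> 1;
    out |= (((avg1 + avg2) >> 1) & 0xff) << i;  // never exceeds 255, no clipping needed
  }
  return out;
}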
Example #10
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_128i(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}
Example #11
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
 const int stride = img->width;
 const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

 // Load the data (2 pixels in one load)
 __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]); 
 __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]); 

 __m128 weight = CalcWeights(x, y);

 // extend to 16bit
 p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
 p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

 // convert floating point weights to 16bit integer
 weight = _mm_mul_ps(weight, CONST_256); 
 __m128i weighti = _mm_cvtps_epi32(weight); // w4 w3 w2 w1
         weighti = _mm_packs_epi32(weighti, _mm_setzero_si128()); // 32->16bit

 // prepare the weights
 __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
 __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
 w12 = _mm_unpacklo_epi16(w12, w12); // w2 w2 w2 w2 w1 w1 w1 w1
 w34 = _mm_unpacklo_epi16(w34, w34); // w4 w4 w4 w4 w3 w3 w3 w3
 
 // multiply each pixel with its weight (2 pixel per SSE mul)
 __m128i L12 = _mm_mullo_epi16(p12, w12);
 __m128i L34 = _mm_mullo_epi16(p34, w34);

 // sum the results
 __m128i L1234 = _mm_add_epi16(L12, L34); 
 __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
 __m128i L = _mm_add_epi16(L1234, Lhi);
  
 // convert back to 8bit
 __m128i L8 = _mm_srli_epi16(L, 8); // divide by 256
 L8 = _mm_packus_epi16(L8, _mm_setzero_si128());
 
 // return
 return _mm_cvtsi128_si32(L8);
}
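`CalcWeights` and `CONST_256` are assumed by the snippet but not shown; a hypothetical stand-in that produces the `(w4 w3 w2 w1)` lane order the code expects could look like this:

#include <math.h>
#include <xmmintrin.h>

static inline __m128 CalcWeightsRef(float x, float y)
{
 const float fx = x - floorf(x);  // horizontal fraction
 const float fy = y - floorf(y);  // vertical fraction
 // Lanes 0..3 = w1..w4; the weights sum to 1, so after the multiply by
 // CONST_256 (presumably 256.0f in all lanes) the final >> 8 renormalizes.
 return _mm_setr_ps((1 - fx) * (1 - fy),  // w1: top-left
                    fx * (1 - fy),        // w2: top-right
                    (1 - fx) * fy,        // w3: bottom-left
                    fx * fy);             // w4: bottom-right
}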
Example #12
unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum) {
    const __m128i zero = _mm_setzero_si128();
    __m128i vsum = _mm_setzero_si128();
    __m128i vsse = _mm_setzero_si128();
    int i;

    for (i = 0; i < 16; ++i) {
        const __m128i s = _mm_loadu_si128((const __m128i *)src);
        const __m128i r = _mm_loadu_si128((const __m128i *)ref);

        const __m128i src0 = _mm_unpacklo_epi8(s, zero);
        const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
        const __m128i diff0 = _mm_sub_epi16(src0, ref0);

        const __m128i src1 = _mm_unpackhi_epi8(s, zero);
        const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
        const __m128i diff1 = _mm_sub_epi16(src1, ref1);

        vsum = _mm_add_epi16(vsum, diff0);
        vsum = _mm_add_epi16(vsum, diff1);
        vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
        vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

        src += src_stride;
        ref += ref_stride;
    }

    // sum
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
    *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
           (int16_t)_mm_extract_epi16(vsum, 1);

    // sse
    vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
    vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
    *sse = _mm_cvtsi128_si32(vsse);

    return 0;
}
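The function fills `*sse` and `*sum` and returns 0; a hedged sketch of the usual caller pattern derives the variance from those two outputs as `sse - sum^2 / N`, with `N = 256` samples for a 16x16 block:

static unsigned int variance16x16_ref(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
    unsigned int sse;
    int sum;
    vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, &sse, &sum);
    // var = E[d^2] - E[d]^2 over 256 samples.
    return sse - (unsigned int)(((int64_t)sum * sum) >> 8);
}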
Example #13
static INLINE void hor_transform_row_avx2(__m128i* row){
  
  __m128i mask_pos = _mm_set1_epi16(1);
  __m128i mask_neg = _mm_set1_epi16(-1);
  __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);
}
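Each stage negates the lanes that must be subtracted (`_mm_sign_epi16` against a mask of +1/-1 lanes) and then adds a swapped copy of the register, so every add/sub butterfly of the horizontal Hadamard transform costs one sign flip and one `_mm_add_epi16` rather than separate add and subtract vectors.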
Example #14
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
Example #15
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}
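Scalar semantics of the routine (a hedged sketch): per 8-bit channel, compute `c0 + c1 - c2` and saturate to [0, 255], which is exactly what `_mm_packus_epi16` does to the 16-bit intermediate:

static uint32_t ClampedAddSubtractFull_ref(uint32_t c0, uint32_t c1, uint32_t c2) {
  uint32_t out = 0;
  for (int i = 0; i < 32; i += 8) {
    int v = (int)((c0 >> i) & 0xff) + (int)((c1 >> i) & 0xff)
          - (int)((c2 >> i) & 0xff);
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    out |= (uint32_t)v << i;
  }
  return out;
}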
Example #16
static FORCE_INLINE __m128i mm_min_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_min_epu8(a, b);
    else {
        __m128i word_32768 = _mm_set1_epi16(32768);

        __m128i a_minus = _mm_sub_epi16(a, word_32768);
        __m128i b_minus = _mm_sub_epi16(b, word_32768);

        return _mm_add_epi16(_mm_min_epi16(a_minus, b_minus), word_32768);
    }
}
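In the 16-bit branch, `_mm_set1_epi16(32768)` wraps to -32768 (0x8000), so the two subtractions just flip each lane's sign bit, mapping unsigned values onto the signed range where `_mm_min_epi16` exists; adding the constant back undoes the bias. On SSE4.1 targets, `_mm_min_epu16` computes the unsigned 16-bit minimum directly, so this trick is only needed on plain SSE2. (`PixelType` is presumably a template parameter of the surrounding source, which lets the `sizeof` branch fold away at compile time.)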
Example #17
// Returns |x| for 16-bit lanes.
static __m128i abs_i16(__m128i x) {
#if defined(__SSSE3__)
    return _mm_abs_epi16(x);
#else
    // Read this all as, return x<0 ? -x : x.
    // To negate two's complement, you flip all the bits then add 1.
    __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
    x = _mm_xor_si128(x, is_negative);                      // Flip negative lanes.
    x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));  // +1 to negative lanes, else +0.
    return x;
#endif
}
Example #18
__m64 _m_paddw(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_add_epi16(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
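The `m128i_i64` / `m64_i64` union members are MSVC-specific; a hedged, intrinsics-only sketch of the same operation (assuming MMX and SSE2 are available) avoids them:

static __m64 m_paddw_ref(__m64 a, __m64 b)
{
    const __m128i lhs = _mm_movpi64_epi64(a);  // zero-extend MMX value into XMM
    const __m128i rhs = _mm_movpi64_epi64(b);
    __m64 r = _mm_movepi64_pi64(_mm_add_epi16(lhs, rhs));
    _mm_empty();  // leave the MMX/x87 state clean, as the other examples here do
    return r;
}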
Example #19
unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
    const __m128i zero = _mm_setzero_si128();
    __m128i vsum = _mm_setzero_si128();
    __m128i vsse = _mm_setzero_si128();
    int i;

    for (i = 0; i < 8; i += 2) {
        const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
                (const __m128i *)(src + i * src_stride)), zero);
        const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
                (const __m128i *)(ref + i * ref_stride)), zero);
        const __m128i diff0 = _mm_sub_epi16(src0, ref0);

        const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
                (const __m128i *)(src + (i + 1) * src_stride)), zero);
        const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
                (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
        const __m128i diff1 = _mm_sub_epi16(src1, ref1);

        vsum = _mm_add_epi16(vsum, diff0);
        vsum = _mm_add_epi16(vsum, diff1);
        vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
        vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
    }

    // sum
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
    vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
    *sum = (int16_t)_mm_extract_epi16(vsum, 0);

    // sse
    vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
    vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
    *sse = _mm_cvtsi128_si32(vsse);

    return 0;
}
Example #20
template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                                       const int istride,
                                                                       char *odata,
                                                                       const int ostride,
                                                                       const int iwidth,
                                                                       const int iheight,
                                                                       const int ooffset_x,
                                                                       const int ooffset_y,
                                                                       const int owidth,
                                                                       const int oheight) {
  int16_t *idata = (int16_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi16(1);
  const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i SHUF = _mm_set_epi8(15,14, 11,10, 7,6, 3,2,
                                    13,12,   9,8, 5,4, 1,0);
  const __m128i CLIP = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i ZERO = _mm_set1_epi16(0);

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D8 = _mm_load_si128((__m128i *)&idata[y*istride + x + 8]);

      D0 = _mm_shuffle_epi8(D0, SHUF);
      D8 = _mm_shuffle_epi8(D8, SHUF);

      __m128i E0 = _mm_unpacklo_epi64(D0, D8);
      __m128i O1 = _mm_unpackhi_epi64(D0, D8);

      __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1));
      __m128i X1 = _mm_add_epi16(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi16(X0, X1);
      __m128i Z8 = _mm_unpackhi_epi16(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi16(Z0, ONE);
        Z8 = _mm_add_epi16(Z8, ONE);
        Z0 = _mm_srai_epi16(Z0, shift);
        Z8 = _mm_srai_epi16(Z8, shift);
      }

      Z0 = _mm_add_epi16(Z0, OFFSET);
      Z8 = _mm_add_epi16(Z8, OFFSET);

      Z0 = _mm_min_epi16(Z0, CLIP);
      Z8 = _mm_min_epi16(Z8, CLIP);

      Z0 = _mm_max_epi16(Z0, ZERO);
      Z8 = _mm_max_epi16(Z8, ZERO);

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], Z0);
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], Z8);
    }
  }
}
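The `SHUF` byte mask gathers the even-indexed 16-bit samples of each load into its low half and the odd-indexed samples into its high half, so the two `_mm_unpack*_epi64` operations split a row into even (`E0`) and odd (`O1`) vectors. The pair `X0 = E0 - ((O1 + 1) >> 1)`, `X1 = O1 + X0` is the inverse Haar lifting step, and the `_mm_unpack*_epi16` pair re-interleaves the results before the shift, offset, and clamp to the legal sample range.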
Example #21
Image<uint16_t> blur_fast(Image<uint16_t> in) {
    Image<uint16_t> out(in.width()-8, in.height()-2);
    begin_timing;
    __m128i one_third = _mm_set1_epi16(21846);
#pragma omp parallel for
    for (int yTile = 0; yTile < out.height(); yTile += 32) {
        __m128i a, b, c, sum, avg;
        __m128i tmp[(128/8) * (32 + 2)];
        for (int xTile = 0; xTile < out.width(); xTile += 128) {
            __m128i *tmpPtr = tmp;
            for (int y = 0; y < 32+2; y++) {
                const uint16_t *inPtr = &(in(xTile, yTile+y));
                for (int x = 0; x < 128; x += 8) {
                    a = _mm_load_si128((__m128i*)(inPtr));
                    b = _mm_loadu_si128((__m128i*)(inPtr+1));
                    c = _mm_loadu_si128((__m128i*)(inPtr+2));
                    sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                    avg = _mm_mulhi_epi16(sum, one_third);
                    _mm_store_si128(tmpPtr++, avg);
                    inPtr+=8;
                }
            }
            tmpPtr = tmp;
            for (int y = 0; y < 32; y++) {
                __m128i *outPtr = (__m128i *)(&(out(xTile, yTile+y)));
                for (int x = 0; x < 128; x += 8) {
                    a = _mm_load_si128(tmpPtr+(2*128)/8);
                    b = _mm_load_si128(tmpPtr+128/8);
                    c = _mm_load_si128(tmpPtr++);
                    sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                    avg = _mm_mulhi_epi16(sum, one_third);
                    _mm_store_si128(outPtr++, avg);
                }
            }
        }
    }
    end_timing;
    return out;
}
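The constant 21846 is 65536/3 rounded up, so `_mm_mulhi_epi16(sum, one_third)`, which keeps the high 16 bits of the 32-bit product, computes approximately `sum / 3` without any division (valid while the sums fit a signed 16-bit lane). The same trick is applied in both passes: the first loop averages each pixel with its two horizontal neighbours into `tmp`, and the second averages each `tmp` value with the two rows below it, giving a separable 3x3 box blur.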
Example #22
static INLINE void ver_add_sub_avx2(__m128i (*temp_hor)[8], __m128i (*temp_ver)[8]){

  // First stage
  for (int i = 0; i < 8; i += 2){
    (*temp_ver)[i+0] = _mm_hadd_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
    (*temp_ver)[i+1] = _mm_hsub_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
  }

  // Second stage
  for (int i = 0; i < 8; i += 4){
    (*temp_hor)[i + 0] = _mm_add_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 1] = _mm_add_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
    (*temp_hor)[i + 2] = _mm_sub_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 3] = _mm_sub_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
  }

  // Third stage
  for (int i = 0; i < 4; ++i){
    (*temp_ver)[i + 0] = _mm_add_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
    (*temp_ver)[i + 4] = _mm_sub_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
  }
}
Example #23
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of box-filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      sum = _mm_hadd_epi16(sum, sum);
      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      if (width == 8) {
        sum = _mm_hadd_epi16(sum, sum);
        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i bot_2 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i bot_3 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
                           _mm_add_epi16(next_sum, next_sum));
        }
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
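Compared with the low-bit-depth version in Example #3, this high-bit-depth variant needs no `_mm_maddubs_epi16` pairing step: the inputs are already 16-bit, so rows are summed vertically with `_mm_add_epi16`, adjacent columns are folded with `_mm_hadd_epi16`, and the final `_mm_add_epi16(sum, sum)` supplies the doubling that keeps the result in Q3.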
Example #24
static void GradientPredictInverse(const uint8_t* const in,
                                   const uint8_t* const top,
                                   uint8_t* const row, int length) {
  if (length > 0) {
    int i;
    const int max_pos = length & ~7;
    const __m128i zero = _mm_setzero_si128();
    __m128i A = _mm_set_epi32(0, 0, 0, row[-1]);   // left sample
    for (i = 0; i < max_pos; i += 8) {
      const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
      const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
      const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
      const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
      const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
      const __m128i D = _mm_unpacklo_epi8(tmp2, zero);   // base input
      const __m128i E = _mm_sub_epi16(B, C);  // unclipped gradient basis B - C
      __m128i out = zero;                     // accumulator for output
      __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
      int k = 8;
      while (1) {
        const __m128i tmp3 = _mm_add_epi16(A, E);        // delta = A + B - C
        const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
        const __m128i tmp5 = _mm_max_epi16(tmp4, zero);  // clipped delta
        const __m128i tmp6 = _mm_add_epi16(tmp5, D);     // add to in[] values
        A = _mm_and_si128(tmp6, mask_hi);                // 1-complement clip
        out = _mm_or_si128(out, A);                      // accumulate output
        if (--k == 0) break;
        A = _mm_slli_si128(A, 2);                        // rotate left sample
        mask_hi = _mm_slli_si128(mask_hi, 2);            // rotate mask
      }
      A = _mm_srli_si128(A, 14);       // prepare left sample for next iteration
      _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
    }
    for (; i < length; ++i) {
      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
    }
  }
}
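The inner `while (1)` loop exists because this predictor is inherently serial: each reconstructed pixel becomes the left sample `A` of the next one, so the eight outputs of a batch are produced one lane at a time, clamped against the moving `mask_hi` lane, OR-ed into `out`, and only then written back with a single 8-pixel store.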
Example #25
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
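A scaling sanity check (assuming the Y/U/V samples arrive pre-shifted into the high byte of each 16-bit lane, which is how libwebp's loaders feed this routine): `_mm_mulhi_epu16(y << 8, 19077)` equals `(y * 19077) >> 8`, and the final shift by 6 brings the total to `(y * 19077) >> 14`, i.e. a gain of 19077 / 2^14 ≈ 1.164, the BT.601 luma coefficient the header comment describes.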
Example #26
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
									const decal_t * decal, const float fadeColor[4] ) {
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );


	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
	const __m128 vector_fade_color = _mm_load_ps( fadeColor );
	const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	assert_offsetof( idDrawVert, color, 6 * 4 );
	for ( int i = 0; i < decal->numVerts; i++ ) {
		const idDrawVert &srcVert = decal->verts[i];
		idDrawVert &dstVert = verts[numVerts + i];

		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
		__m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );
		__m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );

		__m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
		__m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
		__m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
		__m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
		v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

		_mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
	}

	// copy indexes
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for ( int i = 0; i < decal->numIndexes; i += 8 ) {
		__m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );

		vi = _mm_add_epi16( vi, vector_short_num_verts );

		_mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
	}

	_mm_sfence();

}
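`_mm_stream_si128` issues non-temporal stores that bypass the cache, which suits write-once vertex and index buffers that won't be read back here; the trailing `_mm_sfence` makes those streaming stores globally visible before the function returns.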
Example #27
    SIMDValue SIMDInt16x8Operation::OpNeg(const SIMDValue& value)
    {
        X86SIMDValue x86Result;

        X86SIMDValue SIGNMASK, temp;
        X86SIMDValue negativeOnes = { { -1, -1, -1, -1} };
        X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

        temp.m128i_value = _mm_andnot_si128(v.m128i_value, negativeOnes.m128i_value); // ~value
        SIGNMASK.m128i_value = _mm_set1_epi16(0x0001);                                // 1 in each 16-bit lane
        x86Result.m128i_value = _mm_add_epi16(SIGNMASK.m128i_value, temp.m128i_value);// ~value + 1 == -value, lane-wise

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
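Since two's-complement negation is `~v + 1`, the same result could be obtained in one step with `_mm_sub_epi16(_mm_setzero_si128(), v.m128i_value)`; the andnot-then-add form above simply spells that identity out.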
Example #28
		SIMD_INLINE __m128i AdjustedYuvToHue16(__m128i y, __m128i u, __m128i v, const __m128 & KF_255_DIV_6)
		{
			const __m128i red = AdjustedYuvToRed16(y, v);
			const __m128i green = AdjustedYuvToGreen16(y, u, v);
			const __m128i blue = AdjustedYuvToBlue16(y, u);
			const __m128i max = MaxI16(red, green, blue);
			const __m128i range = _mm_subs_epi16(max, MinI16(red, green, blue));

			const __m128i redMaxMask = _mm_cmpeq_epi16(red, max);
			const __m128i greenMaxMask = _mm_andnot_si128(redMaxMask, _mm_cmpeq_epi16(green, max));
			const __m128i blueMaxMask = _mm_andnot_si128(redMaxMask, _mm_andnot_si128(greenMaxMask, K_INV_ZERO));

			const __m128i redMaxCase = _mm_and_si128(redMaxMask, 
				_mm_add_epi16(_mm_sub_epi16(green, blue), _mm_mullo_epi16(range, K16_0006))); 
			const __m128i greenMaxCase = _mm_and_si128(greenMaxMask, 
				_mm_add_epi16(_mm_sub_epi16(blue, red), _mm_mullo_epi16(range, K16_0002))); 
			const __m128i blueMaxCase = _mm_and_si128(blueMaxMask, 
				_mm_add_epi16(_mm_sub_epi16(red, green), _mm_mullo_epi16(range, K16_0004))); 

			const __m128i dividend = _mm_or_si128(_mm_or_si128(redMaxCase, greenMaxCase), blueMaxCase);

			return _mm_andnot_si128(_mm_cmpeq_epi16(range, K_ZERO), _mm_and_si128(MulDiv16(dividend, range, KF_255_DIV_6), K16_00FF));
		}
Example #29
static inline __m128i
v4_mul_color_sse2(__m128i x, __m128i y)
{
   const __m128i zero = _mm_setzero_si128();
   const __m128i sym4_mask = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);

   __m128i x_l = _mm_unpacklo_epi8(x, zero);
   __m128i x_h = _mm_unpackhi_epi8(x, zero);

   __m128i y_l = _mm_unpacklo_epi8(y, zero);
   __m128i y_h = _mm_unpackhi_epi8(y, zero);

   __m128i r_l = _mm_mullo_epi16(x_l, y_l);
   __m128i r_h = _mm_mullo_epi16(x_h, y_h);

   r_l = _mm_add_epi16(r_l, sym4_mask);
   r_h = _mm_add_epi16(r_h, sym4_mask);

   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   return  _mm_packus_epi16(r_l, r_h);
}
Example #30
// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  const __m128i acc_diff_lo =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  const __m128i hgfe_dcba =
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba =
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  return _mm_cvtsi128_si32(hgfedcba);
}
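Unpacking a register with itself and then doing `_mm_srai_epi16(..., 8)` is the standard SSE2 idiom for sign-extending packed signed bytes to 16-bit lanes (the arithmetic shift replicates each byte's sign bit). `_mm_madd_epi16` against a vector of ones then folds adjacent 16-bit lanes into 32-bit partial sums, and the two `_mm_srli_si128` shifts finish the horizontal reduction down to a single scalar.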