Example #1
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();
  int num_pairs = 4;
  __m128i sum = zero;
  int32_t tmp[4];
  while (num_pairs-- > 0) {
    const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
    const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
    const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
    const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
    // subtract
    const __m128i c0 = _mm_subs_epi16(a0, b0);
    const __m128i c1 = _mm_subs_epi16(a1, b1);
    // multiply/accumulate with self
    const __m128i d0 = _mm_madd_epi16(c0, c0);
    const __m128i d1 = _mm_madd_epi16(c1, c1);
    // collect
    const __m128i sum01 = _mm_add_epi32(d0, d1);
    sum = _mm_add_epi32(sum, sum01);
    a += 2 * BPS;
    b += 2 * BPS;
  }
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
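Every snippet on this page is built around the same primitive, so a scalar model helps when reading them: _mm_madd_epi16 multiplies eight pairs of signed 16-bit lanes and sums adjacent products into four signed 32-bit lanes. A minimal sketch of that semantics (ignoring the single overflow corner where both inputs of a pair group are -32768):

#include <stdint.h>

/* out[i] = a[2i]*b[2i] + a[2i+1]*b[2i+1], computed in 32-bit arithmetic */
static void madd_epi16_scalar(const int16_t a[8], const int16_t b[8],
                              int32_t out[4]) {
  int i;
  for (i = 0; i < 4; ++i) {
    out[i] = (int32_t)a[2 * i] * b[2 * i] +
             (int32_t)a[2 * i + 1] * b[2 * i + 1];
  }
}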
Example #2
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum =
        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));

  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
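For comparison, a scalar sketch of the same fixed-point inner product. PRECISION_S16 and the GLib CLAMP macro are assumed from the surrounding resampler code (neither is defined in this snippet), and the unused icoeff/bstride parameters are dropped:

static inline void
inner_product_gint16_full_1_c (gint16 * o, const gint16 * a,
    const gint16 * b, gint len)
{
  gint i;
  gint32 sum = 0;

  for (i = 0; i < len; i++)
    sum += (gint32) a[i] * (gint32) b[i];

  /* round, shift back to 16 bits and saturate, matching the
   * _mm_packs_epi32 at the end of the SSE2 version */
  sum = (sum + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
  *o = (gint16) CLAMP (sum, -32768, 32767);
}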
Example #3
File: moments.cpp Project: 12rohanb/opencv
    int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
    {
        int x = 0;

        if( useSIMD )
        {
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= len - 8; x += 8 )
            {
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                __m128i sx = _mm_mullo_epi16(qx, qx);

                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));

                qx = _mm_add_epi16(qx, dx);
            }

            _mm_store_si128((__m128i*)buf, qx0);  // buf: aligned int[4] scratch member of the enclosing functor (not shown)
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];
        }

        return x;
    }
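In scalar terms, the SIMD block accumulates the first four raw horizontal moments of the row; a plain-C model of what qx0..qx3 hold after the loop:

/* x0 = sum p, x1 = sum p*x, x2 = sum p*x^2, x3 = sum p*x^3 */
static void row_moments_c(const unsigned char *ptr, int len,
                          int *x0, int *x1, int *x2, int *x3) {
  int x;
  *x0 = *x1 = *x2 = *x3 = 0;
  for (x = 0; x < len; ++x) {
    const int p = ptr[x];
    *x0 += p;
    *x1 += p * x;
    *x2 += p * x * x;
    *x3 += p * x * x * x;
  }
}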
Example #4
 SIMD_INLINE void InterpolateX1(const __m128i * alpha, __m128i * buffer)
 {
     __m128i src = _mm_load_si128(buffer);
     __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(src, K_ZERO), _mm_load_si128(alpha + 0));
     __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(src, K_ZERO), _mm_load_si128(alpha + 1));
     _mm_store_si128(buffer, _mm_packs_epi32(lo, hi));
 }
Example #5
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
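A scalar sketch of the linear interpolator above: the SSE2 code replicates the first two 16-bit taps of ic across all lanes, so each output is a two-tap blend of the two phases. PRECISION_S16 and CLAMP are assumed from the surrounding resampler code:

static void
interpolate_gint16_linear_c (gint16 * o, const gpointer ap,
    gint len, const gint16 * ic, gint astride)
{
  gint i;
  const gint16 *c0 = (gint16 *) ((gint8 *) ap + 0 * astride);
  const gint16 *c1 = (gint16 *) ((gint8 *) ap + 1 * astride);

  for (i = 0; i < len; i++) {
    gint32 v = (gint32) c0[i] * ic[0] + (gint32) c1[i] * ic[1];
    v = (v + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
    o[i] = (gint16) CLAMP (v, -32768, 32767);
  }
}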
Example #6
static uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
                                                int stride) {
  const __m128i v_val_0_w =
      _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
  const __m128i v_val_1_w =
      _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
  const __m128i v_val_2_w =
      _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
  const __m128i v_val_3_w =
      _mm_loadl_epi64((const __m128i *)(src + 3 * stride));

  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);

  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);

  const __m128i v_sum_d =
      _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));

  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
}
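For reference, a scalar sketch of what the 4x4 kernel above computes (each madd dword is one row's pair-sums; the rows are then reduced to a single total):

static uint64_t sum_squares_2d_i16_4x4_c(const int16_t *src, int stride) {
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = src[r * stride + c];
      ss += (uint64_t)(v * v);
    }
  }
  return ss;
}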
Example #7
//
// multiplies two complex vectors and returns the real and imaginary parts
// as two 32 bit integers.
//
FORCE_INLINE
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
        struct complex16* x, int len1, struct complex16* y, int len2 )
{
  const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16);
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  __m128i* Xs = (__m128i*) x;
  __m128i* Ys = (__m128i*) y;
  __m128i* Res = (__m128i*) re;
  __m128i* Ims = (__m128i*) im;
  for (int i = 0; i < len1 / wlen; i++){
    __m128i mx = _mm_loadu_si128(&Xs[i]);
    __m128i my = _mm_loadu_si128(&Ys[i]);

    __m128i ms2 = _mm_xor_si128(my, xmm5);
    ms2 = _mm_add_epi32(ms2, xmm4);

    ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    _mm_storeu_si128(&Res[i], _mm_madd_epi16(my, mx));
    _mm_storeu_si128(&Ims[i], _mm_madd_epi16(ms2, mx));
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++){
    re[i] = x[i].re * y[i].re + x[i].im * y[i].im ;
    im[i] = x[i].im * y[i].re - x[i].re * y[i].im ;
  }

  return 0;
}
Example #8
static void get4x4var_sse2(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride,
                           unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
  const __m128i diff1 = _mm_sub_epi16(src1, ref1);

  // sum
  __m128i vsum = _mm_add_epi16(diff0, diff1);
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsum =
      _mm_add_epi32(_mm_madd_epi16(diff0, diff0), _mm_madd_epi16(diff1, diff1));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  *sse = _mm_cvtsi128_si32(vsum);
}
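A plain-C sketch of the same sum/SSE computation, assuming READ64 loads two consecutive rows starting at the given row offset (which is how it is used above, at offsets 0 and 2):

static void get4x4var_c(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride,
                        unsigned int *sse, int *sum) {
  int s = 0;
  unsigned int ss = 0;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
      s += d;
      ss += (unsigned int)(d * d);
    }
  }
  *sum = s;
  *sse = ss;
}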
Example #9
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS], const signed short data[3][3][3]) const
{
  assert(FitMatrix::PADDEDCOLS == 32);
  __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
  const short* localFitMatrix = fitMatrix();
  // Load data into four SSE Registers
  __m128i x[4];
  signed short* dataFlat = (signed short*) data; // flat array of 27 signed shorts
  x[0] = _mm_loadu_si128((__m128i*)(dataFlat + 0));
  x[1] = _mm_loadu_si128((__m128i*)(dataFlat + 8));
  x[2] = _mm_loadu_si128((__m128i*)(dataFlat + 16));
  x[3] = _mm_loadu_si128((__m128i*)(dataFlat + 24));
  x[3] = _mm_srli_si128(_mm_slli_si128(x[3], 10), 10);   // Clear dataFlat[27..31]

  for(int i = 0; i < FitMatrix::ROWS; i++)
  {
    // Compute scalar product between ((float*)x)[0..31] and localFitMatrix
    __m128i sum =             _mm_madd_epi16(x[0], *(__m128i*)(localFitMatrix + 0));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], *(__m128i*)(localFitMatrix + 8)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], *(__m128i*)(localFitMatrix + 16)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], *(__m128i*)(localFitMatrix + 24)));
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
    localFitMatrix += 32;
  }
}
Example #10
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
Example #11
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(s16 *dl_ch,
		     LTE_DL_FRAME_PARMS *frame_parms) {

  s16 rb;
  __m128i *dl_ch128;
  int avg;

  // clear average level (avg128F is a file-scope __m128i in the original source)
  avg128F = _mm_xor_si128(avg128F, avg128F);
  dl_ch128 = (__m128i *)dl_ch;

  for (rb = 0; rb < frame_parms->N_RB_DL; rb++) {
    avg128F = _mm_add_epi32(avg128F, _mm_madd_epi16(dl_ch128[0], dl_ch128[0]));
    avg128F = _mm_add_epi32(avg128F, _mm_madd_epi16(dl_ch128[1], dl_ch128[1]));
    avg128F = _mm_add_epi32(avg128F, _mm_madd_epi16(dl_ch128[2], dl_ch128[2]));

    dl_ch128 += 3;
  }

  avg = (((int*)&avg128F)[0] +
         ((int*)&avg128F)[1] +
         ((int*)&avg128F)[2] +
         ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);

  _mm_empty();
  _m_empty();

  return(avg);
}
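In scalar terms, using the s16 typedef from the snippet above and assuming dl_ch holds interleaved 16-bit (re, im) pairs with 12 subcarriers per resource block, the function averages the per-element energy; a sketch:

static int dl_channel_level_c(const s16 *dl_ch, int n_rb)
{
  int i, acc = 0;
  for (i = 0; i < n_rb * 12; i++) {
    /* each madd dword above is re^2 + im^2 of one subcarrier */
    acc += dl_ch[2 * i] * dl_ch[2 * i] + dl_ch[2 * i + 1] * dl_ch[2 * i + 1];
  }
  return acc / (n_rb * 12);
}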
Example #12
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] =
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] =
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
          t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
          t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
Example #13
 SIMD_INLINE void InterpolateX2(const __m128i * alpha, __m128i * buffer)
 {
     __m128i src = _mm_load_si128(buffer);
     __m128i a = _mm_load_si128(alpha);
     __m128i u = _mm_madd_epi16(_mm_and_si128(src, K16_00FF), a);
     __m128i v = _mm_madd_epi16(_mm_and_si128(_mm_srli_si128(src, 1), K16_00FF), a);
     _mm_store_si128(buffer, _mm_or_si128(u, _mm_slli_si128(v, 2)));
 }
Example #14
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack_SSE41(const __m128i* const A,
                                    const __m128i* const B,
                                    __m128i* const out) {
  const __m128i k2 = _mm_set1_epi16(2);
  const __m128i C = _mm_madd_epi16(*A, k2);
  const __m128i D = _mm_madd_epi16(*B, k2);
  *out = _mm_packs_epi32(C, D);
}
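A scalar model of the transform, for reference: the madd against a vector of 2s doubles each horizontal pair sum, and _mm_packs_epi32 places the four sums from A in the low half of the output and those from B in the high half, with signed saturation:

static void horizontal_add_pack_c(const int16_t A[8], const int16_t B[8],
                                  int16_t out[8]) {
  int i;
  for (i = 0; i < 4; ++i) {
    const int32_t a2 = 2 * ((int32_t)A[2 * i] + A[2 * i + 1]);
    const int32_t b2 = 2 * ((int32_t)B[2 * i] + B[2 * i + 1]);
    out[i]     = (int16_t)(a2 < -32768 ? -32768 : a2 > 32767 ? 32767 : a2);
    out[i + 4] = (int16_t)(b2 < -32768 ? -32768 : b2 > 32767 ? 32767 : b2);
  }
}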
Example #15
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(int16_t *dl_ch,
                     LTE_DL_FRAME_PARMS *frame_parms)
{

    int16_t rb;
#if defined(__x86_64__) || defined(__i386__)
    __m128i *dl_ch128;
#elif defined(__arm__)
    int16x4_t *dl_ch128;
#endif
    int avg;

    //clear average level
#if defined(__x86_64__) || defined(__i386__)
    avg128F = _mm_setzero_si128();
    dl_ch128=(__m128i *)dl_ch;

    for (rb=0; rb<frame_parms->N_RB_DL; rb++) {

        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[0],dl_ch128[0]));
        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[1],dl_ch128[1]));
        avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(dl_ch128[2],dl_ch128[2]));

        dl_ch128+=3;

    }
#elif defined(__arm__)
    avg128F = vdupq_n_s32(0);
    dl_ch128=(int16x4_t *)dl_ch;

    for (rb=0; rb<frame_parms->N_RB_DL; rb++) {

        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[0],dl_ch128[0]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[1],dl_ch128[1]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[2],dl_ch128[2]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[3],dl_ch128[3]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[4],dl_ch128[4]));
        avg128F = vqaddq_s32(avg128F,vmull_s16(dl_ch128[5],dl_ch128[5]));
        dl_ch128+=6;
    }

#endif
    DevAssert( frame_parms->N_RB_DL );
    avg = (((int*)&avg128F)[0] +
           ((int*)&avg128F)[1] +
           ((int*)&avg128F)[2] +
           ((int*)&avg128F)[3])/(frame_parms->N_RB_DL*12);


#if defined(__x86_64__) || defined(__i386__)
    _mm_empty();
    _m_empty();
#endif
    return(avg);
}
Example #16
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
Example #17
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
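Per pixel, the loop bodies of this example and the previous one reduce to the following scalar step (a sketch; in libaom the mask satisfies m + m_inv == 1 << AOM_BLEND_A64_ROUND_BITS, which is what makes the madd a blend):

static unsigned int masked_sad_pixel(uint16_t src, uint16_t a, uint16_t b,
                                     uint16_t m, int round_bits) {
  const int m_inv = (1 << round_bits) - m;
  const int round = (1 << round_bits) >> 1;
  const int pred = (a * m + b * m_inv + round) >> round_bits;
  const int d = pred - src;
  return (unsigned int)(d < 0 ? -d : d);
}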
Example #18
File: vector_short.c Project: jfellus/agem
int	vector_ps_short (const short* pa,const short* pb,size_t n)
{
    size_t k;
    size_t q = n / 16;
    size_t r = n % 16;
    int w;
    if (q > 0) {
	__m128i acc1 = _mm_setzero_si128();
	__m128i acc2 = _mm_setzero_si128();
	if (ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
	    for (k=0;k<q;k++) {
		/* Load 16 words from each array */
		__m128i a1 = _mm_load_si128((__m128i*)pa);
		__m128i b1 = _mm_load_si128((__m128i*)pb);
		__m128i a2 = _mm_load_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_load_si128((__m128i*)(pb+8));
		/* Multiply, sum and convert to double words */
		__m128i s1 = _mm_madd_epi16(a1,b1);
		__m128i s2 = _mm_madd_epi16(a2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc1 = _mm_add_epi32(acc1,s1);
		acc2 = _mm_add_epi32(acc2,s2);
	    }
	}
	else {
	    for (k=0;k<q;k++) {
		/* Load 16 words from each array */
		__m128i a1 = _mm_loadu_si128((__m128i*)pa);
		__m128i b1 = _mm_loadu_si128((__m128i*)pb);
		__m128i a2 = _mm_loadu_si128((__m128i*)(pa+8));
		__m128i b2 = _mm_loadu_si128((__m128i*)(pb+8));
		/* Multiply, sum and convert to double words */
		__m128i s1 = _mm_madd_epi16(a1,b1);
		__m128i s2 = _mm_madd_epi16(a2,b2);
		pa += 16;
		pb += 16;
		/* Accumulate */
		acc1 = _mm_add_epi32(acc1,s1);
		acc2 = _mm_add_epi32(acc2,s2);
	    }
	}
	/* Final sum */
	acc1 = _mm_add_epi32(acc1,acc2);
	acc1 = _mm_hadd_epi32(acc1,acc1);
	acc1 = _mm_hadd_epi32(acc1,acc1);
	w = _mm_extract_epi32(acc1,0);
    }
    else {
	w = 0;
    }
    for (k=0;k<r;k++)
	w += (*pa++) * (*pb++);
    return w;
}
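A hedged usage sketch for vector_ps_short; the arrays are hypothetical, and with n = 40 both the 16-wide SIMD loop (q = 2) and the scalar tail (r = 8) are exercised:

#include <stdio.h>

int main(void) {
  short a[40], b[40];
  int i;
  for (i = 0; i < 40; ++i) { a[i] = (short)i; b[i] = (short)(2 * i); }
  /* dot product; alignment only selects the load flavour, so
   * unaligned arrays are handled too */
  printf("%d\n", vector_ps_short(a, b, 40));
  return 0;
}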
Example #19
int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz,
                                     int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
    // Check if any values require more than 15 bit
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
Example #20
//
// multiplies two complex vectors and returns the real and imaginary parts 
// as two 32 bit integers.
//
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2, 
				struct complex16* x, int len1, struct complex16* y, int len2 )
{

  const int wlen = 4;  // sizeof(vcs) / sizeof(complex16);
  const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);  // 0x0000FFFF0000FFFF0000FFFF0000FFFF
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  for (int i = 0; i < len1 / wlen; i++){

    /*  vcs *vx = (vcs *)(x + wlen*i);
        vcs *vy = (vcs *)(y + wlen*i);
        vi *reout = (vi *)(re + wlen*i);
        vi *imout = (vi *)(im + wlen*i);

        vcs vs2 = conj0(*vy);

        vs2 = permutate_low<1, 0, 3, 2>(vs2);
        vs2 = permutate_high<1, 0, 3, 2>(vs2);

        *reout = (vcs)muladd(*vx, *vy);
        *imout = (vcs)muladd(*vx, vs2); */

    __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
    __m128i my = _mm_loadu_si128((__m128i *)(y + wlen*i));

    //__m128i ms1 = _mm_sign_epi16(my, conj);
    __m128i ms2 = _mm_xor_si128(my, xmm5);
    ms2 = _mm_add_epi32(ms2, xmm4);

    ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    __m128i mre = _mm_madd_epi16(my, mx);
    __m128i mim = _mm_madd_epi16(ms2, mx);

    _mm_storeu_si128((__m128i *) (re + wlen*i), mre);
    _mm_storeu_si128((__m128i *) (im + wlen*i), mim);
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++){
    re[i] = x[i].re * y[i].re + x[i].im * y[i].im ;
    im[i] = x[i].im * y[i].re - x[i].re * y[i].im ;
  }

  return 0;
}
Example #21
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int  i, dataSize16;
    opus_int32 sum;

    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;

    sum = 0;
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for (i=0;i<dataSize16;i+=16)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32( acc1, acc2 );

    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
    sum += _mm_cvtsi128_si32(acc1);

    for (;i<N;i++) {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
Example #22
        SIMD_INLINE __m128i SquaredDifference(__m128i a, __m128i b)
        {
            const __m128i aLo = _mm_unpacklo_epi8(a, _mm_setzero_si128());
            const __m128i bLo = _mm_unpacklo_epi8(b, _mm_setzero_si128());
            const __m128i dLo = _mm_sub_epi16(aLo, bLo);

            const __m128i aHi = _mm_unpackhi_epi8(a, _mm_setzero_si128());
            const __m128i bHi = _mm_unpackhi_epi8(b, _mm_setzero_si128());
            const __m128i dHi = _mm_sub_epi16(aHi, bHi);

            return _mm_add_epi32(_mm_madd_epi16(dLo, dLo), _mm_madd_epi16(dHi, dHi));
        }
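Note the lane layout: because the low and high byte halves are squared separately and then added, each 32-bit lane of the result mixes a pair of differences from the low half with the matching pair from the high half. A scalar model (the four lanes still sum to the full 16-byte squared difference):

static void squared_difference_c(const uint8_t a[16], const uint8_t b[16],
                                 int32_t out[4]) {
  int i;
  for (i = 0; i < 4; ++i) {
    const int d0 = a[2 * i]     - b[2 * i];
    const int d1 = a[2 * i + 1] - b[2 * i + 1];
    const int d2 = a[2 * i + 8] - b[2 * i + 8];
    const int d3 = a[2 * i + 9] - b[2 * i + 9];
    out[i] = d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3;
  }
}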
Example #23
/* -----------------------------------
 *       weighted_merge_planar
 * -----------------------------------
 */
void weighted_merge_planar_sse2(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i zero = _mm_setzero_si128();
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

      __m128i p0123 = _mm_unpacklo_epi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m128i p4567 = _mm_unpackhi_epi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

      __m128i p01 = _mm_unpacklo_epi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m128i p23 = _mm_unpackhi_epi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m128i p45 = _mm_unpacklo_epi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m128i p67 = _mm_unpackhi_epi8(p4567, zero); //00Y7 00y7 00Y6 00y6

      p01 = _mm_madd_epi16(p01, mask);
      p23 = _mm_madd_epi16(p23, mask);
      p45 = _mm_madd_epi16(p45, mask);
      p67 = _mm_madd_epi16(p67, mask);

      p01 = _mm_add_epi32(p01, round_mask);
      p23 = _mm_add_epi32(p23, round_mask);
      p45 = _mm_add_epi32(p45, round_mask);
      p67 = _mm_add_epi32(p67, round_mask);

      p01 = _mm_srli_epi32(p01, 15);
      p23 = _mm_srli_epi32(p23, 15);
      p45 = _mm_srli_epi32(p45, 15);
      p67 = _mm_srli_epi32(p67, 15);

      p0123 = _mm_packs_epi32(p01, p23);
      p4567 = _mm_packs_epi32(p45, p67);

      __m128i result = _mm_packus_epi16(p0123, p4567);

      _mm_store_si128(reinterpret_cast<__m128i*>(p1+x), result);
    }

    for (int x = wMod16; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
}
Example #24
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int width, const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
Example #25
/* -----------------------------------
 *      weighted_merge_luma_yuy2
 * -----------------------------------
 */
static void weighted_merge_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight)
{
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m128i chroma_mask = _mm_set1_epi16(0xFF00);
#pragma warning(pop)

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x)); //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(luma+x)); //v1 y3 u1 y2 v0 y1 u0 y0

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2); 

      src_lo = _mm_and_si128(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0
      src_hi = _mm_and_si128(src_hi, luma_mask); 

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_luma = _mm_packs_epi32(src_lo, src_hi);

      __m128i result_chroma = _mm_and_si128(px1, chroma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src+x), result);
    }

    for (int x = wMod16; x < width; x+=2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
}
Example #26
 SIMD_INLINE __m128i BgraToGray32(__m128i bgra)
 {
     const __m128i g0a0 = _mm_and_si128(_mm_srli_si128(bgra, 1), K16_00FF);
     const __m128i b0r0 = _mm_and_si128(bgra, K16_00FF);
     const __m128i weightedSum = _mm_add_epi32(_mm_madd_epi16(g0a0, K16_GREEN_0000), _mm_madd_epi16(b0r0, K16_BLUE_RED));
     return _mm_srli_epi32(_mm_add_epi32(weightedSum, K32_ROUND_TERM), Base::BGR_TO_GRAY_AVERAGING_SHIFT);
 }
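Per pixel this is a weighted B/G/R average (the alpha byte is multiplied by zero via K16_GREEN_0000). A parameterized scalar model; the weight and rounding arguments below are hypothetical stand-ins for the Simd library's Base:: constants, which are not shown in this snippet:

/* wb/wg/wr/round/shift are stand-ins for the library's actual constants */
static uint32_t bgra_to_gray_c(uint8_t b, uint8_t g, uint8_t r,
                               int wb, int wg, int wr,
                               int round, int shift) {
  return (uint32_t)((b * wb + g * wg + r * wr + round) >> shift);
}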
Example #27
template<typename T>
static void IDCT_1D_Multi(int16 *in_coeff, T *out_coeff)
{
#if defined(__SSE2__)
{
 for(unsigned col = 0; col < 8; col++)
 {
  __m128i c =  _mm_load_si128((__m128i *)&in_coeff[(col * 8)]);

  for(unsigned x = 0; x < 8; x++)
  {
   __m128i sum;
   __m128i m;
   int32 tmp[4] MDFN_ALIGN(16);

   m = _mm_load_si128((__m128i *)&IDCTMatrix[(x * 8)]);
   sum = _mm_madd_epi16(m, c);
   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6)));
   sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, (1 << 0) | (0 << 2)));

   //_mm_store_ss((float *)&tmp[0], (__m128)sum);
   _mm_store_si128((__m128i*)tmp, sum);

   if(sizeof(T) == 1)
    out_coeff[(col * 8) + x] = Mask9ClampS8((tmp[0] + 0x4000) >> 15);
   else
    out_coeff[(x * 8) + col] = (tmp[0] + 0x4000) >> 15;
  }
 }
}
#endif  /* non-SSE2 fallback elided in this excerpt */
}
Example #28
static WEBP_INLINE __m128i SubtractAndAccumulate(const __m128i a,
        const __m128i b) {
    // take abs(a-b) in 8b
    const __m128i a_b = _mm_subs_epu8(a, b);
    const __m128i b_a = _mm_subs_epu8(b, a);
    const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
    // zero-extend to 16b
    const __m128i C0 = _mm_cvtepu8_epi16(abs_a_b);
    const __m128i C1 = _mm_cvtepu8_epi16(_mm_srli_si128(abs_a_b, 8));
    // multiply with self
    const __m128i D0 = _mm_madd_epi16(C0, C0);
    const __m128i D1 = _mm_madd_epi16(C1, C1);
    // accumulate
    const __m128i sum = _mm_add_epi32(D0, D1);
    return sum;
}
Example #29
static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
                                       const int32_t *wsrc, const int32_t *mask,
                                       const int height) {
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  do {
    const __m128i v_p_b = xx_loadl_32(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
Example #30
static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
    const __m128i zero = _mm_setzero_si128();

    // Load values. Note that we read 8 pixels instead of 4,
    // but the a/b buffers are over-allocated to that effect.
    const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
    const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
    const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
    const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
    const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
    const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
    const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
    const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);

    // Combine pair of lines and convert to 16b.
    const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
    const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
    const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
    const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
    const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
    const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
    const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
    const __m128i b23s = _mm_unpacklo_epi8(b23, zero);

    // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
    // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
    //                  need absolute values, there is no need to do calculation
    //                  in 8bit as we are already in 16bit, ... Yet this is what
    //                  benchmarks the fastest!
    const __m128i d0 = _mm_subs_epu8(a01s, b01s);
    const __m128i d1 = _mm_subs_epu8(b01s, a01s);
    const __m128i d2 = _mm_subs_epu8(a23s, b23s);
    const __m128i d3 = _mm_subs_epu8(b23s, a23s);

    // Square and add them all together.
    const __m128i madd0 = _mm_madd_epi16(d0, d0);
    const __m128i madd1 = _mm_madd_epi16(d1, d1);
    const __m128i madd2 = _mm_madd_epi16(d2, d2);
    const __m128i madd3 = _mm_madd_epi16(d3, d3);
    const __m128i sum0 = _mm_add_epi32(madd0, madd1);
    const __m128i sum1 = _mm_add_epi32(madd2, madd3);
    const __m128i sum2 = _mm_add_epi32(sum0, sum1);

    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, sum2);
    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
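The identity behind the TODO above: for unsigned bytes, saturating subtraction gives sat(a-b) = max(a-b, 0), so exactly one of sat(a-b) and sat(b-a) is nonzero and their sum is |a-b|. Squaring each term with madd and adding the results therefore equals squaring the true difference. A brute-force scalar check over all byte pairs:

#include <assert.h>
#include <stdint.h>

static uint8_t sat_sub_u8(uint8_t a, uint8_t b) { return a > b ? (uint8_t)(a - b) : 0; }

static void check_abs_diff_identity(void) {
  int a, b;
  for (a = 0; a < 256; ++a)
    for (b = 0; b < 256; ++b) {
      const int d0 = sat_sub_u8((uint8_t)a, (uint8_t)b);
      const int d1 = sat_sub_u8((uint8_t)b, (uint8_t)a);
      /* one of d0/d1 is zero, so d0*d0 + d1*d1 == (a-b)^2 */
      assert(d0 * d0 + d1 * d1 == (a - b) * (a - b));
    }
}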