Exemplo n.º 1
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
Exemplo n.º 2
// Returns the sum of the `size` signed bytes starting at `array`.
//
// Full 16-byte blocks are summed with PSADBW: the non-negative bytes and the
// magnitudes of the negative bytes are accumulated separately (PSADBW only
// sums unsigned bytes), then combined. Any trailing bytes (size % 16) are
// added with a scalar loop so the function never reads past array[size - 1].
// Only SSE2 intrinsics are used.
int32_t sse_sadbw_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;
    size_t i = 0;

    // i <= size always holds, so `size - i` cannot wrap.
    for (; size - i >= 16; i += 16) {
        const __m128i v  = _mm_loadu_si128((const __m128i*)(array + i));
        const __m128i m  = _mm_cmplt_epi8(v, zero);   // 0xff where v < 0

        // |v| on the negative lanes only. 0 - v wraps -128 to 0x80, which is
        // 128 when read as an unsigned byte by PSADBW.
        const __m128i va = _mm_and_si128(m, _mm_sub_epi8(zero, v));

        // sum just positive numbers
        const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);

        // sum just negative numbers (as magnitudes)
        const __m128i t1 = _mm_sad_epu8(va, zero);

        positive = _mm_add_epi32(positive, t0);
        negative = _mm_sub_epi32(negative, t1);
    }

    // PSADBW leaves its two partial sums in 32-bit lanes 0 and 2.
    const __m128i accumulator = _mm_add_epi32(positive, negative);
    int32_t total = _mm_cvtsi128_si32(accumulator) +
                    _mm_cvtsi128_si32(_mm_srli_si128(accumulator, 8));

    // Scalar tail for the last size % 16 bytes.
    for (; i < size; ++i) {
        total += array[i];
    }

    return total;
}
Exemplo n.º 3
static int HafCpu_Histogram3Thresholds_DATA_U8
		vx_uint32     dstHist[],
		vx_uint8      distThreshold0,
		vx_uint8      distThreshold1,
		vx_uint8      distThreshold2,
		vx_uint32     srcWidth,
		vx_uint32     srcHeight,
		vx_uint8    * pSrcImage,
		vx_uint32     srcImageStrideInBytes
	// offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
	// thresh: source threshold in -128..127 range
	__m128i offset = _mm_set1_epi8((char)0x80);
	__m128i T0 = _mm_set1_epi8((char)((distThreshold0 - 1) ^ 0x80));
	__m128i T1 = _mm_set1_epi8((char)((distThreshold1 - 1) ^ 0x80));
	__m128i T2 = _mm_set1_epi8((char)((distThreshold2 - 1) ^ 0x80));
	__m128i onemask = _mm_set1_epi8((char)1);
	// process one pixel row at a time that counts "pixel < srcThreshold"
	__m128i count0 = _mm_set1_epi8((char)0);
	__m128i count1 = _mm_set1_epi8((char)0);
	__m128i count2 = _mm_set1_epi8((char)0);
	vx_uint8 * srcRow = pSrcImage;
	vx_uint32 width = (srcWidth + 15) >> 4;
	for (unsigned int y = 0; y < srcHeight; y++) {
		__m128i * src = (__m128i *)srcRow;
		for (unsigned int x = 0; x < width; x++) {
			__m128i pixels = _mm_load_si128(src++);
			pixels = _mm_xor_si128(pixels, offset);
			__m128i cmpout;
			cmpout = _mm_cmpgt_epi8(pixels, T0);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count0 = _mm_add_epi32(count0, cmpout);
			cmpout = _mm_cmpgt_epi8(pixels, T1);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count1 = _mm_add_epi32(count1, cmpout);
			cmpout = _mm_cmpgt_epi8(pixels, T2);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count2 = _mm_add_epi32(count2, cmpout);
		srcRow += srcImageStrideInBytes;
	// extract histogram from count: special case needed when T1 == T2
	dstHist[0] = M128I(count0).m128i_u32[0] + M128I(count0).m128i_u32[2];
	dstHist[1] = M128I(count1).m128i_u32[0] + M128I(count1).m128i_u32[2] - dstHist[0];
	dstHist[2] = M128I(count2).m128i_u32[0] + M128I(count2).m128i_u32[2] - dstHist[0] - dstHist[1];
	dstHist[3] = srcWidth * srcHeight - dstHist[0] - dstHist[1] - dstHist[2];
	if (M128I(T1).m128i_i8[0] == M128I(T2).m128i_i8[0]) {
		dstHist[2] = dstHist[3];
		dstHist[3] = 0;
	return AGO_SUCCESS;
/**
* @brief
*  Compute 8x4 SAD
* @par   Description
*  Compute 8x4 sum of absolute differences between source and reference block
* @param[in] pu1_src
*  Source buffer
* @param[in] pu1_ref
*  Reference buffer
* @param[in] src_strd
*  Source stride
* @param[in] ref_strd
*  Reference stride
* @param[in] wd
*  Assumed to be 8
* @param[in] ht
*  Assumed to be 4
*
* @returns
*  SAD
* @remarks
*/
WORD32 icv_sad_8x4_ssse3(UWORD8 *pu1_src,
                         UWORD8 *pu1_ref,
                         WORD32 src_strd,
                         WORD32 ref_strd,
                         WORD32 wd,
                         WORD32 ht)
    WORD32 sad;
    __m128 src_r0, src_r1;
    __m128 ref_r0, ref_r1;
    __m128i res_r0, res_r1;

    ASSERT(wd == 8);
    ASSERT(ht == 4);

    /* Load source */
    src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
    pu1_src += src_strd;

    src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
    pu1_src += src_strd;

    src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
    pu1_src += src_strd;

    /* Load reference */
    ref_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r0 = _mm_loadh_pi (ref_r0, (__m64 *) (pu1_ref));
    pu1_ref += ref_strd;

    ref_r1 = _mm_loadh_pi (ref_r1, (__m64 *) (pu1_ref));
    pu1_ref += ref_strd;

    /* Compute SAD for each row */
    res_r0 = _mm_sad_epu8((__m128i)src_r0, (__m128i)ref_r0);
    res_r1 = _mm_sad_epu8((__m128i)src_r1, (__m128i)ref_r1);

    /* Accumulate SAD */
    res_r0 = _mm_add_epi64(res_r0,  res_r1);
    res_r0 = _mm_add_epi64(res_r0, _mm_srli_si128(res_r0, 8));

    sad  = _mm_cvtsi128_si32(res_r0);

    return sad;
Exemplo n.º 5
// Predictor11: select.
// For each of the four 32-bit lanes of *A and *B, computes the sum of absolute
// differences of the lane's four bytes and stores the four sums in the
// corresponding 32-bit lanes of *out.
static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
                                 __m128i* const out) {
  // We can unpack with any value on the upper 32 bits, provided it's the same
  // on both operands (so that their sum of abs diff is zero). Here we use *A.
  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
  // Each 64-bit half now holds one pixel pair plus identical padding, so
  // PSADBW yields one per-pixel sum per half.
  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
  // Narrow the four 64-bit sums into four consecutive 32-bit lanes.
  *out = _mm_packs_epi32(s_lo, s_hi);
}
Exemplo n.º 6
static unsigned reg_sad_sse41(const pixel * const data1, const pixel * const data2,
                        const int width, const int height, const unsigned stride1, const unsigned stride2)
  int y, x;
  unsigned sad = 0;
  __m128i sse_inc = _mm_setzero_si128 ();
  long long int sse_inc_array[2];
  for (y = 0; y < height; ++y) {
    for (x = 0; x <= width-16; x+=16) {
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      switch (((width - (width%2)) - x)/2) {
        case 0:
        case 1:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x01)));
        case 2:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x03)));
        case 3:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x07)));
        case 4:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x0f)));
        case 5:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x1f)));
        case 6:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x3f)));
        case 7:
          sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a, _mm_blend_epi16(a, b, 0x7f)));
          //Should not happen
      x = (width - (width%2));

    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
  sad += sse_inc_array[0] + sse_inc_array[1];

  return sad;
Exemplo n.º 7
// Adds the 16 signed bytes of `v` into the split accumulators: the sum of the
// non-negative bytes is added to *positive and the sum of the magnitudes of
// the negative bytes is subtracted from *negative (32-bit lanes 0 and 2).
static inline void sse_sadbw_accumulate16(__m128i v, __m128i* positive,
                                          __m128i* negative) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i m  = _mm_cmplt_epi8(v, zero);   // 0xff where v < 0
    const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
    // |v| on the negative lanes only; 0 - v wraps -128 to 0x80 == 128 unsigned.
    const __m128i va = _mm_and_si128(m, _mm_sub_epi8(zero, v));
    const __m128i t1 = _mm_sad_epu8(va, zero);
    *positive = _mm_add_epi32(*positive, t0);
    *negative = _mm_sub_epi32(*negative, t1);
}

// Returns the sum of the `size` signed bytes starting at `array`.
//
// The main loop consumes 64 bytes (four 16-byte vectors) per iteration using
// PSADBW; the remaining size % 64 bytes are added with a scalar loop so the
// function never reads past array[size - 1]. Only SSE2 intrinsics are used.
int32_t sse_sadbw_unrolled4_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;
    size_t i = 0;

    // i <= size always holds, so `size - i` cannot wrap.
    for (; size - i >= 16*4; i += 16*4) {
        const __m128i v0 = _mm_loadu_si128((const __m128i*)(array + i + 0*16));
        const __m128i v1 = _mm_loadu_si128((const __m128i*)(array + i + 1*16));
        const __m128i v2 = _mm_loadu_si128((const __m128i*)(array + i + 2*16));
        const __m128i v3 = _mm_loadu_si128((const __m128i*)(array + i + 3*16));

        sse_sadbw_accumulate16(v0, &positive, &negative);
        sse_sadbw_accumulate16(v1, &positive, &negative);
        sse_sadbw_accumulate16(v2, &positive, &negative);
        sse_sadbw_accumulate16(v3, &positive, &negative);
    }

    // PSADBW leaves its two partial sums in 32-bit lanes 0 and 2.
    const __m128i accumulator = _mm_add_epi32(positive, negative);
    int32_t total = _mm_cvtsi128_si32(accumulator) +
                    _mm_cvtsi128_si32(_mm_srli_si128(accumulator, 8));

    // Scalar tail for the last size % 64 bytes.
    for (; i < size; ++i) {
        total += array[i];
    }

    return total;
}
Exemplo n.º 8
// Adds predictor #11 to `num_pixels` residuals read from `in` and writes the
// reconstructed pixels to `out`. For each pixel the prediction is either the
// left neighbour L (the previously written output pixel, initially out[-1])
// or the top neighbour T (upper[i]); with TL = upper[i - 1], L is selected
// when sum|L - TL| > sum|T - TL| (per-byte absolute differences), else T.
// Pixels are processed four at a time; any remainder is delegated to
// VP8LPredictorsAdd_C[11].
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
    // L depends on the previous output pixel, so the four pixels of the group
    // have to be resolved one after the other; only lane 0 is used each time.
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (pb > pa)? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
/*
 * Returns the sum of absolute differences between the 16 unsigned bytes of
 * x and the 16 unsigned bytes of y (range 0 .. 16 * 255).
 */
static int diff_abs(__m128i x, __m128i y)
{
	/* PSADBW yields two partial sums, one per 64-bit half. */
	const __m128i z = _mm_sad_epu8(x, y);
	/* Fold the high half onto the low half and read the result directly,
	 * instead of reinterpreting the vector through a uint64_t pointer. */
	const __m128i total = _mm_add_epi64(z, _mm_srli_si128(z, 8));
	return _mm_cvtsi128_si32(total);
}
Exemplo n.º 10
    int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
        int x = 0;

        if( useSIMD )
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= len - 8; x += 8 )
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                __m128i sx = _mm_mullo_epi16(qx, qx);

                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));

                qx = _mm_add_epi16(qx, dx);

            _mm_store_si128((__m128i*)buf, qx0);
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];

        return x;
Exemplo n.º 11
int normL1_(const uchar* a, const uchar* b, int n)
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));

    for( ; j <= n - 4; j += 4 )
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0.0f);
    for ( ; j <= n - 16; j += 16)
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
        for( ; j <= n - 4; j += 4 )
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
Exemplo n.º 12
// Thin wrapper over _mm_sad_epu8; the check lines in the body are matched
// against the compiler's output.
__m128i test_mm_sad_epu8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_sad_epu8
  // DAG: call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
  // ASM-LABEL: test_mm_sad_epu8
  // ASM: psadbw
  return _mm_sad_epu8(A, B);
}
unsigned int vp9_sad3x16_sse2(
  const unsigned char *src_ptr,
  int  src_stride,
  const unsigned char *ref_ptr,
  int  ref_stride) {
  int r;
  __m128i s0, s1, s2, s3;
  __m128i r0, r1, r2, r3;
  __m128i sad = _mm_setzero_si128();
  __m128i mask;
  const int offset = (uintptr_t)src_ptr & 3;

  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
   * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
   * takes much less time.
   */
  if (offset == 1)
    src_ptr -= 1;

  /* mask = 0xffffffffffff0000ffffffffffff0000 */
  mask = _mm_cmpeq_epi32(sad, sad);
  mask = _mm_slli_epi64(mask, 16);

  for (r = 0; r < 16; r += 4) {
    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));

    s0 = _mm_unpacklo_epi8(s0, s1);
    r0 = _mm_unpacklo_epi8(r0, r1);
    s2 = _mm_unpacklo_epi8(s2, s3);
    r2 = _mm_unpacklo_epi8(r2, r3);
    s0 = _mm_unpacklo_epi64(s0, s2);
    r0 = _mm_unpacklo_epi64(r0, r2);

    // throw out extra byte
    if (offset == 1)
      s0 = _mm_and_si128(s0, mask);
      s0 = _mm_slli_epi64(s0, 16);
    r0 = _mm_slli_epi64(r0, 16);

    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));

    src_ptr += src_stride*4;
    ref_ptr += ref_stride*4;

  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));
  return _mm_cvtsi128_si32(sad);
Exemplo n.º 14
// Returns the sum of absolute differences between the 16 bytes at buf1 and
// the 16 bytes at buf2 (one 4x4 block of 8-bit samples each, stored
// contiguously).
// NOTE(review): both buffers are read with aligned loads, so they are assumed
// to be 16-byte aligned — confirm against the callers.
static unsigned sad_8bit_4x4_sse2(const pixel *buf1, const pixel *buf2)
{
  const __m128i *const mbuf1 = (const __m128i *)buf1;
  const __m128i *const mbuf2 = (const __m128i *)buf2;

  __m128i sum = _mm_sad_epu8(_mm_load_si128(mbuf1), _mm_load_si128(mbuf2));

  // PSADBW leaves its two partial sums in 32-bit lanes 0 and 2.
  uint32_t result[4];
  _mm_storeu_si128((__m128i*)result, sum);
  return result[0] + result[2];
}
Exemplo n.º 15
__m64 _m_psadbw(__m64 _MM1, __m64 _MM2)
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_sad_epu8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
/* Returns the sum of absolute differences between a 16-wide, 3-high block of
 * bytes at src_ptr (rows src_stride apart) and the block of the same size at
 * ref_ptr (rows ref_stride apart). Neither pointer needs to be aligned.
 */
unsigned int vp9_sad16x3_sse2(
  const unsigned char *src_ptr,
  int  src_stride,
  const unsigned char *ref_ptr,
  int  ref_stride) {
  __m128i s0, s1, s2;
  __m128i r0, r1, r2;
  __m128i sad;

  s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride));
  s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
  s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));

  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));

  /* The total is at most 3 * 16 * 255 = 12240, so 16-bit adds cannot wrap. */
  sad = _mm_sad_epu8(s0, r0);
  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s1, r1));
  sad = _mm_add_epi16(sad,  _mm_sad_epu8(s2, r2));
  /* Fold the high 64-bit partial sum onto the low one. */
  sad = _mm_add_epi16(sad,  _mm_srli_si128(sad, 8));

  return _mm_cvtsi128_si32(sad);
}
Exemplo n.º 17
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      const __m128i m_inv = _mm_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m128i data_l = _mm_unpacklo_epi8(a, b);
      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi8(a, b);
      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
Exemplo n.º 18
Arquivo: main.cpp Projeto: CCJY/coliru
// Returns the number of zero bytes among the first `length` bytes of `values`.
// Full 16-byte blocks are counted with SSE2; the remaining length % 16 bytes
// are counted one at a time so nothing beyond values[length - 1] is read.
// A non-positive `length` yields 0.
int countZeroBytes_SSE(char* values, int length) {
    int zeroCount = 0;
    int i = 0;
    const __m128i zero16 = _mm_setzero_si128();
    const __m128i and16 = _mm_set1_epi8(1);

    // `length - i >= 16` (rather than `i + 16 <= length`) avoids int overflow.
    for (; length - i >= 16; i += 16) {
        const __m128i values16 = _mm_loadu_si128((const __m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        // Skip the horizontal sum when the block has no zero byte at all.
        if (_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp); // change -1 values to 1
            // horizontal sum of 16 bytes: PSADBW gives one partial sum per
            // 64-bit half, which are then added together
            const __m128i sum = _mm_sad_epu8(cmp, zero16);
            zeroCount += _mm_cvtsi128_si32(sum) +
                         _mm_cvtsi128_si32(_mm_srli_si128(sum, 8));
        }
    }

    // Scalar tail.
    for (; i < length; ++i) {
        if (values[i] == 0) {
            ++zeroCount;
        }
    }
    return zeroCount;
}
Exemplo n.º 19
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
Exemplo n.º 20
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time, this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
Exemplo n.º 21
SSE_FUNCTION static void
sad8x8_u8_sse (uint32_t *dest, uint8_t *src1, int sstr1, uint8_t *src2,
    int sstr2)
  int i;
  __m128i sum = _mm_setzero_si128();
  union m128_int sumi;

  for (i = 0; i < 4; i++) {
    __m128i xmm0, xmm1, xmm2, xmm3;
    xmm0 = _mm_loadl_epi64((__m128i *)src1);
    xmm1 = _mm_loadl_epi64((__m128i *)(src1 + sstr1));
    xmm2 = _mm_loadl_epi64((__m128i *)src2);
    xmm3 = _mm_loadl_epi64((__m128i *)(src2 + sstr2));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm3);
    sum = _mm_add_epi64(sum, _mm_sad_epu8(xmm0, xmm2));
    src1 += 2 * sstr1;
    src2 += 2 * sstr2;
  sumi.m128 = sum;
  *dest = sumi.i[0] + sumi.i[2];
Exemplo n.º 22
unsigned int luma_sse2(const uint8_t *pSrc, intptr_t nSrcPitch) {
    __m128i sum = zeroes;

    for (unsigned y = 0; y < height; y++) {
        for (unsigned x = 0; x < width; x += 16) {
            __m128i src;
            if (width == 4)
                src = _mm_cvtsi32_si128(*(const int *)pSrc);
            else if (width == 8)
                src = _mm_loadl_epi64((const __m128i *)pSrc);
                src = _mm_loadu_si128((const __m128i *)&pSrc[x]);

            sum = _mm_add_epi64(sum, _mm_sad_epu8(src, zeroes));

        pSrc += nSrcPitch;

    if (width >= 16)
        sum = _mm_add_epi64(sum, _mm_srli_si128(sum, 8));

    return (unsigned)_mm_cvtsi128_si32(sum);
Exemplo n.º 23
// Returns the sum of absolute differences between two width x height blocks.
// Row y of the first block starts at data1[y * stride1] and row y of the
// second at data2[y * stride2]. Each row is processed 16 samples at a time
// with PSADBW; the remaining width % 16 samples are handled one by one.
// NOTE(review): the 16-byte loads assume `pixel` is an 8-bit type — confirm
// against its definition.
static unsigned reg_sad_sse2(const pixel * const data1, const pixel * const data2,
                        const int width, const int height, const unsigned stride1, const unsigned stride2)
{
  int y, x;
  unsigned sad = 0;
  __m128i sse_inc = _mm_setzero_si128 ();
  long long int sse_inc_array[2];
  for (y = 0; y < height; ++y) {
    // Vector part: full groups of 16 samples.
    for (x = 0; x <= width-16; x+=16) {
      const __m128i a = _mm_loadu_si128((__m128i const*) &data1[y * stride1 + x]);
      const __m128i b = _mm_loadu_si128((__m128i const*) &data2[y * stride2 + x]);
      sse_inc = _mm_add_epi32(sse_inc, _mm_sad_epu8(a,b));
    }

    // Scalar part: whatever is left of the row.
    for (; x < width; ++x) {
      sad += abs(data1[y * stride1 + x] - data2[y * stride2 + x]);
    }
  }
  // The vector accumulator holds one partial sum per 64-bit half.
  _mm_storeu_si128((__m128i*) sse_inc_array, sse_inc);
  sad += sse_inc_array[0] + sse_inc_array[1];

  return sad;
}
Exemplo n.º 24
// The function assumes that the image pointers are 16 byte aligned, and the source and destination strides as well
// It processes the pixels in a width which is the next highest multiple of 16 after dstWidth
static int HafCpu_Histogram1Threshold_DATA_U8
		vx_uint32     dstHist[],
		vx_uint8      distThreshold,
		vx_uint32     srcWidth,
		vx_uint32     srcHeight,
		vx_uint8    * pSrcImage,
		vx_uint32     srcImageStrideInBytes
	// offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
	// thresh: source threshold in -128..127 range
	__m128i offset = _mm_set1_epi8((char)0x80);
	__m128i thresh = _mm_set1_epi8((char)((distThreshold - 1) ^ 0x80));
	__m128i onemask = _mm_set1_epi8((char)1);
	// process one pixel row at a time that counts "pixel < srcThreshold"
	__m128i count = _mm_set1_epi8((char)0);
	vx_uint8 * srcRow = pSrcImage;
	vx_uint32 width = (srcWidth + 15) >> 4;
	for (unsigned int y = 0; y < srcHeight; y++) {
		__m128i * src = (__m128i *)srcRow;
		for (unsigned int x = 0; x < width; x++) {
			__m128i pixels = _mm_load_si128(src++);
			pixels = _mm_xor_si128(pixels, offset);
			pixels = _mm_cmpgt_epi8(pixels, thresh);
			pixels = _mm_and_si128(pixels, onemask);
			pixels = _mm_sad_epu8(pixels, onemask);
			count = _mm_add_epi32(count, pixels);
		srcRow += srcImageStrideInBytes;
	// extract histogram from count
	dstHist[0] = M128I(count).m128i_u32[0] + M128I(count).m128i_u32[2];
	dstHist[1] = srcWidth * srcHeight - dstHist[0];
	return AGO_SUCCESS;
Exemplo n.º 25
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments )
    typedef uchar T;
    typedef int WT;
    typedef int MT;
    Size size = img.size();
    int y;
    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);

    for( y = 0; y < size.height; y++ )
        const T* ptr = img.ptr<T>(y);
        int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0;

        if( useSIMD )
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= size.width - 8; x += 8 )
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                __m128i px = _mm_mullo_epi16(p, qx);
                __m128i sx = _mm_mullo_epi16(qx, qx);
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));

                qx = _mm_add_epi16(qx, dx);
            int CV_DECL_ALIGNED(16) buf[4];
            _mm_store_si128((__m128i*)buf, qx0);
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];

        for( ; x < size.width; x++ )
            WT p = ptr[x];
            WT xp = x * p, xxp;

            x0 += p;
            x1 += xp;
            xxp = xp * x;
            x2 += xxp;
            x3 += xxp * x;

        WT py = y * x0, sy = y*y;

        mom[9] += ((MT)py) * sy;  // m03
        mom[8] += ((MT)x1) * sy;  // m12
        mom[7] += ((MT)x2) * y;  // m21
        mom[6] += x3;             // m30
        mom[5] += x0 * sy;        // m02
        mom[4] += x1 * y;         // m11
        mom[3] += x2;             // m20
        mom[2] += py;             // m01
        mom[1] += x1;             // m10
        mom[0] += x0;             // m00

    for(int x = 0; x < 10; x++ )
        moments[x] = (double)mom[x];
Exemplo n.º 26
/*
 * Sum of absolute differences (SAD) between two blocks that are 16 bytes wide.
 *
 * p1           first block; every row is read with an aligned 128-bit load,
 *              so p1 (and p1 + k*lx for each row k) must be 16-byte aligned
 * p2           second block; rows are read with unaligned loads
 * lx           distance in bytes between consecutive rows of both blocks
 * distlim      early-out threshold, used only when block_height != 8
 * block_height number of rows to compare
 *
 * Returns the SAD over all rows.  For block_height != 8 the row loop stops as
 * soon as the running sum exceeds distlim, and that partial sum (which is
 * greater than distlim) is returned instead.  The block_height == 8 path
 * always processes all eight rows and ignores distlim.
 */
int dist( unsigned char *p1, unsigned char *p2, int lx, int distlim, int block_height )
{
	if (block_height == 8) {
		/* _mm_sad_epu8 yields one partial sum per 8-byte half; the two
		 * halves are accumulated separately and combined at the end. */
		__m128i r = _mm_setzero_si128();

		for (int row = 0; row < 8; row++) {
			const __m128i a = _mm_load_si128 ((const __m128i*)(p1 + row*lx));
			const __m128i b = _mm_loadu_si128((const __m128i*)(p2 + row*lx));
			r = _mm_add_epi32(r, _mm_sad_epu8(a, b));
		}
		/* 8 rows * 8 bytes * 255 fits in 16 bits, so reading word 0 and
		 * word 4 recovers both partial sums completely. */
		return _mm_extract_epi16(r, 0) + _mm_extract_epi16(r, 4);
	}

	int s = 0;
	for (int i = 0; i < block_height; i++) {
		const __m128i a = _mm_load_si128((const __m128i*)p1);
		const __m128i b = _mm_loadu_si128((const __m128i*)p2);
		const __m128i r = _mm_sad_epu8(a, b);
		s += _mm_extract_epi16(r, 0) + _mm_extract_epi16(r, 4);

		/* Give up early once this candidate is already worse than the limit. */
		if (s > distlim)
			break;

		p1 += lx;
		p2 += lx;
	}
	return s;
}
Exemplo n.º 27
/// Computes the sum of absolute differences (SAD) of the packed unsigned
/// bytes of @p a and @p b by forwarding to _mm_sad_epu8. The upper 8 bytes
/// are reduced to a single sum stored in the low 16 bits of the upper
/// quadword, and the lower 8 bytes to a single sum stored in the low 16 bits
/// of the lower quadword; the remaining bits of the result are zero.
/// @ingroup SIMD
inline xmm_u16 sad(const xmm_u8 &a, const xmm_u8 &b) { return _mm_sad_epu8(a, b); }
Exemplo n.º 28
    mlib_d64 *z,        /* out: receives the sum of |x[i] - y[i]| over all n elements */
    const mlib_s8 *x,   /* in: first vector (elements are compared as signed bytes) */
    const mlib_s8 *y,   /* in: second vector (elements are compared as signed bytes) */
    mlib_s32 n)         /* in: element count; values <= 0 are rejected */
    /* Nothing to sum: report failure without touching *z. */
    if (n <= 0)
        return (MLIB_FAILURE);

    mlib_s32 i, nstep, ax, ay, n1, n2, n3, diff, sum = 0;
    mlib_s8 *px = (mlib_s8 *)x, *py = (mlib_s8 *)y;
    __m128i zero, xbuf, ybuf, zbuf, mext, mbuf;
    zero = _mm_setzero_si128();
    zbuf = zero;        /* vector accumulator: two 64-bit partial sums */

    /*
     * Partition the input into three runs:
     *   n1 - scalar head that advances px to the next 16-byte boundary,
     *   n2 - number of full 16-element vector steps,
     *   n3 - scalar tail left over after the vector steps.
     */
    nstep = 16 / sizeof (mlib_s8);
    ax = (mlib_addr)x & 15;
    ay = (mlib_addr)y & 15;
    n1 = ((16 - ax) & 15) / sizeof (mlib_s8);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        /* Too short for even one vector step: do everything in scalar code. */
        for (i = 0; i < n; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);     /* ABS_VALUE: project macro, presumably |diff| */
        *z = sum;
    } else {
        /* Scalar head: after this loop px is 16-byte aligned. */
        for (i = 0; i < n1; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);
        if (ax == ay) {
            /* x and y share the same misalignment, so py is aligned too and
             * both operands can use aligned loads. */
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_load_si128((__m128i *)py);
                /* mext is all-ones in the byte lanes where y > x (signed). */
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                /* Wrapped 8-bit x - y, then conditional two's-complement
                 * negation ((v ^ m) - m) in the lanes where y > x; read as an
                 * unsigned byte each lane then holds |x - y| (0..255). */
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                /* SAD against zero adds up the unsigned bytes of each
                 * 8-byte half into one value per 64-bit lane. */
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
        } else {
            /* Different misalignment: px is aligned, py must be loaded
             * unaligned.  The arithmetic is identical to the branch above. */
            for (i = 0; i < n2; i++) {
                xbuf = _mm_load_si128((__m128i *)px);
                ybuf = _mm_loadu_si128((__m128i *)py);
                mext = _mm_cmpgt_epi8(ybuf, xbuf);
                mbuf = _mm_sub_epi8(xbuf, ybuf);
                mbuf = _mm_xor_si128(mbuf, mext);
                mbuf = _mm_sub_epi8(mbuf, mext);
                mbuf = _mm_sad_epu8(mbuf, zero);
                zbuf = _mm_add_epi64(zbuf, mbuf);
                px += nstep;
                py += nstep;
        /* Scalar tail: the n3 elements that did not fill a vector step. */
        for (i = 0; i < n3; i++) {
            diff = (mlib_s32)(*px++) - (*py++);
            sum += ABS_VALUE(diff);

        /* Combine the scalar sum with the two 64-bit vector partial sums. */
        mlib_d64 dsum = sum;
        long long pz[2];
        _mm_storeu_si128((__m128i *)pz, zbuf);
        dsum += pz[0];
        dsum += pz[1];
        *z = dsum;
    return (MLIB_SUCCESS);
Exemplo n.º 29
// Builds four reference colors from maxColor/minColor, measures the
// sum-of-absolute-differences distance from every pixel in colorBlock to each
// of them, turns the comparisons into a 2-bit selector per pixel and writes the
// packed selectors as 4 bytes to outData, which is then advanced by 4.
//
// colorBlock : read at byte offsets 0..63 (two loop passes of 32 bytes each);
//              presumably 16 pixels of 4 bytes each - TODO confirm with caller
// minColor   : at least 4 readable bytes (read through an int pointer)
// maxColor   : at least 4 readable bytes (read through an int pointer)
// outData    : in/out cursor; 4 bytes are written, then the pointer moves on
//
// NOTE(review): R_SHUFFLE_D and the SIMD_SSE2_* tables are defined elsewhere;
// comments about what they select are assumptions to be confirmed.
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
	ALIGN16( byte color0[16] );
	ALIGN16( byte color1[16] );
	ALIGN16( byte color2[16] );
	ALIGN16( byte color3[16] );
	ALIGN16( byte result[16] );

	// mov esi, maxColor
	// mov edi, minColor

	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	// Start with an all-zero accumulator for the selector bits.
	t7 = _mm_setzero_si128();
	//t7 = _mm_xor_si128(t7, t7);
	_mm_store_si128 ( (__m128i*) &result, t7 );

	// Only the first 4 bytes of maxColor are loaded; the rest of t0 is zero.
	//t0 = _mm_load_si128 ( (__m128i*)  maxColor );
	t0 = _mm_cvtsi32_si128( *(int*)maxColor);

	// Bitwise AND
	__m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
	t0 = _mm_and_si128(t0, tt);

	// Widen the four color bytes to 16-bit lanes.
	t0 = _mm_unpacklo_epi8(t0, t7);

	// Shifted copies of selected channels are OR-ed back in below; the
	// channel choice is encoded by R_SHUFFLE_D (presumably replicating the
	// masked high bits into the low bits - confirm against the macro).
	t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	// Bitwise Logical OR
	t0 = _mm_or_si128(t0, t4);
	t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565

	// Same treatment for minColor.
	//t1 = _mm_load_si128 ( (__m128i*)  minColor );
	t1 = _mm_cvtsi32_si128( *(int*)minColor);

	t1 = _mm_and_si128(t1, tt);

	t1 = _mm_unpacklo_epi8(t1, t7);

	t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	t1 = _mm_or_si128(t1, t4);
	t1 = _mm_or_si128(t1, t5);  // t1 contains color1 in 565

	// color0: t0 narrowed back to bytes; the shuffle presumably copies the
	// low quadword into both halves so either pixel slot can be compared.
	t2 = t0;

	t2 = _mm_packus_epi16(t2, t7);

	t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color0, t2 );

	// color2 is derived from 2*t0 + t1, scaled by the div_by_3 table below.
	t6 = t0;
	t6 = _mm_add_epi16(t6, t0);
	t6 = _mm_add_epi16(t6, t1);

	// Multiply Packed Signed Integers and Store High Result
	// (SIMD_SSE2_word_div_by_3 presumably holds a fixed-point 1/3 - confirm.)
	__m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
	t6 = _mm_mulhi_epi16(t6, tw3);
	t6 = _mm_packus_epi16(t6, t7);

	t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color2, t6 );

	// color1: t1 narrowed back to bytes and replicated like color0.
	t3 = t1;
	t3 = _mm_packus_epi16(t3, t7);
	t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color1, t3 );

	// color3 is derived from t0 + 2*t1, scaled by the same table.
	t1 = _mm_add_epi16(t1, t1);
	t0 = _mm_add_epi16(t0, t1);

	t0 = _mm_mulhi_epi16(t0, tw3);
	t0 = _mm_packus_epi16(t0, t7);

	t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color3, t0 );

	__m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
	__m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
	__m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

	    // mov eax, 32
	    // mov esi, colorBlock
	// Two passes: x == 32 first (bytes 32..63), then x == 0 (bytes 0..31).
	int x = 32;
	//const byte *c = colorBlock;
	while (x >= 0)
	    // Each 8-byte load is rearranged by the shuffle (presumably so that
	    // every 64-bit half holds one 4-byte pixel) before the per-half SAD.
	    t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
	    t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t0 = t3;
	    t6 = t5;
	    // Compute Sum of Absolute Difference
	    __m128i c0 = _mm_load_si128 ( (__m128i*)  color0 );
	    t0 = _mm_sad_epu8(t0, c0);
	    t6 = _mm_sad_epu8(t6, c0);
	    // Pack with Signed Saturation
	    t0 = _mm_packs_epi32 (t0, t6);

	    // Distances of the same loads to color1.
	    t1 = t3;
	    t6 = t5;
	    __m128i c1 = _mm_load_si128 ( (__m128i*)  color1 );
	    t1 = _mm_sad_epu8(t1, c1);
	    t6 = _mm_sad_epu8(t6, c1);
	    t1 = _mm_packs_epi32 (t1, t6);

	    // Distances to color2.
	    t2 = t3;
	    t6 = t5;
	    __m128i c2 = _mm_load_si128 ( (__m128i*)  color2 );
	    t2 = _mm_sad_epu8(t2, c2);
	    t6 = _mm_sad_epu8(t6, c2);
	    t2 = _mm_packs_epi32 (t2, t6);

	    // Distances to color3 (t3/t5 are consumed here).
	    __m128i c3 = _mm_load_si128 ( (__m128i*)  color3 );
	    t3 = _mm_sad_epu8(t3, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t3 = _mm_packs_epi32 (t3, t5);

	    // Second half of this 32-byte pass: bytes x+16 .. x+31.
	    t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
	    t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    // The second pack of each group narrows the distances to eight
	    // 16-bit lanes, one per pixel of this pass.
	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c0);
	    t7 = _mm_sad_epu8(t7, c0);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t0 = _mm_packs_epi32 (t0, t6);  // d0

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c1);
	    t7 = _mm_sad_epu8(t7, c1);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t1 = _mm_packs_epi32 (t1, t6);  // d1

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c2);
	    t7 = _mm_sad_epu8(t7, c2);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t2 = _mm_packs_epi32 (t2, t6);  // d2

	    t4 = _mm_sad_epu8(t4, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t4 = _mm_packs_epi32 (t4, t5);
	    t3 = _mm_packs_epi32 (t3, t4);  // d3

	    // Reload the selector bits accumulated so far (t7 was used as a
	    // scratch register above).
	    t7 = _mm_load_si128 ( (__m128i*) result );

	    // Make room for this pass: bits from the previous pass move up 16.
	    t7 = _mm_slli_epi32( t7, 16);

	    t4 = t0;
	    t5 = t1;
	    // Compare Packed Signed Integers for Greater Than
	    t0 = _mm_cmpgt_epi16(t0, t3); // b0
	    t1 = _mm_cmpgt_epi16(t1, t2); // b1
	    t4 = _mm_cmpgt_epi16(t4, t2); // b2
	    t5 = _mm_cmpgt_epi16(t5, t3); // b3
	    t2 = _mm_cmpgt_epi16(t2, t3); // b4

	    t4 = _mm_and_si128(t4, t1); // x0
	    t5 = _mm_and_si128(t5, t0); // x1
	    t2 = _mm_and_si128(t2, t0); // x2

	    // Selector per lane: x2 masked with w1, (x0 | x1) masked with w2
	    // (w1/w2 presumably hold the constants 1 and 2 - confirm).
	    t4 = _mm_or_si128(t4, t5);
	    t2 = _mm_and_si128(t2, w1);
	    t4 = _mm_and_si128(t4, w2);
	    t2 = _mm_or_si128(t2, t4);

	    t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

	    // Unpack Low Data
	    t2 = _mm_unpacklo_epi16 ( t2, w0);
	    t5 = _mm_unpacklo_epi16 ( t5, w0);

	    //t5 = _mm_slli_si128 ( t5, 8);
	    t5 = _mm_slli_epi32( t5, 8);

	    t7 = _mm_or_si128(t7, t5);
	    t7 = _mm_or_si128(t7, t2);

	    _mm_store_si128 ( (__m128i*) &result, t7 );

	    x -=32;

	// Fold the four 32-bit lanes of t7 into one: rotated copies are shifted
	// by 2, 4 and 6 bits and OR-ed together so the 2-bit selectors interleave.
	t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
	t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
	t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

	t4 = _mm_slli_epi32 ( t4, 2);
	t5 = _mm_slli_epi32 ( t5, 4);
	t6 = _mm_slli_epi32 ( t6, 6);

	t7 = _mm_or_si128(t7, t4);
	t7 = _mm_or_si128(t7, t5);
	t7 = _mm_or_si128(t7, t6);

	//_mm_store_si128 ( (__m128i*) outData, t7 );

	// Only the low 32 bits are emitted; memcpy avoids an unaligned store.
	int r = _mm_cvtsi128_si32 (t7);
	memcpy(outData, &r, 4);   // Anything better ?

	outData += 4;
* @brief
*  Performs spatial edge adaptive filtering
* @par   Description
*  Performs spatial edge adaptive filtering by detecting edge direction
* @param[in] pu1_src
*  Source buffer
* @param[out] pu1_out
*  Destination buffer
* @param[in] src_strd
*  Source stride
* @param[in] out_strd
*  Destination stride

* @returns
* None
* @remarks
/* Estimates an edge direction for each half of a sub-block from row-to-row
 * SADs, then writes output rows that average the source rows above and below
 * along the chosen direction. */
void ideint_spatial_filter_ssse3(UWORD8 *pu1_src,
                           UWORD8 *pu1_out,
                           WORD32 src_strd,
                           WORD32 out_strd)
    WORD32 i;

    WORD32 adiff[6];    /* [0..2]: low-half diffs, [3..5]: high-half diffs */
    WORD32 *pi4_diff;
    WORD32 shifts[2];   /* horizontal shift chosen for each half: -1, 0 or +1 */
    WORD32 dir_45_le_90, dir_45_le_135, dir_135_le_90;

    __m128i row1_0, row1_m1, row1_p1;
    __m128i row2_0, row2_m1, row2_p1;
    __m128i diff, diffs[3];
    __m128i zero;

    /* Direction detection                                           */

    zero = _mm_setzero_si128();
    diffs[0] = _mm_setzero_si128();
    diffs[1]  = _mm_setzero_si128();
    diffs[2] = _mm_setzero_si128();

    /* Load source */
    /* Three 8-byte views of the first row: shifted one pixel left, centred,
     * and shifted one pixel right. */
    row1_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1));
    row1_0  = _mm_loadl_epi64((__m128i *) (pu1_src));
    row1_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1));
    pu1_src += src_strd;

    /* Unpack to 16 bits */
    /* After widening, each 64-bit half holds 4 pixels, so _mm_sad_epu8 below
     * produces one sum per group of 4 pixels. */
    row1_m1 = _mm_unpacklo_epi8(row1_m1, zero);
    row1_0  = _mm_unpacklo_epi8(row1_0,  zero);
    row1_p1 = _mm_unpacklo_epi8(row1_p1, zero);

    /* Calculating the difference along each of the 3 directions.    */
    for(i = 0; i < SUB_BLK_HT; i ++)
        row2_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1));
        row2_0  = _mm_loadl_epi64((__m128i *) (pu1_src));
        row2_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1));
        pu1_src += src_strd;

        /* Unpack to 16 bits */
        row2_m1 = _mm_unpacklo_epi8(row2_m1, zero);
        row2_0  = _mm_unpacklo_epi8(row2_0,  zero);
        row2_p1 = _mm_unpacklo_epi8(row2_p1, zero);

        /* diffs[0]: previous row vs. current row at the same column. */
        diff    = _mm_sad_epu8(row1_0, row2_0);
        diffs[0]  = _mm_add_epi64(diffs[0], diff);

        /* diffs[1]: previous row shifted left vs. current row shifted right. */
        diff    = _mm_sad_epu8(row1_m1, row2_p1);
        diffs[1] = _mm_add_epi64(diffs[1], diff);

        /* diffs[2]: previous row shifted right vs. current row shifted left. */
        diff    = _mm_sad_epu8(row1_p1, row2_m1);
        diffs[2]  = _mm_add_epi64(diffs[2], diff);

        /* The current row becomes the previous row of the next iteration. */
        row1_m1 = row2_m1;
        row1_0 = row2_0;
        row1_p1 = row2_p1;
    /* Revert pu1_src increment */
    /* One stride was added before the loop and SUB_BLK_HT inside it. */
    pu1_src -= (SUB_BLK_HT + 1) * src_strd;

    /* Low quadwords feed entries 0..2, high quadwords feed entries 3..5. */
    adiff[0] = _mm_cvtsi128_si32(diffs[0]);
    adiff[1] = _mm_cvtsi128_si32(diffs[1]);
    adiff[2] = _mm_cvtsi128_si32(diffs[2]);
    adiff[3] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[0], 8));
    adiff[4] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[1], 8));
    adiff[5] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[2], 8));
    pi4_diff = adiff;

    /* One pass per half; pi4_diff steps through adiff three entries at a time. */
    for(i = 0; i < 2; i++)
        /* Applying bias, to make the diff comparison more robust.      */
        pi4_diff[0] *= EDGE_BIAS_0;
        pi4_diff[1] *= EDGE_BIAS_1;
        pi4_diff[2] *= EDGE_BIAS_1;

        /* comparing the diffs */
        dir_45_le_90  = (pi4_diff[2] <= pi4_diff[0]);
        dir_45_le_135 = (pi4_diff[2] <= pi4_diff[1]);
        dir_135_le_90 = (pi4_diff[1] <= pi4_diff[0]);

        /* Direction selection. */
        /* shifts[i] stays 0 unless the comparisons below select +1 or -1. */
        shifts[i] = 0;
        if(1 == dir_45_le_135)
            if(1 == dir_45_le_90)
                shifts[i] = 1;
            if(1 == dir_135_le_90)
                shifts[i] = -1;
        pi4_diff += 3;
    /* Directional interpolation */
    /* Each iteration consumes two source strides and emits two output rows. */
    for(i = 0; i < SUB_BLK_HT / 2; i++)
        __m128i dst;
        __m128i row1, row2;

        UWORD32 *pu4_row1th, *pu4_row1tl;
        UWORD32 *pu4_row2th, *pu4_row2tl;
        UWORD32 *pu4_row1bh, *pu4_row1bl;
        UWORD32 *pu4_row2bh, *pu4_row2bl;

        /* Upper source rows are read displaced by +shift; the group at
         * offset 0 uses shifts[0], the group at SUB_BLK_WD uses shifts[1]. */
        pu4_row1th  = (UWORD32 *)(pu1_src + shifts[0]);
        pu4_row1tl  = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]);

        pu1_src += src_strd;
        pu4_row2th  = (UWORD32 *)(pu1_src + shifts[0]);
        pu4_row2tl  = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]);

        /* Lower source rows are read displaced by -shift. */
        pu4_row1bh  = (UWORD32 *)(pu1_src - shifts[0]);
        pu4_row1bl  = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]);

        pu1_src += src_strd;
        pu4_row2bh  = (UWORD32 *)(pu1_src - shifts[0]);
        pu4_row2bl  = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]);

        /* High quadword: first output row; low quadword: second output row. */
        row1 = _mm_set_epi32(*pu4_row1tl, *pu4_row1th, *pu4_row2tl, *pu4_row2th);
        row2 = _mm_set_epi32(*pu4_row1bl, *pu4_row1bh, *pu4_row2bl, *pu4_row2bh);

        /* Byte-wise average of the displaced upper and lower rows. */
        dst = _mm_avg_epu8(row1, row2);

        /* Store the high quadword (8 bytes) first ... */
        _mm_storel_epi64((__m128i *)pu1_out, _mm_srli_si128(dst, 8));
        pu1_out += out_strd;

        /* ... then the low quadword as the following output row. */
        _mm_storel_epi64((__m128i *)pu1_out, dst);
        pu1_out += out_strd;