Example #1
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
                                  uint8_t* u, uint8_t* v,
                                  int src_width, int do_store) {
  const int max_width = src_width & ~31;
  int i;
  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
    __m128i rgb[6], U0, V0, U1, V1;
    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);

    RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);

    U0 = _mm_packus_epi16(U0, U1);
    V0 = _mm_packus_epi16(V0, V1);
    if (!do_store) {
      const __m128i prev_u = LOAD_16(u);
      const __m128i prev_v = LOAD_16(v);
      U0 = _mm_avg_epu8(U0, prev_u);
      V0 = _mm_avg_epu8(V0, prev_v);
    }
    STORE_16(U0, u);
    STORE_16(V0, v);
  }
  if (i < src_width) {  // left-over
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
    * There's no pixel to the left of the first pixel.  Luckily, it's
    * predicted to be half of the pixel above it.  So again, this works
    * perfectly with our loop if we make sure a starts at zero.
    */

   png_size_t rb;

   const __m128i zero = _mm_setzero_si128();

   __m128i    b;
   __m128i a, d = zero;

   png_debug(1, "in png_read_filter_row_avg3_sse2");
   rb = row_info->rowbytes;
   while (rb >= 4) {
      __m128i avg;
             b = load4(prev);
      a = d; d = load4(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a,b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
                                            _mm_set1_epi8(1)));
      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
   if (rb > 0) {
      __m128i avg;
             b = load3(prev);
      a = d; d = load3(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a,b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
                                            _mm_set1_epi8(1)));

      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
}
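The fix-up above relies on the identity (a + b) >> 1 == ((a + b + 1) >> 1) - ((a ^ b) & 1): _mm_avg_epu8 computes the rounded-up average, and the correction subtracts 1 exactly when a and b differ in parity. A minimal scalar check of that identity (a sketch, not libpng code):

#include <assert.h>
#include <stdint.h>

/* Scalar model of the SIMD fix-up: rounded-up average minus parity correction. */
static uint8_t trunc_avg(uint8_t a, uint8_t b)
{
   uint8_t rounded_up = (uint8_t)((a + b + 1) >> 1); /* what pavgb computes */
   return (uint8_t)(rounded_up - ((a ^ b) & 1));     /* undo the round-up   */
}

int main(void)
{
   int a, b;
   for (a = 0; a < 256; ++a)
      for (b = 0; b < 256; ++b)
         assert(trunc_avg((uint8_t)a, (uint8_t)b) == ((a + b) >> 1));
   return 0;
}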
Example #3
void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
                            int height, const uint8_t *ref, int ref_stride) {
  /* comp_pred and pred must be 16 byte aligned. */
  assert(((intptr_t)comp_pred & 0xf) == 0);
  assert(((intptr_t)pred & 0xf) == 0);
  if (width > 8) {
    int x, y;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; x += 16) {
        const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
        const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
        const __m128i avg = _mm_avg_epu8(p, r);
        _mm_store_si128((__m128i *)(comp_pred + x), avg);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else {  // width must be 4 or 8.
    int i;
    // Process 16 elements at a time. comp_pred and pred have width == stride
    // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16 are
    // all divisible by 16 so just ref needs to be massaged when loading.
    for (i = 0; i < width * height; i += 16) {
      const __m128i p = _mm_load_si128((const __m128i *)pred);
      __m128i r;
      __m128i avg;
      if (width == ref_stride) {
        r = _mm_loadu_si128((const __m128i *)ref);
        ref += 16;
      } else if (width == 4) {
        r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride),
                          loadu_uint32(ref + 2 * ref_stride),
                          loadu_uint32(ref + ref_stride), loadu_uint32(ref));

        ref += 4 * ref_stride;
      } else {
        const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
        assert(width == 8);
        r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
                                          (const __m64 *)(ref + ref_stride)));

        ref += 2 * ref_stride;
      }
      avg = _mm_avg_epu8(p, r);
      _mm_store_si128((__m128i *)comp_pred, avg);

      pred += 16;
      comp_pred += 16;
    }
  }
}
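A minimal hypothetical caller for the routine above (not from libvpx; names and sizes are illustrative). It exercises the documented constraints: comp_pred and pred are contiguous and 16-byte aligned, width * height is a multiple of 16, and ref keeps its own stride.

#include <stdalign.h>
#include <stdint.h>

void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
                            int height, const uint8_t *ref, int ref_stride);

/* Hypothetical 8x8 driver. */
void comp_avg_8x8_example(uint8_t *out, const uint8_t *pred_src,
                          const uint8_t *ref, int ref_stride) {
  alignas(16) uint8_t pred[8 * 8];
  alignas(16) uint8_t comp_pred[8 * 8];
  int i;
  for (i = 0; i < 8 * 8; ++i) pred[i] = pred_src[i];  /* contiguous prediction */
  vpx_comp_avg_pred_sse2(comp_pred, pred, 8, 8, ref, ref_stride);
  for (i = 0; i < 8 * 8; ++i) out[i] = comp_pred[i];
}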
Example #4
template <int bpp>
void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {
    // The Avg filter predicts each pixel as the (truncated) average of a and b.
    // There's no pixel to the left of the first pixel.  Luckily, it's
    // predicted to be half of the pixel above it.  So again, this works
    // perfectly with our loop if we make sure a starts at zero.
    const __m128i zero = _mm_setzero_si128();
    __m128i    b;
    __m128i a, d = zero;

    int rb = row_info->rowbytes;
    while (rb > 0) {
        b = load<bpp>(prev);
        a = d;
        d = load<bpp>(row );

        // PNG requires a truncating average here, so sadly we can't just use _mm_avg_epu8...
        __m128i avg = _mm_avg_epu8(a,b);
        // ...but we can fix it up by subtracting off 1 if it rounded up.
        avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1)));

        d = _mm_add_epi8(d, avg);
        store<bpp>(row, d);

        prev += bpp;
        row  += bpp;
        rb   -= bpp;
    }
}
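The load<bpp>/store<bpp> helpers are defined elsewhere in the original file; a plausible minimal form (an assumption for illustration, not Skia's actual helpers) moves exactly bpp bytes per pixel through the low lanes of an XMM register:

#include <cstdint>
#include <cstring>
#include <emmintrin.h>

// Hypothetical helpers: bpp is 3 or 4 (RGB or RGBA rows).
template <int bpp> static __m128i load(const void* p) {
    uint32_t v = 0;
    std::memcpy(&v, p, bpp);                // read only bpp bytes
    return _mm_cvtsi32_si128((int)v);
}
template <int bpp> static void store(void* p, __m128i v) {
    uint32_t r = (uint32_t)_mm_cvtsi128_si32(v);
    std::memcpy(p, &r, bpp);                // write only bpp bytes
}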
Example #5
/** Average each 2x2 pixels into 1x1 pixel (arithmetic average)
  *  - <b>Input format:</b> uint8_t, 1 channel
  *  - <b>Output format:</b> uint8_t, 1 channel
  *  - <b>Preconditions:</b> in & out aligned to 16bytes, w = k*16 (w=width in pixels), widthStep=w*1
  *  - <b>Notes:</b>
  *  - <b>Requires:</b> SSE2
  *  - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalfSmooth()
  */
void image_SSE2_scale_half_smooth_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
	MRPT_ALIGN16 const unsigned long long mask[2] = {0x00FF00FF00FF00FFull, 0x00FF00FF00FF00FFull};
	const uint8_t* nextRow = in + w;
	__m128i m = _mm_load_si128((const __m128i*)mask);
	int sw = w >> 4;
	int sh = h >> 1;

	for (int i=0; i<sh; i++)
	{
		for (int j=0; j<sw; j++)
		{
			__m128i here = _mm_load_si128((const __m128i*)in);
			__m128i next = _mm_load_si128((const __m128i*)nextRow);
			here = _mm_avg_epu8(here,next);
			next = _mm_and_si128(_mm_srli_si128(here,1), m);
			here = _mm_and_si128(here,m);
			here = _mm_avg_epu16(here, next);
			_mm_storel_epi64((__m128i*)out, _mm_packus_epi16(here,here));
			in += 16;
			nextRow += 16;
			out += 8;
		}

		in += w;
		nextRow += w;
	}
}
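For reference, a plain scalar version of the same 2x2 box downscale (an illustrative sketch, not part of MRPT). The SSE2 path above rounds twice via pavgb/pavgw, so individual outputs can differ from this exact form by at most 1:

#include <stdint.h>

/* Scalar model: each output pixel is the average of a 2x2 input block,
 * assuming input stride == w and output stride == w/2. */
void scale_half_smooth_1c8u_scalar(const uint8_t* in, uint8_t* out, int w, int h)
{
	for (int y = 0; y < h / 2; y++)
	{
		const uint8_t* r0 = in + (2 * y) * w;
		const uint8_t* r1 = r0 + w;
		for (int x = 0; x < w / 2; x++)
			out[y * (w / 2) + x] = (uint8_t)(
				(r0[2 * x] + r0[2 * x + 1] + r1[2 * x] + r1[2 * x + 1] + 2) >> 2);
	}
}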
Example #6
__m128i test_mm_avg_epu8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_avg_epu8
  // DAG: call <16 x i8> @llvm.x86.sse2.pavg.b
  //
  // ASM-LABEL: test_mm_avg_epu8
  // ASM: pavgb
  return _mm_avg_epu8(A, B);
}
Example #7
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
void read_luma_inter_pred_avg_8x16_intrinsic( BYTE *address1, BYTE *address2, INT stride_src, BYTE *dst, INT stride_dst )
{
	int i;
	int src_stride = stride_src;
	int dst_stride = stride_dst;
	const unsigned char* src1 = address1;
	const unsigned char* src2 = address2;

	for( i = 0; i < 16; i+=8)
	{
		__declspec(align(16)) __m128i r0, r1, r2, r3, r4, r5, r6, r7,
			r0_x, r1_x, r2_x, r3_x, r4_x, r5_x, r6_x, r7_x;
		int stride2 = (src_stride<<1);
		int stride4 = (src_stride<<2);
		int dst_stride2 = (dst_stride<<1);
		int dst_stride4 = (dst_stride<<2);
		r0 = _mm_loadl_epi64((__m128i*)(src1));
		r1 = _mm_loadl_epi64((__m128i*)(src1+src_stride));
		r2 = _mm_loadl_epi64((__m128i*)(src1+stride2));
		r3 = _mm_loadl_epi64((__m128i*)(src1+stride2+src_stride));
		r4 = _mm_loadl_epi64((__m128i*)(src1+stride4));
		r5 = _mm_loadl_epi64((__m128i*)(src1+stride4+src_stride));
		r6 = _mm_loadl_epi64((__m128i*)(src1+stride4+stride2));
		r7 = _mm_loadl_epi64((__m128i*)(src1+stride4+stride2+src_stride));
		r0_x = _mm_loadl_epi64((__m128i*)(src2));
		r1_x = _mm_loadl_epi64((__m128i*)(src2+src_stride));
		r2_x = _mm_loadl_epi64((__m128i*)(src2+stride2));
		r3_x = _mm_loadl_epi64((__m128i*)(src2+stride2+src_stride));
		r4_x = _mm_loadl_epi64((__m128i*)(src2+stride4));
		r5_x = _mm_loadl_epi64((__m128i*)(src2+stride4+src_stride));
		r6_x = _mm_loadl_epi64((__m128i*)(src2+stride4+stride2));
		r7_x = _mm_loadl_epi64((__m128i*)(src2+stride4+stride2+src_stride));
		r0 = _mm_avg_epu8(r0, r0_x);
		r1 = _mm_avg_epu8(r1, r1_x);
		r2 = _mm_avg_epu8(r2, r2_x);
		r3 = _mm_avg_epu8(r3, r3_x);
		r4 = _mm_avg_epu8(r4, r4_x);
		r5 = _mm_avg_epu8(r5, r5_x);
		r6 = _mm_avg_epu8(r6, r6_x);
		r7 = _mm_avg_epu8(r7, r7_x);
		_mm_storel_epi64((__m128i*)(dst), r0);
		_mm_storel_epi64((__m128i*)(dst+dst_stride), r1);
		_mm_storel_epi64((__m128i*)(dst+dst_stride2), r2);
		_mm_storel_epi64((__m128i*)(dst+dst_stride2+dst_stride), r3);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4), r4);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride), r5);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride2), r6);
		_mm_storel_epi64((__m128i*)(dst+dst_stride4+dst_stride2+dst_stride), r7);
		src1 += (stride4<<1);
		src2 += (stride4<<1);
		dst += (dst_stride4<<1);
	}
}
Example #9
static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
                                        __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128(a0);
  const __m128i A1 = _mm_cvtsi32_si128(a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
Example #10
static inline __m128i calculate_pixel_avg(const __m128i rb,
   const __m128i prb, __m128i pixel, const __m128i mask)
{
   __m128i round;

   round = _mm_xor_si128(prb, pixel);
   pixel = _mm_avg_epu8(pixel, prb);
   round = _mm_and_si128(round, mask);
   pixel = _mm_sub_epi8(pixel, round);
   return _mm_add_epi8(pixel, rb);
}
 __m64 interpolvline_1(	unsigned char* image,	int PicWidthInPix){
	
	 
	 
	__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;

	__m64 ret;

	xmm7 = _mm_setzero_si128();

	xmm0 = _mm_movpi64_epi64(*((__m64*)(image - 2*PicWidthInPix)));
	xmm0 = _mm_unpacklo_epi8(xmm0,xmm7);
	xmm1 = _mm_movpi64_epi64(*((__m64*)(image - 1*PicWidthInPix)));
	xmm1 = _mm_unpacklo_epi8(xmm1,xmm7);
	xmm2 = _mm_movpi64_epi64(*((__m64*)(image - 0*PicWidthInPix)));
	xmm2 = _mm_unpacklo_epi8(xmm2,xmm7);
	xmm3 = _mm_movpi64_epi64(*((__m64*)(image + 1*PicWidthInPix)));
	xmm3 = _mm_unpacklo_epi8(xmm3,xmm7);
	xmm4 = _mm_movpi64_epi64(*((__m64*)(image + 2*PicWidthInPix)));
	xmm4 = _mm_unpacklo_epi8(xmm4,xmm7);
	xmm5 = _mm_movpi64_epi64(*((__m64*)(image + 3*PicWidthInPix)));
	xmm5 = _mm_unpacklo_epi8(xmm5,xmm7);

// filter on 8 values
	xmm6 = _mm_add_epi16(xmm2,xmm3);
	xmm6 = _mm_slli_epi16(xmm6,2);
	xmm6 = _mm_sub_epi16(xmm6,xmm1);
	xmm6 = _mm_sub_epi16(xmm6,xmm4);

	xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
	xmm6 = _mm_mullo_epi16(xmm6,xmm1);
	xmm6 = _mm_add_epi16(xmm6,xmm0);
	xmm6 = _mm_add_epi16(xmm6,xmm5);
	xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
	xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
	xmm6 = _mm_srli_epi16(xmm6,5);

	xmm2 = _mm_packus_epi16(xmm2,xmm7);
	xmm3 = _mm_packus_epi16(xmm3,xmm7);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);

	xmm5 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm5);

	xmm6 = _mm_slli_epi16(xmm6,8);
	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);
	
	ret = _mm_movepi64_pi64(xmm6);
	_mm_empty(); 

	return(ret);
}
Example #12
__m64 _m_pavgb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_avg_epu8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example #13
void avg (int simd, uint8_t *src , uint8_t *src2, uint8_t *dst) {
  if (simd) {
    (*(__m128i*)dst) = _mm_avg_epu8((*(__m128i*)src), (*(__m128i*)src2));
  }
  else {
    int size = 16;
    while (size) {
      *dst = ((*src) + (*src2) + 1) / 2;  /* rounded average, matching _mm_avg_epu8 */
      src++; src2++; dst++; size--;
    }
  }
}
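A small illustrative driver for the helper above (names and data are arbitrary). Both branches compute the rounded average (a + b + 1) >> 1 on 16 bytes, so their outputs must match:

#include <stdalign.h>
#include <stdint.h>
#include <stdio.h>

void avg(int simd, uint8_t *src, uint8_t *src2, uint8_t *dst);

int main(void) {
  /* 16-byte aligned buffers: the SIMD branch dereferences __m128i* directly. */
  alignas(16) uint8_t a[16], b[16], out_simd[16], out_scalar[16];
  int i;
  for (i = 0; i < 16; i++) { a[i] = (uint8_t)(i * 7); b[i] = (uint8_t)(255 - i); }
  avg(1, a, b, out_simd);
  avg(0, a, b, out_scalar);
  for (i = 0; i < 16; i++)
    if (out_simd[i] != out_scalar[i]) { puts("mismatch"); return 1; }
  puts("ok");
  return 0;
}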
 __m64 interpolhline_1(unsigned char* image){

	__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
	unsigned char* imagetmp = image - 2;
	__m64 ret;

	xmm7 = _mm_setzero_si128();
	xmm6 = _mm_loadu_si128(((__m128i*)imagetmp));


	xmm0 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm1 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm2 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm3 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm5 = _mm_unpacklo_epi8(xmm6,xmm7);

// filter on 8 values
	xmm6 = _mm_add_epi16(xmm2,xmm3);//(C + D)
	xmm6 = _mm_slli_epi16(xmm6,2);//(C + D) << 2
	xmm6 = _mm_sub_epi16(xmm6,xmm1);//((C + D) << 2) - B
	xmm6 = _mm_sub_epi16(xmm6,xmm4);//((C + D) << 2) - B - E

	xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
	xmm6 = _mm_mullo_epi16(xmm6,xmm1);//(((C + D) << 2) - B - E) * 5
	xmm6 = _mm_add_epi16(xmm6,xmm0);//((((C + D) << 2) - B - E) * 5) + A
	xmm6 = _mm_add_epi16(xmm6,xmm5);//((((C + D) << 2) - B - E) * 5) + A + F
	xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));//((((C + D) << 2) - B - E) * 5) + A + F + 16
	xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values Clip255_16
	xmm6 = _mm_srli_epi16(xmm6,5); // result0 >> 5

	xmm2 = _mm_packus_epi16(xmm2,xmm7);
	xmm3 = _mm_packus_epi16(xmm3,xmm7);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);

	xmm5 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm5);

	xmm6 = _mm_slli_epi16(xmm6,8);
	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);

	ret = _mm_movepi64_pi64(xmm6);
	_mm_empty(); 

	return(ret);
}
Example #15
ppm_t* img_fast_bw(ppm_t* src, uint64_t* cycles)
{
	int i = 0, j = 0;
	
	uint8_t* r = src->r;
	uint8_t* g = src->g;
	uint8_t* b = src->b;
	int ni = 0;
	
	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5;
	
	uint64_t t0 = readTSC();
	
	for (j = 0; j < src->w; j++)
	{
		for (i = 0; i < src->h; i += 16)
		{
			// Load 16 pixels from each colour plane
			xmm0 = _mm_load_si128((__m128i*)(r + ni));
			xmm1 = _mm_load_si128((__m128i*)(g + ni));
			xmm2 = _mm_load_si128((__m128i*)(b + ni));
			
			// Average by pairs (RG, GB), then average the pairs
			xmm3 = _mm_avg_epu8(xmm0, xmm1);
			xmm4 = _mm_avg_epu8(xmm1, xmm2);
			xmm5 = _mm_avg_epu8(xmm3, xmm4);
			
			// Store the grey value back into all three planes
			_mm_store_si128((__m128i*)(r + ni), xmm5);
			_mm_store_si128((__m128i*)(g + ni), xmm5);
			_mm_store_si128((__m128i*)(b + ni), xmm5);
			
			ni += 16;
		}
	}
	
	if (cycles != NULL) *cycles = cyclesElapsed(readTSC(), t0);
	
	return src;
}
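In scalar terms, each grey value computed above is roughly r/4 + g/2 + b/4, a green-weighted average rather than an exact (r + g + b) / 3. A sketch of one pixel for illustration:

#include <stdint.h>

/* Scalar model of one pixel of img_fast_bw: grey = avg(avg(r, g), avg(g, b)),
 * with pavgb-style round-to-nearest at every step. */
static uint8_t fast_bw_pixel(uint8_t r, uint8_t g, uint8_t b)
{
	uint8_t rg = (uint8_t)((r + g + 1) >> 1);
	uint8_t gb = (uint8_t)((g + b + 1) >> 1);
	return (uint8_t)((rg + gb + 1) >> 1);
}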
void read_luma_inter_pred_avg_16x16_intrinsic( BYTE *address1, BYTE *address2, INT stride_src, BYTE *dst, INT stride_dst )
{
	for(int i = 0; i < 16; i+=8)
	{
		__declspec(align(16)) __m128i r0, r1, r2, r3, r4, r5, r6, r7,
			r0_x, r1_x, r2_x, r3_x, r4_x, r5_x, r6_x, r7_x;
		int stride2 = (stride_src<<1);
		int stride4 = (stride_src<<2);
		int dst_stride2 = (stride_dst<<1);
		int dst_stride4 = (stride_dst<<2);
		r0 = _mm_loadu_si128((__m128i*)(address1));
		r1 = _mm_loadu_si128((__m128i*)(address1+stride_src));
		r2 = _mm_loadu_si128((__m128i*)(address1+stride2));
		r3 = _mm_loadu_si128((__m128i*)(address1+stride2+stride_src));
		r4 = _mm_loadu_si128((__m128i*)(address1+stride4));
		r5 = _mm_loadu_si128((__m128i*)(address1+stride4+stride_src));
		r6 = _mm_loadu_si128((__m128i*)(address1+stride4+stride2));
		r7 = _mm_loadu_si128((__m128i*)(address1+stride4+stride2+stride_src));
		r0_x = _mm_loadu_si128((__m128i*)(address2));
		r1_x = _mm_loadu_si128((__m128i*)(address2+stride_src));
		r2_x = _mm_loadu_si128((__m128i*)(address2+stride2));
		r3_x = _mm_loadu_si128((__m128i*)(address2+stride2+stride_src));
		r4_x = _mm_loadu_si128((__m128i*)(address2+stride4));
		r5_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride_src));
		r6_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride2));
		r7_x = _mm_loadu_si128((__m128i*)(address2+stride4+stride2+stride_src));
		r0 = _mm_avg_epu8(r0, r0_x);
		r1 = _mm_avg_epu8(r1, r1_x);
		r2 = _mm_avg_epu8(r2, r2_x);
		r3 = _mm_avg_epu8(r3, r3_x);
		r4 = _mm_avg_epu8(r4, r4_x);
		r5 = _mm_avg_epu8(r5, r5_x);
		r6 = _mm_avg_epu8(r6, r6_x);
		r7 = _mm_avg_epu8(r7, r7_x);
		_mm_storeu_si128((__m128i*)(dst), r0);
		_mm_storeu_si128((__m128i*)(dst+stride_dst), r1);
		_mm_storeu_si128((__m128i*)(dst+dst_stride2), r2);
		_mm_storeu_si128((__m128i*)(dst+dst_stride2+stride_dst), r3);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4), r4);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4+stride_dst), r5);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4+dst_stride2), r6);
		_mm_storeu_si128((__m128i*)(dst+dst_stride4+dst_stride2+stride_dst), r7);
		address1 += (stride4<<1);
		address2 += (stride4<<1);
		dst += (dst_stride4<<1);
	}
}
static t_INL void doRow16(unsigned char *dst, unsigned char *src, unsigned char *src2, int iLen){
   __m128i m1;//,mRes;
   __m128i m2;//,mRes;
   while(iLen)
   {
      m1=_mm_loadu_si128((__m128i*)src);
      m2=_mm_loadu_si128((__m128i*)src2);
      //mRes=_mm_avg_epu16(*(__m128i*)src1i,m1);_mm_storeu_si128((__m128i*)dsti,mRes);
      *(__m128i*)dst=_mm_avg_epu8(m2,m1);
     dst+=16;
     src+=16;
     src2+=16;
     iLen--;
   }

}
void luma_sample_interp_8x8_3_3_SSE2(unsigned char* sortie, unsigned char* image,const short PicWidthInPix,const short OutStride){


	unsigned char* tmpimg = image + 1;
	unsigned char* tmpimg2 = image + PicWidthInPix;
	__m64 *ptr_out = (__m64 *)sortie;

	int i;
	for(i = 0; i < 8 * PicWidthInPix; i += PicWidthInPix, ptr_out += (OutStride >> 3))
	{
		__m128i verthalf = _mm_movpi64_epi64(interpolvline_2(tmpimg + i,PicWidthInPix));
		__m128i horizhalf = _mm_movpi64_epi64(interpolhline_2(tmpimg2 + i));
		*ptr_out = _mm_movepi64_pi64(_mm_avg_epu8(verthalf,horizhalf));
	}

	_mm_empty(); 
}
Example #19
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    const __m128i k64 = _mm_set1_epi8(0x40);
    const __m128i zero = _mm_setzero_si128();
    __m128i not_hev;
    __m128i t1, t2, t3;

    // compute hev mask
    GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
    t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
    t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
    t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

    t2 = _mm_set1_epi8(3);
    t3 = _mm_set1_epi8(4);
    t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
    t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
    SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
    SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
    *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
    *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
    FLIP_SIGN_BIT2(*p0, *q0);

    // this is equivalent to signed (a + 1) >> 1 calculation
    t2 = _mm_add_epi8(t3, sign_bit);
    t3 = _mm_avg_epu8(t2, zero);
    t3 = _mm_sub_epi8(t3, k64);

    t3 = _mm_and_si128(not_hev, t3);   // if !hev
    *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
    *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
    FLIP_SIGN_BIT2(*p1, *q1);
}
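The three instructions after FLIP_SIGN_BIT2(*p0, *q0) compute a signed (a + 1) >> 1 using only unsigned ops: biasing by 0x80 makes the value unsigned, pavgb against zero halves it with rounding, and subtracting 0x40 removes the halved bias. A scalar sketch of that identity (not WebP code), assuming arithmetic right shift on signed int:

#include <assert.h>
#include <stdint.h>

int main(void) {
  int v;
  for (v = -128; v <= 127; ++v) {
    uint8_t biased = (uint8_t)(v + 0x80);              /* _mm_add_epi8(t3, sign_bit) */
    uint8_t halved = (uint8_t)((biased + 0 + 1) >> 1); /* _mm_avg_epu8(t2, zero)     */
    int8_t result = (int8_t)(halved - 0x40);           /* _mm_sub_epi8(t3, k64)      */
    assert(result == (v + 1) >> 1);                    /* signed (a + 1) >> 1        */
  }
  return 0;
}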
void luma_sample_interp_8x8_3_2_SSE2(unsigned char* sortie, unsigned char* image,const short PicWidthInPix,const short OutStride){
	
	__m64 *ptr_out = (__m64 *)sortie;
	unsigned char* tmpimg = image + 1;
	__m128i temp[13];
	int i;
	for(i = 0; i < 13;i++)	{
		_mm_store_si128(temp + i, interpolhline128(image + PicWidthInPix * (i - 2)));
	}

	for(i = 0; i < 8;i++, ptr_out += (OutStride >> 3))
	{

		__m128i verthalf = _mm_movpi64_epi64(interpolvline_2(tmpimg + i*PicWidthInPix,PicWidthInPix));
		__m128i centerpix = _mm_movpi64_epi64(interpolvline128_2(temp+i));

		*ptr_out = _mm_movepi64_pi64(_mm_avg_epu8(verthalf,centerpix));
	}

	_mm_empty(); 
}
Example #21
/* -----------------------------------
 *            average_plane
 * -----------------------------------
 */
static void average_plane_sse2(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height) {
  int mod16_width = width / 16 * 16;

  for(int y = 0; y < height; y++) {
    for(int x = 0; x < mod16_width; x+=16) {
      __m128i src1  = _mm_load_si128(reinterpret_cast<const __m128i*>(p1+x));
      __m128i src2  = _mm_load_si128(reinterpret_cast<const __m128i*>(p2+x));

      __m128i dst  = _mm_avg_epu8(src1, src2);

      _mm_store_si128(reinterpret_cast<__m128i*>(p1+x), dst);
    }

    if (mod16_width != width) {
      for (int x = mod16_width; x < width; ++x) {
        p1[x] = (int(p1[x]) + p2[x] + 1) >> 1;
      }
    }
    p1 += p1_pitch;
    p2 += p2_pitch;
  }
}
Example #22
		template <> SIMD_INLINE __m128i OperationBinary8u<SimdOperationBinary8uAverage>(const __m128i & a, const __m128i & b)
		{
			return _mm_avg_epu8(a, b);
		}
/* SSE2 version of the rotzoom/affine warp filter */
void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
                          int stride, uint8_t *pred, int p_col, int p_row,
                          int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y, int ref_frm,
                          int16_t alpha, int16_t beta, int16_t gamma,
                          int16_t delta) {
  __m128i tmp[15];
  int i, j, k;

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      // (x, y) coordinates of the center of this block in the destination
      // image
      int32_t dst_x = p_col + j + 4;
      int32_t dst_y = p_row + i + 4;

      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
        x4 = ROUND_POWER_OF_TWO_SIGNED(
            mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
            1);
      else
        x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];

      if (subsampling_y)
        y4 = ROUND_POWER_OF_TWO_SIGNED(
            mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
            1);
      else
        y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];

      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Horizontal filter
      for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
        int iy = iy4 + k;
        if (iy < 0)
          iy = 0;
        else if (iy > height - 1)
          iy = height - 1;

        // If the block is aligned such that, after clamping, every sample
        // would be taken from the leftmost/rightmost column, then we can
        // skip the expensive horizontal filter.
        if (ix4 <= -7) {
          tmp[k + 7] = _mm_set1_epi16(
              ref[iy * stride] *
              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
        } else if (ix4 >= width + 6) {
          tmp[k + 7] = _mm_set1_epi16(
              ref[iy * stride + (width - 1)] *
              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
        } else {
          int sx = sx4 + alpha * (-4) + beta * k +
                   // Include rounding and offset here
                   (1 << (WARPEDDIFF_PREC_BITS - 1)) +
                   (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

          // Load source pixels
          __m128i zero = _mm_setzero_si128();
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

          // Filter even-index pixels
          __m128i tmp_0 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_2 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_4 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_6 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

          // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
          __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
          // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
          __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
          __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
          __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

          // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
          __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
          // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
          __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
          // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
          __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
          // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
          __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

          __m128i round_const =
              _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);

          // Calculate filtered results
          __m128i src_0 = _mm_unpacklo_epi8(src, zero);
          __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
          __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
          __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
          __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
          __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
          __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
          __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                           _mm_add_epi32(res_2, res_6));
          res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                    HORSHEAR_REDUCE_PREC_BITS);

          // Filter odd-index pixels
          __m128i tmp_1 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_3 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_5 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_7 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

          __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
          __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
          __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
          __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

          __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
          __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
          __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
          __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

          __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
          __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
          __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
          __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
          __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
          __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
          __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
          __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                          _mm_add_epi32(res_3, res_7));
          res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                   HORSHEAR_REDUCE_PREC_BITS);

          // Combine results into one register.
          // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
          // as this order helps with the vertical filter.
          tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
        }
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + gamma * (-4) + delta * k +
                 (1 << (WARPEDDIFF_PREC_BITS - 1)) +
                 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        __m128i *src = tmp + (k + 4);
        __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                         _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                        _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        // Round and pack into 8 bits
        __m128i round_const =
            _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);

        __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
        __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);

        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

        // Store, blending with 'pred' if needed
        __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

        // Note: If we're outputting a 4x4 block, we need to be very careful
        // to only output 4 pixels at this point, to avoid encode/decode
        // mismatches when encoding with multiple threads.
        if (p_width == 4) {
          if (ref_frm) {
            const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
            res_8bit = _mm_avg_epu8(res_8bit, orig);
          }
          *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
        } else {
          if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
          _mm_storel_epi64(p, res_8bit);
        }
      }
    }
  }
}
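The subsampling branches above round a signed fixed-point coordinate to half precision. A scalar sketch of that rounding (an illustrative stand-in for the libaom macro, not its verbatim definition):

#include <stdint.h>

/* Round x / 2^n to the nearest integer, with ties rounded away from zero --
 * the behaviour the ROUND_POWER_OF_TWO_SIGNED calls above rely on. */
static int32_t round_power_of_two_signed(int32_t x, int n) {
  const int32_t half = 1 << (n - 1);
  return (x < 0) ? -((-x + half) >> n) : ((x + half) >> n);
}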
 __m64 interpolvline128_3(__m128i* temp){



	__m128i xmm6;

	__m64 ret;

	__m128i xmm7 = _mm_setzero_si128();

	__m128i xmm0 = _mm_load_si128(temp++);
	__m128i xmm1 = _mm_load_si128(temp++);
	__m128i xmm2 = _mm_load_si128(temp++);
	__m128i xmm3 = _mm_load_si128(temp++);
	__m128i xmm4 = _mm_load_si128(temp++);
	__m128i xmm5 = _mm_load_si128(temp);

	xmm1 = _mm_add_epi16(xmm1,xmm4);
	xmm0 = _mm_add_epi16(xmm0,xmm5);

	xmm6 = _mm_set_epi32(0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB);

	xmm4 = _mm_mullo_epi16(xmm1, xmm6);
	xmm5 = _mm_mulhi_epi16(xmm1, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_set_epi32(0x00140014,0x00140014,0x00140014,0x00140014);
	xmm5 = _mm_add_epi16(xmm2,xmm3);

	xmm4 = _mm_mullo_epi16(xmm5, xmm7);
	xmm5 = _mm_mulhi_epi16(xmm5, xmm7);

	xmm7 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm4 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 

	xmm6 = _mm_set_epi32(0x00010001,0x00010001,0x00010001,0x00010001);
	xmm6 = _mm_mulhi_epi16(xmm0, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm0, xmm6);
	xmm6 = _mm_unpackhi_epi16(xmm0, xmm6);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 
	
	xmm1 = _mm_set_epi32(0x00000200,0x00000200,0x00000200,0x00000200);
	
	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm1);
	
	xmm5 = _mm_setzero_si128();

	xmm7 = _mm_srli_epi32(xmm7, 10);
	xmm7 = _mm_max_epi16(xmm7, xmm5); // preventing negative values
	xmm7 = _mm_slli_epi32(xmm7,16);
	xmm7 = _mm_srli_epi32(xmm7,16);

	xmm4 = _mm_srli_epi32(xmm4, 10);

	xmm4 = _mm_max_epi16(xmm4, xmm5); // preventing negative values
	xmm4 = _mm_slli_epi32(xmm4,16);
	xmm4 = _mm_srli_epi32(xmm4,16);

	xmm6 = _mm_packs_epi32(xmm7, xmm4);
	
	xmm1 = _mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010);
	xmm2 = _mm_add_epi16(xmm2,xmm1);
	xmm2 = _mm_max_epi16(xmm2, xmm5); // preventing negative values
	xmm2 = _mm_srli_epi16(xmm2,5);

	
	xmm3 = _mm_add_epi16(xmm3,xmm1);
	xmm3 = _mm_max_epi16(xmm3, xmm5); // preventing negative values
	xmm3 = _mm_srli_epi16(xmm3,5);

	xmm2 = _mm_packus_epi16(xmm2,xmm5);
	xmm3 = _mm_packus_epi16(xmm3,xmm5);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);

	xmm7 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm7);

	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);
	ret = _mm_movepi64_pi64(xmm6);

	_mm_empty(); 

	return(ret);
}
/**
*******************************************************************************
*
* @brief
*  Performs spatial edge adaptive filtering
*
* @par   Description
*  Performs spatial edge adaptive filtering by detecting edge direction
*
* @param[in] pu1_src
*  Source buffer
*
* @param[in] pu1_out
*  Destination buffer
*
* @param[in] src_strd
*  Source stride
*
* @param[in] out_strd
*  Destination stride
*
* @returns
* None
*
* @remarks
*
*******************************************************************************
*/
void ideint_spatial_filter_ssse3(UWORD8 *pu1_src,
                           UWORD8 *pu1_out,
                           WORD32 src_strd,
                           WORD32 out_strd)
{
    WORD32 i;

    WORD32 adiff[6];
    WORD32 *pi4_diff;
    WORD32 shifts[2];
    WORD32 dir_45_le_90, dir_45_le_135, dir_135_le_90;

    __m128i row1_0, row1_m1, row1_p1;
    __m128i row2_0, row2_m1, row2_p1;
    __m128i diff, diffs[3];
    __m128i zero;

    /*****************************************************************/
    /* Direction detection                                           */
    /*****************************************************************/

    zero = _mm_setzero_si128();
    diffs[0] = _mm_setzero_si128();
    diffs[1]  = _mm_setzero_si128();
    diffs[2] = _mm_setzero_si128();

    /* Load source */
    row1_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1));
    row1_0  = _mm_loadl_epi64((__m128i *) (pu1_src));
    row1_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1));
    pu1_src += src_strd;

    /* Unpack to 16 bits */
    row1_m1 = _mm_unpacklo_epi8(row1_m1, zero);
    row1_0  = _mm_unpacklo_epi8(row1_0,  zero);
    row1_p1 = _mm_unpacklo_epi8(row1_p1, zero);

    /*****************************************************************/
    /* Calculating the difference along each of the 3 directions.    */
    /*****************************************************************/
    for(i = 0; i < SUB_BLK_HT; i ++)
    {
        row2_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1));
        row2_0  = _mm_loadl_epi64((__m128i *) (pu1_src));
        row2_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1));
        pu1_src += src_strd;

        /* Unpack to 16 bits */
        row2_m1 = _mm_unpacklo_epi8(row2_m1, zero);
        row2_0  = _mm_unpacklo_epi8(row2_0,  zero);
        row2_p1 = _mm_unpacklo_epi8(row2_p1, zero);

        diff    = _mm_sad_epu8(row1_0, row2_0);
        diffs[0]  = _mm_add_epi64(diffs[0], diff);

        diff    = _mm_sad_epu8(row1_m1, row2_p1);
        diffs[1] = _mm_add_epi64(diffs[1], diff);

        diff    = _mm_sad_epu8(row1_p1, row2_m1);
        diffs[2]  = _mm_add_epi64(diffs[2], diff);

        row1_m1 = row2_m1;
        row1_0 = row2_0;
        row1_p1 = row2_p1;
    }
    /* Revert pu1_src increment */
    pu1_src -= (SUB_BLK_HT + 1) * src_strd;


    adiff[0] = _mm_cvtsi128_si32(diffs[0]);
    adiff[1] = _mm_cvtsi128_si32(diffs[1]);
    adiff[2] = _mm_cvtsi128_si32(diffs[2]);
    adiff[3] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[0], 8));
    adiff[4] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[1], 8));
    adiff[5] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[2], 8));
    pi4_diff = adiff;

    for(i = 0; i < 2; i++)
    {
        /*****************************************************************/
        /* Applying bias, to make the diff comparison more robust.       */
        /*****************************************************************/
        pi4_diff[0] *= EDGE_BIAS_0;
        pi4_diff[1] *= EDGE_BIAS_1;
        pi4_diff[2] *= EDGE_BIAS_1;

        /*****************************************************************/
        /* comparing the diffs */
        /*****************************************************************/
        dir_45_le_90  = (pi4_diff[2] <= pi4_diff[0]);
        dir_45_le_135 = (pi4_diff[2] <= pi4_diff[1]);
        dir_135_le_90 = (pi4_diff[1] <= pi4_diff[0]);

        /*****************************************************************/
        /* Direction selection. */
        /*****************************************************************/
        shifts[i] = 0;
        if(1 == dir_45_le_135)
        {
            if(1 == dir_45_le_90)
                shifts[i] = 1;
        }
        else
        {
            if(1 == dir_135_le_90)
                shifts[i] = -1;
        }
        pi4_diff += 3;
    }
    /*****************************************************************/
    /* Directional interpolation */
    /*****************************************************************/
    for(i = 0; i < SUB_BLK_HT / 2; i++)
    {
        __m128i dst;
        __m128i row1, row2;

        UWORD32 *pu4_row1th, *pu4_row1tl;
        UWORD32 *pu4_row2th, *pu4_row2tl;
        UWORD32 *pu4_row1bh, *pu4_row1bl;
        UWORD32 *pu4_row2bh, *pu4_row2bl;

        pu4_row1th  = (UWORD32 *)(pu1_src + shifts[0]);
        pu4_row1tl  = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]);

        pu1_src += src_strd;
        pu4_row2th  = (UWORD32 *)(pu1_src + shifts[0]);
        pu4_row2tl  = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]);

        pu4_row1bh  = (UWORD32 *)(pu1_src - shifts[0]);
        pu4_row1bl  = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]);

        pu1_src += src_strd;
        pu4_row2bh  = (UWORD32 *)(pu1_src - shifts[0]);
        pu4_row2bl  = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]);

        row1 = _mm_set_epi32(*pu4_row1tl, *pu4_row1th, *pu4_row2tl, *pu4_row2th);
        row2 = _mm_set_epi32(*pu4_row1bl, *pu4_row1bh, *pu4_row2bl, *pu4_row2bh);

        dst = _mm_avg_epu8(row1, row2);

        _mm_storel_epi64((__m128i *)pu1_out, _mm_srli_si128(dst, 8));
        pu1_out += out_strd;

        _mm_storel_epi64((__m128i *)pu1_out, dst);
        pu1_out += out_strd;
    }
}
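A scalar model of the directional interpolation the block above performs for each output pixel (illustrative only; the SSE path handles four pixels per half sub-block at a time):

#include <stdint.h>

/* 'shift' is +1, 0 or -1 for a 45, 90 or 135 degree edge, chosen from the
 * biased directional SADs; the output is the rounded average of the pixel
 * above (shifted by +shift) and the pixel below (shifted by -shift). */
static uint8_t ideint_directional_avg(const uint8_t *top, const uint8_t *bot,
                                      int x, int shift)
{
    return (uint8_t)((top[x + shift] + bot[x - shift] + 1) >> 1);
}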
Example #26
void Sobel::sobelSSE(const Image1D& srcImage, SobelImage& destImage)
{
  ASSERT(srcImage.width % 16 == 0);
  ASSERT(srcImage.height >= 3);
  ASSERT(srcImage.yStart >= 0);
  ASSERT(srcImage.yStart <= srcImage.height);

  destImage.setResolution(srcImage.width, srcImage.height);
  destImage.yStart = srcImage.yStart;

  if(srcImage.yStart >= srcImage.height)
    return;

  // a b c    0 1 2
  // d e f    3 4 5
  // g h i    6 7 8
  __m128i valA, valB, valC, valD, valF, valG, valH, valI;
  __m128i sumX;
  __m128i sumY;
  __m128i tmp;

  __m128i zeros = _mm_setzero_si128();

  __m128i* pDestImg;
  __m128i* pDestImgLineEnd;

  // Fill top line
  for(pDestImg = reinterpret_cast<__m128i*>(destImage[destImage.yStart]), pDestImgLineEnd = reinterpret_cast<__m128i*>(destImage[destImage.yStart + 1]);
      pDestImg < pDestImgLineEnd; ++pDestImg)
  {
    *pDestImg = zeros;
  }


  int lastRow = destImage.height - 1;
  const Image1D::Pixel* p0 = srcImage[destImage.yStart];
  const Image1D::Pixel* p1 = srcImage[destImage.yStart + 1];
  const Image1D::Pixel* p2 = srcImage[destImage.yStart + 2];
  const Image1D::Pixel* p0LineEnd;

  for(int y = destImage.yStart + 1; y < lastRow; ++y)
  {
    for(p0LineEnd = srcImage[y], pDestImg = reinterpret_cast<__m128i*>(destImage[y]); p0 < p0LineEnd;
        p0 += 16, p1 += 16, p2 += 16, pDestImg += 2)
    {
      // load values
      valA = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p0 - 1));
      valB = _mm_load_si128(reinterpret_cast<const __m128i*>(p0));
      valC = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p0 + 1));

      valD = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 - 1));
      valF = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p1 + 1));

      valG = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 - 1));
      valH = _mm_load_si128(reinterpret_cast<const __m128i*>(p2));
      valI = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p2 + 1));

      sumX = _mm_avg_epu8(valA, valG); // sumX = (a + g) / 2
      sumX = _mm_avg_epu8(sumX, valD); // sumX = (sumX + d) / 2
      sumX = _mm_avg_epu8(sumX, zeros); // sumX = sumX / 2 with average, because there is no 8 bit shift
      tmp = _mm_avg_epu8(valC, valI); // tmp = (c + i) / 2
      tmp = _mm_avg_epu8(tmp, valF); // tmp = (tmp + f) / 2
      tmp = _mm_avg_epu8(tmp, zeros); // tmp = tmp / 2 with average, because there is no 8 bit shift
      sumX = _mm_sub_epi8(sumX, tmp);

      sumY = _mm_avg_epu8(valA, valC); // sumY = (a + c) / 2
      sumY = _mm_avg_epu8(sumY, valB); // sumY = (sumY + b) / 2
      sumY = _mm_avg_epu8(sumY, zeros); // sumY = sumY / 2 with average, because there is no 8 bit shift
      tmp = _mm_avg_epu8(valG, valI); // tmp = (g + i) / 2
      tmp = _mm_avg_epu8(tmp, valH); // tmp = (tmp + h) / 2
      tmp = _mm_avg_epu8(tmp, zeros); // tmp = tmp / 2 with average, because there is no 8 bit shift
      sumY = _mm_sub_epi8(sumY, tmp);

      *pDestImg = _mm_unpacklo_epi8(sumX, sumY);
      *(pDestImg + 1) = _mm_unpackhi_epi8(sumX, sumY);
    }
  }

  // Fill bottom line
  for(pDestImg = reinterpret_cast<__m128i*>(destImage[destImage.height - 1]), pDestImgLineEnd = reinterpret_cast<__m128i*>(destImage[destImage.height]);
      pDestImg < pDestImgLineEnd; ++pDestImg)
  {
    *pDestImg = zeros;
  }

  // Fill right and left border
  for(int y = destImage.yStart; y < destImage.height - 1; y++)
  {
    destImage[y]->index = 0;
    (destImage[y + 1] - 1)->index = 0;
  }
}
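The cascaded _mm_avg_epu8 calls above compute roughly one eighth of the standard Sobel responses, with rounding at every halving step. A scalar sketch of what one (gx, gy) pair approximates, using the a b c / d e f / g h i layout from the comment (illustrative only):

#include <stdint.h>

/* gx ~= ((a + 2*d + g) - (c + 2*f + i)) / 8
 * gy ~= ((a + 2*b + c) - (g + 2*h + i)) / 8
 * where n[row][col] holds the 3x3 neighbourhood. */
static void sobel_pixel(const uint8_t n[3][3], int8_t* gx, int8_t* gy)
{
  *gx = (int8_t)(((n[0][0] + 2 * n[1][0] + n[2][0]) -
                  (n[0][2] + 2 * n[1][2] + n[2][2])) / 8);
  *gy = (int8_t)(((n[0][0] + 2 * n[0][1] + n[0][2]) -
                  (n[2][0] + 2 * n[2][1] + n[2][2])) / 8);
}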
Example #27
void spu_interpreter::AVGB(SPUThread& CPU, spu_opcode_t op)
{
	CPU.GPR[op.rt].vi = _mm_avg_epu8(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi);
}
mlib_status
__mlib_VideoP64Decimate_U8_U8(
	mlib_u8 *dst,
	const mlib_u8 *src,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 dst_stride,
	mlib_s32 src_stride)
{
	mlib_s32 x, y;
	const mlib_u8 *sd1, *sd2;
	mlib_u8 *dd;
	mlib_u32 src_stride2;

	sd1 = src;
	sd2 = src + src_stride;
	src_stride2 = 2 * src_stride;
	dd = dst;

	mlib_s32 dw = width & 0xF;
	__m128i txmm0, txmm1, txmm2, txmm3, txmm4, txmm5, txmm6, txmm7;

	txmm7 = _mm_set1_epi16(0xff);
	for (y = 0; y < height; y++) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (x = 0; x < width - dw; x += 16) {
			txmm0 = _mm_loadu_si128((__m128i *)&sd1[2*x]);
			txmm1 = _mm_loadu_si128((__m128i *)&sd2[2*x]);
			txmm2 =  _mm_srli_si128(txmm0, 1);
			txmm3 =  _mm_srli_si128(txmm1, 1);
			txmm4 = _mm_avg_epu8(txmm0, txmm2);
			txmm5 = _mm_avg_epu8(txmm1, txmm3);
			txmm6 = _mm_avg_epu8(txmm5, txmm4);
			txmm6 = _mm_and_si128(txmm6, txmm7);
			txmm0 = _mm_loadu_si128((__m128i *)&sd1[2 * x + 16]);
			txmm1 = _mm_loadu_si128((__m128i *)&sd2[2 * x + 16]);
			txmm2 =  _mm_srli_si128(txmm0, 1);
			txmm3 =  _mm_srli_si128(txmm1, 1);
			txmm4 = _mm_avg_epu8(txmm0, txmm2);
			txmm5 = _mm_avg_epu8(txmm1, txmm3);
			txmm5 = _mm_avg_epu8(txmm5, txmm4);
			txmm5 = _mm_and_si128(txmm5, txmm7);
			txmm1 = _mm_packus_epi16(txmm6, txmm5);
			_mm_storeu_si128((__m128i *)&dd[x], txmm1);

		}
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (; x < width; x++) {
			dd[x] = (sd1[x * 2] + sd1[x * 2 + 1] + sd2[x * 2] +
				sd2[x * 2 + 1] + 2) >> 2;
		}

		sd1 += src_stride2;
		sd2 += src_stride2;
		dd += dst_stride;
	}

	return (MLIB_SUCCESS);
}
template <typename PixelType>
static FORCE_INLINE __m128i mm_avg_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_avg_epu8(a, b);
    else
        return _mm_avg_epu16(a, b);
}
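In the original plugin, PixelType comes from the surrounding templated filter code; a minimal standalone use (an assumption for illustration, with the template above in scope) could look like:

#include <cstdint>
#include <emmintrin.h>

// Hypothetical caller: rounded average of two 16-byte rows of 8-bit pixels.
// The template argument selects pavgb (8-bit) or pavgw (16-bit) at compile time.
static void average_row_u8(const uint8_t *a, const uint8_t *b, uint8_t *dst) {
    const __m128i va = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a));
    const __m128i vb = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), mm_avg_epu<uint8_t>(va, vb));
}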