Example #1
0
// Copies the low byte of every 16-bit word (the luma sample of a YUY2 pixel)
// from `luma` into `src`, row by row, while keeping the high byte (chroma)
// already present in `src`.
//   src / luma        : destination and source rows, addressed as bytes
//   pitch / luma_pitch: byte distance between consecutive rows of each buffer
//   width             : row length in bytes; height: number of rows
static void replace_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height)
{
  // Per-word selectors: 0x00FF keeps the low byte, 0xFF00 keeps the high byte.
  const __m64 keep_low_bytes = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  const __m64 keep_high_bytes = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  // Portion of each row that can be covered by whole 8-byte MMX steps.
  const int simd_width = (width / 8) * 8;

  for (int row = 0; row < height; ++row, src += pitch, luma += luma_pitch) {
    for (int col = 0; col < simd_width; col += 8) {
      const __m64 dst_px = *reinterpret_cast<const __m64*>(src+col);
      const __m64 new_px = *reinterpret_cast<const __m64*>(luma+col);

      // High bytes from the destination, low bytes from the luma source.
      const __m64 merged = _mm_or_si64(_mm_and_si64(dst_px, keep_high_bytes),
                                       _mm_and_si64(new_px, keep_low_bytes));

      *reinterpret_cast<__m64*>(src+col) = merged;
    }

    // Scalar tail: every second byte is a luma sample.
    for (int col = simd_width; col < width; col += 2) {
      src[col] = luma[col];
    }
  }
  _mm_empty();
}
Example #2
0
// Blends the low byte of every 16-bit word (the luma sample of a YUY2 pixel)
// of `src` with the matching byte of `luma`:
//   new = (luma * weight + src * invweight + 16384) >> 15
// The high byte of every word (chroma) in `src` is left untouched.
//   pitch / luma_pitch: byte distance between consecutive rows of each buffer
//   width             : row length in bytes; height: number of rows
static void weighted_merge_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight)
{
  const __m64 rounding = _mm_set1_pi32(0x4000);
  // Word lanes, low to high: invweight, weight, invweight, weight. This lines
  // up with the (src, luma, src, luma) interleave produced below.
  const __m64 weights = _mm_set_pi16(weight, invweight, weight, invweight);
  const __m64 keep_low_bytes = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  const __m64 keep_high_bytes = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  // Portion of each row that can be covered by whole 8-byte MMX steps.
  const int simd_width = (width/8) * 8;

  for (int row = 0; row < height; ++row) {
    for (int col = 0; col < simd_width; col += 8) {
      const __m64 dst_px = *reinterpret_cast<const __m64*>(src+col);
      const __m64 new_px = *reinterpret_cast<const __m64*>(luma+col);

      // Interleave 16-bit words of both inputs, then drop the high (chroma)
      // byte of each word so every lane holds a zero-extended luma sample.
      __m64 pair_lo = _mm_and_si64(_mm_unpacklo_pi16(dst_px, new_px), keep_low_bytes);
      __m64 pair_hi = _mm_and_si64(_mm_unpackhi_pi16(dst_px, new_px), keep_low_bytes);

      // src*invweight + luma*weight per 32-bit lane, rounded, scaled by 2^-15.
      pair_lo = _mm_srli_pi32(_mm_add_pi32(_mm_madd_pi16(pair_lo, weights), rounding), 15);
      pair_hi = _mm_srli_pi32(_mm_add_pi32(_mm_madd_pi16(pair_hi, weights), rounding), 15);

      const __m64 blended_luma = _mm_packs_pi32(pair_lo, pair_hi);
      const __m64 kept_chroma = _mm_and_si64(dst_px, keep_high_bytes);

      *reinterpret_cast<__m64*>(src+col) = _mm_or_si64(kept_chroma, blended_luma);
    }

    // Scalar tail: every second byte is a luma sample.
    for (int col = simd_width; col < width; col += 2) {
      src[col] = (luma[col] * weight + src[col] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
Example #3
0
/*Computes a SATD-style cost for a 4x4 block using 64-bit (__m64) vectors.
  src/systride: first block and the byte distance between its rows.
  ref/rystride: second block and the byte distance between its rows.
  Four rows are fetched through od_load_convert_subtract_x4() (presumably the
   per-pixel difference src-ref widened to 16 bits; confirm against the
   helper), transformed with butterflies vertically and horizontally, and the
   absolute values of the result are summed, halved and rounded.
  When OD_CHECKASM is defined the result is compared against
   od_mc_compute_satd8_4x4_c() and a mismatch is reported on stderr.*/
int32_t od_mc_compute_satd8_4x4_sse2(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  int32_t satd;
  __m64 sums;
  __m64 a;
  __m64 b;
  __m64 c;
  __m64 d;
  /*One vector per row of the 4x4 block.*/
  a = od_load_convert_subtract_x4(src + 0*systride, ref + 0*rystride);
  b = od_load_convert_subtract_x4(src + 1*systride, ref + 1*rystride);
  c = od_load_convert_subtract_x4(src + 2*systride, ref + 2*rystride);
  d = od_load_convert_subtract_x4(src + 3*systride, ref + 3*rystride);
  /*Vertical 1D transform.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  od_transpose16x4(&a, &b, &c, &d);
  /*Horizontal 1D transform.
    Only one butterfly stage is applied here; the second one is folded into
     the max/abs step below.*/
  od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
  /*Use the fact that (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) to merge the
     final butterfly stage with the calculating the absolute values and the
     first stage of accumulation.
    Calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
    An offset must be added to the final sum before rounding to account for
     subtracting 0x7FFF.*/
  a = _mm_sub_pi16(_mm_max_pi16(a, b), _mm_adds_pi16(_mm_add_pi16(a, b),
   _mm_set1_pi16(0x7FFF)));
  c = _mm_sub_pi16(_mm_max_pi16(c, d), _mm_adds_pi16(_mm_add_pi16(c, d),
   _mm_set1_pi16(0x7FFF)));
  /*Take the sum of all the absolute values.*/
  sums = _mm_add_pi16(a, c);
  /*Sum the elements of the vector.
    The first shuffle reverses the four lanes and the second swaps adjacent
     lanes, so after both additions every lane holds the total.*/
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(0, 1, 2, 3)));
  sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(2, 3, 0, 1)));
  /*Zero-extend the low 16-bit lane to 32 bits before extracting it.*/
  sums = _mm_unpacklo_pi16(sums, _mm_setzero_si64());
  satd = _mm_cvtsi64_si32(sums);
  /*Subtract the offset (8) and round.*/
  satd = (satd + 1 - 8) >> 1;
#if defined(OD_CHECKASM)
  {
    /*Self-check against the C implementation; only reports, never aborts.*/
    int32_t c_satd;
    c_satd = od_mc_compute_satd8_4x4_c(src, systride, ref, rystride);
    if (satd != c_satd) {
      fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
       4, 4, satd, c_satd);
    }
  }
#endif
  return satd;
}
Example #4
0
// Thin wrapper that broadcasts `a` through the _mm_set1_pi16 intrinsic.
// The comment lines inside the body look like pattern directives for an
// output-matching test tool (presumably LLVM FileCheck), expecting four
// <4 x i16> insertelement operations; treat them as part of the test and
// do not reword them.
__m64 test_mm_set1_pi16(short a) {
  // CHECK-LABEL: test_mm_set1_pi16
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  return _mm_set1_pi16(a);
}
Example #5
0
/* MMX path used for all colourspaces: mixes the incoming frame with the
 * stored previous frame, byte by byte, as
 *   out = saturate16(new * m_blur0 + old * m_blur1) >> 8
 * and writes the result both back into `image` and into the stored frame. */
void pix_motionblur :: processMMX(imageStruct &image)
{
  // Give the history buffer the same geometry and format as the input.
  m_savedImage.xsize=image.xsize;
  m_savedImage.ysize=image.ysize;
  m_savedImage.setCsizeByFormat(image.format);
  m_savedImage.reallocate();

  const int byteCount=image.ysize*image.xsize*image.csize;
  // Number of 8-byte words, rounded up.
  // NOTE(review): when byteCount is not a multiple of 8 the last word reaches
  // up to 7 bytes past byteCount in both buffers - presumably the buffers are
  // padded; confirm against the allocator.
  const int wordCount=byteCount/sizeof(__m64)+(byteCount%sizeof(__m64)!=0);

  __m64*const current=(__m64*)image.data;
  __m64*const history=(__m64*)m_savedImage.data;

  const __m64 gainNew = _mm_set1_pi16(static_cast<short>(m_blur0));
  const __m64 gainOld = _mm_set1_pi16(static_cast<short>(m_blur1));
  const __m64 zero = _mm_setzero_si64();

  // Walk the buffers from the last word down to the first.
  for(int idx=wordCount; idx-- != 0; ) {
    const __m64 cur =current[idx];
    const __m64 prev=history[idx];

    // Widen each byte to 16 bits and scale it by its gain.
    const __m64 curLo  = _mm_mullo_pi16(_mm_unpacklo_pi8(cur,  zero), gainNew);
    const __m64 curHi  = _mm_mullo_pi16(_mm_unpackhi_pi8(cur,  zero), gainNew);
    const __m64 prevLo = _mm_mullo_pi16(_mm_unpacklo_pi8(prev, zero), gainOld);
    const __m64 prevHi = _mm_mullo_pi16(_mm_unpackhi_pi8(prev, zero), gainOld);

    // Saturating unsigned add, then drop the 8 fractional bits.
    const __m64 mixLo = _mm_srli_pi16(_mm_adds_pu16(curLo, prevLo), 8);
    const __m64 mixHi = _mm_srli_pi16(_mm_adds_pu16(curHi, prevHi), 8);

    // Narrow back to bytes and store to both the frame and the history.
    const __m64 blended = _mm_packs_pu16(mixLo, mixHi);
    current[idx]=blended;
    history[idx]=blended;
  }
  _mm_empty();
}
Example #6
0
// Walks the byte stream `aa_packed` and applies the AAPMMX_ macro to runs of
// destination pixels in `w`, starting at (x, y), using colour `c`.
// Does nothing unless Rect(x, y, x + 6, y + 11) lies entirely inside `w`.
//
// Stream format as decoded by the loop below (one control byte per run):
//   0         - end of stream
//   bits 3..6 - number of destination dwords to skip before the run
//   bits 0..2 - run length n (0..7)
//   bit 7     - after the run, move to the start of the next destination line
// After each control byte the stream pointer is advanced by n, so n data
// bytes follow it (presumably per-pixel coverage read by AAPMMX_ - the macro
// is defined elsewhere; confirm there).
void DrawAAPMMX(PixelBlock& w, int x, int y, Color c)
{
	if(!Rect(w.GetSize()).Contains(Rect(x, y, x + 6, y + 11)))
		return;
	dword *a = w.PointAdr(x, y); // start of the current destination line
	int d = w.LineDelta();       // line step, in dwords (added to a dword pointer below)
	// NOTE(review): zero, mc, mask, alpha, h and m are not referenced directly
	// in this function; presumably AAPMMX_ uses them by name - do not rename.
	__m64 zero = _mm_setzero_si64();
	// Colour with each byte widened to a 16-bit lane.
	__m64 mc = _mm_unpacklo_pi8(_mm_cvtsi32_si64(c.GetRaw()), zero);
	__m64 mask = _mm_set1_pi16(0xff);
	const byte *s = aa_packed;
	dword *t = a; // write cursor within the current line
	__m64 alpha;
	__m64 h;
	__m64 m;
	for(;;) {
		// Control byte; note this local shadows the Color parameter `c`.
		dword c = *s++;
		if(c == 0)
			break;
		t += (c >> 3) & 15; // skip pixels before the run
		// Intentional fall-through: a run of n executes AAPMMX_(n-1) .. AAPMMX_(0).
		switch(c & 7) {
		case 7:
			AAPMMX_(6);
		case 6:
			AAPMMX_(5);
		case 5:
			AAPMMX_(4);
		case 4:
			AAPMMX_(3);
		case 3:
			AAPMMX_(2);
		case 2:
			AAPMMX_(1);
		case 1:
			AAPMMX_(0);
		}
		// Step past the run in both the destination and the stream.
		t += c & 7;
		s += c & 7;
		if(c & 0x80) {
			// End-of-line flag: continue at the start of the next line.
			a += d;
			t = a;
		}
	}
	_mm_empty();
}
Example #7
0
#include <assert.h>

// unreachable(): marks a code path that must never execute.
// With NDEBUG defined it expands to the compiler builtin
// __builtin_unreachable; otherwise it expands to assert(0).
#ifdef NDEBUG
#define unreachable __builtin_unreachable
#else
#define unreachable() assert(0)
#endif

// requires w%16 == 0
__attribute__((hot))
void maxblend_sse(void *restrict dest, const void *restrict src, int w, int h)
{
	//FIXME: use src_stride
	//FIXME: deal with w%16 != 0
	__m64 *mbdst = dest; const __m64 *mbsrc = src;
	const __m64 off = _mm_set1_pi16(0x8000);
	_mm_prefetch(mbdst, _MM_HINT_NTA);
	_mm_prefetch(mbsrc, _MM_HINT_NTA);
	for(unsigned int i=0; i < 2*w*h/sizeof(__m64); i+=4, mbdst+=4, mbsrc+=4) {
		_mm_prefetch(mbdst + 4, _MM_HINT_NTA);
		_mm_prefetch(mbsrc + 4, _MM_HINT_NTA);

		__m64 v1, v2, v3, v4, t1, t2, t3, t4;

		v1 = mbdst[0], t1 = mbsrc[0];
		v1 = _mm_add_pi16(v1, off); t1 = _mm_add_pi16(t1, off);
		v1 = _mm_max_pi16(v1, t1);
		v1 = _mm_sub_pi16(v1, off);
		mbdst[0]=v1;

		v2 = mbdst[1], t2 = mbsrc[1];
Example #8
0
// Runs the reverb over a buffer of interleaved 16-bit samples: for each of
// the two channels, four comb filters are summed, optionally followed by one
// allpass stage, scaled by rvol[channel], and the result is added (with
// saturation) to dp together with sp.
//   sp, dp        : input and output/accumulator sample buffers
//   comb_delay    : per-channel, per-filter comb delays (indexed [c][f])
//   comb_gain     : feedback gain of the comb filters
//   allpass_delay : allpass delay; 0 disables the allpass stage
//   allpass_gain  : allpass gain
//   rvol          : two output volumes, one per channel
//   sz            : buffer size; sz>>4 blocks of 8 shorts are processed
// The body is compiled only when use_intrinsics is defined; otherwise the
// function does nothing.
void reverb::comb_allpass4(signed short *sp,
														signed short *dp,
														const comb_param &comb_delay,
														const int comb_gain,
														const int allpass_delay,
														const int allpass_gain,
														const int *rvol,
														const unsigned int sz)
{
#ifdef use_intrinsics
	const __m64 comb_g=_mm_set1_pi16(comb_gain);
	const __m64 allpass_g=_mm_set1_pi16(allpass_gain);
	__m64 out_vol[2];
	out_vol[0]=_mm_set1_pi16(rvol[0]);
	out_vol[1]=_mm_set1_pi16(rvol[1]);

	const unsigned int blocks=sz>>4;

	for (unsigned int blk=0; blk<blocks; blk++, sp+=8, dp+=8)
	{
		__m64 wet[2];

		for (int ch=0; ch<2; ch++)
		{
			// Comb: four feedback filters, each contributing a quarter.

			__m64 acc=_mm_setzero_si64();

			for (int tap=0; tap<4; tap++)
			{
				int delayed_pos=(yp-comb_delay[ch][tap])&(max_delay-1);
				__m64 in_delayed=*(__m64 *)(&x[ch][delayed_pos]);
				__m64 fb=*(__m64 *)(&y[ch][tap][delayed_pos]);

				// mulhi keeps the top 16 bits; doubling restores the scale.
				fb=_mm_mulhi_pi16(fb,comb_g);
				fb=_mm_adds_pi16(fb,fb);
				fb=_mm_adds_pi16(in_delayed,fb);
				*((__m64 *)&y[ch][tap][yp])=fb;

				acc=_mm_adds_pi16(acc,_mm_srai_pi16(fb,2));
			}

			// Allpass (skipped when allpass_delay is 0).

			if (allpass_delay)
			{
				*((__m64 *)&ax[ch][yp])=acc;

				int ap_pos=(yp-allpass_delay)&(max_delay-1);
				__m64 ap_fb=*(__m64 *)&ay[ch][ap_pos];
				// NOTE(review): this reads x[ch][yp], not the comb output just
				// stored in ax[ch][yp] - confirm that is intended.
				__m64 in_now=*(__m64 *)&x[ch][yp];
				__m64 ap_in_delayed=*(__m64 *)&ax[ch][ap_pos];

				ap_fb=_mm_subs_pi16(ap_fb,in_now);
				ap_fb=_mm_mulhi_pi16(ap_fb,allpass_g);
				ap_fb=_mm_adds_pi16(ap_fb,ap_fb);
				acc=_mm_adds_pi16(ap_fb,ap_in_delayed);
				*((__m64 *)&ay[ch][yp])=acc;
			}

			// Output: apply the channel volume (mulhi, then doubled).

			wet[ch]=_mm_mulhi_pi16(acc,out_vol[ch]);
			wet[ch]=_mm_adds_pi16(wet[ch],wet[ch]);
		}

		// Interleave the two channels and mix into dp along with sp.
		const __m64 wet_lo=_mm_unpacklo_pi16(wet[0],wet[1]);
		const __m64 wet_hi=_mm_unpackhi_pi16(wet[0],wet[1]);
		__m64 out_lo=*(__m64 *)&dp[0];
		__m64 out_hi=*(__m64 *)&dp[4];
		const __m64 dry_lo=*(__m64 *)&sp[0];
		const __m64 dry_hi=*(__m64 *)&sp[4];

		out_lo=_mm_adds_pi16(_mm_adds_pi16(out_lo,dry_lo),wet_lo);
		out_hi=_mm_adds_pi16(_mm_adds_pi16(out_hi,dry_hi),wet_hi);

		*(__m64 *)&dp[0]=out_lo;
		*(__m64 *)&dp[4]=out_hi;

		// Advance the circular delay-line position by the 4 samples just done.
		yp=(yp+4)&(max_delay-1);
	}

	_mm_empty();
#endif
}
Example #9
0
/*
 * Finds the maximum sample of a 3-channel image of 16-bit samples and
 * writes three results to res32[0..2].
 *
 * res32  receives the three maxima
 * img    source image
 *
 * Each row is consumed in groups of 12 samples (three 64-bit words), each
 * word feeding its own running maximum (max1/max2/max3). A partial group at
 * the end of a row is folded in through an edge mask. After all rows the
 * lanes of the three accumulators are cross-combined into three scalars
 * (the exact lane pairing depends on the MLIB_M_* macros defined elsewhere).
 */
void
mlib_m_ImageMaximum_U16_3(
    mlib_s32 *res32,
    const mlib_image *img)
{
/* src address */
	__m64 *sp, *sl;

/* src data */
	__m64 sd;

/* max values */
	__m64 max1, max2, max3;

	__m64 _2s32_1, _2s32_2, _2s32_3, _2s32_4, _2s32_5, _2s32_6;
	mlib_s32 s1, s2, s3, s4, s5, s6;

/* edge mask */
	mlib_s32 emask;

/* loop variables */
	mlib_s32 n1;

/* height of image */
	mlib_s32 height = mlib_ImageGetHeight(img);

/* bytes to next row (used below as a byte offset) */
	mlib_s32 slb = mlib_ImageGetStride(img);

/* 16-bit samples per row */
	mlib_s32 width = mlib_ImageGetWidth(img) * 3;

	mlib_u16 *dend;

/*
 * Rows are contiguous when the byte stride equals the row length in bytes
 * (2 bytes per sample); the image can then be scanned as one long row.
 * Comparing slb with the sample count, as was done before, can never match
 * for a valid image, so this fast path was never taken.
 */
	if (slb == 2 * width) {
		width *= height;
		height = 1;
	}

	sp = sl = (__m64 *) mlib_ImageGetData(img);

	max1 = _mm_set1_pi16(MLIB_U16_MIN);
	max2 = _mm_set1_pi16(MLIB_U16_MIN);
	max3 = _mm_set1_pi16(MLIB_U16_MIN);

	for (; height > 0; height--) {

		n1 = width;
/* one past the last sample of this row */
		dend = (mlib_u16 *)sp + width;

/* full groups of 12 samples */
		for (; n1 > 11; n1 -= 12) {
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U16(max1, max1, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U16(max2, max2, sd);
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U16(max3, max3, sd);
		}

/* 1..11 leftover samples: up to three words, the last one partial */
		if (n1 > 0) {
			emask = (n1 > 3) ? 0xF : (0xF << (4 - n1));
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U16_M32(max1, max1, sd, emask);

			n1 = ((mlib_s16 *)dend - (mlib_s16 *)sp);
			if (n1 > 0) {
				emask = (n1 > 3) ? 0xF : (0xF << (4 - n1));
				sd = (*sp++);
				MLIB_M_IMAGE_MAXIMUM_U16_M32(max2, max2, sd,
				    emask);

				n1 = ((mlib_s16 *)dend - (mlib_s16 *)sp);
				if (n1 > 0) {
					emask = (0xF << (4 - n1));
					sd = *sp;
					MLIB_M_IMAGE_MAXIMUM_U16_M32(max3, max3,
					    sd, emask);
				}
			}
		}

/* advance to the next row by the byte stride */
		sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
	}

/* widen each accumulator into two vectors of 32-bit values */
	MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max1);
	MLIB_M_CONVERT_4U16_2S32(_2s32_3, _2s32_4, max2);
	MLIB_M_CONVERT_4U16_2S32(_2s32_5, _2s32_6, max3);

/* combine across the three accumulators */
	MLIB_M_IMAGE_MAXIMUM_S32(max1, _2s32_1, _2s32_6);
	MLIB_M_IMAGE_MAXIMUM_S32(max2, _2s32_2, _2s32_3);
	MLIB_M_IMAGE_MAXIMUM_S32(max3, _2s32_4, _2s32_5);

	MLIB_M_CONVERT_2S32_S32(s1, s2, max1);
	MLIB_M_CONVERT_2S32_S32(s3, s4, max2);
	MLIB_M_CONVERT_2S32_S32(s5, s6, max3);

	MLIB_M_IMAGE_MAXIMUM(res32[0], s2, s3);
	MLIB_M_IMAGE_MAXIMUM(res32[1], s4, s5);
	MLIB_M_IMAGE_MAXIMUM(res32[2], s1, s6);

	_mm_empty();
}
Example #10
0
/*
 * Finds the maximum sample of a 1-, 2- or 4-channel image of 16-bit samples
 * and writes the result(s) to res32.
 *
 * res32  receives the maxima: one value for 1 channel, one 64-bit store
 *        (two values) for 2 channels, two 64-bit stores (four values) for
 *        4 channels
 * img    source image
 *
 * Each row is consumed four samples (one 64-bit word) at a time into a
 * single running maximum; a partial word at the end of a row is folded in
 * through an edge mask. The final switch reduces the four lanes according
 * to the channel count. Other channel counts leave res32 untouched.
 */
void
mlib_m_ImageMaximum_U16_124(
    mlib_s32 *res32,
    const mlib_image *img)
{
/* src address */
	__m64 *sp, *sl;

/* src data */
	__m64 sd;

/* max values */
	__m64 max;

	__m64 _2s32_1, _2s32_2;

/* edge mask */
	mlib_s32 emask;

/* loop variables */
	mlib_s32 n1;

/* height of image */
	mlib_s32 height = mlib_ImageGetHeight(img);

/* bytes to next row (used below as a byte offset) */
	mlib_s32 slb = mlib_ImageGetStride(img);

/* number of image channels */
	mlib_s32 channels = mlib_ImageGetChannels(img);

/* 16-bit samples per row */
	mlib_s32 width = mlib_ImageGetWidth(img) * channels;

	mlib_s32 s1, s2;

/*
 * Rows are contiguous when the byte stride equals the row length in bytes
 * (2 bytes per sample); the image can then be scanned as one long row.
 * Comparing slb with the sample count, as was done before, can never match
 * for a valid image, so this fast path was never taken.
 */
	if (slb == 2 * width) {
		width *= height;
		height = 1;
	}

	sp = sl = (__m64 *) mlib_ImageGetData(img);

/* max values start at the smallest representable sample */
	max = _mm_set1_pi16(MLIB_U16_MIN);

	for (; height > 0; height--) {

		n1 = width;

/* full words of four samples */
		for (; n1 > 3; n1 -= 4) {
			sd = (*sp++);
			MLIB_M_IMAGE_MAXIMUM_U16(max, max, sd);
		}

/* 1..3 leftover samples in a partial word */
		if (n1 > 0) {
			emask = (0xF << (4 - n1));
			sd = *sp;
			MLIB_M_IMAGE_MAXIMUM_U16_M32(max, max, sd, emask);
		}

/* advance to the next row by the byte stride */
		sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
	}

	switch (channels) {
	case 1:
	    {
/* all four lanes belong to the same channel: reduce to one scalar */
		    MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max);
		    MLIB_M_IMAGE_MAXIMUM_S32(_2s32_1, _2s32_1, _2s32_2);
		    MLIB_M_CONVERT_2S32_S32(s1, s2, _2s32_1);
		    MLIB_M_IMAGE_MAXIMUM(res32[0], s1, s2);
		    break;
	    }

	case 2:
	    {
/* lanes alternate between two channels: reduce to one pair */
		    MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max);
		    MLIB_M_IMAGE_MAXIMUM_S32(_2s32_1, _2s32_1, _2s32_2);
		    ((__m64 *) res32)[0] = _2s32_1;
		    break;
	    }

	case 4:
	    {
/* one lane per channel: store both widened halves */
		    MLIB_M_CONVERT_4U16_2S32(_2s32_1, _2s32_2, max);
		    ((__m64 *) res32)[0] = _2s32_2;
		    ((__m64 *) res32)[1] = _2s32_1;
		    break;
	    }

	default:
/* unsupported channel count: res32 is left unchanged */
		break;
	}

	_mm_empty();
}