/**
 * SIMD version of the "averaged noise" line kernel.
 *
 * The first mmx_len bytes (len rounded down to a multiple of _mm::size) are
 * processed _mm::size bytes at a time; any remaining bytes are handed to
 * lineNoiseAvg_C.
 *
 * @param dst     destination line, len bytes
 * @param src     source line, len bytes
 * @param len     number of bytes in the line
 * @param shift_  three noise rows; element i of each row belongs to pixel i
 *
 * If _mm::align is set and src or dst is not 16-byte aligned, the call is
 * forwarded to the _mm::T64 instantiation instead.
 *
 * Per 16-bit lane (assuming the helpers mirror the x86 instructions of the
 * same name): with s = source byte paired with itself and n = summed noise
 * byte paired with itself, the result is (2*mulhi(n,s) + s) >> 8, packed back
 * to bytes with unsigned saturation.
 */
template<class _mm> void TimgFilterNoiseMplayer::Tprocess::lineNoiseAvg_simd(uint8_t *dst,const uint8_t *src, int len, int8_t **shift_)
{
    if (_mm::align && (intptr_t(src)&15 || intptr_t(dst)&15)) {
        lineNoiseAvg_simd<typename _mm::T64>(dst,src,len,shift_);
        return;
    }
    const int mmx_len=len&(~(_mm::size-1));

    // shift2[k] points at the first element of the scalar tail. The loop
    // counter x is negative, so shift2[k]+x walks shift_[k][0..mmx_len) in step
    // with src+mmx_len+x and dst+mmx_len+x. Adding mmx_len a second time would
    // read the noise rows mmx_len elements past the pixel being processed.
    int8_t *shift2[3]= {shift_[0]+mmx_len, shift_[1]+mmx_len, shift_[2]+mmx_len};
    for (int x=-mmx_len; x<0; x+=_mm::size) {
        //".balign 16                   \n\t"
        typename _mm::__m mm0,mm1,mm2,mm3;
        // mm1 = byte-wise sum of the three noise rows, mm0 = source pixels.
        movdqu (mm1,shift2[0]+x);
        movq (mm0,src+mmx_len+x);
        typename _mm::__m shift1_8;
        movVqu(shift1_8, shift2[1]+x);
        paddb (mm1,shift1_8);
        typename _mm::__m shift2_8;
        movVqu(shift2_8, shift2[2]+x);
        paddb (mm1,shift2_8);
        // Keep copies for the high halves, then pair every byte with itself so
        // each value occupies a full 16-bit lane.
        movq (mm2,mm0);
        movq (mm3,mm1);
        punpcklbw (mm0,mm0);
        punpckhbw (mm2,mm2);
        punpcklbw (mm1,mm1);
        punpckhbw (mm3,mm3);
        // noise*source (high word), doubled, plus the source itself.
        pmulhw (mm1,mm0);
        pmulhw (mm3,mm2);
        paddw (mm1,mm1);
        paddw (mm3,mm3);
        paddw (mm1,mm0);
        paddw (mm3,mm2);
        // Back to 8 bits per pixel.
        psrlw (mm1,8);
        psrlw (mm3,8);
        packuswb (mm1,mm3);
        movq (dst+mmx_len+x,mm1);
    }

    // Scalar tail for the bytes that do not fill a whole vector.
    if (mmx_len!=len) {
        lineNoiseAvg_C(dst+mmx_len, src+mmx_len, len-mmx_len, shift2);
    }
}
// Example #2
// 0
/*
 * Self-test for the memory-to-memory MMX wrapper macros.
 *
 * Every case loads the source operand into ma and the destination operand
 * into mb, applies op(ma, mb), and then prints the resulting mb.q to stdout
 * and the expected value to stderr so the two streams can be compared.
 * Both streams are flushed after each case so that actual and expected lines
 * stay paired when the streams share a terminal or pipe.
 */
#define MMX_CHECK(op, src, dst, expect) \
	do { \
		ma.q = (src); \
		mb.q = (dst); \
		op(ma, mb); \
		fprintf(stdout, #op ": mb.q is %016llx\n", (unsigned long long)mb.q); \
		fputs(#op ": mb.q is " expect "\n", stderr); \
		fflush(stdout); fflush(stderr); \
	} while (0)

int main()
{
	mmx_t ma;
	mmx_t mb;

	/* The probe is still called, but its result does not gate the tests:
	 * they always run. */
	(void)mmx_ok();

	/* PADD *****************************************************/
	MMX_CHECK(paddd,     0x1111111180000000LL, 0x7fffffff00000001LL, "9111111080000001");
	MMX_CHECK(paddw,     0x0001000100010001LL, 0x80007fffffff0001LL, "8001800000000002");
	MMX_CHECK(paddw,     0x80007fffffff0001LL, 0x0001000100010000LL, "8001800000000001");
	MMX_CHECK(paddb,     0x01010101807fff01LL, 0x807fff0101010101LL, "8180000281800002");

	/* PADDS ****************************************************/
	MMX_CHECK(paddsw,    0x0001000100010001LL, 0x80007fffffff0001LL, "80017fff00000002");
	MMX_CHECK(paddsw,    0x80007fffffff0001LL, 0x0001000100010000LL, "80017fff00000001");
	MMX_CHECK(paddsb,    0x01010101807fff01LL, 0x807fff0101010101LL, "817f0002817f0002");

	/* PADDUS ***************************************************/
	MMX_CHECK(paddusw,   0x0001000100010001LL, 0x80007fffffff0001LL, "80018000ffff0002");
	MMX_CHECK(paddusw,   0x80007fffffff0001LL, 0x0001000100010000LL, "80018000ffff0001");
	MMX_CHECK(paddusb,   0x01010101807fff01LL, 0x807fff0101010101LL, "8180ff028180ff02");

	/* PSUB *****************************************************/
	MMX_CHECK(psubd,     0x7fffffff00000001LL, 0x1111111180000000LL, "911111127fffffff");
	MMX_CHECK(psubw,     0x80007fffffff0001LL, 0x0001000100010001LL, "8001800200020000");
	MMX_CHECK(psubw,     0x0001000100010000LL, 0x80007fffffff0001LL, "7fff7ffefffe0001");
	MMX_CHECK(psubb,     0x807fff0101010101LL, 0x01010101807fff01LL, "818202007f7efe00");

	/* PSUBS ****************************************************/
	MMX_CHECK(psubsw,    0x80007fffffff0001LL, 0x0001000100010001LL, "7fff800200020000");
	MMX_CHECK(psubsw,    0x0001000100010000LL, 0x80007fffffff0001LL, "80007ffefffe0001");
	MMX_CHECK(psubsb,    0x807fff0101010101LL, 0x01010101807fff01LL, "7f820200807efe00");

	/* PSUBUS ***************************************************/
	MMX_CHECK(psubusw,   0x80007fffffff0001LL, 0x0001000100010001LL, "0000000000000000");
	MMX_CHECK(psubusw,   0x0001000100010000LL, 0x80007fffffff0001LL, "7fff7ffefffe0001");
	MMX_CHECK(psubusb,   0x807fff0101010101LL, 0x01010101807fff01LL, "000000007f7efe00");

	/* PMUL *****************************************************/
	MMX_CHECK(pmulhw,    0x8000ffff00ff0000LL, 0x0200ffff00ffffffLL, "ff00000000000000");
	MMX_CHECK(pmullw,    0x8000ffff00ff0000LL, 0x0200ffff00ffffffLL, "00000001fe010000");

	/* PMADD ****************************************************/
	MMX_CHECK(pmaddwd,   0x8000345680007f34LL, 0x93234a27ffff1707LL, "4597551a0b71a66c");

	/* PCMPEQ ***************************************************/
	MMX_CHECK(pcmpeqd,   0x800034568f237f34LL, 0x93009a568f237f34LL, "00000000ffffffff");
	MMX_CHECK(pcmpeqw,   0x800034568f237f34LL, 0x93009a568f237f34LL, "00000000ffffffff");
	MMX_CHECK(pcmpeqb,   0x800034568f237f34LL, 0x93009a568f237f34LL, "00ff00ffffffffff");

	/* PCMPGT ***************************************************/
	MMX_CHECK(pcmpgtd,   0x666688884477aaffLL, 0x1234567890abcdefLL, "0000000000000000");
	MMX_CHECK(pcmpgtw,   0x666688884477aaffLL, 0x1234567890abcdefLL, "0000ffff0000ffff");
	MMX_CHECK(pcmpgtb,   0x666688884477aaffLL, 0x1234567890abcdefLL, "0000ffff0000ff00");

	/* PACKSS ***************************************************/
	MMX_CHECK(packssdw,  0x00012222000abbbbLL, 0x0000888800003333LL, "7fff7fff7fff3333");
	MMX_CHECK(packsswb,  0x00aa00dd01009999LL, 0x0011002200330044LL, "7f7f7f8011223344");

	/* PACKUS ***************************************************/
	MMX_CHECK(packuswb,  0x00aa00dd01009999LL, 0x0011002200330044LL, "aaddff0011223344");

	/* PUNPCKH **************************************************/
	MMX_CHECK(punpckhdq, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL, "090a0b0c01020304");
	MMX_CHECK(punpckhwd, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL, "090a01020b0c0304");
	MMX_CHECK(punpckhbw, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL, "09010a020b030c04");

	/* PUNPCKL **************************************************/
	MMX_CHECK(punpckldq, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL, "0d0e0f0005060708");
	MMX_CHECK(punpcklwd, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL, "0d0e05060f000708");
	MMX_CHECK(punpcklbw, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL, "0d050e060f070008");

	/* PAND, PANDN, POR, PXOR ***********************************/
	MMX_CHECK(pand,      0x5555555555555555LL, 0x3333333333333333LL, "1111111111111111");
	MMX_CHECK(pandn,     0x5555555555555555LL, 0x3333333333333333LL, "4444444444444444");
	MMX_CHECK(por,       0x5555555555555555LL, 0x3333333333333333LL, "7777777777777777");
	MMX_CHECK(pxor,      0x5555555555555555LL, 0x3333333333333333LL, "6666666666666666");

	/* PSLL (ma holds the shift count, 0x18 = 24) ***************/
	MMX_CHECK(psllq,     0x0000000000000018LL, 0x0123456789abcdefLL, "6789abcdef000000");
	MMX_CHECK(pslld,     0x0000000000000018LL, 0x0123456789abcdefLL, "67000000ef000000");
	MMX_CHECK(psllw,     0x0000000000000018LL, 0x0123456789abcdefLL, "0000000000000000");

	/* PSRL *****************************************************/
	MMX_CHECK(psrlq,     0x0000000000000018LL, 0x0123456789abcdefLL, "0000000123456789");
	MMX_CHECK(psrld,     0x0000000000000018LL, 0x0123456789abcdefLL, "0000000100000089");
	MMX_CHECK(psrlw,     0x0000000000000018LL, 0x0123456789abcdefLL, "0000000000000000");

	/* PSRA *****************************************************/
	MMX_CHECK(psrad,     0x0000000000000018LL, 0x0123456789abcdefLL, "00000001ffffff89");
	MMX_CHECK(psraw,     0x0000000000000018LL, 0x0123456789abcdefLL, "00000000ffffffff");

	/* Leave MMX state ******************************************/
	emms();

	/* Clean-up and exit nicely */
	exit(0);
}

#undef MMX_CHECK
 /**
  * Interleaves three 8-bit planes (G, B, R) into a packed 4-bytes-per-pixel
  * destination.
  *
  * For every pixel the bytes written are, in memory order, B, G, R, 0xff
  * (assuming the punpck* helpers behave like the x86 instructions of the same
  * name).
  *
  * @tparam _mm          SIMD traits type; supplies __m, size, movntVq, empty
  * @tparam src_aligned  non-zero: planes are read with movVqa, else movVqu
  * @tparam dst_aligned  non-zero: output is written with _mm::movntVq, else movVqu
  * @param srcG,srcB,srcR  source planes; all three share stride_src
  * @param dst           destination, 4 bytes per pixel
  * @param dx            width in pixels; nothing is done when dx < _mm::size
  * @param dy            height in rows
  * @param stride_src    byte distance between rows of each source plane
  * @param stride_dst    byte distance between destination rows
  *
  * When dx is not a multiple of _mm::size, the last _mm::size pixels of every
  * row are converted again through the unaligned variant, overlapping pixels
  * that the main loop already produced.
  */
 template <class _mm, int src_aligned, int dst_aligned> void TffdshowConverters2::convert_simd_GBRPtoRGB(
    const uint8_t* srcG,
    const uint8_t* srcB,
    const uint8_t* srcR,
    uint8_t* dst,
    int dx,
    int dy,
    stride_t stride_src,
    stride_t stride_dst)
 {
    int xCount = dx / _mm::size;
    if (xCount <= 0)
        return;

    // _mm::__m is a dependent type, so it has to be introduced with typename.
    typename _mm::__m _mm0,_mm1,_mm2,_mm3,_mm4,_mm5,_mm6;
    typename _mm::__m ffff;
    // Build an all-ones register; it supplies the 0xff fourth byte.
    pxor(ffff,ffff);
    pcmpeqb(ffff,ffff);
    for (int y = 0 ; y < dy ; y++) {
        const uint8_t *g = srcG + y * stride_src;
        const uint8_t *b = srcB + y * stride_src;
        const uint8_t *r = srcR + y * stride_src;
        uint8_t *dst1 = dst + y * stride_dst;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, r);
                movVqa(_mm1, g);
                movVqa(_mm2, b);
            } else {
                movVqu(_mm0, r);
                movVqu(_mm1, g);
                movVqu(_mm2, b);
            }
            // Copies feed the high-half unpacks below.
            _mm4 = _mm0;
            _mm5 = _mm2;
            punpcklbw(_mm0,ffff);    // 0xff,R
            punpcklbw(_mm2,_mm1);    // G,B
            punpckhbw(_mm4,ffff);    // 0xff,R
            punpckhbw(_mm5,_mm1);    // G,B
            _mm3 = _mm2;
            _mm6 = _mm5;
            punpckhwd(_mm2,_mm0);    // 0xff,RGB * 4 (dst+_mm::size)
            punpcklwd(_mm3,_mm0);    // 0xff,RGB * 4 (dst)
            punpckhwd(_mm5,_mm4);    // 0xff,RGB * 4 (dst+_mm::size)
            punpcklwd(_mm6,_mm4);    // 0xff,RGB * 4 (dst)
            if (dst_aligned) {
                _mm::movntVq(dst1              , _mm3);
                _mm::movntVq(dst1 + _mm::size  , _mm2);
                _mm::movntVq(dst1 + _mm::size*2, _mm6);
                _mm::movntVq(dst1 + _mm::size*3, _mm5);
            } else {
                movVqu(dst1              , _mm3);
                movVqu(dst1 + _mm::size  , _mm2);
                movVqu(dst1 + _mm::size*2, _mm6);
                movVqu(dst1 + _mm::size*3, _mm5);
            }
            r += _mm::size;
            g += _mm::size;
            b += _mm::size;
            dst1 += _mm::size * 4;
        } while(--x);
    }

    if (xCount * (int)_mm::size < dx && dx > _mm::size) {
        // Width is not a multiple of _mm::size: redo the last _mm::size pixels
        // of every row with unaligned accesses.
        int dxDone = dx - _mm::size;
        srcG += dxDone;
        srcB += dxDone;
        srcR += dxDone;
        dst  += dxDone * 4;
        convert_simd_GBRPtoRGB<_mm, 0, 0>(srcG, srcB, srcR, dst, _mm::size, dy, stride_src, stride_dst);
    }

    _mm::empty();
 }
/**
 * Interleaves three 8-bit planes (Y, Cb, Cr) into a packed 4-bytes-per-pixel
 * destination.
 *
 * For every pixel the bytes written are, in memory order, Cr, Cb, Y, 0xff
 * (assuming the punpck* helpers behave like the x86 instructions of the same
 * name).
 *
 * @tparam _mm          SIMD traits type; supplies __m, size, movntVq, empty
 * @tparam src_aligned  non-zero: planes are read with movVqa, else movVqu
 * @tparam dst_aligned  non-zero: output is written with _mm::movntVq, else movVqu
 * @param srcY,srcCb,srcCr  source planes; all three share stride_src
 * @param dst           destination, 4 bytes per pixel
 * @param dx            width in pixels; nothing is done when dx < _mm::size
 * @param dy            height in rows
 * @param stride_src    byte distance between rows of each source plane
 * @param stride_dst    byte distance between destination rows
 *
 * When dx is not a multiple of _mm::size, the last _mm::size pixels of every
 * row are converted again through the unaligned variant, overlapping pixels
 * that the main loop already produced.
 */
template <class _mm, int src_aligned, int dst_aligned> void TffdshowConverters2::convert_simd_AYUV(
    const uint8_t* srcY,
    const uint8_t* srcCb,
    const uint8_t* srcCr,
    uint8_t* dst,
    int dx,
    int dy,
    stride_t stride_src,
    stride_t stride_dst)
{
    int xCount = dx / _mm::size;
    if (xCount <= 0)
        return;

    // _mm::__m is a dependent type, so it has to be introduced with typename.
    typename _mm::__m _mm0,_mm1,_mm2,_mm3,_mm4,_mm5;
    typename _mm::__m ffff;
    // Build an all-ones register; it supplies the 0xff fourth byte.
    pxor(ffff,ffff);
    pcmpeqb(ffff,ffff);
    for (int y = 0 ; y < dy ; y++) {
        const uint8_t *Y = srcY + y * stride_src;
        uint8_t *dst1 = dst + y * stride_dst;
        const uint8_t *Cb = srcCb + y * stride_src;
        const uint8_t *Cr = srcCr + y * stride_src;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm1, Y);
                movVqa(_mm4, Cb);
                movVqa(_mm2, Cr);
            } else {
                movVqu(_mm1, Y);
                movVqu(_mm4, Cb);
                movVqu(_mm2, Cr);
            }
            Y  += _mm::size;
            Cb += _mm::size;
            Cr += _mm::size;
            // Cr/Cb byte pairs: low half in _mm2, high half in _mm3.
            _mm3 = _mm2;
            punpcklbw(_mm2, _mm4);
            punpckhbw(_mm3, _mm4);
            // Y/0xff byte pairs: low half in _mm1, high half in _mm5.
            _mm5 = _mm1;
            punpcklbw(_mm1, ffff);
            punpckhbw(_mm5, ffff);
            // Merge the 16-bit pairs into four-byte pixels. _mm4 is reused as
            // scratch only after its Cb contents have been consumed above.
            _mm4 = _mm2;
            punpcklwd(_mm2, _mm1);
            punpckhwd(_mm4, _mm1);
            _mm0 = _mm3;
            punpcklwd(_mm0, _mm5);
            punpckhwd(_mm3, _mm5);
            if (dst_aligned) {
                _mm::movntVq(dst1,                 _mm2);
                _mm::movntVq(dst1 + _mm::size,     _mm4);
                _mm::movntVq(dst1 + _mm::size * 2, _mm0);
                _mm::movntVq(dst1 + _mm::size * 3, _mm3);
            } else {
                movVqu(dst1,                 _mm2);
                movVqu(dst1 + _mm::size,     _mm4);
                movVqu(dst1 + _mm::size * 2, _mm0);
                movVqu(dst1 + _mm::size * 3, _mm3);
            }
            dst1 += _mm::size * 4;
        } while(--x);
    }

    if (xCount * (int)_mm::size < dx && dx > _mm::size) {
        // Width is not a multiple of _mm::size: redo the last _mm::size pixels
        // of every row with unaligned accesses.
        int dxDone = dx - _mm::size;
        srcY  += dxDone;
        srcCb += dxDone;
        srcCr += dxDone;
        dst   += dxDone * 4;
        convert_simd_AYUV<_mm, 0, 0>(srcY, srcCb, srcCr, dst, _mm::size, dy, stride_src, stride_dst);
    }

    _mm::empty();
}
/**
 * Copies a luma plane unchanged and interleaves separate Cb and Cr planes
 * into one CbCr plane.
 *
 * Luma: dy rows are copied, _mm::size*2 bytes per step.
 * Chroma: dy/2 rows are produced; each step reads _mm::size bytes of Cb and of
 * Cr and writes _mm::size*2 bytes of alternating Cb,Cr (assuming punpck*bw
 * behave like the x86 instructions of the same name).
 *
 * @tparam _mm          SIMD traits type; supplies __m, size, movntVq, empty
 * @tparam src_aligned  non-zero: sources are read with movVqa, else movVqu
 * @tparam dst_aligned  non-zero: output is written with _mm::movntVq, else movVqu
 * @param srcY,srcCb,srcCr  source planes
 * @param dstY,dstCbCr  destination planes
 * @param dx            luma width in bytes; nothing is done when dx < _mm::size*2
 * @param dy            luma height in rows
 * @param stride_Y, stride_CbCr        source row strides (Cb and Cr share one)
 * @param stride_dstY, stride_dstCbCr  destination row strides
 *
 * When dx is not a multiple of _mm::size*2, the last _mm::size*2 luma bytes
 * (and the matching chroma) of every row are processed again through the
 * unaligned variant.
 * NOTE(review): the tail offsets use dxDone/2 for the chroma sources and
 * dxDone for the interleaved destination, which only line up for even dx —
 * confirm callers guarantee that.
 */
template <class _mm, int src_aligned, int dst_aligned> void TffdshowConverters2::convert_YV12toNV12(
    const uint8_t* srcY,
    const uint8_t* srcCb,
    const uint8_t* srcCr,
    uint8_t* dstY,
    uint8_t* dstCbCr,
    int dx,
    int dy,
    stride_t stride_Y,
    stride_t stride_CbCr,
    stride_t stride_dstY,
    stride_t stride_dstCbCr)
{
    int xCount = dx / (_mm::size*2);
    if (xCount <= 0)
        return;

    // _mm::__m is a dependent type, so it has to be introduced with typename.
    typename _mm::__m _mm0,_mm1,_mm2,_mm3;

    // Luma plane: straight copy, two vectors per iteration.
    for (int y = 0 ; y < dy ; y++) {
        const uint8_t *src = srcY + y * stride_Y;
        uint8_t *dst = dstY + y * stride_dstY;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, src);
                movVqa(_mm1, src + _mm::size);
            } else {
                movVqu(_mm0, src);
                movVqu(_mm1, src + _mm::size);
            }
            src += _mm::size * 2;
            if (dst_aligned) {
                _mm::movntVq(dst, _mm0);
                _mm::movntVq(dst + _mm::size, _mm1);
            } else {
                movVqu(dst, _mm0);
                movVqu(dst + _mm::size, _mm1);
            }
            dst += _mm::size * 2;
        } while(--x);
    }

    // Chroma planes: one vector of Cb and one of Cr become two vectors of
    // interleaved Cb,Cr.
    int dyCbCr = dy/2;
    for (int y = 0 ; y < dyCbCr ; y++) {
        const uint8_t *Cb = srcCb + y * stride_CbCr;
        const uint8_t *Cr = srcCr + y * stride_CbCr;
        uint8_t *dst = dstCbCr + y * stride_dstCbCr;
        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, Cb);
                movVqa(_mm1, Cr);
            } else {
                movVqu(_mm0, Cb);
                movVqu(_mm1, Cr);
            }
            _mm2 = _mm0;
            _mm3 = _mm1;
            Cb += _mm::size;
            Cr += _mm::size;
            punpcklbw(_mm0, _mm1);
            punpckhbw(_mm2, _mm3);

            if (dst_aligned) {
                _mm::movntVq(dst, _mm0);
                _mm::movntVq(dst + _mm::size, _mm2);
            } else {
                movVqu(dst, _mm0);
                movVqu(dst + _mm::size, _mm2);
            }
            dst += _mm::size * 2;
        } while(--x);
    }

    if (xCount * (int)_mm::size * 2 < dx && dx > _mm::size * 2) {
        // Width is not a multiple of _mm::size*2: redo the last block of every
        // row with unaligned accesses.
        int dxDone = dx - _mm::size * 2;
        srcY  += dxDone;
        srcCb += dxDone/2;
        srcCr += dxDone/2;
        dstY    += dxDone;
        dstCbCr += dxDone;
        convert_YV12toNV12<_mm, 0, 0>(srcY, srcCb, srcCr, dstY, dstCbCr, _mm::size * 2, dy, stride_Y, stride_CbCr, stride_dstY, stride_dstCbCr);
    }

    _mm::empty();
}
// Example #6
// 0
/*
 * Driver for the emulated MMX routines.
 *
 * Prints the CPU speed and timing offset reported by calc_cpu_speed() and
 * calc_cpu_ofs(), runs a series of emulated operations on fixed operands while
 * dumping the operands before and after each one, and finally times a short
 * sequence of real MMX instructions against the emulated pmaddwd.
 *
 * EMU_CASE loads both operands, dumps them, applies op(&ma, &mb) and dumps
 * them again.
 */
#define EMU_CASE(op, src, dst) \
	do { \
		ma.q = (src); \
		mb.q = (dst); \
		mmx_regdump(ma); \
		mmx_regdump(mb); \
		op(&ma, &mb); \
		mmx_regdump(ma); \
		mmx_regdump(mb); \
	} while (0)

int main()
{
	int rval, i, co, tmp;
	mmx_t ma, mb;

	tmp = calc_cpu_speed();
	printf(" Calculating CPU-Speed....running at %d MHz\n", tmp);
	printf(" Calculating CPU-OFFSET...found : ");
	co = calc_cpu_ofs();
	printf(" %d\n",co);
	rval = 1; /* mmx_ok() is not consulted: the operations below are emulated */
	printf(" *** Bogus message since we are emulating so MMX does allways exist ***\n");
	printf(" Value returned from init was %x.", (unsigned)rval);
	printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");

	if(rval) {
		/* PADD: paddd, then paddw on the result without reloading */
		ma.q = 0x1111111180000000LL;
		mb.q = 0x7fffffff00000001LL;
		mmx_regdump(ma);
		mmx_regdump(mb);
		paddd( &ma, &mb);
		printf("paddd: mb.q is %016llx\n", (unsigned long long)mb.q);
		mmx_regdump(mb);
		paddw( &ma, &mb);
		mmx_regdump(ma);
		mmx_regdump(mb);

		EMU_CASE(psllw,     0x0000000000000008LL, 0x0001000200030004LL);

		/* Operands are loaded and dumped only; no operation is applied. */
		ma.q = 0x0000000000000000LL;
		mb.q = 0x888044a87f06fe80LL;
		mmx_regdump(ma);
		mmx_regdump(mb);

		EMU_CASE(packuswb,  0x00aa00dd01009999LL, 0x0011002200330044LL);

		EMU_CASE(punpckhdq, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL);
		EMU_CASE(punpckhwd, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL);
		EMU_CASE(punpckhbw, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL);

		EMU_CASE(punpckldq, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL);
		EMU_CASE(punpcklwd, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL);
		EMU_CASE(punpcklbw, 0x090a0b0c0d0e0f00LL, 0x0102030405060708LL);

		EMU_CASE(pmaddwd,   0x8000800080008000LL, 0x8000800080008000LL);

		EMU_CASE(packsswb,  0x00aa00dd01009999LL, 0x0011002200330044LL);
		EMU_CASE(packsswb,  0x007e7f00ef9dff88LL, 0xff020085007e81cfLL);

		for (i=0; i< 0x0F; i++){
			printf ("%d. Test running\n", i);

			/* Two plain word stores, timed. */
			GET_TSC(tsc1);
			mb.w[0] = i;
			ma.w[0] = i;
			GET_TSC(tsc2);
			tick_dump(co);

			/* Real MMX instructions. Written as adjacent string literals,
			 * one instruction per line, because a string literal may not
			 * contain raw newlines.
			 * NOTE(review): no emms follows this sequence — confirm that
			 * tick_dump does not rely on x87 floating point. */
			GET_TSC(tsc1);
			asm(
				"packsswb %mm0, %mm1\n\t"
				"pmaddwd %mm0, %mm1\n\t"
				"punpcklbw %mm0, %mm1\n\t"
				"punpcklbw %mm0, %mm1\n\t"
				"pmaddwd %mm0, %mm1\n\t"
				"punpcklbw %mm0, %mm1\n\t"
			);
			GET_TSC(tsc2);
			tick_dump(co);

			/* The emulated pmaddwd, for comparison. */
			ma.q = 0x8000800080008000LL;
			mb.q = 0x8000800080008000LL;
			GET_TSC(tsc1);
			pmaddwd( &ma, &mb);
			GET_TSC(tsc2);
			tick_dump(co);
		}

		}
	exit(0); /* Clean-up and exit nicely */
}

#undef EMU_CASE