template<class _mm> void TimgFilterNoiseMplayer::Tprocess::lineNoiseAvg_simd(uint8_t *dst,const uint8_t *src, int len, int8_t **shift_) { if (_mm::align && (intptr_t(src)&15 || intptr_t(dst)&15)) { lineNoiseAvg_simd<typename _mm::T64>(dst,src,len,shift_); return; } const int mmx_len=len&(~(_mm::size-1)); int8_t *shift2[3]= {shift_[0]+mmx_len, shift_[1]+mmx_len, shift_[2]+mmx_len}; for (int x=-mmx_len; x<0; x+=_mm::size) { //".balign 16 \n\t" typename _mm::__m mm0,mm1,mm2,mm3; movdqu (mm1,shift2[0]+mmx_len+x); movq (mm0,src+mmx_len+x); typename _mm::__m shift1_8; movVqu(shift1_8, shift2[1]+mmx_len+x); paddb (mm1,shift1_8); typename _mm::__m shift2_8; movVqu(shift2_8, shift2[2]+mmx_len+x); paddb (mm1,shift2_8); movq (mm2,mm0); movq (mm3,mm1); punpcklbw (mm0,mm0); punpckhbw (mm2,mm2); punpcklbw (mm1,mm1); punpckhbw (mm3,mm3); pmulhw (mm1,mm0); pmulhw (mm3,mm2); paddw (mm1,mm1); paddw (mm3,mm3); paddw (mm1,mm0); paddw (mm3,mm2); psrlw (mm1,8); psrlw (mm3,8); packuswb (mm1,mm3); movq (dst+mmx_len+x,mm1); } if (mmx_len!=len) { lineNoiseAvg_C(dst+mmx_len, src+mmx_len, len-mmx_len, shift2); } }
int main() { int rval; mmx_t ma; mmx_t mb; movq_r2r(mm0, mm1); rval = mmx_ok(); /* Announce return value of mmx_ok() */ // printf("Value returned from init was %x.", rval); // printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not"); // fflush(stdout); fflush(stderr); // if(rval) { /* PADD *****************************************************/ ma.q = 0x1111111180000000LL; mb.q = 0x7fffffff00000001LL; paddd(ma, mb); fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddd: mb.q is 9111111080000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddw(ma, mb); fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddw: mb.q is 8001800000000001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddb(ma, mb); fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddb: mb.q is 8180000281800002\n"); fflush(stdout); fflush(stderr); /* PADDS ****************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n"); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddsw(ma, mb); fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n"); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddsb(ma, mb); fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n"); fflush(stdout); fflush(stderr); /* PADDUS ***************************************************/ ma.q = 0x0001000100010001LL; mb.q = 0x80007fffffff0001LL; paddusw(ma, mb); fprintf(stdout, "paddusw: 
mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010000LL; paddusw(ma, mb); fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x01010101807fff01LL; mb.q = 0x807fff0101010101LL; paddusb(ma, mb); fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n"); fflush(stdout); fflush(stderr); /* PSUB *****************************************************/ ma.q = 0x7fffffff00000001LL; mb.q = 0x1111111180000000LL; psubd(ma, mb); fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubd: mb.q is 911111127fffffff\n"); fflush(stdout); fflush(stderr); ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 8001800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubw(ma, mb); fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubb(ma, mb); fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n"); fflush(stdout); fflush(stderr); /* PSUBS ****************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubsw(ma, mb); fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; 
psubsb(ma, mb); fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n"); fflush(stdout); fflush(stderr); /* PSUBUS ***************************************************/ ma.q = 0x80007fffffff0001LL; mb.q = 0x0001000100010001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); ma.q = 0x0001000100010000LL; mb.q = 0x80007fffffff0001LL; psubusw(ma, mb); fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n"); fflush(stdout); fflush(stderr); ma.q = 0x807fff0101010101LL; mb.q = 0x01010101807fff01LL; psubusb(ma, mb); fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q); fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n"); fflush(stdout); fflush(stderr); /* PMUL *****************************************************/ ma.q = 0x8000ffff00ff0000LL; mb.q = 0x0200ffff00ffffffLL; pmulhw(ma, mb); fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0200ffff00ffffffLL; pmullw(ma, mb); fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n"); fflush(stdout); fflush(stderr); /* PMADD ****************************************************/ ma.q = 0x8000345680007f34LL; mb.q = 0x93234a27ffff1707LL; pmaddwd(ma, mb); fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n"); fflush(stdout); fflush(stderr); /* PCMPEQ ***************************************************/ ma.q = 0x800034568f237f34LL; mb.q = 0x93009a568f237f34LL; pcmpeqd(ma, mb); fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqw(ma, mb); fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q); fprintf(stderr, 
"pcmpeqw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x93009a568f237f34LL; pcmpeqb(ma, mb); fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n"); fflush(stdout); fflush(stderr); /* PCMPGT ***************************************************/ ma.q = 0x666688884477aaffLL; mb.q = 0x1234567890abcdefLL; pcmpgtd(ma, mb); fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtw(ma, mb); fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n"); fflush(stdout); fflush(stderr); mb.q = 0x1234567890abcdefLL; pcmpgtb(ma, mb); fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q); fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n"); fflush(stdout); fflush(stderr); /* PACKSS ***************************************************/ ma.q = 0x00012222000abbbbLL; mb.q = 0x0000888800003333LL; packssdw(ma, mb); fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q); fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n"); fflush(stdout); fflush(stderr); ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packsswb(ma, mb); fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n"); fflush(stdout); fflush(stderr); /* PACKUS ***************************************************/ ma.q = 0x00aa00dd01009999LL; mb.q = 0x0011002200330044LL; packuswb(ma, mb); fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q); fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n"); fflush(stdout); fflush(stderr); /* PUNPCKH **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckhdq(ma, mb); fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n"); fflush(stdout); fflush(stderr); mb.q = 
0x0102030405060708LL; punpckhwd(ma, mb); fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpckhbw(ma, mb); fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n"); fflush(stdout); fflush(stderr); /* PUNPCKL **************************************************/ ma.q = 0x090a0b0c0d0e0f00LL; mb.q = 0x0102030405060708LL; punpckldq(ma, mb); fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklwd(ma, mb); fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n"); fflush(stdout); fflush(stderr); mb.q = 0x0102030405060708LL; punpcklbw(ma, mb); fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q); fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n"); fflush(stdout); fflush(stderr); /* PAND, PANDN, POR, PXOR ***********************************/ ma.q = 0x5555555555555555LL; mb.q = 0x3333333333333333LL; pand(ma, mb); fprintf(stdout, "pand: mb.q is %016llx\n", mb.q); fprintf(stderr, "pand: mb.q is 1111111111111111\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pandn(ma, mb); fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q); fprintf(stderr, "pandn: mb.q is 4444444444444444\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; por(ma, mb); fprintf(stdout, "por: mb.q is %016llx\n", mb.q); fprintf(stderr, "por: mb.q is 7777777777777777\n"); fflush(stdout); fflush(stderr); mb.q = 0x3333333333333333LL; pxor(ma, mb); fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q); fprintf(stderr, "pxor: mb.q is 6666666666666666\n"); fflush(stdout); fflush(stderr); /* PSLL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psllq(ma, mb); 
fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; pslld(ma, mb); fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q); fprintf(stderr, "pslld: mb.q is 67000000ef000000\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psllw(ma, mb); fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psllw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRL *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrlq(ma, mb); fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlq: mb.q is 0000000123456789\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrld(ma, mb); fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrld: mb.q is 0000000100000089\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psrlw(ma, mb); fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrlw: mb.q is 0000000000000000\n"); fflush(stdout); fflush(stderr); /* PSRA *****************************************************/ ma.q = 0x0000000000000018LL; mb.q = 0x0123456789abcdefLL; psrad(ma, mb); fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q); fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n"); fflush(stdout); fflush(stderr); mb.q = 0x0123456789abcdefLL; psraw(ma, mb); fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q); fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n"); fflush(stdout); fflush(stderr); /* Exit MXX *************************************************/ emms(); } /* Clean-up and exit nicely */ exit(0); }
/**
 * Interleaves three planar 8-bit rows (G, B, R) into packed 4-byte pixels.
 *
 * Every source pixel produces four destination bytes built from B, G, R and a
 * constant 0xff byte. Assuming the punpck* wrappers follow the x86
 * instructions of the same name, the byte order in memory is B, G, R, 0xff.
 *
 * Rows are processed in groups of _mm::size pixels. If dx is not a multiple
 * of _mm::size, the last _mm::size pixels of every row are converted a second
 * time with the unaligned variant, overlapping pixels that were already
 * written. If dx is smaller than _mm::size nothing is converted at all.
 *
 * @tparam _mm          SIMD traits (register type __m, width size, stores, empty())
 * @tparam src_aligned  non-zero: rows are loaded with movVqa, otherwise movVqu
 * @tparam dst_aligned  non-zero: rows are stored with _mm::movntVq, otherwise movVqu
 * @param srcG,srcB,srcR  first row of the respective source plane
 * @param dst           first row of the packed destination
 * @param dx            width in pixels
 * @param dy            height in rows
 * @param stride_src    byte distance between source rows (same for all planes)
 * @param stride_dst    byte distance between destination rows
 */
template <class _mm, int src_aligned, int dst_aligned> void TffdshowConverters2::convert_simd_GBRPtoRGB(
    const uint8_t* srcG,
    const uint8_t* srcB,
    const uint8_t* srcR,
    uint8_t* dst,
    int dx,
    int dy,
    stride_t stride_src,
    stride_t stride_dst)
{
    // Work with a signed copy of the vector width: should _mm::size be an
    // unsigned type, "dx / _mm::size" would be evaluated in unsigned
    // arithmetic and a negative dx could turn into a huge positive count.
    const int vecSize = (int)_mm::size;
    const int xCount = dx / vecSize;
    if (xCount <= 0) {
        return;
    }

    // _mm::__m is a dependent type, hence the mandatory "typename".
    typename _mm::__m _mm0,_mm1,_mm2,_mm3,_mm4,_mm5,_mm6;

    // All-ones register that supplies the constant 0xff byte of every pixel.
    typename _mm::__m ffff;
    pxor(ffff,ffff);
    pcmpeqb(ffff,ffff);

    for (int y = 0 ; y < dy ; y++) {
        const uint8_t *g = srcG + y * stride_src;
        const uint8_t *b = srcB + y * stride_src;
        const uint8_t *r = srcR + y * stride_src;
        uint8_t *dst1 = dst + y * stride_dst;

        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, r);
                movVqa(_mm1, g);
                movVqa(_mm2, b);
            } else {
                movVqu(_mm0, r);
                movVqu(_mm1, g);
                movVqu(_mm2, b);
            }

            // Byte interleave: (R,0xff) pairs and (B,G) pairs, low and high halves.
            _mm4 = _mm0;
            _mm5 = _mm2;
            punpcklbw(_mm0,ffff);  // 0xff,R
            punpcklbw(_mm2,_mm1);  // G,B
            punpckhbw(_mm4,ffff);  // 0xff,R
            punpckhbw(_mm5,_mm1);  // G,B

            // Word interleave: combine the pairs into complete 4-byte pixels.
            _mm3 = _mm2;
            _mm6 = _mm5;
            punpckhwd(_mm2,_mm0);  // second quarter of the output
            punpcklwd(_mm3,_mm0);  // first quarter of the output
            punpckhwd(_mm5,_mm4);  // fourth quarter of the output
            punpcklwd(_mm6,_mm4);  // third quarter of the output

            if (dst_aligned) {
                _mm::movntVq(dst1              , _mm3);
                _mm::movntVq(dst1 + vecSize    , _mm2);
                _mm::movntVq(dst1 + vecSize * 2, _mm6);
                _mm::movntVq(dst1 + vecSize * 3, _mm5);
            } else {
                movVqu(dst1              , _mm3);
                movVqu(dst1 + vecSize    , _mm2);
                movVqu(dst1 + vecSize * 2, _mm6);
                movVqu(dst1 + vecSize * 3, _mm5);
            }

            r += vecSize;
            g += vecSize;
            b += vecSize;
            dst1 += vecSize * 4;
        } while (--x);
    }

    // Width is not a multiple of the vector width: redo the last vecSize
    // pixels of every row, unaligned. (xCount >= 1 here, so this condition
    // already implies dx > vecSize.)
    if (xCount * vecSize < dx) {
        const int dxDone = dx - vecSize;
        srcG += dxDone;
        srcB += dxDone;
        srcR += dxDone;
        dst += dxDone * 4;
        convert_simd_GBRPtoRGB<_mm, 0, 0>(srcG, srcB, srcR, dst, vecSize, dy, stride_src, stride_dst);
    }

    _mm::empty();
}
/**
 * Interleaves three planar 8-bit rows (Y, Cb, Cr) into packed 4-byte pixels.
 *
 * Every source pixel produces four destination bytes built from Cr, Cb, Y and
 * a constant 0xff byte. Assuming the punpck* wrappers follow the x86
 * instructions of the same name, the byte order in memory is Cr, Cb, Y, 0xff.
 * All three planes are read with the same stride and one sample per pixel.
 *
 * Rows are processed in groups of _mm::size pixels. If dx is not a multiple
 * of _mm::size, the last _mm::size pixels of every row are converted a second
 * time with the unaligned variant, overlapping pixels that were already
 * written. If dx is smaller than _mm::size nothing is converted at all.
 *
 * @tparam _mm          SIMD traits (register type __m, width size, stores, empty())
 * @tparam src_aligned  non-zero: rows are loaded with movVqa, otherwise movVqu
 * @tparam dst_aligned  non-zero: rows are stored with _mm::movntVq, otherwise movVqu
 * @param srcY,srcCb,srcCr  first row of the respective source plane
 * @param dst           first row of the packed destination
 * @param dx            width in pixels
 * @param dy            height in rows
 * @param stride_src    byte distance between source rows (same for all planes)
 * @param stride_dst    byte distance between destination rows
 */
template <class _mm, int src_aligned, int dst_aligned> void TffdshowConverters2::convert_simd_AYUV(
    const uint8_t* srcY,
    const uint8_t* srcCb,
    const uint8_t* srcCr,
    uint8_t* dst,
    int dx,
    int dy,
    stride_t stride_src,
    stride_t stride_dst)
{
    // Work with a signed copy of the vector width: should _mm::size be an
    // unsigned type, "dx / _mm::size" would be evaluated in unsigned
    // arithmetic and a negative dx could turn into a huge positive count.
    const int vecSize = (int)_mm::size;
    const int xCount = dx / vecSize;
    if (xCount <= 0) {
        return;
    }

    // _mm::__m is a dependent type, hence the mandatory "typename".
    typename _mm::__m _mm0,_mm1,_mm2,_mm3,_mm4,_mm5;

    // All-ones register that supplies the constant 0xff byte of every pixel.
    typename _mm::__m ffff;
    pxor(ffff,ffff);
    pcmpeqb(ffff,ffff);

    for (int y = 0 ; y < dy ; y++) {
        const uint8_t *Y  = srcY  + y * stride_src;
        const uint8_t *Cb = srcCb + y * stride_src;
        const uint8_t *Cr = srcCr + y * stride_src;
        uint8_t *dst1 = dst + y * stride_dst;

        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm1, Y);
                movVqa(_mm4, Cb);
                movVqa(_mm2, Cr);
            } else {
                movVqu(_mm1, Y);
                movVqu(_mm4, Cb);
                movVqu(_mm2, Cr);
            }
            Y  += vecSize;
            Cb += vecSize;
            Cr += vecSize;

            // Byte interleave: (Cr,Cb) pairs, low and high halves.
            _mm3 = _mm2;
            punpcklbw(_mm2, _mm4);
            punpckhbw(_mm3, _mm4);

            // Byte interleave: (Y,0xff) pairs, low and high halves.
            _mm5 = _mm1;
            punpcklbw(_mm1, ffff);
            punpckhbw(_mm5, ffff);

            // Word interleave: combine the pairs into complete 4-byte pixels.
            _mm4 = _mm2;
            punpcklwd(_mm2, _mm1);  // first quarter of the output
            punpckhwd(_mm4, _mm1);  // second quarter of the output
            _mm0 = _mm3;
            punpcklwd(_mm0, _mm5);  // third quarter of the output
            punpckhwd(_mm3, _mm5);  // fourth quarter of the output

            if (dst_aligned) {
                _mm::movntVq(dst1,               _mm2);
                _mm::movntVq(dst1 + vecSize,     _mm4);
                _mm::movntVq(dst1 + vecSize * 2, _mm0);
                _mm::movntVq(dst1 + vecSize * 3, _mm3);
            } else {
                movVqu(dst1,               _mm2);
                movVqu(dst1 + vecSize,     _mm4);
                movVqu(dst1 + vecSize * 2, _mm0);
                movVqu(dst1 + vecSize * 3, _mm3);
            }
            dst1 += vecSize * 4;
        } while (--x);
    }

    // Width is not a multiple of the vector width: redo the last vecSize
    // pixels of every row, unaligned. (xCount >= 1 here, so this condition
    // already implies dx > vecSize.)
    if (xCount * vecSize < dx) {
        const int dxDone = dx - vecSize;
        srcY  += dxDone;
        srcCb += dxDone;
        srcCr += dxDone;
        dst += dxDone * 4;
        convert_simd_AYUV<_mm, 0, 0>(srcY, srcCb, srcCr, dst, vecSize, dy, stride_src, stride_dst);
    }

    _mm::empty();
}
/**
 * Converts planar Y/Cb/Cr (separate chroma planes) to a Y plane plus one
 * interleaved Cb/Cr plane.
 *
 * The Y plane is copied unchanged. For the chroma planes dy/2 rows are
 * processed and every Cb byte is followed by the matching Cr byte in the
 * destination (assuming punpcklbw/punpckhbw follow the x86 instructions of
 * the same name).
 *
 * Work is done in blocks of 2*_mm::size luma pixels (= _mm::size chroma
 * samples per plane). If dx is not a multiple of that block size, the last
 * block of every row is converted a second time with the unaligned variant,
 * overlapping data that was already written. If dx is smaller than one block
 * nothing is converted at all.
 *
 * @tparam _mm            SIMD traits (register type __m, width size, stores, empty())
 * @tparam src_aligned    non-zero: rows are loaded with movVqa, otherwise movVqu
 * @tparam dst_aligned    non-zero: rows are stored with _mm::movntVq, otherwise movVqu
 * @param srcY,srcCb,srcCr  first row of the respective source plane
 * @param dstY            first row of the destination Y plane
 * @param dstCbCr         first row of the destination interleaved chroma plane
 * @param dx              luma width in pixels
 * @param dy              luma height in rows
 * @param stride_Y        byte distance between source Y rows
 * @param stride_CbCr     byte distance between source Cb rows and between Cr rows
 * @param stride_dstY     byte distance between destination Y rows
 * @param stride_dstCbCr  byte distance between destination chroma rows
 */
template <class _mm, int src_aligned, int dst_aligned> void TffdshowConverters2::convert_YV12toNV12(
    const uint8_t* srcY,
    const uint8_t* srcCb,
    const uint8_t* srcCr,
    uint8_t* dstY,
    uint8_t* dstCbCr,
    int dx,
    int dy,
    stride_t stride_Y,
    stride_t stride_CbCr,
    stride_t stride_dstY,
    stride_t stride_dstCbCr)
{
    // Work with signed copies of the widths: should _mm::size be an unsigned
    // type, "dx / (_mm::size*2)" would be evaluated in unsigned arithmetic
    // and a negative dx could turn into a huge positive count.
    const int vecSize = (int)_mm::size;
    const int blockSize = vecSize * 2;   // luma pixels handled per iteration
    const int xCount = dx / blockSize;
    if (xCount <= 0) {
        return;
    }

    // _mm::__m is a dependent type, hence the mandatory "typename".
    typename _mm::__m _mm0,_mm1,_mm2;

    // Luma: plain copy, two vectors per iteration.
    for (int y = 0 ; y < dy ; y++) {
        const uint8_t *src = srcY + y * stride_Y;
        uint8_t *dst = dstY + y * stride_dstY;

        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, src);
                movVqa(_mm1, src + vecSize);
            } else {
                movVqu(_mm0, src);
                movVqu(_mm1, src + vecSize);
            }
            src += blockSize;

            if (dst_aligned) {
                _mm::movntVq(dst,           _mm0);
                _mm::movntVq(dst + vecSize, _mm1);
            } else {
                movVqu(dst,           _mm0);
                movVqu(dst + vecSize, _mm1);
            }
            dst += blockSize;
        } while (--x);
    }

    // Chroma: one vector of Cb and one of Cr become two vectors of Cb/Cr pairs.
    const int dyCbCr = dy / 2;
    for (int y = 0 ; y < dyCbCr ; y++) {
        const uint8_t *Cb = srcCb + y * stride_CbCr;
        const uint8_t *Cr = srcCr + y * stride_CbCr;
        uint8_t *dst = dstCbCr + y * stride_dstCbCr;

        int x = xCount;
        do {
            if (src_aligned) {
                movVqa(_mm0, Cb);
                movVqa(_mm1, Cr);
            } else {
                movVqu(_mm0, Cb);
                movVqu(_mm1, Cr);
            }
            Cb += vecSize;
            Cr += vecSize;

            _mm2 = _mm0;
            punpcklbw(_mm0, _mm1);  // low half:  Cb,Cr pairs
            punpckhbw(_mm2, _mm1);  // high half: Cb,Cr pairs

            if (dst_aligned) {
                _mm::movntVq(dst,           _mm0);
                _mm::movntVq(dst + vecSize, _mm2);
            } else {
                movVqu(dst,           _mm0);
                movVqu(dst + vecSize, _mm2);
            }
            dst += blockSize;
        } while (--x);
    }

    // Width is not a multiple of the block size: redo the last block of every
    // row, unaligned. (xCount >= 1 here, so this condition already implies
    // dx > blockSize.)
    if (xCount * blockSize < dx) {
        const int dxDone = dx - blockSize;
        const int chromaDone = dxDone / 2;
        srcY  += dxDone;
        srcCb += chromaDone;
        srcCr += chromaDone;
        dstY  += dxDone;
        // Two destination bytes per chroma sample. Using chromaDone * 2
        // instead of dxDone keeps the Cb/Cr pairs on an even byte offset when
        // dxDone is odd; for even dxDone the value is the same.
        dstCbCr += chromaDone * 2;
        convert_YV12toNV12<_mm, 0, 0>(srcY, srcCb, srcCr, dstY, dstCbCr, blockSize, dy, stride_Y, stride_CbCr, stride_dstY, stride_dstCbCr);
    }

    _mm::empty();
}
/*
 * Test driver for the emulated MMX routines.
 *
 * 1. Prints the values returned by calc_cpu_speed() and calc_cpu_ofs().
 * 2. Runs a fixed series of operations (paddd, paddw, psllw, packuswb,
 *    punpck*, pmaddwd, packsswb) on the two operands ma and mb, which are
 *    passed by address, calling mmx_regdump() before and after each one.
 * 3. Repeats a timing loop 0x0F times; each pass brackets three code
 *    sections with GET_TSC(tsc1) / GET_TSC(tsc2) and calls tick_dump(co).
 *
 * NOTE(review): tsc1/tsc2 are not declared in this function and GET_TSC,
 * tick_dump and mmx_regdump are defined elsewhere. Presumably tick_dump
 * reports the tsc1..tsc2 interval corrected by the offset co - confirm.
 */
int main()
{
    int rval, i, co, tmp; //, j;
    mmx_t ma, mb; //, mm0, mm1, *pm0, *pm1;

    /* Calibration values; co is handed to every tick_dump() call below. */
    tmp = calc_cpu_speed();
    printf(" Calculating CPU-Speed....running at %d MHz\n", tmp);
    printf(" Calculating CPU-OFFSET...found : ");
    co = calc_cpu_ofs();
    printf(" %d\n",co);

    /* rval is hard-wired to 1 (the mmx_ok() call is disabled), so the test
       block below always runs. */
    rval = 1; //mmx_ok();
    /* Announce return value of mmx_ok() */
    printf(" *** Bogus message since we are emulating so MMX does allways exist ***\n");
    printf(" Value returned from init was %x.", rval);
    printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");

    if(rval) {
        /* PADD *****************************************************/
        ma.q = 0x1111111180000000LL;
        mb.q = 0x7fffffff00000001LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        paddd( &ma, &mb);
        printf("paddd: mb.q is %016llx\n", mb.q);
        mmx_regdump(mb);
        /* paddw runs on the operands left behind by paddd above. */
        paddw( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* psllw */
        ma.q = 0x0000000000000008LL;
        mb.q = 0x0001000200030004LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        psllw( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* These operands are only dumped; the operation on them is disabled. */
        ma.q = 0x0000000000000000LL;
        mb.q = 0x888044a87f06fe80LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        // packuswb( &ma,&mb);

        /* packuswb */
        ma.q = 0x00aa00dd01009999LL;
        mb.q = 0x0011002200330044LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        packuswb( &ma, &mb);
        // punpckhdq(&ma,&mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* punpckhdq */
        ma.q = 0x090a0b0c0d0e0f00LL;
        mb.q = 0x0102030405060708LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        punpckhdq( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* punpckhwd */
        ma.q = 0x090a0b0c0d0e0f00LL;
        mb.q = 0x0102030405060708LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        punpckhwd( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* punpckhbw */
        ma.q = 0x090a0b0c0d0e0f00LL;
        mb.q = 0x0102030405060708LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        punpckhbw( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* punpckldq */
        ma.q = 0x090a0b0c0d0e0f00LL;
        mb.q = 0x0102030405060708LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        punpckldq( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* punpcklwd */
        ma.q = 0x090a0b0c0d0e0f00LL;
        mb.q = 0x0102030405060708LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        punpcklwd( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* punpcklbw */
        ma.q = 0x090a0b0c0d0e0f00LL;
        mb.q = 0x0102030405060708LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        punpcklbw( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* pmaddwd */
        ma.q = 0x8000800080008000LL;
        mb.q = 0x8000800080008000LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        pmaddwd( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* packsswb */
        ma.q = 0x00aa00dd01009999LL;
        mb.q = 0x0011002200330044LL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        packsswb( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* packsswb */
        ma.q = 0x007e7f00ef9dff88LL;
        mb.q = 0xff020085007e81cfLL;
        mmx_regdump(ma);
        mmx_regdump(mb);
        packsswb( &ma, &mb);
        mmx_regdump(ma);
        mmx_regdump(mb);

        /* Timing loop: 0x0F passes, three bracketed sections per pass. */
        for (i=0; i< 0x0F; i++){
            printf ("%d. Test running\n", i);

            /* Section 1: two plain stores into the operands. */
            GET_TSC(tsc1);
            mb.w[0] = i;
            ma.w[0] = i;
            GET_TSC(tsc2);
            tick_dump(co);

            /* Section 2: a short sequence of inline assembly instructions. */
            GET_TSC(tsc1);
            /* Let's test some real asm */
            asm(" // pushl %eax\n // movl $0, %eax\n // cpuid\n packsswb %mm0, %mm1\n pmaddwd %mm0, %mm1\n punpcklbw %mm0, %mm1\n punpcklbw %mm0, %mm1\n pmaddwd %mm0, %mm1\n punpcklbw %mm0, %mm1\n // popl %eax\n ");
            GET_TSC(tsc2);
            tick_dump(co);

            /* Section 3: one call of the emulated pmaddwd routine; the
               operands are loaded before the first timestamp is taken. */
            ma.q = 0x8000800080008000LL;
            mb.q = 0x8000800080008000LL;
            GET_TSC(tsc1);
            /* Here some emulated routines */
            pmaddwd( &ma, &mb);
            // packsswb( &ma, &mb);
            GET_TSC(tsc2);
            tick_dump(co);
            // mmx_regdump(mb);
        }
    }

    exit(0); /* Clean-up and exit nicely */
}