static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        ref += stride;
        movq_r2m (mm0, *dest);
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
/* 16-wide "avg" motion compensation: four-point (half-pel in x and y)
   interpolation with exact rounding, then averaged with the prediction
   already in dest.  The pxor/por/pand/mask_one sequence removes the bias
   from cascaded pavg rounding (mask_one is presumably 0x01 in each byte;
   see the scalar sketch below). */
static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);
        movq_r2m (mm0, *dest);

        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*(dest+8), mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}
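/*
 * Reference sketch (not from the original source) of the rounding
 * correction used by MC_avg4_16 above.  pavg rounds up, so nesting two
 * pavgs can overshoot the exact (a+b+c+d+2)/4 by one; the xor/or/and
 * sequence computes that bias bit and psubusb removes it.
 */
static inline unsigned avg2_round_up (unsigned a, unsigned b)
{
    return (a + b + 1) >> 1;            /* what pavg computes per byte lane */
}

static inline unsigned avg4_exact (unsigned a, unsigned b, unsigned c, unsigned d)
{
    unsigned e = avg2_round_up (a, b);
    unsigned f = avg2_round_up (c, d);
    /* bias is 1 exactly when avg2_round_up(e, f) is one larger than
       (a + b + c + d + 2) / 4 */
    unsigned bias = ((a ^ b) | (c ^ d)) & (e ^ f) & 1;
    return avg2_round_up (e, f) - bias;
}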
/* Second half of the mmx_start()/mmx_end() pair used by
   line_filter_mmx_fast() below; it consumes intermediate results left in
   mm0..mm7 by mmx_start(). */
static inline void mmx_end(uint8_t *src3, uint8_t *src5, uint8_t *dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);
    punpckhbw_m2r (mm_cpool[0], mm5);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    psrlw_i2r (3, mm0);
    psrlw_i2r (3, mm1);
    psubw_r2r (mm6, mm4);
    psubw_r2r (mm7, mm5);
    packuswb_r2r (mm1, mm0);
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    pcmpgtw_m2r (mm_lthr, mm4);
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);
    movq_r2r (mm4, mm5);
    pandn_r2r (mm0, mm4);
    pand_m2r (src3[X], mm5);
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
int main(int ac, char **av)
{
    int i, j, k, n;
    unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 };
    long long *datp = (long long *)&dat0;
    int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 };
    volatile uint8_t *rfp = dat0;
    volatile int16_t *bp = dat1;
    unsigned char ans1[8], ans2[8];

    n = 0;
    for( i=-32768; i<32768; ++i ) {
        j = 0;
        while( j < 256 ) {
            for( k=0; k<8; ++k ) {
                dat0[k] = i;
                dat1[k] = j++;
            }

            movq_m2r(m_(&rfp[0]),mm1);  /* rfp[0..7] */
            pxor_r2r(mm3,mm3);
            pxor_r2r(mm4,mm4);
            movq_m2r(m_(&bp[0]),mm5);   /* bp[0..3] */
            movq_r2r(mm1,mm2);
            movq_m2r(m_(&bp[4]),mm6);   /* bp[4..7] */
            punpcklbw_r2r(mm3,mm1);     /* rfp[0..3] as words */
            punpckhbw_r2r(mm3,mm2);     /* rfp[4..7] as words */
            paddsw_r2r(mm5,mm1);        /* bp[0..3] */
            paddsw_r2r(mm6,mm2);        /* bp[4..7] */
            pcmpgtw_r2r(mm1,mm3);       /* clamp negatives to 0 */
            pcmpgtw_r2r(mm2,mm4);
            pandn_r2r(mm1,mm3);
            pandn_r2r(mm2,mm4);
            packuswb_r2r(mm4,mm3);      /* pack with unsigned saturation */
            movq_r2m(mm3,m_(&ans1[0]));
            emms();

            ans2[0] = clip(bp[0] + rfp[0]);
            ans2[1] = clip(bp[1] + rfp[1]);
            ans2[2] = clip(bp[2] + rfp[2]);
            ans2[3] = clip(bp[3] + rfp[3]);
            ans2[4] = clip(bp[4] + rfp[4]);
            ans2[5] = clip(bp[5] + rfp[5]);
            ans2[6] = clip(bp[6] + rfp[6]);
            ans2[7] = clip(bp[7] + rfp[7]);

            if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] ) {
                printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", i,
                       ans1[0], ans1[1], ans1[2], ans1[3],
                       ans1[4], ans1[5], ans1[6], ans1[7]);
                printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", j,
                       ans2[0], ans2[1], ans2[2], ans2[3],
                       ans2[4], ans2[5], ans2[6], ans2[7]);
                // exit(0);
            }
            n += 8;
        }
    }
    printf("n=%d\n",n);
    return 0;
}
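/*
 * main() above checks the MMX add-and-clip against a scalar clip() that
 * is defined elsewhere in the original file.  A minimal sketch with the
 * usual semantics (saturate an int to [0, 255], matching the
 * pcmpgtw/pandn clamp and packuswb saturation in the MMX path):
 */
static inline unsigned char clip (int x)
{
    return (unsigned char)(x < 0 ? 0 : (x > 255 ? 255 : x));
}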
static inline void mmx_interp_average_2_U8 (uint8_t * dest, const uint8_t * src1,
                                            const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);      /* load 8 dest bytes */
    movq_r2r (mm1, mm2);        /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);      /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);      /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);        /* xor src1 and src2 */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm4, mm6);         /* or src1 and src2 */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2r (mm6, mm5);        /* copy subresult */

    pxor_r2r (mm1, mm5);        /* xor srcavg and dest */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm2, mm6);         /* or srcavg and dest */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2m (mm6, *dest);      /* store result in dest */
}
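/*
 * Scalar sketch of the byte-wise identity behind the two
 * xor/and/shift/or/subtract sequences in mmx_interp_average_2_U8 above:
 *     (a + b + 1) / 2  ==  (a | b) - ((a ^ b) >> 1)
 * mask1 (presumably 0xfe repeated in every byte) clears the bits that
 * the 64-bit psrlq would otherwise shift across byte boundaries.
 */
static inline unsigned char avg_round_up_u8 (unsigned char a, unsigned char b)
{
    return (unsigned char)((a | b) - ((a ^ b) >> 1));
}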
static inline void mmx_average_2_U8 (uint8_t * dest, uint8_t * src1, uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes
    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes
    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows to mm1
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);         // /2

    paddw_r2r (mm4, mm2);       // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);         // /2

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
        movq_m2r (* ref, mm1);          /* load 8 ref bytes */
        movq_r2m (mm1, * dest);         /* store 8 bytes at curr */

        if (width == 16) {
            movq_m2r (* (ref+8), mm1);  /* load 8 ref bytes */
            movq_r2m (mm1, * (dest+8)); /* store 8 bytes at curr */
        }

        dest += stride;
        ref += stride;
    } while (--height);
}
static inline void mean8(unsigned char *refpix, unsigned char *pixel,
                         int radius_count, int row_stride, int threshold,
                         int8_t *diff, unsigned char *count)
{
    int a, b;

    pxor_r2r(mm6, mm6);         // mm6 (aka count) = 0
    pxor_r2r(mm7, mm7);         // mm7 (aka diff) = 0

    movq_m2r(*refpix, mm3);     // mm3 = refpix[0]

    movd_g2r(0x80808080, mm4);  // mm4 = 128
    punpcklbw_r2r(mm4, mm4);

    pxor_r2r(mm4, mm3);         // mm3 = refpix[0]-128

    movd_g2r(threshold, mm5);   // mm5 = threshold
    punpcklbw_r2r(mm5, mm5);
    punpcklbw_r2r(mm5, mm5);
    punpcklbw_r2r(mm5, mm5);

    for( b=0; b<radius_count; b++ ) {
        for( a=0; a<radius_count; a++ ) {
            movq_m2r(*pixel, mm0);  // mm0 = pixel[0]
            pxor_r2r(mm4, mm0);     // mm0 = pixel[0]-128
            movq_r2r(mm3, mm2);     // mm2 = refpix[0]-128
            psubsb_r2r(mm0, mm2);   // mm2 = refpix[0]-pixel[0]
            psubsb_r2r(mm3, mm0);   // mm0 = pixel[0]-refpix[0]
            pminub_r2r(mm0, mm2);   // mm2 = abs(pixel[0]-refpix[0])
            movq_r2r(mm5, mm1);     // mm1 = threshold
            pcmpgtb_r2r(mm2, mm1);  // mm1 = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
            psubb_r2r(mm1, mm6);    // mm6 += (threshold > abs(pixel[0]-refpix[0]))
            pand_r2r(mm1, mm0);     // mm0 = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            paddb_r2r(mm0, mm7);    // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            ++pixel;
        }
        pixel += row_stride - radius_count;
    }
    movq_r2m(mm6, *count);
    movq_r2m(mm7, *diff);
    emms();
}
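/*
 * Plain-C reference (a sketch, not the original) for mean8() above:
 * each of the 8 byte lanes counts the neighbours within `threshold` of
 * its reference pixel and accumulates their signed differences.  The
 * MMX version computes abs() with saturating byte arithmetic and
 * accumulates diff with wrapping paddb, so extreme inputs can differ.
 */
static void mean8_ref(const unsigned char *refpix, const unsigned char *pixel,
                      int radius_count, int row_stride, int threshold,
                      signed char *diff, unsigned char *count)
{
    int a, b, k;
    for (k = 0; k < 8; ++k) {
        diff[k] = 0;
        count[k] = 0;
    }
    for (b = 0; b < radius_count; b++) {
        for (a = 0; a < radius_count; a++) {
            for (k = 0; k < 8; ++k) {       /* the 8 parallel byte lanes */
                int d = pixel[k] - refpix[k];
                if (threshold > (d < 0 ? -d : d)) {
                    count[k] += 1;          /* mm6 lane */
                    diff[k] += d;           /* mm7 lane */
                }
            }
            ++pixel;
        }
        pixel += row_stride - radius_count;
    }
}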
static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref, int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_r2m (mm0, *dest);
        ref += stride;
        dest += stride;
    } while (--height);
}
static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int y;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 ) {
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );
        dst += i_dst;

        movq_m2r( src[2*i_src], mm1 );
        pavgb_r2r( mm1, mm0 );
        movq_r2m( mm0, dst[0] );

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
                                     const uint8_t * src2, const uint8_t * src3,
                                     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);       /* add lows */
    paddw_r2r (mm6, mm2);       /* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         /* /4 */

    packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
    movq_r2m (mm1, *dest);      /* store result in dest */
}
static void line_filter_mmx_fast(uint8_t *dst, int width, int start_width,
                                 uint8_t *buf, uint8_t *src2, uint8_t *src3,
                                 uint8_t *src4, uint8_t *src5)
{
    int X;

    for (X = start_width; X < width - 7; X += 8) {
        mmx_start(buf, src2, src3, src4, X);
        movq_r2m (mm4, buf[X]);
        mmx_end(src3, src5, dst, X);
    }
    line_filter_c_fast(dst, width, X, buf, src2, src3, src4, src5);
}
static void fast_memcpy_mmx( void *d, const void *s, size_t n )
{
    const uint8_t *src = s;
    uint8_t *dest = d;

    if( dest != src ) {
        while( n > 64 ) {
            movq_m2r( src[ 0 ], mm0 );
            movq_m2r( src[ 8 ], mm1 );
            movq_m2r( src[ 16 ], mm2 );
            movq_m2r( src[ 24 ], mm3 );
            movq_m2r( src[ 32 ], mm4 );
            movq_m2r( src[ 40 ], mm5 );
            movq_m2r( src[ 48 ], mm6 );
            movq_m2r( src[ 56 ], mm7 );
            movq_r2m( mm0, dest[ 0 ] );
            movq_r2m( mm1, dest[ 8 ] );
            movq_r2m( mm2, dest[ 16 ] );
            movq_r2m( mm3, dest[ 24 ] );
            movq_r2m( mm4, dest[ 32 ] );
            movq_r2m( mm5, dest[ 40 ] );
            movq_r2m( mm6, dest[ 48 ] );
            movq_r2m( mm7, dest[ 56 ] );
            dest += 64;
            src += 64;
            n -= 64;
        }
        while( n > 8 ) {
            movq_m2r( src[ 0 ], mm0 );
            movq_r2m( mm0, dest[ 0 ] );
            dest += 8;
            src += 8;
            n -= 8;
        }
        if( n )
            small_memcpy( dest, src, n );
        emms();
    }
}
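/*
 * fast_memcpy_mmx() hands the final 1..8 bytes to an external
 * small_memcpy() whose definition is not in this file.  A minimal
 * sketch with the obvious semantics would be:
 */
static inline void small_memcpy( void *d, const void *s, size_t n )
{
    uint8_t *dest = d;
    const uint8_t *src = s;
    while( n-- )
        *dest++ = *src++;
}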
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);
        pavg_r2r (mm3, mm2);

        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);

        pand_m2r (mask_one, mm7);
        psubusb_r2r (mm7, mm0);

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        movq_r2r (mm6, mm7);    /* unroll ! */
        movq_r2r (mm2, mm0);    /* unroll ! */
    } while (--height);
}
static void frame_f2i_sse(float *src, u_char *dst, int l)
{
    int i;

    // put 128 in all 4 words of mm7
    movd_g2r(128, mm7);
    punpcklwd_r2r(mm7, mm7);
    punpckldq_r2r(mm7, mm7);

    // put 128 in all 8 bytes of mm6
    movd_g2r(128, mm6);
    punpcklbw_r2r(mm6, mm6);
    punpcklwd_r2r(mm6, mm6);
    punpckldq_r2r(mm6, mm6);

    for( i=0; i<l; i+=8 ) {
        movaps_m2r(src[0], xmm0);
        movaps_m2r(src[4], xmm2);
        movhlps_r2r(xmm0, xmm1);
        cvtps2pi_r2r(xmm0, mm0);
        cvtps2pi_r2r(xmm1, mm1);
        movhlps_r2r(xmm2, xmm3);
        cvtps2pi_r2r(xmm2, mm2);
        cvtps2pi_r2r(xmm3, mm3);
        packssdw_r2r(mm1, mm0);
        packssdw_r2r(mm3, mm2);
        psubw_r2r(mm7, mm0);
        psubw_r2r(mm7, mm2);
        packsswb_r2r(mm2, mm0);
        paddb_r2r(mm6, mm0);
        movq_r2m(mm0, dst[0]);
        src += 8;
        dst += 8;
    }
    emms();
}
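/*
 * Scalar reference (a sketch) for frame_f2i_sse() above.  MMX has no
 * unsigned-saturating pack from words to bytes, so the SIMD version
 * subtracts 128, packs with signed saturation, and adds 128 back; the
 * net effect per element is round-to-nearest then clamp to [0, 255].
 * Assumes <math.h> for lrintf().
 */
static void frame_f2i_ref(const float *src, unsigned char *dst, int l)
{
    int i;
    for( i=0; i<l; i++ ) {
        long v = lrintf(src[i]);    /* cvtps2pi rounds to nearest by default */
        dst[i] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
}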
static void deinterlace_scanline_linear_mmx (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;

  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);
  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();
  size = size & 0x7;    /* bytes left after the 8-byte chunks */

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
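/*
 * The shiftmask trick used above (and in the interpolate/deinterlace
 * variants below) averages two packed-4:2:2 scanlines one 16-bit word
 * at a time: masking with 0xfeff before psrlw keeps the chroma byte's
 * low bit from shifting into the luma byte.  Per byte it reduces to a
 * floor average, which drops the carry bit the scalar tail keeps:
 */
static inline unsigned char avg_floor_u8 (unsigned char a, unsigned char b)
{
    return (unsigned char)((a >> 1) + (b >> 1));    /* vs ((a + b) >> 1) in the tail */
}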
static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int width,
                                                       uint8_t *m, uint8_t *t, uint8_t *b )
{
    int i;
    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
    const mmx_t cmask = { 0xff00ff00ff00ff00ULL };

    // Get width in bytes.
    width *= 2;
    i = width / 8;
    width -= i * 8;

    movq_m2r( ymask, mm7 );
    movq_m2r( cmask, mm6 );

    while( i-- ) {
        movq_m2r( *t, mm0 );
        movq_m2r( *b, mm1 );
        movq_m2r( *m, mm2 );

        movq_r2r ( mm2, mm3 );
        pand_r2r ( mm7, mm3 );      // keep the luma of m untouched

        pand_r2r ( mm6, mm0 );
        pand_r2r ( mm6, mm1 );
        pand_r2r ( mm6, mm2 );

        psrlq_i2r( 8, mm0 );
        psrlq_i2r( 7, mm1 );        // shift one bit less: leaves 2*b for the 3:3:2 weighting
        psrlq_i2r( 8, mm2 );

        movq_r2r ( mm0, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm0 );      // mm0 = 3*t

        movq_r2r ( mm2, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm2 );      // mm2 = 3*m

        paddw_r2r( mm0, mm2 );
        paddw_r2r( mm1, mm2 );      // mm2 = 3*t + 3*m + 2*b

        psllw_i2r( 5, mm2 );        // (x >> 3) << 8 == x << 5: result lands in the chroma byte
        pand_r2r( mm6, mm2 );

        por_r2r ( mm3, mm2 );
        movq_r2m( mm2, *output );

        output += 8;
        t += 8;
        b += 8;
        m += 8;
    }

    output++; t++; b++; m++;
    while( width-- ) {
        *output = (3 * *t + 3 * *m + 2 * *b) >> 3;
        output += 2; t += 2; b += 2; m += 2;
    }
    emms();
}
static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
                                                uint8_t *bot, int width )
{
    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
    int i;

    for( i = width/16; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        movq_m2r( *(bot + 8), mm2 );
        movq_m2r( *(top + 8), mm3 );
        movq_m2r( *(bot + 16), mm4 );
        movq_m2r( *(top + 16), mm5 );
        movq_m2r( *(bot + 24), mm6 );
        movq_m2r( *(top + 24), mm7 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        pand_m2r( shiftmask, mm2 );
        pand_m2r( shiftmask, mm3 );
        pand_m2r( shiftmask, mm4 );
        pand_m2r( shiftmask, mm5 );
        pand_m2r( shiftmask, mm6 );
        pand_m2r( shiftmask, mm7 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        psrlw_i2r( 1, mm2 );
        psrlw_i2r( 1, mm3 );
        psrlw_i2r( 1, mm4 );
        psrlw_i2r( 1, mm5 );
        psrlw_i2r( 1, mm6 );
        psrlw_i2r( 1, mm7 );
        paddb_r2r( mm1, mm0 );
        paddb_r2r( mm3, mm2 );
        paddb_r2r( mm5, mm4 );
        paddb_r2r( mm7, mm6 );
        movq_r2m( mm0, *output );
        movq_r2m( mm2, *(output + 8) );
        movq_r2m( mm4, *(output + 16) );
        movq_r2m( mm6, *(output + 24) );
        output += 32;
        top += 32;
        bot += 32;
    }
    width = (width & 0xf);
    for( i = width/4; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        paddb_r2r( mm1, mm0 );
        movq_r2m( mm0, *output );
        output += 8;
        top += 8;
        bot += 8;
    }
    width = width & 0x3;    /* pixels left after the 4-pixel chunks */

    /* Handle last few pixels. */
    for( i = width * 2; i; --i ) {
        *output++ = ((*top++) + (*bot++)) >> 1;
    }
    emms();
}
static void deinterlace_scanline_linear_mmx (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);
  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  width = width & 0x3;  /* pixels left after the 4-pixel chunks */

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
  emms ();
}
static void deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL * self,
    const guint8 * m0, const guint8 * t1, const guint8 * b1, const guint8 * m2,
    guint8 * output, gint width)
{
  mmx_t MaxComb;

  // How badly do we let it weave? 0-255
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  // L2 == m0
  // L1 == t1
  // L3 == b1
  // LP2 == m2
  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3, leave result in mm4
    movq_r2r (mm1, mm4);        // L1
    pavgb_r2r (mm3, mm4);       // (L1 + L3)/2

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2
    por_r2r (mm7, mm5);         // abs(avg-L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg
    psubusb_r2r (mm0, mm4);     // avg - LP2
    por_r2r (mm7, mm4);         // abs(avg-LP2)

    // use L2 or LP2 depending upon which makes smaller comb
    psubusb_r2r (mm5, mm4);     // see if it goes to zero
    pxor_r2r (mm5, mm5);        // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4

    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 == ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now lets clip our chosen value to be not outside of the range
    // of the high/low range L1-L3 by more than abs(L1-L3)
    // This allows some comb but limits the damages and also allows more
    // detail than a boring oversmoothed clip.
    movq_r2r (mm1, mm2);        // copy L1
    pmaxub_r2r (mm3, mm2);      // now = Max(L1,L3)
    pminub_r2r (mm1, mm3);      // now = Min(L1,L3)

    // allow the value to be above the high or below the low by amt of MaxComb
    paddusb_r2r (mm6, mm2);     // increase max by diff
    psubusb_r2r (mm6, mm3);     // lower min by diff

    pmaxub_r2r (mm3, mm4);      // now = Max(best, Min(L1,L3))
    pminub_r2r (mm4, mm2);      // now = Min(Max(best, Min(L1,L3)), max) = best, clipped
    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  emms ();
  if (width > 0)
    deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
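/*
 * One byte lane of the greedy kernel above, as a scalar sketch: pick
 * whichever of L2/LP2 combs less against avg(L1, L3), then clamp the
 * winner to [min(L1,L3) - MaxComb, max(L1,L3) + MaxComb].
 */
static inline unsigned char greedy_pixel (unsigned char l1, unsigned char l2,
    unsigned char l3, unsigned char lp2, unsigned char max_comb)
{
  int avg = (l1 + l3 + 1) >> 1;                     /* pavgb */
  int comb_l2 = l2 > avg ? l2 - avg : avg - l2;     /* abs(avg-L2) */
  int comb_lp2 = lp2 > avg ? lp2 - avg : avg - lp2; /* abs(avg-LP2) */
  int best = (comb_lp2 <= comb_l2) ? lp2 : l2;
  int hi = (l1 > l3 ? l1 : l3) + max_comb;          /* paddusb */
  int lo = (l1 < l3 ? l1 : l3) - max_comb;          /* psubusb */

  if (hi > 255)
    hi = 255;
  if (lo < 0)
    lo = 0;
  if (best > hi)
    best = hi;
  if (best < lo)
    best = lo;
  return (unsigned char) best;
}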
static inline void mmx_interp_average_4_U8 (uint8_t * dest, const uint8_t * src1,
                                            const uint8_t * src2, const uint8_t * src3,
                                            const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);       /* add lows */
    paddw_r2r (mm6, mm2);       /* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         /* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);      /* load 8 dest bytes */
    movq_r2r (mm3, mm4);        /* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
    movq_r2r (mm1, mm2);        /* copy subresult */

    pxor_r2r (mm1, mm3);        /* xor srcavg and dest */
    pand_m2r (mask1, mm3);      /* mask lower bits */
    psrlq_i2r (1, mm3);         /* /2 */
    por_r2r (mm2, mm4);         /* or srcavg and dest */
    psubb_r2r (mm3, mm4);       /* subtract subresults */
    movq_r2m (mm4, *dest);      /* store result in dest */
}
int field_dct_best_mmx( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb )
{
    /*
     * calculate prediction error (cur-pred) for top (blk0)
     * and bottom field (blk1)
     */
    double r, d;
    int rowoffs = 0;
    int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
    int j;
    int topvar, botvar;
    mmx_t sumtop_accs, sumbot_accs;
    mmx_t sumsqtop_accs, sumsqbot_accs, sumxprod_accs;

    pxor_r2r(mm0,mm0);
    movq_r2m( mm0, sumtop_accs );
    movq_r2m( mm0, sumbot_accs );
    movq_r2m( mm0, sumsqtop_accs );
    movq_r2m( mm0, sumsqbot_accs );
    movq_r2m( mm0, sumxprod_accs );

    sumtop = sumsqtop = sumbot = sumsqbot = sumbottop = 0;

    for (j=0; j<8; j++) {
#ifdef ORIGINAL_CODE
        for (i=0; i<16; i++) {
            register int toppix = cur_lum_mb[rowoffs+i] - pred_lum_mb[rowoffs+i];
            register int botpix = cur_lum_mb[rowoffs+width+i] - pred_lum_mb[rowoffs+width+i];
            sumtop += toppix;
            sumsqtop += toppix*toppix;
            sumbot += botpix;
            sumsqbot += botpix*botpix;
            sumbottop += toppix*botpix;
        }
#endif
        sum_sumsq_8bytes( &cur_lum_mb[rowoffs], &pred_lum_mb[rowoffs],
                          &sumtop_accs, &sumbot_accs,
                          &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs );
        sum_sumsq_8bytes( &cur_lum_mb[rowoffs+8], &pred_lum_mb[rowoffs+8],
                          &sumtop_accs, &sumbot_accs,
                          &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs );
        rowoffs += (opt->phy_width<<1);
    }

    mmx_sum_4_word_accs( &sumtop_accs, &sumtop );
    mmx_sum_4_word_accs( &sumbot_accs, &sumbot );
    emms();
    sumsqtop = sumsqtop_accs.d[0] + sumsqtop_accs.d[1];
    sumsqbot = sumsqbot_accs.d[0] + sumsqbot_accs.d[1];
    sumbottop = sumxprod_accs.d[0] + sumxprod_accs.d[1];

    /* Calculate variances for the top and bottom fields.  If they're of
       similar sign, estimate the correlation; if it's good, use frame
       DCT, otherwise use field DCT. */
    topvar = sumsqtop - sumtop*sumtop/128;
    botvar = sumsqbot - sumbot*sumbot/128;
    if ( !((topvar <= 0) ^ (botvar <= 0)) ) {
        d = ((double) topvar) * ((double) botvar);
        r = (sumbottop - (sumtop*sumbot)/128);
        if (r > 0.5*sqrt(d))
            return 0;   /* frame DCT */
        else
            return 1;   /* field DCT */
    }
    else
        return 1;       /* field DCT */
}
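/*
 * The decision rule at the end of field_dct_best_mmx(), restated as a
 * scalar helper (a sketch, not from the original): with 128 samples per
 * field (16x8), var = E[x^2] - E[x]^2 scales to sumsq - sum*sum/128,
 * and frame DCT is chosen only when the top/bottom difference signals
 * are strongly correlated.  Assumes <math.h> for sqrt().
 */
static int field_dct_decision( int sumtop, int sumbot,
                               int sumsqtop, int sumsqbot, int sumbottop )
{
    double topvar = sumsqtop - (double)sumtop*sumtop/128.0;
    double botvar = sumsqbot - (double)sumbot*sumbot/128.0;
    double r = sumbottop - (double)sumtop*sumbot/128.0;

    if ( ((topvar <= 0) == (botvar <= 0)) && r > 0.5*sqrt(topvar*botvar) )
        return 0;   /* frame DCT */
    return 1;       /* field DCT */
}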
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc, int width, int height)
{
    int Line;
    long long* YVal1;
    long long* YVal2;
    long long* YVal3;
    long long* Dest;
    uint8_t* pEvenLines = psrc;
    uint8_t* pOddLines = psrc+width;
    int LineLength = width;
    int Pitch = width * 2;
    int IsOdd = 1;
    long EdgeDetect = 625;
    long JaggieThreshold = 73;
    int n;
    unsigned long long qwEdgeDetect;
    unsigned long long qwThreshold;
    const unsigned long long Mask = 0xfefefefefefefefeULL;
    const unsigned long long YMask = 0x00ff00ff00ff00ffULL;

    qwEdgeDetect = EdgeDetect;
    qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
    qwThreshold = JaggieThreshold;
    qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);

    // copy first even line no matter what, and the first odd line if we're
    // processing an odd field.
    ac_memcpy(pdst, pEvenLines, LineLength);
    if (IsOdd)
        ac_memcpy(pdst + LineLength, pOddLines, LineLength);

    height = height / 2;
    for (Line = 0; Line < height - 1; ++Line) {
        if (IsOdd) {
            YVal1 = (long long *)(pOddLines + Line * Pitch);
            YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch);
            YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch);
            Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength);
        } else {
            YVal1 = (long long *)(pEvenLines + Line * Pitch);
            YVal2 = (long long *)(pOddLines + Line * Pitch);
            YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch);
            Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength);
        }

        // For ease of reading, the comments below assume that we're operating on an odd
        // field (i.e., that bIsOdd is true). The exact same processing is done when we
        // operate on an even field, but the roles of the odd and even fields are reversed.
        // It's just too cumbersome to explain the algorithm in terms of "the next odd
        // line if we're doing an odd field, or the next even line if we're doing an
        // even field" etc. So wherever you see "odd" or "even" below, keep in mind that
        // half the time this function is called, those words' meanings will invert.

        // Copy the odd line to the overlay verbatim.
        ac_memcpy((char *)Dest + LineLength, YVal3, LineLength);

        n = LineLength >> 3;
        while( n-- ) {
            movq_m2r (*YVal1++, mm0);
            movq_m2r (*YVal2++, mm1);
            movq_m2r (*YVal3++, mm2);

            // get intensities in mm3 - 4
            movq_r2r ( mm0, mm3 );
            movq_r2r ( mm1, mm4 );
            movq_r2r ( mm2, mm5 );
            pand_m2r ( *&YMask, mm3 );
            pand_m2r ( *&YMask, mm4 );
            pand_m2r ( *&YMask, mm5 );

            // get average in mm0
            pand_m2r ( *&Mask, mm0 );
            pand_m2r ( *&Mask, mm2 );
            psrlw_i2r ( 01, mm0 );
            psrlw_i2r ( 01, mm2 );
            paddw_r2r ( mm2, mm0 );

            // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
            // result will be in mm6
            psrlw_i2r ( 01, mm3 );
            psrlw_i2r ( 01, mm4 );
            psrlw_i2r ( 01, mm5 );

            movq_r2r ( mm3, mm6 );
            psubw_r2r ( mm4, mm6 );             // mm6 = O1 - E
            movq_r2r ( mm5, mm7 );
            psubw_r2r ( mm4, mm7 );             // mm7 = O2 - E
            pmullw_r2r ( mm7, mm6 );            // mm6 = (O1 - E) * (O2 - E)
            movq_r2r ( mm3, mm7 );
            psubw_r2r ( mm5, mm7 );             // mm7 = (O1 - O2)
            pmullw_r2r ( mm7, mm7 );            // mm7 = (O1 - O2) ^ 2
            psrlw_i2r ( 12, mm7 );              // mm7 = (O1 - O2) ^ 2 >> 12
            pmullw_m2r ( *&qwEdgeDetect, mm7 ); // mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12
            psubw_r2r ( mm7, mm6 );             // mm6 is what we want

            pcmpgtw_m2r ( *&qwThreshold, mm6 );
            movq_r2r ( mm6, mm7 );
            pand_r2r ( mm6, mm0 );
            pandn_r2r ( mm1, mm7 );
            por_r2r ( mm0, mm7 );
            movq_r2m ( mm7, *Dest++ );
        }
    }

    // Copy last odd line if we're processing an even field.
    if (! IsOdd) {
        ac_memcpy(pdst + (height * 2 - 1) * LineLength,
                  pOddLines + (height - 1) * Pitch, LineLength);
    }

    // clear out the MMX registers ready for doing floating point again
    emms();
}
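/*
 * One luma lane of the bob kernel above, as a scalar sketch (o1/o2 are
 * the odd lines, e the even line between them; all pre-shifted >> 1
 * exactly as in the MMX code).  When the edge metric exceeds the jaggie
 * threshold, e is judged a weave artifact and replaced by the masked-
 * shift average of its vertical neighbours.
 */
static inline unsigned char bob_pixel(unsigned char o1, unsigned char e,
                                      unsigned char o2,
                                      int edge_detect, int threshold)
{
    int t1 = o1 >> 1, t2 = e >> 1, t3 = o2 >> 1;
    int metric = (t1 - t2) * (t3 - t2)
               - edge_detect * (((t1 - t3) * (t1 - t3)) >> 12);

    return (metric > threshold)
        ? (unsigned char)((o1 >> 1) + (o2 >> 1))    // masked-shift average
        : e;                                        // keep the weave
}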
static __inline__ void sum_sumsq_8bytes( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb,
                                         mmx_t *sumtop_accs, mmx_t *sumbot_accs,
                                         mmx_t *sumsqtop_accs, mmx_t *sumsqbot_accs,
                                         mmx_t *sumxprod_accs )
{
    pxor_r2r(mm0,mm0);

    /* Load pixels from top field into mm1.w,mm2.w */
    movq_m2r( *((mmx_t*)cur_lum_mb), mm1 );
    movq_m2r( *((mmx_t*)pred_lum_mb), mm2 );

    /* mm3 := mm1
       mm4 := mm2
       mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
    */
    movq_r2r( mm1, mm3 );
    punpcklbw_r2r( mm0, mm1 );
    movq_r2r( mm2, mm4 );
    punpcklbw_r2r( mm0, mm2 );
    psubw_r2r( mm2, mm1 );

    /* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7] */
    punpckhbw_r2r( mm0, mm3 );
    punpckhbw_r2r( mm0, mm4 );
    psubw_r2r( mm4, mm3 );

    /* sumtop_accs->w[0..3] += mm1.w[0..3];
       sumtop_accs->w[0..3] += mm3.w[0..3];
       mm6 := mm1; mm7 := mm3;
    */
    movq_m2r( *sumtop_accs, mm5 );
    paddw_r2r( mm1, mm5 );
    paddw_r2r( mm3, mm5 );
    movq_r2r( mm1, mm6 );
    movq_r2r( mm3, mm7 );
    movq_r2m( mm5, *sumtop_accs );

    /* *sumsqtop_accs += mm1.w[0..3] * mm1.w[0..3];
       *sumsqtop_accs += mm3.w[0..3] * mm3.w[0..3];
    */
    pmaddwd_r2r( mm1, mm1 );
    movq_m2r( *sumsqtop_accs, mm5 );
    pmaddwd_r2r( mm3, mm3 );
    paddd_r2r( mm1, mm5 );
    paddd_r2r( mm3, mm5 );
    movq_r2m( mm5, *sumsqtop_accs );

    /* Load pixels from bot field into mm1.w,mm2.w */
    movq_m2r( *((mmx_t*)(cur_lum_mb+opt->phy_width)), mm1 );
    movq_m2r( *((mmx_t*)(pred_lum_mb+opt->phy_width)), mm2 );

    /* mm3 := mm1
       mm4 := mm2
       mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
    */
    movq_r2r( mm1, mm3 );
    punpcklbw_r2r( mm0, mm1 );
    movq_r2r( mm2, mm4 );
    punpcklbw_r2r( mm0, mm2 );
    psubw_r2r( mm2, mm1 );

    /* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7] */
    punpckhbw_r2r( mm0, mm3 );
    punpckhbw_r2r( mm0, mm4 );
    psubw_r2r( mm4, mm3 );

    /* sumbot_accs->w[0..3] += mm1.w[0..3];
       sumbot_accs->w[0..3] += mm3.w[0..3];
       mm2 := mm1; mm4 := mm3;
    */
    movq_m2r( *sumbot_accs, mm5 );
    paddw_r2r( mm1, mm5 );
    movq_r2r( mm1, mm2 );
    paddw_r2r( mm3, mm5 );
    movq_r2r( mm3, mm4 );
    movq_r2m( mm5, *sumbot_accs );

    /* *sumsqbot_accs += mm1.w[0..3] * mm1.w[0..3];
       *sumsqbot_accs += mm3.w[0..3] * mm3.w[0..3];
    */
    pmaddwd_r2r( mm1, mm1 );
    movq_m2r( *sumsqbot_accs, mm5 );
    pmaddwd_r2r( mm3, mm3 );
    paddd_r2r( mm1, mm5 );
    paddd_r2r( mm3, mm5 );
    movq_r2m( mm5, *sumsqbot_accs );

    /* Accumulate cross-product of the top-field diffs (mm6, mm7) and the
       bottom-field diffs (mm2, mm4):
       *sumxprod_accs += mm2.w[0..3] * mm6.w[0..3];
       *sumxprod_accs += mm4.w[0..3] * mm7.w[0..3];
    */
    movq_m2r( *sumxprod_accs, mm5 );
    pmaddwd_r2r( mm6, mm2 );
    pmaddwd_r2r( mm7, mm4 );
    paddd_r2r( mm2, mm5 );
    paddd_r2r( mm4, mm5 );
    movq_r2m( mm5, *sumxprod_accs );

    emms();
}