Example #1
static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+8), mm1);
	ref += stride;
	movq_r2m (mm0, *dest);
	movq_r2m (mm1, *(dest+8));
	dest += stride;
    } while (--height);
}
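The movq_m2r/movq_r2m calls used throughout these examples are not functions but inline-assembly wrapper macros from an mmx.h-style header. A minimal sketch of how such wrappers are typically defined, assuming GCC extended asm; the exact header differs from project to project:

/* Sketch of the usual mmx.h-style wrappers (GCC extended asm); the real
 * header in each project may differ in detail. */
#define mmx_m2r(op, mem, reg) \
        __asm__ __volatile__ (#op " %0, %%" #reg : /* no outputs */ : "m" (mem))

#define mmx_r2m(op, reg, mem) \
        __asm__ __volatile__ (#op " %%" #reg ", %0" : "=m" (mem) : /* no inputs */)

#define movq_m2r(var, reg)  mmx_m2r(movq, var, reg)   /* load 8 bytes into an MMX register */
#define movq_r2m(reg, var)  mmx_r2m(movq, reg, var)   /* store an MMX register to memory */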
Example #2
static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+8), mm1);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*(ref+offset+8), mm1);
        movq_r2m (mm0, *dest);
        ref += stride;
        movq_r2m (mm1, *(dest+8));
        dest += stride;
    } while (--height);
}
Example #3
static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+stride+1), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_m2r (*dest, mm1);
	pavg_r2r (mm1, mm0);
	movq_r2m (mm0, *dest);

	movq_m2r (*(ref+8), mm0);
	movq_m2r (*(ref+stride+9), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+9), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride+8), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_m2r (*(dest+8), mm1);
	pavg_r2r (mm1, mm0);
	ref += stride;
	movq_r2m (mm0, *(dest+8));
	dest += stride;
    } while (--height);
}
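Per byte, the intent of MC_avg4_16 is a half-pel interpolation in both directions followed by an average with the prediction already in dest; the mask_one correction compensates for pavg's round-up bias so the cascaded pavg operations match the exact rounded expression. A plain-C sketch of what one byte lane is meant to compute (avg4_ref is a hypothetical name, for illustration only):

/* Hypothetical scalar reference for one byte lane of MC_avg4_16:
 * a = ref[x], b = ref[x+1], c = ref[x+stride], d = ref[x+stride+1]. */
static inline uint8_t avg4_ref(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t dest)
{
    unsigned interp = (a + b + c + d + 2) >> 2;    /* rounded 4-point average */
    return (uint8_t) ((interp + dest + 1) >> 1);   /* average with the existing prediction */
}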
Example #4
static inline void mmx_end(uint8_t *src3, uint8_t *src5,
                           uint8_t *dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);
    punpckhbw_m2r (mm_cpool[0], mm5);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    psrlw_i2r (3, mm0);
    psrlw_i2r (3, mm1);
    psubw_r2r (mm6, mm4);
    psubw_r2r (mm7, mm5);
    packuswb_r2r (mm1,mm0);
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    pcmpgtw_m2r (mm_lthr, mm4);
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);
    movq_r2r (mm4, mm5);
    pandn_r2r (mm0, mm4);
    pand_m2r (src3[X], mm5);
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
Example #5
File: tst.c  Project: Cuchulain/cinelerra
int main(int ac, char **av)
{
   int i, j, k, n;
   unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 };
   long long *datp = (long long *)&dat0;
   int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 };
   volatile uint8_t *rfp = dat0;
   volatile int16_t *bp  = dat1;
   unsigned char ans1[8], ans2[8];

   n = 0;
   for( i=-32768; i<32768; ++i ) {
     j = 0;
     while( j < 256 ) {
        for( k=0; k<8; ++k ) {
          dat0[k] = i;
          dat1[k] = j++;
        }
       movq_m2r(m_(&rfp[0]),mm1);  /* rfp[0..7] */
       pxor_r2r(mm3,mm3);
       pxor_r2r(mm4,mm4);
       movq_m2r(m_(&bp[0]),mm5);   /* bp[0..3] */
       movq_r2r(mm1,mm2);
       movq_m2r(m_(&bp[4]),mm6);   /* bp[4..7] */
       punpcklbw_r2r(mm3,mm1);     /* rfp[0,2,4,6] */
       punpckhbw_r2r(mm3,mm2);     /* rfp[1,3,5,7] */
       paddsw_r2r(mm5,mm1);        /* bp[0..3] */
       paddsw_r2r(mm6,mm2);        /* bp[4..7] */
       pcmpgtw_r2r(mm1,mm3);
       pcmpgtw_r2r(mm2,mm4);
       pandn_r2r(mm1,mm3);
       pandn_r2r(mm2,mm4);
       packuswb_r2r(mm4,mm3);
       movq_r2m(mm3,m_(&ans1[0]));
       emms();

       ans2[0] = clip(bp[0] + rfp[0]);
       ans2[1] = clip(bp[1] + rfp[1]);
       ans2[2] = clip(bp[2] + rfp[2]);
       ans2[3] = clip(bp[3] + rfp[3]);
       ans2[4] = clip(bp[4] + rfp[4]);
       ans2[5] = clip(bp[5] + rfp[5]);
       ans2[6] = clip(bp[6] + rfp[6]);
       ans2[7] = clip(bp[7] + rfp[7]);

       if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] )
       {
         printf(" i=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", i,
           ans1[0], ans1[1], ans1[2], ans1[3], ans1[4], ans1[5], ans1[6], ans1[7]);
         printf(" j=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", j,
           ans2[0], ans2[1], ans2[2], ans2[3], ans2[4], ans2[5], ans2[6], ans2[7]);
       //  exit(0);
       }
       n += 8;
     }
   }

   printf("n=%d\n",n);
   return 0;
}
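The test relies on a clip() helper and an m_() macro that are not shown in the snippet. Plausible definitions, reconstructed from how they are used above (both are assumptions, not taken from the project):

/* Assumed helpers, reconstructed from usage above. */
#define m_(p)  (*(mmx_t *)(p))          /* treat a pointer as a 64-bit MMX memory operand */

static inline unsigned char clip(int v) /* saturate to 0..255, mirroring packuswb */
{
    return (unsigned char) (v < 0 ? 0 : (v > 255 ? 255 : v));
}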
Example #6
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);	/* load 8 dest bytes */
    movq_r2r (mm1, mm2);	/* copy 8 dest bytes */

    movq_m2r (*src1, mm3);	/* load 8 src1 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);	/* load 8 src2 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);	/* xor src1 and src2 */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm4, mm6);		/* or src1 and src2 */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2r (mm6, mm5);	/* copy subresult */

    pxor_r2r (mm1, mm5);	/* xor srcavg and dest */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm2, mm6);		/* or srcavg and dest */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2m (mm6, *dest);	/* store result in dest */
}
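The pxor/pand/psrlq/por/psubb sequence is the classic byte-wise rounding average without widening to 16 bits: (a + b + 1) / 2 == (a | b) - ((a ^ b) >> 1), where mask1 is assumed to hold 0xfefefefefefefefe so the single 64-bit shift cannot carry a bit across byte boundaries. A scalar sketch of the identity used twice above:

/* Scalar form of the trick used above (assuming mask1 == 0xfefefefefefefefe). */
static inline uint8_t avg_ceil_u8(uint8_t a, uint8_t b)
{
    return (uint8_t) ((a | b) - (((a ^ b) & 0xfe) >> 1));  /* == (a + b + 1) / 2 */
}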
Example #7
static inline void mmx_average_2_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows to mm1
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);         // /2

    paddw_r2r (mm4, mm2);       // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);         // /2

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
Example #8
static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
			       const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
	movq_m2r (* ref, mm1);	/* load 8 ref bytes */
	movq_r2m (mm1,* dest);	/* store 8 bytes at curr */

	if (width == 16)
	    {
		movq_m2r (* (ref+8), mm1);	/* load 8 ref bytes */
		movq_r2m (mm1,* (dest+8));	/* store 8 bytes at curr */
	    }

	dest += stride;
	ref += stride;
    } while (--height);
}
Example #9
static inline void mean8(unsigned char *refpix,unsigned char *pixel,int radius_count,int row_stride,int threshold,int8_t *diff,unsigned char *count)
{
    int a,b;

    pxor_r2r(mm6,mm6); // mm6 (aka count) = 0
    pxor_r2r(mm7,mm7); // mm7 (aka diff) = 0
    movq_m2r(*refpix,mm3); // mm3 = refpix[0]

    movd_g2r(0x80808080,mm4); // mm4 = 128
    punpcklbw_r2r(mm4,mm4);

    pxor_r2r(mm4,mm3); // mm3 = refpix[0]-128

    movd_g2r(threshold,mm5); // mm5 = threshold
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);

    for( b=0; b<radius_count; b++ ) {
        for( a=0; a<radius_count; a++ ) {
            movq_m2r(*pixel,mm0); // mm0  = pixel[0]
            pxor_r2r(mm4,mm0);    // mm0  = pixel[0]-128
            movq_r2r(mm3,mm2);    // mm2  = refpix[0]-128
            psubsb_r2r(mm0,mm2);  // mm2  = refpix[0]-pixel[0]
            psubsb_r2r(mm3,mm0);  // mm0  = pixel[0]-refpix[0]
            pminub_r2r(mm0,mm2);  // mm2  = abs(pixel[0]-refpix[0])
            movq_r2r(mm5,mm1);    // mm1  = threshold
            pcmpgtb_r2r(mm2,mm1); // mm1  = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
            psubb_r2r(mm1,mm6);   // mm6 += (threshold > abs(pixel[0]-refpix[0]))
            pand_r2r(mm1,mm0);    // mm0  = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            paddb_r2r(mm0,mm7);   // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0

            ++pixel;
        }
        pixel += row_stride - radius_count;
    }

    movq_r2m(mm6,*count);
    movq_r2m(mm7,*diff);

    emms();
 
}
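Ignoring the saturation of psubsb/paddb at the extremes, each of the eight byte lanes accumulates a count of neighbours within the threshold and the sum of their signed differences. A scalar sketch of one lane (mean8_ref_lane is a hypothetical name):

/* Scalar sketch of one byte lane of mean8 (saturation effects ignored). */
static inline void mean8_ref_lane(const unsigned char *refpix, const unsigned char *pixel,
                                  int radius_count, int row_stride, int threshold,
                                  int8_t *diff, unsigned char *count, int lane)
{
    int a, b;
    int8_t d = 0;
    unsigned char c = 0;

    for (b = 0; b < radius_count; b++) {
        for (a = 0; a < radius_count; a++) {
            int delta = pixel[lane] - refpix[lane];
            if ((delta < 0 ? -delta : delta) < threshold) {
                c++;                    /* one more neighbour inside the threshold */
                d += (int8_t) delta;    /* accumulate its signed difference */
            }
            ++pixel;
        }
        pixel += row_stride - radius_count;
    }
    count[lane] = c;
    diff[lane] = d;
}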
Example #10
static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride)
{
    do {
        movq_m2r (*ref, mm0);
        movq_r2m (mm0, *dest);
        ref += stride;
        dest += stride;
    } while (--height);
}
Example #11
static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	pavg_m2r (*dest, mm0);
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}
Example #12
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int y;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );
        dst += i_dst;

        movq_m2r( src[2*i_src], mm1 );
        pavgb_r2r( mm1, mm0 );

        movq_r2m( mm0, dst[0] );

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
Example #13
static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        pavg_m2r (*(ref+offset), mm0);
        pavg_m2r (*dest, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
Example #14
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
				     const uint8_t * src2,
				     const uint8_t * src3,
				     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2m (mm1, *dest);	/* store result in dest */
}
Example #15
static inline void mmx_average_4_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2,
                                     uint8_t * src3, uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      // load 8 src3 bytes
    movq_r2r (mm3, mm4);        // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src3 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    movq_m2r (*src4, mm5);      // load 8 src4 bytes
    movq_r2r (mm5, mm6);        // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);   // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);   // unpack high src4 bytes

    paddw_r2r (mm5, mm1);       // add lows
    paddw_r2r (mm6, mm2);       // add highs

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         // /4

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
Example #16
static void line_filter_mmx_fast(uint8_t *dst, int width, int start_width,
                                 uint8_t *buf, uint8_t *src2, uint8_t *src3,
                                 uint8_t *src4, uint8_t *src5)
{
    int X;
    for (X = start_width; X < width - 7; X += 8)
    {
        mmx_start(buf, src2, src3, src4, X);
        movq_r2m (mm4, buf[X]);
        mmx_end(src3, src5, dst, X);
    } 

    line_filter_c_fast(dst, width, X, buf, src2, src3, src4, src5);
}
Example #17
static void fast_memcpy_mmx( void *d, const void *s, size_t n )
{
    const uint8_t *src = s;
    uint8_t *dest = d;

    if( dest != src ) {
        while( n > 64 ) {
            movq_m2r( src[ 0 ], mm0 );
            movq_m2r( src[ 8 ], mm1 );
            movq_m2r( src[ 16 ], mm2 );
            movq_m2r( src[ 24 ], mm3 );
            movq_m2r( src[ 32 ], mm4 );
            movq_m2r( src[ 40 ], mm5 );
            movq_m2r( src[ 48 ], mm6 );
            movq_m2r( src[ 56 ], mm7 );
            movq_r2m( mm0, dest[ 0 ] );
            movq_r2m( mm1, dest[ 8 ] );
            movq_r2m( mm2, dest[ 16 ] );
            movq_r2m( mm3, dest[ 24 ] );
            movq_r2m( mm4, dest[ 32 ] );
            movq_r2m( mm5, dest[ 40 ] );
            movq_r2m( mm6, dest[ 48 ] );
            movq_r2m( mm7, dest[ 56 ] );
            dest += 64;
            src += 64;
            n -= 64;
        }

        while( n > 8 ) {
            movq_m2r( src[ 0 ], mm0 );
            movq_r2m( mm0, dest[ 0 ] );
            dest += 8;
            src += 8;
            n -= 8;
        }

        if( n ) small_memcpy( dest, src, n );

        emms();
    }
}
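small_memcpy, used for the sub-8-byte tail, is not shown; any plain byte copy will do. A minimal stand-in (an assumption, not the project's actual definition):

/* Assumed tail helper for fast_memcpy_mmx (the real one may differ). */
static inline void small_memcpy(uint8_t *dest, const uint8_t *src, size_t n)
{
    while (n--)
        *dest++ = *src++;
}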
Example #18
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
	movq_m2r (*ref, mm2);
	movq_r2r (mm0, mm5);

	movq_m2r (*(ref+1), mm3);
	movq_r2r (mm2, mm6);

	pxor_r2r (mm3, mm6);
	pavg_r2r (mm3, mm2);

	por_r2r (mm6, mm7);
	pxor_r2r (mm2, mm5);

	pand_r2r (mm5, mm7);
	pavg_r2r (mm2, mm0);

	pand_m2r (mask_one, mm7);

	psubusb_r2r (mm7, mm0);

	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;

	movq_r2r (mm6, mm7);	/* unroll ! */
	movq_r2r (mm2, mm0);	/* unroll ! */
    } while (--height);
}
Example #19
static void frame_f2i_sse(float *src,u_char *dst,int l)
{
    int i;

    // put 128 in all 4 words of mm7
    movd_g2r(128,mm7);
    punpcklwd_r2r(mm7,mm7);
    punpckldq_r2r(mm7,mm7);

    // put 128 in all 8 bytes of mm6
    movd_g2r(128,mm6);
    punpcklbw_r2r(mm6,mm6);
    punpcklwd_r2r(mm6,mm6);
    punpckldq_r2r(mm6,mm6);

    for( i=0; i<l; i+=8 ) {
        movaps_m2r(src[0],xmm0);
        movaps_m2r(src[4],xmm2);
        movhlps_r2r(xmm0,xmm1);
        cvtps2pi_r2r(xmm0,mm0);
        cvtps2pi_r2r(xmm1,mm1);
        movhlps_r2r(xmm2,xmm3);
        cvtps2pi_r2r(xmm2,mm2);
        cvtps2pi_r2r(xmm3,mm3);
        packssdw_r2r(mm1,mm0);
        packssdw_r2r(mm3,mm2);
        psubw_r2r(mm7,mm0);
        psubw_r2r(mm7,mm2);
        packsswb_r2r(mm2, mm0);
        paddb_r2r(mm6, mm0);
        movq_r2m(mm0,dst[0]);

        src+=8;
        dst+=8;
    }

    emms();
}
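Per element, the psubw-128 / packsswb / paddb-128 sequence turns signed byte saturation into an unsigned clamp to 0..255. A scalar sketch of the conversion each lane performs (needs <math.h> for lrintf; overflow for inputs far outside the representable range is ignored):

/* Scalar sketch of one lane of frame_f2i_sse: round to nearest like cvtps2pi,
 * then clamp to 0..255 the way the subtract-128 / packsswb / add-128 trick does. */
static inline unsigned char f2i_ref(float v)
{
    long x = lrintf(v);          /* round to nearest (default SSE rounding mode) */
    if (x < 0)   x = 0;
    if (x > 255) x = 255;
    return (unsigned char) x;
}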
Example #20
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;

  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  size = size & 0xf;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
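The shiftmask trick: clearing bit 8 of every 16-bit lane (0xfeff) before psrlw keeps the low bit of one byte from sliding into its neighbour, so the word shift behaves like a per-byte shift and the following paddb cannot overflow. Per byte the loop therefore computes a truncating blend; a scalar sketch:

/* Scalar equivalent of the masked-shift averaging above: each output byte is
 * (top >> 1) + (bot >> 1), which can be one less than (top + bot) / 2. */
static inline unsigned char linear_blend_ref(unsigned char top, unsigned char bot)
{
    return (unsigned char) ((top >> 1) + (bot >> 1));
}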
Example #21
File: color.c  Project: afljafa/mythtv
static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int width,
        uint8_t *m, uint8_t *t, uint8_t *b )
{
    int i;
    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
    const mmx_t cmask = { 0xff00ff00ff00ff00ULL };

    // Get width in bytes.
    width *= 2; 
    i = width / 8;
    width -= i * 8;

    movq_m2r( ymask, mm7 );
    movq_m2r( cmask, mm6 );

    while ( i-- ) 
    {
        movq_m2r( *t, mm0 );
        movq_m2r( *b, mm1 );
        movq_m2r( *m, mm2 );

        movq_r2r ( mm2, mm3 );
        pand_r2r ( mm7, mm3 );

        pand_r2r ( mm6, mm0 );
        pand_r2r ( mm6, mm1 );
        pand_r2r ( mm6, mm2 );

        psrlq_i2r( 8, mm0 );
        psrlq_i2r( 7, mm1 );
        psrlq_i2r( 8, mm2 );

        movq_r2r ( mm0, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm0 );

        movq_r2r ( mm2, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm2 );

        paddw_r2r( mm0, mm2 );
        paddw_r2r( mm1, mm2 );

        psllw_i2r( 5, mm2 );
        pand_r2r( mm6, mm2 );

        por_r2r ( mm3, mm2 );

        movq_r2m( mm2, *output );
        output += 8;
        t += 8;
        b += 8;
        m += 8;
    }
    output++; t++; b++; m++;
    while ( width-- ) 
    {
        *output = (3 * *t + 3 * *m + 2 * *b) >> 3;
        output +=2; t+=2; b+=2; m+=2;
    }

    emms();
}
Example #22
static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
                                                uint8_t *bot, int width )
{
    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
    int i;

    for( i = width/16; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        movq_m2r( *(bot + 8), mm2 );
        movq_m2r( *(top + 8), mm3 );
        movq_m2r( *(bot + 16), mm4 );
        movq_m2r( *(top + 16), mm5 );
        movq_m2r( *(bot + 24), mm6 );
        movq_m2r( *(top + 24), mm7 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        pand_m2r( shiftmask, mm2 );
        pand_m2r( shiftmask, mm3 );
        pand_m2r( shiftmask, mm4 );
        pand_m2r( shiftmask, mm5 );
        pand_m2r( shiftmask, mm6 );
        pand_m2r( shiftmask, mm7 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        psrlw_i2r( 1, mm2 );
        psrlw_i2r( 1, mm3 );
        psrlw_i2r( 1, mm4 );
        psrlw_i2r( 1, mm5 );
        psrlw_i2r( 1, mm6 );
        psrlw_i2r( 1, mm7 );
        paddb_r2r( mm1, mm0 );
        paddb_r2r( mm3, mm2 );
        paddb_r2r( mm5, mm4 );
        paddb_r2r( mm7, mm6 );
        movq_r2m( mm0, *output );
        movq_r2m( mm2, *(output + 8) );
        movq_r2m( mm4, *(output + 16) );
        movq_r2m( mm6, *(output + 24) );
        output += 32;
        top += 32;
        bot += 32;
    }
    width = (width & 0xf);

    for( i = width/4; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        paddb_r2r( mm1, mm0 );
        movq_r2m( mm0, *output );
        output += 8;
        top += 8;
        bot += 8;
    }
    width = width & 0x7;

    /* Handle last few pixels. */
    for( i = width * 2; i; --i ) {
        *output++ = ((*top++) + (*bot++)) >> 1;
    }

    emms();
}
Example #23
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  width = width & 0x7;

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }

  emms ();
}
Example #24
static void
deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL *
    self, const guint8 * m0, const guint8 * t1, const guint8 * b1,
    const guint8 * m2, guint8 * output, gint width)
{
  mmx_t MaxComb;

  // How badly do we let it weave? 0-255
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  // L2 == m0
  // L1 == t1
  // L3 == b1
  // LP2 == m2

  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3 leave result in mm4
    movq_r2r (mm1, mm4);        // L1
    pavgb_r2r (mm3, mm4);       // (L1 + L3)/2

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2
    por_r2r (mm7, mm5);         // abs(avg-L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg
    psubusb_r2r (mm0, mm4);     // avg - LP2
    por_r2r (mm7, mm4);         // abs(avg-LP2)

    // use L2 or LP2 depending upon which makes smaller comb
    psubusb_r2r (mm5, mm4);     // see if it goes to zero
    pxor_r2r (mm5, mm5);        // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4

    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0, else mm4=0, mm5=ff
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 = ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now let's clip our chosen value so it stays within the high/low range
    // of L1-L3, widened by no more than MaxComb in either direction.
    // This allows some comb but limits the damage, and also allows more
    // detail than a boring oversmoothed clip.

    movq_r2r (mm1, mm2);        // copy L1
    pmaxub_r2r (mm3, mm2);      // now = Max(L1,L3)

    pminub_r2r (mm1, mm3);      // now = Min(L1,L3)

    // allow the value to be above the high or below the low by amt of MaxComb
    paddusb_r2r (mm6, mm2);     // increase max by MaxComb
    psubusb_r2r (mm6, mm3);     // lower min by MaxComb


    pmaxub_r2r (mm3, mm4);      // now = Max(best, Min(L1,L3))
    pminub_r2r (mm4, mm2);      // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped

    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  emms ();

  if (width > 0)
    deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
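Per byte, the selection logic amounts to: average the lines above and below, pick whichever of the current-field pixel (L2) or the opposite-field pixel (LP2) is closer to that average, then clamp the pick to the min/max of L1 and L3 widened by MaxComb. A scalar sketch of one pixel (greedy_pixel_ref is a hypothetical name that mirrors the MMX flow above):

/* Scalar sketch of the per-byte greedy decision made by the MMX loop above. */
static inline unsigned char greedy_pixel_ref(unsigned char l1, unsigned char l2,
                                             unsigned char l3, unsigned char lp2,
                                             unsigned char max_comb)
{
    unsigned char avg  = (unsigned char) ((l1 + l3 + 1) >> 1);           /* pavgb */
    int comb_l2  = l2  > avg ? l2  - avg : avg - l2;
    int comb_lp2 = lp2 > avg ? lp2 - avg : avg - lp2;
    unsigned char best = (comb_lp2 <= comb_l2) ? lp2 : l2;

    unsigned char hi = l1 > l3 ? l1 : l3;
    unsigned char lo = l1 > l3 ? l3 : l1;
    hi = (unsigned char) (hi > 255 - max_comb ? 255 : hi + max_comb);    /* paddusb */
    lo = (unsigned char) (lo < max_comb ? 0 : lo - max_comb);            /* psubusb */

    if (best < lo) best = lo;
    if (best > hi) best = hi;
    return best;
}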
Example #25
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2,
					    const uint8_t * src3,
					    const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);	/* load 8 dest bytes */
    movq_r2r (mm3, mm4);	/* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2r (mm1,mm2);		/* copy subresult */

    pxor_r2r (mm1, mm3);	/* xor srcavg and dest */
    pand_m2r (mask1, mm3);	/* mask lower bits */
    psrlq_i2r (1, mm3);		/* /2 */
    por_r2r (mm2, mm4);		/* or srcavg and dest */
    psubb_r2r (mm3, mm4);	/* subtract subresults */
    movq_r2m (mm4, *dest);	/* store result in dest */
}
Example #26
int field_dct_best_mmx( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb)
{
	/*
	 * calculate prediction error (cur-pred) for top (blk0)
	 * and bottom field (blk1)
	 */
	double r,d;
	int rowoffs = 0;
	int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
	int j;
	int dct_type;
	int topvar, botvar;
	mmx_t sumtop_accs, sumbot_accs;
	mmx_t sumsqtop_accs, sumsqbot_accs, sumxprod_accs;
	int32_t sumtop_acc, sumbot_acc;
	int32_t sumsqtop_acc, sumsqbot_acc, sumxprod_acc;

	pxor_r2r(mm0,mm0);
	movq_r2m( mm0, *(&sumtop_accs) );
	movq_r2m( mm0, *(&sumbot_accs) );
	movq_r2m( mm0, *(&sumsqtop_accs) );
	movq_r2m( mm0, *(&sumsqbot_accs) );
	movq_r2m( mm0, *(&sumxprod_accs) );
	
	sumtop = sumsqtop = sumbot = sumsqbot = sumbottop = 0;
	sumtop_acc = sumbot_acc = sumsqtop_acc = sumsqbot_acc = sumxprod_acc = 0; 
	for (j=0; j<8; j++)
	{
#ifdef ORIGINAL_CODE
		for (i=0; i<16; i++)
		{
			register int toppix = 
				cur_lum_mb[rowoffs+i] - pred_lum_mb[rowoffs+i];
			register int botpix = 
				cur_lum_mb[rowoffs+width+i] - pred_lum_mb[rowoffs+width+i];
			sumtop += toppix;
			sumsqtop += toppix*toppix;
			sumbot += botpix;
			sumsqbot += botpix*botpix;
			sumbottop += toppix*botpix;
		}
#endif
		sum_sumsq_8bytes( &cur_lum_mb[rowoffs], &pred_lum_mb[rowoffs],
						  &sumtop_accs, &sumbot_accs,
						  &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs
						  );
		sum_sumsq_8bytes( &cur_lum_mb[rowoffs+8], &pred_lum_mb[rowoffs+8],
						  &sumtop_accs, &sumbot_accs,
						  &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs );
		rowoffs += (opt->phy_width<<1);
	}

	mmx_sum_4_word_accs( &sumtop_accs, &sumtop );
	mmx_sum_4_word_accs( &sumbot_accs, &sumbot );
	emms();
	sumsqtop = sumsqtop_accs.d[0] + sumsqtop_accs.d[1];
	sumsqbot = sumsqbot_accs.d[0] + sumsqbot_accs.d[1];
	sumbottop = sumxprod_accs.d[0] + sumxprod_accs.d[1];

	/* Calculate variances for the top and bottom fields.  If they have
	   the same sign, estimate the correlation; if it is strong, use
	   frame DCT, otherwise use field DCT.
	*/
	r = 0.0;
	topvar = sumsqtop-sumtop*sumtop/128;
	botvar = sumsqbot-sumbot*sumbot/128;
	if ( !((topvar <= 0) ^ (botvar <= 0)) )
	{
		d = ((double) topvar) * ((double)botvar);
		r = (sumbottop-(sumtop*sumbot)/128);
		if (r>0.5*sqrt(d))
			return 0; /* frame DCT */
		else
			return 1; /* field DCT */
	}
	else
		return 1; /* field DCT */

	return dct_type;
}
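mmx_sum_4_word_accs is not shown here; it horizontally adds the four signed 16-bit lanes of an accumulator into a 32-bit total. A plain-C stand-in, assuming the usual mmx_t union with a w[4] member (the real helper is an MMX macro and may differ):

/* Assumed plain-C equivalent of mmx_sum_4_word_accs (for reference only). */
static inline void mmx_sum_4_word_accs_ref(mmx_t *acc, int *total)
{
    *total += acc->w[0] + acc->w[1] + acc->w[2] + acc->w[3];
}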
Example #27
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc,
			     int width, int height )
{

  int Line;
  long long* YVal1;
  long long* YVal2;
  long long* YVal3;
  long long* Dest;
  uint8_t* pEvenLines = psrc;
  uint8_t* pOddLines = psrc+width;
  int LineLength = width;
  int Pitch = width * 2;
  int IsOdd = 1;
  long EdgeDetect = 625;
  long JaggieThreshold = 73;

  int n;

  unsigned long long qwEdgeDetect;
  unsigned long long qwThreshold;
  const unsigned long long Mask = 0xfefefefefefefefeULL;
  const unsigned long long YMask = 0x00ff00ff00ff00ffULL;

  qwEdgeDetect = EdgeDetect;
  qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
  qwThreshold = JaggieThreshold;
  qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);


  // copy first even line no matter what, and the first odd line if we're
  // processing an odd field.
  ac_memcpy(pdst, pEvenLines, LineLength);
  if (IsOdd)
    ac_memcpy(pdst + LineLength, pOddLines, LineLength);

  height = height / 2;
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (long long *)(pOddLines + Line * Pitch);
      YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch);
      YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch);
      Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength);
    }
    else
    {
      YVal1 = (long long *)(pEvenLines + Line * Pitch);
      YVal2 = (long long *)(pOddLines + Line * Pitch);
      YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch);
      Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength);
    }

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that bIsOdd is true).  The exact same processing is done when we
    // operate on an even field, but the roles of the odd and even fields are reversed.
    // It's just too cumbersome to explain the algorithm in terms of "the next odd
    // line if we're doing an odd field, or the next even line if we're doing an
    // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
    // half the time this function is called, those words' meanings will invert.

    // Copy the odd line to the overlay verbatim.
    ac_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    n = LineLength >> 3;
    while( n-- )
    {
      movq_m2r (*YVal1++, mm0);
      movq_m2r (*YVal2++, mm1);
      movq_m2r (*YVal3++, mm2);

      // get intensities in mm3 - mm5
      movq_r2r ( mm0, mm3 );
      movq_r2r ( mm1, mm4 );
      movq_r2r ( mm2, mm5 );

      pand_m2r ( *&YMask, mm3 );
      pand_m2r ( *&YMask, mm4 );
      pand_m2r ( *&YMask, mm5 );

      // get average in mm0
      pand_m2r ( *&Mask, mm0 );
      pand_m2r ( *&Mask, mm2 );
      psrlw_i2r ( 01, mm0 );
      psrlw_i2r ( 01, mm2 );
      paddw_r2r ( mm2, mm0 );

      // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
      // result will be in mm6

      psrlw_i2r ( 01, mm3 );
      psrlw_i2r ( 01, mm4 );
      psrlw_i2r ( 01, mm5 );

      movq_r2r ( mm3, mm6 );
      psubw_r2r ( mm4, mm6 );	//mm6 = O1 - E

      movq_r2r ( mm5, mm7 );
      psubw_r2r ( mm4, mm7 );	//mm7 = O2 - E

      pmullw_r2r ( mm7, mm6 );		// mm6 = (O1 - E) * (O2 - E)

      movq_r2r ( mm3, mm7 );
      psubw_r2r ( mm5, mm7 );		// mm7 = (O1 - O2)
      pmullw_r2r ( mm7, mm7 );	// mm7 = (O1 - O2) ^ 2
      psrlw_i2r ( 12, mm7 );		// mm7 = (O1 - O2) ^ 2 >> 12
      pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7  = EdgeDetect * (O1 - O2) ^ 2 >> 12

      psubw_r2r ( mm7, mm6 );      // mm6 is what we want

      pcmpgtw_m2r ( *&qwThreshold, mm6 );

      movq_r2r ( mm6, mm7 );

      pand_r2r ( mm6, mm0 );

      pandn_r2r ( mm1, mm7 );

      por_r2r ( mm0, mm7 );

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy last odd line if we're processing an even field.
  if (! IsOdd)
  {
    ac_memcpy(pdst + (height * 2 - 1) * LineLength,
                      pOddLines + (height - 1) * Pitch,
                      LineLength);
  }

  // clear out the MMX registers ready for doing floating point
  // again
  emms();
}
Example #28
static __inline__ void
sum_sumsq_8bytes( uint8_t *cur_lum_mb, 
				  uint8_t *pred_lum_mb,
				  mmx_t *sumtop_accs,
				  mmx_t *sumbot_accs,
				  mmx_t *sumsqtop_accs,
				  mmx_t *sumsqbot_accs,
				  mmx_t *sumxprod_accs
	)
{
	pxor_r2r(mm0,mm0);

	/* Load pixels from top field into mm1.w,mm2.w
	 */
	movq_m2r( *((mmx_t*)cur_lum_mb), mm1 );
	movq_m2r( *((mmx_t*)pred_lum_mb), mm2 );
	
	/* mm3 := mm1 mm4 := mm2
	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
	*/ 
	
	movq_r2r( mm1, mm3 );
	punpcklbw_r2r( mm0, mm1 );
	movq_r2r( mm2, mm4 );
	punpcklbw_r2r( mm0, mm2 );
	psubw_r2r( mm2, mm1 );
	
	/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
	 */
	punpckhbw_r2r( mm0, mm3 );
	punpckhbw_r2r( mm0, mm4 );
	psubw_r2r( mm4, mm3 );

	/* sumtop_accs->w[0..3] += mm1.w[0..3];
	   sumtop_accs->w[0..3] += mm3.w[0..3];
	   mm6 = mm1; mm7 = mm3;
	*/
	movq_m2r( *sumtop_accs, mm5 );
	paddw_r2r( mm1, mm5 );
	paddw_r2r( mm3, mm5 );
	movq_r2r( mm1, mm6 );
	movq_r2r( mm3, mm7 );
	movq_r2m( mm5, *sumtop_accs );

	/* 
	   *sumsq_top_acc += mm1.w[0..3] * mm1.w[0..3];
	   *sumsq_top_acc += mm3.w[0..3] * mm3.w[0..3];
	*/
	pmaddwd_r2r( mm1, mm1 );
	movq_m2r( *sumsqtop_accs, mm5 );
	pmaddwd_r2r( mm3, mm3 );
	paddd_r2r( mm1, mm5 );
	paddd_r2r( mm3, mm5 );
	movq_r2m( mm5, *sumsqtop_accs );
	

	/* Load pixels from bot field into mm1.w,mm2.w
	 */
	movq_m2r( *((mmx_t*)(cur_lum_mb+opt->phy_width)), mm1 );
	movq_m2r( *((mmx_t*)(pred_lum_mb+opt->phy_width)), mm2 );
	
	/* mm2 := mm1 mm4 := mm2
	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
	*/ 
	
	movq_r2r( mm1, mm3 );
	punpcklbw_r2r( mm0, mm1 );
	movq_r2r( mm2, mm4 );
	punpcklbw_r2r( mm0, mm2 );
	psubw_r2r( mm2, mm1 );
	
	/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
	 */
	punpckhbw_r2r( mm0, mm3 );
	punpckhbw_r2r( mm0, mm4 );
	psubw_r2r( mm4, mm3 );

	/* 
	   sumbot_accs->w[0..3] += mm1.w[0..3];
	   sumbot_accs->w[0..3] += mm3.w[0..3];
	   mm2 := mm1; mm4 := mm3;
	*/
	movq_m2r( *sumbot_accs, mm5 );
	paddw_r2r( mm1, mm5 );
	movq_r2r( mm1, mm2 );
	paddw_r2r( mm3, mm5 );
	movq_r2r( mm3, mm4 );
	movq_r2m( mm5, *sumbot_accs );

	/* 
	   *sumsqbot_acc += mm1.w[0..3] * mm1.w[0..3];
	   *sumsqbot_acc += mm3.w[0..3] * mm3.w[0..3];
	*/
	pmaddwd_r2r( mm1, mm1 );
	movq_m2r( *sumsqbot_accs, mm5 );
	pmaddwd_r2r( mm3, mm3 );
	paddd_r2r( mm1, mm5 );
	paddd_r2r( mm3, mm5 );
	movq_r2m( mm5, *sumsqbot_accs );
	
	
	/* Accumulate cross-product 
	 *sum_xprod_acc += mm1.w[0..3] * mm6[0..3];
	 *sum_xprod_acc += mm3.w[0..3] * mm7[0..3];
	 */

	movq_m2r( *sumxprod_accs, mm5 );
	pmaddwd_r2r( mm6, mm2);
	pmaddwd_r2r( mm7, mm4);
	paddd_r2r( mm2, mm5 );
	paddd_r2r( mm4, mm5 );
	movq_r2m( mm5, *sumxprod_accs );
	emms();
}