예제 #1
0
static inline void mmx_average_2_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2)
{
    /*
     * For each of 8 bytes: *dest = (*src1 + *src2 + 1) / 2.
     *
     * mm0 is the second operand of every unpack and is never written in
     * this function; presumably the caller has cleared it so that the
     * unpacks zero-extend bytes to 16-bit words -- TODO confirm at the
     * call sites.  round1 is defined elsewhere (presumably the +1 of the
     * formula above in every word).
     */

    /* fetch both sources and keep a second copy of each for the high half */
    movq_m2r (*src1, mm1);
    movq_m2r (*src2, mm3);
    movq_r2r (mm1, mm2);
    movq_r2r (mm3, mm4);

    /* low four bytes: widen to words, sum, add rounding term, halve */
    punpcklbw_r2r (mm0, mm1);
    punpcklbw_r2r (mm0, mm3);
    paddw_r2r (mm3, mm1);
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);

    /* high four bytes: same steps on the copies */
    punpckhbw_r2r (mm0, mm2);
    punpckhbw_r2r (mm0, mm4);
    paddw_r2r (mm4, mm2);
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);

    /* narrow both halves back to 8 bytes (unsigned saturation) and store */
    packuswb_r2r (mm2, mm1);
    movq_r2m (mm1, *dest);
}
예제 #2
0
파일: motion_comp_mmx.c 프로젝트: 1c0n/xbmc
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
				     const uint8_t * src2,
				     const uint8_t * src3,
				     const uint8_t * src4)
{
    /*
     * For each of 8 bytes:
     *   *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4
     *
     * mm1 accumulates the low four bytes as words, mm2 the high four.
     * mm0 is only read (second operand of every unpack); presumably the
     * caller keeps it zero so the unpacks zero-extend -- TODO confirm.
     */

    /* src1 seeds the two accumulators */
    movq_m2r (*src1, mm1);
    movq_r2r (mm1, mm2);
    punpcklbw_r2r (mm0, mm1);
    punpckhbw_r2r (mm0, mm2);

    /* fold in src2: low half first, then high half */
    movq_m2r (*src2, mm3);
    movq_r2r (mm3, mm4);
    punpcklbw_r2r (mm0, mm3);
    paddw_r2r (mm3, mm1);
    punpckhbw_r2r (mm0, mm4);
    paddw_r2r (mm4, mm2);

    /* fold in src3 (mm3/mm4 are reused as scratch) */
    movq_m2r (*src3, mm3);
    movq_r2r (mm3, mm4);
    punpcklbw_r2r (mm0, mm3);
    paddw_r2r (mm3, mm1);
    punpckhbw_r2r (mm0, mm4);
    paddw_r2r (mm4, mm2);

    /* fold in src4 (scratch is mm5/mm6 this time) */
    movq_m2r (*src4, mm5);
    movq_r2r (mm5, mm6);
    punpcklbw_r2r (mm0, mm5);
    paddw_r2r (mm5, mm1);
    punpckhbw_r2r (mm0, mm6);
    paddw_r2r (mm6, mm2);

    /* add the rounding term to both sums, then divide both by 4 */
    paddw_m2r (round4, mm1);
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm1);
    psraw_i2r (2, mm2);

    /* narrow to bytes with unsigned saturation and write the 8 results */
    packuswb_r2r (mm2, mm1);
    movq_r2m (mm1, *dest);
}
예제 #3
0
static inline void mmx_average_4_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2,
                                     uint8_t * src3, uint8_t * src4)
{
    /*
     * Eight bytes at a time:
     *   *dest = (*src1 + *src2 + *src3 + *src4 + 2) / 4
     *
     * Low-half word sums live in mm1, high-half word sums in mm2.
     * mm0 is read by every unpack but never written here; presumably it
     * holds zero on entry so the bytes are zero-extended -- TODO confirm.
     */

    /* src1 and src4 use disjoint registers, so fetch and widen both now */
    movq_m2r (*src1, mm1);
    movq_r2r (mm1, mm2);
    movq_m2r (*src4, mm5);
    movq_r2r (mm5, mm6);

    punpcklbw_r2r (mm0, mm1);
    punpckhbw_r2r (mm0, mm2);
    punpcklbw_r2r (mm0, mm5);
    punpckhbw_r2r (mm0, mm6);

    /* src1 + src4 (word adds wrap, so summation order does not matter) */
    paddw_r2r (mm5, mm1);
    paddw_r2r (mm6, mm2);

    /* + src2, using mm3/mm4 as scratch */
    movq_m2r (*src2, mm3);
    movq_r2r (mm3, mm4);
    punpcklbw_r2r (mm0, mm3);
    punpckhbw_r2r (mm0, mm4);
    paddw_r2r (mm3, mm1);
    paddw_r2r (mm4, mm2);

    /* + src3, same scratch registers again */
    movq_m2r (*src3, mm3);
    movq_r2r (mm3, mm4);
    punpcklbw_r2r (mm0, mm3);
    punpckhbw_r2r (mm0, mm4);
    paddw_r2r (mm3, mm1);
    paddw_r2r (mm4, mm2);

    /* rounding term, then arithmetic shift right by 2 (= divide by 4) */
    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);

    /* pack the two word vectors into 8 saturated bytes and store them */
    packuswb_r2r (mm2, mm1);
    movq_r2m (mm1, *dest);
}
예제 #4
0
/*
 * Horizontally scale one scanline of 4-byte pixels with a two-tap (linear)
 * filter, using MMX.
 *
 *   ctx        - scale context; this function reads src, src_stride,
 *                dst_size and table_h.pixels[], and sets need_emms.
 *   scanline   - index of the source row (multiplied by ctx->src_stride).
 *   dest_start - output row; 4 bytes are written per destination pixel.
 *
 * For destination pixel i the two source pixels at table_h.pixels[i].index
 * and the one directly after it are blended channel by channel as
 *   s2 + factor * (s1 - s2)
 * in 16-bit fixed point. No emms is issued here; need_emms is set instead.
 */
static void scale_uint8_x_4_x_bilinear_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  Register usage as visible in this function:
 *  mm0: first input pixel, then the blended result that is stored
 *  mm1: loaded once from factor_mask (not referenced again here;
 *       presumably consumed by LOAD_FACTOR_1_4_NOCLIP -- TODO confirm)
 *  mm2: unused
 *  mm3: unused
 *  mm4: unused
 *  mm5: second input pixel
 *  mm6: 0 (unpack/pack helper)
 *  mm7: factor (not written here; presumably set by
 *       LOAD_FACTOR_1_4_NOCLIP from `factors` -- TODO confirm)
 */

//  fprintf(stderr, "scale_uint8_x_4_x_bilinear_mmx\n");
  
  /* First byte of the requested source row */
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    /* 4 bytes per pixel: index selects the left source pixel */
    src = src_start + 4*ctx->table_h.pixels[i].index;
    /* Not used directly below; presumably read by the LOAD_FACTOR macro */
    factors = ctx->table_h.pixels[i].factor_i;
    
    /* Load first pixel (s1), widen 4 bytes to 4 words */
    movd_m2r(*(src), mm0);
    punpcklbw_r2r(mm6, mm0);
    psllw_i2r(6, mm0); /* 14 bit */
    /* Load the following pixel (s2), same treatment */
    movd_m2r(*(src+4), mm5);
    punpcklbw_r2r(mm6, mm5);
    psllw_i2r(6, mm5); /* 14 bit */

    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP; /* 14 bit */
    /* Subtract */
    psubsw_r2r(mm5, mm0); /* s1(mm0) - s2(mm5) -> mm0 (14 bit) */
    pmulhw_r2r(mm7, mm0); /* high word of factor * (s1 - s2) -> mm0 (12 bit) */
    
    psllw_i2r(2, mm0); /* back to 14 bit */
    
    paddsw_r2r(mm5, mm0);/* s2 + factor * (s1 - s2) (15 bit) */
    
    psraw_i2r(6, mm0);/* (8 bit) */
    /* Narrow the 4 words to bytes with unsigned saturation, store 4 bytes */
    packuswb_r2r(mm6, mm0);
    movd_r2m(mm0, *dst);
    
    dst+=4;
    }
  /* Record that MMX registers were touched; no emms is executed here */
  ctx->need_emms = 1;
  
  }
예제 #5
0
/*
 * Horizontally scale one scanline of 4-byte pixels with an N-tap filter,
 * using MMX. N is ctx->table_h.factors_per_pixel.
 *
 *   ctx        - scale context; this function reads src, src_stride,
 *                dst_size, table_h.pixels[] and table_h.factors_per_pixel,
 *                and sets need_emms.
 *   scanline   - index of the source row (multiplied by ctx->src_stride).
 *   dest_start - output row; 4 bytes are written per destination pixel.
 *
 * For every destination pixel, N consecutive source pixels starting at
 * table_h.pixels[i].index are weighted and summed per channel in 16-bit
 * lanes. No emms is issued here; need_emms is set instead.
 */
static void scale_uint8_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i, j;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  Register usage as visible in this function:
 *  mm0: current input pixel, then its weighted value
 *  mm1: loaded once from factor_mask (not referenced again here;
 *       presumably consumed by LOAD_FACTOR_1_4 -- TODO confirm)
 *  mm2: unused here (the original table called it "Factor"; it may be
 *       used inside LOAD_FACTOR_1_4 -- TODO confirm)
 *  mm3: accumulator / output
 *  mm4: unused
 *  mm5: unused
 *  mm6: 0 (unpack/pack helper)
 *  mm7: multiplier for pmulhw (not written here; presumably set by
 *       LOAD_FACTOR_1_4 from `factors` -- TODO confirm)
 */
  
  /* First byte of the requested source row */
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    /* 4 bytes per pixel: index selects the first contributing pixel */
    src = src_start + 4*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    /* Clear the accumulator for this destination pixel */
    pxor_r2r(mm3, mm3);

    for(j = 0; j < ctx->table_h.factors_per_pixel; j++)
      {
      /* Load pixel: widen 4 bytes to 4 words, scale up by 2^7 */
      movd_m2r(*(src), mm0);
      punpcklbw_r2r(mm6, mm0);
      psllw_i2r(7, mm0);
      /* Load factors */
      LOAD_FACTOR_1_4;
      /* Multiply (keeps the high 16 bits of each product) and accumulate */
      pmulhw_r2r(mm7, mm0);
      paddw_r2r(mm0, mm3);
      //    DUMP_MM("mm3_2", mm3);
      /* Advance to the next source pixel and the next coefficient */
      src += 4;
      factors++;
      
      }
    
    /* Drop the remaining fractional bits, saturate to bytes, store 4 bytes */
    psraw_i2r(5, mm3);
    packuswb_r2r(mm6, mm3);
    movd_r2m(mm3, *dst);
    
    dst+=4;
    }
  /* Record that MMX registers were touched; no emms is executed here */
  ctx->need_emms = 1;
  
  }
예제 #6
0
static __inline__ void
mmx_sum_4_word_accs( mmx_t *accs, int32_t *res )
{
	/*
	 * Add up the four signed 16-bit words stored at *accs and write the
	 * 32-bit total to *res.  Uses mm1, mm2 and mm3; no emms is issued.
	 */

	/* Three copies: mm1 -> low pair, mm2 -> high pair, mm3 -> sign mask */
	movq_m2r( *accs, mm1 );
	movq_r2r( mm1, mm2 );
	movq_r2r( mm1, mm3 );

	/* Arithmetic shift by 15 turns every word into 0x0000 or 0xFFFF,
	   i.e. the upper half of its sign-extended 32-bit form. */
	psraw_i2r( 15, mm3 );

	/* Interleave value and sign words: two signed dwords per register */
	punpckhwd_r2r( mm3, mm2 );
	punpcklwd_r2r( mm3, mm1 );

	/* Pairwise add, then fold the upper dword onto the lower one */
	paddd_r2r( mm1, mm2 );
	movq_r2r( mm2, mm3 );
	psrlq_i2r( 32, mm2 );
	paddd_r2r( mm2, mm3 );

	/* The low dword of mm3 now holds the sum of all four words */
	movd_r2m( mm3, *res );
}
예제 #7
0
파일: algo_x.c 프로젝트: RodrigoNieves/vlc
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst,  int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    /*
     * Builds an 8x8 output block, two rows per pass:
     *   - row 2k   : the 8 bytes of the current src1 row, copied as-is
     *   - row 2k+1 : ( src1[x] + src1[i_src1 + x] + 6 * src2[x] + 4 ) >> 3
     * dst advances two rows per pass; src1 and src2 advance one row each.
     * Uses mm0-mm3 and mm7; no emms is issued here.
     */

    /* The value 4 in each 16-bit lane: rounding term for the >> 3 below */
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int i_pair, i_col;

    /* mm7 = 0, used to widen bytes to words and as the upper pack half */
    pxor_r2r( mm7, mm7 );

    for( i_pair = 0; i_pair < 4; i_pair++ )
    {
        /* 8 pixels per row, handled as two groups of 4 bytes */
        for( i_col = 0; i_col <= 4; i_col += 4 )
        {
            /* even output row: straight copy of 4 bytes from src1 */
            movd_m2r( src1[i_col], mm0 );
            movd_r2m( mm0, dst[i_col] );

            movd_m2r( src2[i_col], mm1 );
            movd_m2r( src1[i_src1+i_col], mm2 );

            /* mm0 = src1 (this row) + src1 (next row), as words */
            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm2 );
            paddw_r2r( mm2, mm0 );

            /* mm1 = 6 * src2 + 4, built as 2*s2 + 4*s2 + rounding */
            punpcklbw_r2r( mm7, mm1 );
            paddw_r2r( mm1, mm1 );
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );
            paddw_r2r( mm3, mm1 );
            paddw_m2r( m_4, mm1 );

            /* combine, divide by 8, saturate to bytes, store odd row */
            paddw_r2r( mm1, mm0 );
            psraw_i2r( 3, mm0 );
            packuswb_r2r( mm7, mm0 );
            movd_r2m( mm0, dst[i_dst+i_col] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
예제 #8
0
파일: motion_comp_mmx.c 프로젝트: 1c0n/xbmc
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2,
					    const uint8_t * src3,
					    const uint8_t * src4)
{
    /*
     * For each of 8 bytes:
     *   avg   = (*src1 + *src2 + *src3 + *src4 + 2) / 4
     *   *dest = (*dest + avg + 1) / 2
     *
     * mm1/mm2 accumulate the low/high four bytes as words.  mm0 is only
     * read (second operand of every unpack); presumably the caller keeps
     * it zero so the unpacks zero-extend -- TODO confirm.
     */

    /* src1 seeds the two accumulators */
    movq_m2r (*src1, mm1);
    movq_r2r (mm1, mm2);
    punpcklbw_r2r (mm0, mm1);
    punpckhbw_r2r (mm0, mm2);

    /* fold in src2: low half, then high half */
    movq_m2r (*src2, mm3);
    movq_r2r (mm3, mm4);
    punpcklbw_r2r (mm0, mm3);
    paddw_r2r (mm3, mm1);
    punpckhbw_r2r (mm0, mm4);
    paddw_r2r (mm4, mm2);

    /* fold in src3 (mm3/mm4 reused as scratch) */
    movq_m2r (*src3, mm3);
    movq_r2r (mm3, mm4);
    punpcklbw_r2r (mm0, mm3);
    paddw_r2r (mm3, mm1);
    punpckhbw_r2r (mm0, mm4);
    paddw_r2r (mm4, mm2);

    /* fold in src4 (scratch is mm5/mm6) */
    movq_m2r (*src4, mm5);
    movq_r2r (mm5, mm6);
    punpcklbw_r2r (mm0, mm5);
    paddw_r2r (mm5, mm1);
    punpckhbw_r2r (mm0, mm6);
    paddw_r2r (mm6, mm2);

    /* rounding term on both sums, then divide both by 4 */
    paddw_m2r (round4, mm1);
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm1);
    psraw_i2r (2, mm2);

    /* avg as 8 saturated bytes in mm1, with a second copy in mm2 */
    packuswb_r2r (mm2, mm1);
    movq_r2r (mm1,mm2);

    /* current destination bytes, twice: mm3 for the xor, mm4 for the or */
    movq_m2r (*dest, mm3);
    movq_r2r (mm3, mm4);

    /*
     * Byte-wise rounded-up average without widening:
     *   (a + b + 1) / 2  ==  (a | b) - ((a ^ b) >> 1)
     * The shift is a 64-bit one, so mask1 is applied first; presumably it
     * clears the lowest bit of every byte so nothing crosses a byte
     * boundary -- mask1 is defined elsewhere, verify.
     */
    por_r2r (mm2, mm4);
    pxor_r2r (mm1, mm3);
    pand_m2r (mask1, mm3);
    psrlq_i2r (1, mm3);
    psubb_r2r (mm3, mm4);

    /* write the 8 blended bytes back */
    movq_r2m (mm4, *dest);
}
예제 #9
0
static void
_evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_MMX
   int xx, yy;
   register unsigned char *yp1, *up, *vp;
   unsigned char *dp1;

   /* destination pointers */
   dp1 = rgb;

   for (yy = 0; yy < h; yy++)
     {
	/* plane pointers */
	yp1 = yuv[yy];
	up = yuv[h + (yy / 2)];
	vp = yuv[h + (h / 2) + (yy / 2)];
	for (xx = 0; xx < (w - 7); xx += 8)
	  {
	     movd_m2r(*up, mm3);
	     movd_m2r(*vp, mm2);
	     movq_m2r(*yp1, mm0);

	     pxor_r2r(mm7, mm7);
	     punpcklbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm7, mm3);

	     movq_r2r(mm0, mm1);
	     psrlw_i2r(8, mm0);
	     psllw_i2r(8, mm1);
	     psrlw_i2r(8, mm1);

	     movq_m2r(CONST_16, mm4);
	     psubsw_r2r(mm4, mm0);
	     psubsw_r2r(mm4, mm1);

	     movq_m2r(CONST_128, mm5);
	     psubsw_r2r(mm5, mm2);
	     psubsw_r2r(mm5, mm3);

	     movq_m2r(CONST_YMUL, mm4);
	     pmullw_r2r(mm4, mm0);
	     pmullw_r2r(mm4, mm1);

	     movq_m2r(CONST_CRVCRV, mm7);
	     pmullw_r2r(mm3, mm7);
	     movq_m2r(CONST_CBUCBU, mm6);
	     pmullw_r2r(mm2, mm6);
	     movq_m2r(CONST_CGUCGU, mm5);
	     pmullw_r2r(mm2, mm5);
	     movq_m2r(CONST_CGVCGV, mm4);
	     pmullw_r2r(mm3, mm4);

	     movq_r2r(mm0, mm2);
	     paddsw_r2r(mm7, mm2);
	     paddsw_r2r(mm1, mm7);

	     psraw_i2r(RES, mm2);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm2);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm2, mm3);
	     punpckhbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm3, mm7);
	     por_r2r(mm7, mm2);

	     movq_r2r(mm0, mm3);
	     psubsw_r2r(mm5, mm3);
	     psubsw_r2r(mm4, mm3);
	     paddsw_m2r(CONST_32, mm3);

	     movq_r2r(mm1, mm7);
	     psubsw_r2r(mm5, mm7);
	     psubsw_r2r(mm4, mm7);
	     paddsw_m2r(CONST_32, mm7);

	     psraw_i2r(RES, mm3);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm3);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm3, mm4);
	     punpckhbw_r2r(mm7, mm3);
	     punpcklbw_r2r(mm4, mm7);
	     por_r2r(mm7, mm3);

	     movq_m2r(CONST_32, mm4);
	     paddsw_r2r(mm6, mm0);
	     paddsw_r2r(mm6, mm1);
	     paddsw_r2r(mm4, mm0);
	     paddsw_r2r(mm4, mm1);
	     psraw_i2r(RES, mm0);
	     psraw_i2r(RES, mm1);
	     packuswb_r2r(mm1, mm0);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm0, mm5);
	     punpckhbw_r2r(mm7, mm0);
	     punpcklbw_r2r(mm5, mm7);
	     por_r2r(mm7, mm0);

	     movq_m2r(CONST_FF, mm1);
	     movq_r2r(mm0, mm5);
	     movq_r2r(mm3, mm6);
	     movq_r2r(mm2, mm7);
	     punpckhbw_r2r(mm3, mm2);
	     punpcklbw_r2r(mm6, mm7);
	     punpckhbw_r2r(mm1, mm0);
	     punpcklbw_r2r(mm1, mm5);

	     movq_r2r(mm7, mm1);
	     punpckhwd_r2r(mm5, mm7);
	     punpcklwd_r2r(mm5, mm1);

	     movq_r2r(mm2, mm4);
	     punpckhwd_r2r(mm0, mm2);
	     punpcklwd_r2r(mm0, mm4);

	     movntq_r2m(mm1, *(dp1));
	     movntq_r2m(mm7, *(dp1 + 8));
	     movntq_r2m(mm4, *(dp1 + 16));
	     movntq_r2m(mm2, *(dp1 + 24));

	     yp1 += 8;
	     up += 4;
	     vp += 4;
	     dp1 += 8 * 4;
	  }
	/* cleanup pixles that arent a multiple of 8 pixels wide */
	if (xx < w)
	  {
	     int y, u, v, r, g, b;

	     for (; xx < w; xx += 2)
	       {
		  u = (*up++) - 128;
		  v = (*vp++) - 128;

		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;

		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;
	       }
	  }
     }
예제 #10
0
파일: yuvrgb.cpp 프로젝트: fuxiang90/fawkes
/** Convert a planar YUV image with 2x2 subsampled chroma to 32-bit pixels.
 * The input is read as a full-resolution plane of w * h bytes at @p yuv,
 * followed by two planes of (w / 2) * (h / 2) bytes each (accessed through
 * the local pointers up and vp). Every pair of horizontally adjacent luma
 * samples shares one sample of each chroma plane, and every pair of
 * vertically adjacent luma rows shares one chroma row.
 *
 * Per pixel four bytes are written to @p rgb, in memory order:
 *   byte 0: luma term + CONST_CRVCRV * (sample from up - 128)
 *   byte 1: luma term - CONST_CGUCGU * (sample from vp - 128)
 *                     - CONST_CGVCGV * (sample from up - 128) + CONST_32
 *   byte 2: luma term + CONST_CBUCBU * (sample from vp - 128) + CONST_32
 *   byte 3: 0
 * each shifted right by RES and saturated to 0..255, where the luma term
 * is (Y - 16) * CONST_YMUL. The constants are defined elsewhere.
 *
 * Eight pixels are processed per iteration with 8-byte loads and four
 * 8-byte non-temporal stores, so @p w is expected to be a multiple of 8;
 * otherwise the loop reads and writes past the end of each row. emms() is
 * called before returning.
 *
 * @param yuv source buffer (luma plane followed by the two chroma planes)
 * @param rgb destination buffer, 4 * w * h bytes
 * @param w image width in pixels (expected to be a multiple of 8)
 * @param h image height in pixels
 */
void
yuv411planar_to_rgb_mmx (const unsigned char *yuv, unsigned char *rgb,
			 unsigned int w, unsigned int h)
{
  unsigned int xx, yy;
  const unsigned char *yp1, *up, *vp;
  unsigned char *dp1;

  /* plane pointers */
  yp1 = yuv;
  up = yuv + (w * h);
  vp = up + ((w / 2) * (h / 2));
  /* destination pointer */
  dp1 = rgb;

  for (yy = 0; yy < h; yy++)
    {
      for (xx = 0; xx < w; xx += 8)
	{
	  /* 8 luma bytes: mm0 = odd-indexed pixels, mm1 = even-indexed
	     pixels, each as four 16-bit words */
	  movq_m2r(*yp1, mm0);
	  movq_r2r(mm0, mm1);
	  psrlw_i2r(8, mm0);
	  psllw_i2r(8, mm1);
	  psrlw_i2r(8, mm1);

	  /* 4 chroma bytes from each plane, widened to words */
	  pxor_r2r(mm7, mm7);
	  movd_m2r(*up, mm3);
	  movd_m2r(*vp, mm2);

	  punpcklbw_r2r(mm7, mm2);
	  punpcklbw_r2r(mm7, mm3);

	  /* remove the offsets: luma - 16, chroma - 128 */
	  movq_m2r(CONST_16, mm4);
	  psubsw_r2r(mm4, mm0);
	  psubsw_r2r(mm4, mm1);

	  movq_m2r(CONST_128, mm5);
	  psubsw_r2r(mm5, mm2);
	  psubsw_r2r(mm5, mm3);

	  /* scale luma */
	  movq_m2r(CONST_YMUL, mm4);
	  pmullw_r2r(mm4, mm0);
	  pmullw_r2r(mm4, mm1);

	  /* chroma contributions: mm7, mm6, mm5, mm4 */
	  movq_m2r(CONST_CRVCRV, mm7);
	  pmullw_r2r(mm3, mm7);

	  movq_m2r(CONST_CBUCBU, mm6);
	  pmullw_r2r(mm2, mm6);

	  movq_m2r(CONST_CGUCGU, mm5);
	  pmullw_r2r(mm2, mm5);

	  movq_m2r(CONST_CGVCGV, mm4);
	  pmullw_r2r(mm3, mm4);

	  /* first component (CRVCRV term): odd pixels in mm2, even in mm7 */
	  movq_r2r(mm0, mm2);
	  paddsw_r2r(mm7, mm2);
	  paddsw_r2r(mm1, mm7);

	  psraw_i2r(RES, mm2);
	  psraw_i2r(RES, mm7);
	  packuswb_r2r(mm7, mm2);

	  /* re-interleave even/odd bytes back into pixel order -> mm2 */
	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm2, mm3);
	  punpckhbw_r2r(mm7, mm2);
	  punpcklbw_r2r(mm3, mm7);
	  por_r2r(mm7, mm2);

	  /* second component (CGUCGU/CGVCGV terms, with CONST_32 added) */
	  movq_r2r(mm0, mm3);
	  psubsw_r2r(mm5, mm3);
	  psubsw_r2r(mm4, mm3);
	  paddsw_m2r(CONST_32, mm3);

	  movq_r2r(mm1, mm7);
	  psubsw_r2r(mm5, mm7);
	  psubsw_r2r(mm4, mm7);
	  paddsw_m2r(CONST_32, mm7);

	  psraw_i2r(RES, mm3);
	  psraw_i2r(RES, mm7);
	  packuswb_r2r(mm7, mm3);

	  /* back into pixel order -> mm3 */
	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm3, mm4);
	  punpckhbw_r2r(mm7, mm3);
	  punpcklbw_r2r(mm4, mm7);
	  por_r2r(mm7, mm3);

	  /* third component (CBUCBU term, with CONST_32 added) */
	  movq_m2r(CONST_32, mm4);
	  paddsw_r2r(mm6, mm0);
	  paddsw_r2r(mm6, mm1);
	  paddsw_r2r(mm4, mm0);
	  paddsw_r2r(mm4, mm1);
	  psraw_i2r(RES, mm0);
	  psraw_i2r(RES, mm1);
	  packuswb_r2r(mm1, mm0);

	  /* back into pixel order -> mm0 */
	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm0, mm5);
	  punpckhbw_r2r(mm7, mm0);
	  punpcklbw_r2r(mm5, mm7);
	  por_r2r(mm7, mm0);

	  /* interleave the three components plus a zero byte (mm1 = 0)
	     into eight 4-byte pixels: mm1, mm7, mm4, mm2 */
	  pxor_r2r(mm1, mm1);
	  movq_r2r(mm0, mm5);
	  movq_r2r(mm3, mm6);
	  movq_r2r(mm2, mm7);
	  punpckhbw_r2r(mm3, mm2);
	  punpcklbw_r2r(mm6, mm7);
	  punpckhbw_r2r(mm1, mm0);
	  punpcklbw_r2r(mm1, mm5);

	  movq_r2r(mm7, mm1);
	  punpckhwd_r2r(mm5, mm7);
	  punpcklwd_r2r(mm5, mm1);

	  movq_r2r(mm2, mm4);
	  punpckhwd_r2r(mm0, mm2);
	  punpcklwd_r2r(mm0, mm4);

	  /* 32 output bytes, written with non-temporal stores */
	  movntq_r2m(mm1, *(dp1));
	  movntq_r2m(mm7, *(dp1 + 8));
	  movntq_r2m(mm4, *(dp1 + 16));
	  movntq_r2m(mm2, *(dp1 + 24));

	  yp1 += 8;
	  up += 4;
	  vp += 4;
	  dp1 += 8 * 4;
	}
      /* Two consecutive luma rows share one chroma row: after the first
	 (even) row of a pair, step the chroma pointers back so that the
	 following odd row reads the same chroma samples again. Rewinding
	 after the odd row instead would pair rows (1,2), (3,4), ... and
	 make the last luma row read one chroma row past the plane. */
      if ((yy & 0x1) == 0)
	{
	  up -= w / 2;
	  vp -= w / 2;
	}
    }
  emms();
}