Example #1
0
/*
 * mmx_end: final stage of a temporal/spatial threshold filter (MMX).
 *
 * NOTE(review): this is the tail of a larger MMX pipeline.  It assumes the
 * caller has already set up mm0/mm1 (running sums, low/high words),
 * mm2/mm3 (a previously loaded row, word-unpacked), mm4..mm7 (neighbour
 * rows / difference terms) and that mm_cpool[0] is all-zero -- confirm
 * against the companion mmx_start/mmx_middle code elsewhere in the file.
 *
 * src3: center source row
 * src5: source row two lines below
 * dst:  destination row
 * X:    byte offset of the 8-pixel group being processed
 */
static inline void mmx_end(uint8_t *src3, uint8_t *src5,
                           uint8_t *dst, int X)
{
    /* Unpack mm4/mm5 bytes to words (mm_cpool[0] presumed zero). */
    punpcklbw_m2r (mm_cpool[0], mm4);
    punpckhbw_m2r (mm_cpool[0], mm5);
    /* Subtract (unsigned saturating) the unpacked row from the sums. */
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    /* Load the lower row and unpack to low/high words. */
    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    /* Scale the accumulated sums down (>> 3). */
    psrlw_i2r (3, mm0);
    psrlw_i2r (3, mm1);
    /* Difference terms used for the threshold test below. */
    psubw_r2r (mm6, mm4);
    psubw_r2r (mm7, mm5);
    /* Pack the filtered value back to 8 bytes in mm0. */
    packuswb_r2r (mm1,mm0);
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    /* Build per-byte mask: difference above low threshold XOR above high
     * threshold, i.e. "in band" between mm_lthr and mm_hthr. */
    pcmpgtw_m2r (mm_lthr, mm4);
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);
    movq_r2r (mm4, mm5);
    /* Blend: filtered value where the mask is clear, original src3 pixel
     * where it is set, then store. */
    pandn_r2r (mm0, mm4);
    pand_m2r (src3[X], mm5);
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
Example #2
0
/*
 * Horizontal scaling of 4-channel 16-bit pixels with a generic
 * (variable tap count) filter, MMX version.
 *
 * ctx:        scaler context (filter table, sizes, strides)
 * scanline:   source line index to read from
 * dest_start: destination line
 *
 * Each output pixel accumulates factors_per_pixel weighted input pixels.
 * Inputs are pre-shifted right by 1 and the result shifted left by 3, so
 * the pmulhw products keep 13 significant bits during accumulation.
 * NOTE(review): factor_mask, max_13/min_13 and the LOAD_FACTOR_1_4 /
 * MOVQ_R2M macros are defined elsewhere in the gavl scaler sources.
 */
static void scale_uint16_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i, j;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input
 *  mm1: factor_mask
 *  mm2: Factor
 *  mm3: Output
 *  mm4: 
 *  mm5: 
 *  mm6: 0
 *  mm7: scratch
 *  
 */
  
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 8*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    /* Clear the accumulator for this destination pixel. */
    pxor_r2r(mm3, mm3);

    for(j = 0; j < ctx->table_h.factors_per_pixel; j++)
      {
      /* Load pixels */
      movq_m2r(*(src), mm0);
      psrlw_i2r(1, mm0);
      /* Load factors */
      LOAD_FACTOR_1_4;
      /* Multiply */
      pmulhw_r2r(mm7, mm0);
      paddw_r2r(mm0, mm3);
      //    DUMP_MM("mm3_2", mm3);
      src += 8;
      factors++;
      }
    /* Clip the accumulated value into the valid 13-bit range. */
    pminsw_m2r(max_13, mm3);
    pmaxsw_m2r(min_13, mm3);
    
    psllw_i2r(3, mm3);
    MOVQ_R2M(mm3, *dst);
    
    dst+=8;
    }
  /* MMX state left dirty on purpose; the caller runs emms() later. */
  ctx->need_emms = 1;
  
  }
Example #3
0
/*
 * Vertical deinterlacing filter (MMX).
 *
 * For each of `size` pixels computes
 *   dst = (4*(lum_m3 + lum_m1) + 2*lum_m2 - (lum_m4 + lum) + 4) >> 3
 * i.e. the 5-tap filter (-1 4 2 4 -1)/8 with rounding.  psubusw saturates
 * at zero, so negative sums clamp to 0, and packuswb clamps to 255.
 * Processes 4 pixels per iteration; the remainder (size % 4) is handed
 * to the C fallback deinterlace_line_c().
 */
static void
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
    uint8_t * lum_m3, uint8_t * lum_m2,
    uint8_t * lum_m1, uint8_t * lum, int size)
{
  mmx_t rounder;

  /* Rounding constant: +4 in each of the four words. */
  rounder.uw[0] = 4;
  rounder.uw[1] = 4;
  rounder.uw[2] = 4;
  rounder.uw[3] = 4;
  pxor_r2r (mm7, mm7);          /* mm7 = 0, used for byte->word unpack */
  movq_m2r (rounder, mm6);

  for (; size > 3; size -= 4) {
    movd_m2r (*lum_m4, mm0);
    movd_m2r (*lum_m3, mm1);
    movd_m2r (*lum_m2, mm2);
    movd_m2r (*lum_m1, mm3);
    movd_m2r (*lum, mm4);
    punpcklbw_r2r (mm7, mm0);
    punpcklbw_r2r (mm7, mm1);
    punpcklbw_r2r (mm7, mm2);
    punpcklbw_r2r (mm7, mm3);
    punpcklbw_r2r (mm7, mm4);
    paddw_r2r (mm3, mm1);       /* mm1 = m3 + m1 */
    psllw_i2r (1, mm2);         /* mm2 = 2*m2 */
    paddw_r2r (mm4, mm0);       /* mm0 = m4 + m0 (negative taps) */
    psllw_i2r (2, mm1);         /* mm1 = 4*(m3 + m1) */
    paddw_r2r (mm6, mm2);       /* + rounding constant */
    paddw_r2r (mm2, mm1);
    psubusw_r2r (mm0, mm1);     /* subtract negative taps, clamp at 0 */
    psrlw_i2r (3, mm1);         /* >> 3 */
    packuswb_r2r (mm7, mm1);    /* clamp to 255, pack to bytes */
    movd_r2m (mm1, *dst);
    lum_m4 += 4;
    lum_m3 += 4;
    lum_m2 += 4;
    lum_m1 += 4;
    lum += 4;
    dst += 4;
  }
  emms ();

  /* Handle odd widths */
  if (size > 0)
    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
Example #4
0
/*
 * Horizontal bicubic scaling of single-channel 16-bit pixels, MMX version.
 *
 * Each destination pixel is the weighted sum of 4 consecutive source
 * pixels; the integer weights come from ctx->table_h.  Inputs are
 * pre-shifted right by 1 bit so the pmaddwd products fit in signed 32
 * bits; the final >> 13 and RECLIP() restore and clamp the output range.
 */
static void scale_uint16_x_1_x_bicubic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint16_t * dst;
  uint8_t * src, *src_start;
  int32_t * factors;
  mmx_t tmp_mm;
  int32_t tmp;
  
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_mmx\n");

  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  dst = (uint16_t*)dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 2*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    /* Load pixels */
    movq_m2r(*(src), mm0);
    psrlw_i2r(1, mm0);
    //    DUMP_MM("mm0", mm0);
    /* Load factors */
    movq_m2r(*factors, mm2);
    movq_m2r(*(factors+2), mm3);
    /* Narrow the four 32-bit factors to 16-bit words. */
    packssdw_r2r(mm3, mm2);
    /* Multiply */
    pmaddwd_r2r(mm2, mm0);
    /* Horizontal add of the two partial dot products. */
    MOVQ_R2M(mm0, tmp_mm);
    tmp = tmp_mm.d[0] + tmp_mm.d[1];
    tmp >>= 13;
    RECLIP(tmp, ctx->plane);
    *(dst++) = tmp;
    }
  /* MMX state left dirty; caller runs emms() later. */
  ctx->need_emms = 1;
  }
Example #5
0
static void
_evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_MMX
   int xx, yy;
   register unsigned char *yp1, *up, *vp;
   unsigned char *dp1;

   /* destination pointers */
   dp1 = rgb;

   for (yy = 0; yy < h; yy++)
     {
	/* plane pointers */
	yp1 = yuv[yy];
	up = yuv[h + (yy / 2)];
	vp = yuv[h + (h / 2) + (yy / 2)];
	for (xx = 0; xx < (w - 7); xx += 8)
	  {
	     movd_m2r(*up, mm3);
	     movd_m2r(*vp, mm2);
	     movq_m2r(*yp1, mm0);

	     pxor_r2r(mm7, mm7);
	     punpcklbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm7, mm3);

	     movq_r2r(mm0, mm1);
	     psrlw_i2r(8, mm0);
	     psllw_i2r(8, mm1);
	     psrlw_i2r(8, mm1);

	     movq_m2r(CONST_16, mm4);
	     psubsw_r2r(mm4, mm0);
	     psubsw_r2r(mm4, mm1);

	     movq_m2r(CONST_128, mm5);
	     psubsw_r2r(mm5, mm2);
	     psubsw_r2r(mm5, mm3);

	     movq_m2r(CONST_YMUL, mm4);
	     pmullw_r2r(mm4, mm0);
	     pmullw_r2r(mm4, mm1);

	     movq_m2r(CONST_CRVCRV, mm7);
	     pmullw_r2r(mm3, mm7);
	     movq_m2r(CONST_CBUCBU, mm6);
	     pmullw_r2r(mm2, mm6);
	     movq_m2r(CONST_CGUCGU, mm5);
	     pmullw_r2r(mm2, mm5);
	     movq_m2r(CONST_CGVCGV, mm4);
	     pmullw_r2r(mm3, mm4);

	     movq_r2r(mm0, mm2);
	     paddsw_r2r(mm7, mm2);
	     paddsw_r2r(mm1, mm7);

	     psraw_i2r(RES, mm2);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm2);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm2, mm3);
	     punpckhbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm3, mm7);
	     por_r2r(mm7, mm2);

	     movq_r2r(mm0, mm3);
	     psubsw_r2r(mm5, mm3);
	     psubsw_r2r(mm4, mm3);
	     paddsw_m2r(CONST_32, mm3);

	     movq_r2r(mm1, mm7);
	     psubsw_r2r(mm5, mm7);
	     psubsw_r2r(mm4, mm7);
	     paddsw_m2r(CONST_32, mm7);

	     psraw_i2r(RES, mm3);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm3);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm3, mm4);
	     punpckhbw_r2r(mm7, mm3);
	     punpcklbw_r2r(mm4, mm7);
	     por_r2r(mm7, mm3);

	     movq_m2r(CONST_32, mm4);
	     paddsw_r2r(mm6, mm0);
	     paddsw_r2r(mm6, mm1);
	     paddsw_r2r(mm4, mm0);
	     paddsw_r2r(mm4, mm1);
	     psraw_i2r(RES, mm0);
	     psraw_i2r(RES, mm1);
	     packuswb_r2r(mm1, mm0);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm0, mm5);
	     punpckhbw_r2r(mm7, mm0);
	     punpcklbw_r2r(mm5, mm7);
	     por_r2r(mm7, mm0);

	     movq_m2r(CONST_FF, mm1);
	     movq_r2r(mm0, mm5);
	     movq_r2r(mm3, mm6);
	     movq_r2r(mm2, mm7);
	     punpckhbw_r2r(mm3, mm2);
	     punpcklbw_r2r(mm6, mm7);
	     punpckhbw_r2r(mm1, mm0);
	     punpcklbw_r2r(mm1, mm5);

	     movq_r2r(mm7, mm1);
	     punpckhwd_r2r(mm5, mm7);
	     punpcklwd_r2r(mm5, mm1);

	     movq_r2r(mm2, mm4);
	     punpckhwd_r2r(mm0, mm2);
	     punpcklwd_r2r(mm0, mm4);

	     movntq_r2m(mm1, *(dp1));
	     movntq_r2m(mm7, *(dp1 + 8));
	     movntq_r2m(mm4, *(dp1 + 16));
	     movntq_r2m(mm2, *(dp1 + 24));

	     yp1 += 8;
	     up += 4;
	     vp += 4;
	     dp1 += 8 * 4;
	  }
	/* cleanup pixles that arent a multiple of 8 pixels wide */
	if (xx < w)
	  {
	     int y, u, v, r, g, b;

	     for (; xx < w; xx += 2)
	       {
		  u = (*up++) - 128;
		  v = (*vp++) - 128;

		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;

		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;
	       }
	  }
     }
Example #6
0
/*
 * bsad_mmx: bidirectional sum of absolute differences (MMX).
 *
 * pf, pb:     forward / backward reference blocks
 * p2:         block being predicted (16 bytes per row are processed)
 * lx:         line stride
 * hxf, hyf:   forward half-pel offsets (0/1) -> neighbours pfa/pfb/pfc
 * hxb, hyb:   backward half-pel offsets (0/1) -> neighbours pba/pbb/pbc
 * h:          number of rows
 *
 * Each prediction is the rounded average of four (half-pel) samples;
 * the forward and backward predictions are then averaged with rounding
 * and SAD'ed against p2.  Returns the accumulated SAD and executes
 * emms() before returning.
 *
 * NOTE(review): BSAD_LOAD / BSAD_LOAD_ACC are project macros defined
 * elsewhere; presumably they load 8 bytes unpacked to words into the two
 * named registers (and accumulate for the _ACC form) -- confirm there.
 */
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h)
{
    uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
    int s, s1, s2;

    /* Half-pel neighbours of the forward reference. */
    pfa = pf + hxf;
    pfb = pf + lx * hyf;
    pfc = pfb + hxf;

    /* Half-pel neighbours of the backward reference. */
    pba = pb + hxb;
    pbb = pb + lx * hyb; 
    pbc = pbb + hxb;

    s = 0; /* the accumulator */

    if (h > 0)
    {
        /* mm7 = 0 (unpack helper); mm6 = 0x0002 in every word, the
         * rounding term for the /4 averages (built as 0 - (-1) << 1). */
        pxor_r2r(mm7, mm7);
        pxor_r2r(mm6, mm6);
        pcmpeqw_r2r(mm5, mm5);
        psubw_r2r(mm5, mm6);
        psllw_i2r(1, mm6);

        do {
            /* Left 8 pixels: forward prediction = (pf+pfa+pfb+pfc+2)/4 */
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pfa[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[0],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            /* Backward prediction = (pb+pba+pbb+pbc+2)/4 */
            BSAD_LOAD(pb[0],mm2,mm3);
            BSAD_LOAD_ACC(pba[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[0],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            /* Bidirectional average: (fwd + bwd + 1) / 2.  mm6 is
             * temporarily halved to 1 for the rounding, then restored. */
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            /* |prediction - p2| via the two saturated subtractions. */
            movq_m2r(p2[0], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            /* Horizontal sum of the 8 absolute differences into s. */
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);

            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            /* Right 8 pixels: identical computation at byte offset 8. */
            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pfa[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[8],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            BSAD_LOAD(pb[8],mm2,mm3);
            BSAD_LOAD_ACC(pba[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[8],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(p2[8], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);

            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            /* Advance all pointers one row. */
            p2  += lx;
            pf  += lx;
            pfa += lx;
            pfb += lx;
            pfc += lx;
            pb  += lx;
            pba += lx;
            pbb += lx;
            pbc += lx;

            h--;
        } while (h > 0);	
	
    }
	
    emms();

    return s;
}
Example #7
0
/*
 * bsad_1quad_mmxe: bidirectional SAD with quad-pel forward interpolation
 * (MMX + SSE integer extensions: pavgb / psadbw).
 *
 * pf:       forward reference; interpolated from pf, pf+1, pf+lx, pf+lx+1
 * pb, pb2:  two backward references, averaged with pavgb
 * p2:       block being predicted (16 bytes per row)
 * lx:       line stride
 * h:        number of rows
 *
 * Returns the accumulated SAD; executes emms() before returning.
 *
 * Fix: the original read the mm5 accumulator into `s` OUTSIDE the
 * `if (h > 0)` guard, overwriting s = 0 with an undefined register value
 * whenever h <= 0.  The read now happens only after the loop has run.
 */
static int bsad_1quad_mmxe(uint8_t *pf, uint8_t *pb, uint8_t *pb2, uint8_t *p2, int lx, int h)
{
    int s;

    s = 0; /* the accumulator */

    if (h > 0)
    {
        /* mm6 = 0x0002 in every word: rounding term for the /4 average. */
        pcmpeqw_r2r(mm6, mm6);
        psrlw_i2r(15, mm6);
        paddw_r2r(mm6, mm6);

        pxor_r2r(mm7, mm7);   /* constant zero */
        pxor_r2r(mm5, mm5);   /* SAD accumulator */

        do {
            /* Left 8 pixels: quad-pel average of the forward block. */
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pf[1],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+1],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);

            /* Backward prediction: rounded average of pb2 and pb ... */
            movq_m2r(pb2[0],mm1);
            pavgb_m2r(pb[0],mm1);

            /* ... averaged with the forward prediction, then SAD vs p2. */
            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[0],mm0);
            paddd_r2r(mm0,mm5);

            /* Right 8 pixels: identical computation at byte offset 8. */
            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pf[9],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+9],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(pb2[8],mm1);
            pavgb_m2r(pb[8],mm1);

            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[8],mm0);
            paddd_r2r(mm0,mm5);

            p2  += lx;
            pf  += lx;
            pb  += lx;
            pb2 += lx;

            h--;
        } while (h > 0);

        /* Read the accumulator only when the loop has run; mm5 is
         * undefined otherwise. */
        movd_r2g(mm5, s);
    }

    emms();

    return s;
}
Example #8
0
/*
 * Interpolate a packed 4:2:2 scanline as the per-byte average of the
 * lines above (top) and below (bot), MMX version.
 *
 * width is in pixels (2 bytes per pixel in packed 4:2:2).  The averaging
 * trick: mask off the low bit of each byte (shiftmask keeps chroma bits
 * from shifting into the neighbouring luma byte), halve with a word
 * shift, then add.  Processes 16 pixels (32 bytes) per iteration, then
 * 4-pixel groups, then single bytes.
 *
 * Fix: after the 4-pixel loop only width % 4 pixels remain; the original
 * mask `width & 0x7` over-counted the residual for widths with bit 2 set
 * and made the byte-tail loop read and write past the scanline.
 */
static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
                                                uint8_t *bot, int width )
{
    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
    int i;

    for( i = width/16; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        movq_m2r( *(bot + 8), mm2 );
        movq_m2r( *(top + 8), mm3 );
        movq_m2r( *(bot + 16), mm4 );
        movq_m2r( *(top + 16), mm5 );
        movq_m2r( *(bot + 24), mm6 );
        movq_m2r( *(top + 24), mm7 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        pand_m2r( shiftmask, mm2 );
        pand_m2r( shiftmask, mm3 );
        pand_m2r( shiftmask, mm4 );
        pand_m2r( shiftmask, mm5 );
        pand_m2r( shiftmask, mm6 );
        pand_m2r( shiftmask, mm7 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        psrlw_i2r( 1, mm2 );
        psrlw_i2r( 1, mm3 );
        psrlw_i2r( 1, mm4 );
        psrlw_i2r( 1, mm5 );
        psrlw_i2r( 1, mm6 );
        psrlw_i2r( 1, mm7 );
        paddb_r2r( mm1, mm0 );
        paddb_r2r( mm3, mm2 );
        paddb_r2r( mm5, mm4 );
        paddb_r2r( mm7, mm6 );
        movq_r2m( mm0, *output );
        movq_r2m( mm2, *(output + 8) );
        movq_r2m( mm4, *(output + 16) );
        movq_r2m( mm6, *(output + 24) );
        output += 32;
        top += 32;
        bot += 32;
    }
    width = (width & 0xf);

    for( i = width/4; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        paddb_r2r( mm1, mm0 );
        movq_r2m( mm0, *output );
        output += 8;
        top += 8;
        bot += 8;
    }
    /* The loop above consumed 4-pixel groups, so width % 4 pixels remain
     * (was `width & 0x7`, which could overrun by up to 8 bytes). */
    width = width & 0x3;

    /* Handle last few pixels (2 bytes per pixel). */
    for( i = width * 2; i; --i ) {
        *output++ = ((*top++) + (*bot++)) >> 1;
    }

    emms();
}
Example #9
0
/*
 * Horizontal quadratic (3-tap) scaling of 4-channel 16-bit pixels, MMX
 * version.  The 3 taps are fully unrolled; no clipping is applied (the
 * _NOCLIP factor loader is used), unlike the generic variant.  Inputs are
 * pre-shifted right by 1 and the result shifted left by 3, keeping 13
 * significant bits through the pmulhw accumulation.
 */
static void scale_uint16_x_4_x_quadratic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input
 *  mm1: factor_mask
 *  mm2: Factor
 *  mm3: Output
 *  mm4: 
 *  mm5: 
 *  mm6: 0
 *  mm7: scratch
 *  
 */
  
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_noclip_mmx\n");
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 8*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    
    /* Tap 1: initialize the accumulator. */
    /* Load pixels */
    movq_m2r(*(src), mm0);
    //    punpcklbw_r2r(mm6, mm0);
    psrlw_i2r(1, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    movq_r2r(mm0, mm3);
    //    DUMP_MM("mm3_1", mm3);
    src += 8;
    factors++;
    
    /* Tap 2: accumulate. */
    /* Load pixels */
    movq_m2r(*(src), mm0);
    //    punpcklbw_r2r(mm6, mm0);
    psrlw_i2r(1, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    paddw_r2r(mm0, mm3);
    //    DUMP_MM("mm3_2", mm3);
    src += 8;
    factors++;

    /* Tap 3: accumulate. */
    /* Load pixels */
    movq_m2r(*(src), mm0);
    //    punpcklbw_r2r(mm6, mm0);
    psrlw_i2r(1, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    paddw_r2r(mm0, mm3);
    //    DUMP_MM("mm3_3", mm3);
    src += 8;
    
    psllw_i2r(3, mm3);
    //    packuswb_r2r(mm6, mm3);
    MOVQ_R2M(mm3, *dst);
    
    dst+=8;
    }
  /* MMX state left dirty; caller runs emms() later. */
  ctx->need_emms = 1;
  }
Example #10
0
/*
 * Greedy (low-motion) deinterlacing of one scanline, MMX version.
 *
 * For each byte, choose between the current-field pixel (m0) and the
 * previous-field pixel (m2) -- whichever "combs" less against the average
 * of the lines above (t1) and below (b1) -- then clamp the winner to
 * [min(t1,b1) - max_comb, max(t1,b1) + max_comb].
 * Processes 8 bytes per iteration; any remainder (< 8) is delegated to
 * deinterlace_greedy_scanline_c().
 */
static void
deinterlace_greedy_scanline_mmx (GstDeinterlaceMethodGreedyL * self,
    const guint8 * m0, const guint8 * t1,
    const guint8 * b1, const guint8 * m2, guint8 * output, gint width)
{
  mmx_t MaxComb;
  mmx_t ShiftMask;

  // How badly do we let it weave? 0-255
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  // Mask for halving bytes with a word shift without cross-byte bleed.
  ShiftMask.ub[0] = 0x7f;
  ShiftMask.ub[1] = 0x7f;
  ShiftMask.ub[2] = 0x7f;
  ShiftMask.ub[3] = 0x7f;
  ShiftMask.ub[4] = 0x7f;
  ShiftMask.ub[5] = 0x7f;
  ShiftMask.ub[6] = 0x7f;
  ShiftMask.ub[7] = 0x7f;

  // L2 == m0
  // L1 == t1
  // L3 == b1
  // LP2 == m2  

  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3 leave result in mm4
    movq_r2r (mm1, mm4);        // L1
    movq_r2r (mm3, mm5);        // L3
    psrlw_i2r (1, mm4);         // L1/2
    pand_m2r (ShiftMask, mm4);
    psrlw_i2r (1, mm5);         // L3/2
    pand_m2r (ShiftMask, mm5);
    paddusb_r2r (mm5, mm4);     // (L1 + L3) / 2

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2
    por_r2r (mm7, mm5);         // abs(avg-L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg
    psubusb_r2r (mm0, mm4);     // avg - LP2
    por_r2r (mm7, mm4);         // abs(avg-LP2)

    // use L2 or LP2 depending upon which makes smaller comb
    psubusb_r2r (mm5, mm4);     // see if it goes to zero
    psubusb_r2r (mm5, mm5);     // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4

    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 = ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now lets clip our chosen value to be not outside of the range
    // of the high/low range L1-L3 by more than abs(L1-L3)
    // This allows some comb but limits the damages and also allows more
    // detail than a boring oversmoothed clip.

    movq_r2r (mm1, mm2);        // copy L1
    psubusb_r2r (mm3, mm2);     // - L3, with saturation
    paddusb_r2r (mm3, mm2);     // now = Max(L1,L3)

    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm1, mm7);     // - L1 
    paddusb_r2r (mm7, mm3);     // add, may sat at fff..
    psubusb_r2r (mm7, mm3);     // now = Min(L1,L3)

    // allow the value to be above the high or below the low by amt of MaxComb
    paddusb_r2r (mm6, mm2);     // increase max by diff
    psubusb_r2r (mm6, mm3);     // lower min by diff

    psubusb_r2r (mm3, mm4);     // best - Min
    paddusb_r2r (mm3, mm4);     // now = Max(best,Min(L1,L3)

    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm4, mm7);     // - Max(best,Min(best,L3) 
    paddusb_r2r (mm7, mm2);     // add may sat at FFF..
    psubusb_r2r (mm7, mm2);     // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped

    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  emms ();
  if (width > 0)
    deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
Example #11
0
/*
 * Horizontal bilinear scaling of single-channel 8-bit pixels, MMX version.
 *
 * Each destination pixel is factor[0]*src[index] + factor[1]*src[index+1],
 * with 14-bit integer factors (>> 14 in the scalar tail below).  The MMX
 * loop gathers two 2-pixel pairs into one quadword and computes 4 output
 * pixels per iteration with pmaddwd; the combined shifts (psrld 7, psrlw 7)
 * perform the same 14-bit normalization.  The dst_size % 4 remainder is
 * computed in plain C.
 */
static void scale_uint8_x_1_x_bilinear_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i, imax, index;
  uint8_t * src, * dst, *src_start;
  mmx_t tmp_mm;

  
/*
 *  mm0: Input1 Input2
 *  mm1: Factor
 *  mm2:
 *  mm3: 
 *  mm4: 
 *  mm5: 
 *  mm6: 0
 *  mm7: scratch
 *  
 */
  
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  dst = dest_start;

  imax = ctx->dst_size / 4;
  //  imax = 0;
  index = 0;
  
  for(i = 0; i < imax; i++)
    {
    
    /* Gather source pairs for output pixels 0 and 1 into one quadword. */
    /* Load pixels */
    src = src_start + ctx->table_h.pixels[index].index;
    tmp_mm.uw[0] = *src;
    tmp_mm.uw[1] = *(src+1);
    
    src = src_start + ctx->table_h.pixels[index+1].index;
    tmp_mm.uw[2] = *src;
    tmp_mm.uw[3] = *(src+1);
    
    movq_m2r(tmp_mm, mm0);
    /* Load factors */
    movq_m2r(ctx->table_h.pixels[index].factor_i[0], mm1);
    movq_m2r(ctx->table_h.pixels[index+1].factor_i[0], mm7);

    /* Narrow the 32-bit factors to words, then dot-product with pixels. */
    packssdw_r2r(mm7, mm1);
    pmaddwd_r2r(mm0, mm1);

    index += 2;
    
    /* Same again for output pixels 2 and 3. */
    /* Load pixels */
    src = src_start + ctx->table_h.pixels[index].index;
    tmp_mm.uw[0] = *src;
    tmp_mm.uw[1] = *(src+1);
    
    src = src_start + ctx->table_h.pixels[index+1].index;
    tmp_mm.uw[2] = *src;
    tmp_mm.uw[3] = *(src+1);
    
    movq_m2r(tmp_mm, mm0);
    /* Load factors */
    movq_m2r(ctx->table_h.pixels[index].factor_i[0], mm3);
    movq_m2r(ctx->table_h.pixels[index+1].factor_i[0], mm7);
    packssdw_r2r(mm7, mm3);
    pmaddwd_r2r(mm0, mm3);
    
    /* Normalize: >> 7 twice == >> 14 total, matching the scalar tail. */
    psrld_i2r(7, mm3);
    psrld_i2r(7, mm1);
    packssdw_r2r(mm3, mm1);
    psrlw_i2r(7, mm1);
    index += 2;
    
    packuswb_r2r(mm6, mm1);
    
    movd_r2m(mm1, *dst);
    //    *dst      = tmp_mm.ub[0];
    //    *(dst+1) = tmp_mm.ub[4];
    dst+=4;
    }
  /* MMX state left dirty; caller runs emms() later. */
  ctx->need_emms = 1;

#if 1
  /* Scalar tail: remaining dst_size % 4 pixels. */
  imax = ctx->dst_size % 4;
  for(i = 0; i < imax; i++)
    {
    src = (src_start + ctx->table_h.pixels[index].index);
    *dst = (ctx->table_h.pixels[index].factor_i[0] * *src +
      ctx->table_h.pixels[index].factor_i[1] * *(src+1)) >> 14;
    dst++;
    index++;
    }
#endif
  }
Example #12
0
/*
 * Linear deinterlacing of one scanline, MMX version: each output byte is
 * the average of the field lines above (t0) and below (b0).
 *
 * width is in pixels (2 bytes per pixel, packed 4:2:2).  The averaging
 * trick: mask off each byte's low bit (shiftmask prevents chroma bits
 * shifting into luma), halve with a word shift, then add.  Processes
 * 16 pixels (32 bytes), then 4-pixel groups, then single bytes.
 *
 * Fix: after the 4-pixel loop only width % 4 pixels remain; the original
 * mask `width & 0x7` over-counted the residual for widths with bit 2 set,
 * making the byte-tail loop read and write past the scanline.
 */
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  /* 4-pixel groups consumed above, so width % 4 pixels remain
   * (was `width & 0x7`, which could overrun by up to 8 bytes). */
  width = width & 0x3;

  /* Handle last few pixels (2 bytes per pixel). */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }

  emms ();
}
Example #13
0
File: vfir.c Project: jerbs/sinema
/*
 * Vertical deinterlacing filter.
 *
 * Computes dst = (4*(lum_m3 + lum_m1) + 2*lum_m2 - (lum_m4 + lum) + 4) >> 3
 * for `size` pixels, i.e. the 5-tap filter (-1 4 2 4 -1)/8 with rounding.
 * On x86 an MMX path handles 4 pixels per iteration (psubusw clamps
 * negative sums to 0, packuswb clamps to 255; the size % 4 remainder is
 * dropped by the `size > 3` condition).  Other architectures use the C
 * path, which -- as noted below -- does not clip the result.
 */
static void deinterlace_line( uint8_t *dst, uint8_t *lum_m4,
                              uint8_t *lum_m3, uint8_t *lum_m2,
                              uint8_t *lum_m1, uint8_t *lum, int size )
{
#if defined(__i386__) || defined(__x86_64__)
    mmx_t rounder;

    /* Rounding constant: +4 in each word. */
    rounder.uw[0]=4;
    rounder.uw[1]=4;
    rounder.uw[2]=4;
    rounder.uw[3]=4;
    pxor_r2r(mm7,mm7);      /* mm7 = 0, used for byte->word unpack */
    movq_m2r(rounder,mm6);

    for (;size > 3; size-=4) {
        movd_m2r(lum_m4[0],mm0);
        movd_m2r(lum_m3[0],mm1);
        movd_m2r(lum_m2[0],mm2);
        movd_m2r(lum_m1[0],mm3);
        movd_m2r(lum[0],mm4);
        punpcklbw_r2r(mm7,mm0);
        punpcklbw_r2r(mm7,mm1);
        punpcklbw_r2r(mm7,mm2);
        punpcklbw_r2r(mm7,mm3);
        punpcklbw_r2r(mm7,mm4);
        paddw_r2r(mm3,mm1);      /* mm1 = m3 + m1 */
        psllw_i2r(1,mm2);        /* mm2 = 2*m2 */
        paddw_r2r(mm4,mm0);      /* mm0 = m4 + m0 (negative taps) */
        psllw_i2r(2,mm1);        /* mm1 = 4*(m3 + m1) */
        paddw_r2r(mm6,mm2);      /* + rounding constant */
        paddw_r2r(mm2,mm1);
        psubusw_r2r(mm0,mm1);    /* subtract negative taps, clamp at 0 */
        psrlw_i2r(3,mm1);        /* >> 3 */
        packuswb_r2r(mm7,mm1);   /* clamp to 255, pack to bytes */
        movd_r2m(mm1,dst[0]);
        lum_m4+=4;
        lum_m3+=4;
        lum_m2+=4;
        lum_m1+=4;
        lum+=4;
        dst+=4;
    }
    emms();
#else
    /**
     * C implementation.
     */
    int sum;

    for(;size > 0;size--) {
        sum = -lum_m4[0];
        sum += lum_m3[0] << 2;
        sum += lum_m2[0] << 1;
        sum += lum_m1[0] << 2;
        sum += -lum[0];
        dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
        lum_m4++;
        lum_m3++;
        lum_m2++;
        lum_m1++;
        lum++;
        dst++;
    }
#endif
}
Example #14
0
/*
 * Convert a planar YUV image to 32-bit RGB (0xffRRGGBB byte layout per
 * the interleave sequence below) using MMX, writing with non-temporal
 * stores (movntq).
 *
 * yuv: planar buffer -- w*h luma bytes followed by the chroma planes
 *      (U at yuv + w*h, V at w/4 of that again; see pointer setup).
 * rgb: destination, 4 bytes per pixel.
 * w,h: image dimensions.  NOTE(review): w is assumed to be a multiple
 *      of 8 (the inner loop has no tail handling) and h even -- confirm
 *      with callers.
 *
 * Fix: the original initialized yp1/up/vp/dp1 twice back to back; the
 * first assignments (including a conflicting formula for vp) were dead
 * stores, immediately overwritten.  Only the effective initialization
 * is kept.
 */
void
yuv411planar_to_rgb_mmx (const unsigned char *yuv, unsigned char *rgb,
			 unsigned int w, unsigned int h)
{
  unsigned int xx, yy;
  register const unsigned char *yp1, *up, *vp;
  unsigned char *dp1;

  /* plane pointers */
  yp1 = yuv;
  up = yuv + (w * h);
  vp = up + ((w / 2) * (h / 2));
  /* destination pointer */
  dp1 = rgb;
  for (yy = 0; yy < h; yy++)
    {
      for (xx = 0; xx < w; xx += 8)
	{
	  /* Split 8 luma bytes into even (mm0) / odd (mm1) words. */
	  movq_m2r(*yp1, mm0);
	  movq_r2r(mm0, mm1);
	  psrlw_i2r(8, mm0);
	  psllw_i2r(8, mm1);
	  psrlw_i2r(8, mm1);

	  /* Unpack 4 U (mm3) and 4 V (mm2) bytes to words. */
	  pxor_r2r(mm7, mm7);
	  movd_m2r(*up, mm3);
	  movd_m2r(*vp, mm2);

	  punpcklbw_r2r(mm7, mm2);
	  punpcklbw_r2r(mm7, mm3);

	  /* Y = (Y - 16) * YMUL; U,V centered around 128. */
	  movq_m2r(CONST_16, mm4);
	  psubsw_r2r(mm4, mm0);
	  psubsw_r2r(mm4, mm1);

	  movq_m2r(CONST_128, mm5);
	  psubsw_r2r(mm5, mm2);
	  psubsw_r2r(mm5, mm3);

	  movq_m2r(CONST_YMUL, mm4);
	  pmullw_r2r(mm4, mm0);
	  pmullw_r2r(mm4, mm1);

	  /* Chroma contributions: mm7 = V*crv, mm6 = U*cbu,
	   * mm5 = U*cgu, mm4 = V*cgv. */
	  movq_m2r(CONST_CRVCRV, mm7);
	  pmullw_r2r(mm3, mm7);

	  movq_m2r(CONST_CBUCBU, mm6);
	  pmullw_r2r(mm2, mm6);

	  movq_m2r(CONST_CGUCGU, mm5);
	  pmullw_r2r(mm2, mm5);

	  movq_m2r(CONST_CGVCGV, mm4);
	  pmullw_r2r(mm3, mm4);

	  /* Red: Y + V*crv, for even (mm2) and odd (mm7) pixels. */
	  movq_r2r(mm0, mm2);
	  paddsw_r2r(mm7, mm2);
	  paddsw_r2r(mm1, mm7);

	  psraw_i2r(RES, mm2);
	  psraw_i2r(RES, mm7);
	  packuswb_r2r(mm7, mm2);

	  /* Re-interleave even/odd bytes back into pixel order. */
	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm2, mm3);
	  punpckhbw_r2r(mm7, mm2);
	  punpcklbw_r2r(mm3, mm7);
	  por_r2r(mm7, mm2);

	  /* Green: Y - U*cgu - V*cgv + rounding. */
	  movq_r2r(mm0, mm3);
	  psubsw_r2r(mm5, mm3);
	  psubsw_r2r(mm4, mm3);
	  paddsw_m2r(CONST_32, mm3);

	  movq_r2r(mm1, mm7);
	  psubsw_r2r(mm5, mm7);
	  psubsw_r2r(mm4, mm7);
	  paddsw_m2r(CONST_32, mm7);

	  psraw_i2r(RES, mm3);
	  psraw_i2r(RES, mm7);
	  packuswb_r2r(mm7, mm3);

	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm3, mm4);
	  punpckhbw_r2r(mm7, mm3);
	  punpcklbw_r2r(mm4, mm7);
	  por_r2r(mm7, mm3);

	  /* Blue: Y + U*cbu + rounding. */
	  movq_m2r(CONST_32, mm4);
	  paddsw_r2r(mm6, mm0);
	  paddsw_r2r(mm6, mm1);
	  paddsw_r2r(mm4, mm0);
	  paddsw_r2r(mm4, mm1);
	  psraw_i2r(RES, mm0);
	  psraw_i2r(RES, mm1);
	  packuswb_r2r(mm1, mm0);

	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm0, mm5);
	  punpckhbw_r2r(mm7, mm0);
	  punpcklbw_r2r(mm5, mm7);
	  por_r2r(mm7, mm0);

	  /* Interleave B/G/R/0xff... channels into 4 output quadwords. */
	  pxor_r2r(mm1, mm1);
	  movq_r2r(mm0, mm5);
	  movq_r2r(mm3, mm6);
	  movq_r2r(mm2, mm7);
	  punpckhbw_r2r(mm3, mm2);
	  punpcklbw_r2r(mm6, mm7);
	  punpckhbw_r2r(mm1, mm0);
	  punpcklbw_r2r(mm1, mm5);

	  movq_r2r(mm7, mm1);
	  punpckhwd_r2r(mm5, mm7);
	  punpcklwd_r2r(mm5, mm1);

	  movq_r2r(mm2, mm4);
	  punpckhwd_r2r(mm0, mm2);
	  punpcklwd_r2r(mm0, mm4);

	  /* Non-temporal stores: 8 pixels = 32 bytes. */
	  movntq_r2m(mm1, *(dp1));
	  movntq_r2m(mm7, *(dp1 + 8));
	  movntq_r2m(mm4, *(dp1 + 16));
	  movntq_r2m(mm2, *(dp1 + 24));

	  yp1 += 8;
	  up += 4;
	  vp += 4;
	  dp1 += 8 * 4;
	}
      /* Chroma is vertically subsampled by 2: rewind the chroma
       * pointers on odd lines so each chroma row is used twice. */
      if (yy & 0x1)
	{
	  up -= w / 2;
	  vp -= w / 2;
	}
    }
  emms();
}
Example #15
0
/*
 * Linear deinterlacing of one scanline, MMX version: each output byte is
 * the average of the corresponding bytes of `top` and `bot`.
 *
 * size is in BYTES here (unlike the pixel-counted variants).  The
 * averaging trick: mask off each byte's low bit (shiftmask prevents
 * chroma bits shifting into luma), halve with a word shift, then add.
 * Processes 32-byte chunks, then 8-byte chunks, then single bytes.
 *
 * Fix: after the 8-byte loop only size % 8 bytes remain; the original
 * mask `size & 0xf` over-counted the residual whenever bit 3 of size was
 * set, making the byte-tail loop read and write up to 8 bytes past the
 * scanline.
 */
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;

  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  /* 8-byte chunks consumed above, so size % 8 bytes remain
   * (was `size & 0xf`, which could overrun by up to 8 bytes). */
  size = size & 0x7;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
Example #16
0
/*
 * Horizontal scaling of single-channel 16-bit pixels with a generic
 * (variable tap count) filter, MMX version.
 *
 * Taps are processed 4 at a time with pmaddwd into a 64-bit pair of
 * 32-bit accumulators (mm4); the remaining factors_per_pixel % 4 taps
 * are accumulated in plain C.  Inputs are pre-shifted right by 1 bit so
 * products fit in signed 32 bits; the final >> 13 and RECLIP() restore
 * and clamp the output range (matching the scalar path's (*src)>>1).
 */
static void scale_uint16_x_1_x_generic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i, j, jmax;
  uint16_t * src, * dst;
  uint8_t * src_start;
  int32_t * factors;
  mmx_t tmp_mm;
  int tmp;
  
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  dst = (uint16_t*)dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = (uint16_t*)(src_start + 2*ctx->table_h.pixels[i].index);
    factors = ctx->table_h.pixels[i].factor_i;

    jmax = ctx->table_h.factors_per_pixel / 4;
    tmp = 0;
#if 1
    /* mm4 = two 32-bit partial sums. */
    pxor_r2r(mm4, mm4);

    for(j = 0; j < jmax; j++)
      {
      /* Load pixels */
      movq_m2r(*(src), mm0);
      psrlw_i2r(1, mm0);
      //    DUMP_MM("mm0", mm0);
      /* Load factors */
      movq_m2r(*factors, mm2);
      movq_m2r(*(factors+2), mm3);
      packssdw_r2r(mm3, mm2);
      /* Multiply */
      pmaddwd_r2r(mm2, mm0);
      paddd_r2r(mm0, mm4);
      src += 4;
      factors += 4;
      }

    /* Horizontal add of the two partial sums. */
    MOVQ_R2M(mm4, tmp_mm);
    tmp = tmp_mm.d[0] + tmp_mm.d[1];

    
    /* Scalar tail for the remaining taps. */
    jmax = ctx->table_h.factors_per_pixel % 4;
#else
    jmax = ctx->table_h.factors_per_pixel;
#endif    
    for(j = 0; j < jmax; j++)
      {
      tmp += *factors * ((*src)>>1);
      factors++;
      src++;
      }

    
    //    if(tmp > (255 << 14)) tmp = 255 << 14;
    //    if(tmp < 0) tmp = 0;
    tmp >>= 13;
    RECLIP(tmp, ctx->plane);
    *(dst++) = tmp;
    
    }
  /* MMX state left dirty; caller runs emms() later. */
  ctx->need_emms = 1;
  }
Example #17
0
/* Convert 8 luma samples (py) plus 4 Cb (pu) and 4 Cr (pv) samples —
 * i.e. 8 horizontally subsampled 4:2:2/4:2:0 pixels — into packed RGB
 * held in MMX registers.  On return:
 *   mm0 = B7..B0, mm1 = R7..R0, mm2 = G7..G0 (unsigned bytes, saturated).
 * The caller is expected to interleave/store these and issue emms().
 *
 * Fixed-point coefficients: Y is offset by 16 (psubusb mmx_10w) and
 * chroma by 128 (psubsw mmx_80w), which matches a 16..235 studio-range
 * conversion (values look like BT.601 scaled for pmulhw — TODO confirm
 * exact derivation).  Even/odd pixels are processed in parallel by
 * splitting Y into even bytes (mm6) and odd bytes (mm7).
 */
static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
{
    static mmx_t mmx_80w = {0x0080008000800080LL};      /* chroma bias 128 */
    static mmx_t mmx_U_green = {0xf37df37df37df37dLL};  /* U -> G (negative) */
    static mmx_t mmx_U_blue = {0x4093409340934093LL};   /* U -> B */
    static mmx_t mmx_V_red = {0x3312331233123312LL};    /* V -> R */
    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};  /* V -> G (negative) */
    static mmx_t mmx_10w = {0x1010101010101010LL};      /* luma offset 16 */
    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};    /* even-byte mask */
    static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};  /* luma gain */

    movd_m2r (*pu, mm0);		/* mm0 = 00 00 00 00 u3 u2 u1 u0 */
    movd_m2r (*pv, mm1);		/* mm1 = 00 00 00 00 v3 v2 v1 v0 */
    movq_m2r (*py, mm6);		/* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pxor_r2r (mm4, mm4);		/* mm4 = 0 */
    /* XXX might do cache preload for image here */

    /*
     * Do the multiply part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    punpcklbw_r2r (mm4, mm0);		/* mm0 = u3 u2 u1 u0 */
    punpcklbw_r2r (mm4, mm1);		/* mm1 = v3 v2 v1 v0 */
    psubsw_m2r (mmx_80w, mm0);		/* u -= 128 */
    psubsw_m2r (mmx_80w, mm1);		/* v -= 128 */
    psllw_i2r (3, mm0);			/* promote precision */
    psllw_i2r (3, mm1);			/* promote precision */
    movq_r2r (mm0, mm2);		/* mm2 = u3 u2 u1 u0 */
    movq_r2r (mm1, mm3);		/* mm3 = v3 v2 v1 v0 */
    pmulhw_m2r (mmx_U_green, mm2);	/* mm2 = u * u_green */
    pmulhw_m2r (mmx_V_green, mm3);	/* mm3 = v * v_green */
    pmulhw_m2r (mmx_U_blue, mm0);	/* mm0 = chroma_b */
    pmulhw_m2r (mmx_V_red, mm1);	/* mm1 = chroma_r */
    paddsw_r2r (mm3, mm2);		/* mm2 = chroma_g */

    psubusb_m2r (mmx_10w, mm6);		/* Y -= 16 (unsigned, clamps at 0) */
    movq_r2r (mm6, mm7);		/* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pand_m2r (mmx_00ffw, mm6);		/* mm6 =    Y6    Y4    Y2    Y0 */
    psrlw_i2r (8, mm7);			/* mm7 =    Y7    Y5    Y3    Y1 */
    psllw_i2r (3, mm6);			/* promote precision */
    psllw_i2r (3, mm7);			/* promote precision */
    pmulhw_m2r (mmx_Y_coeff, mm6);	/* mm6 = luma_rgb even */
    pmulhw_m2r (mmx_Y_coeff, mm7);	/* mm7 = luma_rgb odd */

    /*
     * Do the addition part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    movq_r2r (mm0, mm3);		/* mm3 = chroma_b */
    movq_r2r (mm1, mm4);		/* mm4 = chroma_r */
    movq_r2r (mm2, mm5);		/* mm5 = chroma_g */
    paddsw_r2r (mm6, mm0);		/* mm0 = B6 B4 B2 B0 */
    paddsw_r2r (mm7, mm3);		/* mm3 = B7 B5 B3 B1 */
    paddsw_r2r (mm6, mm1);		/* mm1 = R6 R4 R2 R0 */
    paddsw_r2r (mm7, mm4);		/* mm4 = R7 R5 R3 R1 */
    paddsw_r2r (mm6, mm2);		/* mm2 = G6 G4 G2 G0 */
    paddsw_r2r (mm7, mm5);		/* mm5 = G7 G5 G3 G1 */
    packuswb_r2r (mm0, mm0);		/* saturate to 0-255 */
    packuswb_r2r (mm1, mm1);		/* saturate to 0-255 */
    packuswb_r2r (mm2, mm2);		/* saturate to 0-255 */
    packuswb_r2r (mm3, mm3);		/* saturate to 0-255 */
    packuswb_r2r (mm4, mm4);		/* saturate to 0-255 */
    packuswb_r2r (mm5, mm5);		/* saturate to 0-255 */
    /* Interleave even/odd back into pixel order */
    punpcklbw_r2r (mm3, mm0);		/* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
    punpcklbw_r2r (mm4, mm1);		/* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
    punpcklbw_r2r (mm5, mm2);		/* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
}
/* Bob-style deinterlacer for packed YUV data using MMX.
 *
 * psrc holds an interleaved frame (even field at psrc, odd field at
 * psrc + width); pdst receives a deinterlaced frame of the same size.
 * Field lines are copied through verbatim; the missing lines are
 * synthesized per 8-byte group: the average of the lines above and
 * below is used where an edge-detection test says the field lines
 * agree, otherwise the pixel from the opposite field (YVal2) is kept
 * to avoid jaggies.  The test computes
 *   (O1 - E) * (O2 - E) - EdgeDetect * ((O1 - O2)^2 >> 12)
 * on luma and compares it against JaggieThreshold.
 *
 * NOTE(review): IsOdd is hard-coded to 1, so the even-field branches
 * below are currently unreachable dead code — confirm whether this was
 * meant to be a parameter.
 */
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc,
			     int width, int height )
{

  int Line;
  long long* YVal1;
  long long* YVal2;
  long long* YVal3;
  long long* Dest;
  uint8_t* pEvenLines = psrc;
  uint8_t* pOddLines = psrc+width;
  int LineLength = width;
  int Pitch = width * 2;            /* stride between lines of one field */
  int IsOdd = 1;                    /* see NOTE(review) above */
  long EdgeDetect = 625;
  long JaggieThreshold = 73;

  int n;

  unsigned long long qwEdgeDetect;
  unsigned long long qwThreshold;
  const unsigned long long Mask = 0xfefefefefefefefeULL;   /* clear LSB of every byte */
  const unsigned long long YMask = 0x00ff00ff00ff00ffULL;  /* keep luma bytes of YUYV */

  /* Replicate the 16-bit constants into all four words of a quadword */
  qwEdgeDetect = EdgeDetect;
  qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
  qwThreshold = JaggieThreshold;
  qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);


  // copy first even line no matter what, and the first odd line if we're
  // processing an odd field.
  ac_memcpy(pdst, pEvenLines, LineLength);
  if (IsOdd)
    ac_memcpy(pdst + LineLength, pOddLines, LineLength);

  height = height / 2;              /* from here on, height counts field lines */
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (long long *)(pOddLines + Line * Pitch);         /* O1: line above */
      YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch);  /* E: opposite field */
      YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch);   /* O2: line below */
      Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength);
    }
    else
    {
      YVal1 = (long long *)(pEvenLines + Line * Pitch);
      YVal2 = (long long *)(pOddLines + Line * Pitch);
      YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch);
      Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength);
    }

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that bIsOdd is true).  The exact same processing is done when we
    // operate on an even field, but the roles of the odd and even fields are reversed.
    // It's just too cumbersome to explain the algorithm in terms of "the next odd
    // line if we're doing an odd field, or the next even line if we're doing an
    // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
    // half the time this function is called, those words' meanings will invert.

    // Copy the odd line to the overlay verbatim.
    ac_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    n = LineLength >> 3;            /* process 8 bytes (4 YUYV pixels) per pass */
    while( n-- )
    {
      movq_m2r (*YVal1++, mm0);     /* mm0 = O1 */
      movq_m2r (*YVal2++, mm1);     /* mm1 = E  */
      movq_m2r (*YVal3++, mm2);     /* mm2 = O2 */

      // get luma intensities in mm3 - mm5
      movq_r2r ( mm0, mm3 );
      movq_r2r ( mm1, mm4 );
      movq_r2r ( mm2, mm5 );

      pand_m2r ( *&YMask, mm3 );
      pand_m2r ( *&YMask, mm4 );
      pand_m2r ( *&YMask, mm5 );

      // get average in mm0: mask off each byte's LSB so the word-wide
      // shift cannot leak a bit across byte boundaries, then halve and add
      pand_m2r ( *&Mask, mm0 );
      pand_m2r ( *&Mask, mm2 );
      psrlw_i2r ( 01, mm0 );
      psrlw_i2r ( 01, mm2 );
      paddw_r2r ( mm2, mm0 );

      // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
      // result will be in mm6

      psrlw_i2r ( 01, mm3 );
      psrlw_i2r ( 01, mm4 );
      psrlw_i2r ( 01, mm5 );

      movq_r2r ( mm3, mm6 );
      psubw_r2r ( mm4, mm6 );	//mm6 = O1 - E

      movq_r2r ( mm5, mm7 );
      psubw_r2r ( mm4, mm7 );	//mm7 = O2 - E

      pmullw_r2r ( mm7, mm6 );		// mm6 = (O1 - E) * (O2 - E)

      movq_r2r ( mm3, mm7 );
      psubw_r2r ( mm5, mm7 );		// mm7 = (O1 - O2)
      pmullw_r2r ( mm7, mm7 );	// mm7 = (O1 - O2) ^ 2
      psrlw_i2r ( 12, mm7 );		// mm7 = (O1 - O2) ^ 2 >> 12
      pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7  = EdgeDetect * (O1 - O2) ^ 2 >> 12

      psubw_r2r ( mm7, mm6 );      // mm6 is what we want

      // per-word select mask: all-ones where field lines agree (use average)
      pcmpgtw_m2r ( *&qwThreshold, mm6 );

      movq_r2r ( mm6, mm7 );

      pand_r2r ( mm6, mm0 );        // keep average where mask set

      pandn_r2r ( mm1, mm7 );       // keep E where mask clear

      por_r2r ( mm0, mm7 );         // merge the two selections

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy last odd line if we're processing an even field.
  if (! IsOdd)
  {
    ac_memcpy(pdst + (height * 2 - 1) * LineLength,
                      pOddLines + (height - 1) * Pitch,
                      LineLength);
  }

  // clear out the MMX registers ready for doing floating point
  // again
  emms();
}