예제 #1
0
static inline void mmx_start(uint8_t *src1, uint8_t *src2,
                             uint8_t *src3, uint8_t *src4,
                             int X)
{
    movq_m2r (src2[X], mm0);
    movq_m2r (src2[X], mm1);
    movq_m2r (src4[X], mm2);
    movq_m2r (src4[X], mm3);
    movq_m2r (src3[X], mm4);
    movq_m2r (src3[X], mm5);
    punpcklbw_m2r (mm_cpool[0], mm0);
    punpckhbw_m2r (mm_cpool[0], mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    movq_r2r (mm0, mm6);
    movq_r2r (mm1, mm7);
    paddw_r2r (mm2, mm0);
    paddw_r2r (mm3, mm1);
    movq_m2r (src3[X], mm2);
    movq_m2r (src3[X], mm3);
    psllw_i2r (2, mm0);
    psllw_i2r (2, mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psllw_i2r (1, mm2);
    psllw_i2r (1, mm3);
    paddw_r2r (mm2, mm0);
    paddw_r2r (mm3, mm1);
    movq_m2r (src1[X], mm2);
    movq_m2r (src1[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
}
예제 #2
0
static inline void mmx_average_2_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows to mm1
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);         // /2

    paddw_r2r (mm4, mm2);       // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);         // /2

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
예제 #3
0
static __inline__ int qblock_sad_mmxe(uint8_t *refblk, 
								  uint32_t h,
								  uint32_t rowstride)
{
	int res;
	pxor_r2r 	(mm4,mm4);
			
	movq_r2r	(mm0,mm5);		/* First row */
	movd_m2r	(*refblk, mm6);
	pxor_r2r    ( mm7, mm7);
	refblk += rowstride;
	punpcklbw_r2r	( mm7, mm5);
	punpcklbw_r2r	( mm7, mm6);
	psadbw_r2r      ( mm5, mm6);
	paddw_r2r     ( mm6, mm4 );
	


	movq_r2r	(mm1,mm5);		/* Second row */
	movd_m2r	(*refblk, mm6);
	refblk += rowstride;
	punpcklbw_r2r	( mm7, mm5);
	punpcklbw_r2r	( mm7, mm6);
	psadbw_r2r      ( mm5, mm6);
	paddw_r2r     ( mm6, mm4 );

	if( h == 4 )
	{

		movq_r2r	(mm2,mm5);		/* Third row */
		movd_m2r	(*refblk, mm6);
		refblk += rowstride;
		punpcklbw_r2r	( mm7, mm5);
		punpcklbw_r2r	( mm7, mm6);
		psadbw_r2r      ( mm5, mm6);
		paddw_r2r     ( mm6, mm4 );

		
		movq_r2r	(mm3,mm5);		/* Fourth row */
		movd_m2r	(*refblk, mm6);
		punpcklbw_r2r	( mm7, mm5);
		punpcklbw_r2r	( mm7, mm6);
		psadbw_r2r      ( mm5, mm6);
		paddw_r2r     ( mm6, mm4 );
		
	}
	movd_r2m      ( mm4, res );

	return res;
}
예제 #4
0
static void
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
    uint8_t * lum_m3, uint8_t * lum_m2,
    uint8_t * lum_m1, uint8_t * lum, int size)
{
  mmx_t rounder;

  rounder.uw[0] = 4;
  rounder.uw[1] = 4;
  rounder.uw[2] = 4;
  rounder.uw[3] = 4;
  pxor_r2r (mm7, mm7);
  movq_m2r (rounder, mm6);

  for (; size > 3; size -= 4) {
    movd_m2r (*lum_m4, mm0);
    movd_m2r (*lum_m3, mm1);
    movd_m2r (*lum_m2, mm2);
    movd_m2r (*lum_m1, mm3);
    movd_m2r (*lum, mm4);
    punpcklbw_r2r (mm7, mm0);
    punpcklbw_r2r (mm7, mm1);
    punpcklbw_r2r (mm7, mm2);
    punpcklbw_r2r (mm7, mm3);
    punpcklbw_r2r (mm7, mm4);
    paddw_r2r (mm3, mm1);
    psllw_i2r (1, mm2);
    paddw_r2r (mm4, mm0);
    psllw_i2r (2, mm1);         // 2
    paddw_r2r (mm6, mm2);
    paddw_r2r (mm2, mm1);
    psubusw_r2r (mm0, mm1);
    psrlw_i2r (3, mm1);         // 3
    packuswb_r2r (mm7, mm1);
    movd_r2m (mm1, *dst);
    lum_m4 += 4;
    lum_m3 += 4;
    lum_m2 += 4;
    lum_m1 += 4;
    lum += 4;
    dst += 4;
  }
  emms ();

  /* Handle odd widths */
  if (size > 0)
    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
예제 #5
0
static void scale_uint8_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i, j;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input
 *  mm1: factor_mask
 *  mm2: Factor
 *  mm3: Output
 *  mm4: 
 *  mm5: 
 *  mm6: 0
 *  mm7: scratch
 *  
 */
  
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 4*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    pxor_r2r(mm3, mm3);

    for(j = 0; j < ctx->table_h.factors_per_pixel; j++)
      {
      /* Load pixels */
      movd_m2r(*(src), mm0);
      punpcklbw_r2r(mm6, mm0);
      psllw_i2r(7, mm0);
      /* Load factors */
      LOAD_FACTOR_1_4;
      /* Multiply */
      pmulhw_r2r(mm7, mm0);
      paddw_r2r(mm0, mm3);
      //    DUMP_MM("mm3_2", mm3);
      src += 4;
      factors++;
      
      }
    
    psraw_i2r(5, mm3);
    packuswb_r2r(mm6, mm3);
    movd_r2m(mm3, *dst);
    
    dst+=4;
    }
  ctx->need_emms = 1;
  
  }
예제 #6
0
파일: algo_x.c 프로젝트: RodrigoNieves/vlc
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst,  int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int y, x;

    /* Progressive */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 8; y += 2 )
    {
        for( x = 0; x < 8; x +=4 )
        {
            movd_m2r( src1[x], mm0 );
            movd_r2m( mm0, dst[x] );

            movd_m2r( src2[x], mm1 );
            movd_m2r( src1[i_src1+x], mm2 );

            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            paddw_r2r( mm1, mm1 );
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );
            paddw_r2r( mm2, mm0 );
            paddw_r2r( mm3, mm1 );
            paddw_m2r( m_4, mm1 );
            paddw_r2r( mm1, mm0 );
            psraw_i2r( 3, mm0 );
            packuswb_r2r( mm7, mm0 );
            movd_r2m( mm0, dst[i_dst+x] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
예제 #7
0
파일: motion_comp_mmx.c 프로젝트: 1c0n/xbmc
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
				     const uint8_t * src2,
				     const uint8_t * src3,
				     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2m (mm1, *dest);	/* store result in dest */
}
예제 #8
0
static inline void mmx_average_4_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2,
                                     uint8_t * src3, uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      // load 8 src3 bytes
    movq_r2r (mm3, mm4);        // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src3 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    movq_m2r (*src4, mm5);      // load 8 src4 bytes
    movq_r2r (mm5, mm6);        // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);   // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);   // unpack high src4 bytes

    paddw_r2r (mm5, mm1);       // add lows
    paddw_r2r (mm6, mm2);       // add highs

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         // /4

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
예제 #9
0
파일: color.c 프로젝트: afljafa/mythtv
static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int width,
        uint8_t *m, uint8_t *t, uint8_t *b )
{
    int i;
    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
    const mmx_t cmask = { 0xff00ff00ff00ff00ULL };

    // Get width in bytes.
    width *= 2; 
    i = width / 8;
    width -= i * 8;

    movq_m2r( ymask, mm7 );
    movq_m2r( cmask, mm6 );

    while ( i-- ) 
    {
        movq_m2r( *t, mm0 );
        movq_m2r( *b, mm1 );
        movq_m2r( *m, mm2 );

        movq_r2r ( mm2, mm3 );
        pand_r2r ( mm7, mm3 );

        pand_r2r ( mm6, mm0 );
        pand_r2r ( mm6, mm1 );
        pand_r2r ( mm6, mm2 );

        psrlq_i2r( 8, mm0 );
        psrlq_i2r( 7, mm1 );
        psrlq_i2r( 8, mm2 );

        movq_r2r ( mm0, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm0 );

        movq_r2r ( mm2, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm2 );

        paddw_r2r( mm0, mm2 );
        paddw_r2r( mm1, mm2 );

        psllw_i2r( 5, mm2 );
        pand_r2r( mm6, mm2 );

        por_r2r ( mm3, mm2 );

        movq_r2m( mm2, *output );
        output += 8;
        t += 8;
        b += 8;
        m += 8;
    }
    output++; t++; b++; m++;
    while ( width-- ) 
    {
        *output = (3 * *t + 3 * *m + 2 * *b) >> 3;
        output +=2; t+=2; b+=2; m+=2;
    }

    emms();
}
예제 #10
0
파일: motion_comp_mmx.c 프로젝트: 1c0n/xbmc
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2,
					    const uint8_t * src3,
					    const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);	/* load 8 dest bytes */
    movq_r2r (mm3, mm4);	/* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2r (mm1,mm2);		/* copy subresult */

    pxor_r2r (mm1, mm3);	/* xor srcavg and dest */
    pand_m2r (mask1, mm3);	/* mask lower bits */
    psrlq_i2r (1, mm3);		/* /2 */
    por_r2r (mm2, mm4);		/* or srcavg and dest */
    psubb_r2r (mm3, mm4);	/* subtract subresults */
    movq_r2m (mm4, *dest);	/* store result in dest */
}
예제 #11
0
static __inline__ void
sum_sumsq_8bytes( uint8_t *cur_lum_mb, 
				  uint8_t *pred_lum_mb,
				  mmx_t *sumtop_accs,
				  mmx_t *sumbot_accs,
				  mmx_t *sumsqtop_accs,
				  mmx_t *sumsqbot_accs,
				  mmx_t *sumxprod_accs
	)
{
	pxor_r2r(mm0,mm0);

	/* Load pixels from top field into mm1.w,mm2.w
	 */
	movq_m2r( *((mmx_t*)cur_lum_mb), mm1 );
	movq_m2r( *((mmx_t*)pred_lum_mb), mm2 );
	
	/* mm3 := mm1 mm4 := mm2
	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
	*/ 
	
	movq_r2r( mm1, mm3 );
	punpcklbw_r2r( mm0, mm1 );
	movq_r2r( mm2, mm4 );
	punpcklbw_r2r( mm0, mm2 );
	psubw_r2r( mm2, mm1 );
	
	/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
	 */
	punpckhbw_r2r( mm0, mm3 );
	punpckhbw_r2r( mm0, mm4 );
	psubw_r2r( mm4, mm3 );

	/* sumtop_accs->w[0..3] += mm1.w[0..3];
	   sumtop_accs->w[0..3] += mm3.w[0..3];
	   mm6 = mm1; mm7 = mm3;
	*/
	movq_m2r( *sumtop_accs, mm5 );
	paddw_r2r( mm1, mm5 );
	paddw_r2r( mm3, mm5 );
	movq_r2r( mm1, mm6 );
	movq_r2r( mm3, mm7 );
	movq_r2m( mm5, *sumtop_accs );

	/* 
	   *sumsq_top_acc += mm1.w[0..3] * mm1.w[0..3];
	   *sumsq_top_acc += mm3.w[0..3] * mm3.w[0..3];
	*/
	pmaddwd_r2r( mm1, mm1 );
	movq_m2r( *sumsqtop_accs, mm5 );
	pmaddwd_r2r( mm3, mm3 );
	paddd_r2r( mm1, mm5 );
	paddd_r2r( mm3, mm5 );
	movq_r2m( mm5, *sumsqtop_accs );
	

	/* Load pixels from bot field into mm1.w,mm2.w
	 */
	movq_m2r( *((mmx_t*)(cur_lum_mb+opt->phy_width)), mm1 );
	movq_m2r( *((mmx_t*)(pred_lum_mb+opt->phy_width)), mm2 );
	
	/* mm2 := mm1 mm4 := mm2
	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
	*/ 
	
	movq_r2r( mm1, mm3 );
	punpcklbw_r2r( mm0, mm1 );
	movq_r2r( mm2, mm4 );
	punpcklbw_r2r( mm0, mm2 );
	psubw_r2r( mm2, mm1 );
	
	/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
	 */
	punpckhbw_r2r( mm0, mm3 );
	punpckhbw_r2r( mm0, mm4 );
	psubw_r2r( mm4, mm3 );

	/* 
	   sumbot_accs->w[0..3] += mm1.w[0..3];
	   sumbot_accs->w[0..3] += mm3.w[0..3];
	   mm2 := mm1; mm4 := mm3;
	*/
	movq_m2r( *sumbot_accs, mm5 );
	paddw_r2r( mm1, mm5 );
	movq_r2r( mm1, mm2 );
	paddw_r2r( mm3, mm5 );
	movq_r2r( mm3, mm4 );
	movq_r2m( mm5, *sumbot_accs );

	/* 
	   *sumsqbot_acc += mm1.w[0..3] * mm1.w[0..3];
	   *sumsqbot_acc += mm3.w[0..3] * mm3.w[0..3];
	*/
	pmaddwd_r2r( mm1, mm1 );
	movq_m2r( *sumsqbot_accs, mm5 );
	pmaddwd_r2r( mm3, mm3 );
	paddd_r2r( mm1, mm5 );
	paddd_r2r( mm3, mm5 );
	movq_r2m( mm5, *sumsqbot_accs );
	
	
	/* Accumulate cross-product 
	 *sum_xprod_acc += mm1.w[0..3] * mm6[0..3];
	 *sum_xprod_acc += mm3.w[0..3] * mm7[0..3];
	 */

	movq_m2r( *sumxprod_accs, mm5 );
	pmaddwd_r2r( mm6, mm2);
	pmaddwd_r2r( mm7, mm4);
	paddd_r2r( mm2, mm5 );
	paddd_r2r( mm4, mm5 );
	movq_r2m( mm5, *sumxprod_accs );
	emms();
}
예제 #12
0
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h)
{
    uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
    int s, s1, s2;

    pfa = pf + hxf;
    pfb = pf + lx * hyf;
    pfc = pfb + hxf;

    pba = pb + hxb;
    pbb = pb + lx * hyb; 
    pbc = pbb + hxb;

    s = 0; /* the accumulator */

    if (h > 0)
    {
        pxor_r2r(mm7, mm7);
        pxor_r2r(mm6, mm6);
        pcmpeqw_r2r(mm5, mm5);
        psubw_r2r(mm5, mm6);
        psllw_i2r(1, mm6);
		
        do {
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pfa[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[0],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
			
            BSAD_LOAD(pb[0],mm2,mm3);
            BSAD_LOAD_ACC(pba[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[0],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);
			
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);
			
            movq_m2r(p2[0], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
			
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pfa[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[8],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
			
            BSAD_LOAD(pb[8],mm2,mm3);
            BSAD_LOAD_ACC(pba[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[8],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);
						
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);
			
            movq_m2r(p2[8], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
			
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;
			
            p2  += lx;
            pf  += lx;
            pfa += lx;
            pfb += lx;
            pfc += lx;
            pb  += lx;
            pba += lx;
            pbb += lx;
            pbc += lx;

            h--;
        } while (h > 0);	
	
    }
	
    emms();

    return s;
}
예제 #13
0
static int bsad_1quad_mmxe(uint8_t *pf, uint8_t *pb, uint8_t *pb2, uint8_t *p2, int lx, int h)
{
    int s;

    s = 0; /* the accumulator */

    if (h > 0)
    {
        pcmpeqw_r2r(mm6, mm6);
        psrlw_i2r(15, mm6);
        paddw_r2r(mm6, mm6);

        pxor_r2r(mm7, mm7);
        pxor_r2r(mm5, mm5);
		
        do {
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pf[1],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+1],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);
			
            movq_m2r(pb2[0],mm1);
            pavgb_m2r(pb[0],mm1);

            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[0],mm0);
            paddd_r2r(mm0,mm5);

            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pf[9],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+9],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);
			
            movq_m2r(pb2[8],mm1);
            pavgb_m2r(pb[8],mm1);
						
            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[8],mm0);
            paddd_r2r(mm0,mm5);
			
            p2  += lx;
            pf  += lx;
            pb  += lx;
            pb2 += lx;

            h--;
        } while (h > 0);	
	
    }
    movd_r2g(mm5,s);
	
    emms();

    return s;
}
예제 #14
0
static void scale_uint16_x_4_x_quadratic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input
 *  mm1: factor_mask
 *  mm2: Factor
 *  mm3: Output
 *  mm4: 
 *  mm5: 
 *  mm6: 0
 *  mm7: scratch
 *  
 */
  
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_noclip_mmx\n");
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 8*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    
    /* Load pixels */
    movq_m2r(*(src), mm0);
    //    punpcklbw_r2r(mm6, mm0);
    psrlw_i2r(1, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    movq_r2r(mm0, mm3);
    //    DUMP_MM("mm3_1", mm3);
    src += 8;
    factors++;
    
    /* Load pixels */
    movq_m2r(*(src), mm0);
    //    punpcklbw_r2r(mm6, mm0);
    psrlw_i2r(1, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    paddw_r2r(mm0, mm3);
    //    DUMP_MM("mm3_2", mm3);
    src += 8;
    factors++;

    /* Load pixels */
    movq_m2r(*(src), mm0);
    //    punpcklbw_r2r(mm6, mm0);
    psrlw_i2r(1, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    paddw_r2r(mm0, mm3);
    //    DUMP_MM("mm3_3", mm3);
    src += 8;
    
    psllw_i2r(3, mm3);
    //    packuswb_r2r(mm6, mm3);
    MOVQ_R2M(mm3, *dst);
    
    dst+=8;
    }
  ctx->need_emms = 1;
  }
예제 #15
0
파일: vfir.c 프로젝트: jerbs/sinema
static void deinterlace_line( uint8_t *dst, uint8_t *lum_m4,
                              uint8_t *lum_m3, uint8_t *lum_m2,
                              uint8_t *lum_m1, uint8_t *lum, int size )
{
#if defined(__i386__) || defined(__x86_64__)
    mmx_t rounder;

    rounder.uw[0]=4;
    rounder.uw[1]=4;
    rounder.uw[2]=4;
    rounder.uw[3]=4;
    pxor_r2r(mm7,mm7);
    movq_m2r(rounder,mm6);

    for (;size > 3; size-=4) {
        movd_m2r(lum_m4[0],mm0);
        movd_m2r(lum_m3[0],mm1);
        movd_m2r(lum_m2[0],mm2);
        movd_m2r(lum_m1[0],mm3);
        movd_m2r(lum[0],mm4);
        punpcklbw_r2r(mm7,mm0);
        punpcklbw_r2r(mm7,mm1);
        punpcklbw_r2r(mm7,mm2);
        punpcklbw_r2r(mm7,mm3);
        punpcklbw_r2r(mm7,mm4);
        paddw_r2r(mm3,mm1);
        psllw_i2r(1,mm2);
        paddw_r2r(mm4,mm0);
        psllw_i2r(2,mm1);// 2
        paddw_r2r(mm6,mm2);
        paddw_r2r(mm2,mm1);
        psubusw_r2r(mm0,mm1);
        psrlw_i2r(3,mm1); // 3
        packuswb_r2r(mm7,mm1);
        movd_r2m(mm1,dst[0]);
        lum_m4+=4;
        lum_m3+=4;
        lum_m2+=4;
        lum_m1+=4;
        lum+=4;
        dst+=4;
    }
    emms();
#else
    /**
     * C implementation.
     */
    int sum;

    for(;size > 0;size--) {
        sum = -lum_m4[0];
        sum += lum_m3[0] << 2;
        sum += lum_m2[0] << 1;
        sum += lum_m1[0] << 2;
        sum += -lum[0];
        dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3];
        lum_m4++;
        lum_m3++;
        lum_m2++;
        lum_m1++;
        lum++;
        dst++;
    }
#endif
}
예제 #16
0
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc,
			     int width, int height )
{

  int Line;
  long long* YVal1;
  long long* YVal2;
  long long* YVal3;
  long long* Dest;
  uint8_t* pEvenLines = psrc;
  uint8_t* pOddLines = psrc+width;
  int LineLength = width;
  int Pitch = width * 2;
  int IsOdd = 1;
  long EdgeDetect = 625;
  long JaggieThreshold = 73;

  int n;

  unsigned long long qwEdgeDetect;
  unsigned long long qwThreshold;
  const unsigned long long Mask = 0xfefefefefefefefeULL;
  const unsigned long long YMask = 0x00ff00ff00ff00ffULL;

  qwEdgeDetect = EdgeDetect;
  qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
  qwThreshold = JaggieThreshold;
  qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);


  // copy first even line no matter what, and the first odd line if we're
  // processing an odd field.
  ac_memcpy(pdst, pEvenLines, LineLength);
  if (IsOdd)
    ac_memcpy(pdst + LineLength, pOddLines, LineLength);

  height = height / 2;
  for (Line = 0; Line < height - 1; ++Line)
  {
    if (IsOdd)
    {
      YVal1 = (long long *)(pOddLines + Line * Pitch);
      YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch);
      YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch);
      Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength);
    }
    else
    {
      YVal1 = (long long *)(pEvenLines + Line * Pitch);
      YVal2 = (long long *)(pOddLines + Line * Pitch);
      YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch);
      Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength);
    }

    // For ease of reading, the comments below assume that we're operating on an odd
    // field (i.e., that bIsOdd is true).  The exact same processing is done when we
    // operate on an even field, but the roles of the odd and even fields are reversed.
    // It's just too cumbersome to explain the algorithm in terms of "the next odd
    // line if we're doing an odd field, or the next even line if we're doing an
    // even field" etc.  So wherever you see "odd" or "even" below, keep in mind that
    // half the time this function is called, those words' meanings will invert.

    // Copy the odd line to the overlay verbatim.
    ac_memcpy((char *)Dest + LineLength, YVal3, LineLength);

    n = LineLength >> 3;
    while( n-- )
    {
      movq_m2r (*YVal1++, mm0);
      movq_m2r (*YVal2++, mm1);
      movq_m2r (*YVal3++, mm2);

      // get intensities in mm3 - 4
      movq_r2r ( mm0, mm3 );
      movq_r2r ( mm1, mm4 );
      movq_r2r ( mm2, mm5 );

      pand_m2r ( *&YMask, mm3 );
      pand_m2r ( *&YMask, mm4 );
      pand_m2r ( *&YMask, mm5 );

      // get average in mm0
      pand_m2r ( *&Mask, mm0 );
      pand_m2r ( *&Mask, mm2 );
      psrlw_i2r ( 01, mm0 );
      psrlw_i2r ( 01, mm2 );
      paddw_r2r ( mm2, mm0 );

      // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
      // result will be in mm6

      psrlw_i2r ( 01, mm3 );
      psrlw_i2r ( 01, mm4 );
      psrlw_i2r ( 01, mm5 );

      movq_r2r ( mm3, mm6 );
      psubw_r2r ( mm4, mm6 );	//mm6 = O1 - E

      movq_r2r ( mm5, mm7 );
      psubw_r2r ( mm4, mm7 );	//mm7 = O2 - E

      pmullw_r2r ( mm7, mm6 );		// mm6 = (O1 - E) * (O2 - E)

      movq_r2r ( mm3, mm7 );
      psubw_r2r ( mm5, mm7 );		// mm7 = (O1 - O2)
      pmullw_r2r ( mm7, mm7 );	// mm7 = (O1 - O2) ^ 2
      psrlw_i2r ( 12, mm7 );		// mm7 = (O1 - O2) ^ 2 >> 12
      pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7  = EdgeDetect * (O1 - O2) ^ 2 >> 12

      psubw_r2r ( mm7, mm6 );      // mm6 is what we want

      pcmpgtw_m2r ( *&qwThreshold, mm6 );

      movq_r2r ( mm6, mm7 );

      pand_r2r ( mm6, mm0 );

      pandn_r2r ( mm1, mm7 );

      por_r2r ( mm0, mm7 );

      movq_r2m ( mm7, *Dest++ );
    }
  }

  // Copy last odd line if we're processing an even field.
  if (! IsOdd)
  {
    ac_memcpy(pdst + (height * 2 - 1) * LineLength,
                      pOddLines + (height - 1) * Pitch,
                      LineLength);
  }

  // clear out the MMX registers ready for doing floating point
  // again
  emms();
}