Exemple #1
0
/*
 * MC_avg4_16: for each of `height` rows, combine the 2x2 neighbourhood
 * ref[x], ref[x+1], ref[x+stride], ref[x+stride+1] into one value per byte
 * and average that value into the 16 bytes already stored at dest.
 *
 * height  number of rows to process. The loop is a do/while that tests
 *         --height, so the body always runs once and only stops when the
 *         decremented counter reaches zero: callers must pass height >= 1.
 * dest    destination row pointer; read and rewritten, advanced by stride.
 * ref     reference row pointer; bytes 0..16 of the current row and of the
 *         row at ref+stride are read, advanced by stride.
 * stride  byte distance between rows, applied to both ref and dest.
 * cpu     not referenced directly in this block.
 *         NOTE(review): presumably consumed by the pavg_r2r macro to select
 *         the averaging instruction - confirm against the macro definition.
 *
 * pavg_r2r and mask_one are defined outside this block.
 * NOTE(review): the correction term below only makes sense if pavg_r2r is a
 * per-byte average that rounds up and mask_one holds 0x01 in every byte -
 * confirm.
 *
 * The routine uses mm0-mm3, mm6 and mm7 and does not issue emms itself.
 */
static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride, const int cpu)
{
    do {
	/* ---- left half of the row: bytes 0..7 ---- */
	/* Load the four neighbours: mm0 = A (x,y), mm1 = D (x+1,y+1),
	 * mm2 = B (x+1,y), mm3 = C (x,y+1); loads are interleaved with the
	 * arithmetic below. */
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+stride+1), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);
	pxor_r2r (mm1, mm7);		/* mm7 = A ^ D */
	movq_m2r (*(ref+stride), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);		/* mm6 = B ^ C */
	pavg_r2r (mm1, mm0);		/* mm0 = avg (A, D) */
	pavg_r2r (mm3, mm2);		/* mm2 = avg (B, C) */
	por_r2r (mm6, mm7);		/* mm7 = (A ^ D) | (B ^ C) */
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);		/* mm6 = avg (A, D) ^ avg (B, C) */
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);	/* mm7 = per-byte correction term */
	pavg_r2r (mm2, mm0);		/* mm0 = avg of the two diagonal averages */
	psubusb_r2r (mm7, mm0);		/* subtract the correction (saturating) */
	movq_m2r (*dest, mm1);
	pavg_r2r (mm1, mm0);		/* blend with what is already in dest */
	movq_r2m (mm0, *dest);

	/* ---- right half of the row: bytes 8..15, same sequence ---- */
	movq_m2r (*(ref+8), mm0);
	movq_m2r (*(ref+stride+9), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+9), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride+8), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_m2r (*(dest+8), mm1);
	pavg_r2r (mm1, mm0);
	/* ref is bumped before the final store; the store only uses dest */
	ref += stride;
	movq_r2m (mm0, *(dest+8));
	dest += stride;
    } while (--height);
}
Exemple #2
0
/*
 * MC_put4_8: for each of `height` rows, combine the 2x2 neighbourhood
 * ref[x], ref[x+1], ref[x+stride], ref[x+stride+1] into one value per byte
 * and store the 8 resulting bytes to dest (dest is overwritten, not read).
 *
 * The horizontal average and XOR of a row are computed once and carried over
 * to the next iteration in mm0 / mm7, so each source row is loaded only once.
 * The first row is processed before the loop; height + 1 source rows of
 * 9 bytes each are read in total.
 *
 * height  number of rows to write. The loop is a do/while that tests
 *         --height, so callers must pass height >= 1.
 * dest    destination row pointer, advanced by stride per row.
 * ref     reference row pointer, advanced by stride per row.
 * stride  byte distance between rows, applied to both ref and dest.
 * cpu     not referenced directly in this block.
 *         NOTE(review): presumably consumed by the pavg_r2r macro - confirm.
 *
 * pavg_r2r and mask_one are defined outside this block.
 * NOTE(review): the correction term assumes pavg_r2r rounds up and mask_one
 * holds 0x01 in every byte - confirm.
 *
 * Uses mm0, mm1, mm2, mm3, mm5, mm6, mm7; does not issue emms itself.
 */
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    /* Prologue: horizontal pass over the first source row.
     * mm0 = avg (row[x], row[x+1]), mm7 = row[x] ^ row[x+1]. */
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
	/* Horizontal pass over the next source row:
	 * mm2 = avg (next[x], next[x+1]), mm6 = next[x] ^ next[x+1].
	 * mm5 keeps a copy of the previous row's horizontal average. */
	movq_m2r (*ref, mm2);
	movq_r2r (mm0, mm5);

	movq_m2r (*(ref+1), mm3);
	movq_r2r (mm2, mm6);

	pxor_r2r (mm3, mm6);
	pavg_r2r (mm3, mm2);

	/* mm7 = (xor of previous row | xor of next row)
	 *       & (previous average ^ next average) & mask_one */
	por_r2r (mm6, mm7);
	pxor_r2r (mm2, mm5);

	pand_r2r (mm5, mm7);
	pavg_r2r (mm2, mm0);	/* vertical average of the two row averages */

	pand_m2r (mask_one, mm7);

	psubusb_r2r (mm7, mm0);	/* subtract the correction (saturating) */

	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;

	/* Carry the next row's xor and average into the following iteration. */
	movq_r2r (mm6, mm7);	/* unroll ! */
	movq_r2r (mm2, mm0);	/* unroll ! */
    } while (--height);
}
Exemple #3
0
/*
 * TestForMotionInBlockMMX: compares an 8-byte-wide, 8-row block of the
 * current picture against the same block of the previous picture and reports
 * whether enough pixels changed by more than the threshold T.
 *
 * Rows are visited in pairs; the first row of each pair is accumulated into
 * the "top field" score and the second into the "bottom field" score.
 *
 * p_pix_p       first row of the block in the previous picture (8 bytes are
 *               read from each of 8 rows)
 * p_pix_c       first row of the block in the current picture
 * i_pitch_prev  byte distance between rows of the previous picture
 * i_pitch_curr  byte distance between rows of the current picture
 * pi_top        out: 1 if the top-field score is >= 8, else 0
 * pi_bot        out: 1 if the bottom-field score is >= 8, else 0
 * returns       1 if the combined score is >= 8, else 0
 *
 * T and the VLC_MMX attribute macro are defined outside this block.
 * emms() is issued before returning.
 *
 * NOTE(review): if pcmpgtb_r2r maps to the PCMPGTB instruction, the compare
 * is signed per byte, so differences of 128 or more would be seen as negative
 * and would not be counted as exceeding T - confirm whether that is intended.
 */
VLC_MMX
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                    int i_pitch_prev, int i_pitch_curr,
                                    int* pi_top, int* pi_bot )
{
    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

    /* Threshold T replicated into all eight bytes, 8-byte aligned for movq. */
    static alignas (8) const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
    movq_m2r( bT,  mm5 );

    pxor_r2r( mm3, mm3 ); /* score (top field) */
    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
    /* y is only a counter: 4 iterations, two rows handled per iteration. */
    for( int y = 0; y < 8; y+=2 )
    {
        /* top field */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        /* Saturating subtraction in both directions: per byte at most one of
           mm2 = C - P and mm1 = P - C is non-zero, and it equals |C - P|. */
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        /* Turn each byte into a mask where the difference is greater than T,
           then sum the mask bytes against zero. */
        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm3 ); /* add to top field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;

        /* bottom field - handling identical to top field, except... */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;
    }
    movq_r2r(  mm3, mm7 ); /* score (total) */
    paddd_r2r( mm4, mm7 );
    /* Copy the low 32 bits of each score register into the C variables. */
    movd_r2m( mm3, i_top_motion );
    movd_r2m( mm4, i_bot_motion );
    movd_r2m( mm7, i_motion );

    /* The loop counts actual score * 255. */
    i_top_motion /= 255;
    i_bot_motion /= 255;
    i_motion     /= 255;

    emms();

    /* A field, or the whole block, is flagged once 8 or more of its pixels
       exceeded the threshold. */
    (*pi_top) = ( i_top_motion >= 8 );
    (*pi_bot) = ( i_bot_motion >= 8 );
    return (i_motion >= 8);
}
Exemple #4
0
/*
 * bsad_mmx: sum of absolute differences between a 16-pixel-wide block p2 and
 * the average of two interpolated predictions built from pf and pb.
 *
 * For every row and every pixel x in 0..15 the code computes
 *     f    = (pf[x] + pf[x+hxf] + pf[x+lx*hyf] + pf[x+lx*hyf+hxf] + 2) >> 2
 *     b    = (pb[x] + pb[x+hxb] + pb[x+lx*hyb] + pb[x+lx*hyb+hxb] + 2) >> 2
 *     pred = (f + b + 1) >> 1
 * and accumulates |pred - p2[x]|.
 *
 * pf, pb    top-left of the two prediction blocks
 * p2        top-left of the block being compared against
 * lx        byte distance between rows, applied to all three buffers
 * hxf, hyf  horizontal / vertical offsets of the extra taps for pf
 * hxb, hyb  horizontal / vertical offsets of the extra taps for pb
 *           NOTE(review): presumably half-pel flags that are 0 or 1 - confirm
 *           against the callers.
 * h         number of rows; nothing is accumulated when h <= 0
 * returns   the accumulated sum (0 when h <= 0)
 *
 * BSAD_LOAD, BSAD_LOAD_ACC and movd_r2g are defined outside this block.
 * NOTE(review): from their use here, BSAD_LOAD(src, a, b) presumably loads
 * 8 bytes and widens them into word lanes in a (low half) and b (high half),
 * and BSAD_LOAD_ACC(src, t0, t1, a, b) presumably does the same through the
 * scratch registers t0/t1 and adds the result into a/b. Both appear to rely
 * on mm7 being zero - confirm against the macro definitions.
 *
 * emms() is always issued before returning, including when h <= 0.
 */
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h)
{
    uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
    int s, s1, s2;

    /* The three extra taps for each prediction: right, below, below-right. */
    pfa = pf + hxf;
    pfb = pf + lx * hyf;
    pfc = pfb + hxf;

    pba = pb + hxb;
    pbb = pb + lx * hyb; 
    pbc = pbb + hxb;

    s = 0; /* the accumulator */

    if (h > 0)
    {
        /* mm7 = 0, used as the zero operand of the unpack instructions.
           mm6 = 2 in every word: 0 - (-1) = 1, then shifted left by one.
           mm5 is only a temporary here and is reused as scratch below. */
        pxor_r2r(mm7, mm7);
        pxor_r2r(mm6, mm6);
        pcmpeqw_r2r(mm5, mm5);
        psubw_r2r(mm5, mm6);
        psllw_i2r(1, mm6);
		
        do {
            /* ---- pixels 0..7 of the row ---- */
            /* Forward prediction: sum of four taps, +2, >>2, as words in mm0/mm1. */
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pfa[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[0],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
			
            /* Backward prediction: same computation, as words in mm2/mm3. */
            BSAD_LOAD(pb[0],mm2,mm3);
            BSAD_LOAD_ACC(pba[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[0],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);
			
            /* Average the two predictions: (f + b + 1) >> 1. mm6 is halved
               to 1 for the rounding add, then restored to 2. */
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);	/* back to 8 unsigned bytes */
			
            /* Absolute difference against p2: OR of the two saturating
               subtractions. */
            movq_m2r(p2[0], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            /* Horizontal sum: widen bytes to words, add, widen to dwords,
               add, then add the two remaining dwords in C. */
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
			
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);		/* low dword */
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);		/* high dword */
            s += s1 + s2;

            /* ---- pixels 8..15 of the row: same sequence at offset 8 ---- */
            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pfa[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[8],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
			
            BSAD_LOAD(pb[8],mm2,mm3);
            BSAD_LOAD_ACC(pba[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[8],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);
						
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);
			
            movq_m2r(p2[8], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
			
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;
			
            /* Advance every row pointer to the next line. */
            p2  += lx;
            pf  += lx;
            pfa += lx;
            pfb += lx;
            pfc += lx;
            pb  += lx;
            pba += lx;
            pbb += lx;
            pbc += lx;

            h--;
        } while (h > 0);	
	
    }
	
    emms();

    return s;
}
//VLC_MMX			// sunqueen delete
/*
 * MSVC flavour of the MMX block motion test.
 *
 * Compares an 8-byte-wide, 8-row block of the current picture with the same
 * block of the previous picture. Rows are taken in pairs: the first row of a
 * pair feeds the top-field score, the second the bottom-field score. A pixel
 * scores when its absolute difference is greater than the threshold
 * (0x0A in every byte of bT).
 *
 * p_pix_p       first row of the block in the previous picture
 * p_pix_c       first row of the block in the current picture
 * i_pitch_prev  byte distance between rows of the previous picture
 * i_pitch_curr  byte distance between rows of the current picture
 * pi_top        out: 1 when the top-field score is >= 8, otherwise 0
 * pi_bot        out: 1 when the bottom-field score is >= 8, otherwise 0
 * returns       1 when the combined score is >= 8, otherwise 0
 *
 * Each 8-byte row is first copied into a named local so that movq_m2r is
 * handed a plain variable rather than a dereferenced cast expression.
 * emms() is issued before the results are written.
 */
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                    int i_pitch_prev, int i_pitch_curr,
                                    int* pi_top, int* pi_bot )
{
    /* Threshold replicated across all eight bytes, aligned for movq. */
    __declspec(align(8)) static const mmx_t bT = { 0x0A0A0A0A0A0A0A0AULL };

    /* Row staging buffers for the current / previous picture. */
    uint64_t ui_row_cur, ui_row_prev;

    /* Scores as read back from the MMX registers (255 per scoring pixel
       until they are normalised below). */
    int32_t i_score_all = 0;
    int32_t i_score_top = 0;
    int32_t i_score_bot = 0;

    pxor_r2r( mm6, mm6 );   /* mm6 = 0, second operand of psadbw */
    movq_m2r( bT,  mm5 );   /* mm5 = threshold */

    pxor_r2r( mm3, mm3 );   /* mm3 = top-field accumulator */
    pxor_r2r( mm4, mm4 );   /* mm4 = bottom-field accumulator */

    /* Four row pairs cover the eight rows of the block. */
    for( int i_pair = 0; i_pair < 4; i_pair++ )
    {
        /* ---- first row of the pair: top field ---- */
        ui_row_cur  = *((uint64_t*)p_pix_c);
        ui_row_prev = *((uint64_t*)p_pix_p);
        movq_m2r( ui_row_cur,  mm0 );
        movq_m2r( ui_row_prev, mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );    /* mm2 = saturate(cur - prev) */
        psubusb_r2r( mm0, mm1 );    /* mm1 = saturate(prev - cur) */

        pcmpgtb_r2r( mm5, mm2 );    /* byte mask where difference > threshold */
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );    /* sum of the mask bytes */
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm3 );      /* accumulate into the top-field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;

        /* ---- second row of the pair: bottom field ---- */
        ui_row_cur  = *((uint64_t*)p_pix_c);
        ui_row_prev = *((uint64_t*)p_pix_p);
        movq_m2r( ui_row_cur,  mm0 );
        movq_m2r( ui_row_prev, mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm4 );      /* accumulate into the bottom-field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;
    }

    /* mm7 = top + bottom, then read all three scores back. */
    movq_r2r(  mm3, mm7 );
    paddd_r2r( mm4, mm7 );
    movd_r2m( mm3, i_score_top );
    movd_r2m( mm4, i_score_bot );
    movd_r2m( mm7, i_score_all );

    /* Every scoring pixel contributed 255; scale back to a pixel count. */
    i_score_top /= 255;
    i_score_bot /= 255;
    i_score_all /= 255;

    emms();

    *pi_top = ( i_score_top >= 8 );
    *pi_bot = ( i_score_bot >= 8 );
    return ( i_score_all >= 8 );
}
Exemple #6
0
// Greedy deinterlace of one scanline, 8 pixels per step using MMXEXT.
//
// For every byte the routine picks whichever of m0 / m2 is closer to the
// average of t1 and b1 (ties go to m2), then clamps the pick to the range
//   [min(t1, b1) - max_comb, max(t1, b1) + max_comb]
// (both bounds saturate to 0..255) and stores it to output.
//
//   self    supplies max_comb, the tolerance added around the t1/b1 range
//   m0, m2  the two candidate lines (L2 and LP2 in the comments below)
//   t1, b1  the lines the candidate is checked against (L1 and L3 below)
//   output  destination line
//   width   number of bytes to produce
//
// Whole groups of 8 bytes are handled here; any remaining 1..7 bytes are
// passed to deinterlace_greedy_scanline_c after emms().
static void
deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL *
    self, const guint8 * m0, const guint8 * t1, const guint8 * b1,
    const guint8 * m2, guint8 * output, gint width)
{
  mmx_t comb_limit;
  gint lane;

  // Replicate the tolerance (0-255) into all eight byte lanes.
  for (lane = 0; lane < 8; lane++)
    comb_limit.ub[lane] = self->max_comb;

  movq_m2r (comb_limit, mm6);   // mm6 = tolerance, constant for the loop

  while (width >= 8) {
    movq_m2r (*t1, mm1);        // mm1 = L1
    movq_m2r (*m0, mm2);        // mm2 = L2
    movq_m2r (*b1, mm3);        // mm3 = L3
    movq_m2r (*m2, mm0);        // mm0 = LP2

    // mm4 = avg = (L1 + L3) / 2
    movq_r2r (mm1, mm4);
    pavgb_r2r (mm3, mm4);

    // mm5 = |avg - L2|, built from two saturating subtractions
    movq_r2r (mm2, mm7);
    psubusb_r2r (mm4, mm7);     // L2 - avg
    movq_r2r (mm4, mm5);
    psubusb_r2r (mm2, mm5);     // avg - L2
    por_r2r (mm7, mm5);

    // mm4 = |avg - LP2|, same trick (avg itself is consumed here)
    movq_r2r (mm0, mm7);
    psubusb_r2r (mm4, mm7);     // LP2 - avg
    psubusb_r2r (mm0, mm4);     // avg - LP2
    por_r2r (mm7, mm4);

    // Build the selection masks:
    //   mm4 = ff where |avg - LP2| <= |avg - L2|, else 00
    //   mm5 = the inverse of mm4
    psubusb_r2r (mm5, mm4);     // zero wherever LP2 is at least as close
    pxor_r2r (mm5, mm5);
    pcmpeqb_r2r (mm5, mm4);
    pcmpeqb_r2r (mm4, mm5);

    // mm4 = best = LP2 where mm4 was ff, L2 where mm5 was ff
    pand_r2r (mm2, mm5);
    pand_r2r (mm0, mm4);
    por_r2r (mm5, mm4);

    // Clip the pick so it leaves the L1..L3 range by at most the tolerance:
    // some combing is allowed, but it is bounded, and more detail survives
    // than with a plain clamp to the L1..L3 range.
    movq_r2r (mm1, mm2);
    pmaxub_r2r (mm3, mm2);      // mm2 = max(L1, L3)
    pminub_r2r (mm1, mm3);      // mm3 = min(L1, L3)

    paddusb_r2r (mm6, mm2);     // upper bound = max + tolerance (saturating)
    psubusb_r2r (mm6, mm3);     // lower bound = min - tolerance (saturating)

    pmaxub_r2r (mm3, mm4);      // mm4 = max(best, lower bound)
    pminub_r2r (mm4, mm2);      // mm2 = min(mm4, upper bound)

    movq_r2m (mm2, *output);    // store the clipped pick

    // Step every line pointer to the next group of 8 bytes.
    m0 += 8;
    m2 += 8;
    t1 += 8;
    b1 += 8;
    output += 8;
    width -= 8;
  }
  emms ();

  // Fewer than 8 bytes left: finish them with the C implementation.
  if (width > 0)
    deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
Exemple #7
0
/**
 * Internal helper function for EstimateNumBlocksWithMotion():
 * estimates whether there is motion in the given 8x8 block on one plane
 * between two images. The block as a whole and its fields are evaluated
 * separately, and use different motion thresholds.
 *
 * This is a low-level function only used by EstimateNumBlocksWithMotion().
 * There is no need to call this function manually.
 *
 * For interpretation of pi_top and pi_bot, it is assumed that the block
 * starts on an even-numbered line (belonging to the top field).
 *
 * The b_mmx parameter avoids the need to call vlc_CPU() separately
 * for each block.
 *
 * @param[in] p_pix_p Base pointer to the block in previous picture
 * @param[in] p_pix_c Base pointer to the same block in current picture
 * @param i_pitch_prev i_pitch of previous picture
 * @param i_pitch_curr i_pitch of current picture
 * @param b_mmx (vlc_CPU() & CPU_CAPABILITY_MMXEXT) or false.
 * @param[out] pi_top 1 if top field of the block had motion, 0 if no
 * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no
 * @return 1 if the block had motion, 0 if no
 * @see EstimateNumBlocksWithMotion()
 */
static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                        int i_pitch_prev, int i_pitch_curr,
                                        bool b_mmx,
                                        int* pi_top, int* pi_bot )
{
/* Pixel luma/chroma difference threshold to detect motion. */
#define T 10

    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

/* See below for the C version to see more quickly what this does. */
#ifdef CAN_COMPILE_MMXEXT
    if( b_mmx )
    {
        static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
        pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
        movq_m2r( bT,  mm5 );

        pxor_r2r( mm3, mm3 ); /* score (top field) */
        pxor_r2r( mm4, mm4 ); /* score (bottom field) */
        for( int y = 0; y < 8; y+=2 )
        {
            /* top field */
            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
            movq_r2r( mm0, mm2 );
            psubusb_r2r( mm1, mm2 );
            psubusb_r2r( mm0, mm1 );

            pcmpgtb_r2r( mm5, mm2 );
            pcmpgtb_r2r( mm5, mm1 );
            psadbw_r2r(  mm6, mm2 );
            psadbw_r2r(  mm6, mm1 );

            paddd_r2r( mm2, mm1 );
            paddd_r2r( mm1, mm3 ); /* add to top field score */

            p_pix_c += i_pitch_curr;
            p_pix_p += i_pitch_prev;

            /* bottom field - handling identical to top field, except... */
            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
            movq_r2r( mm0, mm2 );
            psubusb_r2r( mm1, mm2 );
            psubusb_r2r( mm0, mm1 );

            pcmpgtb_r2r( mm5, mm2 );
            pcmpgtb_r2r( mm5, mm1 );
            psadbw_r2r(  mm6, mm2 );
            psadbw_r2r(  mm6, mm1 );

            paddd_r2r( mm2, mm1 );
            paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */

            p_pix_c += i_pitch_curr;
            p_pix_p += i_pitch_prev;
        }
        movq_r2r(  mm3, mm7 ); /* score (total) */
        paddd_r2r( mm4, mm7 );
        movd_r2m( mm3, i_top_motion );
        movd_r2m( mm4, i_bot_motion );
        movd_r2m( mm7, i_motion );

        /* The loop counts actual score * 255. */
        i_top_motion /= 255;
        i_bot_motion /= 255;
        i_motion     /= 255;

        emms();
    }
    else
#endif
    {
        for( int y = 0; y < 8; ++y )