Exemple #1
0
static inline void mmx_end(uint8_t *src3, uint8_t *src5,
                           uint8_t *dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);
    punpckhbw_m2r (mm_cpool[0], mm5);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    psrlw_i2r (3, mm0);
    psrlw_i2r (3, mm1);
    psubw_r2r (mm6, mm4);
    psubw_r2r (mm7, mm5);
    packuswb_r2r (mm1,mm0);
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    pcmpgtw_m2r (mm_lthr, mm4);
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);
    movq_r2r (mm4, mm5);
    pandn_r2r (mm0, mm4);
    pand_m2r (src3[X], mm5);
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
Exemple #2
0
static inline void mmx_average_2_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows to mm1
    paddw_m2r (round1, mm1);
    psraw_i2r (1, mm1);         // /2

    paddw_r2r (mm4, mm2);       // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);         // /2

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
static void frame_i2f_sse(u_char *src,float *dst,int l)
{
    int i;

    pxor_r2r(mm7,mm7);

    for( i=0; i<l; i+=8 ) {
        movq_m2r(*src,mm0);
        movq_r2r(mm0, mm2);
        punpcklbw_r2r(mm7, mm0);
        punpckhbw_r2r(mm7, mm2);
        movq_r2r(mm0, mm1);
        movq_r2r(mm2, mm3);
        punpcklwd_r2r(mm7, mm0);
        punpckhwd_r2r(mm7, mm1);
        punpcklwd_r2r(mm7, mm2);
        punpckhwd_r2r(mm7, mm3);
        cvtpi2ps_r2r(mm0,xmm0);
        cvtpi2ps_r2r(mm1,xmm1);
        cvtpi2ps_r2r(mm2,xmm2);
        cvtpi2ps_r2r(mm3,xmm3);
        movlps_r2m(xmm0,dst[0]);
        movlps_r2m(xmm1,dst[2]);
        movlps_r2m(xmm2,dst[4]);
        movlps_r2m(xmm3,dst[6]);

        src+=8;
        dst+=8;
    }
    emms();
}
Exemple #4
0
static inline void mmx_start(uint8_t *src1, uint8_t *src2,
                             uint8_t *src3, uint8_t *src4,
                             int X)
{
    movq_m2r (src2[X], mm0);
    movq_m2r (src2[X], mm1);
    movq_m2r (src4[X], mm2);
    movq_m2r (src4[X], mm3);
    movq_m2r (src3[X], mm4);
    movq_m2r (src3[X], mm5);
    punpcklbw_m2r (mm_cpool[0], mm0);
    punpckhbw_m2r (mm_cpool[0], mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    movq_r2r (mm0, mm6);
    movq_r2r (mm1, mm7);
    paddw_r2r (mm2, mm0);
    paddw_r2r (mm3, mm1);
    movq_m2r (src3[X], mm2);
    movq_m2r (src3[X], mm3);
    psllw_i2r (2, mm0);
    psllw_i2r (2, mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psllw_i2r (1, mm2);
    psllw_i2r (1, mm3);
    paddw_r2r (mm2, mm0);
    paddw_r2r (mm3, mm1);
    movq_m2r (src1[X], mm2);
    movq_m2r (src1[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
}
Exemple #5
0
static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+stride+1), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);
	pxor_r2r (mm1, mm7);
	movq_m2r (*(ref+stride), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);
	pavg_r2r (mm1, mm0);
	pavg_r2r (mm3, mm2);
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);
	pavg_r2r (mm2, mm0);
	psubusb_r2r (mm7, mm0);
	movq_m2r (*dest, mm1);
	pavg_r2r (mm1, mm0);
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}
Exemple #6
0
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);	/* load 8 dest bytes */
    movq_r2r (mm1, mm2);	/* copy 8 dest bytes */

    movq_m2r (*src1, mm3);	/* load 8 src1 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);	/* load 8 src2 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);	/* xor src1 and src2 */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm4, mm6);		/* or src1 and src2 */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2r (mm6, mm5);	/* copy subresult */

    pxor_r2r (mm1, mm5);	/* xor srcavg and dest */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm2, mm6);		/* or srcavg and dest */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2m (mm6, *dest);	/* store result in dest */
}
Exemple #7
0
static inline void mmx_unpack_16rgb (uint8_t * image, const int cpu)
{
    static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
    static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};

    /*
     * convert RGB plane to RGB 16 bits
     * mm0 -> B, mm1 -> R, mm2 -> G
     * mm4 -> GB, mm5 -> AR pixel 4-7
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */

    pand_m2r (mmx_bluemask, mm0);	/* mm0 = b7b6b5b4b3______ */
    pand_m2r (mmx_greenmask, mm2);	/* mm2 = g7g6g5g4g3g2____ */
    pand_m2r (mmx_redmask, mm1);	/* mm1 = r7r6r5r4r3______ */
    psrlq_i2r (3, mm0);			/* mm0 = ______b7b6b5b4b3 */
    pxor_r2r (mm4, mm4);		/* mm4 = 0 */
    movq_r2r (mm0, mm5);		/* mm5 = ______b7b6b5b4b3 */
    movq_r2r (mm2, mm7);		/* mm7 = g7g6g5g4g3g2____ */

    punpcklbw_r2r (mm4, mm2);
    punpcklbw_r2r (mm1, mm0);
    psllq_i2r (3, mm2);
    por_r2r (mm2, mm0);
    movntq (mm0, *image);

    punpckhbw_r2r (mm4, mm7);
    punpckhbw_r2r (mm1, mm5);
    psllq_i2r (3, mm7);
    por_r2r (mm7, mm5);
    movntq (mm5, *(image+8));
}
Exemple #8
0
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
				     const uint8_t * src2,
				     const uint8_t * src3,
				     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2m (mm1, *dest);	/* store result in dest */
}
Exemple #9
0
static inline void mmx_average_4_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2,
                                     uint8_t * src3, uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      // load 8 src3 bytes
    movq_r2r (mm3, mm4);        // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src3 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    movq_m2r (*src4, mm5);      // load 8 src4 bytes
    movq_r2r (mm5, mm6);        // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);   // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);   // unpack high src4 bytes

    paddw_r2r (mm5, mm1);       // add lows
    paddw_r2r (mm6, mm2);       // add highs

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         // /4

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
static __inline__ int qblock_sad_mmxe(uint8_t *refblk, 
								  uint32_t h,
								  uint32_t rowstride)
{
	int res;
	pxor_r2r 	(mm4,mm4);
			
	movq_r2r	(mm0,mm5);		/* First row */
	movd_m2r	(*refblk, mm6);
	pxor_r2r    ( mm7, mm7);
	refblk += rowstride;
	punpcklbw_r2r	( mm7, mm5);
	punpcklbw_r2r	( mm7, mm6);
	psadbw_r2r      ( mm5, mm6);
	paddw_r2r     ( mm6, mm4 );
	


	movq_r2r	(mm1,mm5);		/* Second row */
	movd_m2r	(*refblk, mm6);
	refblk += rowstride;
	punpcklbw_r2r	( mm7, mm5);
	punpcklbw_r2r	( mm7, mm6);
	psadbw_r2r      ( mm5, mm6);
	paddw_r2r     ( mm6, mm4 );

	if( h == 4 )
	{

		movq_r2r	(mm2,mm5);		/* Third row */
		movd_m2r	(*refblk, mm6);
		refblk += rowstride;
		punpcklbw_r2r	( mm7, mm5);
		punpcklbw_r2r	( mm7, mm6);
		psadbw_r2r      ( mm5, mm6);
		paddw_r2r     ( mm6, mm4 );

		
		movq_r2r	(mm3,mm5);		/* Fourth row */
		movd_m2r	(*refblk, mm6);
		punpcklbw_r2r	( mm7, mm5);
		punpcklbw_r2r	( mm7, mm6);
		psadbw_r2r      ( mm5, mm6);
		paddw_r2r     ( mm6, mm4 );
		
	}
	movd_r2m      ( mm4, res );

	return res;
}
Exemple #11
0
static __inline__ void
mmx_sum_4_word_accs( mmx_t *accs, int32_t *res )
{
	movq_m2r( *accs, mm1 );
	movq_r2r( mm1, mm3 );
	movq_r2r( mm1, mm2 );
	/* Generate sign extensions for mm1 words! */
	psraw_i2r( 15, mm3 );
	punpcklwd_r2r( mm3, mm1 );
	punpckhwd_r2r( mm3, mm2 );
	paddd_r2r( mm1, mm2 );
	movq_r2r( mm2, mm3);
	psrlq_i2r( 32, mm2);
	paddd_r2r( mm2, mm3);
	movd_r2m( mm3, *res );
}
Exemple #12
0
int main(int ac, char **av)
{
   int i, j, k, n;
   unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 };
   long long *datp = (long long *)&dat0;
   int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 };
   volatile uint8_t *rfp = dat0;
   volatile int16_t *bp  = dat1;
   unsigned char ans1[8], ans2[8];

   n = 0;
   for( i=-32768; i<32768; ++i ) {
     j = 0;
     while( j < 256 ) {
        for( k=0; k<8; ++k ) {
          dat0[k] = i;
          dat1[k] = j++;
        }
       movq_m2r(m_(&rfp[0]),mm1);  /* rfp[0..7] */
       pxor_r2r(mm3,mm3);
       pxor_r2r(mm4,mm4);
       movq_m2r(m_(&bp[0]),mm5);   /* bp[0..3] */
       movq_r2r(mm1,mm2);
       movq_m2r(m_(&bp[4]),mm6);   /* bp[4..7] */
       punpcklbw_r2r(mm3,mm1);     /* rfp[0,2,4,6] */
       punpckhbw_r2r(mm3,mm2);     /* rfp[1,3,5,7] */
       paddsw_r2r(mm5,mm1);        /* bp[0..3] */
       paddsw_r2r(mm6,mm2);        /* bp[4..7] */
       pcmpgtw_r2r(mm1,mm3);
       pcmpgtw_r2r(mm2,mm4);
       pandn_r2r(mm1,mm3);
       pandn_r2r(mm2,mm4);
       packuswb_r2r(mm4,mm3);
       movq_r2m(mm3,m_(&ans1[0]));
       emms();

       ans2[0] = clip(bp[0] + rfp[0]);
       ans2[1] = clip(bp[1] + rfp[1]);
       ans2[2] = clip(bp[2] + rfp[2]);
       ans2[3] = clip(bp[3] + rfp[3]);
       ans2[4] = clip(bp[4] + rfp[4]);
       ans2[5] = clip(bp[5] + rfp[5]);
       ans2[6] = clip(bp[6] + rfp[6]);
       ans2[7] = clip(bp[7] + rfp[7]);

       if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] )
       {
         printf(" i=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", i,
           ans1[0], ans1[1], ans1[2], ans1[3], ans1[4], ans1[5], ans1[6], ans1[7]);
         printf(" j=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", j,
           ans2[0], ans2[1], ans2[2], ans2[3], ans2[4], ans2[5], ans2[6], ans2[7]);
       //  exit(0);
       }
       n += 8;
     }
   }

   printf("n=%d\n",n);
   return 0;
}
Exemple #13
0
static void
_op_mul_p_mas_dp_mmx(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
   DATA32 *e = d + l;
   MOV_A2R(ALPHA_255, mm5)
   pxor_r2r(mm0, mm0);
   while (d < e) {
	c = *m;
	switch(c)
	  {
	    case 0:
		break;
	    case 255:
		MOV_P2R(*d, mm1, mm0)
		MOV_P2R(*s, mm2, mm0)
		MUL4_SYM_R2R(mm2, mm1, mm5)
		MOV_R2P(mm1, *d, mm0)
		break;
	    default:
		c++;
		MOV_A2R(c, mm1)
		c = ~(*s);
		MOV_P2R(c, mm3, mm0)
		MUL4_256_R2R(mm3, mm1)
		movq_r2r(mm5, mm4);
		psubw_r2r(mm1, mm4);
		MOV_P2R(*d, mm1, mm0)
		MUL4_SYM_R2R(mm4, mm1, mm5)
		MOV_R2P(mm1, *d, mm0)
		break;
	  }
	s++;  m++;  d++;
     }
}
Exemple #14
0
static inline void mean8(unsigned char *refpix,unsigned char *pixel,int radius_count,int row_stride,int threshold,int8_t *diff,unsigned char *count)
{
    int a,b;

    pxor_r2r(mm6,mm6); // mm6 (aka count) = 0
    pxor_r2r(mm7,mm7); // mm7 (aka diff) = 0
    movq_m2r(*refpix,mm3); // mm3 = refpix[0]

    movd_g2r(0x80808080,mm4); // mm4 = 128
    punpcklbw_r2r(mm4,mm4);

    pxor_r2r(mm4,mm3); // mm3 = refpix[0]-128

    movd_g2r(threshold,mm5); // mm5 = threshold
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);

    for( b=0; b<radius_count; b++ ) {
        for( a=0; a<radius_count; a++ ) {
            movq_m2r(*pixel,mm0); // mm0  = pixel[0]
            pxor_r2r(mm4,mm0);    // mm0  = pixel[0]-128
            movq_r2r(mm3,mm2);    // mm2  = refpix[0]-128
            psubsb_r2r(mm0,mm2);  // mm2  = refpix[0]-pixel[0]
            psubsb_r2r(mm3,mm0);  // mm0  = pixel[0]-refpix[0]
            pminub_r2r(mm0,mm2);  // mm2  = abs(pixel[0]-refpix[0])
            movq_r2r(mm5,mm1);    // mm1  = threshold
            pcmpgtb_r2r(mm2,mm1); // mm1  = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
            psubb_r2r(mm1,mm6);   // mm6 += (threshold > abs(pixel[0]-refpix[0]))
            pand_r2r(mm1,mm0);    // mm0  = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            paddb_r2r(mm0,mm7);   // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0

            ++pixel;
        }
        pixel += row_stride - radius_count;
    }

    movq_r2m(mm6,*count);
    movq_r2m(mm7,*diff);

    emms();
 
}
Exemple #15
0
static inline void mmx_unpack_32rgb (uint8_t * image, const int cpu)
{
    /*
     * convert RGB plane to RGB packed format,
     * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
     * mm4 -> GB, mm5 -> AR pixel 4-7,
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */

    pxor_r2r (mm3, mm3);
    movq_r2r (mm0, mm6);
    movq_r2r (mm1, mm7);
    movq_r2r (mm0, mm4);
    movq_r2r (mm1, mm5);
    punpcklbw_r2r (mm2, mm6);
    punpcklbw_r2r (mm3, mm7);
    punpcklwd_r2r (mm7, mm6);
    movntq (mm6, *image);
    movq_r2r (mm0, mm6);
    punpcklbw_r2r (mm2, mm6);
    punpckhwd_r2r (mm7, mm6);
    movntq (mm6, *(image+8));
    punpckhbw_r2r (mm2, mm4);
    punpckhbw_r2r (mm3, mm5);
    punpcklwd_r2r (mm5, mm4);
    movntq (mm4, *(image+16));
    movq_r2r (mm0, mm4);
    punpckhbw_r2r (mm2, mm4);
    punpckhwd_r2r (mm5, mm4);
    movntq (mm4, *(image+24));
}
Exemple #16
0
static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);
        pavg_r2r (mm3, mm2);

        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);

        pand_m2r (mask_one, mm7);

        psubusb_r2r (mm7, mm0);

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        movq_r2r (mm6, mm7);    // unroll !
        movq_r2r (mm2, mm0);    // unroll !
    } while (--height);
}
Exemple #17
0
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst,  int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int y, x;

    /* Progressive */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 8; y += 2 )
    {
        for( x = 0; x < 8; x +=4 )
        {
            movd_m2r( src1[x], mm0 );
            movd_r2m( mm0, dst[x] );

            movd_m2r( src2[x], mm1 );
            movd_m2r( src1[i_src1+x], mm2 );

            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            paddw_r2r( mm1, mm1 );
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );
            paddw_r2r( mm2, mm0 );
            paddw_r2r( mm3, mm1 );
            paddw_m2r( m_4, mm1 );
            paddw_r2r( mm1, mm0 );
            psraw_i2r( 3, mm0 );
            packuswb_r2r( mm7, mm0 );
            movd_r2m( mm0, dst[i_dst+x] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
Exemple #18
0
void
yuv411planar_to_rgb_mmx (const unsigned char *yuv, unsigned char *rgb,
			 unsigned int w, unsigned int h)
{
  unsigned int xx, yy;
  register const unsigned char *yp1, *up, *vp;
  unsigned char *dp1;

  /* plane pointers */
  yp1 = yuv;
  up = yuv + (w * h);
  vp = up + (w * (h / 4));
  /* destination pointers */
  dp1 = rgb;



  yp1 = yuv;
  up = yuv + (w * h);
  vp = up + ((w / 2) * (h / 2));
  dp1 = rgb;
  for (yy = 0; yy < h; yy++)
    {
      for (xx = 0; xx < w; xx += 8)
	{
	  movq_m2r(*yp1, mm0);
	  movq_r2r(mm0, mm1);
	  psrlw_i2r(8, mm0);
	  psllw_i2r(8, mm1);
	  psrlw_i2r(8, mm1);

	  pxor_r2r(mm7, mm7);
	  movd_m2r(*up, mm3);
	  movd_m2r(*vp, mm2);

	  punpcklbw_r2r(mm7, mm2);
	  punpcklbw_r2r(mm7, mm3);

	  movq_m2r(CONST_16, mm4);
	  psubsw_r2r(mm4, mm0);
	  psubsw_r2r(mm4, mm1);

	  movq_m2r(CONST_128, mm5);
	  psubsw_r2r(mm5, mm2);
	  psubsw_r2r(mm5, mm3);

	  movq_m2r(CONST_YMUL, mm4);
	  pmullw_r2r(mm4, mm0);
	  pmullw_r2r(mm4, mm1);

	  movq_m2r(CONST_CRVCRV, mm7);
	  pmullw_r2r(mm3, mm7);

	  movq_m2r(CONST_CBUCBU, mm6);
	  pmullw_r2r(mm2, mm6);

	  movq_m2r(CONST_CGUCGU, mm5);
	  pmullw_r2r(mm2, mm5);

	  movq_m2r(CONST_CGVCGV, mm4);
	  pmullw_r2r(mm3, mm4);

	  movq_r2r(mm0, mm2);
	  paddsw_r2r(mm7, mm2);
	  paddsw_r2r(mm1, mm7);

	  psraw_i2r(RES, mm2);
	  psraw_i2r(RES, mm7);
	  packuswb_r2r(mm7, mm2);

	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm2, mm3);
	  punpckhbw_r2r(mm7, mm2);
	  punpcklbw_r2r(mm3, mm7);
	  por_r2r(mm7, mm2);

	  movq_r2r(mm0, mm3);
	  psubsw_r2r(mm5, mm3);
	  psubsw_r2r(mm4, mm3);
	  paddsw_m2r(CONST_32, mm3);

	  movq_r2r(mm1, mm7);
	  psubsw_r2r(mm5, mm7);
	  psubsw_r2r(mm4, mm7);
	  paddsw_m2r(CONST_32, mm7);

	  psraw_i2r(RES, mm3);
	  psraw_i2r(RES, mm7);
	  packuswb_r2r(mm7, mm3);

	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm3, mm4);
	  punpckhbw_r2r(mm7, mm3);
	  punpcklbw_r2r(mm4, mm7);
	  por_r2r(mm7, mm3);

	  movq_m2r(CONST_32, mm4);
	  paddsw_r2r(mm6, mm0);
	  paddsw_r2r(mm6, mm1);
	  paddsw_r2r(mm4, mm0);
	  paddsw_r2r(mm4, mm1);
	  psraw_i2r(RES, mm0);
	  psraw_i2r(RES, mm1);
	  packuswb_r2r(mm1, mm0);

	  pxor_r2r(mm7, mm7);
	  movq_r2r(mm0, mm5);
	  punpckhbw_r2r(mm7, mm0);
	  punpcklbw_r2r(mm5, mm7);
	  por_r2r(mm7, mm0);

	  pxor_r2r(mm1, mm1);
	  movq_r2r(mm0, mm5);
	  movq_r2r(mm3, mm6);
	  movq_r2r(mm2, mm7);
	  punpckhbw_r2r(mm3, mm2);
	  punpcklbw_r2r(mm6, mm7);
	  punpckhbw_r2r(mm1, mm0);
	  punpcklbw_r2r(mm1, mm5);

	  movq_r2r(mm7, mm1);
	  punpckhwd_r2r(mm5, mm7);
	  punpcklwd_r2r(mm5, mm1);

	  movq_r2r(mm2, mm4);
	  punpckhwd_r2r(mm0, mm2);
	  punpcklwd_r2r(mm0, mm4);

	  movntq_r2m(mm1, *(dp1));
	  movntq_r2m(mm7, *(dp1 + 8));
	  movntq_r2m(mm4, *(dp1 + 16));
	  movntq_r2m(mm2, *(dp1 + 24));

	  yp1 += 8;
	  up += 4;
	  vp += 4;
	  dp1 += 8 * 4;
	}
      if (yy & 0x1)
	{
	  up -= w / 2;
	  vp -= w / 2;
	}
    }
  emms();
}
Exemple #19
0
static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
{
    static mmx_t mmx_80w = {0x0080008000800080LL};
    static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
    static mmx_t mmx_U_blue = {0x4093409340934093LL};
    static mmx_t mmx_V_red = {0x3312331233123312LL};
    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
    static mmx_t mmx_10w = {0x1010101010101010LL};
    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};
    static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};

    movd_m2r (*pu, mm0);		/* mm0 = 00 00 00 00 u3 u2 u1 u0 */
    movd_m2r (*pv, mm1);		/* mm1 = 00 00 00 00 v3 v2 v1 v0 */
    movq_m2r (*py, mm6);		/* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pxor_r2r (mm4, mm4);		/* mm4 = 0 */
    /* XXX might do cache preload for image here */

    /*
     * Do the multiply part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    punpcklbw_r2r (mm4, mm0);		/* mm0 = u3 u2 u1 u0 */
    punpcklbw_r2r (mm4, mm1);		/* mm1 = v3 v2 v1 v0 */
    psubsw_m2r (mmx_80w, mm0);		/* u -= 128 */
    psubsw_m2r (mmx_80w, mm1);		/* v -= 128 */
    psllw_i2r (3, mm0);			/* promote precision */
    psllw_i2r (3, mm1);			/* promote precision */
    movq_r2r (mm0, mm2);		/* mm2 = u3 u2 u1 u0 */
    movq_r2r (mm1, mm3);		/* mm3 = v3 v2 v1 v0 */
    pmulhw_m2r (mmx_U_green, mm2);	/* mm2 = u * u_green */
    pmulhw_m2r (mmx_V_green, mm3);	/* mm3 = v * v_green */
    pmulhw_m2r (mmx_U_blue, mm0);	/* mm0 = chroma_b */
    pmulhw_m2r (mmx_V_red, mm1);	/* mm1 = chroma_r */
    paddsw_r2r (mm3, mm2);		/* mm2 = chroma_g */

    psubusb_m2r (mmx_10w, mm6);		/* Y -= 16 */
    movq_r2r (mm6, mm7);		/* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pand_m2r (mmx_00ffw, mm6);		/* mm6 =    Y6    Y4    Y2    Y0 */
    psrlw_i2r (8, mm7);			/* mm7 =    Y7    Y5    Y3    Y1 */
    psllw_i2r (3, mm6);			/* promote precision */
    psllw_i2r (3, mm7);			/* promote precision */
    pmulhw_m2r (mmx_Y_coeff, mm6);	/* mm6 = luma_rgb even */
    pmulhw_m2r (mmx_Y_coeff, mm7);	/* mm7 = luma_rgb odd */

    /*
     * Do the addition part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd  pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    movq_r2r (mm0, mm3);		/* mm3 = chroma_b */
    movq_r2r (mm1, mm4);		/* mm4 = chroma_r */
    movq_r2r (mm2, mm5);		/* mm5 = chroma_g */
    paddsw_r2r (mm6, mm0);		/* mm0 = B6 B4 B2 B0 */
    paddsw_r2r (mm7, mm3);		/* mm3 = B7 B5 B3 B1 */
    paddsw_r2r (mm6, mm1);		/* mm1 = R6 R4 R2 R0 */
    paddsw_r2r (mm7, mm4);		/* mm4 = R7 R5 R3 R1 */
    paddsw_r2r (mm6, mm2);		/* mm2 = G6 G4 G2 G0 */
    paddsw_r2r (mm7, mm5);		/* mm5 = G7 G5 G3 G1 */
    packuswb_r2r (mm0, mm0);		/* saturate to 0-255 */
    packuswb_r2r (mm1, mm1);		/* saturate to 0-255 */
    packuswb_r2r (mm2, mm2);		/* saturate to 0-255 */
    packuswb_r2r (mm3, mm3);		/* saturate to 0-255 */
    packuswb_r2r (mm4, mm4);		/* saturate to 0-255 */
    packuswb_r2r (mm5, mm5);		/* saturate to 0-255 */
    punpcklbw_r2r (mm3, mm0);		/* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
    punpcklbw_r2r (mm4, mm1);		/* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
    punpcklbw_r2r (mm5, mm2);		/* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
}
Exemple #20
0
int main()
{
	int rval;
	mmx_t ma;
	mmx_t mb;

	movq_r2r(mm0, mm1);

	rval = mmx_ok();

	/* Announce return value of mmx_ok() */
//	printf("Value returned from init was %x.", rval);
//	printf(" (Indicates MMX %s available)\n\n",(rval)? "is" : "not");
//	fflush(stdout); fflush(stderr);

//	if(rval)
	{
		/* PADD *****************************************************/
		ma.q = 0x1111111180000000LL;
		mb.q = 0x7fffffff00000001LL;
		paddd(ma, mb);
		fprintf(stdout, "paddd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddd: mb.q is 9111111080000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddw(ma, mb);
		fprintf(stdout, "paddw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddw: mb.q is 8001800000000001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddb(ma, mb);
		fprintf(stdout, "paddb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddb: mb.q is 8180000281800002\n");
		fflush(stdout); fflush(stderr);


		/* PADDS ****************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000002\n");

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddsw(ma, mb);
		fprintf(stdout, "paddsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsw: mb.q is 80017fff00000001\n");

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddsb(ma, mb);
		fprintf(stdout, "paddsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddsb: mb.q is 817f0002817f0002\n");
		fflush(stdout); fflush(stderr);


		/* PADDUS ***************************************************/
		ma.q = 0x0001000100010001LL;
		mb.q = 0x80007fffffff0001LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0002\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010000LL;
		paddusw(ma, mb);
		fprintf(stdout, "paddusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusw: mb.q is 80018000ffff0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x01010101807fff01LL;
		mb.q = 0x807fff0101010101LL;
		paddusb(ma, mb);
		fprintf(stdout, "paddusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "paddusb: mb.q is 8180ff028180ff02\n");
		fflush(stdout); fflush(stderr);


		/* PSUB *****************************************************/
		ma.q = 0x7fffffff00000001LL;
		mb.q = 0x1111111180000000LL;
		psubd(ma, mb);
		fprintf(stdout, "psubd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubd: mb.q is 911111127fffffff\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 8001800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubw(ma, mb);
		fprintf(stdout, "psubw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubb(ma, mb);
		fprintf(stdout, "psubb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubb: mb.q is 818202007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PSUBS ****************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 7fff800200020000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubsw(ma, mb);
		fprintf(stdout, "psubsw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsw: mb.q is 80007ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubsb(ma, mb);
		fprintf(stdout, "psubsb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubsb: mb.q is 7f820200807efe00\n");
		fflush(stdout); fflush(stderr);
 

		/* PSUBUS ***************************************************/
		ma.q = 0x80007fffffff0001LL;
		mb.q = 0x0001000100010001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x0001000100010000LL;
		mb.q = 0x80007fffffff0001LL;
		psubusw(ma, mb);
		fprintf(stdout, "psubusw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusw: mb.q is 7fff7ffefffe0001\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x807fff0101010101LL;
		mb.q = 0x01010101807fff01LL;
		psubusb(ma, mb);
		fprintf(stdout, "psubusb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psubusb: mb.q is 000000007f7efe00\n");
		fflush(stdout); fflush(stderr);


		/* PMUL *****************************************************/
		ma.q = 0x8000ffff00ff0000LL;
		mb.q = 0x0200ffff00ffffffLL;
		pmulhw(ma, mb);
		fprintf(stdout, "pmulhw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmulhw: mb.q is ff00000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0200ffff00ffffffLL;
		pmullw(ma, mb);
		fprintf(stdout, "pmullw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmullw: mb.q is 00000001fe010000\n");
		fflush(stdout); fflush(stderr);


		/* PMADD ****************************************************/
		ma.q = 0x8000345680007f34LL;
		mb.q = 0x93234a27ffff1707LL;

		pmaddwd(ma, mb);
		fprintf(stdout, "pmaddwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pmaddwd: mb.q is 4597551a0b71a66c\n");
		fflush(stdout); fflush(stderr);


		/* PCMPEQ ***************************************************/
		ma.q = 0x800034568f237f34LL;
		mb.q = 0x93009a568f237f34LL;

		pcmpeqd(ma, mb);
		fprintf(stdout, "pcmpeqd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqd: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqw(ma, mb);
		fprintf(stdout, "pcmpeqw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x93009a568f237f34LL;
		pcmpeqb(ma, mb);
		fprintf(stdout, "pcmpeqb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpeqb: mb.q is 00ff00ffffffffff\n");
		fflush(stdout); fflush(stderr);



		/* PCMPGT ***************************************************/
		ma.q = 0x666688884477aaffLL;
		mb.q = 0x1234567890abcdefLL;

		pcmpgtd(ma, mb);
		fprintf(stdout, "pcmpgtd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtd: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtw(ma, mb);
		fprintf(stdout, "pcmpgtw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtw: mb.q is 0000ffff0000ffff\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x1234567890abcdefLL;
		pcmpgtb(ma, mb);
		fprintf(stdout, "pcmpgtb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pcmpgtb: mb.q is 0000ffff0000ff00\n");
		fflush(stdout); fflush(stderr);


		/* PACKSS ***************************************************/
		ma.q = 0x00012222000abbbbLL;
		mb.q = 0x0000888800003333LL;

		packssdw(ma, mb);
		fprintf(stdout, "packssdw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packssdw: mb.q is 7fff7fff7fff3333\n");
		fflush(stdout); fflush(stderr);

		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packsswb(ma, mb);
		fprintf(stdout, "packsswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packsswb: mb.q is 7f7f7f8011223344\n");
		fflush(stdout); fflush(stderr);


		/* PACKUS ***************************************************/
		ma.q = 0x00aa00dd01009999LL;
		mb.q = 0x0011002200330044LL;

		packuswb(ma, mb);
		fprintf(stdout, "packuswb: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "packuswb: mb.q is aaddff0011223344\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKH **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckhdq(ma, mb);
		fprintf(stdout, "punpckhdq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhdq: mb.q is 090a0b0c01020304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhwd(ma, mb);
		fprintf(stdout, "punpckhwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhwd: mb.q is 090a01020b0c0304\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpckhbw(ma, mb);
		fprintf(stdout, "punpckhbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckhbw: mb.q is 09010a020b030c04\n");
		fflush(stdout); fflush(stderr);


		/* PUNPCKL **************************************************/
		ma.q = 0x090a0b0c0d0e0f00LL;
		mb.q = 0x0102030405060708LL;

		punpckldq(ma, mb);
		fprintf(stdout, "punpckldq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpckldq: mb.q is 0d0e0f0005060708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklwd(ma, mb);
		fprintf(stdout, "punpcklwd: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklwd: mb.q is 0d0e05060f000708\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0102030405060708LL;
		punpcklbw(ma, mb);
		fprintf(stdout, "punpcklbw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "punpcklbw: mb.q is 0d050e060f070008\n");
		fflush(stdout); fflush(stderr);



		/* PAND, PANDN, POR, PXOR ***********************************/
		ma.q = 0x5555555555555555LL;
		mb.q = 0x3333333333333333LL;

		pand(ma, mb);
		fprintf(stdout, "pand: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pand: mb.q is 1111111111111111\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pandn(ma, mb);
		fprintf(stdout, "pandn: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pandn: mb.q is 4444444444444444\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		por(ma, mb);
		fprintf(stdout, "por: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "por: mb.q is 7777777777777777\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x3333333333333333LL;
		pxor(ma, mb);
		fprintf(stdout, "pxor: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pxor: mb.q is 6666666666666666\n");
		fflush(stdout); fflush(stderr);



		/* PSLL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psllq(ma, mb);
		fprintf(stdout, "psllq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllq: mb.q is 6789abcdef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		pslld(ma, mb);
		fprintf(stdout, "pslld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "pslld: mb.q is 67000000ef000000\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psllw(ma, mb);
		fprintf(stdout, "psllw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psllw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRL *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrlq(ma, mb);
		fprintf(stdout, "psrlq: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlq: mb.q is 0000000123456789\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrld(ma, mb);
		fprintf(stdout, "psrld: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrld: mb.q is 0000000100000089\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psrlw(ma, mb);
		fprintf(stdout, "psrlw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrlw: mb.q is 0000000000000000\n");
		fflush(stdout); fflush(stderr);



		/* PSRA *****************************************************/
		ma.q = 0x0000000000000018LL;
		mb.q = 0x0123456789abcdefLL;

		psrad(ma, mb);
		fprintf(stdout, "psrad: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psrad: mb.q is 00000001ffffff89\n");
		fflush(stdout); fflush(stderr);

		mb.q = 0x0123456789abcdefLL;
		psraw(ma, mb);
		fprintf(stdout, "psraw: mb.q is %016llx\n", mb.q);
		fprintf(stderr, "psraw: mb.q is 00000000ffffffff\n");
		fflush(stdout); fflush(stderr);

		/* Exit MXX *************************************************/
		emms();
	}

	/* Clean-up and exit nicely */
	exit(0);
}
Exemple #21
0
/**
 * Internal helper function for EstimateNumBlocksWithMotion():
 * estimates whether there is motion in the given 8x8 block on one plane
 * between two images. The block as a whole and its fields are evaluated
 * separately, and use different motion thresholds.
 *
 * This is a low-level function only used by EstimateNumBlocksWithMotion().
 * There is no need to call this function manually.
 *
 * For interpretation of pi_top and pi_bot, it is assumed that the block
 * starts on an even-numbered line (belonging to the top field).
 *
 * The b_mmx parameter avoids the need to call vlc_CPU() separately
 * for each block.
 *
 * @param[in] p_pix_p Base pointer to the block in previous picture
 * @param[in] p_pix_c Base pointer to the same block in current picture
 * @param i_pitch_prev i_pitch of previous picture
 * @param i_pitch_curr i_pitch of current picture
 * @param b_mmx (vlc_CPU() & CPU_CAPABILITY_MMXEXT) or false.
 * @param[out] pi_top 1 if top field of the block had motion, 0 if no
 * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no
 * @return 1 if the block had motion, 0 if no
 * @see EstimateNumBlocksWithMotion()
 */
static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                        int i_pitch_prev, int i_pitch_curr,
                                        bool b_mmx,
                                        int* pi_top, int* pi_bot )
{
/* Pixel luma/chroma difference threshold to detect motion. */
#define T 10

    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

/* See below for the C version to see more quickly what this does. */
#ifdef CAN_COMPILE_MMXEXT
    if( b_mmx )
    {
        static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
        pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
        movq_m2r( bT,  mm5 );

        pxor_r2r( mm3, mm3 ); /* score (top field) */
        pxor_r2r( mm4, mm4 ); /* score (bottom field) */
        for( int y = 0; y < 8; y+=2 )
        {
            /* top field */
            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
            movq_r2r( mm0, mm2 );
            psubusb_r2r( mm1, mm2 );
            psubusb_r2r( mm0, mm1 );

            pcmpgtb_r2r( mm5, mm2 );
            pcmpgtb_r2r( mm5, mm1 );
            psadbw_r2r(  mm6, mm2 );
            psadbw_r2r(  mm6, mm1 );

            paddd_r2r( mm2, mm1 );
            paddd_r2r( mm1, mm3 ); /* add to top field score */

            p_pix_c += i_pitch_curr;
            p_pix_p += i_pitch_prev;

            /* bottom field - handling identical to top field, except... */
            movq_m2r( *((uint64_t*)p_pix_c), mm0 );
            movq_m2r( *((uint64_t*)p_pix_p), mm1 );
            movq_r2r( mm0, mm2 );
            psubusb_r2r( mm1, mm2 );
            psubusb_r2r( mm0, mm1 );

            pcmpgtb_r2r( mm5, mm2 );
            pcmpgtb_r2r( mm5, mm1 );
            psadbw_r2r(  mm6, mm2 );
            psadbw_r2r(  mm6, mm1 );

            paddd_r2r( mm2, mm1 );
            paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */

            p_pix_c += i_pitch_curr;
            p_pix_p += i_pitch_prev;
        }
        movq_r2r(  mm3, mm7 ); /* score (total) */
        paddd_r2r( mm4, mm7 );
        movd_r2m( mm3, i_top_motion );
        movd_r2m( mm4, i_bot_motion );
        movd_r2m( mm7, i_motion );

        /* The loop counts actual score * 255. */
        i_top_motion /= 255;
        i_bot_motion /= 255;
        i_motion     /= 255;

        emms();
    }
    else
#endif
    {
        for( int y = 0; y < 8; ++y )
Exemple #22
0
static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int width,
        uint8_t *m, uint8_t *t, uint8_t *b )
{
    int i;
    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
    const mmx_t cmask = { 0xff00ff00ff00ff00ULL };

    // Get width in bytes.
    width *= 2; 
    i = width / 8;
    width -= i * 8;

    movq_m2r( ymask, mm7 );
    movq_m2r( cmask, mm6 );

    while ( i-- ) 
    {
        movq_m2r( *t, mm0 );
        movq_m2r( *b, mm1 );
        movq_m2r( *m, mm2 );

        movq_r2r ( mm2, mm3 );
        pand_r2r ( mm7, mm3 );

        pand_r2r ( mm6, mm0 );
        pand_r2r ( mm6, mm1 );
        pand_r2r ( mm6, mm2 );

        psrlq_i2r( 8, mm0 );
        psrlq_i2r( 7, mm1 );
        psrlq_i2r( 8, mm2 );

        movq_r2r ( mm0, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm0 );

        movq_r2r ( mm2, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm2 );

        paddw_r2r( mm0, mm2 );
        paddw_r2r( mm1, mm2 );

        psllw_i2r( 5, mm2 );
        pand_r2r( mm6, mm2 );

        por_r2r ( mm3, mm2 );

        movq_r2m( mm2, *output );
        output += 8;
        t += 8;
        b += 8;
        m += 8;
    }
    output++; t++; b++; m++;
    while ( width-- ) 
    {
        *output = (3 * *t + 3 * *m + 2 * *b) >> 3;
        output +=2; t+=2; b+=2; m+=2;
    }

    emms();
}
Exemple #23
0
VLC_MMX
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                    int i_pitch_prev, int i_pitch_curr,
                                    int* pi_top, int* pi_bot )
{
    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

    static alignas (8) const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
    movq_m2r( bT,  mm5 );

    pxor_r2r( mm3, mm3 ); /* score (top field) */
    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
    for( int y = 0; y < 8; y+=2 )
    {
        /* top field */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm3 ); /* add to top field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;

        /* bottom field - handling identical to top field, except... */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;
    }
    movq_r2r(  mm3, mm7 ); /* score (total) */
    paddd_r2r( mm4, mm7 );
    movd_r2m( mm3, i_top_motion );
    movd_r2m( mm4, i_bot_motion );
    movd_r2m( mm7, i_motion );

    /* The loop counts actual score * 255. */
    i_top_motion /= 255;
    i_bot_motion /= 255;
    i_motion     /= 255;

    emms();

    (*pi_top) = ( i_top_motion >= 8 );
    (*pi_bot) = ( i_bot_motion >= 8 );
    return (i_motion >= 8);
}
Exemple #24
0
static void scale_uint8_x_4_x_quadratic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input
 *  mm1: factor_mask
 *  mm2: Factor
 *  mm3: Output
 *  mm4: 
 *  mm5: 
 *  mm6: 0
 *  mm7: scratch
 *  
 */
  
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_noclip_mmx\n");
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 4*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    
    /* Load pixels */
    movd_m2r(*(src), mm0);
    punpcklbw_r2r(mm6, mm0);
    psllw_i2r(7, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    movq_r2r(mm0, mm3);
    //    DUMP_MM("mm3_1", mm3);
    src += 4;
    factors++;
    
    /* Load pixels */
    movd_m2r(*(src), mm0);
    punpcklbw_r2r(mm6, mm0);
    psllw_i2r(7, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    paddw_r2r(mm0, mm3);
    //    DUMP_MM("mm3_2", mm3);
    src += 4;
    factors++;

    /* Load pixels */
    movd_m2r(*(src), mm0);
    punpcklbw_r2r(mm6, mm0);
    psllw_i2r(7, mm0);
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;
    /* Multiply */
    pmulhw_r2r(mm7, mm0);
    paddw_r2r(mm0, mm3);
    //    DUMP_MM("mm3_3", mm3);
    src += 4;
    factors++;
    
    psraw_i2r(5, mm3);
    packuswb_r2r(mm6, mm3);
    movd_r2m(mm3, *dst);
    
    dst+=4;
    }
  ctx->need_emms = 1;
  }
Exemple #25
0
static void
deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL *
    self, const guint8 * m0, const guint8 * t1, const guint8 * b1,
    const guint8 * m2, guint8 * output, gint width)
{
  mmx_t MaxComb;

  // How badly do we let it weave? 0-255
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  // L2 == m0
  // L1 == t1
  // L3 == b1
  // LP2 == m2

  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3 leave result in mm4
    movq_r2r (mm1, mm4);        // L1
    pavgb_r2r (mm3, mm4);       // (L1 + L3)/2

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2
    por_r2r (mm7, mm5);         // abs(avg-L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg
    psubusb_r2r (mm0, mm4);     // avg - LP2
    por_r2r (mm7, mm4);         // abs(avg-LP2)

    // use L2 or LP2 depending upon which makes smaller comb
    psubusb_r2r (mm5, mm4);     // see if it goes to zero
    pxor_r2r (mm5, mm5);        // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4=0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4

    // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 = ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now lets clip our chosen value to be not outside of the range
    // of the high/low range L1-L3 by more than abs(L1-L3)
    // This allows some comb but limits the damages and also allows more
    // detail than a boring oversmoothed clip.

    movq_r2r (mm1, mm2);        // copy L1
    pmaxub_r2r (mm3, mm2);      // now = Max(L1,L3)

    pminub_r2r (mm1, mm3);      // now = Min(L1,L3)

    // allow the value to be above the high or below the low by amt of MaxComb
    paddusb_r2r (mm6, mm2);     // increase max by diff
    psubusb_r2r (mm6, mm3);     // lower min by diff


    pmaxub_r2r (mm3, mm4);      // now = Max(best,Min(L1,L3)
    pminub_r2r (mm4, mm2);      // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped

    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  emms ();

  if (width > 0)
    deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
Exemple #26
0
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2,
					    const uint8_t * src3,
					    const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);		/* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);	/* load 8 dest bytes */
    movq_r2r (mm3, mm4);	/* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2r (mm1,mm2);		/* copy subresult */

    pxor_r2r (mm1, mm3);	/* xor srcavg and dest */
    pand_m2r (mask1, mm3);	/* mask lower bits */
    psrlq_i2r (1, mm3);		/* /2 */
    por_r2r (mm2, mm4);		/* or srcavg and dest */
    psubb_r2r (mm3, mm4);	/* subtract subresults */
    movq_r2m (mm4, *dest);	/* store result in dest */
}
//VLC_MMX			// sunqueen delete
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                    int i_pitch_prev, int i_pitch_curr,
                                    int* pi_top, int* pi_bot )
{
    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;
	uint64_t ui_pix_c, ui_pix_p;			// sunqueen add

//    static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
    __declspec(align(8)) static const mmx_t bT   = { 0x0A0A0A0A0A0A0A0AULL };			// sunqueen modify
    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
    movq_m2r( bT,  mm5 );

    pxor_r2r( mm3, mm3 ); /* score (top field) */
    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
    for( int y = 0; y < 8; y+=2 )
    {
        /* top field */
		// sunqueen add start
		ui_pix_c = *((uint64_t*)p_pix_c);
        movq_m2r( ui_pix_c, mm0 );
		ui_pix_p = *((uint64_t*)p_pix_p);
        movq_m2r( ui_pix_p, mm1 );
		// sunqueen add end
#if 0	// sunqueen delete start
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
#endif	// sunqueen delete end
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm3 ); /* add to top field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;

        /* bottom field - handling identical to top field, except... */
		// sunqueen add start
		ui_pix_c = *((uint64_t*)p_pix_c);
        movq_m2r( ui_pix_c, mm0 );
		ui_pix_p = *((uint64_t*)p_pix_p);
        movq_m2r( ui_pix_p, mm1 );
		// sunqueen add end
#if 0	// sunqueen delete start
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
#endif	// sunqueen delete end
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;
    }
    movq_r2r(  mm3, mm7 ); /* score (total) */
    paddd_r2r( mm4, mm7 );
    movd_r2m( mm3, i_top_motion );
    movd_r2m( mm4, i_bot_motion );
    movd_r2m( mm7, i_motion );

    /* The loop counts actual score * 255. */
    i_top_motion /= 255;
    i_bot_motion /= 255;
    i_motion     /= 255;

    emms();

    (*pi_top) = ( i_top_motion >= 8 );
    (*pi_bot) = ( i_bot_motion >= 8 );
    return (i_motion >= 8);
}
Exemple #28
0
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h)
{
    uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
    int s, s1, s2;

    pfa = pf + hxf;
    pfb = pf + lx * hyf;
    pfc = pfb + hxf;

    pba = pb + hxb;
    pbb = pb + lx * hyb; 
    pbc = pbb + hxb;

    s = 0; /* the accumulator */

    if (h > 0)
    {
        pxor_r2r(mm7, mm7);
        pxor_r2r(mm6, mm6);
        pcmpeqw_r2r(mm5, mm5);
        psubw_r2r(mm5, mm6);
        psllw_i2r(1, mm6);
		
        do {
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pfa[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[0],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
			
            BSAD_LOAD(pb[0],mm2,mm3);
            BSAD_LOAD_ACC(pba[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[0],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);
			
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);
			
            movq_m2r(p2[0], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
			
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pfa[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[8],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
			
            BSAD_LOAD(pb[8],mm2,mm3);
            BSAD_LOAD_ACC(pba[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[8],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);
						
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);
			
            movq_m2r(p2[8], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
			
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;
			
            p2  += lx;
            pf  += lx;
            pfa += lx;
            pfb += lx;
            pfc += lx;
            pb  += lx;
            pba += lx;
            pbb += lx;
            pbc += lx;

            h--;
        } while (h > 0);	
	
    }
	
    emms();

    return s;
}
Exemple #29
0
static void
_evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_MMX
   int xx, yy;
   register unsigned char *yp1, *up, *vp;
   unsigned char *dp1;

   /* destination pointers */
   dp1 = rgb;

   for (yy = 0; yy < h; yy++)
     {
	/* plane pointers */
	yp1 = yuv[yy];
	up = yuv[h + (yy / 2)];
	vp = yuv[h + (h / 2) + (yy / 2)];
	for (xx = 0; xx < (w - 7); xx += 8)
	  {
	     movd_m2r(*up, mm3);
	     movd_m2r(*vp, mm2);
	     movq_m2r(*yp1, mm0);

	     pxor_r2r(mm7, mm7);
	     punpcklbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm7, mm3);

	     movq_r2r(mm0, mm1);
	     psrlw_i2r(8, mm0);
	     psllw_i2r(8, mm1);
	     psrlw_i2r(8, mm1);

	     movq_m2r(CONST_16, mm4);
	     psubsw_r2r(mm4, mm0);
	     psubsw_r2r(mm4, mm1);

	     movq_m2r(CONST_128, mm5);
	     psubsw_r2r(mm5, mm2);
	     psubsw_r2r(mm5, mm3);

	     movq_m2r(CONST_YMUL, mm4);
	     pmullw_r2r(mm4, mm0);
	     pmullw_r2r(mm4, mm1);

	     movq_m2r(CONST_CRVCRV, mm7);
	     pmullw_r2r(mm3, mm7);
	     movq_m2r(CONST_CBUCBU, mm6);
	     pmullw_r2r(mm2, mm6);
	     movq_m2r(CONST_CGUCGU, mm5);
	     pmullw_r2r(mm2, mm5);
	     movq_m2r(CONST_CGVCGV, mm4);
	     pmullw_r2r(mm3, mm4);

	     movq_r2r(mm0, mm2);
	     paddsw_r2r(mm7, mm2);
	     paddsw_r2r(mm1, mm7);

	     psraw_i2r(RES, mm2);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm2);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm2, mm3);
	     punpckhbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm3, mm7);
	     por_r2r(mm7, mm2);

	     movq_r2r(mm0, mm3);
	     psubsw_r2r(mm5, mm3);
	     psubsw_r2r(mm4, mm3);
	     paddsw_m2r(CONST_32, mm3);

	     movq_r2r(mm1, mm7);
	     psubsw_r2r(mm5, mm7);
	     psubsw_r2r(mm4, mm7);
	     paddsw_m2r(CONST_32, mm7);

	     psraw_i2r(RES, mm3);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm3);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm3, mm4);
	     punpckhbw_r2r(mm7, mm3);
	     punpcklbw_r2r(mm4, mm7);
	     por_r2r(mm7, mm3);

	     movq_m2r(CONST_32, mm4);
	     paddsw_r2r(mm6, mm0);
	     paddsw_r2r(mm6, mm1);
	     paddsw_r2r(mm4, mm0);
	     paddsw_r2r(mm4, mm1);
	     psraw_i2r(RES, mm0);
	     psraw_i2r(RES, mm1);
	     packuswb_r2r(mm1, mm0);

	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm0, mm5);
	     punpckhbw_r2r(mm7, mm0);
	     punpcklbw_r2r(mm5, mm7);
	     por_r2r(mm7, mm0);

	     movq_m2r(CONST_FF, mm1);
	     movq_r2r(mm0, mm5);
	     movq_r2r(mm3, mm6);
	     movq_r2r(mm2, mm7);
	     punpckhbw_r2r(mm3, mm2);
	     punpcklbw_r2r(mm6, mm7);
	     punpckhbw_r2r(mm1, mm0);
	     punpcklbw_r2r(mm1, mm5);

	     movq_r2r(mm7, mm1);
	     punpckhwd_r2r(mm5, mm7);
	     punpcklwd_r2r(mm5, mm1);

	     movq_r2r(mm2, mm4);
	     punpckhwd_r2r(mm0, mm2);
	     punpcklwd_r2r(mm0, mm4);

	     movntq_r2m(mm1, *(dp1));
	     movntq_r2m(mm7, *(dp1 + 8));
	     movntq_r2m(mm4, *(dp1 + 16));
	     movntq_r2m(mm2, *(dp1 + 24));

	     yp1 += 8;
	     up += 4;
	     vp += 4;
	     dp1 += 8 * 4;
	  }
	/* cleanup pixles that arent a multiple of 8 pixels wide */
	if (xx < w)
	  {
	     int y, u, v, r, g, b;

	     for (; xx < w; xx += 2)
	       {
		  u = (*up++) - 128;
		  v = (*vp++) - 128;

		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;

		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;
	       }
	  }
     }
Exemple #30
0
static __inline__ void
sum_sumsq_8bytes( uint8_t *cur_lum_mb, 
				  uint8_t *pred_lum_mb,
				  mmx_t *sumtop_accs,
				  mmx_t *sumbot_accs,
				  mmx_t *sumsqtop_accs,
				  mmx_t *sumsqbot_accs,
				  mmx_t *sumxprod_accs
	)
{
	pxor_r2r(mm0,mm0);

	/* Load pixels from top field into mm1.w,mm2.w
	 */
	movq_m2r( *((mmx_t*)cur_lum_mb), mm1 );
	movq_m2r( *((mmx_t*)pred_lum_mb), mm2 );
	
	/* mm3 := mm1 mm4 := mm2
	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
	*/ 
	
	movq_r2r( mm1, mm3 );
	punpcklbw_r2r( mm0, mm1 );
	movq_r2r( mm2, mm4 );
	punpcklbw_r2r( mm0, mm2 );
	psubw_r2r( mm2, mm1 );
	
	/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
	 */
	punpckhbw_r2r( mm0, mm3 );
	punpckhbw_r2r( mm0, mm4 );
	psubw_r2r( mm4, mm3 );

	/* sumtop_accs->w[0..3] += mm1.w[0..3];
	   sumtop_accs->w[0..3] += mm3.w[0..3];
	   mm6 = mm1; mm7 = mm3;
	*/
	movq_m2r( *sumtop_accs, mm5 );
	paddw_r2r( mm1, mm5 );
	paddw_r2r( mm3, mm5 );
	movq_r2r( mm1, mm6 );
	movq_r2r( mm3, mm7 );
	movq_r2m( mm5, *sumtop_accs );

	/* 
	   *sumsq_top_acc += mm1.w[0..3] * mm1.w[0..3];
	   *sumsq_top_acc += mm3.w[0..3] * mm3.w[0..3];
	*/
	pmaddwd_r2r( mm1, mm1 );
	movq_m2r( *sumsqtop_accs, mm5 );
	pmaddwd_r2r( mm3, mm3 );
	paddd_r2r( mm1, mm5 );
	paddd_r2r( mm3, mm5 );
	movq_r2m( mm5, *sumsqtop_accs );
	

	/* Load pixels from bot field into mm1.w,mm2.w
	 */
	movq_m2r( *((mmx_t*)(cur_lum_mb+opt->phy_width)), mm1 );
	movq_m2r( *((mmx_t*)(pred_lum_mb+opt->phy_width)), mm2 );
	
	/* mm2 := mm1 mm4 := mm2
	   mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3]
	*/ 
	
	movq_r2r( mm1, mm3 );
	punpcklbw_r2r( mm0, mm1 );
	movq_r2r( mm2, mm4 );
	punpcklbw_r2r( mm0, mm2 );
	psubw_r2r( mm2, mm1 );
	
	/* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7]
	 */
	punpckhbw_r2r( mm0, mm3 );
	punpckhbw_r2r( mm0, mm4 );
	psubw_r2r( mm4, mm3 );

	/* 
	   sumbot_accs->w[0..3] += mm1.w[0..3];
	   sumbot_accs->w[0..3] += mm3.w[0..3];
	   mm2 := mm1; mm4 := mm3;
	*/
	movq_m2r( *sumbot_accs, mm5 );
	paddw_r2r( mm1, mm5 );
	movq_r2r( mm1, mm2 );
	paddw_r2r( mm3, mm5 );
	movq_r2r( mm3, mm4 );
	movq_r2m( mm5, *sumbot_accs );

	/* 
	   *sumsqbot_acc += mm1.w[0..3] * mm1.w[0..3];
	   *sumsqbot_acc += mm3.w[0..3] * mm3.w[0..3];
	*/
	pmaddwd_r2r( mm1, mm1 );
	movq_m2r( *sumsqbot_accs, mm5 );
	pmaddwd_r2r( mm3, mm3 );
	paddd_r2r( mm1, mm5 );
	paddd_r2r( mm3, mm5 );
	movq_r2m( mm5, *sumsqbot_accs );
	
	
	/* Accumulate cross-product 
	 *sum_xprod_acc += mm1.w[0..3] * mm6[0..3];
	 *sum_xprod_acc += mm3.w[0..3] * mm7[0..3];
	 */

	movq_m2r( *sumxprod_accs, mm5 );
	pmaddwd_r2r( mm6, mm2);
	pmaddwd_r2r( mm7, mm4);
	paddd_r2r( mm2, mm5 );
	paddd_r2r( mm4, mm5 );
	movq_r2m( mm5, *sumxprod_accs );
	emms();
}