Пример #1
0
/*
 * Brute-force self-test for an MMX "add 16-bit residual to 8-bit pixels and
 * clamp to a byte" sequence.
 *
 * For every test vector the MMX code writes 8 result bytes to ans1 and the
 * scalar expression clip(bp[k] + rfp[k]) writes 8 reference bytes to ans2;
 * each mismatch is printed.  The number of samples checked is printed at the
 * end.  ac and av are not used.
 *
 * clip(), m_() and the *_r2r / *_m2r / *_r2m macros are defined outside this
 * function (presumably clip() clamps to 0..255 -- confirm).
 */
int main(int ac, char **av)
{
   int i, j, k, n;
   /* The initial contents of dat0/dat1 are overwritten before first use. */
   unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 };
   long long *datp = (long long *)&dat0;   /* never read in this function */
   int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 };
   /* volatile views, so every access below really goes to memory */
   volatile uint8_t *rfp = dat0;
   volatile int16_t *bp  = dat1;
   unsigned char ans1[8], ans2[8];   /* ans1: MMX result, ans2: scalar reference */

   n = 0;
   for( i=-32768; i<32768; ++i ) {
     j = 0;
     while( j < 256 ) {
        /*
         * NOTE(review): i sweeps the full 16-bit range but is stored into the
         * 8-bit dat0 (only its low byte survives), while the 16-bit dat1 only
         * ever receives 0..255.  It looks like i and j may be swapped --
         * confirm the intended test coverage.
         */
        for( k=0; k<8; ++k ) {
          dat0[k] = i;
          dat1[k] = j++;
        }
       movq_m2r(m_(&rfp[0]),mm1);  /* rfp[0..7] */
       pxor_r2r(mm3,mm3);          /* mm3 = 0 */
       pxor_r2r(mm4,mm4);          /* mm4 = 0 */
       movq_m2r(m_(&bp[0]),mm5);   /* bp[0..3] */
       movq_r2r(mm1,mm2);
       movq_m2r(m_(&bp[4]),mm6);   /* bp[4..7] */
       punpcklbw_r2r(mm3,mm1);     /* rfp[0..3] zero-extended to words */
       punpckhbw_r2r(mm3,mm2);     /* rfp[4..7] zero-extended to words */
       paddsw_r2r(mm5,mm1);        /* rfp[0..3] + bp[0..3], signed saturation */
       paddsw_r2r(mm6,mm2);        /* rfp[4..7] + bp[4..7], signed saturation */
       pcmpgtw_r2r(mm1,mm3);       /* mm3 = all-ones where sum < 0 */
       pcmpgtw_r2r(mm2,mm4);       /* mm4 = all-ones where sum < 0 */
       pandn_r2r(mm1,mm3);         /* mm3 = sum with negative lanes zeroed */
       pandn_r2r(mm2,mm4);         /* mm4 = sum with negative lanes zeroed */
       packuswb_r2r(mm4,mm3);      /* pack 8 words to bytes, unsigned saturation */
       movq_r2m(mm3,m_(&ans1[0]));
       emms();

       /* Scalar reference for the same 8 samples. */
       ans2[0] = clip(bp[0] + rfp[0]);
       ans2[1] = clip(bp[1] + rfp[1]);
       ans2[2] = clip(bp[2] + rfp[2]);
       ans2[3] = clip(bp[3] + rfp[3]);
       ans2[4] = clip(bp[4] + rfp[4]);
       ans2[5] = clip(bp[5] + rfp[5]);
       ans2[6] = clip(bp[6] + rfp[6]);
       ans2[7] = clip(bp[7] + rfp[7]);

       /* Compare all 8 result bytes at once by viewing each array as one
          64-bit value. */
       if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] )
       {
         printf(" i=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", i,
           ans1[0], ans1[1], ans1[2], ans1[3], ans1[4], ans1[5], ans1[6], ans1[7]);
         printf(" j=%5d %02x %02x %02x %02x  %02x %02x %02x %02x\n", j,
           ans2[0], ans2[1], ans2[2], ans2[3], ans2[4], ans2[5], ans2[6], ans2[7]);
       //  exit(0);
       }
       n += 8;
     }
   }

   printf("n=%d\n",n);
   return 0;
}
Пример #2
0
/*
 * Horizontally scale one scanline of single-channel 8-bit pixels with a
 * 4-tap filter, without clamping the result.
 *
 * ctx:        scaler context; reads src, src_stride, dst_size and the
 *             per-pixel table_h.pixels[i].index / .factor_i entries.
 * scanline:   source row number (multiplied by ctx->src_stride).
 * dest_start: first output byte; one byte is written per destination pixel.
 *
 * MMX state is left dirty; ctx->need_emms is set so that emms can be issued
 * later (by code outside this function).
 */
static void
scale_uint8_x_1_x_bicubic_noclip_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  mmx_t tmp_mm;
  
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_noclip_mmx\n");
  src_start = ctx->src + scanline * ctx->src_stride;
  
  /* mm6 = 0, used to zero-extend bytes to words */
  pxor_r2r(mm6, mm6);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    /* Load pixels */
    movd_m2r(*(src), mm0);        /* 4 source bytes */
    punpcklbw_r2r(mm6, mm0);      /* -> 4 unsigned words */
    /* Load factors */
    movq_m2r(*factors, mm2);      /* factors[0..1] (32 bit each) */
    movq_m2r(*(factors+2), mm3);  /* factors[2..3] */
    packssdw_r2r(mm3, mm2);       /* -> 4 signed words (saturating) */
    /* Multiply */
    pmaddwd_r2r(mm2, mm0);        /* two dwords: p0*f0+p1*f1, p2*f2+p3*f3 */
    psrld_i2r(14, mm0);           /* logical shift of each partial sum */
    MOVQ_R2M(mm0, tmp_mm);
    /* Add both partial sums; the store keeps only the low 8 bits. */
    *(dst++) = tmp_mm.d[0] + tmp_mm.d[1];
    }
  ctx->need_emms = 1;
  }
Пример #3
0
/* For a 16*h block, this computes the sum over all pixels of
   abs( (((*pf + *pf2 + 1)>>1) + ((*pb + *pb2 + 1)>>1) + 1)>>1  -  *p2 )
   i.e. the sum of absolute differences between p2 and the rounded average
   of the forward (pf, pf2) and backward (pb, pb2) predictions.

   lx is the byte distance between consecutive rows of all five buffers.
   h must be at least 1: the loop body runs before h is tested.
*/
static int bsad_0quad_mmxe(uint8_t *pf,uint8_t *pf2,uint8_t *pb,uint8_t *pb2,uint8_t *p2,int lx,int h)
{
    int32_t s=0;

    /* mm7 accumulates the SAD */
    pxor_r2r(mm7, mm7);
    do {
        /* Left (bytes 0..7) and right (bytes 8..15) halves of the row are
           processed in parallel: mm0/mm1 for the left, mm2/mm3 for the right. */
        movq_m2r(pf2[0],mm0);
        movq_m2r(pf2[8],mm2);
        movq_m2r(pb2[0],mm1);
        movq_m2r(pb2[8],mm3);
        pavgb_m2r(pf[0],mm0);      /* avg(pf, pf2) */
        pavgb_m2r(pf[8],mm2);
        pavgb_m2r(pb[0],mm1);      /* avg(pb, pb2) */
        pavgb_m2r(pb[8],mm3);
        pavgb_r2r(mm1,mm0);        /* avg(forward avg, backward avg) */
        pavgb_r2r(mm3,mm2);
        psadbw_m2r(p2[0],mm0);     /* sum of |prediction - p2| over 8 bytes */
        psadbw_m2r(p2[8],mm2);
        paddd_r2r(mm0,mm7);
        paddd_r2r(mm2,mm7);

        pf+=lx;
        pf2+=lx;
        pb+=lx;
        pb2+=lx;
        p2+=lx;

        h--;
    } while (h);
    movd_r2g(mm7,s);
    emms();
    return s;
}
Пример #4
0
/*
 * For an 8-byte-wide block of `height` rows: interpolate each pixel from its
 * four neighbours ref[0], ref[1], ref[stride] and ref[stride+1], then average
 * that with the byte already in dest and store it back.
 *
 * height: number of rows; must be >= 1 (do/while tests after the body).
 * stride: byte distance between rows of both dest and ref.
 * cpu:    not referenced directly here; presumably consumed by the pavg_*
 *         macros to pick the instruction variant -- confirm.
 *
 * mask_one is defined elsewhere (presumably 0x01 in every byte -- confirm).
 * MMX state is not cleared (no emms here).
 */
static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);		/* a = ref[0] */
	movq_m2r (*(ref+stride+1), mm1);	/* d = ref[stride+1] */
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);	/* b = ref[1] */
	pxor_r2r (mm1, mm7);		/* a ^ d */
	movq_m2r (*(ref+stride), mm3);	/* c = ref[stride] */
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);		/* b ^ c */
	pavg_r2r (mm1, mm0);		/* avg(a, d) */
	pavg_r2r (mm3, mm2);		/* avg(b, c) */
	por_r2r (mm6, mm7);		/* (a ^ d) | (b ^ c) */
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);		/* avg(a, d) ^ avg(b, c) */
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);	/* correction term for the cascaded averages */
	pavg_r2r (mm2, mm0);		/* avg(avg(a, d), avg(b, c)) */
	psubusb_r2r (mm7, mm0);		/* subtract the correction term */
	movq_m2r (*dest, mm1);
	pavg_r2r (mm1, mm0);		/* average with existing dest */
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}
Пример #5
0
/*
 * Store the rounded average of 8 bytes from src1 and 8 bytes from src2 into
 * dest, byte by byte.
 *
 * Precondition: mm0 is used as the interleave source for the byte->word
 * unpacks and is never written here, so it must already be zero on entry.
 * round1 is defined elsewhere (presumably four words of 1 -- confirm).
 * Clobbers mm1..mm4; does not issue emms.
 */
static inline void mmx_average_2_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows to mm1
    paddw_m2r (round1, mm1);    // add rounding term
    psraw_i2r (1, mm1);         // /2

    paddw_r2r (mm4, mm2);       // add highs to mm2
    paddw_m2r (round1, mm2);    // add rounding term
    psraw_i2r (1, mm2);         // /2

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
Пример #6
0
/*
 * Average 8 bytes of src1 with 8 bytes of src2, then average that result
 * with the 8 bytes already in dest and store it back to dest.
 *
 * Each average is done without widening to words, as
 *     (a | b) - (((a ^ b) & mask1) >> 1)
 * mask1 is defined elsewhere (presumably 0xfe in every byte, so that the
 * 64-bit shift cannot leak bits between neighbouring bytes -- confirm).
 * Clobbers mm1..mm6; does not issue emms.
 */
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
					    const uint8_t * src1,
					    const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);	/* load 8 dest bytes */
    movq_r2r (mm1, mm2);	/* copy 8 dest bytes */

    movq_m2r (*src1, mm3);	/* load 8 src1 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);	/* load 8 src2 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);	/* xor src1 and src2 */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm4, mm6);		/* or src1 and src2 */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2r (mm6, mm5);	/* copy subresult */

    pxor_r2r (mm1, mm5);	/* xor srcavg and dest */
    pand_m2r (mask1, mm5);	/* mask lower bits */
    psrlq_i2r (1, mm5);		/* /2 */
    por_r2r (mm2, mm6);		/* or srcavg and dest */
    psubb_r2r (mm5, mm6);	/* subtract subresults */
    movq_r2m (mm6, *dest);	/* store result in dest */
}
Пример #7
0
/*
 * Second half of an 8-pixel vertical filter step: finishes the filtered
 * value, builds a per-byte selection mask from two thresholds and writes
 * either the filtered byte or the original src3 byte to dst[X..X+7].
 *
 * Register contract: every one of mm0..mm7 is read before it is written
 * here, so all of them must have been prepared by earlier code (presumably
 * by mmx_start -- confirm):
 *   mm0/mm1  low/high word accumulators,
 *   mm2/mm3  words that are subtracted from the accumulators first,
 *   mm4/mm5  raw bytes that are widened here,
 *   mm6/mm7  words that are subtracted from the widened mm4/mm5.
 *
 * mm_cpool[0], mm_lthr and mm_hthr are defined elsewhere (presumably an
 * all-zero constant and the low/high thresholds -- confirm).
 * Does not issue emms.
 */
static inline void mmx_end(uint8_t *src3, uint8_t *src5,
                           uint8_t *dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);   /* widen low 4 bytes of mm4 */
    punpckhbw_m2r (mm_cpool[0], mm5);   /* widen high 4 bytes of mm5 */
    psubusw_r2r (mm2, mm0);             /* acc -= mm2/mm3 (unsigned saturating) */
    psubusw_r2r (mm3, mm1);
    movq_m2r (src5[X], mm2);            /* same 8 bytes loaded twice: one copy */
    movq_m2r (src5[X], mm3);            /* for the low half, one for the high */
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);             /* acc -= src5 (unsigned saturating) */
    psubusw_r2r (mm3, mm1);
    psrlw_i2r (3, mm0);                 /* acc /= 8 */
    psrlw_i2r (3, mm1);
    psubw_r2r (mm6, mm4);               /* diff = widened mm4/mm5 - mm6/mm7 */
    psubw_r2r (mm7, mm5);
    packuswb_r2r (mm1,mm0);             /* mm0 = 8 filtered bytes */
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    pcmpgtw_m2r (mm_lthr, mm4);         /* diff > mm_lthr */
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);         /* diff > mm_hthr */
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);            /* word masks -> byte masks */
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);                /* set where exactly one comparison holds */
    movq_r2r (mm4, mm5);
    pandn_r2r (mm0, mm4);               /* filtered bytes where the mask is clear */
    pand_m2r (src3[X], mm5);            /* original src3 bytes where it is set */
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
Пример #8
0
/*
 * First half of an 8-pixel vertical filter step.  Loads 8 bytes at offset X
 * from each of src1..src4 and leaves intermediate values in MMX registers
 * for code that runs afterwards; nothing is written to memory.
 *
 * Register state on return ("low"/"high" = bytes 0..3 / 4..7 widened to
 * 16-bit words):
 *   mm0/mm1  4*(src2 + src4) + 2*src3   (low/high)
 *   mm2/mm3  src1                       (low/high)
 *   mm4/mm5  raw, un-widened src3 bytes (both registers hold the same 8 bytes)
 *   mm6/mm7  src2                       (low/high)
 *
 * mm_cpool[0] is defined elsewhere; it is used as the interleave source for
 * the unpacks (presumably all zero, giving zero extension -- confirm).
 */
static inline void mmx_start(uint8_t *src1, uint8_t *src2,
                             uint8_t *src3, uint8_t *src4,
                             int X)
{
    /* Each source is loaded twice: one copy per 4-byte half. */
    movq_m2r (src2[X], mm0);
    movq_m2r (src2[X], mm1);
    movq_m2r (src4[X], mm2);
    movq_m2r (src4[X], mm3);
    movq_m2r (src3[X], mm4);
    movq_m2r (src3[X], mm5);
    punpcklbw_m2r (mm_cpool[0], mm0);
    punpckhbw_m2r (mm_cpool[0], mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    movq_r2r (mm0, mm6);                /* keep widened src2 */
    movq_r2r (mm1, mm7);
    paddw_r2r (mm2, mm0);               /* src2 + src4 */
    paddw_r2r (mm3, mm1);
    movq_m2r (src3[X], mm2);
    movq_m2r (src3[X], mm3);
    psllw_i2r (2, mm0);                 /* 4*(src2 + src4) */
    psllw_i2r (2, mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psllw_i2r (1, mm2);                 /* 2*src3 */
    psllw_i2r (1, mm3);
    paddw_r2r (mm2, mm0);               /* 4*(src2 + src4) + 2*src3 */
    paddw_r2r (mm3, mm1);
    movq_m2r (src1[X], mm2);
    movq_m2r (src1[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);   /* widened src1, left for later code */
    punpckhbw_m2r (mm_cpool[0], mm3);
}
Пример #9
0
/*
 * Horizontally scale one scanline whose pixels are 8 bytes each (four 16-bit
 * components), using ctx->table_h.factors_per_pixel filter taps per output
 * pixel.
 *
 * ctx:        scaler context; reads src, src_stride, dst_size,
 *             table_h.factors_per_pixel and table_h.pixels[i].index/.factor_i.
 * scanline:   source row number (multiplied by ctx->src_stride).
 * dest_start: first output byte; 8 bytes are written per destination pixel.
 *
 * factor_mask, min_13, max_13 and LOAD_FACTOR_1_4 are defined elsewhere.
 * LOAD_FACTOR_1_4 presumably broadcasts *factors into the four words of mm7
 * using mm1 -- confirm.  MMX state is left dirty; ctx->need_emms is set.
 */
static void scale_uint16_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i, j;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input
 *  mm1: factor_mask
 *  mm2: Factor
 *  mm3: Output
 *  mm4: 
 *  mm5: 
 *  mm6: 0
 *  mm7: scratch
 *  
 */
  
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    /* 8 bytes per source pixel */
    src = src_start + 8*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    /* Clear the accumulator for this output pixel */
    pxor_r2r(mm3, mm3);

    for(j = 0; j < ctx->table_h.factors_per_pixel; j++)
      {
      /* Load pixels */
      movq_m2r(*(src), mm0);
      /* Halve each component so it fits the signed multiply below */
      psrlw_i2r(1, mm0);
      /* Load factors */
      LOAD_FACTOR_1_4;
      /* Multiply */
      pmulhw_r2r(mm7, mm0);   /* keep the high 16 bits of each product */
      paddw_r2r(mm0, mm3);    /* accumulate */
      //    DUMP_MM("mm3_2", mm3);
      src += 8;
      factors++;
      }
    /* Clamp the accumulated value to [min_13, max_13] */
    pminsw_m2r(max_13, mm3);
    pmaxsw_m2r(min_13, mm3);
    
    /* Scale back up by 8 before storing */
    psllw_i2r(3, mm3);
    MOVQ_R2M(mm3, *dst);
    
    dst+=8;
    }
  ctx->need_emms = 1;
  
  }
Пример #10
0
/*
 * Copy a 16-byte-wide block of `height` rows from ref to dest.
 *
 * height: number of rows; must be >= 1 (the first row is copied before the
 *         counter is tested).
 * stride: byte distance between consecutive rows of both dest and ref.
 *
 * Uses mm0/mm1 as scratch and does not issue emms.
 */
static inline void MC_put1_16 (int height, uint8_t * dest, const uint8_t * ref,
			       const int stride)
{
    for (;;) {
	/* Fetch both 8-byte halves of the row before writing anything. */
	movq_m2r (ref[0], mm0);
	movq_m2r (ref[8], mm1);
	movq_r2m (mm0, dest[0]);
	movq_r2m (mm1, dest[8]);

	/* Step both pointers to the next row. */
	ref += stride;
	dest += stride;

	if (--height == 0)
	    break;
    }
}
Пример #11
0
/*
 * Store the rounded average of 8 bytes from each of src1..src4 into dest,
 * byte by byte.
 *
 * Precondition: mm0 is used as the interleave source for every byte->word
 * unpack and is never written here, so it must already be zero on entry.
 * round4 is defined elsewhere (presumably four words of 2 -- confirm).
 * Clobbers mm1..mm6; does not issue emms.
 */
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
				     const uint8_t * src2,
				     const uint8_t * src3,
				     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);	/* load 8 src1 bytes */
    movq_r2r (mm1, mm2);	/* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);	/* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);	/* unpack high src1 bytes */

    movq_m2r (*src2, mm3);	/* load 8 src2 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);	/* load 8 src3 bytes */
    movq_r2r (mm3, mm4);	/* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);	/* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);	/* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);	/* add lows */
    paddw_r2r (mm4, mm2);	/* add highs */

    movq_m2r (*src4, mm5);	/* load 8 src4 bytes */
    movq_r2r (mm5, mm6);	/* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);	/* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);	/* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);	/* add lows */
    paddw_r2r (mm6, mm2);	/* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);	/* add rounding term */
    psraw_i2r (2, mm1);		/* /4 */
    paddw_m2r (round4, mm2);	/* add rounding term */
    psraw_i2r (2, mm2);		/* /4 */

    packuswb_r2r (mm2, mm1);	/* pack (w/ saturation) */
    movq_r2m (mm1, *dest);	/* store result in dest */
}
Пример #12
0
/*
 * Store the rounded average of 8 bytes from each of src1..src4 into dest,
 * byte by byte.
 *
 * Precondition: mm0 is used as the interleave source for every byte->word
 * unpack and is never written here, so it must already be zero on entry.
 * round4 is defined elsewhere (presumably four words of 2 -- confirm).
 * Clobbers mm1..mm6; does not issue emms.
 */
static inline void mmx_average_4_U8 (uint8_t * dest,
                                     uint8_t * src1, uint8_t * src2,
                                     uint8_t * src3, uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      // load 8 src3 bytes
    movq_r2r (mm3, mm4);        // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src3 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    movq_m2r (*src4, mm5);      // load 8 src4 bytes
    movq_r2r (mm5, mm6);        // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);   // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);   // unpack high src4 bytes

    paddw_r2r (mm5, mm1);       // add lows
    paddw_r2r (mm6, mm2);       // add highs

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);    // add rounding term
    psraw_i2r (2, mm1);         // /4
    paddw_m2r (round4, mm2);    // add rounding term
    psraw_i2r (2, mm2);         // /4

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
Пример #13
0
/*
 * Write a 16-byte-wide block of `height` rows to dest, where every output
 * byte is the pavg_* average of ref[k] and ref[k + offset].
 *
 * height: number of rows; must be >= 1 (tested only after the first row).
 * stride: byte distance between rows of both dest and ref.
 * offset: byte distance from each reference pixel to its averaging partner.
 * cpu:    not referenced directly; presumably consumed by the pavg_m2r
 *         macro -- confirm.
 *
 * Uses mm0/mm1 as scratch and does not issue emms.
 */
static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    for (;;) {
        /* Load both halves of the row, then fold in the partner pixels. */
        movq_m2r (ref[0], mm0);
        movq_m2r (ref[8], mm1);
        pavg_m2r (ref[offset], mm0);
        pavg_m2r (ref[offset + 8], mm1);

        movq_r2m (mm0, dest[0]);
        movq_r2m (mm1, dest[8]);

        ref += stride;
        dest += stride;

        if (--height == 0)
            break;
    }
}
Пример #14
0
/*
 * Linear deinterlace of one scanline: every output byte is the average of
 * the byte at the same position in the line above (scanlines->t0) and in the
 * line below (scanlines->b0).
 *
 * self, parent: not used here.
 * out:          destination scanline.
 * scanlines:    supplies the t0 (top) and b0 (bottom) source lines.
 * width:        line length in pixels; every loop below moves 2 bytes per
 *               pixel.
 *
 * The line is processed as 16-pixel (32-byte) MMX chunks, then 4-pixel
 * (8-byte) MMX chunks, then a scalar tail for the remaining 0..3 pixels.
 *
 * NOTE(review): pavgb rounds up ((a + b + 1) >> 1) while the scalar tail
 * truncates ((a + b) >> 1), so the last few bytes can differ by one from
 * what the MMX path would produce.
 * NOTE(review): the movntq stores are not followed by an sfence here --
 * confirm that callers do not rely on store ordering.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  gint i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  /* 16 pixels (32 bytes) per iteration. */
  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  /* 4 pixels (8 bytes) per iteration. */
  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  /* The loop above consumed every whole group of 4 pixels, so only
   * width % 4 pixels are left.  (Masking with 0x7 here made the tail run
   * 8 bytes past the end of all three buffers whenever width % 8 >= 4.) */
  width = width & 0x3;

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }

  emms ();
}
/*
 * Convert unsigned 8-bit samples to floats: dst[k] = (float) src[k].
 *
 * src: input bytes.
 * dst: output floats.
 * l:   number of samples.  The loop always converts 8 samples per
 *      iteration, so if l is not a multiple of 8 it reads and writes past
 *      index l-1 (up to the next multiple of 8).
 *
 * Issues emms before returning.
 */
static void frame_i2f_sse(u_char *src,float *dst,int l)
{
    int i;

    /* mm7 = 0, used to zero-extend bytes -> words -> dwords */
    pxor_r2r(mm7,mm7);

    for( i=0; i<l; i+=8 ) {
        movq_m2r(*src,mm0);          /* 8 source bytes */
        movq_r2r(mm0, mm2);
        punpcklbw_r2r(mm7, mm0);     /* bytes 0..3 as words */
        punpckhbw_r2r(mm7, mm2);     /* bytes 4..7 as words */
        movq_r2r(mm0, mm1);
        movq_r2r(mm2, mm3);
        punpcklwd_r2r(mm7, mm0);     /* samples 0,1 as dwords */
        punpckhwd_r2r(mm7, mm1);     /* samples 2,3 as dwords */
        punpcklwd_r2r(mm7, mm2);     /* samples 4,5 as dwords */
        punpckhwd_r2r(mm7, mm3);     /* samples 6,7 as dwords */
        cvtpi2ps_r2r(mm0,xmm0);      /* two int32 -> two floats (low half of xmm) */
        cvtpi2ps_r2r(mm1,xmm1);
        cvtpi2ps_r2r(mm2,xmm2);
        cvtpi2ps_r2r(mm3,xmm3);
        movlps_r2m(xmm0,dst[0]);     /* each store writes two floats */
        movlps_r2m(xmm1,dst[2]);
        movlps_r2m(xmm2,dst[4]);
        movlps_r2m(xmm3,dst[6]);

        src+=8;
        dst+=8;
    }
    emms();
}
Пример #16
0
/*
 * Linear deinterlace of one scanline: every output byte is the average of
 * the byte at the same position in the bottom and top source lines.
 *
 * self: not used here.
 * out:  destination scanline.
 * bot:  source line below the one being produced.
 * top:  source line above the one being produced.
 * size: line length in bytes.
 *
 * The line is processed as 32-byte MMX chunks, then 8-byte MMX chunks, then
 * a scalar tail for the remaining 0..7 bytes.
 *
 * NOTE(review): pavgb rounds up ((a + b + 1) >> 1) while the scalar tail
 * truncates ((a + b) >> 1), so tail bytes can differ by one from what the
 * MMX path would produce.
 * NOTE(review): the movntq stores are not followed by an sfence here --
 * confirm that callers do not rely on store ordering.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  gint i;

  /* 32 bytes per iteration. */
  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  /* 8 bytes per iteration. */
  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  /* The loop above consumed every whole group of 8 bytes, so only size % 8
   * bytes are left.  (Masking with 0xf here made the tail run 8 bytes past
   * the end of all three buffers whenever size % 16 >= 8.) */
  size = size & 0x7;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
Пример #17
0
/*
 * Interpolate one scanline: every output byte is the average of the byte at
 * the same position in the top and bottom source lines.
 *
 * output: destination scanline.
 * top:    source line above the one being produced.
 * bot:    source line below the one being produced.
 * width:  line length in pixels; every loop below moves 2 bytes per pixel.
 *
 * The line is processed as 16-pixel (32-byte) MMX chunks, then 4-pixel
 * (8-byte) MMX chunks, then a scalar tail for the remaining 0..3 pixels.
 * The non-temporal stores are fenced with sfence and MMX state is released
 * with emms before returning.
 *
 * NOTE(review): pavgb rounds up ((a + b + 1) >> 1) while the scalar tail
 * truncates ((a + b) >> 1), so tail bytes can differ by one from what the
 * MMX path would produce.
 */
static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top,
                                                   uint8_t *bot, int width )
{
    int i;

    /* 16 pixels (32 bytes) per iteration. */
    for( i = width/16; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        movq_m2r( *(bot + 8), mm2 );
        movq_m2r( *(top + 8), mm3 );
        movq_m2r( *(bot + 16), mm4 );
        movq_m2r( *(top + 16), mm5 );
        movq_m2r( *(bot + 24), mm6 );
        movq_m2r( *(top + 24), mm7 );
        pavgb_r2r( mm1, mm0 );
        pavgb_r2r( mm3, mm2 );
        pavgb_r2r( mm5, mm4 );
        pavgb_r2r( mm7, mm6 );
        movntq_r2m( mm0, *output );
        movntq_r2m( mm2, *(output + 8) );
        movntq_r2m( mm4, *(output + 16) );
        movntq_r2m( mm6, *(output + 24) );
        output += 32;
        top += 32;
        bot += 32;
    }
    width = (width & 0xf);

    /* 4 pixels (8 bytes) per iteration. */
    for( i = width/4; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        pavgb_r2r( mm1, mm0 );
        movntq_r2m( mm0, *output );
        output += 8;
        top += 8;
        bot += 8;
    }
    /* The loop above consumed every whole group of 4 pixels, so only
     * width % 4 pixels are left.  (Masking with 0x7 here made the tail run
     * 8 bytes past the end of all three buffers whenever width % 8 >= 4.) */
    width = width & 0x3;

    /* Handle last few pixels. */
    for( i = width * 2; i; --i ) {
        *output++ = ((*top++) + (*bot++)) >> 1;
    }

    sfence();
    emms();
}
Пример #18
0
/*
 * Horizontally scale one scanline of 4-byte pixels (four 8-bit components)
 * by blending each pair of neighbouring source pixels with one factor.
 *
 * Per component the result is  s2 + factor * (s1 - s2),  where s1 is the
 * pixel at table_h.pixels[i].index and s2 is the pixel 4 bytes after it.
 *
 * ctx:        scaler context; reads src, src_stride, dst_size and
 *             table_h.pixels[i].index/.factor_i.
 * scanline:   source row number (multiplied by ctx->src_stride).
 * dest_start: first output byte; 4 bytes are written per destination pixel.
 *
 * factor_mask and LOAD_FACTOR_1_4_NOCLIP are defined elsewhere; the macro
 * presumably broadcasts *factors into the four words of mm7 -- confirm.
 * MMX state is left dirty; ctx->need_emms is set.
 */
static void scale_uint8_x_4_x_bilinear_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input1
 *  mm1: Factor mask
 *  mm2: 
 *  mm3: Output
 *  mm4: 
 *  mm5: Input2
 *  mm6: 0
 *  mm7: Factor
 *  
 */

//  fprintf(stderr, "scale_uint8_x_4_x_bilinear_mmx\n");
  
  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 4*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    
    /* Load first pixel (s1) */
    movd_m2r(*(src), mm0);
    punpcklbw_r2r(mm6, mm0);
    psllw_i2r(6, mm0); /* 14 bit */
    /* Load the following pixel (s2) */
    movd_m2r(*(src+4), mm5);
    punpcklbw_r2r(mm6, mm5);
    psllw_i2r(6, mm5); /* 14 bit */

    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP; /* 14 bit */
    /* Subtract */
    psubsw_r2r(mm5, mm0); /* s1(mm0) - s2(mm5) -> mm0 (14 bit) */
    pmulhw_r2r(mm7, mm0); /* factor * (s1 - s2) -> mm0 (12 bit) */
    
    psllw_i2r(2, mm0); /* (14 bit) */
    
    paddsw_r2r(mm5, mm0);/* s2 + factor * (s1 - s2) (15 bit) */
    
    psraw_i2r(6, mm0);/* (8 bit) */
    packuswb_r2r(mm6, mm0);
    movd_r2m(mm0, *dst);
    
    dst+=4;
    }
  ctx->need_emms = 1;
  
  }
Пример #19
0
/*
 * Copy an 8-byte-wide block of `height` rows from ref to dest.
 *
 * height: number of rows; must be >= 1 (the first row is copied before the
 *         counter is tested).
 * stride: byte distance between consecutive rows of both dest and ref.
 *
 * Uses mm0 as scratch and does not issue emms.
 */
static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride)
{
    for (;;) {
        /* One 8-byte row through mm0. */
        movq_m2r (ref[0], mm0);
        movq_r2m (mm0, dest[0]);

        dest += stride;
        ref += stride;

        if (--height == 0)
            break;
    }
}
Пример #20
0
/*
 * Load the first rows of a block into MMX registers, 8 bytes per row:
 * row 0 -> mm0, row 1 -> mm1 and, unless h == 2, row 2 -> mm2 and
 * row 3 -> mm3.  Nothing is written to memory.
 *
 * blk:       first byte of row 0.
 * rowstride: byte distance between consecutive rows.
 * h:         only compared against 2; every other value loads four rows.
 */
static __inline__ void load_blk(uint8_t *blk, uint32_t rowstride, int h)
{
// Required to get GCC 4.0 to use the right registers as the source argument to
// movq
	uint8_t *blk2 = blk + rowstride * 2;

	movq_m2r( *blk, mm0);
	blk += rowstride;
	movq_m2r( *blk, mm1);


	/* Two-row blocks are done here. */
	if( h == 2 )
		return;


	movq_m2r( *blk2, mm2);
	blk2 += rowstride;
	movq_m2r( *blk2, mm3);
}
Пример #21
0
/*
 * For 8 horizontally adjacent reference pixels at once, scan a
 * radius_count x radius_count window of neighbours and accumulate, per
 * reference pixel, how many neighbours are within `threshold` of it and the
 * sum of their signed differences.
 *
 * refpix:       8 reference bytes.
 * pixel:        top-left byte of the window for the first reference pixel;
 *               each step of the inner loop moves one byte to the right and
 *               reloads 8 bytes, so lane k always compares against the
 *               window of reference pixel k.
 * radius_count: window width and height.
 * row_stride:   byte distance between window rows.
 * threshold:    only the low byte is used (it is broadcast to all 8 lanes).
 * diff:         receives 8 bytes: the byte-wise (wrapping) sum of the
 *               accepted pixel - refpix differences.
 * count:        receives 8 bytes: the byte-wise (wrapping) number of
 *               accepted pixels.
 *
 * Issues emms before returning.
 */
static inline void mean8(unsigned char *refpix,unsigned char *pixel,int radius_count,int row_stride,int threshold,int8_t *diff,unsigned char *count)
{
    int a,b;

    pxor_r2r(mm6,mm6); // mm6 (aka count) = 0
    pxor_r2r(mm7,mm7); // mm7 (aka diff) = 0
    movq_m2r(*refpix,mm3); // mm3 = refpix[0]

    movd_g2r(0x80808080,mm4); // mm4 = 128
    punpcklbw_r2r(mm4,mm4);   // ... in all 8 bytes

    pxor_r2r(mm4,mm3); // mm3 = refpix[0]-128

    movd_g2r(threshold,mm5); // mm5 = threshold
    punpcklbw_r2r(mm5,mm5);  // three self-unpacks broadcast the low byte
    punpcklbw_r2r(mm5,mm5);  // to all 8 lanes
    punpcklbw_r2r(mm5,mm5);

    for( b=0; b<radius_count; b++ ) {
        for( a=0; a<radius_count; a++ ) {
            movq_m2r(*pixel,mm0); // mm0  = pixel[0]
            pxor_r2r(mm4,mm0);    // mm0  = pixel[0]-128
            movq_r2r(mm3,mm2);    // mm2  = refpix[0]-128
            psubsb_r2r(mm0,mm2);  // mm2  = refpix[0]-pixel[0]
            psubsb_r2r(mm3,mm0);  // mm0  = pixel[0]-refpix[0]
            pminub_r2r(mm0,mm2);  // mm2  = abs(pixel[0]-refpix[0])
            movq_r2r(mm5,mm1);    // mm1  = threshold
            pcmpgtb_r2r(mm2,mm1); // mm1  = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
            psubb_r2r(mm1,mm6);   // mm6 += (threshold > abs(pixel[0]-refpix[0]))
            pand_r2r(mm1,mm0);    // mm0  = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            paddb_r2r(mm0,mm7);   // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0

            ++pixel;
        }
        // back to the first column of the window, one row down
        pixel += row_stride - radius_count;
    }

    movq_r2m(mm6,*count);
    movq_r2m(mm7,*diff);

    emms();
 
}
Пример #22
0
/*
 * Copy a block of `height` rows from ref to dest.  The left 8 bytes of
 * every row are always copied; the right 8 bytes are copied as well when
 * width == 16.
 *
 * width:  16 selects the 16-byte-wide path; any other value copies 8 bytes
 *         per row.
 * height: number of rows; must be >= 1 (tested only after the first row).
 * stride: byte distance between consecutive rows of both dest and ref.
 *
 * mmx_zero_reg() is defined elsewhere and is called once up front
 * (presumably it clears mm0 -- confirm).  Uses mm1 as scratch and does not
 * issue emms.
 */
static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
			       const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    for (;;) {
	/* Left half of the row. */
	movq_m2r (ref[0], mm1);
	movq_r2m (mm1, dest[0]);

	/* Right half, only for 16-wide blocks. */
	if (width == 16) {
	    movq_m2r (ref[8], mm1);
	    movq_r2m (mm1, dest[8]);
	}

	ref += stride;
	dest += stride;

	if (--height == 0)
	    break;
    }
}
Пример #23
0
/*
 * For an 8-byte-wide block of `height` rows, replace every dest byte with
 * the pavg_* average of the matching ref byte and the current dest byte.
 *
 * height: number of rows; must be >= 1 (tested only after the first row).
 * stride: byte distance between consecutive rows of both dest and ref.
 * cpu:    not referenced directly; presumably consumed by the pavg_m2r
 *         macro -- confirm.
 *
 * Uses mm0 as scratch and does not issue emms.
 */
static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
			      const int stride, const int cpu)
{
    for (;;) {
	/* mm0 = avg(ref row, dest row), written back over dest. */
	movq_m2r (ref[0], mm0);
	pavg_m2r (dest[0], mm0);
	movq_r2m (mm0, dest[0]);

	ref += stride;
	dest += stride;

	if (--height == 0)
	    break;
    }
}
Пример #24
0
/*
 * For an 8-byte-wide block of `height` rows: average ref[k] with
 * ref[k + offset], then average that with the current dest byte and store
 * the result back to dest.
 *
 * height: number of rows; must be >= 1 (tested only after the first row).
 * stride: byte distance between consecutive rows of both dest and ref.
 * offset: byte distance from each reference pixel to its averaging partner.
 * cpu:    not referenced directly; presumably consumed by the pavg_m2r
 *         macro -- confirm.
 *
 * Uses mm0 as scratch and does not issue emms.
 */
static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    for (;;) {
        movq_m2r (ref[0], mm0);
        pavg_m2r (ref[offset], mm0);    /* interpolated reference */
        pavg_m2r (dest[0], mm0);        /* blended with existing dest */
        movq_r2m (mm0, dest[0]);

        ref += stride;
        dest += stride;

        if (--height == 0)
            break;
    }
}
Пример #25
0
/*
 * Fill an 8x8 destination block from four source lines of one field.
 *
 * For each of the 4 line pairs: the even destination row is a copy of the
 * current source line, and the odd destination row is the pavgb average of
 * that line and the source line 2*i_src bytes further on.
 *
 * dst, i_dst: destination block and its row pitch in bytes.
 * src, i_src: source block and its row pitch in bytes (the source is
 *             stepped by 2*i_src per pair).
 *
 * Uses mm0/mm1 as scratch and does not issue emms.
 */
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int i_pair;

    /* Interlaced */
    for( i_pair = 4; i_pair > 0; i_pair-- )
    {
        uint8_t *p_next = src + 2*i_src;

        /* Even row: straight copy of the field line. */
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );

        /* Odd row: average of this field line and the next one. */
        movq_m2r( p_next[0], mm1 );
        pavgb_r2r( mm1, mm0 );
        movq_r2m( mm0, dst[i_dst] );

        dst += 2*i_dst;
        src = p_next;
    }
}
Пример #26
0
/*
 * Write an 8-byte-wide block of `height` rows to dest, where each output
 * byte is interpolated from the four reference bytes ref[0], ref[1],
 * ref[stride] and ref[stride+1].
 *
 * The loop is software-pipelined: the horizontal average and xor of one
 * reference row are computed once and carried over (in mm0 and mm7) to
 * serve as the "upper row" of the next iteration, so height+1 reference
 * rows are read in total.
 *
 * height: number of output rows; must be >= 1 (do/while tests after the
 *         body).
 * stride: byte distance between rows of both dest and ref.
 * cpu:    not referenced directly here; presumably consumed by the pavg_*
 *         macros -- confirm.
 *
 * mask_one is defined elsewhere (presumably 0x01 in every byte -- confirm).
 * MMX state is not cleared (no emms here).
 */
static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    /* Prologue: prepare the first row. */
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);        /* mm7 = ref[0] ^ ref[1] */
    pavg_r2r (mm1, mm0);        /* mm0 = avg(ref[0], ref[1]) */
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);    /* xor of the lower row's pair */
        pavg_r2r (mm3, mm2);    /* horizontal average of the lower row */

        por_r2r (mm6, mm7);     /* upper xor | lower xor */
        pxor_r2r (mm2, mm5);    /* upper avg ^ lower avg */

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);    /* avg(upper avg, lower avg) */

        pand_m2r (mask_one, mm7);       /* correction term for the cascaded averages */

        psubusb_r2r (mm7, mm0);         /* subtract the correction term */

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        /* The lower row becomes the upper row of the next iteration. */
        movq_r2r (mm6, mm7);    // unroll !
        movq_r2r (mm2, mm0);    // unroll !
    } while (--height);
}
Пример #27
0
/*
 * Copy n bytes from s to d using MMX loads and non-temporal (movntq)
 * stores.
 *
 * Nothing at all is done when d and s are the same address.  Otherwise the
 * bulk is copied in 64-byte chunks while more than 64 bytes remain, then in
 * 8-byte chunks while more than 8 bytes remain, and whatever is left (at
 * most 8 bytes) is handed to small_memcpy().  The copy is finished with
 * sfence and emms.
 */
static void fast_memcpy_mmxext( void *d, const void *s, size_t n )
{
    const uint8_t *in = s;
    uint8_t *out = d;

    /* Copying a buffer onto itself is a no-op. */
    if( out == in ) return;

    /* Bulk: eight quadwords per pass. */
    for( ; n > 64; n -= 64 ) {
        movq_m2r( in[ 0 ], mm0 );
        movq_m2r( in[ 8 ], mm1 );
        movq_m2r( in[ 16 ], mm2 );
        movq_m2r( in[ 24 ], mm3 );
        movq_m2r( in[ 32 ], mm4 );
        movq_m2r( in[ 40 ], mm5 );
        movq_m2r( in[ 48 ], mm6 );
        movq_m2r( in[ 56 ], mm7 );
        movntq_r2m( mm0, out[ 0 ] );
        movntq_r2m( mm1, out[ 8 ] );
        movntq_r2m( mm2, out[ 16 ] );
        movntq_r2m( mm3, out[ 24 ] );
        movntq_r2m( mm4, out[ 32 ] );
        movntq_r2m( mm5, out[ 40 ] );
        movntq_r2m( mm6, out[ 48 ] );
        movntq_r2m( mm7, out[ 56 ] );
        out += 64;
        in += 64;
    }

    /* Medium: one quadword per pass. */
    for( ; n > 8; n -= 8 ) {
        movq_m2r( in[ 0 ], mm0 );
        movntq_r2m( mm0, out[ 0 ] );
        out += 8;
        in += 8;
    }

    /* Remainder. */
    if( n != 0 ) small_memcpy( out, in, n );

    sfence();
    emms();
}
Пример #28
0
/*
 * Horizontally scale one scanline of single-channel 16-bit pixels with a
 * 4-tap filter.
 *
 * ctx:        scaler context; reads src, src_stride, dst_size, plane and
 *             table_h.pixels[i].index/.factor_i.
 * scanline:   source row number (multiplied by ctx->src_stride).
 * dest_start: output buffer, written as uint16_t, one value per destination
 *             pixel.
 *
 * RECLIP and MOVQ_R2M are defined elsewhere; RECLIP presumably clamps tmp to
 * the valid range for ctx->plane -- confirm.  MMX state is left dirty;
 * ctx->need_emms is set.
 */
static void scale_uint16_x_1_x_bicubic_mmx(gavl_video_scale_context_t * ctx, int scanline, uint8_t * dest_start)
  {
  int i;
  uint16_t * dst;
  uint8_t * src, *src_start;
  int32_t * factors;
  mmx_t tmp_mm;
  int32_t tmp;
  
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_mmx\n");

  src_start = ctx->src + scanline * ctx->src_stride;
  
  pxor_r2r(mm6, mm6);
  dst = (uint16_t*)dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    /* 2 bytes per source pixel */
    src = src_start + 2*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    /* Load pixels */
    movq_m2r(*(src), mm0);        /* 4 source pixels */
    psrlw_i2r(1, mm0);            /* halve, so each fits a signed word */
    //    DUMP_MM("mm0", mm0);
    /* Load factors */
    movq_m2r(*factors, mm2);      /* factors[0..1] (32 bit each) */
    movq_m2r(*(factors+2), mm3);  /* factors[2..3] */
    packssdw_r2r(mm3, mm2);       /* -> 4 signed words (saturating) */
    /* Multiply */
    pmaddwd_r2r(mm2, mm0);        /* two dwords: p0*f0+p1*f1, p2*f2+p3*f3 */
    MOVQ_R2M(mm0, tmp_mm);
    tmp = tmp_mm.d[0] + tmp_mm.d[1];
    tmp >>= 13;
    RECLIP(tmp, ctx->plane);
    *(dst++) = tmp;
    }
  ctx->need_emms = 1;
  }
Пример #29
0
/*
 * Deinterlace one line with a 5-tap vertical filter.  For every byte
 * position:
 *
 *   dst = (4*(lum_m3 + lum_m1) + 2*lum_m2 + 4 - (lum_m4 + lum)) >> 3
 *
 * with the subtraction saturating at 0 and the result saturated to a byte.
 *
 * dst:                    output line.
 * lum_m4 .. lum_m1, lum:  the five input lines feeding the filter.
 * size:                   number of bytes to produce.
 *
 * Four bytes are produced per MMX iteration; the remaining 0..3 bytes are
 * delegated to deinterlace_line_c() (defined elsewhere) after emms.
 */
static void
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
    uint8_t * lum_m3, uint8_t * lum_m2,
    uint8_t * lum_m1, uint8_t * lum, int size)
{
  mmx_t rounder;

  /* Rounding term: 4 in each of the four words. */
  rounder.uw[0] = 4;
  rounder.uw[1] = 4;
  rounder.uw[2] = 4;
  rounder.uw[3] = 4;
  pxor_r2r (mm7, mm7);          /* mm7 = 0 for byte -> word widening */
  movq_m2r (rounder, mm6);

  for (; size > 3; size -= 4) {
    movd_m2r (*lum_m4, mm0);
    movd_m2r (*lum_m3, mm1);
    movd_m2r (*lum_m2, mm2);
    movd_m2r (*lum_m1, mm3);
    movd_m2r (*lum, mm4);
    punpcklbw_r2r (mm7, mm0);
    punpcklbw_r2r (mm7, mm1);
    punpcklbw_r2r (mm7, mm2);
    punpcklbw_r2r (mm7, mm3);
    punpcklbw_r2r (mm7, mm4);
    paddw_r2r (mm3, mm1);       /* lum_m3 + lum_m1 */
    psllw_i2r (1, mm2);         /* 2 * lum_m2 */
    paddw_r2r (mm4, mm0);       /* lum_m4 + lum */
    psllw_i2r (2, mm1);         // 2
    paddw_r2r (mm6, mm2);       /* + rounding term */
    paddw_r2r (mm2, mm1);       /* sum of the positive taps */
    psubusw_r2r (mm0, mm1);     /* minus the outer taps, saturating at 0 */
    psrlw_i2r (3, mm1);         // 3
    packuswb_r2r (mm7, mm1);
    movd_r2m (mm1, *dst);
    lum_m4 += 4;
    lum_m3 += 4;
    lum_m2 += 4;
    lum_m1 += 4;
    lum += 4;
    dst += 4;
  }
  emms ();

  /* Handle odd widths */
  if (size > 0)
    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
Пример #30
0
/*
 * Horizontal sum: treat *accs as four signed 16-bit words, sign-extend each
 * to 32 bits, add all four and store the 32-bit total in *res.
 *
 * Clobbers mm1..mm3; does not issue emms.
 */
static __inline__ void
mmx_sum_4_word_accs( mmx_t *accs, int32_t *res )
{
	movq_m2r( *accs, mm1 );
	movq_r2r( mm1, mm3 );
	movq_r2r( mm1, mm2 );
	/* Generate sign extensions for mm1 words! */
	psraw_i2r( 15, mm3 );
	punpcklwd_r2r( mm3, mm1 );	/* words 0,1 -> signed dwords */
	punpckhwd_r2r( mm3, mm2 );	/* words 2,3 -> signed dwords */
	paddd_r2r( mm1, mm2 );		/* two partial sums */
	movq_r2r( mm2, mm3);
	psrlq_i2r( 32, mm2);		/* bring the upper partial sum down */
	paddd_r2r( mm2, mm3);		/* low dword = total */
	movd_r2m( mm3, *res );
}