int main(int ac, char **av) { int i, j, k, n; unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 }; long long *datp = (long long *)&dat0; int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 }; volatile uint8_t *rfp = dat0; volatile int16_t *bp = dat1; unsigned char ans1[8], ans2[8]; n = 0; for( i=-32768; i<32768; ++i ) { j = 0; while( j < 256 ) { for( k=0; k<8; ++k ) { dat0[k] = i; dat1[k] = j++; } movq_m2r(m_(&rfp[0]),mm1); /* rfp[0..7] */ pxor_r2r(mm3,mm3); pxor_r2r(mm4,mm4); movq_m2r(m_(&bp[0]),mm5); /* bp[0..3] */ movq_r2r(mm1,mm2); movq_m2r(m_(&bp[4]),mm6); /* bp[4..7] */ punpcklbw_r2r(mm3,mm1); /* rfp[0,2,4,6] */ punpckhbw_r2r(mm3,mm2); /* rfp[1,3,5,7] */ paddsw_r2r(mm5,mm1); /* bp[0..3] */ paddsw_r2r(mm6,mm2); /* bp[4..7] */ pcmpgtw_r2r(mm1,mm3); pcmpgtw_r2r(mm2,mm4); pandn_r2r(mm1,mm3); pandn_r2r(mm2,mm4); packuswb_r2r(mm4,mm3); movq_r2m(mm3,m_(&ans1[0])); emms(); ans2[0] = clip(bp[0] + rfp[0]); ans2[1] = clip(bp[1] + rfp[1]); ans2[2] = clip(bp[2] + rfp[2]); ans2[3] = clip(bp[3] + rfp[3]); ans2[4] = clip(bp[4] + rfp[4]); ans2[5] = clip(bp[5] + rfp[5]); ans2[6] = clip(bp[6] + rfp[6]); ans2[7] = clip(bp[7] + rfp[7]); if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] ) { printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", i, ans1[0], ans1[1], ans1[2], ans1[3], ans1[4], ans1[5], ans1[6], ans1[7]); printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", j, ans2[0], ans2[1], ans2[2], ans2[3], ans2[4], ans2[5], ans2[6], ans2[7]); // exit(0); } n += 8; } } printf("n=%d\n",n); return 0; }
/*
 * Horizontally scale one scanline of single-channel 8-bit data with a
 * 4-tap (bicubic) filter, without clipping the result.  Factors are
 * signed 32-bit fixed point; the psrld by 14 implies 14 fractional
 * bits -- TODO confirm against the table generation code.
 */
static void scale_uint8_x_1_x_bicubic_noclip_mmx(gavl_video_scale_context_t * ctx,
                                                 int scanline,
                                                 uint8_t * dest_start)
{
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  mmx_t tmp_mm;
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_noclip_mmx\n");
  src_start = ctx->src + scanline * ctx->src_stride;
  pxor_r2r(mm6, mm6);                       /* mm6 = 0, used for unpacking */
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;
    /* Load pixels */
    movd_m2r(*(src), mm0);                  /* 4 source bytes */
    punpcklbw_r2r(mm6, mm0);                /* widen to 4 words */
    /* Load factors */
    movq_m2r(*factors, mm2);
    movq_m2r(*(factors+2), mm3);
    packssdw_r2r(mm3, mm2);                 /* 4 x int32 factors -> 4 words */
    /* Multiply */
    pmaddwd_r2r(mm2, mm0);                  /* two partial dot products */
    psrld_i2r(14, mm0);                     /* drop fractional bits */
    MOVQ_R2M(mm0, tmp_mm);
    *(dst++) = tmp_mm.d[0] + tmp_mm.d[1];   /* final horizontal sum */
    }
  ctx->need_emms = 1;                       /* caller must issue emms later */
}
/* For a 16*h block, this computes
   (((((*pf + *pf2 + 1)>>1) + ((*pb + *pb2 + 1)>>1) + 1)>>1) + *p2 + 1)>>1 */
/*
 * Sum of absolute differences between a bidirectionally interpolated
 * prediction (built from pf/pf2 and pb/pb2 via MMX-ext pavgb) and the
 * 16-wide block p2.  lx is the line stride, h the block height in rows.
 * Returns the accumulated SAD; leaves the FPU clean (emms).
 */
static int bsad_0quad_mmxe(uint8_t *pf,uint8_t *pf2,uint8_t *pb,uint8_t *pb2,uint8_t *p2,int lx,int h)
{
	int32_t s=0;
	pxor_r2r(mm7, mm7);             /* mm7 accumulates the SAD */
	do {
		movq_m2r(pf2[0],mm0);
		movq_m2r(pf2[8],mm2);
		movq_m2r(pb2[0],mm1);
		movq_m2r(pb2[8],mm3);
		pavgb_m2r(pf[0],mm0);   /* (pf + pf2 + 1)>>1, left 8 bytes  */
		pavgb_m2r(pf[8],mm2);   /* (pf + pf2 + 1)>>1, right 8 bytes */
		pavgb_m2r(pb[0],mm1);   /* (pb + pb2 + 1)>>1                */
		pavgb_m2r(pb[8],mm3);
		pavgb_r2r(mm1,mm0);     /* average forward and backward predictions */
		pavgb_r2r(mm3,mm2);
		psadbw_m2r(p2[0],mm0);  /* sum |pred - p2| over 8 bytes */
		psadbw_m2r(p2[8],mm2);
		paddd_r2r(mm0,mm7);
		paddd_r2r(mm2,mm7);
		pf+=lx; pf2+=lx; pb+=lx; pb2+=lx; p2+=lx;
		h--;
	} while (h);
	movd_r2g(mm7,s);
	emms();
	return s;
}
/*
 * 8-wide motion compensation: average the four pixels around a
 * half-pel position (ref, ref+1, ref+stride, ref+stride+1), then
 * average that into *dest.  pavg rounds up, so averaging twice can be
 * off by one; mm7 tracks, via the xor/or chain, the lanes where both
 * stages rounded, and mask_one subtracts the excess (standard mpeg2dec
 * correction).  'cpu' is presumably consumed inside the pavg_* macros
 * to pick pavgb vs. pavgusb -- confirm against their definitions.
 */
static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	movq_m2r (*(ref+stride+1), mm1);
	movq_r2r (mm0, mm7);
	movq_m2r (*(ref+1), mm2);
	pxor_r2r (mm1, mm7);            /* difference bits of diagonal pair */
	movq_m2r (*(ref+stride), mm3);
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);            /* difference bits of other pair */
	pavg_r2r (mm1, mm0);            /* avg(ref, ref+stride+1) */
	pavg_r2r (mm3, mm2);            /* avg(ref+1, ref+stride) */
	por_r2r (mm6, mm7);
	movq_r2r (mm0, mm6);
	pxor_r2r (mm2, mm6);
	pand_r2r (mm6, mm7);
	pand_m2r (mask_one, mm7);       /* lanes needing the -1 correction */
	pavg_r2r (mm2, mm0);            /* combine the two averages */
	psubusb_r2r (mm7, mm0);         /* apply rounding correction */
	movq_m2r (*dest, mm1);
	pavg_r2r (mm1, mm0);            /* finally average with dest */
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}
/*
 * *dest = (*src1 + *src2 + 1)/ 2; rounded byte-wise average of two
 * 8-byte blocks via widen-to-words arithmetic.
 * NOTE(review): assumes mm0 is already zero (set by a caller-side
 * mmx_zero_reg) and that round1 is a vector of word 1s -- confirm
 * against their definitions.
 */
static inline void mmx_average_2_U8 (uint8_t * dest, uint8_t * src1,
                                     uint8_t * src2)
{
    /* *dest = (*src1 + *src2 + 1)/ 2; */

    movq_m2r (*src1, mm1);        // load 8 src1 bytes
    movq_r2r (mm1, mm2);          // copy 8 src1 bytes

    movq_m2r (*src2, mm3);        // load 8 src2 bytes
    movq_r2r (mm3, mm4);          // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm1);     // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);     // unpack high src1 bytes

    punpcklbw_r2r (mm0, mm3);     // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);     // unpack high src2 bytes

    paddw_r2r (mm3, mm1);         // add lows to mm1
    paddw_m2r (round1, mm1);      // +1 for round-to-nearest
    psraw_i2r (1, mm1);           // /2

    paddw_r2r (mm4, mm2);         // add highs to mm2
    paddw_m2r (round1, mm2);
    psraw_i2r (1, mm2);           // /2

    packuswb_r2r (mm2, mm1);      // pack (w/ saturation)
    movq_r2m (mm1, *dest);        // store result in dest
}
/*
 * *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2;
 * Uses the identity (a+b+1)/2 = (a|b) - ((a^b)>>1), applied twice.
 * mask1 presumably clears the low bit of each byte so the 64-bit
 * psrlq cannot shift bits across byte boundaries -- confirm against
 * its definition.
 */
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);   /* load 8 dest bytes */
    movq_r2r (mm1, mm2);     /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);   /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);     /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);   /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);     /* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);     /* xor src1 and src2 */
    pand_m2r (mask1, mm5);   /* mask lower bits */
    psrlq_i2r (1, mm5);      /* /2 */
    por_r2r (mm4, mm6);      /* or src1 and src2 */
    psubb_r2r (mm5, mm6);    /* subtract subresults: rounded srcavg */
    movq_r2r (mm6, mm5);     /* copy subresult */

    pxor_r2r (mm1, mm5);     /* xor srcavg and dest */
    pand_m2r (mask1, mm5);   /* mask lower bits */
    psrlq_i2r (1, mm5);      /* /2 */
    por_r2r (mm2, mm6);      /* or srcavg and dest */
    psubb_r2r (mm5, mm6);    /* subtract subresults: final average */
    movq_r2m (mm6, *dest);   /* store result in dest */
}
/*
 * Second half of the vertical-kernel pair (register state is set up by
 * mmx_start and flows in through mm0..mm7).  Finishes the filtered
 * value (4*(src2+src4) + 2*src3 - src1 - src5) >> 3 with unsigned
 * saturation on the subtracts, computes src3 - src2 word differences,
 * compares them against the mm_lthr/mm_hthr thresholds, and writes
 * either the filtered bytes or the original src3 bytes to dst[X].
 * mm_cpool[0] is presumably zero -- confirm against its definition.
 */
static inline void mmx_end(uint8_t *src3, uint8_t *src5,
                           uint8_t *dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);   /* unpack src3 bytes (from mmx_start) */
    punpckhbw_m2r (mm_cpool[0], mm5);
    psubusw_r2r (mm2, mm0);             /* subtract src1 words */
    psubusw_r2r (mm3, mm1);
    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);             /* subtract src5 words */
    psubusw_r2r (mm3, mm1);
    psrlw_i2r (3, mm0);                 /* /8: filtered result */
    psrlw_i2r (3, mm1);
    psubw_r2r (mm6, mm4);               /* src3 - src2 (mm6/mm7 from mmx_start) */
    psubw_r2r (mm7, mm5);
    packuswb_r2r (mm1,mm0);             /* filtered bytes */
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    pcmpgtw_m2r (mm_lthr, mm4);         /* diff > low threshold ? */
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);         /* diff > high threshold ? */
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);                /* in-band mask: between thresholds */
    movq_r2r (mm4, mm5);
    pandn_r2r (mm0, mm4);               /* take filtered where out of band */
    pand_m2r (src3[X], mm5);            /* keep original src3 where in band */
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
/*
 * First half of the vertical-kernel pair (mmx_end consumes the
 * register state).  On exit: mm0/mm1 hold 4*(src2+src4) + 2*src3 as
 * low/high words, mm4/mm5 hold the raw src3 bytes, mm6/mm7 hold the
 * unpacked src2 words, and mm2/mm3 hold the unpacked src1 words.
 * mm_cpool[0] is presumably zero -- confirm against its definition.
 */
static inline void mmx_start(uint8_t *src1, uint8_t *src2,
                             uint8_t *src3, uint8_t *src4, int X)
{
    movq_m2r (src2[X], mm0);
    movq_m2r (src2[X], mm1);
    movq_m2r (src4[X], mm2);
    movq_m2r (src4[X], mm3);
    movq_m2r (src3[X], mm4);            /* raw src3 bytes for mmx_end */
    movq_m2r (src3[X], mm5);
    punpcklbw_m2r (mm_cpool[0], mm0);   /* widen src2 to words */
    punpckhbw_m2r (mm_cpool[0], mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);   /* widen src4 to words */
    punpckhbw_m2r (mm_cpool[0], mm3);
    movq_r2r (mm0, mm6);                /* keep src2 words for mmx_end */
    movq_r2r (mm1, mm7);
    paddw_r2r (mm2, mm0);               /* src2 + src4 */
    paddw_r2r (mm3, mm1);
    movq_m2r (src3[X], mm2);
    movq_m2r (src3[X], mm3);
    psllw_i2r (2, mm0);                 /* 4 * (src2 + src4) */
    psllw_i2r (2, mm1);
    punpcklbw_m2r (mm_cpool[0], mm2);   /* widen src3 to words */
    punpckhbw_m2r (mm_cpool[0], mm3);
    psllw_i2r (1, mm2);                 /* 2 * src3 */
    psllw_i2r (1, mm3);
    paddw_r2r (mm2, mm0);               /* 4*(src2+src4) + 2*src3 */
    paddw_r2r (mm3, mm1);
    movq_m2r (src1[X], mm2);
    movq_m2r (src1[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);   /* widen src1 for mmx_end */
    punpckhbw_m2r (mm_cpool[0], mm3);
}
/*
 * Horizontally scale one scanline of 4-channel 16-bit data with a
 * generic n-tap filter (factors_per_pixel taps).  Pixels are shifted
 * to 15-bit precision for pmulhw, accumulated, clamped via
 * pminsw/pmaxsw and shifted back to 16 bits.
 */
static void scale_uint16_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx,
                                           int scanline,
                                           uint8_t * dest_start)
{
  int i, j;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input
 *  mm1: factor_mask
 *  mm2: Factor
 *  mm3: Output
 *  mm4:
 *  mm5:
 *  mm6: 0
 *  mm7: scratch
 *
 */

  src_start = ctx->src + scanline * ctx->src_stride;

  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 8*ctx->table_h.pixels[i].index;  /* 8 bytes per pixel */
    factors = ctx->table_h.pixels[i].factor_i;
    pxor_r2r(mm3, mm3);                 /* clear accumulator */
    for(j = 0; j < ctx->table_h.factors_per_pixel; j++)
      {
      /* Load pixels */
      movq_m2r(*(src), mm0);
      psrlw_i2r(1, mm0);                /* 16 -> 15 bit for signed pmulhw */
      /* Load factors */
      LOAD_FACTOR_1_4;
      /* Multiply */
      pmulhw_r2r(mm7, mm0);
      paddw_r2r(mm0, mm3);
      //      DUMP_MM("mm3_2", mm3);
      src += 8;
      factors++;
      }
    pminsw_m2r(max_13, mm3);            /* clamp to valid 13-bit range */
    pmaxsw_m2r(min_13, mm3);
    psllw_i2r(3, mm3);                  /* back to 16 bit */
    MOVQ_R2M(mm3, *dst);
    dst+=8;
    }
  ctx->need_emms = 1;                   /* caller must issue emms later */
}
/* Copy a 16-byte-wide block of 'height' rows from ref to dest
 * (full-pel motion compensation, no interpolation). */
static inline void MC_put1_16 (int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    do {
	movq_m2r (*ref, mm0);           /* left 8 bytes */
	movq_m2r (*(ref+8), mm1);       /* right 8 bytes */
	ref += stride;
	movq_r2m (mm0, *dest);
	movq_r2m (mm1, *(dest+8));
	dest += stride;
    } while (--height);
}
/*
 * *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; rounded byte-wise
 * average of four 8-byte blocks via widen-to-words arithmetic.
 * NOTE(review): assumes mm0 is already zero (set by a caller-side
 * mmx_zero_reg) and round4 is a vector of word 2s -- confirm.
 */
static inline void mmx_average_4_U8 (uint8_t * dest, const uint8_t * src1,
                                     const uint8_t * src2,
                                     const uint8_t * src3,
                                     const uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);       /* add lows */
    paddw_r2r (mm6, mm2);       /* add highs */

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);    /* +2 for round-to-nearest */
    psraw_i2r (2, mm1);         /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         /* /4 */

    packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
    movq_r2m (mm1, *dest);      /* store result in dest */
}
/*
 * *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; rounded byte-wise
 * average of four 8-byte blocks (non-const-qualified variant).
 * NOTE(review): assumes mm0 is already zero and round4 is a vector of
 * word 2s -- confirm against their definitions.
 */
static inline void mmx_average_4_U8 (uint8_t * dest, uint8_t * src1,
                                     uint8_t * src2, uint8_t * src3,
                                     uint8_t * src4)
{
    /* *dest = (*src1 + *src2 + *src3 + *src4 + 2)/ 4; */

    movq_m2r (*src1, mm1);      // load 8 src1 bytes
    movq_r2r (mm1, mm2);        // copy 8 src1 bytes

    punpcklbw_r2r (mm0, mm1);   // unpack low src1 bytes
    punpckhbw_r2r (mm0, mm2);   // unpack high src1 bytes

    movq_m2r (*src2, mm3);      // load 8 src2 bytes
    movq_r2r (mm3, mm4);        // copy 8 src2 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src2 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src2 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      // load 8 src3 bytes
    movq_r2r (mm3, mm4);        // copy 8 src3 bytes

    punpcklbw_r2r (mm0, mm3);   // unpack low src3 bytes
    punpckhbw_r2r (mm0, mm4);   // unpack high src3 bytes

    paddw_r2r (mm3, mm1);       // add lows
    paddw_r2r (mm4, mm2);       // add highs

    movq_m2r (*src4, mm5);      // load 8 src4 bytes
    movq_r2r (mm5, mm6);        // copy 8 src4 bytes

    punpcklbw_r2r (mm0, mm5);   // unpack low src4 bytes
    punpckhbw_r2r (mm0, mm6);   // unpack high src4 bytes

    paddw_r2r (mm5, mm1);       // add lows
    paddw_r2r (mm6, mm2);       // add highs

    /* now have subtotal in mm1 and mm2 */

    paddw_m2r (round4, mm1);    // +2 for round-to-nearest
    psraw_i2r (2, mm1);         // /4
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         // /4

    packuswb_r2r (mm2, mm1);    // pack (w/ saturation)
    movq_r2m (mm1, *dest);      // store result in dest
}
/*
 * 16-wide half-pel motion compensation: dest = avg(ref, ref+offset)
 * per byte, for 'height' rows.  'offset' is typically 1 (horizontal
 * half-pel) or stride (vertical half-pel).  'cpu' is presumably
 * consumed inside the pavg_m2r macro -- confirm against its definition.
 */
static inline void MC_put2_16 (int height, uint8_t * dest, uint8_t * ref,
                               int stride, int offset, int cpu)
{
    do {
	movq_m2r (*ref, mm0);               /* left 8 bytes */
	movq_m2r (*(ref+8), mm1);           /* right 8 bytes */
	pavg_m2r (*(ref+offset), mm0);      /* rounded average */
	pavg_m2r (*(ref+offset+8), mm1);
	movq_r2m (mm0, *dest);
	ref += stride;
	movq_r2m (mm1, *(dest+8));
	dest += stride;
    } while (--height);
}
/*
 * Produce a missing scanline by averaging the scanlines above (t0) and
 * below (b0), 16 pixels (32 bytes) at a time with pavgb and
 * non-temporal stores, then 4 pixels (8 bytes) at a time, then
 * byte-wise for the remainder.  'width' is in pixels at 2 bytes each.
 *
 * Fix vs. original: the final remainder mask was 'width & 0x7', but
 * the preceding loop consumes groups of 4 pixels, so only width % 4
 * pixels remain; 0x7 could process up to 4 pixels (8 bytes) past the
 * end of the scanline.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  gint i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  /* 16 pixels (32 bytes) per iteration */
  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  /* 4 pixels (8 bytes) per iteration */
  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  /* was 'width & 0x7' -- see function comment */
  width = width & 0x3;

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }

  emms ();
}
/*
 * Convert l bytes (l assumed to be a multiple of 8 -- TODO confirm
 * callers guarantee this) from unsigned char to float.  Bytes are
 * widened to dwords via two punpck stages, converted two at a time
 * with cvtpi2ps, and stored as float pairs with movlps.
 */
static void frame_i2f_sse(u_char *src,float *dst,int l)
{
	int i;

	pxor_r2r(mm7,mm7);                  /* mm7 = 0, used for unpacking */
	for( i=0; i<l; i+=8 ) {
		movq_m2r(*src,mm0);         /* 8 source bytes */
		movq_r2r(mm0, mm2);
		punpcklbw_r2r(mm7, mm0);    /* bytes 0..3 -> words */
		punpckhbw_r2r(mm7, mm2);    /* bytes 4..7 -> words */
		movq_r2r(mm0, mm1);
		movq_r2r(mm2, mm3);
		punpcklwd_r2r(mm7, mm0);    /* words -> dwords */
		punpckhwd_r2r(mm7, mm1);
		punpcklwd_r2r(mm7, mm2);
		punpckhwd_r2r(mm7, mm3);
		cvtpi2ps_r2r(mm0,xmm0);     /* 2 ints -> 2 floats each */
		cvtpi2ps_r2r(mm1,xmm1);
		cvtpi2ps_r2r(mm2,xmm2);
		cvtpi2ps_r2r(mm3,xmm3);
		movlps_r2m(xmm0,dst[0]);
		movlps_r2m(xmm1,dst[2]);
		movlps_r2m(xmm2,dst[4]);
		movlps_r2m(xmm3,dst[6]);
		src+=8;
		dst+=8;
	}
	emms();
}
/*
 * Produce a missing scanline by averaging the lines above (top) and
 * below (bot): 32 bytes per iteration with pavgb and non-temporal
 * stores, then 8 bytes per iteration, then byte-wise for the
 * remainder.  'size' is in bytes.
 *
 * Fix vs. original: the final remainder mask was 'size & 0xf', but the
 * preceding loop consumes groups of 8 bytes, so only size % 8 bytes
 * remain; 0xf could process up to 8 bytes past the end of the line.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  gint i;

  /* 32 bytes per iteration */
  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  /* 8 bytes per iteration */
  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  /* was 'size & 0xf' -- see function comment */
  size = size & 0x7;
  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top, uint8_t *bot, int width ) { int i; for( i = width/16; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); movq_m2r( *(bot + 8), mm2 ); movq_m2r( *(top + 8), mm3 ); movq_m2r( *(bot + 16), mm4 ); movq_m2r( *(top + 16), mm5 ); movq_m2r( *(bot + 24), mm6 ); movq_m2r( *(top + 24), mm7 ); pavgb_r2r( mm1, mm0 ); pavgb_r2r( mm3, mm2 ); pavgb_r2r( mm5, mm4 ); pavgb_r2r( mm7, mm6 ); movntq_r2m( mm0, *output ); movntq_r2m( mm2, *(output + 8) ); movntq_r2m( mm4, *(output + 16) ); movntq_r2m( mm6, *(output + 24) ); output += 32; top += 32; bot += 32; } width = (width & 0xf); for( i = width/4; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); pavgb_r2r( mm1, mm0 ); movntq_r2m( mm0, *output ); output += 8; top += 8; bot += 8; } width = width & 0x7; /* Handle last few pixels. */ for( i = width * 2; i; --i ) { *output++ = ((*top++) + (*bot++)) >> 1; } sfence(); emms(); }
/*
 * Horizontally scale one scanline of 4-channel 8-bit data with a
 * 2-tap (bilinear) filter: out = s2 + f*(s1 - s2), computed in 14-bit
 * fixed point with pmulhw and packed back to bytes with saturation.
 */
static void scale_uint8_x_4_x_bilinear_mmx(gavl_video_scale_context_t * ctx,
                                           int scanline,
                                           uint8_t * dest_start)
{
  int i;
  uint8_t * src, * dst, *src_start;
  int32_t * factors;
  //  mmx_t tmp_mm;

/*
 *  mm0: Input1
 *  mm1: Factor mask
 *  mm2:
 *  mm3: Output
 *  mm4:
 *  mm5: Input2
 *  mm6: 0
 *  mm7: Factor
 *
 */

  //  fprintf(stderr, "scale_uint8_x_4_x_bilinear_mmx\n");
  src_start = ctx->src + scanline * ctx->src_stride;

  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);
  dst = dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 4*ctx->table_h.pixels[i].index;  /* 4 bytes per pixel */
    factors = ctx->table_h.pixels[i].factor_i;
    /* Load pixels */
    movd_m2r(*(src), mm0);
    punpcklbw_r2r(mm6, mm0);
    psllw_i2r(6, mm0);                  /* 14 bit */
    /* Load pixels */
    movd_m2r(*(src+4), mm5);
    punpcklbw_r2r(mm6, mm5);
    psllw_i2r(6, mm5);                  /* 14 bit */
    /* Load factors */
    LOAD_FACTOR_1_4_NOCLIP;             /* 14 bit */
    /* Subtract */
    psubsw_r2r(mm5, mm0);               /* s1(mm0) - s2(mm5) -> mm0 (14 bit) */
    pmulhw_r2r(mm7, mm0);               /* factor * (s2 - s1) -> mm0 (12 bit) */
    psllw_i2r(2, mm0);                  /* (14 bit) */
    paddsw_r2r(mm5, mm0);               /* (15 bit) */
    psraw_i2r(6, mm0);                  /* (8 bit) */
    packuswb_r2r(mm6, mm0);             /* back to bytes, saturated */
    movd_r2m(mm0, *dst);
    dst+=4;
    }
  ctx->need_emms = 1;                   /* caller must issue emms later */
}
/* Copy an 8-byte-wide block of 'height' rows from ref to dest
 * (full-pel motion compensation, no interpolation). */
static inline void MC_put1_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride)
{
    do {
	movq_m2r (*ref, mm0);
	movq_r2m (mm0, *dest);
	ref += stride;
	dest += stride;
    } while (--height);
}
/*
 * Load the first 2 (h == 2) or 4 rows of an 8-byte-wide block into
 * mm0..mm3 for subsequent processing by the caller.
 */
static __inline__ void load_blk(uint8_t *blk,uint32_t rowstride,int h)
{
	// Required to get GCC 4.0 to use the right registers as the source argument to
	// movq
	uint8_t *blk2 = blk + rowstride * 2;    /* points at row 2 of the block */
	movq_m2r( *blk, mm0);                   /* row 0 */
	blk += rowstride;
	movq_m2r( *blk, mm1);                   /* row 1 */
	if( h == 2 )
		return;
	movq_m2r( *blk2, mm2);                  /* row 2 */
	blk2 += rowstride;
	movq_m2r( *blk2, mm3);                  /* row 3 */
}
/*
 * For each of the 8 bytes of refpix, scan a radius_count x radius_count
 * neighbourhood of pixel[] (row stride row_stride): count the
 * neighbours whose absolute difference from the reference byte is
 * below threshold, and accumulate their signed differences.
 * Outputs: count[0..7] and diff[0..7].  Both accumulate byte-wise and
 * can wrap -- presumably radius_count is kept small enough; confirm
 * at the call sites.  Inputs are biased by 128 so signed saturating
 * arithmetic works on unsigned pixel data.
 */
static inline void mean8(unsigned char *refpix,unsigned char *pixel,int radius_count,int row_stride,int threshold,int8_t *diff,unsigned char *count)
{
	int a,b;

	pxor_r2r(mm6,mm6); // mm6 (aka count) = 0
	pxor_r2r(mm7,mm7); // mm7 (aka diff) = 0
	movq_m2r(*refpix,mm3); // mm3 = refpix[0]
	movd_g2r(0x80808080,mm4); // mm4 = 128
	punpcklbw_r2r(mm4,mm4);
	pxor_r2r(mm4,mm3); // mm3 = refpix[0]-128

	movd_g2r(threshold,mm5); // mm5 = threshold
	punpcklbw_r2r(mm5,mm5); // broadcast the low threshold byte to all 8 lanes
	punpcklbw_r2r(mm5,mm5);
	punpcklbw_r2r(mm5,mm5);

	for( b=0; b<radius_count; b++ ) {
		for( a=0; a<radius_count; a++ ) {
			movq_m2r(*pixel,mm0); // mm0 = pixel[0]
			pxor_r2r(mm4,mm0); // mm0 = pixel[0]-128
			movq_r2r(mm3,mm2); // mm2 = refpix[0]-128
			psubsb_r2r(mm0,mm2); // mm2 = refpix[0]-pixel[0]
			psubsb_r2r(mm3,mm0); // mm0 = pixel[0]-refpix[0]
			// one of mm0/mm2 saturates at 0 from below, so the
			// unsigned minimum is the absolute difference
			pminub_r2r(mm0,mm2); // mm2 = abs(pixel[0]-refpix[0])
			movq_r2r(mm5,mm1); // mm1 = threshold
			pcmpgtb_r2r(mm2,mm1); // mm1 = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
			psubb_r2r(mm1,mm6); // mm6 += (threshold > abs(pixel[0]-refpix[0]))
			pand_r2r(mm1,mm0); // mm0 = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
			paddb_r2r(mm0,mm7); // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
			++pixel;
		}
		pixel += row_stride - radius_count; // advance to next neighbourhood row
	}

	movq_r2m(mm6,*count);
	movq_r2m(mm7,*diff);
	emms();
}
/*
 * Copy an 8- or 16-byte-wide block of 'height' rows from ref to dest.
 * 'width' is expected to be a compile-time constant (8 or 16) so the
 * inner branch folds away.  mmx_zero_reg() presumably zeroes mm0 for
 * companion routines -- it is not needed by the copies here; confirm
 * against its definition.
 */
static inline void MC_put_mmx (const int width, int height, uint8_t * dest,
                               const uint8_t * ref, const int stride)
{
    mmx_zero_reg ();

    do {
	movq_m2r (* ref, mm1);          /* load 8 ref bytes */
	movq_r2m (mm1,* dest);          /* store 8 bytes at curr */

	if (width == 16)
	    {
	    movq_m2r (* (ref+8), mm1);          /* load 8 ref bytes */
	    movq_r2m (mm1,* (dest+8));          /* store 8 bytes at curr */
	    }

	dest += stride;
	ref += stride;
    } while (--height);
}
/*
 * 8-wide averaging motion compensation: dest = avg(ref, dest) per
 * byte, for 'height' rows (used for B-frame prediction averaging).
 * 'cpu' is presumably consumed inside the pavg_m2r macro -- confirm.
 */
static inline void MC_avg1_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	pavg_m2r (*dest, mm0);          /* rounded average with dest */
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}
/*
 * 8-wide half-pel averaging motion compensation:
 * dest = avg(avg(ref, ref+offset), dest) per byte, for 'height' rows.
 * 'offset' is typically 1 or stride.  'cpu' is presumably consumed
 * inside the pavg_m2r macro -- confirm against its definition.
 */
static inline void MC_avg2_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int offset, int cpu)
{
    do {
	movq_m2r (*ref, mm0);
	pavg_m2r (*(ref+offset), mm0);  /* half-pel interpolation */
	pavg_m2r (*dest, mm0);          /* average with prediction in dest */
	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;
    } while (--height);
}
/*
 * Deinterlace an 8x8 block field-wise: copy each even source line to
 * the output, and synthesize the following output line as the pavgb
 * average of that line and the one two source lines below.
 * NOTE(review): the last iteration reads src[2*i_src] of row 6, i.e.
 * source row 8 -- one line past the 8x8 block; presumably valid
 * within the surrounding frame, confirm at the call sites.
 */
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int y;

    /* Interlaced */
    for( y = 0; y < 8; y += 2 )
    {
        movq_m2r( src[0], mm0 );
        movq_r2m( mm0, dst[0] );        /* copy the field line */
        dst += i_dst;

        movq_m2r( src[2*i_src], mm1 );
        pavg_r2r( mm1, mm0 );           /* interpolate the missing line */
        movq_r2m( mm0, dst[0] );

        dst += 1*i_dst;
        src += 2*i_src;
    }
}
/*
 * 8-wide motion compensation at a quarter-sample position: average the
 * four pixels (ref, ref+1) on the current and next row.  pavg rounds
 * up, so averaging twice can be off by one; mm7 tracks the lanes where
 * both stages rounded and mask_one subtracts the excess (standard
 * mpeg2dec correction).  The loop carries the previous row's average
 * in mm0 and its xor bits in mm7, so each row is loaded only once.
 * 'cpu' is presumably consumed inside the pavg_* macros -- confirm.
 */
static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);                /* difference bits of first pair */
    pavg_r2r (mm1, mm0);                /* avg of first row pair */
    ref += stride;

    do {
	movq_m2r (*ref, mm2);
	movq_r2r (mm0, mm5);

	movq_m2r (*(ref+1), mm3);
	movq_r2r (mm2, mm6);

	pxor_r2r (mm3, mm6);            /* difference bits of this pair */
	pavg_r2r (mm3, mm2);            /* avg of this row pair */

	por_r2r (mm6, mm7);
	pxor_r2r (mm2, mm5);

	pand_r2r (mm5, mm7);
	pavg_r2r (mm2, mm0);            /* combine with previous row avg */

	pand_m2r (mask_one, mm7);       /* lanes needing -1 correction */

	psubusb_r2r (mm7, mm0);         /* apply rounding correction */

	ref += stride;
	movq_r2m (mm0, *dest);
	dest += stride;

	movq_r2r (mm6, mm7);    // unroll !
	movq_r2r (mm2, mm0);    // unroll !
    } while (--height);
}
/*
 * memcpy replacement using MMX loads and non-temporal (movntq) stores:
 * 64 bytes per iteration, then 8 bytes, then a scalar tail.  The
 * non-temporal stores bypass the cache, so sfence() is required before
 * returning.  NOTE(review): regions must not overlap (memcpy
 * semantics); the dest != src test only skips the fully-identical case.
 */
static void fast_memcpy_mmxext( void *d, const void *s, size_t n )
{
    const uint8_t *src = s;
    uint8_t *dest = d;

    if( dest != src ) {
        while( n > 64 ) {
            movq_m2r( src[ 0 ], mm0 );
            movq_m2r( src[ 8 ], mm1 );
            movq_m2r( src[ 16 ], mm2 );
            movq_m2r( src[ 24 ], mm3 );
            movq_m2r( src[ 32 ], mm4 );
            movq_m2r( src[ 40 ], mm5 );
            movq_m2r( src[ 48 ], mm6 );
            movq_m2r( src[ 56 ], mm7 );
            movntq_r2m( mm0, dest[ 0 ] );
            movntq_r2m( mm1, dest[ 8 ] );
            movntq_r2m( mm2, dest[ 16 ] );
            movntq_r2m( mm3, dest[ 24 ] );
            movntq_r2m( mm4, dest[ 32 ] );
            movntq_r2m( mm5, dest[ 40 ] );
            movntq_r2m( mm6, dest[ 48 ] );
            movntq_r2m( mm7, dest[ 56 ] );
            dest += 64;
            src += 64;
            n -= 64;
        }

        while( n > 8 ) {
            movq_m2r( src[ 0 ], mm0 );
            movntq_r2m( mm0, dest[ 0 ] );
            dest += 8;
            src += 8;
            n -= 8;
        }

        /* up to 8 remaining bytes go through the scalar helper */
        if( n ) small_memcpy( dest, src, n );

        sfence();   /* order the non-temporal stores */
        emms();
    }
}
/*
 * Horizontally scale one scanline of single-channel 16-bit data with a
 * 4-tap (bicubic) filter.  Pixels are halved to 15 bits for the signed
 * pmaddwd, the two partial dot products are summed in scalar code,
 * shifted down by 13 and clipped via RECLIP before storing.
 */
static void scale_uint16_x_1_x_bicubic_mmx(gavl_video_scale_context_t * ctx,
                                           int scanline,
                                           uint8_t * dest_start)
{
  int i;
  uint16_t * dst;
  uint8_t * src, *src_start;
  int32_t * factors;
  mmx_t tmp_mm;
  int32_t tmp;
  //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_mmx\n");
  src_start = ctx->src + scanline * ctx->src_stride;
  pxor_r2r(mm6, mm6);
  dst = (uint16_t*)dest_start;
  for(i = 0; i < ctx->dst_size; i++)
    {
    src = src_start + 2*ctx->table_h.pixels[i].index;  /* 2 bytes per sample */
    factors = ctx->table_h.pixels[i].factor_i;
    /* Load pixels */
    movq_m2r(*(src), mm0);              /* 4 source words */
    psrlw_i2r(1, mm0);                  /* 16 -> 15 bit for signed multiply */
    //    DUMP_MM("mm0", mm0);
    /* Load factors */
    movq_m2r(*factors, mm2);
    movq_m2r(*(factors+2), mm3);
    packssdw_r2r(mm3, mm2);             /* 4 x int32 factors -> 4 words */
    /* Multiply */
    pmaddwd_r2r(mm2, mm0);              /* two partial dot products */
    MOVQ_R2M(mm0, tmp_mm);
    tmp = tmp_mm.d[0] + tmp_mm.d[1];    /* final horizontal sum */
    tmp >>= 13;                         /* drop fixed-point fraction */
    RECLIP(tmp, ctx->plane);            /* clip to the plane's valid range */
    *(dst++) = tmp;
    }
  ctx->need_emms = 1;                   /* caller must issue emms later */
}
/*
 * Vertical 5-tap deinterlacing filter, 4 pixels at a time:
 * dst = (-lum_m4 + 4*lum_m3 + 2*lum_m2 + 4*lum_m1 - lum + 4) >> 3,
 * with the subtraction done by psubusw so negative results clamp to 0.
 * Remaining (size % 4) pixels are handled by the scalar
 * deinterlace_line_c fallback.
 */
static void
deinterlace_line_mmx (uint8_t * dst,
    uint8_t * lum_m4, uint8_t * lum_m3,
    uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum, int size)
{
  mmx_t rounder;                /* word-wise +4 for round-to-nearest */

  rounder.uw[0] = 4;
  rounder.uw[1] = 4;
  rounder.uw[2] = 4;
  rounder.uw[3] = 4;
  pxor_r2r (mm7, mm7);          /* mm7 = 0, used for unpacking */
  movq_m2r (rounder, mm6);

  for (; size > 3; size -= 4) {
    movd_m2r (*lum_m4, mm0);
    movd_m2r (*lum_m3, mm1);
    movd_m2r (*lum_m2, mm2);
    movd_m2r (*lum_m1, mm3);
    movd_m2r (*lum, mm4);
    punpcklbw_r2r (mm7, mm0);
    punpcklbw_r2r (mm7, mm1);
    punpcklbw_r2r (mm7, mm2);
    punpcklbw_r2r (mm7, mm3);
    punpcklbw_r2r (mm7, mm4);
    paddw_r2r (mm3, mm1);       /* lum_m3 + lum_m1 */
    psllw_i2r (1, mm2);         /* 2 * lum_m2 */
    paddw_r2r (mm4, mm0);       /* lum_m4 + lum (the negative taps) */
    psllw_i2r (2, mm1);         // 2    /* 4 * (lum_m3 + lum_m1) */
    paddw_r2r (mm6, mm2);       /* + rounding constant */
    paddw_r2r (mm2, mm1);
    psubusw_r2r (mm0, mm1);     /* subtract, clamping at 0 */
    psrlw_i2r (3, mm1);         // 3    /* /8 */
    packuswb_r2r (mm7, mm1);
    movd_r2m (mm1, *dst);
    lum_m4 += 4;
    lum_m3 += 4;
    lum_m2 += 4;
    lum_m1 += 4;
    lum += 4;
    dst += 4;
  }
  emms ();

  /* Handle odd widths */
  if (size > 0)
    deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
/*
 * Sign-extend the four signed 16-bit accumulators in *accs to 32 bits,
 * sum them, and store the 32-bit total in *res.  Leaves the MMX state
 * dirty -- the caller is responsible for emms().
 */
static __inline__ void mmx_sum_4_word_accs( mmx_t *accs, int32_t *res )
{
	movq_m2r( *accs, mm1 );
	movq_r2r( mm1, mm3 );
	movq_r2r( mm1, mm2 );
	/* Generate sign extensions for mm1 words! */
	psraw_i2r( 15, mm3 );           /* mm3 = per-word sign masks */
	punpcklwd_r2r( mm3, mm1 );      /* low 2 words -> sign-extended dwords */
	punpckhwd_r2r( mm3, mm2 );      /* high 2 words -> sign-extended dwords */
	paddd_r2r( mm1, mm2 );          /* two pairwise dword sums */
	movq_r2r( mm2, mm3);
	psrlq_i2r( 32, mm2);            /* bring high dword down */
	paddd_r2r( mm2, mm3);           /* total in low dword of mm3 */
	movd_r2m( mm3, *res );
}