static inline void mmx_interp_average_2_U8 (uint8_t * dest, const uint8_t * src1,
                                            const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);      /* load 8 dest bytes */
    movq_r2r (mm1, mm2);        /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);      /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);      /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);        /* xor src1 and src2 */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm4, mm6);         /* or src1 and src2 */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2r (mm6, mm5);        /* copy subresult */

    pxor_r2r (mm1, mm5);        /* xor srcavg and dest */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm2, mm6);         /* or srcavg and dest */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2m (mm6, *dest);      /* store result in dest */
}
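/*
 * Scalar sketch of the rounded-average identity the routine above relies on:
 * (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1), which never overflows 8 bits.
 * Assumption: mask1 is the per-byte constant 0xfefefefefefefefeLL, so the
 * quadword shift behaves like an independent per-byte shift.
 */
static inline uint8_t avg_round_up_u8 (uint8_t a, uint8_t b)
{
    return (uint8_t) ((a | b) - (((a ^ b) & 0xfe) >> 1));
}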
static inline void MC_avg4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
    } while (--height);
}
int main(int ac, char **av)
{
    int i, j, k, n;
    unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 };
    long long *datp = (long long *)&dat0;
    int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 };
    volatile uint8_t *rfp = dat0;
    volatile int16_t *bp = dat1;
    unsigned char ans1[8], ans2[8];

    n = 0;
    for( i=-32768; i<32768; ++i ) {
        j = 0;
        while( j < 256 ) {
            for( k=0; k<8; ++k ) {
                dat0[k] = i;
                dat1[k] = j++;
            }

            movq_m2r(m_(&rfp[0]),mm1);  /* rfp[0..7] */
            pxor_r2r(mm3,mm3);
            pxor_r2r(mm4,mm4);
            movq_m2r(m_(&bp[0]),mm5);   /* bp[0..3] */
            movq_r2r(mm1,mm2);
            movq_m2r(m_(&bp[4]),mm6);   /* bp[4..7] */
            punpcklbw_r2r(mm3,mm1);     /* rfp[0,2,4,6] */
            punpckhbw_r2r(mm3,mm2);     /* rfp[1,3,5,7] */
            paddsw_r2r(mm5,mm1);        /* bp[0..3] */
            paddsw_r2r(mm6,mm2);        /* bp[4..7] */
            pcmpgtw_r2r(mm1,mm3);
            pcmpgtw_r2r(mm2,mm4);
            pandn_r2r(mm1,mm3);
            pandn_r2r(mm2,mm4);
            packuswb_r2r(mm4,mm3);
            movq_r2m(mm3,m_(&ans1[0]));
            emms();

            ans2[0] = clip(bp[0] + rfp[0]);
            ans2[1] = clip(bp[1] + rfp[1]);
            ans2[2] = clip(bp[2] + rfp[2]);
            ans2[3] = clip(bp[3] + rfp[3]);
            ans2[4] = clip(bp[4] + rfp[4]);
            ans2[5] = clip(bp[5] + rfp[5]);
            ans2[6] = clip(bp[6] + rfp[6]);
            ans2[7] = clip(bp[7] + rfp[7]);

            if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] ) {
                printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", i,
                       ans1[0], ans1[1], ans1[2], ans1[3],
                       ans1[4], ans1[5], ans1[6], ans1[7]);
                printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", j,
                       ans2[0], ans2[1], ans2[2], ans2[3],
                       ans2[4], ans2[5], ans2[6], ans2[7]);
                // exit(0);
            }
            n += 8;
        }
    }
    printf("n=%d\n",n);
    return 0;
}
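/*
 * The harness above compares the MMX saturated add against a scalar clip()
 * reference that is not shown in this listing. A minimal sketch, assuming
 * clip() simply clamps an int to the unsigned 8-bit range:
 */
static inline unsigned char clip (int x)
{
    return (unsigned char) (x < 0 ? 0 : (x > 255 ? 255 : x));
}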
static void scale_uint8_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx,
                                          int scanline, uint8_t * dest_start)
{
    int i, j;
    uint8_t * src, * dst, *src_start;
    int32_t * factors;
    //  mmx_t tmp_mm;

    /*
     *  mm0: Input
     *  mm1: factor_mask
     *  mm2: Factor
     *  mm3: Output
     *  mm4:
     *  mm5:
     *  mm6: 0
     *  mm7: scratch
     */

    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    movq_m2r(factor_mask, mm1);
    dst = dest_start;

    for(i = 0; i < ctx->dst_size; i++)
    {
        src = src_start + 4*ctx->table_h.pixels[i].index;
        factors = ctx->table_h.pixels[i].factor_i;
        pxor_r2r(mm3, mm3);

        for(j = 0; j < ctx->table_h.factors_per_pixel; j++)
        {
            /* Load pixels */
            movd_m2r(*(src), mm0);
            punpcklbw_r2r(mm6, mm0);
            psllw_i2r(7, mm0);
            /* Load factors */
            LOAD_FACTOR_1_4;
            /* Multiply */
            pmulhw_r2r(mm7, mm0);
            paddw_r2r(mm0, mm3);
            //  DUMP_MM("mm3_2", mm3);
            src += 4;
            factors++;
        }
        psraw_i2r(5, mm3);
        packuswb_r2r(mm6, mm3);
        movd_r2m(mm3, *dst);
        dst+=4;
    }
    ctx->need_emms = 1;
}
static __inline__ int qblock_sad_mmxe(uint8_t *refblk, uint32_t h, uint32_t rowstride)
{
    int res;

    pxor_r2r (mm4,mm4);

    movq_r2r (mm0,mm5);             /* First row */
    movd_m2r (*refblk, mm6);
    pxor_r2r ( mm7, mm7);
    refblk += rowstride;
    punpcklbw_r2r ( mm7, mm5);
    punpcklbw_r2r ( mm7, mm6);
    psadbw_r2r ( mm5, mm6);
    paddw_r2r ( mm6, mm4 );

    movq_r2r (mm1,mm5);             /* Second row */
    movd_m2r (*refblk, mm6);
    refblk += rowstride;
    punpcklbw_r2r ( mm7, mm5);
    punpcklbw_r2r ( mm7, mm6);
    psadbw_r2r ( mm5, mm6);
    paddw_r2r ( mm6, mm4 );

    if( h == 4 )
    {
        movq_r2r (mm2,mm5);         /* Third row */
        movd_m2r (*refblk, mm6);
        refblk += rowstride;
        punpcklbw_r2r ( mm7, mm5);
        punpcklbw_r2r ( mm7, mm6);
        psadbw_r2r ( mm5, mm6);
        paddw_r2r ( mm6, mm4 );

        movq_r2r (mm3,mm5);         /* Fourth row */
        movd_m2r (*refblk, mm6);
        punpcklbw_r2r ( mm7, mm5);
        punpcklbw_r2r ( mm7, mm6);
        psadbw_r2r ( mm5, mm6);
        paddw_r2r ( mm6, mm4 );
    }
    movd_r2m ( mm4, res );
    return res;
}
static inline void mmx_unpack_16rgb (uint8_t * image, const int cpu)
{
    static mmx_t mmx_bluemask  = {0xf8f8f8f8f8f8f8f8LL};
    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
    static mmx_t mmx_redmask   = {0xf8f8f8f8f8f8f8f8LL};

    /*
     * convert RGB plane to RGB 16 bits
     * mm0 -> B, mm1 -> R, mm2 -> G
     * mm4 -> GB, mm5 -> AR pixel 4-7
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */

    pand_m2r (mmx_bluemask, mm0);   /* mm0 = b7b6b5b4b3______ */
    pand_m2r (mmx_greenmask, mm2);  /* mm2 = g7g6g5g4g3g2____ */
    pand_m2r (mmx_redmask, mm1);    /* mm1 = r7r6r5r4r3______ */

    psrlq_i2r (3, mm0);             /* mm0 = ______b7b6b5b4b3 */
    pxor_r2r (mm4, mm4);            /* mm4 = 0 */
    movq_r2r (mm0, mm5);            /* mm5 = ______b7b6b5b4b3 */
    movq_r2r (mm2, mm7);            /* mm7 = g7g6g5g4g3g2____ */

    punpcklbw_r2r (mm4, mm2);
    punpcklbw_r2r (mm1, mm0);
    psllq_i2r (3, mm2);
    por_r2r (mm2, mm0);
    movntq (mm0, *image);

    punpckhbw_r2r (mm4, mm7);
    punpckhbw_r2r (mm1, mm5);
    psllq_i2r (3, mm7);
    por_r2r (mm7, mm5);
    movntq (mm5, *(image+8));
}
static inline void mmx_end(uint8_t *src3, uint8_t *src5, uint8_t *dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);
    punpckhbw_m2r (mm_cpool[0], mm5);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    psrlw_i2r (3, mm0);
    psrlw_i2r (3, mm1);
    psubw_r2r (mm6, mm4);
    psubw_r2r (mm7, mm5);
    packuswb_r2r (mm1,mm0);
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    pcmpgtw_m2r (mm_lthr, mm4);
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);
    movq_r2r (mm4, mm5);
    pandn_r2r (mm0, mm4);
    pand_m2r (src3[X], mm5);
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
static void frame_i2f_sse(u_char *src,float *dst,int l)
{
    int i;

    pxor_r2r(mm7,mm7);
    for( i=0; i<l; i+=8 )
    {
        movq_m2r(*src,mm0);
        movq_r2r(mm0, mm2);
        punpcklbw_r2r(mm7, mm0);
        punpckhbw_r2r(mm7, mm2);
        movq_r2r(mm0, mm1);
        movq_r2r(mm2, mm3);
        punpcklwd_r2r(mm7, mm0);
        punpckhwd_r2r(mm7, mm1);
        punpcklwd_r2r(mm7, mm2);
        punpckhwd_r2r(mm7, mm3);
        cvtpi2ps_r2r(mm0,xmm0);
        cvtpi2ps_r2r(mm1,xmm1);
        cvtpi2ps_r2r(mm2,xmm2);
        cvtpi2ps_r2r(mm3,xmm3);
        movlps_r2m(xmm0,dst[0]);
        movlps_r2m(xmm1,dst[2]);
        movlps_r2m(xmm2,dst[4]);
        movlps_r2m(xmm3,dst[6]);
        src+=8;
        dst+=8;
    }
    emms();
}
static void scale_uint8_x_1_x_bicubic_noclip_mmx(gavl_video_scale_context_t * ctx,
                                                 int scanline, uint8_t * dest_start)
{
    int i;
    uint8_t * src, * dst, *src_start;
    int32_t * factors;
    mmx_t tmp_mm;

    //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_noclip_mmx\n");

    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    dst = dest_start;

    for(i = 0; i < ctx->dst_size; i++)
    {
        src = src_start + ctx->table_h.pixels[i].index;
        factors = ctx->table_h.pixels[i].factor_i;

        /* Load pixels */
        movd_m2r(*(src), mm0);
        punpcklbw_r2r(mm6, mm0);
        /* Load factors */
        movq_m2r(*factors, mm2);
        movq_m2r(*(factors+2), mm3);
        packssdw_r2r(mm3, mm2);
        /* Multiply */
        pmaddwd_r2r(mm2, mm0);
        psrld_i2r(14, mm0);
        MOVQ_R2M(mm0, tmp_mm);
        *(dst++) = tmp_mm.d[0] + tmp_mm.d[1];
    }
    ctx->need_emms = 1;
}
static inline void mmx_unpack_32rgb (uint8_t * image, const int cpu)
{
    /*
     * convert RGB plane to RGB packed format,
     * mm0 -> B, mm1 -> R, mm2 -> G, mm3 -> 0,
     * mm4 -> GB, mm5 -> AR pixel 4-7,
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */

    pxor_r2r (mm3, mm3);
    movq_r2r (mm0, mm6);
    movq_r2r (mm1, mm7);
    movq_r2r (mm0, mm4);
    movq_r2r (mm1, mm5);

    punpcklbw_r2r (mm2, mm6);
    punpcklbw_r2r (mm3, mm7);
    punpcklwd_r2r (mm7, mm6);
    movntq (mm6, *image);

    movq_r2r (mm0, mm6);
    punpcklbw_r2r (mm2, mm6);
    punpckhwd_r2r (mm7, mm6);
    movntq (mm6, *(image+8));

    punpckhbw_r2r (mm2, mm4);
    punpckhbw_r2r (mm3, mm5);
    punpcklwd_r2r (mm5, mm4);
    movntq (mm4, *(image+16));

    movq_r2r (mm0, mm4);
    punpckhbw_r2r (mm2, mm4);
    punpckhwd_r2r (mm5, mm4);
    movntq (mm4, *(image+24));
}
/* For a 16*h block, this sums the absolute differences between *p2 and the
   rounded bidirectional prediction
   (((*pf + *pf2 + 1)>>1) + ((*pb + *pb2 + 1)>>1) + 1)>>1 */
static int bsad_0quad_mmxe(uint8_t *pf, uint8_t *pf2, uint8_t *pb, uint8_t *pb2,
                           uint8_t *p2, int lx, int h)
{
    int32_t s=0;

    pxor_r2r(mm7, mm7);
    do {
        movq_m2r(pf2[0],mm0);
        movq_m2r(pf2[8],mm2);
        movq_m2r(pb2[0],mm1);
        movq_m2r(pb2[8],mm3);
        pavgb_m2r(pf[0],mm0);
        pavgb_m2r(pf[8],mm2);
        pavgb_m2r(pb[0],mm1);
        pavgb_m2r(pb[8],mm3);
        pavgb_r2r(mm1,mm0);
        pavgb_r2r(mm3,mm2);
        psadbw_m2r(p2[0],mm0);
        psadbw_m2r(p2[8],mm2);
        paddd_r2r(mm0,mm7);
        paddd_r2r(mm2,mm7);

        pf+=lx;
        pf2+=lx;
        pb+=lx;
        pb2+=lx;
        p2+=lx;
        h--;
    } while (h);
    movd_r2g(mm7,s);
    emms();
    return s;
}
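/*
 * Scalar sketch of what bsad_0quad_mmxe() accumulates (reference only;
 * the block width is fixed at 16 as in the MMX version, and the per-stage
 * round-up matches pavgb). Assumes <stdlib.h> for abs().
 */
static int bsad_0quad_ref(uint8_t *pf, uint8_t *pf2, uint8_t *pb, uint8_t *pb2,
                          uint8_t *p2, int lx, int h)
{
    int s = 0, x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++) {
            int f = (pf[x] + pf2[x] + 1) >> 1;  /* forward average */
            int b = (pb[x] + pb2[x] + 1) >> 1;  /* backward average */
            int p = (f + b + 1) >> 1;           /* bidirectional prediction */
            s += abs(p - p2[x]);
        }
        pf += lx; pf2 += lx; pb += lx; pb2 += lx; p2 += lx;
    }
    return s;
}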
void evas_common_cpu_mmx_test(void)
{
#ifdef BUILD_MMX
    pxor_r2r(mm4, mm4);
#endif
}
static void _op_mul_p_mas_dp_mmx(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l)
{
    DATA32 *e = d + l;

    MOV_A2R(ALPHA_255, mm5)
    pxor_r2r(mm0, mm0);
    while (d < e)
    {
        c = *m;
        switch(c)
        {
        case 0:
            break;
        case 255:
            MOV_P2R(*d, mm1, mm0)
            MOV_P2R(*s, mm2, mm0)
            MUL4_SYM_R2R(mm2, mm1, mm5)
            MOV_R2P(mm1, *d, mm0)
            break;
        default:
            c++;
            MOV_A2R(c, mm1)
            c = ~(*s);
            MOV_P2R(c, mm3, mm0)
            MUL4_256_R2R(mm3, mm1)
            movq_r2r(mm5, mm4);
            psubw_r2r(mm1, mm4);
            MOV_P2R(*d, mm1, mm0)
            MUL4_SYM_R2R(mm4, mm1, mm5)
            MOV_R2P(mm1, *d, mm0)
            break;
        }
        s++;  m++;  d++;
    }
}
static void scale_uint8_x_4_x_bilinear_mmx(gavl_video_scale_context_t * ctx,
                                           int scanline, uint8_t * dest_start)
{
    int i;
    uint8_t * src, * dst, *src_start;
    int32_t * factors;
    //  mmx_t tmp_mm;

    /*
     *  mm0: Input1
     *  mm1: Factor mask
     *  mm2:
     *  mm3: Output
     *  mm4:
     *  mm5: Input2
     *  mm6: 0
     *  mm7: Factor
     */

    //  fprintf(stderr, "scale_uint8_x_4_x_bilinear_mmx\n");

    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    movq_m2r(factor_mask, mm1);
    dst = dest_start;

    for(i = 0; i < ctx->dst_size; i++)
    {
        src = src_start + 4*ctx->table_h.pixels[i].index;
        factors = ctx->table_h.pixels[i].factor_i;

        /* Load pixels */
        movd_m2r(*(src), mm0);
        punpcklbw_r2r(mm6, mm0);
        psllw_i2r(6, mm0);          /* 14 bit */
        /* Load pixels */
        movd_m2r(*(src+4), mm5);
        punpcklbw_r2r(mm6, mm5);
        psllw_i2r(6, mm5);          /* 14 bit */
        /* Load factors */
        LOAD_FACTOR_1_4_NOCLIP;     /* 14 bit */
        /* Subtract */
        psubsw_r2r(mm5, mm0);       /* s1(mm0) - s2(mm5) -> mm0 (14 bit) */
        pmulhw_r2r(mm7, mm0);       /* factor * (s1 - s2) -> mm0 (12 bit) */
        psllw_i2r(2, mm0);          /* (14 bit) */
        paddsw_r2r(mm5, mm0);       /* (15 bit) */
        psraw_i2r(6, mm0);          /* (8 bit) */
        packuswb_r2r(mm6, mm0);
        movd_r2m(mm0, *dst);
        dst+=4;
    }
    ctx->need_emms = 1;
}
static inline void mean8(unsigned char *refpix, unsigned char *pixel, int radius_count,
                         int row_stride, int threshold, int8_t *diff, unsigned char *count)
{
    int a,b;

    pxor_r2r(mm6,mm6);              // mm6 (aka count) = 0
    pxor_r2r(mm7,mm7);              // mm7 (aka diff) = 0
    movq_m2r(*refpix,mm3);          // mm3 = refpix[0]
    movd_g2r(0x80808080,mm4);       // mm4 = 128
    punpcklbw_r2r(mm4,mm4);
    pxor_r2r(mm4,mm3);              // mm3 = refpix[0]-128
    movd_g2r(threshold,mm5);        // mm5 = threshold
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);
    punpcklbw_r2r(mm5,mm5);

    for( b=0; b<radius_count; b++ )
    {
        for( a=0; a<radius_count; a++ )
        {
            movq_m2r(*pixel,mm0);   // mm0 = pixel[0]
            pxor_r2r(mm4,mm0);      // mm0 = pixel[0]-128
            movq_r2r(mm3,mm2);      // mm2 = refpix[0]-128
            psubsb_r2r(mm0,mm2);    // mm2 = refpix[0]-pixel[0]
            psubsb_r2r(mm3,mm0);    // mm0 = pixel[0]-refpix[0]
            pminub_r2r(mm0,mm2);    // mm2 = abs(pixel[0]-refpix[0])
            movq_r2r(mm5,mm1);      // mm1 = threshold
            pcmpgtb_r2r(mm2,mm1);   // mm1 = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0
            psubb_r2r(mm1,mm6);     // mm6 += (threshold > abs(pixel[0]-refpix[0]))
            pand_r2r(mm1,mm0);      // mm0 = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            paddb_r2r(mm0,mm7);     // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0
            ++pixel;
        }
        pixel += row_stride - radius_count;
    }
    movq_r2m(mm6,*count);
    movq_r2m(mm7,*diff);
    emms();
}
static inline void MC_put4_8 (int height, uint8_t * dest, uint8_t * ref,
                              int stride, int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);
        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);
        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);
        pand_m2r (mask_one, mm7);
        psubusb_r2r (mm7, mm0);
        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;
        movq_r2r (mm6, mm7);    // unroll !
        movq_r2r (mm2, mm0);    // unroll !
    } while (--height);
}
static void deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4, uint8_t * lum_m3,
                                  uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum,
                                  int size)
{
    mmx_t rounder;

    rounder.uw[0] = 4;
    rounder.uw[1] = 4;
    rounder.uw[2] = 4;
    rounder.uw[3] = 4;
    pxor_r2r (mm7, mm7);
    movq_m2r (rounder, mm6);

    for (; size > 3; size -= 4) {
        movd_m2r (*lum_m4, mm0);
        movd_m2r (*lum_m3, mm1);
        movd_m2r (*lum_m2, mm2);
        movd_m2r (*lum_m1, mm3);
        movd_m2r (*lum, mm4);
        punpcklbw_r2r (mm7, mm0);
        punpcklbw_r2r (mm7, mm1);
        punpcklbw_r2r (mm7, mm2);
        punpcklbw_r2r (mm7, mm3);
        punpcklbw_r2r (mm7, mm4);
        paddw_r2r (mm3, mm1);
        psllw_i2r (1, mm2);
        paddw_r2r (mm4, mm0);
        psllw_i2r (2, mm1);     // 2
        paddw_r2r (mm6, mm2);
        paddw_r2r (mm2, mm1);
        psubusw_r2r (mm0, mm1);
        psrlw_i2r (3, mm1);     // 3
        packuswb_r2r (mm7, mm1);
        movd_r2m (mm1, *dst);
        lum_m4 += 4;
        lum_m3 += 4;
        lum_m2 += 4;
        lum_m1 += 4;
        lum += 4;
        dst += 4;
    }
    emms ();

    /* Handle odd widths */
    if (size > 0)
        deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
/* filter parameters: [-1 4 2 4 -1] // 8 */
static inline void deinterlace_line(uint8_t *dst, const uint8_t *lum_m4,
                                    const uint8_t *lum_m3, const uint8_t *lum_m2,
                                    const uint8_t *lum_m1, const uint8_t *lum,
                                    int size)
{
#ifndef USE_MMX
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int sum;

    for(;size > 0;size--) {
        sum = -lum_m4[0];
        sum += lum_m3[0] << 2;
        sum += lum_m2[0] << 1;
        sum += lum_m1[0] << 2;
        sum += -lum[0];
        dst[0] = cm[(sum + 4) >> 3];
        lum_m4++;
        lum_m3++;
        lum_m2++;
        lum_m1++;
        lum++;
        dst++;
    }
#else
    {
        mmx_t rounder;
        rounder.uw[0]=4;
        rounder.uw[1]=4;
        rounder.uw[2]=4;
        rounder.uw[3]=4;
        pxor_r2r(mm7,mm7);
        movq_m2r(rounder,mm6);
    }
    for (;size > 3; size-=4) {
        DEINT_LINE_LUM
        lum_m4+=4;
        lum_m3+=4;
        lum_m2+=4;
        lum_m1+=4;
        lum+=4;
        dst+=4;
    }
#endif
}
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
                                         uint8_t *src1, int i_src1,
                                         uint8_t *src2, int i_src2 )
{
    static const uint64_t m_4 = INT64_C(0x0004000400040004);
    int y, x;

    /* Progressive */
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 8; y += 2 )
    {
        for( x = 0; x < 8; x +=4 )
        {
            movd_m2r( src1[x], mm0 );
            movd_r2m( mm0, dst[x] );

            movd_m2r( src2[x], mm1 );
            movd_m2r( src1[i_src1+x], mm2 );

            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            paddw_r2r( mm1, mm1 );
            movq_r2r( mm1, mm3 );
            paddw_r2r( mm3, mm3 );
            paddw_r2r( mm2, mm0 );
            paddw_r2r( mm3, mm1 );
            paddw_m2r( m_4, mm1 );
            paddw_r2r( mm1, mm0 );
            psraw_i2r( 3, mm0 );
            packuswb_r2r( mm7, mm0 );
            movd_r2m( mm0, dst[i_dst+x] );
        }
        dst += 2*i_dst;
        src1 += i_src1;
        src2 += i_src2;
    }
}
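/*
 * Scalar sketch of the per-pixel merge above (reference only): the output
 * line between two src1 lines is a [1 6 1]/8 blend, i.e.
 *   dst[y]   = src1[y]
 *   dst[y+1] = (src1[y] + 6*src2[y] + src1[y+1] + 4) >> 3
 */
static inline uint8_t xdeint_merge_ref( uint8_t a, uint8_t b, uint8_t c )
{
    /* a = src1 pixel above, b = src2 pixel, c = src1 pixel below */
    return (uint8_t)( (a + 6 * b + c + 4) >> 3 );
}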
static void scale_uint16_x_1_x_bicubic_mmx(gavl_video_scale_context_t * ctx,
                                           int scanline, uint8_t * dest_start)
{
    int i;
    uint16_t * dst;
    uint8_t * src, *src_start;
    int32_t * factors;
    mmx_t tmp_mm;
    int32_t tmp;

    //  fprintf(stderr, "scale_uint8_x_1_x_bicubic_mmx\n");

    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    dst = (uint16_t*)dest_start;

    for(i = 0; i < ctx->dst_size; i++)
    {
        src = src_start + 2*ctx->table_h.pixels[i].index;
        factors = ctx->table_h.pixels[i].factor_i;

        /* Load pixels */
        movq_m2r(*(src), mm0);
        psrlw_i2r(1, mm0);
        //  DUMP_MM("mm0", mm0);
        /* Load factors */
        movq_m2r(*factors, mm2);
        movq_m2r(*(factors+2), mm3);
        packssdw_r2r(mm3, mm2);
        /* Multiply */
        pmaddwd_r2r(mm2, mm0);
        MOVQ_R2M(mm0, tmp_mm);
        tmp = tmp_mm.d[0] + tmp_mm.d[1];
        tmp >>= 13;
        RECLIP(tmp, ctx->plane);
        *(dst++) = tmp;
    }
    ctx->need_emms = 1;
}
int field_dct_best_mmx( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb)
{
    /*
     * calculate prediction error (cur-pred) for top (blk0)
     * and bottom field (blk1)
     */
    double r,d;
    int rowoffs = 0;
    int sumtop, sumbot, sumsqtop, sumsqbot, sumbottop;
    int j;
    int dct_type;
    int topvar, botvar;
    mmx_t sumtop_accs, sumbot_accs;
    mmx_t sumsqtop_accs, sumsqbot_accs, sumxprod_accs;
    int32_t sumtop_acc, sumbot_acc;
    int32_t sumsqtop_acc, sumsqbot_acc, sumxprod_acc;

    pxor_r2r(mm0,mm0);
    movq_r2m( mm0, *(&sumtop_accs) );
    movq_r2m( mm0, *(&sumbot_accs) );
    movq_r2m( mm0, *(&sumsqtop_accs) );
    movq_r2m( mm0, *(&sumsqbot_accs) );
    movq_r2m( mm0, *(&sumxprod_accs) );

    sumtop = sumsqtop = sumbot = sumsqbot = sumbottop = 0;
    sumtop_acc = sumbot_acc = sumsqtop_acc = sumsqbot_acc = sumxprod_acc = 0;

    for (j=0; j<8; j++)
    {
#ifdef ORIGINAL_CODE
        for (i=0; i<16; i++)
        {
            register int toppix = cur_lum_mb[rowoffs+i] - pred_lum_mb[rowoffs+i];
            register int botpix = cur_lum_mb[rowoffs+width+i] - pred_lum_mb[rowoffs+width+i];
            sumtop += toppix;
            sumsqtop += toppix*toppix;
            sumbot += botpix;
            sumsqbot += botpix*botpix;
            sumbottop += toppix*botpix;
        }
#endif
        sum_sumsq_8bytes( &cur_lum_mb[rowoffs], &pred_lum_mb[rowoffs],
                          &sumtop_accs, &sumbot_accs,
                          &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs );
        sum_sumsq_8bytes( &cur_lum_mb[rowoffs+8], &pred_lum_mb[rowoffs+8],
                          &sumtop_accs, &sumbot_accs,
                          &sumsqtop_accs, &sumsqbot_accs, &sumxprod_accs );
        rowoffs += (opt->phy_width<<1);
    }

    mmx_sum_4_word_accs( &sumtop_accs, &sumtop );
    mmx_sum_4_word_accs( &sumbot_accs, &sumbot );
    emms();
    sumsqtop = sumsqtop_accs.d[0] + sumsqtop_accs.d[1];
    sumsqbot = sumsqbot_accs.d[0] + sumsqbot_accs.d[1];
    sumbottop = sumxprod_accs.d[0] + sumxprod_accs.d[1];

    /* Calculate variances for the top and bottom fields.  If they're of
       similar sign, estimate the correlation; if it's good, use frame DCT,
       otherwise use field DCT. */
    r = 0.0;
    topvar = sumsqtop-sumtop*sumtop/128;
    botvar = sumsqbot-sumbot*sumbot/128;
    if ( !((topvar <= 0) ^ (botvar <= 0)) )
    {
        d = ((double) topvar) * ((double)botvar);
        r = (sumbottop-(sumtop*sumbot)/128);
        if (r>0.5*sqrt(d))
            return 0; /* frame DCT */
        else
            return 1; /* field DCT */
    }
    else
        return 1; /* field DCT */

    return dct_type;
}
static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
{
    static mmx_t mmx_80w = {0x0080008000800080LL};
    static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
    static mmx_t mmx_U_blue = {0x4093409340934093LL};
    static mmx_t mmx_V_red = {0x3312331233123312LL};
    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
    static mmx_t mmx_10w = {0x1010101010101010LL};
    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};
    static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};

    movd_m2r (*pu, mm0);        /* mm0 = 00 00 00 00 u3 u2 u1 u0 */
    movd_m2r (*pv, mm1);        /* mm1 = 00 00 00 00 v3 v2 v1 v0 */
    movq_m2r (*py, mm6);        /* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pxor_r2r (mm4, mm4);        /* mm4 = 0 */
    /* XXX might do cache preload for image here */

    /*
     * Do the multiply part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    punpcklbw_r2r (mm4, mm0);           /* mm0 = u3 u2 u1 u0 */
    punpcklbw_r2r (mm4, mm1);           /* mm1 = v3 v2 v1 v0 */
    psubsw_m2r (mmx_80w, mm0);          /* u -= 128 */
    psubsw_m2r (mmx_80w, mm1);          /* v -= 128 */
    psllw_i2r (3, mm0);                 /* promote precision */
    psllw_i2r (3, mm1);                 /* promote precision */
    movq_r2r (mm0, mm2);                /* mm2 = u3 u2 u1 u0 */
    movq_r2r (mm1, mm3);                /* mm3 = v3 v2 v1 v0 */
    pmulhw_m2r (mmx_U_green, mm2);      /* mm2 = u * u_green */
    pmulhw_m2r (mmx_V_green, mm3);      /* mm3 = v * v_green */
    pmulhw_m2r (mmx_U_blue, mm0);       /* mm0 = chroma_b */
    pmulhw_m2r (mmx_V_red, mm1);        /* mm1 = chroma_r */
    paddsw_r2r (mm3, mm2);              /* mm2 = chroma_g */
    psubusb_m2r (mmx_10w, mm6);         /* Y -= 16 */
    movq_r2r (mm6, mm7);                /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pand_m2r (mmx_00ffw, mm6);          /* mm6 = Y6 Y4 Y2 Y0 */
    psrlw_i2r (8, mm7);                 /* mm7 = Y7 Y5 Y3 Y1 */
    psllw_i2r (3, mm6);                 /* promote precision */
    psllw_i2r (3, mm7);                 /* promote precision */
    pmulhw_m2r (mmx_Y_coeff, mm6);      /* mm6 = luma_rgb even */
    pmulhw_m2r (mmx_Y_coeff, mm7);      /* mm7 = luma_rgb odd */

    /*
     * Do the addition part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    movq_r2r (mm0, mm3);                /* mm3 = chroma_b */
    movq_r2r (mm1, mm4);                /* mm4 = chroma_r */
    movq_r2r (mm2, mm5);                /* mm5 = chroma_g */
    paddsw_r2r (mm6, mm0);              /* mm0 = B6 B4 B2 B0 */
    paddsw_r2r (mm7, mm3);              /* mm3 = B7 B5 B3 B1 */
    paddsw_r2r (mm6, mm1);              /* mm1 = R6 R4 R2 R0 */
    paddsw_r2r (mm7, mm4);              /* mm4 = R7 R5 R3 R1 */
    paddsw_r2r (mm6, mm2);              /* mm2 = G6 G4 G2 G0 */
    paddsw_r2r (mm7, mm5);              /* mm5 = G7 G5 G3 G1 */
    packuswb_r2r (mm0, mm0);            /* saturate to 0-255 */
    packuswb_r2r (mm1, mm1);            /* saturate to 0-255 */
    packuswb_r2r (mm2, mm2);            /* saturate to 0-255 */
    packuswb_r2r (mm3, mm3);            /* saturate to 0-255 */
    packuswb_r2r (mm4, mm4);            /* saturate to 0-255 */
    packuswb_r2r (mm5, mm5);            /* saturate to 0-255 */
    punpcklbw_r2r (mm3, mm0);           /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
    punpcklbw_r2r (mm4, mm1);           /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
    punpcklbw_r2r (mm5, mm2);           /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
}
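/*
 * The constants above appear to be the usual BT.601 coefficients in signed
 * 16-bit fixed point (an assumption based on mpeg2dec-style converters):
 * inputs are promoted with psllw(3) and pmulhw keeps the high 16 bits of the
 * product, so each constant is the coefficient scaled by 2^13. A small
 * sanity-check sketch (hypothetical helper; requires <assert.h> and <math.h>):
 */
static void mmx_yuv2rgb_coeff_check (void)
{
    assert (fabs (0x253f / 8192.0 - 1.164) < 1e-3);                  /* Y scale */
    assert (fabs (0x3312 / 8192.0 - 1.596) < 1e-3);                  /* V -> R  */
    assert (fabs (0x4093 / 8192.0 - 2.018) < 1e-3);                  /* U -> B  */
    assert (fabs ((0xf37d - 0x10000) / 8192.0 - (-0.391)) < 1e-3);   /* U -> G  */
    assert (fabs ((0xe5fc - 0x10000) / 8192.0 - (-0.813)) < 1e-3);   /* V -> G  */
}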
static int bsad_1quad_mmxe(uint8_t *pf, uint8_t *pb, uint8_t *pb2, uint8_t *p2, int lx, int h)
{
    int s;

    s = 0; /* the accumulator */
    if (h > 0)
    {
        pcmpeqw_r2r(mm6, mm6);
        psrlw_i2r(15, mm6);
        paddw_r2r(mm6, mm6);

        pxor_r2r(mm7, mm7);
        pxor_r2r(mm5, mm5);

        do {
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pf[1],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+1],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(pb2[0],mm1);
            pavgb_m2r(pb[0],mm1);
            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[0],mm0);
            paddd_r2r(mm0,mm5);

            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pf[9],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pf[lx+9],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(pb2[8],mm1);
            pavgb_m2r(pb[8],mm1);
            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[8],mm0);
            paddd_r2r(mm0,mm5);

            p2 += lx;
            pf += lx;
            pb += lx;
            pb2 += lx;
            h--;
        } while (h > 0);
    }
    movd_r2g(mm5,s);
    emms();
    return s;
}
VLC_MMX
static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                    int i_pitch_prev, int i_pitch_curr,
                                    int* pi_top, int* pi_bot )
{
    int32_t i_motion = 0;
    int32_t i_top_motion = 0;
    int32_t i_bot_motion = 0;

    static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
    movq_m2r( bT,  mm5 );

    pxor_r2r( mm3, mm3 ); /* score (top field) */
    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
    for( int y = 0; y < 8; y+=2 )
    {
        /* top field */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm3 ); /* add to top field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;

        /* bottom field - handling identical to top field, except... */
        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
        movq_r2r( mm0, mm2 );
        psubusb_r2r( mm1, mm2 );
        psubusb_r2r( mm0, mm1 );

        pcmpgtb_r2r( mm5, mm2 );
        pcmpgtb_r2r( mm5, mm1 );
        psadbw_r2r(  mm6, mm2 );
        psadbw_r2r(  mm6, mm1 );

        paddd_r2r( mm2, mm1 );
        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */

        p_pix_c += i_pitch_curr;
        p_pix_p += i_pitch_prev;
    }
    movq_r2r( mm3, mm7 ); /* score (total) */
    paddd_r2r( mm4, mm7 );
    movd_r2m( mm3, i_top_motion );
    movd_r2m( mm4, i_bot_motion );
    movd_r2m( mm7, i_motion );

    /* The loop counts actual score * 255. */
    i_top_motion /= 255;
    i_bot_motion /= 255;
    i_motion     /= 255;

    emms();

    (*pi_top) = ( i_top_motion >= 8 );
    (*pi_bot) = ( i_bot_motion >= 8 );
    return (i_motion >= 8);
}
VLC_MMX
static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
                                       const picture_t* p_pic_bot )
{
    assert( p_pic_top->i_planes == p_pic_bot->i_planes );

    /* Amount of bits must be known for MMX, thus int32_t.
       Doesn't hurt the C implementation. */
    int32_t i_score_mmx = 0; /* this must be divided by 255 when finished  */
    int32_t i_score_c   = 0; /* this counts as-is (used for non-MMX parts) */

    pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */

    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
        /* Sanity check */
        if( p_pic_top->p[i_plane].i_visible_lines !=
            p_pic_bot->p[i_plane].i_visible_lines )
            return -1;

        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
                             p_pic_bot->p[i_plane].i_visible_pitch );
        const int wm8 = w % 8;   /* remainder */
        const int w8  = w - wm8; /* part of width that is divisible by 8 */

        /* Current line / neighbouring lines picture pointers */
        const picture_t *cur = p_pic_bot;
        const picture_t *ngh = p_pic_top;
        int wc = cur->p[i_plane].i_pitch;
        int wn = ngh->p[i_plane].i_pitch;

        /* Transcode 1.1.5 only checks every other line. Checking every line
           works better for anime, which may contain horizontal,
           one pixel thick cartoon outlines. */
        for( int y = 1; y < i_lasty; ++y )
        {
            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */

            int x = 0;

            /* Easy-to-read C version further below.

               Assumptions: 0 < T < 127
                            # of pixels < (2^32)/255

               Note: calculates score * 255 */
            static alignas (8) const mmx_t b0   = { .uq = 0x0000000000000000ULL };
            static alignas (8) const mmx_t b128 = { .uq = 0x8080808080808080ULL };
            static alignas (8) const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };

            for( ; x < w8; x += 8 )
            {
                movq_m2r( *((int64_t*)p_c), mm0 );
                movq_m2r( *((int64_t*)p_p), mm1 );
                movq_m2r( *((int64_t*)p_n), mm2 );

                psubb_m2r( b128, mm0 );
                psubb_m2r( b128, mm1 );
                psubb_m2r( b128, mm2 );

                psubsb_r2r( mm0, mm1 );
                psubsb_r2r( mm0, mm2 );

                pxor_r2r( mm3, mm3 );
                pxor_r2r( mm4, mm4 );
                pxor_r2r( mm5, mm5 );
                pxor_r2r( mm6, mm6 );

                punpcklbw_r2r( mm1, mm3 );
                punpcklbw_r2r( mm2, mm4 );
                punpckhbw_r2r( mm1, mm5 );
                punpckhbw_r2r( mm2, mm6 );

                pmulhw_r2r( mm3, mm4 );
                pmulhw_r2r( mm5, mm6 );

                packsswb_r2r(mm4, mm6);
                pcmpgtb_m2r( bT, mm6 );
                psadbw_m2r( b0, mm6 );
                paddd_r2r( mm6, mm7 );

                p_c += 8;
                p_p += 8;
                p_n += 8;
            }

            for( ; x < w; ++x )
            {
                /* Worst case: need 17 bits for "comb". */
                int_fast32_t C = *p_c;
                int_fast32_t P = *p_p;
                int_fast32_t N = *p_n;

                /* Comments in Transcode's filter_ivtc.c attribute this
                   combing metric to Gunnar Thalin.

                   The idea is that if the picture is interlaced, both
                   expressions will have the same sign, and this comes up
                   positive. The value T = 100 has been chosen such that a
                   pixel difference of 10 (on average) will trigger the
                   detector. */
                int_fast32_t comb = (P - C) * (N - C);
                if( comb > T )
                    ++i_score_c;

                ++p_c;
                ++p_p;
                ++p_n;
            }

            /* Now the other field - swap current and neighbour pictures */
            const picture_t *tmp = cur;
            cur = ngh;
            ngh = tmp;
            int tmp_pitch = wc;
            wc = wn;
            wn = tmp_pitch;
        }
    }

    movd_r2m( mm7, i_score_mmx );
    emms();

    return i_score_mmx/255 + i_score_c;
}
#endif

/* See header for function doc. */
int CalculateInterlaceScore( const picture_t* p_pic_top,
                             const picture_t* p_pic_bot )
{
    /* We use the comb metric from the IVTC filter of Transcode 1.1.5.
       This was found to work better for the particular purpose of IVTC
       than RenderX()'s comb metric.
       Note that we *must not* subsample at all in order to catch interlacing
       in telecined frames with localized motion (e.g. anime with characters
       talking, where only mouths move and everything else stays still.) */
    assert( p_pic_top != NULL );
    assert( p_pic_bot != NULL );

    if( p_pic_top->i_planes != p_pic_bot->i_planes )
        return -1;

#ifdef CAN_COMPILE_MMXEXT
    if (vlc_CPU_MMXEXT())
        return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
#endif

    int32_t i_score = 0;

    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
    {
        /* Sanity check */
        if( p_pic_top->p[i_plane].i_visible_lines !=
            p_pic_bot->p[i_plane].i_visible_lines )
            return -1;

        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
                             p_pic_bot->p[i_plane].i_visible_pitch );

        /* Current line / neighbouring lines picture pointers */
        const picture_t *cur = p_pic_bot;
        const picture_t *ngh = p_pic_top;
        int wc = cur->p[i_plane].i_pitch;
        int wn = ngh->p[i_plane].i_pitch;

        /* Transcode 1.1.5 only checks every other line. Checking every line
           works better for anime, which may contain horizontal,
           one pixel thick cartoon outlines. */
        for( int y = 1; y < i_lasty; ++y )
        {
            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */

            for( int x = 0; x < w; ++x )
            {
                /* Worst case: need 17 bits for "comb". */
                int_fast32_t C = *p_c;
                int_fast32_t P = *p_p;
                int_fast32_t N = *p_n;

                /* Comments in Transcode's filter_ivtc.c attribute this
                   combing metric to Gunnar Thalin.

                   The idea is that if the picture is interlaced, both
                   expressions will have the same sign, and this comes up
                   positive. The value T = 100 has been chosen such that a
                   pixel difference of 10 (on average) will trigger the
                   detector. */
                int_fast32_t comb = (P - C) * (N - C);
                if( comb > T )
                    ++i_score;

                ++p_c;
                ++p_p;
                ++p_n;
            }

            /* Now the other field - swap current and neighbour pictures */
            const picture_t *tmp = cur;
            cur = ngh;
            ngh = tmp;
            int tmp_pitch = wc;
            wc = wn;
            wn = tmp_pitch;
        }
    }

    return i_score;
}
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx,
             int hxf, int hyf, int hxb, int hyb, int h)
{
    uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
    int s, s1, s2;

    pfa = pf + hxf;
    pfb = pf + lx * hyf;
    pfc = pfb + hxf;

    pba = pb + hxb;
    pbb = pb + lx * hyb;
    pbc = pbb + hxb;

    s = 0; /* the accumulator */

    if (h > 0)
    {
        pxor_r2r(mm7, mm7);
        pxor_r2r(mm6, mm6);
        pcmpeqw_r2r(mm5, mm5);
        psubw_r2r(mm5, mm6);
        psllw_i2r(1, mm6);

        do {
            BSAD_LOAD(pf[0],mm0,mm1);
            BSAD_LOAD_ACC(pfa[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[0],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[0],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            BSAD_LOAD(pb[0],mm2,mm3);
            BSAD_LOAD_ACC(pba[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[0],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[0],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(p2[0], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            BSAD_LOAD(pf[8],mm0,mm1);
            BSAD_LOAD_ACC(pfa[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfb[8],mm2,mm3,mm0,mm1);
            BSAD_LOAD_ACC(pfc[8],mm2,mm3,mm0,mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            BSAD_LOAD(pb[8],mm2,mm3);
            BSAD_LOAD_ACC(pba[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbb[8],mm4,mm5,mm2,mm3);
            BSAD_LOAD_ACC(pbc[8],mm4,mm5,mm2,mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(p2[8], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            p2 += lx;
            pf += lx;
            pfa += lx;
            pfb += lx;
            pfc += lx;
            pb += lx;
            pba += lx;
            pbb += lx;
            pbc += lx;
            h--;
        } while (h > 0);
    }

    emms();
    return s;
}
static void _evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_MMX
    int xx, yy;
    register unsigned char *yp1, *up, *vp;
    unsigned char *dp1;

    /* destination pointers */
    dp1 = rgb;

    for (yy = 0; yy < h; yy++)
    {
        /* plane pointers */
        yp1 = yuv[yy];
        up = yuv[h + (yy / 2)];
        vp = yuv[h + (h / 2) + (yy / 2)];
        for (xx = 0; xx < (w - 7); xx += 8)
        {
            movd_m2r(*up, mm3);
            movd_m2r(*vp, mm2);
            movq_m2r(*yp1, mm0);

            pxor_r2r(mm7, mm7);
            punpcklbw_r2r(mm7, mm2);
            punpcklbw_r2r(mm7, mm3);

            movq_r2r(mm0, mm1);
            psrlw_i2r(8, mm0);
            psllw_i2r(8, mm1);
            psrlw_i2r(8, mm1);

            movq_m2r(CONST_16, mm4);
            psubsw_r2r(mm4, mm0);
            psubsw_r2r(mm4, mm1);

            movq_m2r(CONST_128, mm5);
            psubsw_r2r(mm5, mm2);
            psubsw_r2r(mm5, mm3);

            movq_m2r(CONST_YMUL, mm4);
            pmullw_r2r(mm4, mm0);
            pmullw_r2r(mm4, mm1);

            movq_m2r(CONST_CRVCRV, mm7);
            pmullw_r2r(mm3, mm7);
            movq_m2r(CONST_CBUCBU, mm6);
            pmullw_r2r(mm2, mm6);
            movq_m2r(CONST_CGUCGU, mm5);
            pmullw_r2r(mm2, mm5);
            movq_m2r(CONST_CGVCGV, mm4);
            pmullw_r2r(mm3, mm4);

            movq_r2r(mm0, mm2);
            paddsw_r2r(mm7, mm2);
            paddsw_r2r(mm1, mm7);

            psraw_i2r(RES, mm2);
            psraw_i2r(RES, mm7);
            packuswb_r2r(mm7, mm2);

            pxor_r2r(mm7, mm7);
            movq_r2r(mm2, mm3);
            punpckhbw_r2r(mm7, mm2);
            punpcklbw_r2r(mm3, mm7);
            por_r2r(mm7, mm2);

            movq_r2r(mm0, mm3);
            psubsw_r2r(mm5, mm3);
            psubsw_r2r(mm4, mm3);
            paddsw_m2r(CONST_32, mm3);

            movq_r2r(mm1, mm7);
            psubsw_r2r(mm5, mm7);
            psubsw_r2r(mm4, mm7);
            paddsw_m2r(CONST_32, mm7);

            psraw_i2r(RES, mm3);
            psraw_i2r(RES, mm7);
            packuswb_r2r(mm7, mm3);

            pxor_r2r(mm7, mm7);
            movq_r2r(mm3, mm4);
            punpckhbw_r2r(mm7, mm3);
            punpcklbw_r2r(mm4, mm7);
            por_r2r(mm7, mm3);

            movq_m2r(CONST_32, mm4);
            paddsw_r2r(mm6, mm0);
            paddsw_r2r(mm6, mm1);
            paddsw_r2r(mm4, mm0);
            paddsw_r2r(mm4, mm1);
            psraw_i2r(RES, mm0);
            psraw_i2r(RES, mm1);
            packuswb_r2r(mm1, mm0);

            pxor_r2r(mm7, mm7);
            movq_r2r(mm0, mm5);
            punpckhbw_r2r(mm7, mm0);
            punpcklbw_r2r(mm5, mm7);
            por_r2r(mm7, mm0);

            movq_m2r(CONST_FF, mm1);
            movq_r2r(mm0, mm5);
            movq_r2r(mm3, mm6);
            movq_r2r(mm2, mm7);
            punpckhbw_r2r(mm3, mm2);
            punpcklbw_r2r(mm6, mm7);
            punpckhbw_r2r(mm1, mm0);
            punpcklbw_r2r(mm1, mm5);

            movq_r2r(mm7, mm1);
            punpckhwd_r2r(mm5, mm7);
            punpcklwd_r2r(mm5, mm1);

            movq_r2r(mm2, mm4);
            punpckhwd_r2r(mm0, mm2);
            punpcklwd_r2r(mm0, mm4);

            movntq_r2m(mm1, *(dp1));
            movntq_r2m(mm7, *(dp1 + 8));
            movntq_r2m(mm4, *(dp1 + 16));
            movntq_r2m(mm2, *(dp1 + 24));

            yp1 += 8;
            up += 4;
            vp += 4;
            dp1 += 8 * 4;
        }
        /* clean up rows whose width isn't a multiple of 8 pixels */
        if (xx < w)
        {
            int y, u, v, r, g, b;

            for (; xx < w; xx += 2)
            {
                u = (*up++) - 128;
                v = (*vp++) - 128;

                y = RZ(YMUL) * ((*yp1++) - 16);
                r = LUT_CLIP((y + (_crv * v)) >> RES);
                g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
                b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
                *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
                dp1 += 4;

                y = RZ(YMUL) * ((*yp1++) - 16);
                r = LUT_CLIP((y + (_crv * v)) >> RES);
                g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
                b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
                *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
                dp1 += 4;
            }
        }
    }
static inline void mmx_zero_reg (void)
{
    /* load 0 into mm0 */
    pxor_r2r (mm0, mm0);
}
static inline void mmx_interp_average_4_U8 (uint8_t * dest, const uint8_t * src1,
                                            const uint8_t * src2, const uint8_t * src3,
                                            const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);       /* add lows */
    paddw_r2r (mm6, mm2);       /* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         /* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);      /* load 8 dest bytes */
    movq_r2r (mm3, mm4);        /* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
    movq_r2r (mm1,mm2);         /* copy subresult */

    pxor_r2r (mm1, mm3);        /* xor srcavg and dest */
    pand_m2r (mask1, mm3);      /* mask lower bits */
    psrlq_i2r (1, mm3);         /* /2 */
    por_r2r (mm2, mm4);         /* or srcavg and dest */
    psubb_r2r (mm3, mm4);       /* subtract subresults */
    movq_r2m (mm4, *dest);      /* store result in dest */
}
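/*
 * Scalar sketch of the byte-wise result computed above (reference only; same
 * assumption as before that mask1 is the per-byte constant 0xfefefefefefefefeLL):
 */
static inline uint8_t interp_average_4_ref (uint8_t d, uint8_t s1, uint8_t s2,
                                            uint8_t s3, uint8_t s4)
{
    int avg4 = (s1 + s2 + s3 + s4 + 2) >> 2;                    /* rounded 4-way average */
    return (uint8_t) ((d | avg4) - (((d ^ avg4) & 0xfe) >> 1)); /* (d + avg4 + 1) >> 1 */
}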
static __inline__ void sum_sumsq_8bytes( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb,
                                         mmx_t *sumtop_accs, mmx_t *sumbot_accs,
                                         mmx_t *sumsqtop_accs, mmx_t *sumsqbot_accs,
                                         mmx_t *sumxprod_accs )
{
    pxor_r2r(mm0,mm0);

    /* Load pixels from top field into mm1.w,mm2.w */
    movq_m2r( *((mmx_t*)cur_lum_mb), mm1 );
    movq_m2r( *((mmx_t*)pred_lum_mb), mm2 );

    /* mm3 := mm1
       mm4 := mm2
       mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3] */
    movq_r2r( mm1, mm3 );
    punpcklbw_r2r( mm0, mm1 );
    movq_r2r( mm2, mm4 );
    punpcklbw_r2r( mm0, mm2 );
    psubw_r2r( mm2, mm1 );

    /* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7] */
    punpckhbw_r2r( mm0, mm3 );
    punpckhbw_r2r( mm0, mm4 );
    psubw_r2r( mm4, mm3 );

    /* sumtop_accs->w[0..3] += mm1.w[0..3];
       sumtop_accs->w[0..3] += mm3.w[0..3];
       mm6 = mm1; mm7 = mm3; */
    movq_m2r( *sumtop_accs, mm5 );
    paddw_r2r( mm1, mm5 );
    paddw_r2r( mm3, mm5 );
    movq_r2r( mm1, mm6 );
    movq_r2r( mm3, mm7 );
    movq_r2m( mm5, *sumtop_accs );

    /* *sumsq_top_acc += mm1.w[0..3] * mm1.w[0..3];
       *sumsq_top_acc += mm3.w[0..3] * mm3.w[0..3]; */
    pmaddwd_r2r( mm1, mm1 );
    movq_m2r( *sumsqtop_accs, mm5 );
    pmaddwd_r2r( mm3, mm3 );
    paddd_r2r( mm1, mm5 );
    paddd_r2r( mm3, mm5 );
    movq_r2m( mm5, *sumsqtop_accs );

    /* Load pixels from bot field into mm1.w,mm2.w */
    movq_m2r( *((mmx_t*)(cur_lum_mb+opt->phy_width)), mm1 );
    movq_m2r( *((mmx_t*)(pred_lum_mb+opt->phy_width)), mm2 );

    /* mm3 := mm1
       mm4 := mm2
       mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3] */
    movq_r2r( mm1, mm3 );
    punpcklbw_r2r( mm0, mm1 );
    movq_r2r( mm2, mm4 );
    punpcklbw_r2r( mm0, mm2 );
    psubw_r2r( mm2, mm1 );

    /* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7] */
    punpckhbw_r2r( mm0, mm3 );
    punpckhbw_r2r( mm0, mm4 );
    psubw_r2r( mm4, mm3 );

    /* sumbot_accs->w[0..3] += mm1.w[0..3];
       sumbot_accs->w[0..3] += mm3.w[0..3];
       mm2 := mm1; mm4 := mm3; */
    movq_m2r( *sumbot_accs, mm5 );
    paddw_r2r( mm1, mm5 );
    movq_r2r( mm1, mm2 );
    paddw_r2r( mm3, mm5 );
    movq_r2r( mm3, mm4 );
    movq_r2m( mm5, *sumbot_accs );

    /* *sumsqbot_acc += mm1.w[0..3] * mm1.w[0..3];
       *sumsqbot_acc += mm3.w[0..3] * mm3.w[0..3]; */
    pmaddwd_r2r( mm1, mm1 );
    movq_m2r( *sumsqbot_accs, mm5 );
    pmaddwd_r2r( mm3, mm3 );
    paddd_r2r( mm1, mm5 );
    paddd_r2r( mm3, mm5 );
    movq_r2m( mm5, *sumsqbot_accs );

    /* Accumulate cross-product
       *sum_xprod_acc += mm1.w[0..3] * mm6[0..3];
       *sum_xprod_acc += mm3.w[0..3] * mm7[0..3]; */
    movq_m2r( *sumxprod_accs, mm5 );
    pmaddwd_r2r( mm6, mm2);
    pmaddwd_r2r( mm7, mm4);
    paddd_r2r( mm2, mm5 );
    paddd_r2r( mm4, mm5 );
    movq_r2m( mm5, *sumxprod_accs );
    emms();
}