/* Finishing step of the kernel filter: expects the caller to have preloaded
 * mm0/mm1 with the weighted sums and mm2..mm7 with the neighbouring lines. */
static inline void
mmx_end (uint8_t * src3, uint8_t * src5, uint8_t * dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);
    punpckhbw_m2r (mm_cpool[0], mm5);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);
    psrlw_i2r (3, mm0);
    psrlw_i2r (3, mm1);
    psubw_r2r (mm6, mm4);
    psubw_r2r (mm7, mm5);
    packuswb_r2r (mm1, mm0);

    /* Compare the vertical delta against the low and high thresholds and
     * XOR the two compare masks: the combined mask selects between the
     * original src3 pixels and the filtered result in mm0. */
    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);
    pcmpgtw_m2r (mm_lthr, mm4);
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);
    pcmpgtw_m2r (mm_hthr, mm7);
    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);
    pxor_r2r (mm6, mm4);
    movq_r2r (mm4, mm5);
    pandn_r2r (mm0, mm4);
    pand_m2r (src3[X], mm5);
    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
static void
scale_uint16_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx,
                               int scanline, uint8_t * dest_start)
{
    int i, j;
    uint8_t *src, *dst, *src_start;
    int32_t *factors;
    /*
     * Register usage:
     *  mm0: input
     *  mm1: factor_mask
     *  mm2: factor
     *  mm3: output accumulator
     *  mm6: 0
     *  mm7: factor (scratch)
     */
    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    movq_m2r(factor_mask, mm1);
    dst = dest_start;
    for (i = 0; i < ctx->dst_size; i++) {
        src = src_start + 8 * ctx->table_h.pixels[i].index;
        factors = ctx->table_h.pixels[i].factor_i;
        pxor_r2r(mm3, mm3);
        for (j = 0; j < ctx->table_h.factors_per_pixel; j++) {
            /* Load 4 channels of one pixel, pre-halved so they stay in
             * the signed range that pmulhw expects */
            movq_m2r(*(src), mm0);
            psrlw_i2r(1, mm0);
            /* Load the factor into mm7 */
            LOAD_FACTOR_1_4;
            /* Multiply and accumulate */
            pmulhw_r2r(mm7, mm0);
            paddw_r2r(mm0, mm3);
            src += 8;
            factors++;
        }
        /* Clamp, then shift back up to full 16-bit scale */
        pminsw_m2r(max_13, mm3);
        pmaxsw_m2r(min_13, mm3);
        psllw_i2r(3, mm3);
        MOVQ_R2M(mm3, *dst);
        dst += 8;
    }
    ctx->need_emms = 1;
}
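/* For reference: a scalar sketch (not part of the library) of one output
 * channel of the loop above. pmulhw computes (a * b) >> 16, the input is
 * pre-halved, and the final << 3 restores full scale. The [0, 8191] clamp
 * stands in for the max_13/min_13 constants, and the per-tap factor layout
 * assumed here mirrors what LOAD_FACTOR_1_4 broadcasts; neither is shown
 * in this file. */
static uint16_t
scale_channel_pmulhw_ref(const uint16_t * src, const int16_t * f, int taps)
{
    int j;
    int32_t acc = 0;
    for (j = 0; j < taps; j++)  /* channel 0 of 4; pixels are 4 words apart */
        acc += (int16_t)(((src[4 * j] >> 1) * f[j]) >> 16);
    if (acc < 0)
        acc = 0;
    if (acc > 8191)
        acc = 8191;
    return (uint16_t)(acc << 3);
}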
static void
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4, uint8_t * lum_m3,
                      uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum,
                      int size)
{
    mmx_t rounder;

    rounder.uw[0] = 4;
    rounder.uw[1] = 4;
    rounder.uw[2] = 4;
    rounder.uw[3] = 4;
    pxor_r2r (mm7, mm7);
    movq_m2r (rounder, mm6);

    for (; size > 3; size -= 4) {
        movd_m2r (*lum_m4, mm0);
        movd_m2r (*lum_m3, mm1);
        movd_m2r (*lum_m2, mm2);
        movd_m2r (*lum_m1, mm3);
        movd_m2r (*lum, mm4);
        punpcklbw_r2r (mm7, mm0);
        punpcklbw_r2r (mm7, mm1);
        punpcklbw_r2r (mm7, mm2);
        punpcklbw_r2r (mm7, mm3);
        punpcklbw_r2r (mm7, mm4);
        /* Filter taps (-1, 4, 2, 4, -1) with rounding, then >> 3 */
        paddw_r2r (mm3, mm1);   /* mm1 = m3 + m1 */
        psllw_i2r (1, mm2);     /* mm2 = 2 * m2 */
        paddw_r2r (mm4, mm0);   /* mm0 = m4 + m0 (the negative taps) */
        psllw_i2r (2, mm1);     /* mm1 = 4 * (m3 + m1) */
        paddw_r2r (mm6, mm2);   /* + rounder */
        paddw_r2r (mm2, mm1);
        psubusw_r2r (mm0, mm1); /* saturates at 0 */
        psrlw_i2r (3, mm1);     /* / 8 */
        packuswb_r2r (mm7, mm1);
        movd_r2m (mm1, *dst);

        lum_m4 += 4;
        lum_m3 += 4;
        lum_m2 += 4;
        lum_m1 += 4;
        lum += 4;
        dst += 4;
    }
    emms ();

    /* Handle widths that are not a multiple of 4 */
    if (size > 0)
        deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
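/* A minimal scalar equivalent of deinterlace_line_mmx() above, assuming the
 * same (-1, 4, 2, 4, -1)/8 tap weights and the same saturation behaviour:
 * psubusw clamps at 0 before the shift, packuswb clamps at 255 after. */
static void
deinterlace_line_ref (uint8_t * dst, const uint8_t * m4, const uint8_t * m3,
                      const uint8_t * m2, const uint8_t * m1,
                      const uint8_t * m0, int size)
{
    int sum;

    for (; size > 0; size--) {
        sum = 4 * *m3++ + 2 * *m2++ + 4 * *m1++ + 4;  /* positive taps + rounding */
        sum -= *m4++ + *m0++;                         /* negative taps */
        if (sum < 0)
            sum = 0;        /* psubusw saturates at 0 */
        sum >>= 3;
        if (sum > 255)
            sum = 255;      /* packuswb saturates at 255 */
        *dst++ = (uint8_t) sum;
    }
}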
static void
scale_uint16_x_1_x_bicubic_mmx(gavl_video_scale_context_t * ctx,
                               int scanline, uint8_t * dest_start)
{
    int i;
    uint16_t *dst;
    uint8_t *src, *src_start;
    int32_t *factors;
    mmx_t tmp_mm;
    int32_t tmp;

    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    dst = (uint16_t*)dest_start;
    for (i = 0; i < ctx->dst_size; i++) {
        src = src_start + 2 * ctx->table_h.pixels[i].index;
        factors = ctx->table_h.pixels[i].factor_i;

        /* Load 4 pixels, pre-halved to stay in the signed 16-bit range */
        movq_m2r(*(src), mm0);
        psrlw_i2r(1, mm0);

        /* Load the 4 factors and pack them down to words */
        movq_m2r(*factors, mm2);
        movq_m2r(*(factors + 2), mm3);
        packssdw_r2r(mm3, mm2);

        /* Multiply-accumulate into two dwords, finish in scalar code */
        pmaddwd_r2r(mm2, mm0);
        MOVQ_R2M(mm0, tmp_mm);
        tmp = tmp_mm.d[0] + tmp_mm.d[1];
        tmp >>= 13;
        RECLIP(tmp, ctx->plane);
        *(dst++) = tmp;
    }
    ctx->need_emms = 1;
}
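/* Scalar model of one bicubic output pixel above; a sketch assuming the
 * four factor_i weights sum to 1 << 14 (the halved input plus the >> 13
 * then gives unity gain) and already fit in 16 bits, which is what
 * packssdw relies on. The clamp stands in for RECLIP(). */
static uint16_t
bicubic_tap_ref(const uint16_t * src, const int32_t * f)
{
    int j;
    int32_t acc = 0;
    for (j = 0; j < 4; j++)
        acc += (int32_t)(int16_t)f[j] * (src[j] >> 1);  /* pmaddwd */
    acc >>= 13;
    if (acc < 0)
        acc = 0;
    if (acc > 65535)
        acc = 65535;
    return (uint16_t)acc;
}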
static void
_evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_MMX
   int xx, yy;
   register unsigned char *yp1, *up, *vp;
   unsigned char *dp1;

   /* destination pointer */
   dp1 = rgb;

   for (yy = 0; yy < h; yy++)
     {
        /* plane pointers for this line (YV12: one chroma line
         * per two luma lines) */
        yp1 = yuv[yy];
        up = yuv[h + (yy / 2)];
        vp = yuv[h + (h / 2) + (yy / 2)];
        for (xx = 0; xx < (w - 7); xx += 8)
          {
             /* Load 4 U and 4 V samples, 8 Y samples */
             movd_m2r(*up, mm3);
             movd_m2r(*vp, mm2);
             movq_m2r(*yp1, mm0);
             pxor_r2r(mm7, mm7);
             punpcklbw_r2r(mm7, mm2);
             punpcklbw_r2r(mm7, mm3);

             /* Split Y into odd (mm0) and even (mm1) samples */
             movq_r2r(mm0, mm1);
             psrlw_i2r(8, mm0);
             psllw_i2r(8, mm1);
             psrlw_i2r(8, mm1);

             /* Bias: Y -= 16, U/V -= 128 */
             movq_m2r(CONST_16, mm4);
             psubsw_r2r(mm4, mm0);
             psubsw_r2r(mm4, mm1);
             movq_m2r(CONST_128, mm5);
             psubsw_r2r(mm5, mm2);
             psubsw_r2r(mm5, mm3);

             /* Multiply by the fixed-point conversion coefficients */
             movq_m2r(CONST_YMUL, mm4);
             pmullw_r2r(mm4, mm0);
             pmullw_r2r(mm4, mm1);
             movq_m2r(CONST_CRVCRV, mm7);
             pmullw_r2r(mm3, mm7);
             movq_m2r(CONST_CBUCBU, mm6);
             pmullw_r2r(mm2, mm6);
             movq_m2r(CONST_CGUCGU, mm5);
             pmullw_r2r(mm2, mm5);
             movq_m2r(CONST_CGVCGV, mm4);
             pmullw_r2r(mm3, mm4);

             /* Red */
             movq_r2r(mm0, mm2);
             paddsw_r2r(mm7, mm2);
             paddsw_r2r(mm1, mm7);
             psraw_i2r(RES, mm2);
             psraw_i2r(RES, mm7);
             packuswb_r2r(mm7, mm2);
             pxor_r2r(mm7, mm7);
             movq_r2r(mm2, mm3);
             punpckhbw_r2r(mm7, mm2);
             punpcklbw_r2r(mm3, mm7);
             por_r2r(mm7, mm2);

             /* Green */
             movq_r2r(mm0, mm3);
             psubsw_r2r(mm5, mm3);
             psubsw_r2r(mm4, mm3);
             paddsw_m2r(CONST_32, mm3);
             movq_r2r(mm1, mm7);
             psubsw_r2r(mm5, mm7);
             psubsw_r2r(mm4, mm7);
             paddsw_m2r(CONST_32, mm7);
             psraw_i2r(RES, mm3);
             psraw_i2r(RES, mm7);
             packuswb_r2r(mm7, mm3);
             pxor_r2r(mm7, mm7);
             movq_r2r(mm3, mm4);
             punpckhbw_r2r(mm7, mm3);
             punpcklbw_r2r(mm4, mm7);
             por_r2r(mm7, mm3);

             /* Blue */
             movq_m2r(CONST_32, mm4);
             paddsw_r2r(mm6, mm0);
             paddsw_r2r(mm6, mm1);
             paddsw_r2r(mm4, mm0);
             paddsw_r2r(mm4, mm1);
             psraw_i2r(RES, mm0);
             psraw_i2r(RES, mm1);
             packuswb_r2r(mm1, mm0);
             pxor_r2r(mm7, mm7);
             movq_r2r(mm0, mm5);
             punpckhbw_r2r(mm7, mm0);
             punpcklbw_r2r(mm5, mm7);
             por_r2r(mm7, mm0);

             /* Interleave R, G, B and alpha = 0xff into 8 ARGB pixels,
              * stored with non-temporal writes */
             movq_m2r(CONST_FF, mm1);
             movq_r2r(mm0, mm5);
             movq_r2r(mm3, mm6);
             movq_r2r(mm2, mm7);
             punpckhbw_r2r(mm3, mm2);
             punpcklbw_r2r(mm6, mm7);
             punpckhbw_r2r(mm1, mm0);
             punpcklbw_r2r(mm1, mm5);
             movq_r2r(mm7, mm1);
             punpckhwd_r2r(mm5, mm7);
             punpcklwd_r2r(mm5, mm1);
             movq_r2r(mm2, mm4);
             punpckhwd_r2r(mm0, mm2);
             punpcklwd_r2r(mm0, mm4);
             movntq_r2m(mm1, *(dp1));
             movntq_r2m(mm7, *(dp1 + 8));
             movntq_r2m(mm4, *(dp1 + 16));
             movntq_r2m(mm2, *(dp1 + 24));

             yp1 += 8;
             up += 4;
             vp += 4;
             dp1 += 8 * 4;
          }
        /* cleanup: pixels that aren't a multiple of 8 wide */
        if (xx < w)
          {
             int y, u, v, r, g, b;

             for (; xx < w; xx += 2)
               {
                  u = (*up++) - 128;
                  v = (*vp++) - 128;
                  y = RZ(YMUL) * ((*yp1++) - 16);
                  r = LUT_CLIP((y + (_crv * v)) >> RES);
                  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
                  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
                  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r, g, b);
                  dp1 += 4;

                  y = RZ(YMUL) * ((*yp1++) - 16);
                  r = LUT_CLIP((y + (_crv * v)) >> RES);
                  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
                  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
                  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r, g, b);
                  dp1 += 4;
               }
          }
     }
   /* no emms() here; the caller is assumed to restore the FPU state */
#endif /* BUILD_MMX */
}
int
bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx,
         int hxf, int hyf, int hxb, int hyb, int h)
{
    uint8_t *pfa, *pfb, *pfc, *pba, *pbb, *pbc;
    int s, s1, s2;

    /* half-pel neighbours of the forward and backward blocks */
    pfa = pf + hxf;
    pfb = pf + lx * hyf;
    pfc = pfb + hxf;
    pba = pb + hxb;
    pbb = pb + lx * hyb;
    pbc = pbb + hxb;

    s = 0;  /* the accumulator */

    if (h > 0) {
        pxor_r2r(mm7, mm7);     /* mm7 = 0 */
        pxor_r2r(mm6, mm6);
        pcmpeqw_r2r(mm5, mm5);  /* mm5 = all ones */
        psubw_r2r(mm5, mm6);    /* mm6 = 1 in each word */
        psllw_i2r(1, mm6);      /* mm6 = 2, the rounder for the /4 below */

        do {
            /* first 8 bytes: rounded 4-point forward prediction */
            BSAD_LOAD(pf[0], mm0, mm1);
            BSAD_LOAD_ACC(pfa[0], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfb[0], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfc[0], mm2, mm3, mm0, mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            /* rounded 4-point backward prediction */
            BSAD_LOAD(pb[0], mm2, mm3);
            BSAD_LOAD_ACC(pba[0], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbb[0], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbc[0], mm4, mm5, mm2, mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            /* bidirectional prediction: (fwd + bwd + 1) >> 1 */
            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);  /* mm6 = 1 */
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);  /* mm6 = 2 again */
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            /* absolute differences against p2, summed into s */
            movq_m2r(p2[0], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);  /* mm0 = |pred - p2| */
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            /* second 8 bytes, same computation */
            BSAD_LOAD(pf[8], mm0, mm1);
            BSAD_LOAD_ACC(pfa[8], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfb[8], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfc[8], mm2, mm3, mm0, mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            BSAD_LOAD(pb[8], mm2, mm3);
            BSAD_LOAD_ACC(pba[8], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbb[8], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbc[8], mm4, mm5, mm2, mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(p2[8], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            p2 += lx;
            pf += lx;
            pfa += lx;
            pfb += lx;
            pfc += lx;
            pb += lx;
            pba += lx;
            pbb += lx;
            pbc += lx;
            h--;
        } while (h > 0);
    }
    emms();
    return s;
}
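/* Scalar sketch of what bsad_mmx() computes for a 16-pixel-wide block:
 * half-pel forward and backward predictions with the same rounding as the
 * MMX code (+2 before >> 2, +1 before >> 1), then a sum of absolute
 * differences against p2. Pointer roles are as in the function above. */
static int
bsad_ref(const uint8_t *pf, const uint8_t *pb, const uint8_t *p2,
         int lx, int hxf, int hyf, int hxb, int hyb, int h)
{
    int i, j, f, b, d, s = 0;

    for (j = 0; j < h; j++) {
        for (i = 0; i < 16; i++) {
            f = (pf[i] + pf[i + hxf] + pf[i + lx * hyf]
                 + pf[i + lx * hyf + hxf] + 2) >> 2;
            b = (pb[i] + pb[i + hxb] + pb[i + lx * hyb]
                 + pb[i + lx * hyb + hxb] + 2) >> 2;
            d = ((f + b + 1) >> 1) - p2[i];
            s += d < 0 ? -d : d;
        }
        pf += lx;
        pb += lx;
        p2 += lx;
    }
    return s;
}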
static int
bsad_1quad_mmxe(uint8_t *pf, uint8_t *pb, uint8_t *pb2, uint8_t *p2,
                int lx, int h)
{
    int s;

    s = 0;  /* the accumulator */

    if (h > 0) {
        pcmpeqw_r2r(mm6, mm6);  /* mm6 = all ones */
        psrlw_i2r(15, mm6);     /* mm6 = 1 in each word */
        paddw_r2r(mm6, mm6);    /* mm6 = 2, the rounder for the /4 below */
        pxor_r2r(mm7, mm7);
        pxor_r2r(mm5, mm5);     /* SAD accumulator */

        do {
            /* first 8 bytes: rounded quad average of the forward block */
            BSAD_LOAD(pf[0], mm0, mm1);
            BSAD_LOAD_ACC(pf[1], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pf[lx], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pf[lx+1], mm2, mm3, mm0, mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);

            /* average with the two backward blocks, accumulate the SAD */
            movq_m2r(pb2[0], mm1);
            pavgb_m2r(pb[0], mm1);
            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[0], mm0);
            paddd_r2r(mm0, mm5);

            /* second 8 bytes, same computation */
            BSAD_LOAD(pf[8], mm0, mm1);
            BSAD_LOAD_ACC(pf[9], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pf[lx+8], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pf[lx+9], mm2, mm3, mm0, mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(pb2[8], mm1);
            pavgb_m2r(pb[8], mm1);
            pavgb_r2r(mm1, mm0);
            psadbw_m2r(p2[8], mm0);
            paddd_r2r(mm0, mm5);

            p2 += lx;
            pf += lx;
            pb += lx;
            pb2 += lx;
            h--;
        } while (h > 0);

        /* read the accumulator only if the loop ran: mm5 is
         * uninitialized otherwise */
        movd_r2g(mm5, s);
    }
    emms();
    return s;
}
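/* Scalar sketch of bsad_1quad_mmxe() above: pavgb rounds as (a + b + 1) >> 1
 * and psadbw sums absolute byte differences, so per pixel this is a rounded
 * quad average of the forward block, averaged against the two backward
 * blocks, then compared against p2. */
static int
bsad_1quad_ref(const uint8_t *pf, const uint8_t *pb, const uint8_t *pb2,
               const uint8_t *p2, int lx, int h)
{
    int i, j, f, b, d, s = 0;

    for (j = 0; j < h; j++) {
        for (i = 0; i < 16; i++) {
            f = (pf[i] + pf[i + 1] + pf[i + lx] + pf[i + lx + 1] + 2) >> 2;
            b = (pb[i] + pb2[i] + 1) >> 1;      /* first pavgb */
            d = ((f + b + 1) >> 1) - p2[i];     /* second pavgb */
            s += d < 0 ? -d : d;                /* psadbw accumulates */
        }
        pf += lx;
        pb += lx;
        pb2 += lx;
        p2 += lx;
    }
    return s;
}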
static void
interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
                                    uint8_t *bot, int width )
{
    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
    int i;

    /* 16 pixels (32 bytes) per iteration */
    for( i = width/16; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        movq_m2r( *(bot + 8), mm2 );
        movq_m2r( *(top + 8), mm3 );
        movq_m2r( *(bot + 16), mm4 );
        movq_m2r( *(top + 16), mm5 );
        movq_m2r( *(bot + 24), mm6 );
        movq_m2r( *(top + 24), mm7 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        pand_m2r( shiftmask, mm2 );
        pand_m2r( shiftmask, mm3 );
        pand_m2r( shiftmask, mm4 );
        pand_m2r( shiftmask, mm5 );
        pand_m2r( shiftmask, mm6 );
        pand_m2r( shiftmask, mm7 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        psrlw_i2r( 1, mm2 );
        psrlw_i2r( 1, mm3 );
        psrlw_i2r( 1, mm4 );
        psrlw_i2r( 1, mm5 );
        psrlw_i2r( 1, mm6 );
        psrlw_i2r( 1, mm7 );
        paddb_r2r( mm1, mm0 );
        paddb_r2r( mm3, mm2 );
        paddb_r2r( mm5, mm4 );
        paddb_r2r( mm7, mm6 );
        movq_r2m( mm0, *output );
        movq_r2m( mm2, *(output + 8) );
        movq_r2m( mm4, *(output + 16) );
        movq_r2m( mm6, *(output + 24) );
        output += 32;
        top += 32;
        bot += 32;
    }
    width = (width & 0xf);

    /* 4 pixels (8 bytes) per iteration */
    for( i = width/4; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        paddb_r2r( mm1, mm0 );
        movq_r2m( mm0, *output );
        output += 8;
        top += 8;
        bot += 8;
    }
    width = width & 0x3;    /* the 4-pixel loop leaves at most 3 pixels */

    /* Handle last few pixels. */
    for( i = width * 2; i; --i ) {
        *output++ = ((*top++) + (*bot++)) >> 1;
    }
    emms();
}
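/* The shiftmask trick above is a SWAR average: clearing bit 8 of each word
 * before psrlw keeps the low bit of a chroma byte from shifting into the
 * neighbouring luma byte. The same idea on one 64-bit word, as a sketch: */
static uint64_t
average_packed422_word( uint64_t top, uint64_t bot )
{
    const uint64_t mask = 0xfefffefffefffeffULL;

    /* Each masked-and-shifted byte is <= 0x7f, so the plain addition
     * cannot carry between bytes -- just like paddb after psrlw. */
    return ((top & mask) >> 1) + ((bot & mask) >> 1);
}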
static void
scale_uint16_x_4_x_quadratic_mmx(gavl_video_scale_context_t * ctx,
                                 int scanline, uint8_t * dest_start)
{
    int i;
    uint8_t *src, *dst, *src_start;
    int32_t *factors;
    /*
     * Register usage:
     *  mm0: input
     *  mm1: factor_mask
     *  mm2: factor
     *  mm3: output accumulator
     *  mm6: 0
     *  mm7: factor (scratch)
     */
    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    movq_m2r(factor_mask, mm1);
    dst = dest_start;
    for (i = 0; i < ctx->dst_size; i++) {
        src = src_start + 8 * ctx->table_h.pixels[i].index;
        factors = ctx->table_h.pixels[i].factor_i;

        /* First tap: load pre-halved pixels, multiply by the factor */
        movq_m2r(*(src), mm0);
        psrlw_i2r(1, mm0);
        LOAD_FACTOR_1_4_NOCLIP;
        pmulhw_r2r(mm7, mm0);
        movq_r2r(mm0, mm3);
        src += 8;
        factors++;

        /* Second tap */
        movq_m2r(*(src), mm0);
        psrlw_i2r(1, mm0);
        LOAD_FACTOR_1_4_NOCLIP;
        pmulhw_r2r(mm7, mm0);
        paddw_r2r(mm0, mm3);
        src += 8;
        factors++;

        /* Third tap */
        movq_m2r(*(src), mm0);
        psrlw_i2r(1, mm0);
        LOAD_FACTOR_1_4_NOCLIP;
        pmulhw_r2r(mm7, mm0);
        paddw_r2r(mm0, mm3);
        src += 8;

        /* Shift back up to full 16-bit scale and store */
        psllw_i2r(3, mm3);
        MOVQ_R2M(mm3, *dst);
        dst += 8;
    }
    ctx->need_emms = 1;
}
static void
deinterlace_greedy_scanline_mmx (GstDeinterlaceMethodGreedyL * self,
    const guint8 * m0, const guint8 * t1,
    const guint8 * b1, const guint8 * m2, guint8 * output, gint width)
{
  mmx_t MaxComb;
  mmx_t ShiftMask;

  // How badly do we let it weave? 0-255
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  ShiftMask.ub[0] = 0x7f;
  ShiftMask.ub[1] = 0x7f;
  ShiftMask.ub[2] = 0x7f;
  ShiftMask.ub[3] = 0x7f;
  ShiftMask.ub[4] = 0x7f;
  ShiftMask.ub[5] = 0x7f;
  ShiftMask.ub[6] = 0x7f;
  ShiftMask.ub[7] = 0x7f;

  // L2 == m0, L1 == t1, L3 == b1, LP2 == m2
  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3, leave the result in mm4
    movq_r2r (mm1, mm4);        // L1
    movq_r2r (mm3, mm5);        // L3
    psrlw_i2r (1, mm4);         // L1/2
    pand_m2r (ShiftMask, mm4);
    psrlw_i2r (1, mm5);         // L3/2
    pand_m2r (ShiftMask, mm5);
    paddusb_r2r (mm5, mm4);     // (L1 + L3) / 2

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2
    por_r2r (mm7, mm5);         // abs(avg - L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg
    psubusb_r2r (mm0, mm4);     // avg - LP2
    por_r2r (mm7, mm4);         // abs(avg - LP2)

    // use L2 or LP2, whichever makes the smaller comb
    psubusb_r2r (mm5, mm4);     // see if it goes to zero
    psubusb_r2r (mm5, mm5);     // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4 == 0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4
    // if Comb(LP2) <= Comb(L2) then mm4 = ff, mm5 = 0, else mm4 = 0, mm5 = ff
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 == ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now let's clip our chosen value so it is not outside of the range
    // spanned by L1 and L3 by more than MaxComb.
    // This allows some comb but limits the damage, and allows more
    // detail than a boring oversmoothed clip.
    movq_r2r (mm1, mm2);        // copy L1
    psubusb_r2r (mm3, mm2);     // - L3, with saturation
    paddusb_r2r (mm3, mm2);     // now = Max(L1, L3)

    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm1, mm7);     // - L1
    paddusb_r2r (mm7, mm3);     // add, may saturate at ff
    psubusb_r2r (mm7, mm3);     // now = Min(L1, L3)

    // allow the value to be above the high or below the low by MaxComb
    paddusb_r2r (mm6, mm2);     // hi = Max(L1, L3) + MaxComb
    psubusb_r2r (mm6, mm3);     // lo = Min(L1, L3) - MaxComb

    psubusb_r2r (mm3, mm4);     // best - lo
    paddusb_r2r (mm3, mm4);     // now = Max(best, lo)
    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm4, mm7);     // - Max(best, lo)
    paddusb_r2r (mm7, mm2);     // add, may saturate at ff
    psubusb_r2r (mm7, mm2);     // now = Min(Max(best, lo), hi): the clipped best

    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  emms ();
  if (width > 0)
    deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
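/* Per-byte scalar model of the kernel above (a sketch): pick whichever of
 * L2/LP2 is closer to avg(L1, L3), then clamp it to the range spanned by
 * L1 and L3, widened by max_comb, matching the saturating arithmetic that
 * the psubusb/paddusb pairs provide. */
static guint8
greedy_pixel_ref (guint8 l1, guint8 l2, guint8 l3, guint8 lp2,
    guint8 max_comb)
{
  gint avg = (l1 >> 1) + (l3 >> 1);     /* matches the MMX halving */
  gint c2 = l2 > avg ? l2 - avg : avg - l2;
  gint cp2 = lp2 > avg ? lp2 - avg : avg - lp2;
  gint best = (cp2 <= c2) ? lp2 : l2;
  gint hi = (l1 > l3 ? l1 : l3) + max_comb;
  gint lo = (l1 < l3 ? l1 : l3) - max_comb;

  if (hi > 255)
    hi = 255;                   /* paddusb saturates at ff */
  if (lo < 0)
    lo = 0;                     /* psubusb saturates at 0 */
  if (best < lo)
    best = lo;
  if (best > hi)
    best = hi;
  return (guint8) best;
}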
static void
scale_uint8_x_1_x_bilinear_mmx(gavl_video_scale_context_t * ctx,
                               int scanline, uint8_t * dest_start)
{
    int i, imax, index;
    uint8_t *src, *dst, *src_start;
    mmx_t tmp_mm;
    /*
     * Register usage:
     *  mm0: input1, input2
     *  mm1: factors / result
     *  mm3: second pair's factors / result
     *  mm6: 0
     *  mm7: scratch
     */
    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    dst = dest_start;
    imax = ctx->dst_size / 4;
    index = 0;

    /* 4 output pixels per iteration, 2 at a time via pmaddwd */
    for (i = 0; i < imax; i++) {
        /* Load the source pixel pairs for outputs 0 and 1 */
        src = src_start + ctx->table_h.pixels[index].index;
        tmp_mm.uw[0] = *src;
        tmp_mm.uw[1] = *(src + 1);
        src = src_start + ctx->table_h.pixels[index + 1].index;
        tmp_mm.uw[2] = *src;
        tmp_mm.uw[3] = *(src + 1);
        movq_m2r(tmp_mm, mm0);

        /* Load the factors and multiply-accumulate */
        movq_m2r(ctx->table_h.pixels[index].factor_i[0], mm1);
        movq_m2r(ctx->table_h.pixels[index + 1].factor_i[0], mm7);
        packssdw_r2r(mm7, mm1);
        pmaddwd_r2r(mm0, mm1);
        index += 2;

        /* Same for outputs 2 and 3 */
        src = src_start + ctx->table_h.pixels[index].index;
        tmp_mm.uw[0] = *src;
        tmp_mm.uw[1] = *(src + 1);
        src = src_start + ctx->table_h.pixels[index + 1].index;
        tmp_mm.uw[2] = *src;
        tmp_mm.uw[3] = *(src + 1);
        movq_m2r(tmp_mm, mm0);

        movq_m2r(ctx->table_h.pixels[index].factor_i[0], mm3);
        movq_m2r(ctx->table_h.pixels[index + 1].factor_i[0], mm7);
        packssdw_r2r(mm7, mm3);
        pmaddwd_r2r(mm0, mm3);

        /* The factors are 14-bit: shift down by 7 + 7 = 14 in total and
         * pack the four results to bytes */
        psrld_i2r(7, mm3);
        psrld_i2r(7, mm1);
        packssdw_r2r(mm3, mm1);
        psrlw_i2r(7, mm1);
        index += 2;
        packuswb_r2r(mm6, mm1);
        movd_r2m(mm1, *dst);
        dst += 4;
    }
    ctx->need_emms = 1;

    /* Scalar tail for the last dst_size % 4 pixels */
    imax = ctx->dst_size % 4;
    for (i = 0; i < imax; i++) {
        src = (src_start + ctx->table_h.pixels[index].index);
        *dst = (ctx->table_h.pixels[index].factor_i[0] * *src +
                ctx->table_h.pixels[index].factor_i[1] * *(src + 1)) >> 14;
        dst++;
        index++;
    }
}
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  /* 16 pixels (32 bytes) per iteration */
  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  /* 4 pixels (8 bytes) per iteration */
  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  width = width & 0x3;          /* the 4-pixel loop leaves at most 3 pixels */

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
  emms ();
}
static void deinterlace_line( uint8_t *dst, uint8_t *lum_m4,
                              uint8_t *lum_m3, uint8_t *lum_m2,
                              uint8_t *lum_m1, uint8_t *lum, int size )
{
#if defined(__i386__) || defined(__x86_64__)
    mmx_t rounder;

    rounder.uw[0] = 4;
    rounder.uw[1] = 4;
    rounder.uw[2] = 4;
    rounder.uw[3] = 4;
    pxor_r2r(mm7, mm7);
    movq_m2r(rounder, mm6);

    for (; size > 3; size -= 4) {
        movd_m2r(lum_m4[0], mm0);
        movd_m2r(lum_m3[0], mm1);
        movd_m2r(lum_m2[0], mm2);
        movd_m2r(lum_m1[0], mm3);
        movd_m2r(lum[0], mm4);
        punpcklbw_r2r(mm7, mm0);
        punpcklbw_r2r(mm7, mm1);
        punpcklbw_r2r(mm7, mm2);
        punpcklbw_r2r(mm7, mm3);
        punpcklbw_r2r(mm7, mm4);
        paddw_r2r(mm3, mm1);        /* m3 + m1 */
        psllw_i2r(1, mm2);          /* 2 * m2 */
        paddw_r2r(mm4, mm0);        /* m4 + m0 */
        psllw_i2r(2, mm1);          /* 4 * (m3 + m1) */
        paddw_r2r(mm6, mm2);        /* + rounder */
        paddw_r2r(mm2, mm1);
        psubusw_r2r(mm0, mm1);      /* saturates at 0 */
        psrlw_i2r(3, mm1);          /* / 8 */
        packuswb_r2r(mm7, mm1);     /* saturates at 255 */
        movd_r2m(mm1, dst[0]);
        lum_m4 += 4;
        lum_m3 += 4;
        lum_m2 += 4;
        lum_m1 += 4;
        lum += 4;
        dst += 4;
    }
    emms();
    /* note: any size % 4 tail is left unprocessed in this path */
#else
    /**
     * C implementation.
     */
    int sum;

    for (; size > 0; size--) {
        sum = -lum_m4[0];
        sum += lum_m3[0] << 2;
        sum += lum_m2[0] << 1;
        sum += lum_m1[0] << 2;
        sum += -lum[0];
        sum = (sum + 4) >> 3;
        /* clip to [0, 255], as packuswb does in the MMX path */
        if (sum < 0)
            sum = 0;
        else if (sum > 255)
            sum = 255;
        dst[0] = sum;
        lum_m4++;
        lum_m3++;
        lum_m2++;
        lum_m1++;
        lum++;
        dst++;
    }
#endif
}
void
yuv411planar_to_rgb_mmx (const unsigned char *yuv, unsigned char *rgb,
                         unsigned int w, unsigned int h)
{
    unsigned int xx, yy;
    register const unsigned char *yp1, *up, *vp;
    unsigned char *dp1;

    /* plane pointers */
    yp1 = yuv;
    up = yuv + (w * h);
    vp = up + ((w / 2) * (h / 2));
    /* destination pointer */
    dp1 = rgb;

    for (yy = 0; yy < h; yy++) {
        for (xx = 0; xx < w; xx += 8) {
            /* Load 8 Y samples, split into odd (mm0) and even (mm1) */
            movq_m2r(*yp1, mm0);
            movq_r2r(mm0, mm1);
            psrlw_i2r(8, mm0);
            psllw_i2r(8, mm1);
            psrlw_i2r(8, mm1);

            /* Load 4 U and 4 V samples */
            pxor_r2r(mm7, mm7);
            movd_m2r(*up, mm3);
            movd_m2r(*vp, mm2);
            punpcklbw_r2r(mm7, mm2);
            punpcklbw_r2r(mm7, mm3);

            /* Bias: Y -= 16, U/V -= 128 */
            movq_m2r(CONST_16, mm4);
            psubsw_r2r(mm4, mm0);
            psubsw_r2r(mm4, mm1);
            movq_m2r(CONST_128, mm5);
            psubsw_r2r(mm5, mm2);
            psubsw_r2r(mm5, mm3);

            /* Multiply by the fixed-point conversion coefficients */
            movq_m2r(CONST_YMUL, mm4);
            pmullw_r2r(mm4, mm0);
            pmullw_r2r(mm4, mm1);
            movq_m2r(CONST_CRVCRV, mm7);
            pmullw_r2r(mm3, mm7);
            movq_m2r(CONST_CBUCBU, mm6);
            pmullw_r2r(mm2, mm6);
            movq_m2r(CONST_CGUCGU, mm5);
            pmullw_r2r(mm2, mm5);
            movq_m2r(CONST_CGVCGV, mm4);
            pmullw_r2r(mm3, mm4);

            /* Red */
            movq_r2r(mm0, mm2);
            paddsw_r2r(mm7, mm2);
            paddsw_r2r(mm1, mm7);
            psraw_i2r(RES, mm2);
            psraw_i2r(RES, mm7);
            packuswb_r2r(mm7, mm2);
            pxor_r2r(mm7, mm7);
            movq_r2r(mm2, mm3);
            punpckhbw_r2r(mm7, mm2);
            punpcklbw_r2r(mm3, mm7);
            por_r2r(mm7, mm2);

            /* Green */
            movq_r2r(mm0, mm3);
            psubsw_r2r(mm5, mm3);
            psubsw_r2r(mm4, mm3);
            paddsw_m2r(CONST_32, mm3);
            movq_r2r(mm1, mm7);
            psubsw_r2r(mm5, mm7);
            psubsw_r2r(mm4, mm7);
            paddsw_m2r(CONST_32, mm7);
            psraw_i2r(RES, mm3);
            psraw_i2r(RES, mm7);
            packuswb_r2r(mm7, mm3);
            pxor_r2r(mm7, mm7);
            movq_r2r(mm3, mm4);
            punpckhbw_r2r(mm7, mm3);
            punpcklbw_r2r(mm4, mm7);
            por_r2r(mm7, mm3);

            /* Blue */
            movq_m2r(CONST_32, mm4);
            paddsw_r2r(mm6, mm0);
            paddsw_r2r(mm6, mm1);
            paddsw_r2r(mm4, mm0);
            paddsw_r2r(mm4, mm1);
            psraw_i2r(RES, mm0);
            psraw_i2r(RES, mm1);
            packuswb_r2r(mm1, mm0);
            pxor_r2r(mm7, mm7);
            movq_r2r(mm0, mm5);
            punpckhbw_r2r(mm7, mm0);
            punpcklbw_r2r(mm5, mm7);
            por_r2r(mm7, mm0);

            /* Interleave R, G, B into 8 pixels (the alpha byte is left
             * at 0 here) and store with non-temporal writes */
            pxor_r2r(mm1, mm1);
            movq_r2r(mm0, mm5);
            movq_r2r(mm3, mm6);
            movq_r2r(mm2, mm7);
            punpckhbw_r2r(mm3, mm2);
            punpcklbw_r2r(mm6, mm7);
            punpckhbw_r2r(mm1, mm0);
            punpcklbw_r2r(mm1, mm5);
            movq_r2r(mm7, mm1);
            punpckhwd_r2r(mm5, mm7);
            punpcklwd_r2r(mm5, mm1);
            movq_r2r(mm2, mm4);
            punpckhwd_r2r(mm0, mm2);
            punpcklwd_r2r(mm0, mm4);
            movntq_r2m(mm1, *(dp1));
            movntq_r2m(mm7, *(dp1 + 8));
            movntq_r2m(mm4, *(dp1 + 16));
            movntq_r2m(mm2, *(dp1 + 24));

            yp1 += 8;
            up += 4;
            vp += 4;
            dp1 += 8 * 4;
        }
        /* the chroma planes are vertically subsampled by 2: rewind them
         * after every odd line so each chroma row is used twice */
        if (yy & 0x1) {
            up -= w / 2;
            vp -= w / 2;
        }
    }
    emms();
}
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;

  /* 32 bytes per iteration */
  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  /* 8 bytes per iteration */
  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  size = size & 0x7;            /* the 8-byte loop leaves at most 7 bytes */
  /* Handle the last few bytes. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
static void
scale_uint16_x_1_x_generic_mmx(gavl_video_scale_context_t * ctx,
                               int scanline, uint8_t * dest_start)
{
    int i, j, jmax;
    uint16_t *src, *dst;
    uint8_t *src_start;
    int32_t *factors;
    mmx_t tmp_mm;
    int tmp;

    src_start = ctx->src + scanline * ctx->src_stride;

    pxor_r2r(mm6, mm6);
    dst = (uint16_t*)dest_start;
    for (i = 0; i < ctx->dst_size; i++) {
        src = (uint16_t*)(src_start + 2 * ctx->table_h.pixels[i].index);
        factors = ctx->table_h.pixels[i].factor_i;

        /* 4 taps per iteration with pmaddwd, the rest in a scalar tail */
        jmax = ctx->table_h.factors_per_pixel / 4;
        tmp = 0;
        pxor_r2r(mm4, mm4);
        for (j = 0; j < jmax; j++) {
            /* Load pixels, pre-halved to fit the signed 16-bit range */
            movq_m2r(*(src), mm0);
            psrlw_i2r(1, mm0);
            /* Load 4 factors and pack them down to words */
            movq_m2r(*factors, mm2);
            movq_m2r(*(factors + 2), mm3);
            packssdw_r2r(mm3, mm2);
            /* Multiply-accumulate */
            pmaddwd_r2r(mm2, mm0);
            paddd_r2r(mm0, mm4);
            src += 4;
            factors += 4;
        }
        MOVQ_R2M(mm4, tmp_mm);
        tmp = tmp_mm.d[0] + tmp_mm.d[1];

        /* Scalar tail: the remaining factors_per_pixel % 4 taps */
        jmax = ctx->table_h.factors_per_pixel % 4;
        for (j = 0; j < jmax; j++) {
            tmp += *factors * ((*src) >> 1);
            factors++;
            src++;
        }
        tmp >>= 13;
        RECLIP(tmp, ctx->plane);
        *(dst++) = tmp;
    }
    ctx->need_emms = 1;
}
static inline void
mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
{
    static mmx_t mmx_80w = {0x0080008000800080LL};
    static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
    static mmx_t mmx_U_blue = {0x4093409340934093LL};
    static mmx_t mmx_V_red = {0x3312331233123312LL};
    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
    static mmx_t mmx_10w = {0x1010101010101010LL};
    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};
    static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};

    movd_m2r (*pu, mm0);            /* mm0 = 00 00 00 00 u3 u2 u1 u0 */
    movd_m2r (*pv, mm1);            /* mm1 = 00 00 00 00 v3 v2 v1 v0 */
    movq_m2r (*py, mm6);            /* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pxor_r2r (mm4, mm4);            /* mm4 = 0 */
    /* XXX might do cache preload for image here */

    /*
     * Do the multiply part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    punpcklbw_r2r (mm4, mm0);       /* mm0 = u3 u2 u1 u0 */
    punpcklbw_r2r (mm4, mm1);       /* mm1 = v3 v2 v1 v0 */
    psubsw_m2r (mmx_80w, mm0);      /* u -= 128 */
    psubsw_m2r (mmx_80w, mm1);      /* v -= 128 */
    psllw_i2r (3, mm0);             /* promote precision */
    psllw_i2r (3, mm1);             /* promote precision */
    movq_r2r (mm0, mm2);            /* mm2 = u3 u2 u1 u0 */
    movq_r2r (mm1, mm3);            /* mm3 = v3 v2 v1 v0 */
    pmulhw_m2r (mmx_U_green, mm2);  /* mm2 = u * u_green */
    pmulhw_m2r (mmx_V_green, mm3);  /* mm3 = v * v_green */
    pmulhw_m2r (mmx_U_blue, mm0);   /* mm0 = chroma_b */
    pmulhw_m2r (mmx_V_red, mm1);    /* mm1 = chroma_r */
    paddsw_r2r (mm3, mm2);          /* mm2 = chroma_g */

    psubusb_m2r (mmx_10w, mm6);     /* Y -= 16 */
    movq_r2r (mm6, mm7);            /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pand_m2r (mmx_00ffw, mm6);      /* mm6 = Y6 Y4 Y2 Y0 */
    psrlw_i2r (8, mm7);             /* mm7 = Y7 Y5 Y3 Y1 */
    psllw_i2r (3, mm6);             /* promote precision */
    psllw_i2r (3, mm7);             /* promote precision */
    pmulhw_m2r (mmx_Y_coeff, mm6);  /* mm6 = luma_rgb even */
    pmulhw_m2r (mmx_Y_coeff, mm7);  /* mm7 = luma_rgb odd */

    /*
     * Do the addition part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    movq_r2r (mm0, mm3);            /* mm3 = chroma_b */
    movq_r2r (mm1, mm4);            /* mm4 = chroma_r */
    movq_r2r (mm2, mm5);            /* mm5 = chroma_g */
    paddsw_r2r (mm6, mm0);          /* mm0 = B6 B4 B2 B0 */
    paddsw_r2r (mm7, mm3);          /* mm3 = B7 B5 B3 B1 */
    paddsw_r2r (mm6, mm1);          /* mm1 = R6 R4 R2 R0 */
    paddsw_r2r (mm7, mm4);          /* mm4 = R7 R5 R3 R1 */
    paddsw_r2r (mm6, mm2);          /* mm2 = G6 G4 G2 G0 */
    paddsw_r2r (mm7, mm5);          /* mm5 = G7 G5 G3 G1 */
    packuswb_r2r (mm0, mm0);        /* saturate to 0-255 */
    packuswb_r2r (mm1, mm1);        /* saturate to 0-255 */
    packuswb_r2r (mm2, mm2);        /* saturate to 0-255 */
    packuswb_r2r (mm3, mm3);        /* saturate to 0-255 */
    packuswb_r2r (mm4, mm4);        /* saturate to 0-255 */
    packuswb_r2r (mm5, mm5);        /* saturate to 0-255 */
    punpcklbw_r2r (mm3, mm0);       /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
    punpcklbw_r2r (mm4, mm1);       /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
    punpcklbw_r2r (mm5, mm2);       /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
}
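/* The constants above are the BT.601 coefficients in fixed point
 * (0x253f = 9535 ~= 1.164 * 8192, 0x3312 = 13074 ~= 1.596 * 8192, and so
 * on); the psllw-by-3 "promote precision" step turns pmulhw's (a*b) >> 16
 * into (a*b) >> 13. A scalar sketch of the per-pixel math: */
static inline void
yuv2rgb_ref (int y, int u, int v, int * r, int * g, int * b)
{
    int luma;

    y = y > 16 ? y - 16 : 0;    /* psubusb saturates at 0 */
    u -= 128;
    v -= 128;
    luma = (y * 9535) >> 13;                    /* 1.164 * (Y - 16) */
    *r = luma + ((v * 13074) >> 13);            /* + 1.596 * V */
    *g = luma - ((u * 3203 + v * 6660) >> 13);  /* - 0.391 * U - 0.813 * V */
    *b = luma + ((u * 16531) >> 13);            /* + 2.018 * U */
    /* the packuswb steps then clamp each channel to [0, 255] */
}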
void
deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc, int width, int height)
{
    int Line;
    long long *YVal1;
    long long *YVal2;
    long long *YVal3;
    long long *Dest;
    uint8_t *pEvenLines = psrc;
    uint8_t *pOddLines = psrc + width;
    int LineLength = width;
    int Pitch = width * 2;
    int IsOdd = 1;
    long EdgeDetect = 625;
    long JaggieThreshold = 73;
    int n;
    unsigned long long qwEdgeDetect;
    unsigned long long qwThreshold;
    const unsigned long long Mask = 0xfefefefefefefefeULL;
    const unsigned long long YMask = 0x00ff00ff00ff00ffULL;

    /* replicate the 16-bit constants into all four words */
    qwEdgeDetect = EdgeDetect;
    qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
    qwThreshold = JaggieThreshold;
    qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);

    // copy first even line no matter what, and the first odd line if we're
    // processing an odd field.
    ac_memcpy(pdst, pEvenLines, LineLength);
    if (IsOdd)
        ac_memcpy(pdst + LineLength, pOddLines, LineLength);

    height = height / 2;
    for (Line = 0; Line < height - 1; ++Line) {
        if (IsOdd) {
            YVal1 = (long long *)(pOddLines + Line * Pitch);
            YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch);
            YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch);
            Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength);
        } else {
            YVal1 = (long long *)(pEvenLines + Line * Pitch);
            YVal2 = (long long *)(pOddLines + Line * Pitch);
            YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch);
            Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength);
        }

        // For ease of reading, the comments below assume we're operating on
        // an odd field (i.e., IsOdd is true). Exactly the same processing is
        // done on an even field, with the roles of the odd and even lines
        // reversed.

        // Copy the odd line to the overlay verbatim.
        ac_memcpy((char *)Dest + LineLength, YVal3, LineLength);

        n = LineLength >> 3;
        while (n--) {
            movq_m2r (*YVal1++, mm0);   // O1
            movq_m2r (*YVal2++, mm1);   // E
            movq_m2r (*YVal3++, mm2);   // O2

            // get the luma intensities into mm3 - mm5
            movq_r2r (mm0, mm3);
            movq_r2r (mm1, mm4);
            movq_r2r (mm2, mm5);
            pand_m2r (YMask, mm3);
            pand_m2r (YMask, mm4);
            pand_m2r (YMask, mm5);

            // get the average of O1 and O2 into mm0
            pand_m2r (Mask, mm0);
            pand_m2r (Mask, mm2);
            psrlw_i2r (1, mm0);
            psrlw_i2r (1, mm2);
            paddw_r2r (mm2, mm0);

            // work out (O1 - E) * (O2 - E) - EdgeDetect * ((O1 - O2) ^ 2 >> 12)
            // on the pre-halved intensities; the result lands in mm6
            psrlw_i2r (1, mm3);
            psrlw_i2r (1, mm4);
            psrlw_i2r (1, mm5);

            movq_r2r (mm3, mm6);
            psubw_r2r (mm4, mm6);       // mm6 = O1 - E
            movq_r2r (mm5, mm7);
            psubw_r2r (mm4, mm7);       // mm7 = O2 - E
            pmullw_r2r (mm7, mm6);      // mm6 = (O1 - E) * (O2 - E)

            movq_r2r (mm3, mm7);
            psubw_r2r (mm5, mm7);       // mm7 = O1 - O2
            pmullw_r2r (mm7, mm7);      // mm7 = (O1 - O2) ^ 2
            psrlw_i2r (12, mm7);        // mm7 = (O1 - O2) ^ 2 >> 12
            pmullw_m2r (qwEdgeDetect, mm7); // mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12
            psubw_r2r (mm7, mm6);       // mm6 is what we want

            // where mm6 > threshold take the average (bob), else weave E
            pcmpgtw_m2r (qwThreshold, mm6);
            movq_r2r (mm6, mm7);
            pand_r2r (mm6, mm0);
            pandn_r2r (mm1, mm7);
            por_r2r (mm0, mm7);
            movq_r2m (mm7, *Dest++);
        }
    }

    // Copy the last odd line if we're processing an even field.
    if (!IsOdd) {
        ac_memcpy(pdst + (height * 2 - 1) * LineLength,
                  pOddLines + (height - 1) * Pitch, LineLength);
    }

    // clear out the MMX registers, ready for doing floating point again
    emms();
}
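/* Per-luma-word model of the weave/bob decision above (a sketch): o1/o2 are
 * the pixels from the field being kept, e the pixel between them from the
 * other field, avg their halved average, and weave the pixel a plain weave
 * would use. Values are pre-halved exactly as in the MMX code, and pmullw
 * keeps only the low 16 bits of each product. */
static uint8_t
bob_pixel_ref(uint8_t o1, uint8_t e, uint8_t o2, uint8_t avg, uint8_t weave)
{
    const int EdgeDetect = 625, JaggieThreshold = 73;
    int O1 = o1 >> 1, E = e >> 1, O2 = o2 >> 1;
    int d = O1 - O2;
    int m = (O1 - E) * (O2 - E) - EdgeDetect * ((d * d) >> 12);

    return (m > JaggieThreshold) ? avg : weave;  /* over threshold: bob */
}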