/*
 * Bidirectional sum of absolute differences over a 16*h block.
 *
 * For every byte of the block a prediction is built from three rounding
 * byte averages (pavgb, avg(a, b) = (a + b + 1) >> 1):
 *
 *     pred = avg(avg(*pf, *pf2), avg(*pb, *pb2))
 *
 * and the function returns the sum over the block of |pred - *p2| (psadbw).
 *
 * pf, pf2  first pair of reference rows that are averaged together
 * pb, pb2  second pair of reference rows that are averaged together
 * p2       block the prediction is compared against
 * lx       stride added to every pointer after each row
 * h        number of rows; a non-positive h yields 0
 *
 * Each row is handled as two 8-byte halves (offsets 0 and 8).
 */
static int bsad_0quad_mmxe(uint8_t *pf,uint8_t *pf2,uint8_t *pb,uint8_t *pb2,uint8_t *p2,int lx,int h)
{
	int32_t s = 0;

	/* An unguarded do/while would run with h == 0 (or negative) and walk
	 * far outside the buffers; no MMX state has been touched yet, so we
	 * can return without emms(). */
	if (h <= 0)
		return 0;

	pxor_r2r(mm7, mm7);		/* mm7 = running SAD total */
	do {
		/* Load the second operand of each pair ... */
		movq_m2r(pf2[0],mm0);
		movq_m2r(pf2[8],mm2);
		movq_m2r(pb2[0],mm1);
		movq_m2r(pb2[8],mm3);
		/* ... and average it with the first one. */
		pavgb_m2r(pf[0],mm0);
		pavgb_m2r(pf[8],mm2);
		pavgb_m2r(pb[0],mm1);
		pavgb_m2r(pb[8],mm3);
		/* Combine the two pair averages into the final prediction. */
		pavgb_r2r(mm1,mm0);
		pavgb_r2r(mm3,mm2);
		/* Absolute differences against p2, summed per 8-byte half. */
		psadbw_m2r(p2[0],mm0);
		psadbw_m2r(p2[8],mm2);
		paddd_r2r(mm0,mm7);
		paddd_r2r(mm2,mm7);

		pf  += lx;
		pf2 += lx;
		pb  += lx;
		pb2 += lx;
		p2  += lx;
		h--;
	} while (h > 0);

	movd_r2g(mm7,s);
	emms();
	return s;
}
/*
 * Linear interpolation of one output scanline: every output byte is the
 * average of the bytes at the same position in scanlines->t0 and
 * scanlines->b0.
 *
 * width is counted in units of two bytes (the first loop consumes 32 bytes
 * per 16 units of width).  self and parent are not used.
 *
 * The bulk of the line is averaged with pavgb and written with non-temporal
 * stores (movntq); the last (width & 3) units are done in scalar code.
 * Note that the scalar tail truncates ((a + b) >> 1) whereas pavgb rounds
 * up; that pre-existing difference is kept as is.
 *
 * NOTE(review): no sfence is issued after the movntq stores here - confirm
 * whether callers need the output to be globally visible in order.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  gint i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  /* 16 units (32 bytes) per iteration, four MMX registers wide. */
  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  /* 4 units (8 bytes) per iteration. */
  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  /* The loop above consumed every full group of 4, so only width % 4 units
   * remain.  Masking with 0x7 here would reprocess a group of 4 and run
   * 8 bytes past the end of all three buffers. */
  width = width & 0x3;

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }

  emms ();
}
/*
 * Linear interpolation of one output scanline: every byte of out is the
 * average of the bytes at the same position in top and bot.
 *
 * size is the number of bytes to produce.  self is not used.
 *
 * The bulk of the line is averaged with pavgb and written with non-temporal
 * stores (movntq); the last (size & 7) bytes are done in scalar code after
 * the MMX state has been released with emms.  Note that the scalar tail
 * truncates ((a + b) >> 1) whereas pavgb rounds up; that pre-existing
 * difference is kept as is.
 *
 * NOTE(review): no sfence is issued after the movntq stores here - confirm
 * whether callers need the output to be globally visible in order.
 */
static void
deinterlace_scanline_linear_mmxext (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  gint i;

  /* 32 bytes per iteration, four MMX registers wide. */
  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pavgb_r2r (mm1, mm0);
    pavgb_r2r (mm3, mm2);
    pavgb_r2r (mm5, mm4);
    pavgb_r2r (mm7, mm6);
    movntq_r2m (mm0, *out);
    movntq_r2m (mm2, *(out + 8));
    movntq_r2m (mm4, *(out + 16));
    movntq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  /* 8 bytes per iteration. */
  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pavgb_r2r (mm1, mm0);
    movntq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  /* The loop above consumed every full group of 8 bytes, so only size % 8
   * bytes remain.  Masking with 0xf here would reprocess a group of 8 and
   * run 8 bytes past the end of all three buffers. */
  size = size & 0x7;

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
static void interpolate_packed422_scanline_mmxext( uint8_t *output, uint8_t *top, uint8_t *bot, int width ) { int i; for( i = width/16; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); movq_m2r( *(bot + 8), mm2 ); movq_m2r( *(top + 8), mm3 ); movq_m2r( *(bot + 16), mm4 ); movq_m2r( *(top + 16), mm5 ); movq_m2r( *(bot + 24), mm6 ); movq_m2r( *(top + 24), mm7 ); pavgb_r2r( mm1, mm0 ); pavgb_r2r( mm3, mm2 ); pavgb_r2r( mm5, mm4 ); pavgb_r2r( mm7, mm6 ); movntq_r2m( mm0, *output ); movntq_r2m( mm2, *(output + 8) ); movntq_r2m( mm4, *(output + 16) ); movntq_r2m( mm6, *(output + 24) ); output += 32; top += 32; bot += 32; } width = (width & 0xf); for( i = width/4; i; --i ) { movq_m2r( *bot, mm0 ); movq_m2r( *top, mm1 ); pavgb_r2r( mm1, mm0 ); movntq_r2m( mm0, *output ); output += 8; top += 8; bot += 8; } width = width & 0x7; /* Handle last few pixels. */ for( i = width * 2; i; --i ) { *output++ = ((*top++) + (*bot++)) >> 1; } sfence(); emms(); }
/*
 * Fill an 8-byte wide, 8-row destination block from one field of src.
 *
 * Four passes are made.  Each pass reads the 8 bytes at src, copies them to
 * the current destination row, then writes to the following destination
 * row the rounded byte average (pavgb) of that source row and the source
 * row two lines (2*i_src) further down.  dst then advances by two rows and
 * src by two lines.
 *
 * i_dst / i_src are the row strides of dst / src.  The last pass reads the
 * source row at offset 8*i_src from the initial src.  No emms is executed
 * here.
 */
static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
{
    int i_pass;

    /* Interlaced */
    for( i_pass = 4; i_pass > 0; i_pass-- )
    {
        uint8_t *p_kept   = dst;             /* row taken straight from src */
        uint8_t *p_interp = dst + i_dst;     /* row that gets interpolated */
        uint8_t *p_below  = src + 2*i_src;   /* next line of the same field */

        /* Copy the field line unchanged. */
        movq_m2r( *src, mm0 );
        movq_r2m( mm0, *p_kept );

        /* Average it with the next field line for the missing row. */
        movq_m2r( *p_below, mm1 );
        pavgb_r2r( mm1, mm0 );
        movq_r2m( mm0, *p_interp );

        dst = p_interp + i_dst;
        src = p_below;
    }
}
static int bsad_1quad_mmxe(uint8_t *pf, uint8_t *pb, uint8_t *pb2, uint8_t *p2, int lx, int h) { int s; s = 0; /* the accumulator */ if (h > 0) { pcmpeqw_r2r(mm6, mm6); psrlw_i2r(15, mm6); paddw_r2r(mm6, mm6); pxor_r2r(mm7, mm7); pxor_r2r(mm5, mm5); do { BSAD_LOAD(pf[0],mm0,mm1); BSAD_LOAD_ACC(pf[1],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx+1],mm2,mm3,mm0,mm1); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psrlw_i2r(2, mm0); psrlw_i2r(2, mm1); packuswb_r2r(mm1, mm0); movq_m2r(pb2[0],mm1); pavgb_m2r(pb[0],mm1); pavgb_r2r(mm1, mm0); psadbw_m2r(p2[0],mm0); paddd_r2r(mm0,mm5); BSAD_LOAD(pf[8],mm0,mm1); BSAD_LOAD_ACC(pf[9],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx+8],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx+9],mm2,mm3,mm0,mm1); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psrlw_i2r(2, mm0); psrlw_i2r(2, mm1); packuswb_r2r(mm1, mm0); movq_m2r(pb2[8],mm1); pavgb_m2r(pb[8],mm1); pavgb_r2r(mm1, mm0); psadbw_m2r(p2[8],mm0); paddd_r2r(mm0,mm5); p2 += lx; pf += lx; pb += lx; pb2 += lx; h--; } while (h > 0); } movd_r2g(mm5,s); emms(); return s; }
static void deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL * self, const guint8 * m0, const guint8 * t1, const guint8 * b1, const guint8 * m2, guint8 * output, gint width) { mmx_t MaxComb; // How badly do we let it weave? 0-255 MaxComb.ub[0] = self->max_comb; MaxComb.ub[1] = self->max_comb; MaxComb.ub[2] = self->max_comb; MaxComb.ub[3] = self->max_comb; MaxComb.ub[4] = self->max_comb; MaxComb.ub[5] = self->max_comb; MaxComb.ub[6] = self->max_comb; MaxComb.ub[7] = self->max_comb; // L2 == m0 // L1 == t1 // L3 == b1 // LP2 == m2 movq_m2r (MaxComb, mm6); for (; width > 7; width -= 8) { movq_m2r (*t1, mm1); // L1 movq_m2r (*m0, mm2); // L2 movq_m2r (*b1, mm3); // L3 movq_m2r (*m2, mm0); // LP2 // average L1 and L3 leave result in mm4 movq_r2r (mm1, mm4); // L1 pavgb_r2r (mm3, mm4); // (L1 + L3)/2 // get abs value of possible L2 comb movq_r2r (mm2, mm7); // L2 psubusb_r2r (mm4, mm7); // L2 - avg movq_r2r (mm4, mm5); // avg psubusb_r2r (mm2, mm5); // avg - L2 por_r2r (mm7, mm5); // abs(avg-L2) // get abs value of possible LP2 comb movq_r2r (mm0, mm7); // LP2 psubusb_r2r (mm4, mm7); // LP2 - avg psubusb_r2r (mm0, mm4); // avg - LP2 por_r2r (mm7, mm4); // abs(avg-LP2) // use L2 or LP2 depending upon which makes smaller comb psubusb_r2r (mm5, mm4); // see if it goes to zero pxor_r2r (mm5, mm5); // 0 pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 pcmpeqb_r2r (mm4, mm5); // opposite of mm4 // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 por_r2r (mm5, mm4); // may the best win // Now lets clip our chosen value to be not outside of the range // of the high/low range L1-L3 by more than abs(L1-L3) // This allows some comb but limits the damages and also allows more // detail than a boring oversmoothed clip. 
movq_r2r (mm1, mm2); // copy L1 pmaxub_r2r (mm3, mm2); // now = Max(L1,L3) pminub_r2r (mm1, mm3); // now = Min(L1,L3) // allow the value to be above the high or below the low by amt of MaxComb paddusb_r2r (mm6, mm2); // increase max by diff psubusb_r2r (mm6, mm3); // lower min by diff pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped movq_r2m (mm2, *output); // move in our clipped best // Advance to the next set of pixels. output += 8; m0 += 8; t1 += 8; b1 += 8; m2 += 8; } emms (); if (width > 0) deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width); }