static inline void mmx_interp_average_2_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);      /* load 8 dest bytes */
    movq_r2r (mm1, mm2);        /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);      /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);      /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);        /* xor src1 and src2 */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm4, mm6);         /* or src1 and src2 */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2r (mm6, mm5);        /* copy subresult */

    pxor_r2r (mm1, mm5);        /* xor srcavg and dest */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm2, mm6);         /* or srcavg and dest */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2m (mm6, *dest);      /* store result in dest */
}
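/*
 * Scalar sketch of the bit trick above (illustrative, not part of the
 * original source): for bytes a and b, (a | b) - ((a ^ b) >> 1) equals
 * (a + b + 1) >> 1, an average that rounds up, computed without widening
 * to 16 bits.  mask1 is assumed to be 0xfefefefefefefefe so the 64-bit
 * psrlq behaves like eight independent per-byte shifts.
 */
static inline void ref_interp_average_2_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
{
    int i;

    for (i = 0; i < 8; i++) {
        uint8_t avg = (src1[i] | src2[i]) - ((src1[i] ^ src2[i]) >> 1);

        dest[i] = (dest[i] | avg) - ((dest[i] ^ avg) >> 1);
    }
}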
static inline void mmx_unpack_16rgb (uint8_t * image, const int cpu)
{
    static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
    static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};

    /*
     * convert RGB plane to RGB 16 bits
     * mm0 -> B, mm1 -> R, mm2 -> G
     * mm4 -> GB, mm5 -> AR pixel 4-7
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */

    pand_m2r (mmx_bluemask, mm0);       /* mm0 = b7b6b5b4b3______ */
    pand_m2r (mmx_greenmask, mm2);      /* mm2 = g7g6g5g4g3g2____ */
    pand_m2r (mmx_redmask, mm1);        /* mm1 = r7r6r5r4r3______ */

    psrlq_i2r (3, mm0);         /* mm0 = ______b7b6b5b4b3 */
    pxor_r2r (mm4, mm4);        /* mm4 = 0 */
    movq_r2r (mm0, mm5);        /* mm5 = ______b7b6b5b4b3 */
    movq_r2r (mm2, mm7);        /* mm7 = g7g6g5g4g3g2____ */

    punpcklbw_r2r (mm4, mm2);
    punpcklbw_r2r (mm1, mm0);

    psllq_i2r (3, mm2);
    por_r2r (mm2, mm0);
    movntq (mm0, *image);

    punpckhbw_r2r (mm4, mm7);
    punpckhbw_r2r (mm1, mm5);

    psllq_i2r (3, mm7);
    por_r2r (mm7, mm5);
    movntq (mm5, *(image+8));
}
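/*
 * Scalar sketch of the same RGB565 packing (illustrative): keep the top
 * 5/6/5 bits of R/G/B and merge them into one 16-bit pixel.  Note that
 * movntq above is a non-temporal store and needs MMXEXT/SSE, which is
 * presumably what the otherwise-unused cpu argument selects for in the
 * full source.
 */
static inline uint16_t ref_pack_16rgb (uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t) (((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
}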
static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride, const int cpu)
{
    do {
        movq_m2r (*ref, mm0);
        movq_m2r (*(ref+stride+1), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*dest, mm1);
        pavg_r2r (mm1, mm0);
        movq_r2m (mm0, *dest);

        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*(dest+8), mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}
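/*
 * Scalar sketch of the rounding fix above (illustrative).  pavg rounds up,
 * so nesting two pavgs can overshoot the true (a+b+c+d+2)>>2 by one; the
 * xor/or/mask_one sequence computes exactly when that happens and
 * subtracts the excess back off:
 */
static inline uint8_t ref_avg4 (uint8_t a, uint8_t b, uint8_t c, uint8_t d)
{
    uint8_t t1 = (a + b + 1) >> 1;                      /* pavg (a, b) */
    uint8_t t2 = (c + d + 1) >> 1;                      /* pavg (c, d) */
    uint8_t fix = ((a ^ b) | (c ^ d)) & (t1 ^ t2) & 1;  /* overshoot bit */

    return ((t1 + t2 + 1) >> 1) - fix;                  /* == (a+b+c+d+2)>>2 */
}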
static inline void mmx_end(uint8_t *src3, uint8_t *src5, uint8_t *dst, int X)
{
    punpcklbw_m2r (mm_cpool[0], mm4);
    punpckhbw_m2r (mm_cpool[0], mm5);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);

    movq_m2r (src5[X], mm2);
    movq_m2r (src5[X], mm3);
    punpcklbw_m2r (mm_cpool[0], mm2);
    punpckhbw_m2r (mm_cpool[0], mm3);
    psubusw_r2r (mm2, mm0);
    psubusw_r2r (mm3, mm1);

    psrlw_i2r (3, mm0);
    psrlw_i2r (3, mm1);

    psubw_r2r (mm6, mm4);
    psubw_r2r (mm7, mm5);

    packuswb_r2r (mm1, mm0);

    movq_r2r (mm4, mm6);
    movq_r2r (mm5, mm7);

    pcmpgtw_m2r (mm_lthr, mm4);
    pcmpgtw_m2r (mm_lthr, mm5);
    pcmpgtw_m2r (mm_hthr, mm6);
    pcmpgtw_m2r (mm_hthr, mm7);

    packsswb_r2r (mm5, mm4);
    packsswb_r2r (mm7, mm6);

    pxor_r2r (mm6, mm4);

    movq_r2r (mm4, mm5);

    pandn_r2r (mm0, mm4);
    pand_m2r (src3[X], mm5);

    por_r2r (mm4, mm5);
    movq_r2m (mm5, dst[X]);
}
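/*
 * mmx_end is a tail fragment: mm0/mm1 arrive holding a running word-wide
 * sum of neighbouring lines and mm4..mm7 a difference term, all set up by
 * code outside this excerpt (mm_cpool[0] apparently serves as the zero
 * register for the unpacks).  Reading just the register flow, the ending
 * is a threshold select; a scalar sketch of that select, an
 * assumption-laden reconstruction with mm_lthr/mm_hthr as the low/high
 * thresholds:
 */
static inline uint8_t ref_threshold_select (int16_t diff, uint8_t filtered,
                                            uint8_t orig,
                                            int16_t lthr, int16_t hthr)
{
    /* pcmpgtw/pxor above flag lthr < diff <= hthr; pand/pandn/por select */
    int in_band = (diff > lthr) ^ (diff > hthr);

    return in_band ? orig : filtered;
}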
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
                              const int stride, const int cpu)
{
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);
    pavg_r2r (mm1, mm0);
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);
        pavg_r2r (mm3, mm2);

        por_r2r (mm6, mm7);
        pxor_r2r (mm2, mm5);

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);

        pand_m2r (mask_one, mm7);
        psubusb_r2r (mm7, mm0);

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        movq_r2r (mm6, mm7);    /* unroll ! */
        movq_r2r (mm2, mm0);    /* unroll ! */
    } while (--height);
}
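/*
 * Scalar equivalent of MC_put4_8 (an illustrative sketch): half-pel
 * interpolation in both x and y.  Each output byte is the correctly
 * rounded mean of a 2x2 neighbourhood, which the pavg + mask_one
 * correction above reproduces without widening to 16 bits; the two
 * "unroll !" moves carry one row's work into the next iteration so each
 * source row is only loaded once.
 */
static inline void ref_put4_8 (int height, uint8_t * dest, const uint8_t * ref,
                               const int stride)
{
    int i;

    do {
        for (i = 0; i < 8; i++)
            dest[i] = (ref[i] + ref[i+1] +
                       ref[i+stride] + ref[i+stride+1] + 2) >> 2;
        ref += stride;
        dest += stride;
    } while (--height);
}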
static inline void mmx_yuv2rgb (uint8_t * py, uint8_t * pu, uint8_t * pv)
{
    static mmx_t mmx_80w = {0x0080008000800080LL};
    static mmx_t mmx_U_green = {0xf37df37df37df37dLL};
    static mmx_t mmx_U_blue = {0x4093409340934093LL};
    static mmx_t mmx_V_red = {0x3312331233123312LL};
    static mmx_t mmx_V_green = {0xe5fce5fce5fce5fcLL};
    static mmx_t mmx_10w = {0x1010101010101010LL};
    static mmx_t mmx_00ffw = {0x00ff00ff00ff00ffLL};
    static mmx_t mmx_Y_coeff = {0x253f253f253f253fLL};

    movd_m2r (*pu, mm0);        /* mm0 = 00 00 00 00 u3 u2 u1 u0 */
    movd_m2r (*pv, mm1);        /* mm1 = 00 00 00 00 v3 v2 v1 v0 */
    movq_m2r (*py, mm6);        /* mm6 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pxor_r2r (mm4, mm4);        /* mm4 = 0 */
    /* XXX might do cache preload for image here */

    /*
     * Do the multiply part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    punpcklbw_r2r (mm4, mm0);           /* mm0 = u3 u2 u1 u0 */
    punpcklbw_r2r (mm4, mm1);           /* mm1 = v3 v2 v1 v0 */
    psubsw_m2r (mmx_80w, mm0);          /* u -= 128 */
    psubsw_m2r (mmx_80w, mm1);          /* v -= 128 */
    psllw_i2r (3, mm0);                 /* promote precision */
    psllw_i2r (3, mm1);                 /* promote precision */
    movq_r2r (mm0, mm2);                /* mm2 = u3 u2 u1 u0 */
    movq_r2r (mm1, mm3);                /* mm3 = v3 v2 v1 v0 */
    pmulhw_m2r (mmx_U_green, mm2);      /* mm2 = u * u_green */
    pmulhw_m2r (mmx_V_green, mm3);      /* mm3 = v * v_green */
    pmulhw_m2r (mmx_U_blue, mm0);       /* mm0 = chroma_b */
    pmulhw_m2r (mmx_V_red, mm1);        /* mm1 = chroma_r */
    paddsw_r2r (mm3, mm2);              /* mm2 = chroma_g */

    psubusb_m2r (mmx_10w, mm6);         /* Y -= 16 */
    movq_r2r (mm6, mm7);                /* mm7 = Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 */
    pand_m2r (mmx_00ffw, mm6);          /* mm6 = Y6 Y4 Y2 Y0 */
    psrlw_i2r (8, mm7);                 /* mm7 = Y7 Y5 Y3 Y1 */
    psllw_i2r (3, mm6);                 /* promote precision */
    psllw_i2r (3, mm7);                 /* promote precision */
    pmulhw_m2r (mmx_Y_coeff, mm6);      /* mm6 = luma_rgb even */
    pmulhw_m2r (mmx_Y_coeff, mm7);      /* mm7 = luma_rgb odd */

    /*
     * Do the addition part of the conversion for even and odd pixels
     * register usage:
     * mm0 -> Cblue, mm1 -> Cred, mm2 -> Cgreen even pixels
     * mm3 -> Cblue, mm4 -> Cred, mm5 -> Cgreen odd pixels
     * mm6 -> Y even, mm7 -> Y odd
     */

    movq_r2r (mm0, mm3);                /* mm3 = chroma_b */
    movq_r2r (mm1, mm4);                /* mm4 = chroma_r */
    movq_r2r (mm2, mm5);                /* mm5 = chroma_g */
    paddsw_r2r (mm6, mm0);              /* mm0 = B6 B4 B2 B0 */
    paddsw_r2r (mm7, mm3);              /* mm3 = B7 B5 B3 B1 */
    paddsw_r2r (mm6, mm1);              /* mm1 = R6 R4 R2 R0 */
    paddsw_r2r (mm7, mm4);              /* mm4 = R7 R5 R3 R1 */
    paddsw_r2r (mm6, mm2);              /* mm2 = G6 G4 G2 G0 */
    paddsw_r2r (mm7, mm5);              /* mm5 = G7 G5 G3 G1 */

    packuswb_r2r (mm0, mm0);            /* saturate to 0-255 */
    packuswb_r2r (mm1, mm1);            /* saturate to 0-255 */
    packuswb_r2r (mm2, mm2);            /* saturate to 0-255 */
    packuswb_r2r (mm3, mm3);            /* saturate to 0-255 */
    packuswb_r2r (mm4, mm4);            /* saturate to 0-255 */
    packuswb_r2r (mm5, mm5);            /* saturate to 0-255 */

    punpcklbw_r2r (mm3, mm0);           /* mm0 = B7 B6 B5 B4 B3 B2 B1 B0 */
    punpcklbw_r2r (mm4, mm1);           /* mm1 = R7 R6 R5 R4 R3 R2 R1 R0 */
    punpcklbw_r2r (mm5, mm2);           /* mm2 = G7 G6 G5 G4 G3 G2 G1 G0 */
}
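/*
 * Scalar view of the fixed-point math above (an illustrative sketch,
 * assuming BT.601 coefficients).  pmulhw computes (x * coef) >> 16 and the
 * inputs were shifted left by 3, so each product is value * coef / 8192:
 *   Y_coeff 0x253f =  9535 ->  1.164  (255/219 range expansion)
 *   V_red   0x3312 = 13074 ->  1.596
 *   U_green 0xf37d = -3203 -> -0.391
 *   V_green 0xe5fc = -6660 -> -0.813
 *   U_blue  0x4093 = 16531 ->  2.018
 */
static inline void ref_yuv2rgb (int y, int u, int v, int * r, int * g, int * b)
{
    int luma = ((y - 16) * 9535) >> 13;

    *r = luma + (((v - 128) * 13074) >> 13);
    *g = luma - (((u - 128) * 3203 + (v - 128) * 6660) >> 13);
    *b = luma + (((u - 128) * 16531) >> 13);
    /* the caller clamps to 0..255, as packuswb does above */
}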
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceSimpleMethod * self,
    guint8 * out, const guint8 * bot, const guint8 * top, gint size)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;

  for (i = size / 32; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  size = (size & 0x1f);

  for (i = size / 8; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  emms ();

  size = size & 0x7;            /* bytes left over from the 8-byte loop */

  /* Handle last few pixels. */
  for (i = size; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
}
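/*
 * Scalar equivalent of the shiftmask trick above (an illustrative sketch,
 * not part of the original source).  psrlw shifts whole 16-bit words, so
 * bit 8 would leak from the high byte into the low byte; clearing it first
 * (0xfeff per word) makes the word shift act as two independent byte
 * shifts.  Note this computes (a >> 1) + (b >> 1), which can be one less
 * than the (a + b) >> 1 of the scalar tail when both low bits are set.
 */
static void
ref_scanline_linear (guint8 * out, const guint8 * bot, const guint8 * top,
    gint size)
{
  gint i;

  for (i = 0; i < size; i++)
    out[i] = (bot[i] >> 1) + (top[i] >> 1);
}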
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2,
                                            const uint8_t * src3,
                                            const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);       /* add lows */
    paddw_r2r (mm6, mm2);       /* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         /* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);      /* load 8 dest bytes */
    movq_r2r (mm3, mm4);        /* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
    movq_r2r (mm1, mm2);        /* copy subresult */

    pxor_r2r (mm1, mm3);        /* xor srcavg and dest */
    pand_m2r (mask1, mm3);      /* mask lower bits */
    psrlq_i2r (1, mm3);         /* /2 */
    por_r2r (mm2, mm4);         /* or srcavg and dest */
    psubb_r2r (mm3, mm4);       /* subtract subresults */
    movq_r2m (mm4, *dest);      /* store result in dest */
}
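/*
 * Scalar reference for the above (an illustrative sketch).  Two things the
 * MMX version leaves implicit: mm0 must already be zero on entry (the
 * callers clear it, since the unpacks use it to widen bytes to words), and
 * round4 is presumably {0x0002000200020002} so that the psraw by 2 rounds.
 */
static inline void ref_interp_average_4_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2,
                                            const uint8_t * src3,
                                            const uint8_t * src4)
{
    int i;

    for (i = 0; i < 8; i++) {
        int avg = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;

        dest[i] = (dest[i] | avg) - ((dest[i] ^ avg) >> 1);  /* (d+avg+1)>>1 */
    }
}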
static void
deinterlace_greedy_scanline_mmx (GstDeinterlaceMethodGreedyL * self,
    const guint8 * m0, const guint8 * t1, const guint8 * b1,
    const guint8 * m2, guint8 * output, gint width)
{
  mmx_t MaxComb;
  mmx_t ShiftMask;

  // How badly do we let it weave? 0-255
  MaxComb.ub[0] = self->max_comb;
  MaxComb.ub[1] = self->max_comb;
  MaxComb.ub[2] = self->max_comb;
  MaxComb.ub[3] = self->max_comb;
  MaxComb.ub[4] = self->max_comb;
  MaxComb.ub[5] = self->max_comb;
  MaxComb.ub[6] = self->max_comb;
  MaxComb.ub[7] = self->max_comb;

  ShiftMask.ub[0] = 0x7f;
  ShiftMask.ub[1] = 0x7f;
  ShiftMask.ub[2] = 0x7f;
  ShiftMask.ub[3] = 0x7f;
  ShiftMask.ub[4] = 0x7f;
  ShiftMask.ub[5] = 0x7f;
  ShiftMask.ub[6] = 0x7f;
  ShiftMask.ub[7] = 0x7f;

  // L2 == m0
  // L1 == t1
  // L3 == b1
  // LP2 == m2
  movq_m2r (MaxComb, mm6);

  for (; width > 7; width -= 8) {
    movq_m2r (*t1, mm1);        // L1
    movq_m2r (*m0, mm2);        // L2
    movq_m2r (*b1, mm3);        // L3
    movq_m2r (*m2, mm0);        // LP2

    // average L1 and L3, leave result in mm4
    movq_r2r (mm1, mm4);        // L1
    movq_r2r (mm3, mm5);        // L3
    psrlw_i2r (1, mm4);         // L1/2
    pand_m2r (ShiftMask, mm4);
    psrlw_i2r (1, mm5);         // L3/2
    pand_m2r (ShiftMask, mm5);
    paddusb_r2r (mm5, mm4);     // (L1 + L3) / 2

    // get abs value of possible L2 comb
    movq_r2r (mm2, mm7);        // L2
    psubusb_r2r (mm4, mm7);     // L2 - avg
    movq_r2r (mm4, mm5);        // avg
    psubusb_r2r (mm2, mm5);     // avg - L2
    por_r2r (mm7, mm5);         // abs(avg - L2)

    // get abs value of possible LP2 comb
    movq_r2r (mm0, mm7);        // LP2
    psubusb_r2r (mm4, mm7);     // LP2 - avg
    psubusb_r2r (mm0, mm4);     // avg - LP2
    por_r2r (mm7, mm4);         // abs(avg - LP2)

    // use L2 or LP2 depending upon which makes smaller comb
    psubusb_r2r (mm5, mm4);     // see if it goes to zero
    psubusb_r2r (mm5, mm5);     // 0
    pcmpeqb_r2r (mm5, mm4);     // if (mm4 == 0) then FF else 0
    pcmpeqb_r2r (mm4, mm5);     // opposite of mm4

    // if Comb(LP2) <= Comb(L2) then mm4 = ff, mm5 = 0 else mm4 = 0, mm5 = ff
    pand_r2r (mm2, mm5);        // use L2 if mm5 == ff, else 0
    pand_r2r (mm0, mm4);        // use LP2 if mm4 == ff, else 0
    por_r2r (mm5, mm4);         // may the best win

    // Now lets clip our chosen value to be not outside of the range
    // of the high/low range L1-L3 by more than abs(L1-L3)
    // This allows some comb but limits the damages and also allows more
    // detail than a boring oversmoothed clip.
    movq_r2r (mm1, mm2);        // copy L1
    psubusb_r2r (mm3, mm2);     // - L3, with saturation
    paddusb_r2r (mm3, mm2);     // now = Max(L1, L3)

    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm1, mm7);     // - L1
    paddusb_r2r (mm7, mm3);     // add, may sat at fff..
    psubusb_r2r (mm7, mm3);     // now = Min(L1, L3)

    // allow the value to be above the high or below the low by amt of MaxComb
    paddusb_r2r (mm6, mm2);     // increase max by diff
    psubusb_r2r (mm6, mm3);     // lower min by diff

    psubusb_r2r (mm3, mm4);     // best - min
    paddusb_r2r (mm3, mm4);     // now = Max(best, min)

    pcmpeqb_r2r (mm7, mm7);     // all ffffffff
    psubusb_r2r (mm4, mm7);     // - Max(best, min)
    paddusb_r2r (mm7, mm2);     // add, may sat at fff..
    psubusb_r2r (mm7, mm2);     // now = Min(Max(best, min), max), i.e. best clipped to range

    movq_r2m (mm2, *output);    // move in our clipped best

    // Advance to the next set of pixels.
    output += 8;
    m0 += 8;
    t1 += 8;
    b1 += 8;
    m2 += 8;
  }
  emms ();

  if (width > 0)
    deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
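/*
 * Scalar sketch of the greedy kernel above (illustrative, using GLib's
 * ABS/MAX/MIN/CLAMP).  Pick whichever weave candidate (m0 from this field,
 * m2 from the previous one) lies closer to the vertical average of t1/b1,
 * then clamp it to the t1/b1 range widened by max_comb, so a little
 * combing is tolerated but runaway weave is not.
 */
static guint8
greedy_pixel (guint8 m0, guint8 t1, guint8 b1, guint8 m2, guint8 max_comb)
{
  gint avg = (t1 >> 1) + (b1 >> 1);
  gint best = (ABS (m2 - avg) <= ABS (m0 - avg)) ? m2 : m0;
  gint mx = MIN (MAX (t1, b1) + max_comb, 255);
  gint mn = MAX (MIN (t1, b1) - max_comb, 0);

  return (guint8) CLAMP (best, mn, mx);
}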
static void
deinterlace_scanline_linear_mmx (GstDeinterlaceMethod * self,
    GstDeinterlace * parent, guint8 * out,
    GstDeinterlaceScanlineData * scanlines, gint width)
{
  const mmx_t shiftmask = { 0xfefffefffefffeffULL };    /* To avoid shifting chroma to luma. */
  int i;
  guint8 *bot = scanlines->b0, *top = scanlines->t0;

  for (i = width / 16; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    movq_m2r (*(bot + 8), mm2);
    movq_m2r (*(top + 8), mm3);
    movq_m2r (*(bot + 16), mm4);
    movq_m2r (*(top + 16), mm5);
    movq_m2r (*(bot + 24), mm6);
    movq_m2r (*(top + 24), mm7);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    pand_m2r (shiftmask, mm2);
    pand_m2r (shiftmask, mm3);
    pand_m2r (shiftmask, mm4);
    pand_m2r (shiftmask, mm5);
    pand_m2r (shiftmask, mm6);
    pand_m2r (shiftmask, mm7);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    psrlw_i2r (1, mm2);
    psrlw_i2r (1, mm3);
    psrlw_i2r (1, mm4);
    psrlw_i2r (1, mm5);
    psrlw_i2r (1, mm6);
    psrlw_i2r (1, mm7);
    paddb_r2r (mm1, mm0);
    paddb_r2r (mm3, mm2);
    paddb_r2r (mm5, mm4);
    paddb_r2r (mm7, mm6);
    movq_r2m (mm0, *out);
    movq_r2m (mm2, *(out + 8));
    movq_r2m (mm4, *(out + 16));
    movq_r2m (mm6, *(out + 24));
    out += 32;
    top += 32;
    bot += 32;
  }
  width = (width & 0xf);

  for (i = width / 4; i; --i) {
    movq_m2r (*bot, mm0);
    movq_m2r (*top, mm1);
    pand_m2r (shiftmask, mm0);
    pand_m2r (shiftmask, mm1);
    psrlw_i2r (1, mm0);
    psrlw_i2r (1, mm1);
    paddb_r2r (mm1, mm0);
    movq_r2m (mm0, *out);
    out += 8;
    top += 8;
    bot += 8;
  }
  width = width & 0x3;          /* pixels left over from the 4-pixel loop */

  /* Handle last few pixels. */
  for (i = width * 2; i; --i) {
    *out++ = ((*top++) + (*bot++)) >> 1;
  }
  emms ();
}
static void interpolate_packed422_scanline_mmx( uint8_t *output, uint8_t *top,
                                                uint8_t *bot, int width )
{
    const mmx_t shiftmask = { 0xfefffefffefffeffULL };  /* To avoid shifting chroma to luma. */
    int i;

    for( i = width/16; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        movq_m2r( *(bot + 8), mm2 );
        movq_m2r( *(top + 8), mm3 );
        movq_m2r( *(bot + 16), mm4 );
        movq_m2r( *(top + 16), mm5 );
        movq_m2r( *(bot + 24), mm6 );
        movq_m2r( *(top + 24), mm7 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        pand_m2r( shiftmask, mm2 );
        pand_m2r( shiftmask, mm3 );
        pand_m2r( shiftmask, mm4 );
        pand_m2r( shiftmask, mm5 );
        pand_m2r( shiftmask, mm6 );
        pand_m2r( shiftmask, mm7 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        psrlw_i2r( 1, mm2 );
        psrlw_i2r( 1, mm3 );
        psrlw_i2r( 1, mm4 );
        psrlw_i2r( 1, mm5 );
        psrlw_i2r( 1, mm6 );
        psrlw_i2r( 1, mm7 );
        paddb_r2r( mm1, mm0 );
        paddb_r2r( mm3, mm2 );
        paddb_r2r( mm5, mm4 );
        paddb_r2r( mm7, mm6 );
        movq_r2m( mm0, *output );
        movq_r2m( mm2, *(output + 8) );
        movq_r2m( mm4, *(output + 16) );
        movq_r2m( mm6, *(output + 24) );
        output += 32;
        top += 32;
        bot += 32;
    }
    width = (width & 0xf);

    for( i = width/4; i; --i ) {
        movq_m2r( *bot, mm0 );
        movq_m2r( *top, mm1 );
        pand_m2r( shiftmask, mm0 );
        pand_m2r( shiftmask, mm1 );
        psrlw_i2r( 1, mm0 );
        psrlw_i2r( 1, mm1 );
        paddb_r2r( mm1, mm0 );
        movq_r2m( mm0, *output );
        output += 8;
        top += 8;
        bot += 8;
    }
    width = width & 0x3;        /* pixels left over from the 4-pixel loop */

    /* Handle last few pixels. */
    for( i = width * 2; i; --i ) {
        *output++ = ((*top++) + (*bot++)) >> 1;
    }

    emms();
}
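/*
 * On MMXEXT/SSE-capable CPUs the inner step above collapses to a single
 * pavgb, which averages with round-up instead of the truncating
 * mask-and-shift.  A sketch of the 8-byte loop body, assuming the local
 * mmx.h provides a pavgb_r2r wrapper:
 */
static inline void interpolate_8bytes_mmxext( uint8_t *output, uint8_t *top,
                                              uint8_t *bot )
{
    movq_m2r( *bot, mm0 );
    movq_m2r( *top, mm1 );
    pavgb_r2r( mm1, mm0 );      /* per-byte (a + b + 1) >> 1 */
    movq_r2m( mm0, *output );
}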
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc,
                             int width, int height)
{
    int Line;
    long long* YVal1;
    long long* YVal2;
    long long* YVal3;
    long long* Dest;
    uint8_t* pEvenLines = psrc;
    uint8_t* pOddLines = psrc + width;
    int LineLength = width;
    int Pitch = width * 2;
    int IsOdd = 1;
    long EdgeDetect = 625;
    long JaggieThreshold = 73;
    int n;

    unsigned long long qwEdgeDetect;
    unsigned long long qwThreshold;
    const unsigned long long Mask = 0xfefefefefefefefeULL;
    const unsigned long long YMask = 0x00ff00ff00ff00ffULL;

    qwEdgeDetect = EdgeDetect;
    qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16);
    qwThreshold = JaggieThreshold;
    qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16);

    // copy first even line no matter what, and the first odd line if we're
    // processing an odd field.
    ac_memcpy(pdst, pEvenLines, LineLength);
    if (IsOdd)
        ac_memcpy(pdst + LineLength, pOddLines, LineLength);

    height = height / 2;
    for (Line = 0; Line < height - 1; ++Line) {
        if (IsOdd) {
            YVal1 = (long long *)(pOddLines + Line * Pitch);
            YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch);
            YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch);
            Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength);
        } else {
            YVal1 = (long long *)(pEvenLines + Line * Pitch);
            YVal2 = (long long *)(pOddLines + Line * Pitch);
            YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch);
            Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength);
        }

        // For ease of reading, the comments below assume that we're operating
        // on an odd field (i.e., that IsOdd is true).  The exact same
        // processing is done when we operate on an even field, but the roles
        // of the odd and even fields are reversed.  It's just too cumbersome
        // to explain the algorithm in terms of "the next odd line if we're
        // doing an odd field, or the next even line if we're doing an even
        // field" etc.  So wherever you see "odd" or "even" below, keep in
        // mind that half the time this function is called, those words'
        // meanings will invert.

        // Copy the odd line to the overlay verbatim.
        ac_memcpy((char *)Dest + LineLength, YVal3, LineLength);

        n = LineLength >> 3;
        while( n-- ) {
            movq_m2r (*YVal1++, mm0);
            movq_m2r (*YVal2++, mm1);
            movq_m2r (*YVal3++, mm2);

            // get intensities in mm3 - mm5
            movq_r2r ( mm0, mm3 );
            movq_r2r ( mm1, mm4 );
            movq_r2r ( mm2, mm5 );

            pand_m2r ( *&YMask, mm3 );
            pand_m2r ( *&YMask, mm4 );
            pand_m2r ( *&YMask, mm5 );

            // get average in mm0
            pand_m2r ( *&Mask, mm0 );
            pand_m2r ( *&Mask, mm2 );
            psrlw_i2r ( 1, mm0 );
            psrlw_i2r ( 1, mm2 );
            paddw_r2r ( mm2, mm0 );

            // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12
            // result will be in mm6
            psrlw_i2r ( 1, mm3 );
            psrlw_i2r ( 1, mm4 );
            psrlw_i2r ( 1, mm5 );

            movq_r2r ( mm3, mm6 );
            psubw_r2r ( mm4, mm6 );     // mm6 = O1 - E

            movq_r2r ( mm5, mm7 );
            psubw_r2r ( mm4, mm7 );     // mm7 = O2 - E

            pmullw_r2r ( mm7, mm6 );    // mm6 = (O1 - E) * (O2 - E)

            movq_r2r ( mm3, mm7 );
            psubw_r2r ( mm5, mm7 );     // mm7 = (O1 - O2)
            pmullw_r2r ( mm7, mm7 );    // mm7 = (O1 - O2) ^ 2
            psrlw_i2r ( 12, mm7 );      // mm7 = (O1 - O2) ^ 2 >> 12
            pmullw_m2r ( *&qwEdgeDetect, mm7 ); // mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12

            psubw_r2r ( mm7, mm6 );     // mm6 is what we want

            pcmpgtw_m2r ( *&qwThreshold, mm6 );

            movq_r2r ( mm6, mm7 );

            pand_r2r ( mm6, mm0 );
            pandn_r2r ( mm1, mm7 );
            por_r2r ( mm0, mm7 );

            movq_r2m ( mm7, *Dest++ );
        }
    }

    // Copy last odd line if we're processing an even field.
    if (!IsOdd) {
        ac_memcpy(pdst + (height * 2 - 1) * LineLength,
                  pOddLines + (height - 1) * Pitch, LineLength);
    }

    // clear out the MMX registers ready for doing floating point again
    emms();
}
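/*
 * Scalar sketch of the per-pixel decision above (illustrative; it ignores
 * the 16-bit wraparound the packed pmullw arithmetic can incur).  O1/O2
 * are the field lines above and below, E the line between them.  When the
 * middle line sticks out from both neighbours while O1 and O2 agree, the
 * metric goes large and the pixel is treated as an interlacing artifact,
 * so the average of O1/O2 is used; otherwise E is woven through verbatim.
 */
static inline uint8_t bob_pixel(uint8_t o1, uint8_t e, uint8_t o2,
                                int edge_detect, int threshold)
{
    int d1 = (o1 >> 1) - (e >> 1);              /* O1 - E  */
    int d2 = (o2 >> 1) - (e >> 1);              /* O2 - E  */
    int dd = (o1 >> 1) - (o2 >> 1);             /* O1 - O2 */
    int metric = d1 * d2 - edge_detect * ((dd * dd) >> 12);

    return (metric > threshold) ? (uint8_t)((o1 >> 1) + (o2 >> 1)) : e;
}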