/*
 * MC_avg4_16: for each of `height` rows, build a 16-byte-wide prediction from
 * the four reference neighbours
 *     A = ref[x], B = ref[x+1], C = ref[x+stride], D = ref[x+stride+1]
 * combine it with the bytes already present in dest, and store the result
 * back to dest.  Each row is handled as two independent 8-byte halves
 * (offsets 0 and 8) using the same instruction sequence.
 *
 * height  number of rows; the do/while runs at least once, so it must be >= 1
 * dest    destination rows, read and written, 16 bytes per row
 * ref     reference rows; bytes 0..16 of the current and the next row are read
 * stride  byte distance between consecutive rows, used for both ref and dest
 * cpu     not referenced directly in this body
 *         NOTE(review): presumably consumed by the pavg_r2r macro - confirm
 *
 * pavg_r2r and mask_one are defined outside this block.
 * NOTE(review): pavg_r2r is presumably a per-byte rounding average and
 * mask_one presumably holds 0x01 in every byte, which would make the
 * psubusb step a rounding correction for the nested average - confirm.
 * The macros take (source, destination) operands.
 * The MMX state is not cleared here (no emms in this block).
 */
static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref, const int stride, const int cpu)
{
    do {
	/* ---- left half: bytes 0..7 ---- */
	movq_m2r (*ref, mm0);			/* mm0 = A */
	movq_m2r (*(ref+stride+1), mm1);	/* mm1 = D */
	movq_r2r (mm0, mm7);			/* mm7 = A */
	movq_m2r (*(ref+1), mm2);		/* mm2 = B */
	pxor_r2r (mm1, mm7);			/* mm7 = A ^ D */
	movq_m2r (*(ref+stride), mm3);		/* mm3 = C */
	movq_r2r (mm2, mm6);			/* mm6 = B */
	pxor_r2r (mm3, mm6);			/* mm6 = B ^ C */
	pavg_r2r (mm1, mm0);			/* mm0 = pavg (A, D) */
	pavg_r2r (mm3, mm2);			/* mm2 = pavg (B, C) */
	por_r2r (mm6, mm7);			/* mm7 = (A ^ D) | (B ^ C) */
	movq_r2r (mm0, mm6);			/* mm6 = pavg (A, D) */
	pxor_r2r (mm2, mm6);			/* mm6 = pavg (A, D) ^ pavg (B, C) */
	pand_r2r (mm6, mm7);			/* keep bits set in both terms */
	pand_m2r (mask_one, mm7);		/* mm7 &= mask_one */
	pavg_r2r (mm2, mm0);			/* mm0 = pavg of the two diagonal results */
	psubusb_r2r (mm7, mm0);			/* subtract mm7, unsigned saturating */
	movq_m2r (*dest, mm1);			/* mm1 = current destination bytes */
	pavg_r2r (mm1, mm0);			/* combine prediction with destination */
	movq_r2m (mm0, *dest);			/* store left half */

	/* ---- right half: bytes 8..15, same sequence at offset 8 ---- */
	movq_m2r (*(ref+8), mm0);		/* mm0 = A */
	movq_m2r (*(ref+stride+9), mm1);	/* mm1 = D */
	movq_r2r (mm0, mm7);			/* mm7 = A */
	movq_m2r (*(ref+9), mm2);		/* mm2 = B */
	pxor_r2r (mm1, mm7);			/* mm7 = A ^ D */
	movq_m2r (*(ref+stride+8), mm3);	/* mm3 = C */
	movq_r2r (mm2, mm6);			/* mm6 = B */
	pxor_r2r (mm3, mm6);			/* mm6 = B ^ C */
	pavg_r2r (mm1, mm0);			/* mm0 = pavg (A, D) */
	pavg_r2r (mm3, mm2);			/* mm2 = pavg (B, C) */
	por_r2r (mm6, mm7);			/* mm7 = (A ^ D) | (B ^ C) */
	movq_r2r (mm0, mm6);			/* mm6 = pavg (A, D) */
	pxor_r2r (mm2, mm6);			/* mm6 = pavg (A, D) ^ pavg (B, C) */
	pand_r2r (mm6, mm7);			/* keep bits set in both terms */
	pand_m2r (mask_one, mm7);		/* mm7 &= mask_one */
	pavg_r2r (mm2, mm0);			/* mm0 = pavg of the two diagonal results */
	psubusb_r2r (mm7, mm0);			/* subtract mm7, unsigned saturating */
	movq_m2r (*(dest+8), mm1);		/* mm1 = current destination bytes */
	pavg_r2r (mm1, mm0);			/* combine prediction with destination */
	ref += stride;				/* next reference row */
	movq_r2m (mm0, *(dest+8));		/* store right half */
	dest += stride;				/* next destination row */
    } while (--height);
}
static inline void mean8(unsigned char *refpix,unsigned char *pixel,int radius_count,int row_stride,int threshold,int8_t *diff,unsigned char *count) { int a,b; pxor_r2r(mm6,mm6); // mm6 (aka count) = 0 pxor_r2r(mm7,mm7); // mm7 (aka diff) = 0 movq_m2r(*refpix,mm3); // mm3 = refpix[0] movd_g2r(0x80808080,mm4); // mm4 = 128 punpcklbw_r2r(mm4,mm4); pxor_r2r(mm4,mm3); // mm3 = refpix[0]-128 movd_g2r(threshold,mm5); // mm5 = threshold punpcklbw_r2r(mm5,mm5); punpcklbw_r2r(mm5,mm5); punpcklbw_r2r(mm5,mm5); for( b=0; b<radius_count; b++ ) { for( a=0; a<radius_count; a++ ) { movq_m2r(*pixel,mm0); // mm0 = pixel[0] pxor_r2r(mm4,mm0); // mm0 = pixel[0]-128 movq_r2r(mm3,mm2); // mm2 = refpix[0]-128 psubsb_r2r(mm0,mm2); // mm2 = refpix[0]-pixel[0] psubsb_r2r(mm3,mm0); // mm0 = pixel[0]-refpix[0] pminub_r2r(mm0,mm2); // mm2 = abs(pixel[0]-refpix[0]) movq_r2r(mm5,mm1); // mm1 = threshold pcmpgtb_r2r(mm2,mm1); // mm1 = (threshold > abs(pixel[0]-refpix[0])) ? -1 : 0 psubb_r2r(mm1,mm6); // mm6 += (threshold > abs(pixel[0]-refpix[0])) pand_r2r(mm1,mm0); // mm0 = (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0 paddb_r2r(mm0,mm7); // mm7 += (threshold > abs(pixel[0]-refpix[0])) ? pixel[0]-refpix[0] : 0 ++pixel; } pixel += row_stride - radius_count; } movq_r2m(mm6,*count); movq_r2m(mm7,*diff); emms(); }
/*
 * MC_put4_8: for each of `height` rows, write an 8-byte-wide prediction to
 * dest, built from the four reference neighbours ref[x], ref[x+1],
 * ref[x+stride] and ref[x+stride+1].
 *
 * The horizontal pair of every reference row is combined only once: the
 * result for the "upper" row is carried in mm0 (pavg of the pair) and mm7
 * (xor of the pair) from one iteration to the next, so the loop touches
 * height + 1 reference rows in total.
 *
 * height  number of rows; the do/while runs at least once, so it must be >= 1
 * dest    destination rows, written only, 8 bytes per row
 * ref     reference rows; bytes 0..8 of each of the height + 1 rows are read
 * stride  byte distance between consecutive rows, used for both ref and dest
 * cpu     not referenced directly in this body
 *         NOTE(review): presumably consumed by the pavg_r2r macro - confirm
 *
 * pavg_r2r and mask_one are defined outside this block.
 * NOTE(review): pavg_r2r is presumably a per-byte rounding average and
 * mask_one presumably holds 0x01 in every byte, which would make the
 * psubusb step a rounding correction for the nested average - confirm.
 * The macros take (source, destination) operands.
 * The MMX state is not cleared here (no emms in this block).
 */
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref, const int stride, const int cpu)
{
    /* Prime the carried state with the first reference row. */
    movq_m2r (*ref, mm0);		/* mm0 = ref[0..7] */
    movq_m2r (*(ref+1), mm1);		/* mm1 = ref[1..8] */
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);		/* mm7 = xor of the upper pair */
    pavg_r2r (mm1, mm0);		/* mm0 = pavg of the upper pair */
    ref += stride;

    do {
	movq_m2r (*ref, mm2);		/* mm2 = lower row, ref[0..7] */
	movq_r2r (mm0, mm5);		/* mm5 = pavg of the upper pair */
	movq_m2r (*(ref+1), mm3);	/* mm3 = lower row, ref[1..8] */
	movq_r2r (mm2, mm6);
	pxor_r2r (mm3, mm6);		/* mm6 = xor of the lower pair */
	pavg_r2r (mm3, mm2);		/* mm2 = pavg of the lower pair */
	por_r2r (mm6, mm7);		/* mm7 = upper xor | lower xor */
	pxor_r2r (mm2, mm5);		/* mm5 = upper pavg ^ lower pavg */
	pand_r2r (mm5, mm7);		/* keep bits set in both terms */
	pavg_r2r (mm2, mm0);		/* mm0 = pavg of the two row results */
	pand_m2r (mask_one, mm7);	/* mm7 &= mask_one */
	psubusb_r2r (mm7, mm0);		/* subtract mm7, unsigned saturating */
	ref += stride;
	movq_r2m (mm0, *dest);		/* store the finished row */
	dest += stride;
	/* The lower row becomes the upper row of the next iteration. */
	movq_r2r (mm6, mm7); /* unroll ! */
	movq_r2r (mm2, mm0); /* unroll ! */
    } while (--height);
}
static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int width, uint8_t *m, uint8_t *t, uint8_t *b ) { int i; const mmx_t ymask = { 0x00ff00ff00ff00ffULL }; const mmx_t cmask = { 0xff00ff00ff00ff00ULL }; // Get width in bytes. width *= 2; i = width / 8; width -= i * 8; movq_m2r( ymask, mm7 ); movq_m2r( cmask, mm6 ); while ( i-- ) { movq_m2r( *t, mm0 ); movq_m2r( *b, mm1 ); movq_m2r( *m, mm2 ); movq_r2r ( mm2, mm3 ); pand_r2r ( mm7, mm3 ); pand_r2r ( mm6, mm0 ); pand_r2r ( mm6, mm1 ); pand_r2r ( mm6, mm2 ); psrlq_i2r( 8, mm0 ); psrlq_i2r( 7, mm1 ); psrlq_i2r( 8, mm2 ); movq_r2r ( mm0, mm4 ); psllw_i2r( 1, mm4 ); paddw_r2r( mm4, mm0 ); movq_r2r ( mm2, mm4 ); psllw_i2r( 1, mm4 ); paddw_r2r( mm4, mm2 ); paddw_r2r( mm0, mm2 ); paddw_r2r( mm1, mm2 ); psllw_i2r( 5, mm2 ); pand_r2r( mm6, mm2 ); por_r2r ( mm3, mm2 ); movq_r2m( mm2, *output ); output += 8; t += 8; b += 8; m += 8; } output++; t++; b++; m++; while ( width-- ) { *output = (3 * *t + 3 * *m + 2 * *b) >> 3; output +=2; t+=2; b+=2; m+=2; } emms(); }
static void deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL * self, const guint8 * m0, const guint8 * t1, const guint8 * b1, const guint8 * m2, guint8 * output, gint width) { mmx_t MaxComb; // How badly do we let it weave? 0-255 MaxComb.ub[0] = self->max_comb; MaxComb.ub[1] = self->max_comb; MaxComb.ub[2] = self->max_comb; MaxComb.ub[3] = self->max_comb; MaxComb.ub[4] = self->max_comb; MaxComb.ub[5] = self->max_comb; MaxComb.ub[6] = self->max_comb; MaxComb.ub[7] = self->max_comb; // L2 == m0 // L1 == t1 // L3 == b1 // LP2 == m2 movq_m2r (MaxComb, mm6); for (; width > 7; width -= 8) { movq_m2r (*t1, mm1); // L1 movq_m2r (*m0, mm2); // L2 movq_m2r (*b1, mm3); // L3 movq_m2r (*m2, mm0); // LP2 // average L1 and L3 leave result in mm4 movq_r2r (mm1, mm4); // L1 pavgb_r2r (mm3, mm4); // (L1 + L3)/2 // get abs value of possible L2 comb movq_r2r (mm2, mm7); // L2 psubusb_r2r (mm4, mm7); // L2 - avg movq_r2r (mm4, mm5); // avg psubusb_r2r (mm2, mm5); // avg - L2 por_r2r (mm7, mm5); // abs(avg-L2) // get abs value of possible LP2 comb movq_r2r (mm0, mm7); // LP2 psubusb_r2r (mm4, mm7); // LP2 - avg psubusb_r2r (mm0, mm4); // avg - LP2 por_r2r (mm7, mm4); // abs(avg-LP2) // use L2 or LP2 depending upon which makes smaller comb psubusb_r2r (mm5, mm4); // see if it goes to zero pxor_r2r (mm5, mm5); // 0 pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 pcmpeqb_r2r (mm4, mm5); // opposite of mm4 // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 por_r2r (mm5, mm4); // may the best win // Now lets clip our chosen value to be not outside of the range // of the high/low range L1-L3 by more than abs(L1-L3) // This allows some comb but limits the damages and also allows more // detail than a boring oversmoothed clip. 
movq_r2r (mm1, mm2); // copy L1 pmaxub_r2r (mm3, mm2); // now = Max(L1,L3) pminub_r2r (mm1, mm3); // now = Min(L1,L3) // allow the value to be above the high or below the low by amt of MaxComb paddusb_r2r (mm6, mm2); // increase max by diff psubusb_r2r (mm6, mm3); // lower min by diff pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped movq_r2m (mm2, *output); // move in our clipped best // Advance to the next set of pixels. output += 8; m0 += 8; t1 += 8; b1 += 8; m2 += 8; } emms (); if (width > 0) deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width); }
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc, int width, int height ) { int Line; long long* YVal1; long long* YVal2; long long* YVal3; long long* Dest; uint8_t* pEvenLines = psrc; uint8_t* pOddLines = psrc+width; int LineLength = width; int Pitch = width * 2; int IsOdd = 1; long EdgeDetect = 625; long JaggieThreshold = 73; int n; unsigned long long qwEdgeDetect; unsigned long long qwThreshold; const unsigned long long Mask = 0xfefefefefefefefeULL; const unsigned long long YMask = 0x00ff00ff00ff00ffULL; qwEdgeDetect = EdgeDetect; qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16); qwThreshold = JaggieThreshold; qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16); // copy first even line no matter what, and the first odd line if we're // processing an odd field. ac_memcpy(pdst, pEvenLines, LineLength); if (IsOdd) ac_memcpy(pdst + LineLength, pOddLines, LineLength); height = height / 2; for (Line = 0; Line < height - 1; ++Line) { if (IsOdd) { YVal1 = (long long *)(pOddLines + Line * Pitch); YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch); YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch); Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength); } else { YVal1 = (long long *)(pEvenLines + Line * Pitch); YVal2 = (long long *)(pOddLines + Line * Pitch); YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch); Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength); } // For ease of reading, the comments below assume that we're operating on an odd // field (i.e., that bIsOdd is true). The exact same processing is done when we // operate on an even field, but the roles of the odd and even fields are reversed. // It's just too cumbersome to explain the algorithm in terms of "the next odd // line if we're doing an odd field, or the next even line if we're doing an // even field" etc. 
So wherever you see "odd" or "even" below, keep in mind that // half the time this function is called, those words' meanings will invert. // Copy the odd line to the overlay verbatim. ac_memcpy((char *)Dest + LineLength, YVal3, LineLength); n = LineLength >> 3; while( n-- ) { movq_m2r (*YVal1++, mm0); movq_m2r (*YVal2++, mm1); movq_m2r (*YVal3++, mm2); // get intensities in mm3 - 4 movq_r2r ( mm0, mm3 ); movq_r2r ( mm1, mm4 ); movq_r2r ( mm2, mm5 ); pand_m2r ( *&YMask, mm3 ); pand_m2r ( *&YMask, mm4 ); pand_m2r ( *&YMask, mm5 ); // get average in mm0 pand_m2r ( *&Mask, mm0 ); pand_m2r ( *&Mask, mm2 ); psrlw_i2r ( 01, mm0 ); psrlw_i2r ( 01, mm2 ); paddw_r2r ( mm2, mm0 ); // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12 // result will be in mm6 psrlw_i2r ( 01, mm3 ); psrlw_i2r ( 01, mm4 ); psrlw_i2r ( 01, mm5 ); movq_r2r ( mm3, mm6 ); psubw_r2r ( mm4, mm6 ); //mm6 = O1 - E movq_r2r ( mm5, mm7 ); psubw_r2r ( mm4, mm7 ); //mm7 = O2 - E pmullw_r2r ( mm7, mm6 ); // mm6 = (O1 - E) * (O2 - E) movq_r2r ( mm3, mm7 ); psubw_r2r ( mm5, mm7 ); // mm7 = (O1 - O2) pmullw_r2r ( mm7, mm7 ); // mm7 = (O1 - O2) ^ 2 psrlw_i2r ( 12, mm7 ); // mm7 = (O1 - O2) ^ 2 >> 12 pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12 psubw_r2r ( mm7, mm6 ); // mm6 is what we want pcmpgtw_m2r ( *&qwThreshold, mm6 ); movq_r2r ( mm6, mm7 ); pand_r2r ( mm6, mm0 ); pandn_r2r ( mm1, mm7 ); por_r2r ( mm0, mm7 ); movq_r2m ( mm7, *Dest++ ); } } // Copy last odd line if we're processing an even field. if (! IsOdd) { ac_memcpy(pdst + (height * 2 - 1) * LineLength, pOddLines + (height - 1) * Pitch, LineLength); } // clear out the MMX registers ready for doing floating point // again emms(); }