static inline void mmx_end(uint8_t *src3, uint8_t *src5, uint8_t *dst, int X) { punpcklbw_m2r (mm_cpool[0], mm4); punpckhbw_m2r (mm_cpool[0], mm5); psubusw_r2r (mm2, mm0); psubusw_r2r (mm3, mm1); movq_m2r (src5[X], mm2); movq_m2r (src5[X], mm3); punpcklbw_m2r (mm_cpool[0], mm2); punpckhbw_m2r (mm_cpool[0], mm3); psubusw_r2r (mm2, mm0); psubusw_r2r (mm3, mm1); psrlw_i2r (3, mm0); psrlw_i2r (3, mm1); psubw_r2r (mm6, mm4); psubw_r2r (mm7, mm5); packuswb_r2r (mm1,mm0); movq_r2r (mm4, mm6); movq_r2r (mm5, mm7); pcmpgtw_m2r (mm_lthr, mm4); pcmpgtw_m2r (mm_lthr, mm5); pcmpgtw_m2r (mm_hthr, mm6); pcmpgtw_m2r (mm_hthr, mm7); packsswb_r2r (mm5, mm4); packsswb_r2r (mm7, mm6); pxor_r2r (mm6, mm4); movq_r2r (mm4, mm5); pandn_r2r (mm0, mm4); pand_m2r (src3[X], mm5); por_r2r (mm4, mm5); movq_r2m (mm5, dst[X]); }
static void _op_mul_p_mas_dp_mmx(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { DATA32 *e = d + l; MOV_A2R(ALPHA_255, mm5) pxor_r2r(mm0, mm0); while (d < e) { c = *m; switch(c) { case 0: break; case 255: MOV_P2R(*d, mm1, mm0) MOV_P2R(*s, mm2, mm0) MUL4_SYM_R2R(mm2, mm1, mm5) MOV_R2P(mm1, *d, mm0) break; default: c++; MOV_A2R(c, mm1) c = ~(*s); MOV_P2R(c, mm3, mm0) MUL4_256_R2R(mm3, mm1) movq_r2r(mm5, mm4); psubw_r2r(mm1, mm4); MOV_P2R(*d, mm1, mm0) MUL4_SYM_R2R(mm4, mm1, mm5) MOV_R2P(mm1, *d, mm0) break; } s++; m++; d++; } }
static void frame_f2i_sse(float *src,u_char *dst,int l) { int i; // put 128 in all 4 words of mm7 movd_g2r(128,mm7); punpcklwd_r2r(mm7,mm7); punpckldq_r2r(mm7,mm7); // put 128 in all 8 bytes of mm6 movd_g2r(128,mm6); punpcklbw_r2r(mm6,mm6); punpcklwd_r2r(mm6,mm6); punpckldq_r2r(mm6,mm6); for( i=0; i<l; i+=8 ) { movaps_m2r(src[0],xmm0); movaps_m2r(src[4],xmm2); movhlps_r2r(xmm0,xmm1); cvtps2pi_r2r(xmm0,mm0); cvtps2pi_r2r(xmm1,mm1); movhlps_r2r(xmm2,xmm3); cvtps2pi_r2r(xmm2,mm2); cvtps2pi_r2r(xmm3,mm3); packssdw_r2r(mm1,mm0); packssdw_r2r(mm3,mm2); psubw_r2r(mm7,mm0); psubw_r2r(mm7,mm2); packsswb_r2r(mm2, mm0); paddb_r2r(mm6, mm0); movq_r2m(mm0,dst[0]); src+=8; dst+=8; } emms(); }
static __inline__ void sum_sumsq_8bytes( uint8_t *cur_lum_mb, uint8_t *pred_lum_mb, mmx_t *sumtop_accs, mmx_t *sumbot_accs, mmx_t *sumsqtop_accs, mmx_t *sumsqbot_accs, mmx_t *sumxprod_accs ) { pxor_r2r(mm0,mm0); /* Load pixels from top field into mm1.w,mm2.w */ movq_m2r( *((mmx_t*)cur_lum_mb), mm1 ); movq_m2r( *((mmx_t*)pred_lum_mb), mm2 ); /* mm3 := mm1 mm4 := mm2 mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3] */ movq_r2r( mm1, mm3 ); punpcklbw_r2r( mm0, mm1 ); movq_r2r( mm2, mm4 ); punpcklbw_r2r( mm0, mm2 ); psubw_r2r( mm2, mm1 ); /* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7] */ punpckhbw_r2r( mm0, mm3 ); punpckhbw_r2r( mm0, mm4 ); psubw_r2r( mm4, mm3 ); /* sumtop_accs->w[0..3] += mm1.w[0..3]; sumtop_accs->w[0..3] += mm3.w[0..3]; mm6 = mm1; mm7 = mm3; */ movq_m2r( *sumtop_accs, mm5 ); paddw_r2r( mm1, mm5 ); paddw_r2r( mm3, mm5 ); movq_r2r( mm1, mm6 ); movq_r2r( mm3, mm7 ); movq_r2m( mm5, *sumtop_accs ); /* *sumsq_top_acc += mm1.w[0..3] * mm1.w[0..3]; *sumsq_top_acc += mm3.w[0..3] * mm3.w[0..3]; */ pmaddwd_r2r( mm1, mm1 ); movq_m2r( *sumsqtop_accs, mm5 ); pmaddwd_r2r( mm3, mm3 ); paddd_r2r( mm1, mm5 ); paddd_r2r( mm3, mm5 ); movq_r2m( mm5, *sumsqtop_accs ); /* Load pixels from bot field into mm1.w,mm2.w */ movq_m2r( *((mmx_t*)(cur_lum_mb+opt->phy_width)), mm1 ); movq_m2r( *((mmx_t*)(pred_lum_mb+opt->phy_width)), mm2 ); /* mm2 := mm1 mm4 := mm2 mm1.w[0..3] := mm1.b[0..3]-mm2.b[0..3] */ movq_r2r( mm1, mm3 ); punpcklbw_r2r( mm0, mm1 ); movq_r2r( mm2, mm4 ); punpcklbw_r2r( mm0, mm2 ); psubw_r2r( mm2, mm1 ); /* mm3.w[0..3] := mm3.b[4..7]-mm4.b[4..7] */ punpckhbw_r2r( mm0, mm3 ); punpckhbw_r2r( mm0, mm4 ); psubw_r2r( mm4, mm3 ); /* sumbot_accs->w[0..3] += mm1.w[0..3]; sumbot_accs->w[0..3] += mm3.w[0..3]; mm2 := mm1; mm4 := mm3; */ movq_m2r( *sumbot_accs, mm5 ); paddw_r2r( mm1, mm5 ); movq_r2r( mm1, mm2 ); paddw_r2r( mm3, mm5 ); movq_r2r( mm3, mm4 ); movq_r2m( mm5, *sumbot_accs ); /* *sumsqbot_acc += mm1.w[0..3] * mm1.w[0..3]; *sumsqbot_acc += mm3.w[0..3] * mm3.w[0..3]; */ pmaddwd_r2r( mm1, mm1 ); movq_m2r( *sumsqbot_accs, mm5 ); pmaddwd_r2r( mm3, mm3 ); paddd_r2r( mm1, mm5 ); paddd_r2r( mm3, mm5 ); movq_r2m( mm5, *sumsqbot_accs ); /* Accumulate cross-product *sum_xprod_acc += mm1.w[0..3] * mm6[0..3]; *sum_xprod_acc += mm3.w[0..3] * mm7[0..3]; */ movq_m2r( *sumxprod_accs, mm5 ); pmaddwd_r2r( mm6, mm2); pmaddwd_r2r( mm7, mm4); paddd_r2r( mm2, mm5 ); paddd_r2r( mm4, mm5 ); movq_r2m( mm5, *sumxprod_accs ); emms(); }
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h) { uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc; int s, s1, s2; pfa = pf + hxf; pfb = pf + lx * hyf; pfc = pfb + hxf; pba = pb + hxb; pbb = pb + lx * hyb; pbc = pbb + hxb; s = 0; /* the accumulator */ if (h > 0) { pxor_r2r(mm7, mm7); pxor_r2r(mm6, mm6); pcmpeqw_r2r(mm5, mm5); psubw_r2r(mm5, mm6); psllw_i2r(1, mm6); do { BSAD_LOAD(pf[0],mm0,mm1); BSAD_LOAD_ACC(pfa[0],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pfb[0],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pfc[0],mm2,mm3,mm0,mm1); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psrlw_i2r(2, mm0); psrlw_i2r(2, mm1); BSAD_LOAD(pb[0],mm2,mm3); BSAD_LOAD_ACC(pba[0],mm4,mm5,mm2,mm3); BSAD_LOAD_ACC(pbb[0],mm4,mm5,mm2,mm3); BSAD_LOAD_ACC(pbc[0],mm4,mm5,mm2,mm3); paddw_r2r(mm6, mm2); paddw_r2r(mm6, mm3); psrlw_i2r(2, mm2); psrlw_i2r(2, mm3); paddw_r2r(mm2, mm0); paddw_r2r(mm3, mm1); psrlw_i2r(1, mm6); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psllw_i2r(1, mm6); psrlw_i2r(1, mm0); psrlw_i2r(1, mm1); packuswb_r2r(mm1, mm0); movq_m2r(p2[0], mm1); movq_r2r(mm0, mm2); psubusb_r2r(mm1, mm0); psubusb_r2r(mm2, mm1); por_r2r(mm1, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm7, mm0); punpckhbw_r2r(mm7, mm1); paddw_r2r(mm1, mm0); movq_r2r(mm0, mm1); punpcklwd_r2r(mm7, mm0); punpckhwd_r2r(mm7, mm1); paddd_r2r(mm1, mm0); movd_r2g(mm0, s1); psrlq_i2r(32, mm0); movd_r2g(mm0, s2); s += s1 + s2; BSAD_LOAD(pf[8],mm0,mm1); BSAD_LOAD_ACC(pfa[8],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pfb[8],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pfc[8],mm2,mm3,mm0,mm1); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psrlw_i2r(2, mm0); psrlw_i2r(2, mm1); BSAD_LOAD(pb[8],mm2,mm3); BSAD_LOAD_ACC(pba[8],mm4,mm5,mm2,mm3); BSAD_LOAD_ACC(pbb[8],mm4,mm5,mm2,mm3); BSAD_LOAD_ACC(pbc[8],mm4,mm5,mm2,mm3); paddw_r2r(mm6, mm2); paddw_r2r(mm6, mm3); psrlw_i2r(2, mm2); psrlw_i2r(2, mm3); paddw_r2r(mm2, mm0); paddw_r2r(mm3, mm1); psrlw_i2r(1, mm6); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psllw_i2r(1, mm6); psrlw_i2r(1, mm0); psrlw_i2r(1, mm1); packuswb_r2r(mm1, mm0); movq_m2r(p2[8], mm1); movq_r2r(mm0, mm2); psubusb_r2r(mm1, mm0); psubusb_r2r(mm2, mm1); por_r2r(mm1, mm0); movq_r2r(mm0, mm1); punpcklbw_r2r(mm7, mm0); punpckhbw_r2r(mm7, mm1); paddw_r2r(mm1, mm0); movq_r2r(mm0, mm1); punpcklwd_r2r(mm7, mm0); punpckhwd_r2r(mm7, mm1); paddd_r2r(mm1, mm0); movd_r2g(mm0, s1); psrlq_i2r(32, mm0); movd_r2g(mm0, s2); s += s1 + s2; p2 += lx; pf += lx; pfa += lx; pfb += lx; pfc += lx; pb += lx; pba += lx; pbb += lx; pbc += lx; h--; } while (h > 0); } emms(); return s; }
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc, int width, int height ) { int Line; long long* YVal1; long long* YVal2; long long* YVal3; long long* Dest; uint8_t* pEvenLines = psrc; uint8_t* pOddLines = psrc+width; int LineLength = width; int Pitch = width * 2; int IsOdd = 1; long EdgeDetect = 625; long JaggieThreshold = 73; int n; unsigned long long qwEdgeDetect; unsigned long long qwThreshold; const unsigned long long Mask = 0xfefefefefefefefeULL; const unsigned long long YMask = 0x00ff00ff00ff00ffULL; qwEdgeDetect = EdgeDetect; qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16); qwThreshold = JaggieThreshold; qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16); // copy first even line no matter what, and the first odd line if we're // processing an odd field. ac_memcpy(pdst, pEvenLines, LineLength); if (IsOdd) ac_memcpy(pdst + LineLength, pOddLines, LineLength); height = height / 2; for (Line = 0; Line < height - 1; ++Line) { if (IsOdd) { YVal1 = (long long *)(pOddLines + Line * Pitch); YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch); YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch); Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength); } else { YVal1 = (long long *)(pEvenLines + Line * Pitch); YVal2 = (long long *)(pOddLines + Line * Pitch); YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch); Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength); } // For ease of reading, the comments below assume that we're operating on an odd // field (i.e., that bIsOdd is true). The exact same processing is done when we // operate on an even field, but the roles of the odd and even fields are reversed. // It's just too cumbersome to explain the algorithm in terms of "the next odd // line if we're doing an odd field, or the next even line if we're doing an // even field" etc. So wherever you see "odd" or "even" below, keep in mind that // half the time this function is called, those words' meanings will invert. // Copy the odd line to the overlay verbatim. ac_memcpy((char *)Dest + LineLength, YVal3, LineLength); n = LineLength >> 3; while( n-- ) { movq_m2r (*YVal1++, mm0); movq_m2r (*YVal2++, mm1); movq_m2r (*YVal3++, mm2); // get intensities in mm3 - 4 movq_r2r ( mm0, mm3 ); movq_r2r ( mm1, mm4 ); movq_r2r ( mm2, mm5 ); pand_m2r ( *&YMask, mm3 ); pand_m2r ( *&YMask, mm4 ); pand_m2r ( *&YMask, mm5 ); // get average in mm0 pand_m2r ( *&Mask, mm0 ); pand_m2r ( *&Mask, mm2 ); psrlw_i2r ( 01, mm0 ); psrlw_i2r ( 01, mm2 ); paddw_r2r ( mm2, mm0 ); // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12 // result will be in mm6 psrlw_i2r ( 01, mm3 ); psrlw_i2r ( 01, mm4 ); psrlw_i2r ( 01, mm5 ); movq_r2r ( mm3, mm6 ); psubw_r2r ( mm4, mm6 ); //mm6 = O1 - E movq_r2r ( mm5, mm7 ); psubw_r2r ( mm4, mm7 ); //mm7 = O2 - E pmullw_r2r ( mm7, mm6 ); // mm6 = (O1 - E) * (O2 - E) movq_r2r ( mm3, mm7 ); psubw_r2r ( mm5, mm7 ); // mm7 = (O1 - O2) pmullw_r2r ( mm7, mm7 ); // mm7 = (O1 - O2) ^ 2 psrlw_i2r ( 12, mm7 ); // mm7 = (O1 - O2) ^ 2 >> 12 pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12 psubw_r2r ( mm7, mm6 ); // mm6 is what we want pcmpgtw_m2r ( *&qwThreshold, mm6 ); movq_r2r ( mm6, mm7 ); pand_r2r ( mm6, mm0 ); pandn_r2r ( mm1, mm7 ); por_r2r ( mm0, mm7 ); movq_r2m ( mm7, *Dest++ ); } } // Copy last odd line if we're processing an even field. if (! IsOdd) { ac_memcpy(pdst + (height * 2 - 1) * LineLength, pOddLines + (height - 1) * Pitch, LineLength); } // clear out the MMX registers ready for doing floating point // again emms(); }
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src ) { int y, x; int32_t ff, fr; int fc; /* Detect interlacing */ fc = 0; pxor_r2r( mm7, mm7 ); for( y = 0; y < 9; y += 2 ) { ff = fr = 0; pxor_r2r( mm5, mm5 ); pxor_r2r( mm6, mm6 ); for( x = 0; x < 8; x+=4 ) { movd_m2r( src[ x], mm0 ); movd_m2r( src[1*i_src+x], mm1 ); movd_m2r( src[2*i_src+x], mm2 ); movd_m2r( src[3*i_src+x], mm3 ); punpcklbw_r2r( mm7, mm0 ); punpcklbw_r2r( mm7, mm1 ); punpcklbw_r2r( mm7, mm2 ); punpcklbw_r2r( mm7, mm3 ); movq_r2r( mm0, mm4 ); psubw_r2r( mm1, mm0 ); psubw_r2r( mm2, mm4 ); psubw_r2r( mm1, mm2 ); psubw_r2r( mm1, mm3 ); pmaddwd_r2r( mm0, mm0 ); pmaddwd_r2r( mm4, mm4 ); pmaddwd_r2r( mm2, mm2 ); pmaddwd_r2r( mm3, mm3 ); paddd_r2r( mm0, mm2 ); paddd_r2r( mm4, mm3 ); paddd_r2r( mm2, mm5 ); paddd_r2r( mm3, mm6 ); } movq_r2r( mm5, mm0 ); psrlq_i2r( 32, mm0 ); paddd_r2r( mm0, mm5 ); movd_r2m( mm5, fr ); movq_r2r( mm6, mm0 ); psrlq_i2r( 32, mm0 ); paddd_r2r( mm0, mm6 ); movd_r2m( mm6, ff ); if( ff < 6*fr/8 && fr > 32 ) fc++; src += 2*i_src; } return fc; }