/*
 * MC_avg4_16: for each of `height` rows, combine the four reference samples
 * at ref, ref+1, ref+stride and ref+stride+1 into one value per byte, average
 * that value with what is already stored in dest, and write it back to dest.
 *
 * Every row is processed as two 8-byte halves (byte offsets 0 and 8), so each
 * iteration touches ref[0..16] and ref[stride..stride+16]. ref and dest both
 * advance by `stride` per row.
 *
 * The loop is do { } while (--height): the body always runs once and a
 * height of 0 on entry would not stop after zero rows.
 *
 * `cpu` is not referenced directly in this body.
 * NOTE(review): presumably the pavg_r2r macro reads it - confirm against the
 * macro definition.
 * NOTE(review): the register comments below assume the usual mmx.h naming,
 * op_m2r (mem, reg) / op_r2r (src, dst) / op_r2m (reg, mem), and that
 * mask_one holds 0x01 in every byte - confirm.
 */
static inline void MC_avg4_16 (int height, uint8_t * dest, const uint8_t * ref, const int stride, const int cpu)
{
    do {
        /* ---- left half: bytes 0..7 ---- */
        movq_m2r (*ref, mm0);               /* mm0 = a = ref[0] */
        movq_m2r (*(ref+stride+1), mm1);    /* mm1 = d = ref[stride+1] */
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+1), mm2);           /* mm2 = b = ref[1] */
        pxor_r2r (mm1, mm7);                /* mm7 = a ^ d */
        movq_m2r (*(ref+stride), mm3);      /* mm3 = c = ref[stride] */
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);                /* mm6 = b ^ c */
        pavg_r2r (mm1, mm0);                /* mm0 = avg (a, d) */
        pavg_r2r (mm3, mm2);                /* mm2 = avg (b, c) */
        por_r2r (mm6, mm7);                 /* mm7 = (a ^ d) | (b ^ c) */
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);                /* mm6 = avg (a, d) ^ avg (b, c) */
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);           /* mm7 = per-byte correction term */
        pavg_r2r (mm2, mm0);                /* mm0 = avg of the two averages */
        psubusb_r2r (mm7, mm0);             /* subtract the correction (saturating) */
        movq_m2r (*dest, mm1);              /* mm1 = current dest contents */
        pavg_r2r (mm1, mm0);                /* blend with dest */
        movq_r2m (mm0, *dest);

        /* ---- right half: bytes 8..15, same sequence at offset +8 ---- */
        movq_m2r (*(ref+8), mm0);
        movq_m2r (*(ref+stride+9), mm1);
        movq_r2r (mm0, mm7);
        movq_m2r (*(ref+9), mm2);
        pxor_r2r (mm1, mm7);
        movq_m2r (*(ref+stride+8), mm3);
        movq_r2r (mm2, mm6);
        pxor_r2r (mm3, mm6);
        pavg_r2r (mm1, mm0);
        pavg_r2r (mm3, mm2);
        por_r2r (mm6, mm7);
        movq_r2r (mm0, mm6);
        pxor_r2r (mm2, mm6);
        pand_r2r (mm6, mm7);
        pand_m2r (mask_one, mm7);
        pavg_r2r (mm2, mm0);
        psubusb_r2r (mm7, mm0);
        movq_m2r (*(dest+8), mm1);
        pavg_r2r (mm1, mm0);
        ref += stride;                      /* advance before the store; the store uses dest only */
        movq_r2m (mm0, *(dest+8));
        dest += stride;
    } while (--height);
}
/*
 * MC_put4_8: for each of `height` rows, combine the four reference samples
 * at ref, ref+1, ref+stride and ref+stride+1 into one value per byte and
 * store 8 bytes to dest (dest is overwritten, not blended).
 *
 * The horizontal average and XOR of a row are computed once and carried to
 * the next iteration in mm0 / mm7, so the prologue handles the first row and
 * the loop reads one new row per iteration; height+1 rows of ref are read in
 * total, 9 bytes each.
 *
 * The loop is do { } while (--height): the body always runs once and a
 * height of 0 on entry would not stop after zero rows.
 *
 * `cpu` is not referenced directly in this body.
 * NOTE(review): presumably the pavg_r2r macro reads it - confirm against the
 * macro definition.
 * NOTE(review): the register comments below assume the usual mmx.h naming,
 * op_m2r (mem, reg) / op_r2r (src, dst) / op_r2m (reg, mem), and that
 * mask_one holds 0x01 in every byte - confirm.
 */
static inline void MC_put4_8 (int height, uint8_t * dest, const uint8_t * ref, const int stride, const int cpu)
{
    /* Prologue: horizontal pair of the first row. */
    movq_m2r (*ref, mm0);
    movq_m2r (*(ref+1), mm1);
    movq_r2r (mm0, mm7);
    pxor_r2r (mm1, mm7);                /* mm7 = ref[0] ^ ref[1] (previous row) */
    pavg_r2r (mm1, mm0);                /* mm0 = avg (ref[0], ref[1]) (previous row) */
    ref += stride;

    do {
        movq_m2r (*ref, mm2);
        movq_r2r (mm0, mm5);            /* mm5 = previous row average */

        movq_m2r (*(ref+1), mm3);
        movq_r2r (mm2, mm6);

        pxor_r2r (mm3, mm6);            /* mm6 = XOR of the current row pair */
        pavg_r2r (mm3, mm2);            /* mm2 = average of the current row pair */

        por_r2r (mm6, mm7);             /* mm7 = previous XOR | current XOR */
        pxor_r2r (mm2, mm5);            /* mm5 = previous average ^ current average */

        pand_r2r (mm5, mm7);
        pavg_r2r (mm2, mm0);            /* mm0 = avg of the two row averages */

        pand_m2r (mask_one, mm7);       /* mm7 = per-byte correction term */

        psubusb_r2r (mm7, mm0);         /* subtract the correction (saturating) */

        ref += stride;
        movq_r2m (mm0, *dest);
        dest += stride;

        /* Carry the current row's XOR and average into the next iteration. */
        movq_r2r (mm6, mm7); /* unroll ! */
        movq_r2r (mm2, mm0); /* unroll ! */
    } while (--height);
}
VLC_MMX static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c, int i_pitch_prev, int i_pitch_curr, int* pi_top, int* pi_bot ) { int32_t i_motion = 0; int32_t i_top_motion = 0; int32_t i_bot_motion = 0; static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ movq_m2r( bT, mm5 ); pxor_r2r( mm3, mm3 ); /* score (top field) */ pxor_r2r( mm4, mm4 ); /* score (bottom field) */ for( int y = 0; y < 8; y+=2 ) { /* top field */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm3 ); /* add to top field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; /* bottom field - handling identical to top field, except... */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; } movq_r2r( mm3, mm7 ); /* score (total) */ paddd_r2r( mm4, mm7 ); movd_r2m( mm3, i_top_motion ); movd_r2m( mm4, i_bot_motion ); movd_r2m( mm7, i_motion ); /* The loop counts actual score * 255. */ i_top_motion /= 255; i_bot_motion /= 255; i_motion /= 255; emms(); (*pi_top) = ( i_top_motion >= 8 ); (*pi_bot) = ( i_bot_motion >= 8 ); return (i_motion >= 8); }
/*
 * bsad_mmx: sum of absolute differences between the block at p2 and a
 * prediction built from two reference blocks (pf and pb), over a region
 * 16 bytes wide and h rows high.
 *
 * For each reference, four samples are summed per pixel: the base pointer,
 * base + hx, base + lx * hy and base + lx * hy + hx. The sum is rounded
 * (+2) and shifted right by 2. The two per-reference results are then added,
 * rounded (+1) and shifted right by 1, packed back to bytes and compared
 * against p2.
 *
 * NOTE(review): hxf/hyf/hxb/hyb look like 0/1 half-pel flags for the forward
 * and backward references - confirm against the callers.
 * NOTE(review): BSAD_LOAD / BSAD_LOAD_ACC are defined elsewhere; the comments
 * assume BSAD_LOAD unpacks 8 bytes into two registers of 16-bit words and
 * BSAD_LOAD_ACC does the same via two scratch registers and adds the result
 * to the last two registers - confirm.
 *
 * lx is the row stride applied to every pointer. Returns the accumulated
 * sum; when h <= 0 nothing is accumulated and 0 is returned. emms() is
 * executed on every path.
 */
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h)
{
  uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
  int s, s1, s2;

  /* The three additional sample positions for the first reference. */
  pfa = pf + hxf;
  pfb = pf + lx * hyf;
  pfc = pfb + hxf;

  /* The three additional sample positions for the second reference. */
  pba = pb + hxb;
  pbb = pb + lx * hyb;
  pbc = pbb + hxb;

  s = 0; /* the accumulator */

  if (h > 0)
  {
    pxor_r2r(mm7, mm7);       /* mm7 = 0, used for unpacking */
    pxor_r2r(mm6, mm6);
    pcmpeqw_r2r(mm5, mm5);    /* mm5 = all ones (-1 in every word) */
    psubw_r2r(mm5, mm6);      /* mm6 = 0 - (-1) = 1 in every word */
    psllw_i2r(1, mm6);        /* mm6 = 2 in every word: rounding term for >> 2 */

    do {
      /* ---- bytes 0..7 of the row ---- */
      /* first reference: sum of four samples, (sum + 2) >> 2 */
      BSAD_LOAD(pf[0],mm0,mm1);
      BSAD_LOAD_ACC(pfa[0],mm2,mm3,mm0,mm1);
      BSAD_LOAD_ACC(pfb[0],mm2,mm3,mm0,mm1);
      BSAD_LOAD_ACC(pfc[0],mm2,mm3,mm0,mm1);
      paddw_r2r(mm6, mm0);
      paddw_r2r(mm6, mm1);
      psrlw_i2r(2, mm0);
      psrlw_i2r(2, mm1);

      /* second reference: same computation (mm4/mm5 are scratch here) */
      BSAD_LOAD(pb[0],mm2,mm3);
      BSAD_LOAD_ACC(pba[0],mm4,mm5,mm2,mm3);
      BSAD_LOAD_ACC(pbb[0],mm4,mm5,mm2,mm3);
      BSAD_LOAD_ACC(pbc[0],mm4,mm5,mm2,mm3);
      paddw_r2r(mm6, mm2);
      paddw_r2r(mm6, mm3);
      psrlw_i2r(2, mm2);
      psrlw_i2r(2, mm3);

      /* combine both references: (a + b + 1) >> 1 */
      paddw_r2r(mm2, mm0);
      paddw_r2r(mm3, mm1);
      psrlw_i2r(1, mm6);      /* temporarily turn the rounding term 2 into 1 */
      paddw_r2r(mm6, mm0);
      paddw_r2r(mm6, mm1);
      psllw_i2r(1, mm6);      /* restore the rounding term to 2 */
      psrlw_i2r(1, mm0);
      psrlw_i2r(1, mm1);
      packuswb_r2r(mm1, mm0); /* back to 8 bytes */

      /* absolute difference against p2: sat(a - b) | sat(b - a) */
      movq_m2r(p2[0], mm1);
      movq_r2r(mm0, mm2);
      psubusb_r2r(mm1, mm0);
      psubusb_r2r(mm2, mm1);
      por_r2r(mm1, mm0);
      /* horizontal sum: bytes -> words -> dwords */
      movq_r2r(mm0, mm1);
      punpcklbw_r2r(mm7, mm0);
      punpckhbw_r2r(mm7, mm1);
      paddw_r2r(mm1, mm0);
      movq_r2r(mm0, mm1);
      punpcklwd_r2r(mm7, mm0);
      punpckhwd_r2r(mm7, mm1);

      paddd_r2r(mm1, mm0);
      movd_r2g(mm0, s1);      /* low dword */
      psrlq_i2r(32, mm0);
      movd_r2g(mm0, s2);      /* high dword */
      s += s1 + s2;

      /* ---- bytes 8..15 of the row: same sequence at offset 8 ---- */
      BSAD_LOAD(pf[8],mm0,mm1);
      BSAD_LOAD_ACC(pfa[8],mm2,mm3,mm0,mm1);
      BSAD_LOAD_ACC(pfb[8],mm2,mm3,mm0,mm1);
      BSAD_LOAD_ACC(pfc[8],mm2,mm3,mm0,mm1);
      paddw_r2r(mm6, mm0);
      paddw_r2r(mm6, mm1);
      psrlw_i2r(2, mm0);
      psrlw_i2r(2, mm1);

      BSAD_LOAD(pb[8],mm2,mm3);
      BSAD_LOAD_ACC(pba[8],mm4,mm5,mm2,mm3);
      BSAD_LOAD_ACC(pbb[8],mm4,mm5,mm2,mm3);
      BSAD_LOAD_ACC(pbc[8],mm4,mm5,mm2,mm3);
      paddw_r2r(mm6, mm2);
      paddw_r2r(mm6, mm3);
      psrlw_i2r(2, mm2);
      psrlw_i2r(2, mm3);

      paddw_r2r(mm2, mm0);
      paddw_r2r(mm3, mm1);
      psrlw_i2r(1, mm6);
      paddw_r2r(mm6, mm0);
      paddw_r2r(mm6, mm1);
      psllw_i2r(1, mm6);
      psrlw_i2r(1, mm0);
      psrlw_i2r(1, mm1);
      packuswb_r2r(mm1, mm0);

      movq_m2r(p2[8], mm1);
      movq_r2r(mm0, mm2);
      psubusb_r2r(mm1, mm0);
      psubusb_r2r(mm2, mm1);
      por_r2r(mm1, mm0);
      movq_r2r(mm0, mm1);
      punpcklbw_r2r(mm7, mm0);
      punpckhbw_r2r(mm7, mm1);
      paddw_r2r(mm1, mm0);
      movq_r2r(mm0, mm1);
      punpcklwd_r2r(mm7, mm0);
      punpckhwd_r2r(mm7, mm1);

      paddd_r2r(mm1, mm0);
      movd_r2g(mm0, s1);
      psrlq_i2r(32, mm0);
      movd_r2g(mm0, s2);
      s += s1 + s2;

      /* next row for every pointer */
      p2 += lx;
      pf += lx;
      pfa += lx;
      pfb += lx;
      pfc += lx;
      pb += lx;
      pba += lx;
      pbb += lx;
      pbc += lx;

      h--;
    } while (h > 0);
  }

  emms();

  return s;
}
//VLC_MMX // sunqueen delete static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c, int i_pitch_prev, int i_pitch_curr, int* pi_top, int* pi_bot ) { int32_t i_motion = 0; int32_t i_top_motion = 0; int32_t i_bot_motion = 0; uint64_t ui_pix_c, ui_pix_p; // sunqueen add // static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; __declspec(align(8)) static const mmx_t bT = { 0x0A0A0A0A0A0A0A0AULL }; // sunqueen modify pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ movq_m2r( bT, mm5 ); pxor_r2r( mm3, mm3 ); /* score (top field) */ pxor_r2r( mm4, mm4 ); /* score (bottom field) */ for( int y = 0; y < 8; y+=2 ) { /* top field */ // sunqueen add start ui_pix_c = *((uint64_t*)p_pix_c); movq_m2r( ui_pix_c, mm0 ); ui_pix_p = *((uint64_t*)p_pix_p); movq_m2r( ui_pix_p, mm1 ); // sunqueen add end #if 0 // sunqueen delete start movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); #endif // sunqueen delete end movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm3 ); /* add to top field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; /* bottom field - handling identical to top field, except... 
*/ // sunqueen add start ui_pix_c = *((uint64_t*)p_pix_c); movq_m2r( ui_pix_c, mm0 ); ui_pix_p = *((uint64_t*)p_pix_p); movq_m2r( ui_pix_p, mm1 ); // sunqueen add end #if 0 // sunqueen delete start movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); #endif // sunqueen delete end movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; } movq_r2r( mm3, mm7 ); /* score (total) */ paddd_r2r( mm4, mm7 ); movd_r2m( mm3, i_top_motion ); movd_r2m( mm4, i_bot_motion ); movd_r2m( mm7, i_motion ); /* The loop counts actual score * 255. */ i_top_motion /= 255; i_bot_motion /= 255; i_motion /= 255; emms(); (*pi_top) = ( i_top_motion >= 8 ); (*pi_bot) = ( i_bot_motion >= 8 ); return (i_motion >= 8); }
static void deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL * self, const guint8 * m0, const guint8 * t1, const guint8 * b1, const guint8 * m2, guint8 * output, gint width) { mmx_t MaxComb; // How badly do we let it weave? 0-255 MaxComb.ub[0] = self->max_comb; MaxComb.ub[1] = self->max_comb; MaxComb.ub[2] = self->max_comb; MaxComb.ub[3] = self->max_comb; MaxComb.ub[4] = self->max_comb; MaxComb.ub[5] = self->max_comb; MaxComb.ub[6] = self->max_comb; MaxComb.ub[7] = self->max_comb; // L2 == m0 // L1 == t1 // L3 == b1 // LP2 == m2 movq_m2r (MaxComb, mm6); for (; width > 7; width -= 8) { movq_m2r (*t1, mm1); // L1 movq_m2r (*m0, mm2); // L2 movq_m2r (*b1, mm3); // L3 movq_m2r (*m2, mm0); // LP2 // average L1 and L3 leave result in mm4 movq_r2r (mm1, mm4); // L1 pavgb_r2r (mm3, mm4); // (L1 + L3)/2 // get abs value of possible L2 comb movq_r2r (mm2, mm7); // L2 psubusb_r2r (mm4, mm7); // L2 - avg movq_r2r (mm4, mm5); // avg psubusb_r2r (mm2, mm5); // avg - L2 por_r2r (mm7, mm5); // abs(avg-L2) // get abs value of possible LP2 comb movq_r2r (mm0, mm7); // LP2 psubusb_r2r (mm4, mm7); // LP2 - avg psubusb_r2r (mm0, mm4); // avg - LP2 por_r2r (mm7, mm4); // abs(avg-LP2) // use L2 or LP2 depending upon which makes smaller comb psubusb_r2r (mm5, mm4); // see if it goes to zero pxor_r2r (mm5, mm5); // 0 pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0 pcmpeqb_r2r (mm4, mm5); // opposite of mm4 // if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55 pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0 pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0 por_r2r (mm5, mm4); // may the best win // Now lets clip our chosen value to be not outside of the range // of the high/low range L1-L3 by more than abs(L1-L3) // This allows some comb but limits the damages and also allows more // detail than a boring oversmoothed clip. 
movq_r2r (mm1, mm2); // copy L1 pmaxub_r2r (mm3, mm2); // now = Max(L1,L3) pminub_r2r (mm1, mm3); // now = Min(L1,L3) // allow the value to be above the high or below the low by amt of MaxComb paddusb_r2r (mm6, mm2); // increase max by diff psubusb_r2r (mm6, mm3); // lower min by diff pmaxub_r2r (mm3, mm4); // now = Max(best,Min(L1,L3) pminub_r2r (mm4, mm2); // now = Min( Max(best, Min(L1,L3)), L2 )=L2 clipped movq_r2m (mm2, *output); // move in our clipped best // Advance to the next set of pixels. output += 8; m0 += 8; t1 += 8; b1 += 8; m2 += 8; } emms (); if (width > 0) deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width); }
/** * Internal helper function for EstimateNumBlocksWithMotion(): * estimates whether there is motion in the given 8x8 block on one plane * between two images. The block as a whole and its fields are evaluated * separately, and use different motion thresholds. * * This is a low-level function only used by EstimateNumBlocksWithMotion(). * There is no need to call this function manually. * * For interpretation of pi_top and pi_bot, it is assumed that the block * starts on an even-numbered line (belonging to the top field). * * The b_mmx parameter avoids the need to call vlc_CPU() separately * for each block. * * @param[in] p_pix_p Base pointer to the block in previous picture * @param[in] p_pix_c Base pointer to the same block in current picture * @param i_pitch_prev i_pitch of previous picture * @param i_pitch_curr i_pitch of current picture * @param b_mmx (vlc_CPU() & CPU_CAPABILITY_MMXEXT) or false. * @param[out] pi_top 1 if top field of the block had motion, 0 if no * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no * @return 1 if the block had motion, 0 if no * @see EstimateNumBlocksWithMotion() */ static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c, int i_pitch_prev, int i_pitch_curr, bool b_mmx, int* pi_top, int* pi_bot ) { /* Pixel luma/chroma difference threshold to detect motion. */ #define T 10 int32_t i_motion = 0; int32_t i_top_motion = 0; int32_t i_bot_motion = 0; /* See below for the C version to see more quickly what this does. 
*/ #ifdef CAN_COMPILE_MMXEXT if( b_mmx ) { static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ movq_m2r( bT, mm5 ); pxor_r2r( mm3, mm3 ); /* score (top field) */ pxor_r2r( mm4, mm4 ); /* score (bottom field) */ for( int y = 0; y < 8; y+=2 ) { /* top field */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm3 ); /* add to top field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; /* bottom field - handling identical to top field, except... */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; } movq_r2r( mm3, mm7 ); /* score (total) */ paddd_r2r( mm4, mm7 ); movd_r2m( mm3, i_top_motion ); movd_r2m( mm4, i_bot_motion ); movd_r2m( mm7, i_motion ); /* The loop counts actual score * 255. */ i_top_motion /= 255; i_bot_motion /= 255; i_motion /= 255; emms(); } else #endif { for( int y = 0; y < 8; ++y )