/* For a 16*h block, this computes (((((*pf + *pf2 + 1)>>1) + ((*pb + *pb2 + 1)>>1) + 1)>>1) + *p2 + 1)>>1 */ static int bsad_0quad_mmxe(uint8_t *pf,uint8_t *pf2,uint8_t *pb,uint8_t *pb2,uint8_t *p2,int lx,int h) { int32_t s=0; pxor_r2r(mm7, mm7); do { movq_m2r(pf2[0],mm0); movq_m2r(pf2[8],mm2); movq_m2r(pb2[0],mm1); movq_m2r(pb2[8],mm3); pavgb_m2r(pf[0],mm0); pavgb_m2r(pf[8],mm2); pavgb_m2r(pb[0],mm1); pavgb_m2r(pb[8],mm3); pavgb_r2r(mm1,mm0); pavgb_r2r(mm3,mm2); psadbw_m2r(p2[0],mm0); psadbw_m2r(p2[8],mm2); paddd_r2r(mm0,mm7); paddd_r2r(mm2,mm7); pf+=lx; pf2+=lx; pb+=lx; pb2+=lx; p2+=lx; h--; } while (h); movd_r2g(mm7,s); emms(); return s; }
VLC_MMX static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { assert( p_pic_top->i_planes == p_pic_bot->i_planes ); /* Amount of bits must be known for MMX, thus int32_t. Doesn't hurt the C implementation. */ int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */ int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */ pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */ for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); const int wm8 = w % 8; /* remainder */ const int w8 = w - wm8; /* part of width that is divisible by 8 */ /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ int x = 0; /* Easy-to-read C version further below. Assumptions: 0 < T < 127 # of pixels < (2^32)/255 Note: calculates score * 255 */ static alignas (8) const mmx_t b0 = { .uq = 0x0000000000000000ULL }; static alignas (8) const mmx_t b128 = { .uq = 0x8080808080808080ULL }; static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; for( ; x < w8; x += 8 ) { movq_m2r( *((int64_t*)p_c), mm0 ); movq_m2r( *((int64_t*)p_p), mm1 ); movq_m2r( *((int64_t*)p_n), mm2 ); psubb_m2r( b128, mm0 ); psubb_m2r( b128, mm1 ); psubb_m2r( b128, mm2 ); psubsb_r2r( mm0, mm1 ); psubsb_r2r( mm0, mm2 ); pxor_r2r( mm3, mm3 ); pxor_r2r( mm4, mm4 ); pxor_r2r( mm5, mm5 ); pxor_r2r( mm6, mm6 ); punpcklbw_r2r( mm1, mm3 ); punpcklbw_r2r( mm2, mm4 ); punpckhbw_r2r( mm1, mm5 ); punpckhbw_r2r( mm2, mm6 ); pmulhw_r2r( mm3, mm4 ); pmulhw_r2r( mm5, mm6 ); packsswb_r2r(mm4, mm6); pcmpgtb_m2r( bT, mm6 ); psadbw_m2r( b0, mm6 ); paddd_r2r( mm6, mm7 ); p_c += 8; p_p += 8; p_n += 8; } for( ; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. */ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score_c; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } movd_r2m( mm7, i_score_mmx ); emms(); return i_score_mmx/255 + i_score_c; } #endif /* See header for function doc. */ int CalculateInterlaceScore( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { /* We use the comb metric from the IVTC filter of Transcode 1.1.5. This was found to work better for the particular purpose of IVTC than RenderX()'s comb metric. Note that we *must not* subsample at all in order to catch interlacing in telecined frames with localized motion (e.g. anime with characters talking, where only mouths move and everything else stays still.) */ assert( p_pic_top != NULL ); assert( p_pic_bot != NULL ); if( p_pic_top->i_planes != p_pic_bot->i_planes ) return -1; #ifdef CAN_COMPILE_MMXEXT if (vlc_CPU_MMXEXT()) return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot ); #endif int32_t i_score = 0; for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ for( int x = 0; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. */ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } return i_score; }
static int bsad_1quad_mmxe(uint8_t *pf, uint8_t *pb, uint8_t *pb2, uint8_t *p2, int lx, int h) { int s; s = 0; /* the accumulator */ if (h > 0) { pcmpeqw_r2r(mm6, mm6); psrlw_i2r(15, mm6); paddw_r2r(mm6, mm6); pxor_r2r(mm7, mm7); pxor_r2r(mm5, mm5); do { BSAD_LOAD(pf[0],mm0,mm1); BSAD_LOAD_ACC(pf[1],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx+1],mm2,mm3,mm0,mm1); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psrlw_i2r(2, mm0); psrlw_i2r(2, mm1); packuswb_r2r(mm1, mm0); movq_m2r(pb2[0],mm1); pavgb_m2r(pb[0],mm1); pavgb_r2r(mm1, mm0); psadbw_m2r(p2[0],mm0); paddd_r2r(mm0,mm5); BSAD_LOAD(pf[8],mm0,mm1); BSAD_LOAD_ACC(pf[9],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx+8],mm2,mm3,mm0,mm1); BSAD_LOAD_ACC(pf[lx+9],mm2,mm3,mm0,mm1); paddw_r2r(mm6, mm0); paddw_r2r(mm6, mm1); psrlw_i2r(2, mm0); psrlw_i2r(2, mm1); packuswb_r2r(mm1, mm0); movq_m2r(pb2[8],mm1); pavgb_m2r(pb[8],mm1); pavgb_r2r(mm1, mm0); psadbw_m2r(p2[8],mm0); paddd_r2r(mm0,mm5); p2 += lx; pf += lx; pb += lx; pb2 += lx; h--; } while (h > 0); } movd_r2g(mm5,s); emms(); return s; }
//VLC_MMX // sunqueen delete static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { assert( p_pic_top->i_planes == p_pic_bot->i_planes ); /* Amount of bits must be known for MMX, thus int32_t. Doesn't hurt the C implementation. */ int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */ int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */ pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */ for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); const int wm8 = w % 8; /* remainder */ const int w8 = w - wm8; /* part of width that is divisible by 8 */ /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ int64_t i_p_c, i_p_p, i_p_n; // sunqueen add int x = 0; /* Easy-to-read C version further below. Assumptions: 0 < T < 127 # of pixels < (2^32)/255 Note: calculates score * 255 */ // static const mmx_t b0 = { .uq = 0x0000000000000000ULL }; __declspec(align(8)) static const mmx_t b0 = { /*.uq =*/ 0x0000000000000000ULL }; // sunqueen modify // static const mmx_t b128 = { .uq = 0x8080808080808080ULL }; __declspec(align(8)) static const mmx_t b128 = { /*.uq =*/ 0x8080808080808080ULL }; // sunqueen modify // static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; __declspec(align(8)) static const mmx_t bT = { 0x6464646464646464ULL }; // sunqueen modify for( ; x < w8; x += 8 ) { // sunqueen add start i_p_c = *((int64_t*)p_c); movq_m2r( i_p_c, mm0 ); i_p_p = *((int64_t*)p_p); movq_m2r( i_p_p, mm1 ); i_p_n = *((int64_t*)p_n); movq_m2r( i_p_n, mm2 ); // sunqueen add end #if 0 // sunqueen delete start movq_m2r( *((int64_t*)p_c), mm0 ); movq_m2r( *((int64_t*)p_p), mm1 ); movq_m2r( *((int64_t*)p_n), mm2 ); #endif // sunqueen delete end psubb_m2r( b128, mm0 ); psubb_m2r( b128, mm1 ); psubb_m2r( b128, mm2 ); psubsb_r2r( mm0, mm1 ); psubsb_r2r( mm0, mm2 ); pxor_r2r( mm3, mm3 ); pxor_r2r( mm4, mm4 ); pxor_r2r( mm5, mm5 ); pxor_r2r( mm6, mm6 ); punpcklbw_r2r( mm1, mm3 ); punpcklbw_r2r( mm2, mm4 ); punpckhbw_r2r( mm1, mm5 ); punpckhbw_r2r( mm2, mm6 ); pmulhw_r2r( mm3, mm4 ); pmulhw_r2r( mm5, mm6 ); packsswb_r2r(mm4, mm6); pcmpgtb_m2r( bT, mm6 ); psadbw_m2r( b0, mm6 ); paddd_r2r( mm6, mm7 ); p_c += 8; p_p += 8; p_n += 8; } for( ; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. */ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score_c; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } movd_r2m( mm7, i_score_mmx ); emms(); return i_score_mmx/255 + i_score_c; }