/*
 * Horizontal bilinear scaling of one scanline of packed 4 x 8 bit pixels
 * using MMX.
 *
 * For every one of the ctx->dst_size output pixels the source pixel at
 * table_h.pixels[i].index (s1) and its right neighbour (s2) are blended as
 *
 *     out = s2 + factor * (s1 - s2)
 *
 * Register plan:
 *   mm0: first input pixel, later the result
 *   mm1: factor_mask (loaded once; presumably consumed by the factor macro)
 *   mm5: second input pixel
 *   mm6: zero
 *   mm7: factor (per the original register plan it is left there by
 *        LOAD_FACTOR_1_4_NOCLIP, which is defined elsewhere)
 *
 * No emms is executed here; ctx->need_emms is raised instead.
 */
static void scale_uint8_x_4_x_bilinear_mmx(gavl_video_scale_context_t * ctx,
                                           int scanline,
                                           uint8_t * dest_start)
  {
  uint8_t * const row_base = ctx->src + scanline * ctx->src_stride;
  uint8_t * dst = dest_start;
  uint8_t * src;
  /* Not referenced directly below: presumably read by name inside
     LOAD_FACTOR_1_4_NOCLIP, so the identifier must stay as it is. */
  int32_t * factors;
  int i;

  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);

  for(i = 0; i < ctx->dst_size; i++, dst += 4)
    {
    src     = row_base + 4*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;

    /* s1: 4 bytes -> 4 words, scaled up to 14 bit */
    movd_m2r(*(src), mm0);
    punpcklbw_r2r(mm6, mm0);
    psllw_i2r(6, mm0);

    /* s2: the following pixel, same treatment */
    movd_m2r(*(src+4), mm5);
    punpcklbw_r2r(mm6, mm5);
    psllw_i2r(6, mm5);

    /* Interpolation factor (14 bit) */
    LOAD_FACTOR_1_4_NOCLIP;

    psubsw_r2r(mm5, mm0);   /* mm0 = s1 - s2            (14 bit) */
    pmulhw_r2r(mm7, mm0);   /* mm0 = factor * (s1 - s2) (12 bit) */
    psllw_i2r(2, mm0);      /* back to 14 bit                    */
    paddsw_r2r(mm5, mm0);   /* + s2                     (15 bit) */
    psraw_i2r(6, mm0);      /* down to 8 bit                     */

    /* Saturate to bytes and store the 4 byte pixel */
    packuswb_r2r(mm6, mm0);
    movd_r2m(mm0, *dst);
    }

  ctx->need_emms = 1;
  }
/*
 * Horizontal scaling of one scanline of packed 4 x 8 bit pixels with an
 * arbitrary number of filter taps (ctx->table_h.factors_per_pixel), MMX.
 *
 * Every output pixel is the sum over all taps of (source pixel * factor),
 * starting at the source pixel table_h.pixels[i].index.
 *
 * Register plan:
 *   mm0: current input pixel
 *   mm1: factor_mask (loaded once; presumably consumed by LOAD_FACTOR_1_4)
 *   mm3: accumulator for the output pixel
 *   mm6: zero
 *   mm7: multiplied with the pixel; presumably filled by LOAD_FACTOR_1_4,
 *        which is defined elsewhere
 *
 * No emms is executed here; ctx->need_emms is raised instead.
 */
static void scale_uint8_x_4_x_generic_mmx(gavl_video_scale_context_t * ctx,
                                          int scanline,
                                          uint8_t * dest_start)
  {
  uint8_t * const row_base = ctx->src + scanline * ctx->src_stride;
  uint8_t * dst = dest_start;
  uint8_t * src;
  /* Presumably read by name inside LOAD_FACTOR_1_4: do not rename. */
  int32_t * factors;
  int i, j;

  pxor_r2r(mm6, mm6);
  movq_m2r(factor_mask, mm1);

  for(i = 0; i < ctx->dst_size; i++, dst += 4)
    {
    src     = row_base + 4*ctx->table_h.pixels[i].index;
    factors = ctx->table_h.pixels[i].factor_i;

    /* Clear the accumulator */
    pxor_r2r(mm3, mm3);

    for(j = 0; j < ctx->table_h.factors_per_pixel; j++, src += 4, factors++)
      {
      /* 4 bytes -> 4 words, shifted up by 7 bits */
      movd_m2r(*(src), mm0);
      punpcklbw_r2r(mm6, mm0);
      psllw_i2r(7, mm0);

      /* Factor for this tap */
      LOAD_FACTOR_1_4;

      /* Weight the pixel and add it to the running sum */
      pmulhw_r2r(mm7, mm0);
      paddw_r2r(mm0, mm3);
      }

    /* Scale back, saturate to bytes and store the 4 byte pixel */
    psraw_i2r(5, mm3);
    packuswb_r2r(mm6, mm3);
    movd_r2m(mm3, *dst);
    }

  ctx->need_emms = 1;
  }
static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst, uint8_t *src1, int i_src1, uint8_t *src2, int i_src2 ) { static const uint64_t m_4 = INT64_C(0x0004000400040004); int y, x; /* Progressive */ pxor_r2r( mm7, mm7 ); for( y = 0; y < 8; y += 2 ) { for( x = 0; x < 8; x +=4 ) { movd_m2r( src1[x], mm0 ); movd_r2m( mm0, dst[x] ); movd_m2r( src2[x], mm1 ); movd_m2r( src1[i_src1+x], mm2 ); punpcklbw_r2r( mm7, mm0 ); punpcklbw_r2r( mm7, mm1 ); punpcklbw_r2r( mm7, mm2 ); paddw_r2r( mm1, mm1 ); movq_r2r( mm1, mm3 ); paddw_r2r( mm3, mm3 ); paddw_r2r( mm2, mm0 ); paddw_r2r( mm3, mm1 ); paddw_m2r( m_4, mm1 ); paddw_r2r( mm1, mm0 ); psraw_i2r( 3, mm0 ); packuswb_r2r( mm7, mm0 ); movd_r2m( mm0, dst[i_dst+x] ); } dst += 2*i_dst; src1 += i_src1; src2 += i_src2; } }
static __inline__ int qblock_sad_mmxe(uint8_t *refblk, uint32_t h, uint32_t rowstride) { int res; pxor_r2r (mm4,mm4); movq_r2r (mm0,mm5); /* First row */ movd_m2r (*refblk, mm6); pxor_r2r ( mm7, mm7); refblk += rowstride; punpcklbw_r2r ( mm7, mm5); punpcklbw_r2r ( mm7, mm6); psadbw_r2r ( mm5, mm6); paddw_r2r ( mm6, mm4 ); movq_r2r (mm1,mm5); /* Second row */ movd_m2r (*refblk, mm6); refblk += rowstride; punpcklbw_r2r ( mm7, mm5); punpcklbw_r2r ( mm7, mm6); psadbw_r2r ( mm5, mm6); paddw_r2r ( mm6, mm4 ); if( h == 4 ) { movq_r2r (mm2,mm5); /* Third row */ movd_m2r (*refblk, mm6); refblk += rowstride; punpcklbw_r2r ( mm7, mm5); punpcklbw_r2r ( mm7, mm6); psadbw_r2r ( mm5, mm6); paddw_r2r ( mm6, mm4 ); movq_r2r (mm3,mm5); /* Fourth row */ movd_m2r (*refblk, mm6); punpcklbw_r2r ( mm7, mm5); punpcklbw_r2r ( mm7, mm6); psadbw_r2r ( mm5, mm6); paddw_r2r ( mm6, mm4 ); } movd_r2m ( mm4, res ); return res; }
static void deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4, uint8_t * lum_m3, uint8_t * lum_m2, uint8_t * lum_m1, uint8_t * lum, int size) { mmx_t rounder; rounder.uw[0] = 4; rounder.uw[1] = 4; rounder.uw[2] = 4; rounder.uw[3] = 4; pxor_r2r (mm7, mm7); movq_m2r (rounder, mm6); for (; size > 3; size -= 4) { movd_m2r (*lum_m4, mm0); movd_m2r (*lum_m3, mm1); movd_m2r (*lum_m2, mm2); movd_m2r (*lum_m1, mm3); movd_m2r (*lum, mm4); punpcklbw_r2r (mm7, mm0); punpcklbw_r2r (mm7, mm1); punpcklbw_r2r (mm7, mm2); punpcklbw_r2r (mm7, mm3); punpcklbw_r2r (mm7, mm4); paddw_r2r (mm3, mm1); psllw_i2r (1, mm2); paddw_r2r (mm4, mm0); psllw_i2r (2, mm1); // 2 paddw_r2r (mm6, mm2); paddw_r2r (mm2, mm1); psubusw_r2r (mm0, mm1); psrlw_i2r (3, mm1); // 3 packuswb_r2r (mm7, mm1); movd_r2m (mm1, *dst); lum_m4 += 4; lum_m3 += 4; lum_m2 += 4; lum_m1 += 4; lum += 4; dst += 4; } emms (); /* Handle odd widths */ if (size > 0) deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size); }
/*
 * Adds up the four signed 16-bit words stored in *accs and writes the
 * 32-bit total to *res.
 *
 * The words are sign-extended to 32 bit before being added, so the sum
 * itself cannot overflow. Clobbers mm1, mm2 and mm3. No emms is executed
 * here.
 */
static __inline__ void mmx_sum_4_word_accs( mmx_t *accs, int32_t *res )
{
    movq_m2r( *accs, mm1 );

    /* Two more copies: mm3 becomes the sign mask, mm2 the high half */
    movq_r2r( mm1, mm3 );
    movq_r2r( mm1, mm2 );

    /* Every word of mm3 becomes 0x0000 or 0xFFFF: the sign extension */
    psraw_i2r( 15, mm3 );

    /* Interleave value and sign words -> two signed dwords each */
    punpcklwd_r2r( mm3, mm1 );      /* words 0 and 1 */
    punpckhwd_r2r( mm3, mm2 );      /* words 2 and 3 */

    /* Vertical add, then fold the upper dword onto the lower one */
    paddd_r2r( mm1, mm2 );
    movq_r2r( mm2, mm3 );
    psrlq_i2r( 32, mm2 );
    paddd_r2r( mm2, mm3 );

    movd_r2m( mm3, *res );
}
VLC_MMX static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { assert( p_pic_top->i_planes == p_pic_bot->i_planes ); /* Amount of bits must be known for MMX, thus int32_t. Doesn't hurt the C implementation. */ int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */ int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */ pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */ for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); const int wm8 = w % 8; /* remainder */ const int w8 = w - wm8; /* part of width that is divisible by 8 */ /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ int x = 0; /* Easy-to-read C version further below. 
Assumptions: 0 < T < 127 # of pixels < (2^32)/255 Note: calculates score * 255 */ static alignas (8) const mmx_t b0 = { .uq = 0x0000000000000000ULL }; static alignas (8) const mmx_t b128 = { .uq = 0x8080808080808080ULL }; static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; for( ; x < w8; x += 8 ) { movq_m2r( *((int64_t*)p_c), mm0 ); movq_m2r( *((int64_t*)p_p), mm1 ); movq_m2r( *((int64_t*)p_n), mm2 ); psubb_m2r( b128, mm0 ); psubb_m2r( b128, mm1 ); psubb_m2r( b128, mm2 ); psubsb_r2r( mm0, mm1 ); psubsb_r2r( mm0, mm2 ); pxor_r2r( mm3, mm3 ); pxor_r2r( mm4, mm4 ); pxor_r2r( mm5, mm5 ); pxor_r2r( mm6, mm6 ); punpcklbw_r2r( mm1, mm3 ); punpcklbw_r2r( mm2, mm4 ); punpckhbw_r2r( mm1, mm5 ); punpckhbw_r2r( mm2, mm6 ); pmulhw_r2r( mm3, mm4 ); pmulhw_r2r( mm5, mm6 ); packsswb_r2r(mm4, mm6); pcmpgtb_m2r( bT, mm6 ); psadbw_m2r( b0, mm6 ); paddd_r2r( mm6, mm7 ); p_c += 8; p_p += 8; p_n += 8; } for( ; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. */ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score_c; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } movd_r2m( mm7, i_score_mmx ); emms(); return i_score_mmx/255 + i_score_c; } #endif /* See header for function doc. */ int CalculateInterlaceScore( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { /* We use the comb metric from the IVTC filter of Transcode 1.1.5. This was found to work better for the particular purpose of IVTC than RenderX()'s comb metric. 
Note that we *must not* subsample at all in order to catch interlacing in telecined frames with localized motion (e.g. anime with characters talking, where only mouths move and everything else stays still.) */ assert( p_pic_top != NULL ); assert( p_pic_bot != NULL ); if( p_pic_top->i_planes != p_pic_bot->i_planes ) return -1; #ifdef CAN_COMPILE_MMXEXT if (vlc_CPU_MMXEXT()) return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot ); #endif int32_t i_score = 0; for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ for( int x = 0; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. 
*/ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } return i_score; }
VLC_MMX static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c, int i_pitch_prev, int i_pitch_curr, int* pi_top, int* pi_bot ) { int32_t i_motion = 0; int32_t i_top_motion = 0; int32_t i_bot_motion = 0; static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ movq_m2r( bT, mm5 ); pxor_r2r( mm3, mm3 ); /* score (top field) */ pxor_r2r( mm4, mm4 ); /* score (bottom field) */ for( int y = 0; y < 8; y+=2 ) { /* top field */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm3 ); /* add to top field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; /* bottom field - handling identical to top field, except... */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; } movq_r2r( mm3, mm7 ); /* score (total) */ paddd_r2r( mm4, mm7 ); movd_r2m( mm3, i_top_motion ); movd_r2m( mm4, i_bot_motion ); movd_r2m( mm7, i_motion ); /* The loop counts actual score * 255. */ i_top_motion /= 255; i_bot_motion /= 255; i_motion /= 255; emms(); (*pi_top) = ( i_top_motion >= 8 ); (*pi_bot) = ( i_bot_motion >= 8 ); return (i_motion >= 8); }
//VLC_MMX // sunqueen delete static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top, const picture_t* p_pic_bot ) { assert( p_pic_top->i_planes == p_pic_bot->i_planes ); /* Amount of bits must be known for MMX, thus int32_t. Doesn't hurt the C implementation. */ int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */ int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */ pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */ for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane ) { /* Sanity check */ if( p_pic_top->p[i_plane].i_visible_lines != p_pic_bot->p[i_plane].i_visible_lines ) return -1; const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1; const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch, p_pic_bot->p[i_plane].i_visible_pitch ); const int wm8 = w % 8; /* remainder */ const int w8 = w - wm8; /* part of width that is divisible by 8 */ /* Current line / neighbouring lines picture pointers */ const picture_t *cur = p_pic_bot; const picture_t *ngh = p_pic_top; int wc = cur->p[i_plane].i_pitch; int wn = ngh->p[i_plane].i_pitch; /* Transcode 1.1.5 only checks every other line. Checking every line works better for anime, which may contain horizontal, one pixel thick cartoon outlines. */ for( int y = 1; y < i_lasty; ++y ) { uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */ uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */ uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */ int64_t i_p_c, i_p_p, i_p_n; // sunqueen add int x = 0; /* Easy-to-read C version further below. 
Assumptions: 0 < T < 127 # of pixels < (2^32)/255 Note: calculates score * 255 */ // static const mmx_t b0 = { .uq = 0x0000000000000000ULL }; __declspec(align(8)) static const mmx_t b0 = { /*.uq =*/ 0x0000000000000000ULL }; // sunqueen modify // static const mmx_t b128 = { .uq = 0x8080808080808080ULL }; __declspec(align(8)) static const mmx_t b128 = { /*.uq =*/ 0x8080808080808080ULL }; // sunqueen modify // static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; __declspec(align(8)) static const mmx_t bT = { 0x6464646464646464ULL }; // sunqueen modify for( ; x < w8; x += 8 ) { // sunqueen add start i_p_c = *((int64_t*)p_c); movq_m2r( i_p_c, mm0 ); i_p_p = *((int64_t*)p_p); movq_m2r( i_p_p, mm1 ); i_p_n = *((int64_t*)p_n); movq_m2r( i_p_n, mm2 ); // sunqueen add end #if 0 // sunqueen delete start movq_m2r( *((int64_t*)p_c), mm0 ); movq_m2r( *((int64_t*)p_p), mm1 ); movq_m2r( *((int64_t*)p_n), mm2 ); #endif // sunqueen delete end psubb_m2r( b128, mm0 ); psubb_m2r( b128, mm1 ); psubb_m2r( b128, mm2 ); psubsb_r2r( mm0, mm1 ); psubsb_r2r( mm0, mm2 ); pxor_r2r( mm3, mm3 ); pxor_r2r( mm4, mm4 ); pxor_r2r( mm5, mm5 ); pxor_r2r( mm6, mm6 ); punpcklbw_r2r( mm1, mm3 ); punpcklbw_r2r( mm2, mm4 ); punpckhbw_r2r( mm1, mm5 ); punpckhbw_r2r( mm2, mm6 ); pmulhw_r2r( mm3, mm4 ); pmulhw_r2r( mm5, mm6 ); packsswb_r2r(mm4, mm6); pcmpgtb_m2r( bT, mm6 ); psadbw_m2r( b0, mm6 ); paddd_r2r( mm6, mm7 ); p_c += 8; p_p += 8; p_n += 8; } for( ; x < w; ++x ) { /* Worst case: need 17 bits for "comb". */ int_fast32_t C = *p_c; int_fast32_t P = *p_p; int_fast32_t N = *p_n; /* Comments in Transcode's filter_ivtc.c attribute this combing metric to Gunnar Thalin. The idea is that if the picture is interlaced, both expressions will have the same sign, and this comes up positive. The value T = 100 has been chosen such that a pixel difference of 10 (on average) will trigger the detector. 
*/ int_fast32_t comb = (P - C) * (N - C); if( comb > T ) ++i_score_c; ++p_c; ++p_p; ++p_n; } /* Now the other field - swap current and neighbour pictures */ const picture_t *tmp = cur; cur = ngh; ngh = tmp; int tmp_pitch = wc; wc = wn; wn = tmp_pitch; } } movd_r2m( mm7, i_score_mmx ); emms(); return i_score_mmx/255 + i_score_c; }
//VLC_MMX // sunqueen delete static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c, int i_pitch_prev, int i_pitch_curr, int* pi_top, int* pi_bot ) { int32_t i_motion = 0; int32_t i_top_motion = 0; int32_t i_bot_motion = 0; uint64_t ui_pix_c, ui_pix_p; // sunqueen add // static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; __declspec(align(8)) static const mmx_t bT = { 0x0A0A0A0A0A0A0A0AULL }; // sunqueen modify pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ movq_m2r( bT, mm5 ); pxor_r2r( mm3, mm3 ); /* score (top field) */ pxor_r2r( mm4, mm4 ); /* score (bottom field) */ for( int y = 0; y < 8; y+=2 ) { /* top field */ // sunqueen add start ui_pix_c = *((uint64_t*)p_pix_c); movq_m2r( ui_pix_c, mm0 ); ui_pix_p = *((uint64_t*)p_pix_p); movq_m2r( ui_pix_p, mm1 ); // sunqueen add end #if 0 // sunqueen delete start movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); #endif // sunqueen delete end movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm3 ); /* add to top field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; /* bottom field - handling identical to top field, except... 
*/ // sunqueen add start ui_pix_c = *((uint64_t*)p_pix_c); movq_m2r( ui_pix_c, mm0 ); ui_pix_p = *((uint64_t*)p_pix_p); movq_m2r( ui_pix_p, mm1 ); // sunqueen add end #if 0 // sunqueen delete start movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); #endif // sunqueen delete end movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; } movq_r2r( mm3, mm7 ); /* score (total) */ paddd_r2r( mm4, mm7 ); movd_r2m( mm3, i_top_motion ); movd_r2m( mm4, i_bot_motion ); movd_r2m( mm7, i_motion ); /* The loop counts actual score * 255. */ i_top_motion /= 255; i_bot_motion /= 255; i_motion /= 255; emms(); (*pi_top) = ( i_top_motion >= 8 ); (*pi_bot) = ( i_bot_motion >= 8 ); return (i_motion >= 8); }
/*
 * Horizontal bilinear scaling of one scanline of single-byte pixels (MMX).
 *
 * Four output pixels are produced per MMX iteration. For each output pixel
 * the two neighbouring source bytes at table_h.pixels[n].index are
 * multiplied with factor_i[0] and factor_i[1] and added (pmaddwd), then
 * shifted down by 14 bits in total. The remaining dst_size % 4 pixels are
 * computed with the same formula in plain C.
 *
 * Register plan:
 *   mm0: the two source byte pairs of the current output pixel pair
 *   mm1: factors / results of output pixels 0 and 1, later all four results
 *   mm3: factors / results of output pixels 2 and 3
 *   mm6: zero
 *   mm7: scratch
 *
 * No emms is executed here; ctx->need_emms is raised instead.
 */
static void scale_uint8_x_1_x_bilinear_mmx(gavl_video_scale_context_t * ctx,
                                           int scanline,
                                           uint8_t * dest_start)
  {
  uint8_t * const row_base = ctx->src + scanline * ctx->src_stride;
  uint8_t * dst = dest_start;
  uint8_t * src;
  mmx_t taps;
  int px = 0;       /* index of the next output pixel */
  int i, count;

  pxor_r2r(mm6, mm6);

  count = ctx->dst_size / 4;

  for(i = 0; i < count; i++)
    {
    /* Output pixels 0 and 1: gather the two source pairs as words */
    src = row_base + ctx->table_h.pixels[px].index;
    taps.uw[0] = *src;
    taps.uw[1] = *(src+1);

    src = row_base + ctx->table_h.pixels[px+1].index;
    taps.uw[2] = *src;
    taps.uw[3] = *(src+1);

    movq_m2r(taps, mm0);

    /* Their factors: 2 x 2 dwords packed to 4 words, then multiply-add */
    movq_m2r(ctx->table_h.pixels[px].factor_i[0], mm1);
    movq_m2r(ctx->table_h.pixels[px+1].factor_i[0], mm7);
    packssdw_r2r(mm7, mm1);
    pmaddwd_r2r(mm0, mm1);

    /* Output pixels 2 and 3: same procedure, result in mm3 */
    src = row_base + ctx->table_h.pixels[px+2].index;
    taps.uw[0] = *src;
    taps.uw[1] = *(src+1);

    src = row_base + ctx->table_h.pixels[px+3].index;
    taps.uw[2] = *src;
    taps.uw[3] = *(src+1);

    movq_m2r(taps, mm0);

    movq_m2r(ctx->table_h.pixels[px+2].factor_i[0], mm3);
    movq_m2r(ctx->table_h.pixels[px+3].factor_i[0], mm7);
    packssdw_r2r(mm7, mm3);
    pmaddwd_r2r(mm0, mm3);

    /* 7 bits down as dwords, pack to words, another 7 bits down */
    psrld_i2r(7, mm3);
    psrld_i2r(7, mm1);
    packssdw_r2r(mm3, mm1);
    psrlw_i2r(7, mm1);

    /* Saturate to bytes and store four output pixels at once */
    packuswb_r2r(mm6, mm1);
    movd_r2m(mm1, *dst);

    px  += 4;
    dst += 4;
    }

  ctx->need_emms = 1;

  /* Leftover pixels (fewer than four) */
  count = ctx->dst_size % 4;

  for(i = 0; i < count; i++, dst++, px++)
    {
    src = (row_base + ctx->table_h.pixels[px].index);
    *dst = (ctx->table_h.pixels[px].factor_i[0] * *src +
            ctx->table_h.pixels[px].factor_i[1] * *(src+1)) >> 14;
    }
  }
/** * Internal helper function for EstimateNumBlocksWithMotion(): * estimates whether there is motion in the given 8x8 block on one plane * between two images. The block as a whole and its fields are evaluated * separately, and use different motion thresholds. * * This is a low-level function only used by EstimateNumBlocksWithMotion(). * There is no need to call this function manually. * * For interpretation of pi_top and pi_bot, it is assumed that the block * starts on an even-numbered line (belonging to the top field). * * The b_mmx parameter avoids the need to call vlc_CPU() separately * for each block. * * @param[in] p_pix_p Base pointer to the block in previous picture * @param[in] p_pix_c Base pointer to the same block in current picture * @param i_pitch_prev i_pitch of previous picture * @param i_pitch_curr i_pitch of current picture * @param b_mmx (vlc_CPU() & CPU_CAPABILITY_MMXEXT) or false. * @param[out] pi_top 1 if top field of the block had motion, 0 if no * @param[out] pi_bot 1 if bottom field of the block had motion, 0 if no * @return 1 if the block had motion, 0 if no * @see EstimateNumBlocksWithMotion() */ static inline int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c, int i_pitch_prev, int i_pitch_curr, bool b_mmx, int* pi_top, int* pi_bot ) { /* Pixel luma/chroma difference threshold to detect motion. */ #define T 10 int32_t i_motion = 0; int32_t i_top_motion = 0; int32_t i_bot_motion = 0; /* See below for the C version to see more quickly what this does. 
*/ #ifdef CAN_COMPILE_MMXEXT if( b_mmx ) { static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } }; pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */ movq_m2r( bT, mm5 ); pxor_r2r( mm3, mm3 ); /* score (top field) */ pxor_r2r( mm4, mm4 ); /* score (bottom field) */ for( int y = 0; y < 8; y+=2 ) { /* top field */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm3 ); /* add to top field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; /* bottom field - handling identical to top field, except... */ movq_m2r( *((uint64_t*)p_pix_c), mm0 ); movq_m2r( *((uint64_t*)p_pix_p), mm1 ); movq_r2r( mm0, mm2 ); psubusb_r2r( mm1, mm2 ); psubusb_r2r( mm0, mm1 ); pcmpgtb_r2r( mm5, mm2 ); pcmpgtb_r2r( mm5, mm1 ); psadbw_r2r( mm6, mm2 ); psadbw_r2r( mm6, mm1 ); paddd_r2r( mm2, mm1 ); paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */ p_pix_c += i_pitch_curr; p_pix_p += i_pitch_prev; } movq_r2r( mm3, mm7 ); /* score (total) */ paddd_r2r( mm4, mm7 ); movd_r2m( mm3, i_top_motion ); movd_r2m( mm4, i_bot_motion ); movd_r2m( mm7, i_motion ); /* The loop counts actual score * 255. */ i_top_motion /= 255; i_bot_motion /= 255; i_motion /= 255; emms(); } else #endif { for( int y = 0; y < 8; ++y )
static void deinterlace_line( uint8_t *dst, uint8_t *lum_m4, uint8_t *lum_m3, uint8_t *lum_m2, uint8_t *lum_m1, uint8_t *lum, int size ) { #if defined(__i386__) || defined(__x86_64__) mmx_t rounder; rounder.uw[0]=4; rounder.uw[1]=4; rounder.uw[2]=4; rounder.uw[3]=4; pxor_r2r(mm7,mm7); movq_m2r(rounder,mm6); for (;size > 3; size-=4) { movd_m2r(lum_m4[0],mm0); movd_m2r(lum_m3[0],mm1); movd_m2r(lum_m2[0],mm2); movd_m2r(lum_m1[0],mm3); movd_m2r(lum[0],mm4); punpcklbw_r2r(mm7,mm0); punpcklbw_r2r(mm7,mm1); punpcklbw_r2r(mm7,mm2); punpcklbw_r2r(mm7,mm3); punpcklbw_r2r(mm7,mm4); paddw_r2r(mm3,mm1); psllw_i2r(1,mm2); paddw_r2r(mm4,mm0); psllw_i2r(2,mm1);// 2 paddw_r2r(mm6,mm2); paddw_r2r(mm2,mm1); psubusw_r2r(mm0,mm1); psrlw_i2r(3,mm1); // 3 packuswb_r2r(mm7,mm1); movd_r2m(mm1,dst[0]); lum_m4+=4; lum_m3+=4; lum_m2+=4; lum_m1+=4; lum+=4; dst+=4; } emms(); #else /** * C implementation. */ int sum; for(;size > 0;size--) { sum = -lum_m4[0]; sum += lum_m3[0] << 2; sum += lum_m2[0] << 1; sum += lum_m1[0] << 2; sum += -lum[0]; dst[0] = (sum + 4) >> 3; // This needs to be clipped at 0 and 255: cm[(sum + 4) >> 3]; lum_m4++; lum_m3++; lum_m2++; lum_m1++; lum++; dst++; } #endif }
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src ) { int y, x; int32_t ff, fr; int fc; /* Detect interlacing */ fc = 0; pxor_r2r( mm7, mm7 ); for( y = 0; y < 9; y += 2 ) { ff = fr = 0; pxor_r2r( mm5, mm5 ); pxor_r2r( mm6, mm6 ); for( x = 0; x < 8; x+=4 ) { movd_m2r( src[ x], mm0 ); movd_m2r( src[1*i_src+x], mm1 ); movd_m2r( src[2*i_src+x], mm2 ); movd_m2r( src[3*i_src+x], mm3 ); punpcklbw_r2r( mm7, mm0 ); punpcklbw_r2r( mm7, mm1 ); punpcklbw_r2r( mm7, mm2 ); punpcklbw_r2r( mm7, mm3 ); movq_r2r( mm0, mm4 ); psubw_r2r( mm1, mm0 ); psubw_r2r( mm2, mm4 ); psubw_r2r( mm1, mm2 ); psubw_r2r( mm1, mm3 ); pmaddwd_r2r( mm0, mm0 ); pmaddwd_r2r( mm4, mm4 ); pmaddwd_r2r( mm2, mm2 ); pmaddwd_r2r( mm3, mm3 ); paddd_r2r( mm0, mm2 ); paddd_r2r( mm4, mm3 ); paddd_r2r( mm2, mm5 ); paddd_r2r( mm3, mm6 ); } movq_r2r( mm5, mm0 ); psrlq_i2r( 32, mm0 ); paddd_r2r( mm0, mm5 ); movd_r2m( mm5, fr ); movq_r2r( mm6, mm0 ); psrlq_i2r( 32, mm0 ); paddd_r2r( mm0, mm6 ); movd_r2m( mm6, ff ); if( ff < 6*fr/8 && fr > 32 ) fc++; src += 2*i_src; } return fc; }