/*
 * Do a shift right on the 4*4 block in the MMX registers
 *
 */
static __inline__ void shift_blk(const uint32_t shift)
{
    psrlq_i2r( shift, mm0 );
    psrlq_i2r( shift, mm1 );
    psrlq_i2r( shift, mm2 );
    psrlq_i2r( shift, mm3 );
}
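/*
 * Scalar sketch of shift_blk for reference (shift_blk_ref is a hypothetical
 * name, not part of this file's API). It assumes each of mm0-mm3 holds one
 * row of the 4*4 block as a single packed 64-bit quantity. Note that psrlq
 * shifts the whole quadword, so bits can cross lane boundaries; whether that
 * matters depends on how the caller packed the rows.
 */
static __inline__ void shift_blk_ref(uint64_t row[4], const uint32_t shift)
{
    int i;
    for (i = 0; i < 4; i++)
        row[i] >>= shift;    /* one 64-bit shift per block row */
}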
static inline void mmx_interp_average_2_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
{
    /* *dest = (*dest + (*src1 + *src2 + 1)/ 2 + 1)/ 2; */

    movq_m2r (*dest, mm1);      /* load 8 dest bytes */
    movq_r2r (mm1, mm2);        /* copy 8 dest bytes */

    movq_m2r (*src1, mm3);      /* load 8 src1 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src1 bytes */

    movq_m2r (*src2, mm5);      /* load 8 src2 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src2 bytes */

    pxor_r2r (mm3, mm5);        /* xor src1 and src2 */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm4, mm6);         /* or src1 and src2 */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2r (mm6, mm5);        /* copy subresult */

    pxor_r2r (mm1, mm5);        /* xor srcavg and dest */
    pand_m2r (mask1, mm5);      /* mask lower bits */
    psrlq_i2r (1, mm5);         /* /2 */
    por_r2r (mm2, mm6);         /* or srcavg and dest */
    psubb_r2r (mm5, mm6);       /* subtract subresults */
    movq_r2m (mm6, *dest);      /* store result in dest */
}
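/*
 * Scalar reference for the averaging trick above (a sketch; the helper names
 * are hypothetical). The MMX code avoids unpacking bytes to words by using
 * the identity (a + b + 1)/2 == (a | b) - ((a ^ b) >> 1) per byte. Since MMX
 * has no per-byte shift, the xor is first masked with mask1 (defined
 * elsewhere in this file; presumably 0xfe in every byte) so that psrlq
 * cannot leak bits across byte lanes.
 */
static inline uint8_t avg_round_up (uint8_t a, uint8_t b)
{
    return (a | b) - (((a ^ b) & 0xfe) >> 1);   /* == (a + b + 1) / 2 */
}

static inline void interp_average_2_U8_ref (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2)
{
    int i;
    for (i = 0; i < 8; i++)
        dest[i] = avg_round_up (dest[i], avg_round_up (src1[i], src2[i]));
}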
static inline void mmx_unpack_16rgb (uint8_t * image, const int cpu)
{
    static mmx_t mmx_bluemask = {0xf8f8f8f8f8f8f8f8LL};
    static mmx_t mmx_greenmask = {0xfcfcfcfcfcfcfcfcLL};
    static mmx_t mmx_redmask = {0xf8f8f8f8f8f8f8f8LL};

    /*
     * convert RGB plane to RGB 16 bits
     * mm0 -> B, mm1 -> R, mm2 -> G
     * mm4 -> GB, mm5 -> AR pixel 4-7
     * mm6 -> GB, mm7 -> AR pixel 0-3
     */

    pand_m2r (mmx_bluemask, mm0);       /* mm0 = b7b6b5b4b3______ */
    pand_m2r (mmx_greenmask, mm2);      /* mm2 = g7g6g5g4g3g2____ */
    pand_m2r (mmx_redmask, mm1);        /* mm1 = r7r6r5r4r3______ */

    psrlq_i2r (3, mm0);                 /* mm0 = ______b7b6b5b4b3 */
    pxor_r2r (mm4, mm4);                /* mm4 = 0 */
    movq_r2r (mm0, mm5);                /* mm5 = ______b7b6b5b4b3 */
    movq_r2r (mm2, mm7);                /* mm7 = g7g6g5g4g3g2____ */

    punpcklbw_r2r (mm4, mm2);
    punpcklbw_r2r (mm1, mm0);
    psllq_i2r (3, mm2);
    por_r2r (mm2, mm0);
    movntq (mm0, *image);

    punpckhbw_r2r (mm4, mm7);
    punpckhbw_r2r (mm1, mm5);
    psllq_i2r (3, mm7);
    por_r2r (mm7, mm5);
    movntq (mm5, *(image+8));
}
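/*
 * What one output pixel looks like in scalar form (a sketch;
 * pack_rgb565_ref is a hypothetical name): RGB565 keeps the top 5/6/5 bits
 * of R/G/B. movntq() above is presumably a macro defined elsewhere that
 * selects movntq_r2m or movq_r2m based on the `cpu` argument; that is an
 * assumption, since it is not defined in this section.
 */
static inline uint16_t pack_rgb565_ref (uint8_t r, uint8_t g, uint8_t b)
{
    return (uint16_t) (((r & 0xf8) << 8) |      /* r7..r3 -> bits 15-11 */
                       ((g & 0xfc) << 3) |      /* g7..g2 -> bits 10-5  */
                       (b >> 3));               /* b7..b3 -> bits 4-0   */
}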
static __inline__ void mmx_sum_4_word_accs( mmx_t *accs, int32_t *res )
{
    movq_m2r( *accs, mm1 );
    movq_r2r( mm1, mm3 );
    movq_r2r( mm1, mm2 );
    /* Generate sign extensions for mm1 words! */
    psraw_i2r( 15, mm3 );
    punpcklwd_r2r( mm3, mm1 );
    punpckhwd_r2r( mm3, mm2 );
    paddd_r2r( mm1, mm2 );
    movq_r2r( mm2, mm3 );
    psrlq_i2r( 32, mm2 );
    paddd_r2r( mm2, mm3 );
    movd_r2m( mm3, *res );
}
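/*
 * Scalar equivalent (a sketch; sum_4_word_accs_ref is a hypothetical name):
 * the quadword holds four signed 16-bit accumulators, and the psraw/punpck
 * pair sign-extends each word to 32 bits before the dwords are summed and
 * folded.
 */
static __inline__ void sum_4_word_accs_ref( const int16_t acc[4], int32_t *res )
{
    *res = (int32_t)acc[0] + acc[1] + acc[2] + acc[3];
}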
static void vfilter_chroma_332_packed422_scanline_mmx( uint8_t *output, int width,
                                                       uint8_t *m, uint8_t *t, uint8_t *b )
{
    int i;
    const mmx_t ymask = { 0x00ff00ff00ff00ffULL };
    const mmx_t cmask = { 0xff00ff00ff00ff00ULL };

    // Get width in bytes.
    width *= 2;
    i = width / 8;
    width -= i * 8;

    movq_m2r( ymask, mm7 );
    movq_m2r( cmask, mm6 );

    while( i-- ) {
        movq_m2r( *t, mm0 );
        movq_m2r( *b, mm1 );
        movq_m2r( *m, mm2 );

        movq_r2r ( mm2, mm3 );
        pand_r2r ( mm7, mm3 );

        pand_r2r ( mm6, mm0 );
        pand_r2r ( mm6, mm1 );
        pand_r2r ( mm6, mm2 );

        psrlq_i2r( 8, mm0 );
        psrlq_i2r( 7, mm1 );
        psrlq_i2r( 8, mm2 );

        movq_r2r ( mm0, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm0 );

        movq_r2r ( mm2, mm4 );
        psllw_i2r( 1, mm4 );
        paddw_r2r( mm4, mm2 );

        paddw_r2r( mm0, mm2 );
        paddw_r2r( mm1, mm2 );

        psllw_i2r( 5, mm2 );
        pand_r2r( mm6, mm2 );

        por_r2r ( mm3, mm2 );

        movq_r2m( mm2, *output );
        output += 8;
        t += 8;
        b += 8;
        m += 8;
    }

    // Scalar tail: step to the first leftover chroma byte. Each remaining
    // chroma sample spans two bytes, so halve the leftover byte count
    // (iterating once per byte would run past the scanline).
    output++; t++; b++; m++;
    width /= 2;
    while( width-- ) {
        *output = (3 * *t + 3 * *m + 2 * *b) >> 3;
        output += 2; t += 2; b += 2; m += 2;
    }
    emms();
}
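/*
 * The kernel computed per chroma byte above, in scalar form (a sketch;
 * vfilter_332_ref is a hypothetical name). In the MMX path the multiplies
 * by 3 are done with psllw/paddw (x*3 == (x<<1) + x, and b arrives
 * pre-doubled via the shift by 7 instead of 8), while psllw_i2r(5, ...)
 * plus the cmask AND implements the final >>3 and moves the result back
 * into the high (chroma) byte of each word.
 */
static inline uint8_t vfilter_332_ref( uint8_t t, uint8_t m, uint8_t b )
{
    return (uint8_t)( ( 3*t + 3*m + 2*b ) >> 3 );   /* weights 3/8, 3/8, 2/8 */
}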
static inline void mmx_interp_average_4_U8 (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2,
                                            const uint8_t * src3,
                                            const uint8_t * src4)
{
    /* *dest = (*dest + (*src1 + *src2 + *src3 + *src4 + 2)/ 4 + 1)/ 2; */

    movq_m2r (*src1, mm1);      /* load 8 src1 bytes */
    movq_r2r (mm1, mm2);        /* copy 8 src1 bytes */

    punpcklbw_r2r (mm0, mm1);   /* unpack low src1 bytes */
    punpckhbw_r2r (mm0, mm2);   /* unpack high src1 bytes */

    movq_m2r (*src2, mm3);      /* load 8 src2 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src2 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src2 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src2 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    /* now have partials in mm1 and mm2 */

    movq_m2r (*src3, mm3);      /* load 8 src3 bytes */
    movq_r2r (mm3, mm4);        /* copy 8 src3 bytes */

    punpcklbw_r2r (mm0, mm3);   /* unpack low src3 bytes */
    punpckhbw_r2r (mm0, mm4);   /* unpack high src3 bytes */

    paddw_r2r (mm3, mm1);       /* add lows */
    paddw_r2r (mm4, mm2);       /* add highs */

    movq_m2r (*src4, mm5);      /* load 8 src4 bytes */
    movq_r2r (mm5, mm6);        /* copy 8 src4 bytes */

    punpcklbw_r2r (mm0, mm5);   /* unpack low src4 bytes */
    punpckhbw_r2r (mm0, mm6);   /* unpack high src4 bytes */

    paddw_r2r (mm5, mm1);       /* add lows */
    paddw_r2r (mm6, mm2);       /* add highs */

    paddw_m2r (round4, mm1);
    psraw_i2r (2, mm1);         /* /4 */
    paddw_m2r (round4, mm2);
    psraw_i2r (2, mm2);         /* /4 */

    /* now have subtotal/4 in mm1 and mm2 */

    movq_m2r (*dest, mm3);      /* load 8 dest bytes */
    movq_r2r (mm3, mm4);        /* copy 8 dest bytes */

    packuswb_r2r (mm2, mm1);    /* pack (w/ saturation) */
    movq_r2r (mm1, mm2);        /* copy subresult */

    pxor_r2r (mm1, mm3);        /* xor srcavg and dest */
    pand_m2r (mask1, mm3);      /* mask lower bits */
    psrlq_i2r (1, mm3);         /* /2 */
    por_r2r (mm2, mm4);         /* or srcavg and dest */
    psubb_r2r (mm3, mm4);       /* subtract subresults */
    movq_r2m (mm4, *dest);      /* store result in dest */
}
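/*
 * Scalar reference for the arithmetic above (a sketch; the _ref name is
 * hypothetical). Like the MMX version it assumes the caller has zeroed mm0,
 * which serves as the zero register for the punpck unpacking, and that
 * round4 (defined elsewhere in this file; presumably four words of 2)
 * provides the "+2" rounding before the /4.
 */
static inline void interp_average_4_U8_ref (uint8_t * dest,
                                            const uint8_t * src1,
                                            const uint8_t * src2,
                                            const uint8_t * src3,
                                            const uint8_t * src4)
{
    int i;
    for (i = 0; i < 8; i++) {
        int avg4 = (src1[i] + src2[i] + src3[i] + src4[i] + 2) >> 2;
        dest[i] = (uint8_t) ((dest[i] + avg4 + 1) >> 1);
    }
}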
static __inline__ int qblock_sad_mmx(uint8_t *refblk, uint32_t h, uint32_t rowstride)
{
    int res;

    pxor_r2r (mm4, mm4);

    movq_r2r (mm0, mm5);        /* First row */
    movd_m2r (*refblk, mm6);
    pxor_r2r ( mm7, mm7);
    refblk += rowstride;
    punpcklbw_r2r ( mm7, mm5);
    punpcklbw_r2r ( mm7, mm6);
    movq_r2r ( mm5, mm7);
    psubusw_r2r ( mm6, mm5);
    psubusw_r2r ( mm7, mm6);
    paddw_r2r ( mm5, mm4);
    paddw_r2r ( mm6, mm4 );

    movq_r2r (mm1, mm5);        /* Second row */
    movd_m2r (*refblk, mm6);
    pxor_r2r ( mm7, mm7);
    refblk += rowstride;
    punpcklbw_r2r ( mm7, mm5);
    punpcklbw_r2r ( mm7, mm6);
    movq_r2r ( mm5, mm7);
    psubusw_r2r ( mm6, mm5);
    psubusw_r2r ( mm7, mm6);
    paddw_r2r ( mm5, mm4);
    paddw_r2r ( mm6, mm4 );

    if( h == 4 )
    {
        movq_r2r (mm2, mm5);    /* Third row */
        movd_m2r (*refblk, mm6);
        pxor_r2r ( mm7, mm7);
        refblk += rowstride;
        punpcklbw_r2r ( mm7, mm5);
        punpcklbw_r2r ( mm7, mm6);
        movq_r2r ( mm5, mm7);
        psubusw_r2r ( mm6, mm5);
        psubusw_r2r ( mm7, mm6);
        paddw_r2r ( mm5, mm4);
        paddw_r2r ( mm6, mm4 );

        movq_r2r (mm3, mm5);    /* Fourth row */
        movd_m2r (*refblk, mm6);
        pxor_r2r ( mm7, mm7);
        punpcklbw_r2r ( mm7, mm5);
        punpcklbw_r2r ( mm7, mm6);
        movq_r2r ( mm5, mm7);
        psubusw_r2r ( mm6, mm5);
        psubusw_r2r ( mm7, mm6);
        paddw_r2r ( mm5, mm4);
        paddw_r2r ( mm6, mm4 );
    }

    movq_r2r ( mm4, mm5 );
    psrlq_i2r ( 32, mm5 );
    paddw_r2r ( mm5, mm4 );
    movq_r2r ( mm4, mm6 );
    psrlq_i2r ( 16, mm6 );
    paddw_r2r ( mm6, mm4 );
    movd_r2m ( mm4, res );

    return res & 0xffff;
}
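/*
 * Scalar reference (a sketch). qblock_sad_mmx assumes the caller has already
 * loaded the current 4-wide block into mm0..mm3, one row per register with
 * the four bytes in the low dword; the psubusw pair computes |cur - ref| per
 * word without a dedicated absolute-difference instruction. The reference
 * below takes the current block as a pointer instead (qblock_sad_ref and
 * cur_stride are hypothetical names).
 */
static __inline__ int qblock_sad_ref(uint8_t *curblk, uint32_t cur_stride,
                                     uint8_t *refblk, uint32_t h, uint32_t rowstride)
{
    int res = 0;
    uint32_t y;
    int x;

    for (y = 0; y < h; y++) {           /* h is 2 or 4 rows */
        for (x = 0; x < 4; x++) {
            int d = curblk[x] - refblk[x];
            res += d < 0 ? -d : d;      /* |cur - ref| */
        }
        curblk += cur_stride;
        refblk += rowstride;
    }
    return res;
}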
int bsad_mmx(uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx,
             int hxf, int hyf, int hxb, int hyb, int h)
{
    uint8_t *pfa, *pfb, *pfc, *pba, *pbb, *pbc;
    int s, s1, s2;

    pfa = pf + hxf;
    pfb = pf + lx * hyf;
    pfc = pfb + hxf;

    pba = pb + hxb;
    pbb = pb + lx * hyb;
    pbc = pbb + hxb;

    s = 0;                      /* the accumulator */

    if (h > 0) {
        pxor_r2r(mm7, mm7);
        pxor_r2r(mm6, mm6);
        pcmpeqw_r2r(mm5, mm5);
        psubw_r2r(mm5, mm6);
        psllw_i2r(1, mm6);

        do {
            BSAD_LOAD(pf[0], mm0, mm1);
            BSAD_LOAD_ACC(pfa[0], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfb[0], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfc[0], mm2, mm3, mm0, mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            BSAD_LOAD(pb[0], mm2, mm3);
            BSAD_LOAD_ACC(pba[0], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbb[0], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbc[0], mm4, mm5, mm2, mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(p2[0], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);

            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            BSAD_LOAD(pf[8], mm0, mm1);
            BSAD_LOAD_ACC(pfa[8], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfb[8], mm2, mm3, mm0, mm1);
            BSAD_LOAD_ACC(pfc[8], mm2, mm3, mm0, mm1);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psrlw_i2r(2, mm0);
            psrlw_i2r(2, mm1);

            BSAD_LOAD(pb[8], mm2, mm3);
            BSAD_LOAD_ACC(pba[8], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbb[8], mm4, mm5, mm2, mm3);
            BSAD_LOAD_ACC(pbc[8], mm4, mm5, mm2, mm3);
            paddw_r2r(mm6, mm2);
            paddw_r2r(mm6, mm3);
            psrlw_i2r(2, mm2);
            psrlw_i2r(2, mm3);

            paddw_r2r(mm2, mm0);
            paddw_r2r(mm3, mm1);
            psrlw_i2r(1, mm6);
            paddw_r2r(mm6, mm0);
            paddw_r2r(mm6, mm1);
            psllw_i2r(1, mm6);
            psrlw_i2r(1, mm0);
            psrlw_i2r(1, mm1);
            packuswb_r2r(mm1, mm0);

            movq_m2r(p2[8], mm1);
            movq_r2r(mm0, mm2);
            psubusb_r2r(mm1, mm0);
            psubusb_r2r(mm2, mm1);
            por_r2r(mm1, mm0);

            movq_r2r(mm0, mm1);
            punpcklbw_r2r(mm7, mm0);
            punpckhbw_r2r(mm7, mm1);
            paddw_r2r(mm1, mm0);
            movq_r2r(mm0, mm1);
            punpcklwd_r2r(mm7, mm0);
            punpckhwd_r2r(mm7, mm1);
            paddd_r2r(mm1, mm0);
            movd_r2g(mm0, s1);
            psrlq_i2r(32, mm0);
            movd_r2g(mm0, s2);
            s += s1 + s2;

            p2 += lx;
            pf += lx; pfa += lx; pfb += lx; pfc += lx;
            pb += lx; pba += lx; pbb += lx; pbc += lx;
            h--;
        } while (h > 0);
    }
    emms();
    return s;
}
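/*
 * Per-pixel arithmetic of bsad_mmx in scalar form (a sketch; bsad_ref_pixel
 * is a hypothetical name). BSAD_LOAD / BSAD_LOAD_ACC are macros defined
 * elsewhere that load 8 bytes unpacked to words and accumulate them; mm6 is
 * kept loaded with the word constant 2 (built via pcmpeqw/psubw/psllw) for
 * the "+2" rounding and is temporarily halved to 1 for the "+1" stage.
 */
static inline int bsad_ref_pixel(int pf, int pfa, int pfb, int pfc,
                                 int pb, int pba, int pbb, int pbc, int p2)
{
    int f = (pf + pfa + pfb + pfc + 2) >> 2;    /* forward half-pel interp */
    int b = (pb + pba + pbb + pbc + 2) >> 2;    /* backward half-pel interp */
    int d = ((f + b + 1) >> 1) - p2;            /* bidirectional prediction */
    return d < 0 ? -d : d;                      /* contribution to the SAD */
}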
static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
{
    int y, x;
    int32_t ff, fr;
    int fc;

    /* Detect interlacing */
    fc = 0;
    pxor_r2r( mm7, mm7 );
    for( y = 0; y < 9; y += 2 )
    {
        ff = fr = 0;
        pxor_r2r( mm5, mm5 );
        pxor_r2r( mm6, mm6 );
        for( x = 0; x < 8; x += 4 )
        {
            movd_m2r( src[        x], mm0 );
            movd_m2r( src[1*i_src+x], mm1 );
            movd_m2r( src[2*i_src+x], mm2 );
            movd_m2r( src[3*i_src+x], mm3 );

            punpcklbw_r2r( mm7, mm0 );
            punpcklbw_r2r( mm7, mm1 );
            punpcklbw_r2r( mm7, mm2 );
            punpcklbw_r2r( mm7, mm3 );

            movq_r2r( mm0, mm4 );

            psubw_r2r( mm1, mm0 );
            psubw_r2r( mm2, mm4 );

            psubw_r2r( mm1, mm2 );
            psubw_r2r( mm1, mm3 );

            pmaddwd_r2r( mm0, mm0 );
            pmaddwd_r2r( mm4, mm4 );
            pmaddwd_r2r( mm2, mm2 );
            pmaddwd_r2r( mm3, mm3 );

            paddd_r2r( mm0, mm2 );
            paddd_r2r( mm4, mm3 );

            paddd_r2r( mm2, mm5 );
            paddd_r2r( mm3, mm6 );
        }

        movq_r2r( mm5, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm5 );
        movd_r2m( mm5, fr );

        movq_r2r( mm6, mm0 );
        psrlq_i2r( 32, mm0 );
        paddd_r2r( mm0, mm6 );
        movd_r2m( mm6, ff );

        if( ff < 6*fr/8 && fr > 32 )
            fc++;

        src += 2*i_src;
    }
    return fc;
}
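/*
 * Scalar version of the detection metric above (a sketch;
 * XDeint8x8Detect_ref is a hypothetical name). For each window of four
 * lines, fr accumulates squared differences between adjacent lines
 * (opposite fields) and ff between lines two apart (same field); a window
 * where the inter-field difference clearly dominates (ff < 6*fr/8, with fr
 * above a small noise floor) counts as combed.
 */
static inline int XDeint8x8Detect_ref( uint8_t *src, int i_src )
{
    int y, x, fc = 0;

    for( y = 0; y < 9; y += 2 )
    {
        int32_t ff = 0, fr = 0;
        for( x = 0; x < 8; x++ )
        {
            int l0 = src[        x], l1 = src[1*i_src+x];
            int l2 = src[2*i_src+x], l3 = src[3*i_src+x];
            fr += (l0-l1)*(l0-l1) + (l2-l1)*(l2-l1);    /* inter-field */
            ff += (l0-l2)*(l0-l2) + (l3-l1)*(l3-l1);    /* intra-field */
        }
        if( ff < 6*fr/8 && fr > 32 )
            fc++;
        src += 2*i_src;
    }
    return fc;
}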