int main(int ac, char **av) { int i, j, k, n; unsigned char dat0[8] = { 0x01, 0xf2, 0x03, 0x04, 0x05, 0x06, 0xf7, 0x08 }; long long *datp = (long long *)&dat0; int16_t dat1[8] = { 0x10, 0x20, -0x130, -0x140, 0x50, -0x160, -0x170, 0x80 }; volatile uint8_t *rfp = dat0; volatile int16_t *bp = dat1; unsigned char ans1[8], ans2[8]; n = 0; for( i=-32768; i<32768; ++i ) { j = 0; while( j < 256 ) { for( k=0; k<8; ++k ) { dat0[k] = i; dat1[k] = j++; } movq_m2r(m_(&rfp[0]),mm1); /* rfp[0..7] */ pxor_r2r(mm3,mm3); pxor_r2r(mm4,mm4); movq_m2r(m_(&bp[0]),mm5); /* bp[0..3] */ movq_r2r(mm1,mm2); movq_m2r(m_(&bp[4]),mm6); /* bp[4..7] */ punpcklbw_r2r(mm3,mm1); /* rfp[0,2,4,6] */ punpckhbw_r2r(mm3,mm2); /* rfp[1,3,5,7] */ paddsw_r2r(mm5,mm1); /* bp[0..3] */ paddsw_r2r(mm6,mm2); /* bp[4..7] */ pcmpgtw_r2r(mm1,mm3); pcmpgtw_r2r(mm2,mm4); pandn_r2r(mm1,mm3); pandn_r2r(mm2,mm4); packuswb_r2r(mm4,mm3); movq_r2m(mm3,m_(&ans1[0])); emms(); ans2[0] = clip(bp[0] + rfp[0]); ans2[1] = clip(bp[1] + rfp[1]); ans2[2] = clip(bp[2] + rfp[2]); ans2[3] = clip(bp[3] + rfp[3]); ans2[4] = clip(bp[4] + rfp[4]); ans2[5] = clip(bp[5] + rfp[5]); ans2[6] = clip(bp[6] + rfp[6]); ans2[7] = clip(bp[7] + rfp[7]); if( *(uint64_t *)&ans1[0] != *(uint64_t *)&ans2[0] ) { printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", i, ans1[0], ans1[1], ans1[2], ans1[3], ans1[4], ans1[5], ans1[6], ans1[7]); printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", j, ans2[0], ans2[1], ans2[2], ans2[3], ans2[4], ans2[5], ans2[6], ans2[7]); // exit(0); } n += 8; } } printf("n=%d\n",n); return 0; }
static inline void mmx_end(uint8_t *src3, uint8_t *src5, uint8_t *dst, int X) { punpcklbw_m2r (mm_cpool[0], mm4); punpckhbw_m2r (mm_cpool[0], mm5); psubusw_r2r (mm2, mm0); psubusw_r2r (mm3, mm1); movq_m2r (src5[X], mm2); movq_m2r (src5[X], mm3); punpcklbw_m2r (mm_cpool[0], mm2); punpckhbw_m2r (mm_cpool[0], mm3); psubusw_r2r (mm2, mm0); psubusw_r2r (mm3, mm1); psrlw_i2r (3, mm0); psrlw_i2r (3, mm1); psubw_r2r (mm6, mm4); psubw_r2r (mm7, mm5); packuswb_r2r (mm1,mm0); movq_r2r (mm4, mm6); movq_r2r (mm5, mm7); pcmpgtw_m2r (mm_lthr, mm4); pcmpgtw_m2r (mm_lthr, mm5); pcmpgtw_m2r (mm_hthr, mm6); pcmpgtw_m2r (mm_hthr, mm7); packsswb_r2r (mm5, mm4); packsswb_r2r (mm7, mm6); pxor_r2r (mm6, mm4); movq_r2r (mm4, mm5); pandn_r2r (mm0, mm4); pand_m2r (src3[X], mm5); por_r2r (mm4, mm5); movq_r2m (mm5, dst[X]); }
void deinterlace_bob_yuv_mmx(uint8_t *pdst, uint8_t *psrc, int width, int height ) { int Line; long long* YVal1; long long* YVal2; long long* YVal3; long long* Dest; uint8_t* pEvenLines = psrc; uint8_t* pOddLines = psrc+width; int LineLength = width; int Pitch = width * 2; int IsOdd = 1; long EdgeDetect = 625; long JaggieThreshold = 73; int n; unsigned long long qwEdgeDetect; unsigned long long qwThreshold; const unsigned long long Mask = 0xfefefefefefefefeULL; const unsigned long long YMask = 0x00ff00ff00ff00ffULL; qwEdgeDetect = EdgeDetect; qwEdgeDetect += (qwEdgeDetect << 48) + (qwEdgeDetect << 32) + (qwEdgeDetect << 16); qwThreshold = JaggieThreshold; qwThreshold += (qwThreshold << 48) + (qwThreshold << 32) + (qwThreshold << 16); // copy first even line no matter what, and the first odd line if we're // processing an odd field. ac_memcpy(pdst, pEvenLines, LineLength); if (IsOdd) ac_memcpy(pdst + LineLength, pOddLines, LineLength); height = height / 2; for (Line = 0; Line < height - 1; ++Line) { if (IsOdd) { YVal1 = (long long *)(pOddLines + Line * Pitch); YVal2 = (long long *)(pEvenLines + (Line + 1) * Pitch); YVal3 = (long long *)(pOddLines + (Line + 1) * Pitch); Dest = (long long *)(pdst + (Line * 2 + 2) * LineLength); } else { YVal1 = (long long *)(pEvenLines + Line * Pitch); YVal2 = (long long *)(pOddLines + Line * Pitch); YVal3 = (long long *)(pEvenLines + (Line + 1) * Pitch); Dest = (long long *)(pdst + (Line * 2 + 1) * LineLength); } // For ease of reading, the comments below assume that we're operating on an odd // field (i.e., that bIsOdd is true). The exact same processing is done when we // operate on an even field, but the roles of the odd and even fields are reversed. // It's just too cumbersome to explain the algorithm in terms of "the next odd // line if we're doing an odd field, or the next even line if we're doing an // even field" etc. So wherever you see "odd" or "even" below, keep in mind that // half the time this function is called, those words' meanings will invert. // Copy the odd line to the overlay verbatim. ac_memcpy((char *)Dest + LineLength, YVal3, LineLength); n = LineLength >> 3; while( n-- ) { movq_m2r (*YVal1++, mm0); movq_m2r (*YVal2++, mm1); movq_m2r (*YVal3++, mm2); // get intensities in mm3 - 4 movq_r2r ( mm0, mm3 ); movq_r2r ( mm1, mm4 ); movq_r2r ( mm2, mm5 ); pand_m2r ( *&YMask, mm3 ); pand_m2r ( *&YMask, mm4 ); pand_m2r ( *&YMask, mm5 ); // get average in mm0 pand_m2r ( *&Mask, mm0 ); pand_m2r ( *&Mask, mm2 ); psrlw_i2r ( 01, mm0 ); psrlw_i2r ( 01, mm2 ); paddw_r2r ( mm2, mm0 ); // work out (O1 - E) * (O2 - E) / 2 - EdgeDetect * (O1 - O2) ^ 2 >> 12 // result will be in mm6 psrlw_i2r ( 01, mm3 ); psrlw_i2r ( 01, mm4 ); psrlw_i2r ( 01, mm5 ); movq_r2r ( mm3, mm6 ); psubw_r2r ( mm4, mm6 ); //mm6 = O1 - E movq_r2r ( mm5, mm7 ); psubw_r2r ( mm4, mm7 ); //mm7 = O2 - E pmullw_r2r ( mm7, mm6 ); // mm6 = (O1 - E) * (O2 - E) movq_r2r ( mm3, mm7 ); psubw_r2r ( mm5, mm7 ); // mm7 = (O1 - O2) pmullw_r2r ( mm7, mm7 ); // mm7 = (O1 - O2) ^ 2 psrlw_i2r ( 12, mm7 ); // mm7 = (O1 - O2) ^ 2 >> 12 pmullw_m2r ( *&qwEdgeDetect, mm7 );// mm7 = EdgeDetect * (O1 - O2) ^ 2 >> 12 psubw_r2r ( mm7, mm6 ); // mm6 is what we want pcmpgtw_m2r ( *&qwThreshold, mm6 ); movq_r2r ( mm6, mm7 ); pand_r2r ( mm6, mm0 ); pandn_r2r ( mm1, mm7 ); por_r2r ( mm0, mm7 ); movq_r2m ( mm7, *Dest++ ); } } // Copy last odd line if we're processing an even field. if (! IsOdd) { ac_memcpy(pdst + (height * 2 - 1) * LineLength, pOddLines + (height - 1) * Pitch, LineLength); } // clear out the MMX registers ready for doing floating point // again emms(); }