// run nIter iterations of Horn & Schunk optical flow (alters Vx, Vy) void opticalFlowHsMex( float *Vx, float *Vy, const float *Ex, const float *Ey, const float *Et, const float *Z, const int h, const int w, const int nIter ) { int x, y, x1, i, t, s; float my, mx, m, *Vx0, *Vy0; s=w*h*sizeof(float); Vx0=new float[s]; Vy0=new float[s]; for( t=0; t<nIter; t++ ) { memcpy(Vx0,Vx,s); memcpy(Vy0,Vy,s); for( x=1; x<w-1; x++ ) { // do as much work as possible in SSE (assume non-aligned memory) for( y=1; y<h-4; y+=4 ) { x1=x*h; i=x1+y; __m128 _mx, _my, _m; _my=MUL(ADD(LDu(Vy0[x1-h+y]),LDu(Vy0[x1+h+y]), LDu(Vy0[x1+y-1]),LDu(Vy0[x1+y+1])),.25f); _mx=MUL(ADD(LDu(Vx0[x1-h+y]),LDu(Vx0[x1+h+y]), LDu(Vx0[x1+y-1]),LDu(Vx0[x1+y+1])),.25f); _m=MUL(ADD(MUL(LDu(Ey[i]),_my),MUL(LDu(Ex[i]),_mx), LDu(Et[i])),LDu(Z[i])); STRu(Vx[i],SUB(_mx,MUL(LDu(Ex[i]),_m))); STRu(Vy[i],SUB(_my,MUL(LDu(Ey[i]),_m))); } // do remainder of work in regular loop for( ; y<h-1; y++ ) { x1=x*h; i=x1+y; mx=.25f*(Vx0[x1-h+y]+Vx0[x1+h+y]+Vx0[x1+y-1]+Vx0[x1+y+1]); my=.25f*(Vy0[x1-h+y]+Vy0[x1+h+y]+Vy0[x1+y-1]+Vy0[x1+y+1]); m = (Ex[i]*mx + Ey[i]*my + Et[i])*Z[i]; Vx[i]=mx-Ex[i]*m; Vy[i]=my-Ey[i]*m; } } } delete [] Vx0; delete [] Vy0; }
// compute gradient magnitude and orientation at each location (uses sse) void gradMag( float *I, float *M, float *O, int h, int w, int d, bool full ) { int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult = 10000.0f; // allocate memory for storing one column of output (padded so h4%4==0) h4 = (h % 4 == 0) ? h : h - (h % 4) + 4; s = d * h4 * sizeof(float); M2 = (float*) alMalloc(s, 16); _M2 = (__m128*) M2; Gx = (float*) alMalloc(s, 16); _Gx = (__m128*) Gx; Gy = (float*) alMalloc(s, 16); _Gy = (__m128*) Gy; // compute gradient magnitude and orientation for each column for ( x = 0; x < w; x++ ) { // compute gradients (Gx, Gy) with maximum squared magnitude (M2) for (c = 0; c < d; c++) { grad1( I + x * h + c * w * h, Gx + c * h4, Gy + c * h4, h, w, x ); for ( y = 0; y < h4 / 4; y++ ) { y1 = h4 / 4 * c + y; _M2[y1] = ADD(MUL(_Gx[y1], _Gx[y1]), MUL(_Gy[y1], _Gy[y1])); if ( c == 0 ) { continue; } _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m, _M2[y1]), ANDNOT(_m, _M2[y]) ); _Gx[y] = OR( AND(_m, _Gx[y1]), ANDNOT(_m, _Gx[y]) ); _Gy[y] = OR( AND(_m, _Gy[y1]), ANDNOT(_m, _Gy[y]) ); } } // compute gradient mangitude (M) and normalize Gx for ( y = 0; y < h4 / 4; y++ ) { _m = MINsse( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); if (O) { _Gx[y] = MUL( MUL(_Gx[y], _m), SET(acMult) ); } if (O) { _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); } }; memcpy( M + x * h, M2, h * sizeof(float) ); // compute and store gradient orientation (O) via table lookup if ( O != 0 ) for ( y = 0; y < h; y++ ) { O[x * h + y] = acost[(int)Gx[y]]; } if ( O != 0 && full ) { y1 = ((~size_t(O + x * h) + 1) & 15) / 4; y = 0; for ( ; y < y1; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } for ( ; y < h - 4; y += 4 ) STRu( O[y + x * h], ADD( LDu(O[y + x * h]), AND(CMPLT(LDu(Gy[y]), SET(0.f)), SET(PI)) ) ); for ( ; y < h; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } } } alFree(Gx); alFree(Gy); alFree(M2); }