// convolve I by a 2rx1 triangle filter (uses SSE) void convTri( float *I, float *O, int h, int w, int d, int r, int s ) { r++; float nrm = 1.0f/(r*r*r*r); int i, j, k=(s-1)/2, h0, h1, w0; if(h%4==0) h0=h1=h; else { h0=h-(h%4); h1=h0+4; } w0=(w/s)*s; float *T=(float*) alMalloc(2*h1*sizeof(float),16), *U=T+h1; while(d-- > 0) { // initialize T and U for(j=0; j<h0; j+=4) STR(U[j], STR(T[j], LDu(I[j]))); for(i=1; i<r; i++) for(j=0; j<h0; j+=4) INC(U[j],INC(T[j],LDu(I[j+i*h]))); for(j=0; j<h0; j+=4) STR(U[j],MUL(nrm,(SUB(MUL(2,LD(U[j])),LD(T[j]))))); for(j=0; j<h0; j+=4) STR(T[j],0); for(j=h0; j<h; j++ ) U[j]=T[j]=I[j]; for(i=1; i<r; i++) for(j=h0; j<h; j++ ) U[j]+=T[j]+=I[j+i*h]; for(j=h0; j<h; j++ ) { U[j] = nrm * (2*U[j]-T[j]); T[j]=0; } // prepare and convolve each column in turn k++; if(k==s) { k=0; convTriY(U,O,h,r-1,s); O+=h/s; } for( i=1; i<w0; i++ ) { float *Il=I+(i-1-r)*h; if(i<=r) Il=I+(r-i)*h; float *Im=I+(i-1)*h; float *Ir=I+(i-1+r)*h; if(i>w-r) Ir=I+(2*w-r-i)*h; for( j=0; j<h0; j+=4 ) { INC(T[j],ADD(LDu(Il[j]),LDu(Ir[j]),MUL(-2,LDu(Im[j])))); INC(U[j],MUL(nrm,LD(T[j]))); } for( j=h0; j<h; j++ ) U[j]+=nrm*(T[j]+=Il[j]+Ir[j]-2*Im[j]); k++; if(k==s) { k=0; convTriY(U,O,h,r-1,s); O+=h/s; } } I+=w*h; } alFree(T); }
// helper for gradHist, quantize O and M into O0, O1 and M0, M1 (uses sse) void gradQuantize( float *O, float *M, int *O0, int *O1, float *M0, float *M1, int nb, int n, float norm, int nOrients, bool full, bool interpolate ) { // assumes all *OUTPUT* matrices are 4-byte aligned int i, o0, o1; float o, od, m; __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _od, _m, *_M0, *_M1; // define useful constants const float oMult=(float)nOrients/(full?2*PI:PI); const int oMax=nOrients*nb; const __m128 _norm=SET(norm), _oMult=SET(oMult), _nbf=SET((float)nb); const __m128i _oMax=SET(oMax), _nb=SET(nb); // perform the majority of the work with sse _O0=(__m128i*) O0; _O1=(__m128i*) O1; _M0=(__m128*) M0; _M1=(__m128*) M1; if( interpolate ) for( i=0; i<=n-4; i+=4 ) { _o=MUL(LDu(O[i]),_oMult); _o0=CVT(_o); _od=SUB(_o,CVT(_o0)); _o0=CVT(MUL(CVT(_o0),_nbf)); _o0=AND(CMPGT(_oMax,_o0),_o0); *_O0++=_o0; _o1=ADD(_o0,_nb); _o1=AND(CMPGT(_oMax,_o1),_o1); *_O1++=_o1; _m=MUL(LDu(M[i]),_norm); *_M1=MUL(_od,_m); *_M0++=SUB(_m,*_M1); _M1++; } else for( i=0; i<=n-4; i+=4 ) { _o=MUL(LDu(O[i]),_oMult); _o0=CVT(ADD(_o,SET(.5f))); _o0=CVT(MUL(CVT(_o0),_nbf)); _o0=AND(CMPGT(_oMax,_o0),_o0); *_O0++=_o0; *_M0++=MUL(LDu(M[i]),_norm); *_M1++=SET(0.f); *_O1++=SET(0); } // compute trailing locations without sse if( interpolate ) for(; i<n; i++ ) { o=O[i]*oMult; o0=(int) o; od=o-o0; o0*=nb; if(o0>=oMax) o0=0; O0[i]=o0; o1=o0+nb; if(o1==oMax) o1=0; O1[i]=o1; m=M[i]*norm; M1[i]=od*m; M0[i]=m-M1[i]; } else for(; i<n; i++ ) { o=O[i]*oMult; o0=(int) (o+.5f); o0*=nb; if(o0>=oMax) o0=0; O0[i]=o0; M0[i]=M[i]*norm; M1[i]=0; O1[i]=0; } }
// convolve I by a [1 1; 1 1] filter (uses SSE) void conv11( float *I, float *O, int h, int w, int d, int side, int s ) { const float nrm = 0.25f; int i, j; float *I0, *I1, *T = (float*) alMalloc(h*sizeof(float),16); for( int d0=0; d0<d; d0++ ) for( i=s/2; i<w; i+=s ) { I0=I1=I+i*h+d0*h*w; if(side%2) { if(i<w-1) I1+=h; } else { if(i) I0-=h; } for( j=0; j<h-4; j+=4 ) STR( T[j], MUL(nrm,ADD(LDu(I0[j]),LDu(I1[j]))) ); for( ; j<h; j++ ) T[j]=nrm*(I0[j]+I1[j]); conv11Y(T,O,h,side,s); O+=h/s; } alFree(T); }
// convolve I by a [1 p 1] filter (uses SSE) void convTri1( float *I, float *O, int h, int w, int d, float p, int s ) { const float nrm = 1.0f/((p+2)*(p+2)); int i, j, h0=h-(h%4); float *Il, *Im, *Ir, *T=(float*) alMalloc(h*sizeof(float),16); for( int d0=0; d0<d; d0++ ) for( i=s/2; i<w; i+=s ) { Il=Im=Ir=I+i*h+d0*h*w; if(i>0) Il-=h; if(i<w-1) Ir+=h; for( j=0; j<h0; j+=4 ) STR(T[j],MUL(nrm,ADD(ADD(LDu(Il[j]),MUL(p,LDu(Im[j]))),LDu(Ir[j])))); for( j=h0; j<h; j++ ) T[j]=nrm*(Il[j]+p*Im[j]+Ir[j]); convTri1Y(T,O,h,p,s); O+=h/s; } alFree(T); }
// compute gradient magnitude and orientation at each location (uses sse) void gradMag( float *I, float *M, float *O, int h, int w, int d, bool full ) { int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult = 10000.0f; // allocate memory for storing one column of output (padded so h4%4==0) h4 = (h % 4 == 0) ? h : h - (h % 4) + 4; s = d * h4 * sizeof(float); M2 = (float*) alMalloc(s, 16); _M2 = (__m128*) M2; Gx = (float*) alMalloc(s, 16); _Gx = (__m128*) Gx; Gy = (float*) alMalloc(s, 16); _Gy = (__m128*) Gy; // compute gradient magnitude and orientation for each column for ( x = 0; x < w; x++ ) { // compute gradients (Gx, Gy) with maximum squared magnitude (M2) for (c = 0; c < d; c++) { grad1( I + x * h + c * w * h, Gx + c * h4, Gy + c * h4, h, w, x ); for ( y = 0; y < h4 / 4; y++ ) { y1 = h4 / 4 * c + y; _M2[y1] = ADD(MUL(_Gx[y1], _Gx[y1]), MUL(_Gy[y1], _Gy[y1])); if ( c == 0 ) { continue; } _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m, _M2[y1]), ANDNOT(_m, _M2[y]) ); _Gx[y] = OR( AND(_m, _Gx[y1]), ANDNOT(_m, _Gx[y]) ); _Gy[y] = OR( AND(_m, _Gy[y1]), ANDNOT(_m, _Gy[y]) ); } } // compute gradient mangitude (M) and normalize Gx for ( y = 0; y < h4 / 4; y++ ) { _m = MINsse( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); if (O) { _Gx[y] = MUL( MUL(_Gx[y], _m), SET(acMult) ); } if (O) { _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); } }; memcpy( M + x * h, M2, h * sizeof(float) ); // compute and store gradient orientation (O) via table lookup if ( O != 0 ) for ( y = 0; y < h; y++ ) { O[x * h + y] = acost[(int)Gx[y]]; } if ( O != 0 && full ) { y1 = ((~size_t(O + x * h) + 1) & 15) / 4; y = 0; for ( ; y < y1; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } for ( ; y < h - 4; y += 4 ) STRu( O[y + x * h], ADD( LDu(O[y + x * h]), AND(CMPLT(LDu(Gy[y]), SET(0.f)), SET(PI)) ) ); for ( ; y < h; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } } } alFree(Gx); alFree(Gy); alFree(M2); }
void pcl::people::HOG::gradQuantize (float *O, float *M, int *O0, int *O1, float *M0, float *M1, int n_orients, int nb, int n, float norm) const { #if defined(__SSE2__) // assumes all *OUTPUT* matrices are 4-byte aligned int i, o0, o1; float o, od, m; __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _o0f, _m, *_M0, *_M1; // define useful constants const float oMult=(float)n_orients/M_PI; const int oMax=n_orients*nb; const __m128 _norm=SET(norm), _oMult=SET(oMult), _nbf=SET((float)nb); const __m128i _oMax=SET(oMax), _nb=SET(nb); // perform the majority of the work with sse _O0=(__m128i*) O0; _O1=(__m128i*) O1; _M0=(__m128*) M0; _M1=(__m128*) M1; for( i=0; i<=n-4; i+=4 ) { _o=MUL(LDu(O[i]),_oMult); _o0f=CVT(CVT(_o)); _o0=CVT(MUL(_o0f,_nbf)); _o1=ADD(_o0,_nb); _o1=AND(CMPGT(_oMax,_o1),_o1); *_O0++=_o0; *_O1++=_o1; _m=MUL(LDu(M[i]),_norm); *_M1=MUL(SUB(_o,_o0f),_m); *_M0=SUB(_m,*_M1); _M0++; _M1++; } // compute trailing locations without sse for( ; i<n; i++ ) { o=O[i]*oMult; m=M[i]*norm; o0=(int) o; od=o-o0; o0*=nb; o1=o0+nb; if(o1==oMax) o1=0; O0[i]=o0; O1[i]=o1; M1[i]=od*m; M0[i]=m-M1[i]; } #else int i, o0, o1; float o, od, m; // define useful constants const float oMult=(float)n_orients/M_PI; const int oMax=n_orients*nb; // compute trailing locations without sse for( i = 0; i<n; i++ ) { o=O[i]*oMult; m=M[i]*norm; o0=(int) o; od=o-o0; o0*=nb; o1=o0+nb; if(o1==oMax) o1=0; O0[i]=o0; O1[i]=o1; M1[i]=od*m; M0[i]=m-M1[i]; } #endif }
// compute x and y gradients for just one column (uses sse) void grad1( float *I, float *Gx, float *Gy, int h, int w, int x ) { int y, y1; float *Ip, *In, r; __m128 *_Ip, *_In, *_G, _r; // compute column of Gx Ip = I - h; In = I + h; r = .5f; if (x == 0) { r = 1; Ip += h; } else if (x == w - 1) { r = 1; In -= h; } if ( h < 4 || h % 4 > 0 || (size_t(I) & 15) || (size_t(Gx) & 15) ) { for ( y = 0; y < h; y++ ) { *Gx++ = (*In++ -*Ip++) * r; } } else { _G = (__m128*) Gx; _Ip = (__m128*) Ip; _In = (__m128*) In; _r = SET(r); for (y = 0; y < h; y += 4) { *_G++ = MUL(SUB(*_In++, *_Ip++), _r); } } // compute column of Gy #define GRADY(r) *Gy++=(*In++-*Ip++)*r; Ip = I; In = Ip + 1; // GRADY(1); Ip--; for(y=1; y<h-1; y++) GRADY(.5f); In--; GRADY(1); y1 = ((~((size_t) Gy) + 1) & 15) / 4; if (y1 == 0) { y1 = 4; } if (y1 > h - 1) { y1 = h - 1; } GRADY(1); Ip--; for (y = 1; y < y1; y++) { GRADY(.5f); } _r = SET(.5f); _G = (__m128*) Gy; for (; y + 4 < h - 1; y += 4, Ip += 4, In += 4, Gy += 4) { *_G++ = MUL(SUB(LDu(*In), LDu(*Ip)), _r); } for (; y < h - 1; y++) { GRADY(.5f); } In--; GRADY(1); #undef GRADY }
// convolve I by a 2r+1 x 2r+1 ones filter (uses SSE) void convBox( float *I, float *O, int h, int w, int d, int r, int s ) { float nrm = 1.0f/((2*r+1)*(2*r+1)); int i, j, k=(s-1)/2, h0, h1, w0; // s=1 if(h%4==0) h0=h1=h; else { h0=h-(h%4); h1=h0+4; } w0=(w/s)*s; float *T=(float*) alMalloc(h1*sizeof(float),16); while(d-- > 0) { // initialize T memset( T, 0, h1*sizeof(float) ); for(i=0; i<=r; i++) for(j=0; j<h0; j+=4) INC(T[j],LDu(I[j+i*h])); for(j=0; j<h0; j+=4) STR(T[j],MUL(nrm,SUB(MUL(2,LD(T[j])),LDu(I[j+r*h])))); for(i=0; i<=r; i++) for(j=h0; j<h; j++ ) T[j]+=I[j+i*h]; // assemble just perform like following 2 lines for(j=h0; j<h; j++ ) T[j]=nrm*(2*T[j]-I[j+r*h]); // prepare and convolve each column in turn k++; if(k==s) { k=0; convBoxY(T,O,h,r,s); O+=h/s; } for( i=1; i<w0; i++ ) { float *Il=I+(i-1-r)*h; if(i<=r) Il=I+(r-i)*h; float *Ir=I+(i+r)*h; if(i>=w-r) Ir=I+(2*w-r-i-1)*h; for(j=0; j<h0; j+=4) DEC(T[j],MUL(nrm,SUB(LDu(Il[j]),LDu(Ir[j])))); for(j=h0; j<h; j++ ) T[j]-=nrm*(Il[j]-Ir[j]); k++; if(k==s) { k=0; convBoxY(T,O,h,r,s); O+=h/s; } } I+=w*h; } alFree(T); }
// run nIter iterations of Horn & Schunk optical flow (alters Vx, Vy) void opticalFlowHsMex( float *Vx, float *Vy, const float *Ex, const float *Ey, const float *Et, const float *Z, const int h, const int w, const int nIter ) { int x, y, x1, i, t, s; float my, mx, m, *Vx0, *Vy0; s=w*h*sizeof(float); Vx0=new float[s]; Vy0=new float[s]; for( t=0; t<nIter; t++ ) { memcpy(Vx0,Vx,s); memcpy(Vy0,Vy,s); for( x=1; x<w-1; x++ ) { // do as much work as possible in SSE (assume non-aligned memory) for( y=1; y<h-4; y+=4 ) { x1=x*h; i=x1+y; __m128 _mx, _my, _m; _my=MUL(ADD(LDu(Vy0[x1-h+y]),LDu(Vy0[x1+h+y]), LDu(Vy0[x1+y-1]),LDu(Vy0[x1+y+1])),.25f); _mx=MUL(ADD(LDu(Vx0[x1-h+y]),LDu(Vx0[x1+h+y]), LDu(Vx0[x1+y-1]),LDu(Vx0[x1+y+1])),.25f); _m=MUL(ADD(MUL(LDu(Ey[i]),_my),MUL(LDu(Ex[i]),_mx), LDu(Et[i])),LDu(Z[i])); STRu(Vx[i],SUB(_mx,MUL(LDu(Ex[i]),_m))); STRu(Vy[i],SUB(_my,MUL(LDu(Ey[i]),_m))); } // do remainder of work in regular loop for( ; y<h-1; y++ ) { x1=x*h; i=x1+y; mx=.25f*(Vx0[x1-h+y]+Vx0[x1+h+y]+Vx0[x1+y-1]+Vx0[x1+y+1]); my=.25f*(Vy0[x1-h+y]+Vy0[x1+h+y]+Vy0[x1+y-1]+Vy0[x1+y+1]); m = (Ex[i]*mx + Ey[i]*my + Et[i])*Z[i]; Vx[i]=mx-Ex[i]*m; Vy[i]=my-Ey[i]*m; } } } delete [] Vx0; delete [] Vy0; }