double calcPi_simd_rcp(void) { double x; int i; double width = 1./(double) num_rects; // __m256d __width = _mm256_set1_pd(width); // __m256d __half = _mm256_set1_pd(0.5); __m256d __four = _mm256_set1_pd(4.0); __m256d __one = _mm256_set1_pd(1.0); // __m256d __sum = _mm256_set1_pd(0.0); __m256d __i = _mm256_set_pd(0.5, 1.5, 2.5, 3.5); for (i = 0; i < num_rects; i += 4) { __m256d __x = __i *__width; __m256d __y = RCP(__one + __x*__x); __sum += __four * __y; // (__one + __x*__x); __i = __i + __four; } double sum; // sum = ((double*) &__sum)[0]; sum += ((double*) &__sum)[1]; sum += ((double*) &__sum)[2]; sum += ((double*) &__sum)[3]; // return width*sum; }
// normalize gradient magnitude at each location (uses sse) void gradMagNorm( float *M, float *S, int h, int w, float norm ) { __m128 *_M, *_S, _norm; int i=0, n=h*w, n4=n/4; _S = (__m128*) S; _M = (__m128*) M; _norm = SET(norm); bool sse = !(size_t(M)&15) && !(size_t(S)&15); if(sse) for(; i<n4; i++) { *_M=MUL(*_M,RCP(ADD(*_S++,_norm))); _M++; } if(sse) i*=4; for(; i<n; i++) M[i] /= (S[i] + norm); }
// compute gradient magnitude and orientation at each location (uses sse) void gradMag( float *I, float *M, float *O, int h, int w, int d ) { int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult=25000/2.02f; // allocate memory for storing one column of output (padded so h4%4==0) h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float); M2=(float*) alMalloc(s,16); _M2=(__m128*) M2; Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx; Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy; // compute gradient magnitude and orientation for each column for( x=0; x<w; x++ ) { // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x ); for( y=0; y<d*h4/4; y++ ) _M2[y]=ADD(MUL(_Gx[y],_Gx[y]),MUL(_Gy[y],_Gy[y])); // store gradients with maximum response in the first channel for(c=1; c<d; c++) { for( y=0; y<h4/4; y++ ) { y1=h4/4*c+y; _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m,_M2[y1]), ANDNOT(_m,_M2[y]) ); _Gx[y] = OR( AND(_m,_Gx[y1]), ANDNOT(_m,_Gx[y]) ); _Gy[y] = OR( AND(_m,_Gy[y1]), ANDNOT(_m,_Gy[y]) ); } } // compute gradient mangitude (M) and normalize Gx for( y=0; y<h4/4; y++ ) { _m = MIN( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); _Gx[y] = MUL( MUL(_Gx[y],_m), SET(acMult) ); _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); }; memcpy( M+x*h, M2, h*sizeof(float) ); // compute and store gradient orientation (O) via table lookup if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]]; } alFree(Gx); alFree(Gy); alFree(M2); }
// gradMagNorm( M, S, norm ) - operates on M - see gradientMag.m // gradientMex('gradientMagNorm',M,S,normConst); // normalize gradient magnitude at each location (uses sse) void GradientMagnitudeChannel::gradMagNorm(float *M, float *S, int h, int w) { float norm = normalizationConstant; __m128 *_M, *_S, _norm; int i=0, n=h*w, n4=n/4; _S = (__m128*) S; _M = (__m128*) M; _norm = SET(norm); bool sse = !(size_t(M)&15) && !(size_t(S)&15); if(sse) { for(; i<n4; i++) *_M++=MUL(*_M,RCP(ADD(*_S++,_norm))); i*=4; } for(; i<n; i++) M[i] /= (S[i] + norm); }
/*
 * Registers the application-specific names for the widget `top`: one RCP
 * entry (termEd) followed by three RCB entries (quit_cb, test_cb,
 * exec_lua_cb).
 * NOTE(review): RCP and RCB are macros defined outside this block; going by
 * the section comments below they register a widget class/constructor and a
 * callback respectively -- confirm against their definitions.
 */
static void RegisterApplication ( Widget top )
{
    /* -- Register widget classes and constructors */
    RCP( top, termEd );

    /* -- Register application specific actions */
    /* (none registered here) */

    /* -- Register application specific callbacks */
    RCB( top, quit_cb );
    RCB( top, test_cb );
    RCB( top, exec_lua_cb );
}
/*
 * Registers the application-specific names for the widget `top`: one RCP
 * entry (wcap) followed by two RCB entries (quit_gui, startstop_cb).
 * NOTE(review): RCP and RCB are macros defined outside this block; going by
 * the section comments below they register a widget class/constructor and a
 * callback respectively -- confirm against their definitions.
 */
static void RegisterApplication ( Widget top )
{
    /* -- Register widget classes and constructors */
    RCP( top, wcap );

    /* -- Register application specific actions */
    /* (none registered here) */

    /* -- Register application specific callbacks */
    RCB( top, quit_gui );
    RCB( top, startstop_cb );
}
// compute gradient magnitude and orientation at each location (uses sse) void gradMag( float *I, float *M, float *O, int h, int w, int d, bool full ) { int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult = 10000.0f; // allocate memory for storing one column of output (padded so h4%4==0) h4 = (h % 4 == 0) ? h : h - (h % 4) + 4; s = d * h4 * sizeof(float); M2 = (float*) alMalloc(s, 16); _M2 = (__m128*) M2; Gx = (float*) alMalloc(s, 16); _Gx = (__m128*) Gx; Gy = (float*) alMalloc(s, 16); _Gy = (__m128*) Gy; // compute gradient magnitude and orientation for each column for ( x = 0; x < w; x++ ) { // compute gradients (Gx, Gy) with maximum squared magnitude (M2) for (c = 0; c < d; c++) { grad1( I + x * h + c * w * h, Gx + c * h4, Gy + c * h4, h, w, x ); for ( y = 0; y < h4 / 4; y++ ) { y1 = h4 / 4 * c + y; _M2[y1] = ADD(MUL(_Gx[y1], _Gx[y1]), MUL(_Gy[y1], _Gy[y1])); if ( c == 0 ) { continue; } _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m, _M2[y1]), ANDNOT(_m, _M2[y]) ); _Gx[y] = OR( AND(_m, _Gx[y1]), ANDNOT(_m, _Gx[y]) ); _Gy[y] = OR( AND(_m, _Gy[y1]), ANDNOT(_m, _Gy[y]) ); } } // compute gradient mangitude (M) and normalize Gx for ( y = 0; y < h4 / 4; y++ ) { _m = MINsse( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); if (O) { _Gx[y] = MUL( MUL(_Gx[y], _m), SET(acMult) ); } if (O) { _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); } }; memcpy( M + x * h, M2, h * sizeof(float) ); // compute and store gradient orientation (O) via table lookup if ( O != 0 ) for ( y = 0; y < h; y++ ) { O[x * h + y] = acost[(int)Gx[y]]; } if ( O != 0 && full ) { y1 = ((~size_t(O + x * h) + 1) & 15) / 4; y = 0; for ( ; y < y1; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } for ( ; y < h - 4; y += 4 ) STRu( O[y + x * h], ADD( LDu(O[y + x * h]), AND(CMPLT(LDu(Gy[y]), SET(0.f)), SET(PI)) ) ); for ( ; y < h; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } } } alFree(Gx); alFree(Gy); alFree(M2); }
// Convert from rgb to luv using sse
//
// I   - input, read as three consecutive planes of n elements (R, then G, then B)
// J   - output, written as three consecutive planes of n floats (L, U, V)
// n   - number of elements per plane
// nrm - forwarded to rgb2luv_setup / rgb2luv
//
// Falls back to rgb2luv(I,J,n,nrm) when any of the local buffers, I or J is
// not 16-byte aligned, or when n is not a multiple of 4. Otherwise the data is
// processed in chunks of at most k=256 elements, and J is reused in place as
// intermediate storage (XYZ first, then LUV).
template<class iT> void rgb2luv_sse( iT *I, float *J, int n, float nrm ) {
  const int k=256; float R[k], G[k], B[k];
  if( (size_t(R)&15||size_t(G)&15||size_t(B)&15||size_t(I)&15||size_t(J)&15)
    || n%4>0 ) { rgb2luv(I,J,n,nrm); return; }
  int i=0, i1, n1; float minu, minv, un, vn, mr[3], mg[3], mb[3];
  // NOTE(review): rgb2luv_setup is defined elsewhere; it fills the per-output
  // weights mr/mg/mb and the constants minu/minv/un/vn used below and returns
  // the lookup table applied to L -- confirm exact semantics there.
  float *lTable = rgb2luv_setup(nrm,mr,mg,mb,minu,minv,un,vn);
  while( i<n ) {
    // current chunk is [i, n1)
    n1 = i+k; if(n1>n) n1=n; float *J1=J+i; float *R1, *G1, *B1;
    // convert to floats (and load input into cache)
    if( typeid(iT) != typeid(float) ) {
      R1=R; G1=G; B1=B; iT *Ri=I+i, *Gi=Ri+n, *Bi=Gi+n;
      for( i1=0; i1<(n1-i); i1++ ) {
        R1[i1] = (float) *Ri++; G1[i1] = (float) *Gi++; B1[i1] = (float) *Bi++;
      }
    } else { R1=((float*)I)+i; G1=R1+n; B1=G1+n; } // float input is read in place
    // compute RGB -> XYZ
    // output plane j of J receives R*mr[j] + G*mg[j] + B*mb[j]
    for( int j=0; j<3; j++ ) {
      __m128 _mr, _mg, _mb, *_J=(__m128*) (J1+j*n);
      __m128 *_R=(__m128*) R1, *_G=(__m128*) G1, *_B=(__m128*) B1;
      _mr=SET(mr[j]); _mg=SET(mg[j]); _mb=SET(mb[j]);
      for( i1=i; i1<n1; i1+=4 ) *(_J++) = ADD( ADD(MUL(*(_R++),_mr),
        MUL(*(_G++),_mg)),MUL(*(_B++),_mb));
    }
    { // compute XYZ -> LUV (without doing L lookup/normalization)
      __m128 _c15, _c3, _cEps, _c52, _c117, _c1024, _cun, _cvn;
      _c15=SET(15.0f); _c3=SET(3.0f); _cEps=SET(1e-35f);
      _c52=SET(52.0f); _c117=SET(117.0f), _c1024=SET(1024.0f);
      _cun=SET(13*un); _cvn=SET(13*vn);
      __m128 *_X, *_Y, *_Z, _x, _y, _z;
      _X=(__m128*) J1; _Y=(__m128*) (J1+n); _Z=(__m128*) (J1+2*n);
      for( i1=i; i1<n1; i1+=4 ) {
        // read all three inputs before any of them is overwritten in place
        _x = *_X; _y=*_Y; _z=*_Z;
        // _z now holds RCP(x + 15*y + 3*z + eps)
        _z = RCP(ADD(_x,ADD(_cEps,ADD(MUL(_c15,_y),MUL(_c3,_z)))));
        // first plane temporarily holds 1024*y; it is used as the lTable index below
        *(_X++) = MUL(_c1024,_y);
        *(_Y++) = SUB(MUL(MUL(_c52,_x),_z),_cun);
        *(_Z++) = SUB(MUL(MUL(_c117,_y),_z),_cvn);
      }
    }
    { // perform lookup for L and finalize computation of U and V
      for( i1=i; i1<n1; i1++ ) J[i1] = lTable[(int)J[i1]];
      __m128 *_L, *_U, *_V, _l, _cminu, _cminv;
      _L=(__m128*) J1; _U=(__m128*) (J1+n); _V=(__m128*) (J1+2*n);
      _cminu=SET(minu); _cminv=SET(minv);
      for( i1=i; i1<n1; i1+=4 ) {
        _l = *(_L++);
        // U = L*U - minu, V = L*V - minv
        *_U = SUB(MUL(_l,*_U),_cminu); _U++;
        *_V = SUB(MUL(_l,*_V),_cminv); _V++;
      }
    }
    i = n1;
  }
}
void pcl::people::HOG::gradMag( float *I, int h, int w, int d, float *M, float *O ) const { #if defined(__SSE2__) int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult=25000/2.02f; // allocate memory for storing one column of output (padded so h4%4==0) h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float); M2=(float*) alMalloc(s,16); _M2=(__m128*) M2; Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx; Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy; // compute gradient magnitude and orientation for each column for( x=0; x<w; x++ ) { // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x ); for( y=0; y<d*h4/4; y++ ) _M2[y]=ADD(MUL(_Gx[y],_Gx[y]),MUL(_Gy[y],_Gy[y])); // store gradients with maximum response in the first channel for(c=1; c<d; c++) { for( y=0; y<h4/4; y++ ) { y1=h4/4*c+y; _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m,_M2[y1]), ANDNOT(_m,_M2[y]) ); _Gx[y] = OR( AND(_m,_Gx[y1]), ANDNOT(_m,_Gx[y]) ); _Gy[y] = OR( AND(_m,_Gy[y1]), ANDNOT(_m,_Gy[y]) ); } } // compute gradient magnitude (M) and normalize Gx for( y=0; y<h4/4; y++ ) { _m = MIN( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); _Gx[y] = MUL( MUL(_Gx[y],_m), SET(acMult) ); _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); }; memcpy( M+x*h, M2, h*sizeof(float) ); // compute and store gradient orientation (O) via table lookup if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]]; } alFree(Gx); alFree(Gy); alFree(M2); #else int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; float *acost = acosTable(), acMult=25000/2.02f; // allocate memory for storing one column of output (padded so h4%4==0) h4=(h%4==0) ? 
h : h-(h%4)+4; s=d*h4*sizeof(float); M2=(float*) alMalloc(s,16); Gx=(float*) alMalloc(s,16); Gy=(float*) alMalloc(s,16); float m; // compute gradient magnitude and orientation for each column for( x=0; x<w; x++ ) { // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x ); for( y=0; y<d*h4; y++ ) { M2[y] = Gx[y] * Gx[y] + Gy[y] * Gy[y]; } // store gradients with maximum response in the first channel for(c=1; c<d; c++) { for( y=0; y<h4/4; y++ ) { y1=h4/4*c+y; for (int ii = 0; ii < 4; ++ii) { if (M2[y1 * 4 + ii] > M2[y * 4 + ii]) { M2[y * 4 + ii] = M2[y1 * 4 + ii]; Gx[y * 4 + ii] = Gx[y1 * 4 + ii]; Gy[y * 4 + ii] = Gy[y1 * 4 + ii]; } } } } // compute gradient magnitude (M) and normalize Gx for( y=0; y<h4; y++ ) { m = 1.0f/sqrtf(M2[y]); m = m < 1e10f ? m : 1e10f; M2[y] = 1.0f / m; Gx[y] = ((Gx[y] * m) * acMult); if (Gy[y] < 0) Gx[y] = -Gx[y]; } memcpy( M+x*h, M2, h*sizeof(float) ); // compute and store gradient orientation (O) via table lookup if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]]; } alFree(Gx); alFree(Gy); alFree(M2); #endif }