// helper for gradHist, quantize O and M into O0, O1 and M0, M1 (uses sse) void gradQuantize( float *O, float *M, int *O0, int *O1, float *M0, float *M1, int nb, int n, float norm, int nOrients, bool full, bool interpolate ) { // assumes all *OUTPUT* matrices are 4-byte aligned int i, o0, o1; float o, od, m; __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _od, _m, *_M0, *_M1; // define useful constants const float oMult=(float)nOrients/(full?2*PI:PI); const int oMax=nOrients*nb; const __m128 _norm=SET(norm), _oMult=SET(oMult), _nbf=SET((float)nb); const __m128i _oMax=SET(oMax), _nb=SET(nb); // perform the majority of the work with sse _O0=(__m128i*) O0; _O1=(__m128i*) O1; _M0=(__m128*) M0; _M1=(__m128*) M1; if( interpolate ) for( i=0; i<=n-4; i+=4 ) { _o=MUL(LDu(O[i]),_oMult); _o0=CVT(_o); _od=SUB(_o,CVT(_o0)); _o0=CVT(MUL(CVT(_o0),_nbf)); _o0=AND(CMPGT(_oMax,_o0),_o0); *_O0++=_o0; _o1=ADD(_o0,_nb); _o1=AND(CMPGT(_oMax,_o1),_o1); *_O1++=_o1; _m=MUL(LDu(M[i]),_norm); *_M1=MUL(_od,_m); *_M0++=SUB(_m,*_M1); _M1++; } else for( i=0; i<=n-4; i+=4 ) { _o=MUL(LDu(O[i]),_oMult); _o0=CVT(ADD(_o,SET(.5f))); _o0=CVT(MUL(CVT(_o0),_nbf)); _o0=AND(CMPGT(_oMax,_o0),_o0); *_O0++=_o0; *_M0++=MUL(LDu(M[i]),_norm); *_M1++=SET(0.f); *_O1++=SET(0); } // compute trailing locations without sse if( interpolate ) for(; i<n; i++ ) { o=O[i]*oMult; o0=(int) o; od=o-o0; o0*=nb; if(o0>=oMax) o0=0; O0[i]=o0; o1=o0+nb; if(o1==oMax) o1=0; O1[i]=o1; m=M[i]*norm; M1[i]=od*m; M0[i]=m-M1[i]; } else for(; i<n; i++ ) { o=O[i]*oMult; o0=(int) (o+.5f); o0*=nb; if(o0>=oMax) o0=0; O0[i]=o0; M0[i]=M[i]*norm; M1[i]=0; O1[i]=0; } }
// compute gradient magnitude and orientation at each location (uses sse) void gradMag( float *I, float *M, float *O, int h, int w, int d ) { int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult=25000/2.02f; // allocate memory for storing one column of output (padded so h4%4==0) h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float); M2=(float*) alMalloc(s,16); _M2=(__m128*) M2; Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx; Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy; // compute gradient magnitude and orientation for each column for( x=0; x<w; x++ ) { // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x ); for( y=0; y<d*h4/4; y++ ) _M2[y]=ADD(MUL(_Gx[y],_Gx[y]),MUL(_Gy[y],_Gy[y])); // store gradients with maximum response in the first channel for(c=1; c<d; c++) { for( y=0; y<h4/4; y++ ) { y1=h4/4*c+y; _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m,_M2[y1]), ANDNOT(_m,_M2[y]) ); _Gx[y] = OR( AND(_m,_Gx[y1]), ANDNOT(_m,_Gx[y]) ); _Gy[y] = OR( AND(_m,_Gy[y1]), ANDNOT(_m,_Gy[y]) ); } } // compute gradient mangitude (M) and normalize Gx for( y=0; y<h4/4; y++ ) { _m = MIN( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); _Gx[y] = MUL( MUL(_Gx[y],_m), SET(acMult) ); _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); }; memcpy( M+x*h, M2, h*sizeof(float) ); // compute and store gradient orientation (O) via table lookup if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]]; } alFree(Gx); alFree(Gy); alFree(M2); }
static inline __m256i enc_translate (const __m256i in) { // LUT contains Absolute offset for all ranges: const __m256i lut = _mm256_setr_epi8(65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0, 65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0); // Translate values 0..63 to the Base64 alphabet. There are five sets: // # From To Abs Index Characters // 0 [0..25] [65..90] +65 0 ABCDEFGHIJKLMNOPQRSTUVWXYZ // 1 [26..51] [97..122] +71 1 abcdefghijklmnopqrstuvwxyz // 2 [52..61] [48..57] -4 [2..11] 0123456789 // 3 [62] [43] -19 12 + // 4 [63] [47] -16 13 / // Create LUT indices from input: // the index for range #0 is right, others are 1 less than expected: __m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51)); // mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0: __m256i mask = CMPGT(in, 25); // substract -1, so add 1 to indices for range #[1..4], All indices are now correct: indices = _mm256_sub_epi8(indices, mask); // Add offsets to input values: __m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices)); return out; }
// compute gradient magnitude and orientation at each location (uses sse) void gradMag( float *I, float *M, float *O, int h, int w, int d, bool full ) { int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult = 10000.0f; // allocate memory for storing one column of output (padded so h4%4==0) h4 = (h % 4 == 0) ? h : h - (h % 4) + 4; s = d * h4 * sizeof(float); M2 = (float*) alMalloc(s, 16); _M2 = (__m128*) M2; Gx = (float*) alMalloc(s, 16); _Gx = (__m128*) Gx; Gy = (float*) alMalloc(s, 16); _Gy = (__m128*) Gy; // compute gradient magnitude and orientation for each column for ( x = 0; x < w; x++ ) { // compute gradients (Gx, Gy) with maximum squared magnitude (M2) for (c = 0; c < d; c++) { grad1( I + x * h + c * w * h, Gx + c * h4, Gy + c * h4, h, w, x ); for ( y = 0; y < h4 / 4; y++ ) { y1 = h4 / 4 * c + y; _M2[y1] = ADD(MUL(_Gx[y1], _Gx[y1]), MUL(_Gy[y1], _Gy[y1])); if ( c == 0 ) { continue; } _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m, _M2[y1]), ANDNOT(_m, _M2[y]) ); _Gx[y] = OR( AND(_m, _Gx[y1]), ANDNOT(_m, _Gx[y]) ); _Gy[y] = OR( AND(_m, _Gy[y1]), ANDNOT(_m, _Gy[y]) ); } } // compute gradient mangitude (M) and normalize Gx for ( y = 0; y < h4 / 4; y++ ) { _m = MINsse( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); if (O) { _Gx[y] = MUL( MUL(_Gx[y], _m), SET(acMult) ); } if (O) { _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); } }; memcpy( M + x * h, M2, h * sizeof(float) ); // compute and store gradient orientation (O) via table lookup if ( O != 0 ) for ( y = 0; y < h; y++ ) { O[x * h + y] = acost[(int)Gx[y]]; } if ( O != 0 && full ) { y1 = ((~size_t(O + x * h) + 1) & 15) / 4; y = 0; for ( ; y < y1; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } for ( ; y < h - 4; y += 4 ) STRu( O[y + x * h], ADD( LDu(O[y + x * h]), AND(CMPLT(LDu(Gy[y]), SET(0.f)), SET(PI)) ) ); for ( ; y < h; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; } } } alFree(Gx); alFree(Gy); alFree(M2); }
void pcl::people::HOG::gradQuantize (float *O, float *M, int *O0, int *O1, float *M0, float *M1, int n_orients, int nb, int n, float norm) const { #if defined(__SSE2__) // assumes all *OUTPUT* matrices are 4-byte aligned int i, o0, o1; float o, od, m; __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _o0f, _m, *_M0, *_M1; // define useful constants const float oMult=(float)n_orients/M_PI; const int oMax=n_orients*nb; const __m128 _norm=SET(norm), _oMult=SET(oMult), _nbf=SET((float)nb); const __m128i _oMax=SET(oMax), _nb=SET(nb); // perform the majority of the work with sse _O0=(__m128i*) O0; _O1=(__m128i*) O1; _M0=(__m128*) M0; _M1=(__m128*) M1; for( i=0; i<=n-4; i+=4 ) { _o=MUL(LDu(O[i]),_oMult); _o0f=CVT(CVT(_o)); _o0=CVT(MUL(_o0f,_nbf)); _o1=ADD(_o0,_nb); _o1=AND(CMPGT(_oMax,_o1),_o1); *_O0++=_o0; *_O1++=_o1; _m=MUL(LDu(M[i]),_norm); *_M1=MUL(SUB(_o,_o0f),_m); *_M0=SUB(_m,*_M1); _M0++; _M1++; } // compute trailing locations without sse for( ; i<n; i++ ) { o=O[i]*oMult; m=M[i]*norm; o0=(int) o; od=o-o0; o0*=nb; o1=o0+nb; if(o1==oMax) o1=0; O0[i]=o0; O1[i]=o1; M1[i]=od*m; M0[i]=m-M1[i]; } #else int i, o0, o1; float o, od, m; // define useful constants const float oMult=(float)n_orients/M_PI; const int oMax=n_orients*nb; // compute trailing locations without sse for( i = 0; i<n; i++ ) { o=O[i]*oMult; m=M[i]*norm; o0=(int) o; od=o-o0; o0*=nb; o1=o0+nb; if(o1==oMax) o1=0; O0[i]=o0; O1[i]=o1; M1[i]=od*m; M0[i]=m-M1[i]; } #endif }
void pcl::people::HOG::gradMag( float *I, int h, int w, int d, float *M, float *O ) const { #if defined(__SSE2__) int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m; float *acost = acosTable(), acMult=25000/2.02f; // allocate memory for storing one column of output (padded so h4%4==0) h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float); M2=(float*) alMalloc(s,16); _M2=(__m128*) M2; Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx; Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy; // compute gradient magnitude and orientation for each column for( x=0; x<w; x++ ) { // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x ); for( y=0; y<d*h4/4; y++ ) _M2[y]=ADD(MUL(_Gx[y],_Gx[y]),MUL(_Gy[y],_Gy[y])); // store gradients with maximum response in the first channel for(c=1; c<d; c++) { for( y=0; y<h4/4; y++ ) { y1=h4/4*c+y; _m = CMPGT( _M2[y1], _M2[y] ); _M2[y] = OR( AND(_m,_M2[y1]), ANDNOT(_m,_M2[y]) ); _Gx[y] = OR( AND(_m,_Gx[y1]), ANDNOT(_m,_Gx[y]) ); _Gy[y] = OR( AND(_m,_Gy[y1]), ANDNOT(_m,_Gy[y]) ); } } // compute gradient magnitude (M) and normalize Gx for( y=0; y<h4/4; y++ ) { _m = MIN( RCPSQRT(_M2[y]), SET(1e10f) ); _M2[y] = RCP(_m); _Gx[y] = MUL( MUL(_Gx[y],_m), SET(acMult) ); _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); }; memcpy( M+x*h, M2, h*sizeof(float) ); // compute and store gradient orientation (O) via table lookup if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]]; } alFree(Gx); alFree(Gy); alFree(M2); #else int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; float *acost = acosTable(), acMult=25000/2.02f; // allocate memory for storing one column of output (padded so h4%4==0) h4=(h%4==0) ? 
h : h-(h%4)+4; s=d*h4*sizeof(float); M2=(float*) alMalloc(s,16); Gx=(float*) alMalloc(s,16); Gy=(float*) alMalloc(s,16); float m; // compute gradient magnitude and orientation for each column for( x=0; x<w; x++ ) { // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x ); for( y=0; y<d*h4; y++ ) { M2[y] = Gx[y] * Gx[y] + Gy[y] * Gy[y]; } // store gradients with maximum response in the first channel for(c=1; c<d; c++) { for( y=0; y<h4/4; y++ ) { y1=h4/4*c+y; for (int ii = 0; ii < 4; ++ii) { if (M2[y1 * 4 + ii] > M2[y * 4 + ii]) { M2[y * 4 + ii] = M2[y1 * 4 + ii]; Gx[y * 4 + ii] = Gx[y1 * 4 + ii]; Gy[y * 4 + ii] = Gy[y1 * 4 + ii]; } } } } // compute gradient magnitude (M) and normalize Gx for( y=0; y<h4; y++ ) { m = 1.0f/sqrtf(M2[y]); m = m < 1e10f ? m : 1e10f; M2[y] = 1.0f / m; Gx[y] = ((Gx[y] * m) * acMult); if (Gy[y] < 0) Gx[y] = -Gx[y]; } memcpy( M+x*h, M2, h*sizeof(float) ); // compute and store gradient orientation (O) via table lookup if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]]; } alFree(Gx); alFree(Gy); alFree(M2); #endif }
// Translate values 0..63 in four 16-byte registers to the Base64 alphabet.
// Every byte first gets +65 ('A'); bytes that also fall in a later range then
// receive that range's delta relative to the previous one:
//
//   #  From      To         Abs   Delta  Characters
//   0  [0..25]   [65..90]   +65   +65    ABCDEFGHIJKLMNOPQRSTUVWXYZ
//   1  [26..51]  [97..122]  +71   +6     abcdefghijklmnopqrstuvwxyz
//   2  [52..61]  [48..57]   -4    -75    0123456789
//   3  [62]      [43]       -19   -15    +
//   4  [63]      [47]       -16   +3     /
//
// CMPGT, CMPEQ and REPLACE are macros defined elsewhere in the file; they are
// used here exactly as in the unrolled original (compare against a constant,
// and turn a mask into a constant for the selected bytes).
static inline uint8x16x4_t enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t out;

	for (int r = 0; r < 4; r++) {
		const uint8x16_t src = in.val[r];

		// Cumulative masks for sets [1,2,3,4], [2,3,4], [3,4] and [4]:
		const uint8x16_t from_set1 = CMPGT(src, 25);
		const uint8x16_t from_set2 = CMPGT(src, 51);
		const uint8x16_t from_set3 = CMPGT(src, 61);
		const uint8x16_t only_set4 = CMPEQ(src, 63);

		// All characters are at least in cumulative set 0, so add 'A':
		uint8x16_t acc = vaddq_u8(src, vdupq_n_u8(65));

		// Apply each set's delta against the previous set:
		acc = vaddq_u8(acc, REPLACE(from_set1, 6));
		acc = vsubq_u8(acc, REPLACE(from_set2, 75));
		acc = vsubq_u8(acc, REPLACE(from_set3, 15));
		acc = vaddq_u8(acc, REPLACE(only_set4, 3));

		out.val[r] = acc;
	}

	return out;
}