コード例 #1
0
// helper for gradHist, quantize O and M into O0, O1 and M0, M1 (uses sse)
void gradQuantize( float *O, float *M, int *O0, int *O1, float *M0, float *M1,
  int nb, int n, float norm, int nOrients, bool full, bool interpolate )
{
  // assumes all *OUTPUT* matrices are 4-byte aligned
  int i, o0, o1; float o, od, m;
  __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _od, _m, *_M0, *_M1;
  // define useful constants
  const float oMult=(float)nOrients/(full?2*PI:PI); const int oMax=nOrients*nb;
  const __m128 _norm=SET(norm), _oMult=SET(oMult), _nbf=SET((float)nb);
  const __m128i _oMax=SET(oMax), _nb=SET(nb);
  // perform the majority of the work with sse
  _O0=(__m128i*) O0; _O1=(__m128i*) O1; _M0=(__m128*) M0; _M1=(__m128*) M1;
  if( interpolate ) for( i=0; i<=n-4; i+=4 ) {
    _o=MUL(LDu(O[i]),_oMult); _o0=CVT(_o); _od=SUB(_o,CVT(_o0));
    _o0=CVT(MUL(CVT(_o0),_nbf)); _o0=AND(CMPGT(_oMax,_o0),_o0); *_O0++=_o0;
    _o1=ADD(_o0,_nb); _o1=AND(CMPGT(_oMax,_o1),_o1); *_O1++=_o1;
    _m=MUL(LDu(M[i]),_norm); *_M1=MUL(_od,_m); *_M0++=SUB(_m,*_M1); _M1++;
  } else for( i=0; i<=n-4; i+=4 ) {
    _o=MUL(LDu(O[i]),_oMult); _o0=CVT(ADD(_o,SET(.5f)));
    _o0=CVT(MUL(CVT(_o0),_nbf)); _o0=AND(CMPGT(_oMax,_o0),_o0); *_O0++=_o0;
    *_M0++=MUL(LDu(M[i]),_norm); *_M1++=SET(0.f); *_O1++=SET(0);
  }
  // compute trailing locations without sse
  if( interpolate ) for(; i<n; i++ ) {
    o=O[i]*oMult; o0=(int) o; od=o-o0;
    o0*=nb; if(o0>=oMax) o0=0; O0[i]=o0;
    o1=o0+nb; if(o1==oMax) o1=0; O1[i]=o1;
    m=M[i]*norm; M1[i]=od*m; M0[i]=m-M1[i];
  } else for(; i<n; i++ ) {
    o=O[i]*oMult; o0=(int) (o+.5f);
    o0*=nb; if(o0>=oMax) o0=0; O0[i]=o0;
    M0[i]=M[i]*norm; M1[i]=0; O1[i]=0;
  }
}
コード例 #2
0
// compute gradient magnitude and orientation at each location (uses sse)
void gradMag( float *I, float *M, float *O, int h, int w, int d ) {
  int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m;
  float *acost = acosTable(), acMult=25000/2.02f;
  // allocate memory for storing one column of output (padded so h4%4==0)
  h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float);
  M2=(float*) alMalloc(s,16); _M2=(__m128*) M2;
  Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx;
  Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy;
  // compute gradient magnitude and orientation for each column
  for( x=0; x<w; x++ ) {
    // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel
    for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x );
    for( y=0; y<d*h4/4; y++ ) _M2[y]=ADD(MUL(_Gx[y],_Gx[y]),MUL(_Gy[y],_Gy[y]));
    // store gradients with maximum response in the first channel
    for(c=1; c<d; c++) {
      for( y=0; y<h4/4; y++ ) {
        y1=h4/4*c+y; _m = CMPGT( _M2[y1], _M2[y] );
        _M2[y] = OR( AND(_m,_M2[y1]), ANDNOT(_m,_M2[y]) );
        _Gx[y] = OR( AND(_m,_Gx[y1]), ANDNOT(_m,_Gx[y]) );
        _Gy[y] = OR( AND(_m,_Gy[y1]), ANDNOT(_m,_Gy[y]) );
      }
    }
    // compute gradient mangitude (M) and normalize Gx
    for( y=0; y<h4/4; y++ ) {
      _m = MIN( RCPSQRT(_M2[y]), SET(1e10f) );
      _M2[y] = RCP(_m);
      _Gx[y] = MUL( MUL(_Gx[y],_m), SET(acMult) );
      _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) );
    };
    memcpy( M+x*h, M2, h*sizeof(float) );
    // compute and store gradient orientation (O) via table lookup
    if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]];
  }
  alFree(Gx); alFree(Gy); alFree(M2);
}
コード例 #3
0
static inline __m256i
enc_translate (const __m256i in)
{
	// LUT contains Absolute offset for all ranges:
	const __m256i lut = _mm256_setr_epi8(65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
	                                     65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /

	// Create LUT indices from input:
	// the index for range #0 is right, others are 1 less than expected:
	__m256i indices = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// mask is 0xFF (-1) for range #[1..4] and 0x00 for range #0:
	__m256i mask = CMPGT(in, 25);

	// substract -1, so add 1 to indices for range #[1..4], All indices are now correct:
	indices = _mm256_sub_epi8(indices, mask);

	// Add offsets to input values:
	__m256i out = _mm256_add_epi8(in, _mm256_shuffle_epi8(lut, indices));

	return out;
}
コード例 #4
0
ファイル: gradient.cpp プロジェクト: joelgallant/skcf
// compute gradient magnitude and orientation at each location (uses sse)
void gradMag( float *I, float *M, float *O, int h, int w, int d, bool full ) {
    int x, y, y1, c, h4, s;
    float *Gx, *Gy, *M2;
    __m128 *_Gx, *_Gy, *_M2, _m;
    float *acost = acosTable(), acMult = 10000.0f;
    // allocate memory for storing one column of output (padded so h4%4==0)
    h4 = (h % 4 == 0) ? h : h - (h % 4) + 4;
    s = d * h4 * sizeof(float);
    M2 = (float*) alMalloc(s, 16);
    _M2 = (__m128*) M2;
    Gx = (float*) alMalloc(s, 16);
    _Gx = (__m128*) Gx;
    Gy = (float*) alMalloc(s, 16);
    _Gy = (__m128*) Gy;
    // compute gradient magnitude and orientation for each column
    for ( x = 0; x < w; x++ ) {
        // compute gradients (Gx, Gy) with maximum squared magnitude (M2)
        for (c = 0; c < d; c++) {
            grad1( I + x * h + c * w * h, Gx + c * h4, Gy + c * h4, h, w, x );
            for ( y = 0; y < h4 / 4; y++ ) {
                y1 = h4 / 4 * c + y;
                _M2[y1] = ADD(MUL(_Gx[y1], _Gx[y1]), MUL(_Gy[y1], _Gy[y1]));
                if ( c == 0 ) { continue; }
                _m = CMPGT( _M2[y1], _M2[y] );
                _M2[y] = OR( AND(_m, _M2[y1]), ANDNOT(_m, _M2[y]) );
                _Gx[y] = OR( AND(_m, _Gx[y1]), ANDNOT(_m, _Gx[y]) );
                _Gy[y] = OR( AND(_m, _Gy[y1]), ANDNOT(_m, _Gy[y]) );
            }
        }
        // compute gradient mangitude (M) and normalize Gx
        for ( y = 0; y < h4 / 4; y++ ) {
            _m = MINsse( RCPSQRT(_M2[y]), SET(1e10f) );
            _M2[y] = RCP(_m);
            if (O) { _Gx[y] = MUL( MUL(_Gx[y], _m), SET(acMult) ); }
            if (O) { _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); }
        };
        memcpy( M + x * h, M2, h * sizeof(float) );
        // compute and store gradient orientation (O) via table lookup
        if ( O != 0 ) for ( y = 0; y < h; y++ ) { O[x * h + y] = acost[(int)Gx[y]]; }
        if ( O != 0 && full ) {
            y1 = ((~size_t(O + x * h) + 1) & 15) / 4;
            y = 0;
            for ( ; y < y1; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; }
            for ( ; y < h - 4; y += 4 ) STRu( O[y + x * h],
                                                  ADD( LDu(O[y + x * h]), AND(CMPLT(LDu(Gy[y]), SET(0.f)), SET(PI)) ) );
            for ( ; y < h; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; }
        }
    }
    alFree(Gx);
    alFree(Gy);
    alFree(M2);
}
コード例 #5
0
ファイル: hog.cpp プロジェクト: WuNL/pcl
void 
pcl::people::HOG::gradQuantize (float *O, float *M, int *O0, int *O1, float *M0, float *M1, int n_orients, int nb, int n, float norm) const
{
#if defined(__SSE2__)
  // assumes all *OUTPUT* matrices are 4-byte aligned
  int i, o0, o1; float o, od, m;
  __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _o0f, _m, *_M0, *_M1;
  // define useful constants
  const float oMult=(float)n_orients/M_PI; const int oMax=n_orients*nb;
  const __m128 _norm=SET(norm), _oMult=SET(oMult), _nbf=SET((float)nb);
  const __m128i _oMax=SET(oMax), _nb=SET(nb);

  // perform the majority of the work with sse
  _O0=(__m128i*) O0; _O1=(__m128i*) O1; _M0=(__m128*) M0; _M1=(__m128*) M1;
  for( i=0; i<=n-4; i+=4 ) {
  _o=MUL(LDu(O[i]),_oMult); _o0f=CVT(CVT(_o)); _o0=CVT(MUL(_o0f,_nbf));
  _o1=ADD(_o0,_nb); _o1=AND(CMPGT(_oMax,_o1),_o1);
  *_O0++=_o0; *_O1++=_o1; _m=MUL(LDu(M[i]),_norm);
  *_M1=MUL(SUB(_o,_o0f),_m); *_M0=SUB(_m,*_M1); _M0++; _M1++;
  }

  // compute trailing locations without sse
  for( ; i<n; i++ ) {
  o=O[i]*oMult; m=M[i]*norm; o0=(int) o; od=o-o0;
  o0*=nb; o1=o0+nb; if(o1==oMax) o1=0;
  O0[i]=o0; O1[i]=o1; M1[i]=od*m; M0[i]=m-M1[i];
  }
#else
  int i, o0, o1;
  float o, od, m;

  // define useful constants
  const float oMult=(float)n_orients/M_PI; const int oMax=n_orients*nb;

  // compute trailing locations without sse
  for( i = 0; i<n; i++ )
  {
    o=O[i]*oMult; m=M[i]*norm; o0=(int) o; od=o-o0;
    o0*=nb; o1=o0+nb; if(o1==oMax) o1=0;
    O0[i]=o0; O1[i]=o1; M1[i]=od*m; M0[i]=m-M1[i];
  }
#endif
}
コード例 #6
0
ファイル: hog.cpp プロジェクト: WuNL/pcl
void 
pcl::people::HOG::gradMag( float *I, int h, int w, int d, float *M, float *O ) const
{
#if defined(__SSE2__)
  int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; __m128 *_Gx, *_Gy, *_M2, _m;
  float *acost = acosTable(), acMult=25000/2.02f;

  // allocate memory for storing one column of output (padded so h4%4==0)
  h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float);

  M2=(float*) alMalloc(s,16);
  _M2=(__m128*) M2;
  Gx=(float*) alMalloc(s,16); _Gx=(__m128*) Gx;
  Gy=(float*) alMalloc(s,16); _Gy=(__m128*) Gy;

  // compute gradient magnitude and orientation for each column
  for( x=0; x<w; x++ ) {
  // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel
  for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x );
  for( y=0; y<d*h4/4; y++ ) _M2[y]=ADD(MUL(_Gx[y],_Gx[y]),MUL(_Gy[y],_Gy[y]));
  // store gradients with maximum response in the first channel
  for(c=1; c<d; c++) {
    for( y=0; y<h4/4; y++ ) {
    y1=h4/4*c+y; _m = CMPGT( _M2[y1], _M2[y] );
    _M2[y] = OR( AND(_m,_M2[y1]), ANDNOT(_m,_M2[y]) );
    _Gx[y] = OR( AND(_m,_Gx[y1]), ANDNOT(_m,_Gx[y]) );
    _Gy[y] = OR( AND(_m,_Gy[y1]), ANDNOT(_m,_Gy[y]) );
    }
  }
  // compute gradient magnitude (M) and normalize Gx
  for( y=0; y<h4/4; y++ ) {
    _m = MIN( RCPSQRT(_M2[y]), SET(1e10f) );
    _M2[y] = RCP(_m);
    _Gx[y] = MUL( MUL(_Gx[y],_m), SET(acMult) );
    _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) );
  };

  memcpy( M+x*h, M2, h*sizeof(float) );
  // compute and store gradient orientation (O) via table lookup
  if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]];
  }
  alFree(Gx); alFree(Gy); alFree(M2); 
#else
  int x, y, y1, c, h4, s; float *Gx, *Gy, *M2; 
  float *acost = acosTable(), acMult=25000/2.02f;

  // allocate memory for storing one column of output (padded so h4%4==0)
  h4=(h%4==0) ? h : h-(h%4)+4; s=d*h4*sizeof(float);

  M2=(float*) alMalloc(s,16);
  Gx=(float*) alMalloc(s,16); 
  Gy=(float*) alMalloc(s,16); 
  float m;

  // compute gradient magnitude and orientation for each column
  for( x=0; x<w; x++ ) {
  // compute gradients (Gx, Gy) and squared magnitude (M2) for each channel
  for( c=0; c<d; c++ ) grad1( I+x*h+c*w*h, Gx+c*h4, Gy+c*h4, h, w, x );
  for( y=0; y<d*h4; y++ ) 
  {
    M2[y] = Gx[y] * Gx[y] + Gy[y] * Gy[y];
  }  
  
  // store gradients with maximum response in the first channel
  for(c=1; c<d; c++) 
  {
  for( y=0; y<h4/4; y++ ) 
    {
      y1=h4/4*c+y; 

    for (int ii = 0; ii < 4; ++ii)
    {
      if (M2[y1 * 4 + ii] > M2[y * 4 + ii])
      {
        M2[y * 4 + ii] = M2[y1 * 4 + ii];
        Gx[y * 4 + ii] = Gx[y1 * 4 + ii];
        Gy[y * 4 + ii] = Gy[y1 * 4 + ii];
      }
    }
    }
  }
  // compute gradient magnitude (M) and normalize Gx
  for( y=0; y<h4; y++ ) 
  {
  m = 1.0f/sqrtf(M2[y]);
  m = m < 1e10f ? m : 1e10f;
    M2[y] = 1.0f / m;
    Gx[y] = ((Gx[y] * m) * acMult);
    if (Gy[y] < 0)
    Gx[y] = -Gx[y];
  }
  
  memcpy( M+x*h, M2, h*sizeof(float) );
  // compute and store gradient orientation (O) via table lookup
  if(O!=0) for( y=0; y<h; y++ ) O[x*h+y] = acost[(int)Gx[y]];
  }
  alFree(Gx); alFree(Gy); alFree(M2);
#endif  
}
コード例 #7
0
static inline uint8x16x4_t
enc_translate (uint8x16x4_t in)
{
	uint8x16x4_t mask1, mask2, mask3, mask4, out;

	// Translate values 0..63 to the Base64 alphabet. There are five sets:
	// #  From      To         Abs  Delta  Characters
	// 0  [0..25]   [65..90]   +65  +65    ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71   +6    abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  -75    0123456789
	// 3  [62]      [43]       -19  -15    +
	// 4  [63]      [47]       -16   +3    /

	// Create cumulative masks for characters in sets [1,2,3,4], [2,3,4],
	// [3,4], and [4]:
	mask1.val[0] = CMPGT(in.val[0], 25);
	mask1.val[1] = CMPGT(in.val[1], 25);
	mask1.val[2] = CMPGT(in.val[2], 25);
	mask1.val[3] = CMPGT(in.val[3], 25);

	mask2.val[0] = CMPGT(in.val[0], 51);
	mask2.val[1] = CMPGT(in.val[1], 51);
	mask2.val[2] = CMPGT(in.val[2], 51);
	mask2.val[3] = CMPGT(in.val[3], 51);

	mask3.val[0] = CMPGT(in.val[0], 61);
	mask3.val[1] = CMPGT(in.val[1], 61);
	mask3.val[2] = CMPGT(in.val[2], 61);
	mask3.val[3] = CMPGT(in.val[3], 61);

	mask4.val[0] = CMPEQ(in.val[0], 63);
	mask4.val[1] = CMPEQ(in.val[1], 63);
	mask4.val[2] = CMPEQ(in.val[2], 63);
	mask4.val[3] = CMPEQ(in.val[3], 63);

	// All characters are at least in cumulative set 0, so add 'A':
	out.val[0] = vaddq_u8(in.val[0], vdupq_n_u8(65));
	out.val[1] = vaddq_u8(in.val[1], vdupq_n_u8(65));
	out.val[2] = vaddq_u8(in.val[2], vdupq_n_u8(65));
	out.val[3] = vaddq_u8(in.val[3], vdupq_n_u8(65));

	// For inputs which are also in any of the other cumulative sets,
	// add delta values against the previous set(s) to correct the shift:
	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask1.val[0], 6));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask1.val[1], 6));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask1.val[2], 6));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask1.val[3], 6));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask2.val[0], 75));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask2.val[1], 75));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask2.val[2], 75));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask2.val[3], 75));

	out.val[0] = vsubq_u8(out.val[0], REPLACE(mask3.val[0], 15));
	out.val[1] = vsubq_u8(out.val[1], REPLACE(mask3.val[1], 15));
	out.val[2] = vsubq_u8(out.val[2], REPLACE(mask3.val[2], 15));
	out.val[3] = vsubq_u8(out.val[3], REPLACE(mask3.val[3], 15));

	out.val[0] = vaddq_u8(out.val[0], REPLACE(mask4.val[0], 3));
	out.val[1] = vaddq_u8(out.val[1], REPLACE(mask4.val[1], 3));
	out.val[2] = vaddq_u8(out.val[2], REPLACE(mask4.val[2], 3));
	out.val[3] = vaddq_u8(out.val[3], REPLACE(mask4.val[3], 3));

	return out;
}