Пример #1
0
// convolve I by a 2rx1 triangle filter (uses SSE)
//
// I is read as d channels of w columns of h floats: column i of the current
// channel starts at I+i*h and channels are w*h floats apart. Two running
// columns are kept: T accumulates Il+Ir-2*Im at each step and U accumulates
// nrm*T. Every s-th column, U is handed to convTriY and O advances by h/s.
// NOTE(review): LD/LDu/STR/INC/ADD/SUB/MUL are SSE helpers defined outside this
// block; the comments below assume they work on 4 packed floats and that
// STR/INC yield the value they wrote -- confirm against their definitions.
void convTri( float *I, float *O, int h, int w, int d, int r, int s ) {
  // r is incremented here, so convTriY below receives the caller's r as r-1;
  // nrm is 1/r^4 for the incremented r; k counts columns modulo s
  r++; float nrm = 1.0f/(r*r*r*r); int i, j, k=(s-1)/2, h0, h1, w0;
  // h0: h rounded down to a multiple of 4 (rows processed four at a time)
  // h1: h rounded up to a multiple of 4 (length of each scratch column)
  // w0: w rounded down to a multiple of s (number of columns visited)
  if(h%4==0) h0=h1=h; else { h0=h-(h%4); h1=h0+4; } w0=(w/s)*s;
  // T and U share a single allocation of 2*h1 floats (alignment argument 16)
  float *T=(float*) alMalloc(2*h1*sizeof(float),16), *U=T+h1;
  while(d-- > 0) {
    // initialize T and U
    // rows 0..h0-1, four at a time: T = sum of the first r columns,
    // U = nrm*(2*(sum of T's partial sums) - T), then T is reset to 0
    for(j=0; j<h0; j+=4) STR(U[j], STR(T[j], LDu(I[j])));
    for(i=1; i<r; i++) for(j=0; j<h0; j+=4) INC(U[j],INC(T[j],LDu(I[j+i*h])));
    for(j=0; j<h0; j+=4) STR(U[j],MUL(nrm,(SUB(MUL(2,LD(U[j])),LD(T[j])))));
    for(j=0; j<h0; j+=4) STR(T[j],0);
    // rows h0..h-1: the same initialization done one float at a time
    for(j=h0; j<h; j++ ) U[j]=T[j]=I[j];
    for(i=1; i<r; i++) for(j=h0; j<h; j++ ) U[j]+=T[j]+=I[j+i*h];
    for(j=h0; j<h; j++ ) { U[j] = nrm * (2*U[j]-T[j]); T[j]=0; }
    // prepare and convolve each column in turn
    k++; if(k==s) { k=0; convTriY(U,O,h,r-1,s); O+=h/s; }
    for( i=1; i<w0; i++ ) {
      // left / middle / right source columns; when the left or right index
      // would fall outside [0,w) it is mirrored back into the image
      float *Il=I+(i-1-r)*h; if(i<=r) Il=I+(r-i)*h; float *Im=I+(i-1)*h;
      float *Ir=I+(i-1+r)*h; if(i>w-r) Ir=I+(2*w-r-i)*h;
      // incremental update: T += Il + Ir - 2*Im, then U += nrm*T
      for( j=0; j<h0; j+=4 ) {
        INC(T[j],ADD(LDu(Il[j]),LDu(Ir[j]),MUL(-2,LDu(Im[j]))));
        INC(U[j],MUL(nrm,LD(T[j])));
      }
      for( j=h0; j<h; j++ ) U[j]+=nrm*(T[j]+=Il[j]+Ir[j]-2*Im[j]);
      // emit one output column every s input columns
      k++; if(k==s) { k=0; convTriY(U,O,h,r-1,s); O+=h/s; }
    }
    // advance to the next channel
    I+=w*h;
  }
  alFree(T);
}
Пример #2
0
// helper for gradHist: quantize orientations O and magnitudes M into bin
// offsets O0, O1 and weights M0, M1 (uses sse)
//
// Each O[i] is scaled by nOrients/(full ? 2*PI : PI). With interpolate set, the
// scaled value is truncated to choose O0, O1 is the following bin, and the
// magnitude M[i]*norm is split between M0 and M1 by the fractional part.
// Without interpolate, the scaled value has .5f added before truncation, M0
// receives the whole magnitude and O1/M1 are written as zero. Bin offsets are
// multiples of nb and are reset to 0 once they reach nOrients*nb.
void gradQuantize( float *O, float *M, int *O0, int *O1, float *M0, float *M1,
  int nb, int n, float norm, int nOrients, bool full, bool interpolate )
{
  // assumes all *OUTPUT* matrices are 4-byte aligned
  // NOTE(review): the vector loops store through __m128/__m128i pointers,
  // which usually needs 16-byte alignment -- confirm against the callers.

  // constants shared by the vector and scalar paths
  const float oMult=(float)nOrients/(full?2*PI:PI);
  const int oMax=nOrients*nb;
  const __m128 _norm=SET(norm);
  const __m128 _oMult=SET(oMult);
  const __m128 _nbf=SET((float)nb);
  const __m128i _oMax=SET(oMax);
  const __m128i _nb=SET(nb);

  // vector views of the four output arrays
  __m128i *vO0=(__m128i*) O0;
  __m128i *vO1=(__m128i*) O1;
  __m128 *vM0=(__m128*) M0;
  __m128 *vM1=(__m128*) M1;

  int i=0;
  if( interpolate ) {
    // four locations per step
    for( ; i<=n-4; i+=4 ) {
      __m128 ang=MUL(LDu(O[i]),_oMult);
      __m128i bin0=CVT(ang);
      __m128 frac=SUB(ang,CVT(bin0));
      bin0=CVT(MUL(CVT(bin0),_nbf));
      bin0=AND(CMPGT(_oMax,bin0),bin0);
      *vO0++=bin0;
      __m128i bin1=ADD(bin0,_nb);
      bin1=AND(CMPGT(_oMax,bin1),bin1);
      *vO1++=bin1;
      __m128 mag=MUL(LDu(M[i]),_norm);
      *vM1=MUL(frac,mag);
      *vM0++=SUB(mag,*vM1);
      vM1++;
    }
    // leftover locations, one at a time
    for( ; i<n; i++ ) {
      float scaled=O[i]*oMult;
      int lo=(int) scaled;
      float frac=scaled-lo;
      lo*=nb;
      if( lo>=oMax ) lo=0;
      O0[i]=lo;
      int hi=lo+nb;
      if( hi==oMax ) hi=0;
      O1[i]=hi;
      float mag=M[i]*norm;
      M1[i]=frac*mag;
      M0[i]=mag-M1[i];
    }
  } else {
    // four locations per step
    for( ; i<=n-4; i+=4 ) {
      __m128 ang=MUL(LDu(O[i]),_oMult);
      __m128i bin0=CVT(ADD(ang,SET(.5f)));
      bin0=CVT(MUL(CVT(bin0),_nbf));
      bin0=AND(CMPGT(_oMax,bin0),bin0);
      *vO0++=bin0;
      *vM0++=MUL(LDu(M[i]),_norm);
      *vM1++=SET(0.f);
      *vO1++=SET(0);
    }
    // leftover locations, one at a time
    for( ; i<n; i++ ) {
      float scaled=O[i]*oMult;
      int lo=(int) (scaled+.5f);
      lo*=nb;
      if( lo>=oMax ) lo=0;
      O0[i]=lo;
      M0[i]=M[i]*norm;
      M1[i]=0;
      O1[i]=0;
    }
  }
}
Пример #3
0
// convolve I by a [1 1; 1 1] filter (uses SSE)
//
// For every channel and every s-th column (starting at column s/2) the
// current column is summed with one neighbour, scaled by 0.25, and the result
// is passed to conv11Y. An odd `side` pairs with the next column (unless the
// current one is the last); an even `side` pairs with the previous column
// (unless the current one is the first). O advances by h/s per column.
void conv11( float *I, float *O, int h, int w, int d, int side, int s ) {
  const float nrm = 0.25f;
  // scratch column holding the horizontally filtered values
  float *T = (float*) alMalloc(h*sizeof(float),16);
  for( int d0=0; d0<d; d0++ ) {
    for( int i=s/2; i<w; i+=s ) {
      float *I0 = I+i*h+d0*h*w;
      float *I1 = I0;
      if( side%2 ) {
        if( i<w-1 ) I1+=h;
      } else {
        if( i ) I0-=h;
      }
      int j = 0;
      // four rows at a time, then the remaining rows one by one
      for( ; j<h-4; j+=4 ) {
        STR( T[j], MUL(nrm,ADD(LDu(I0[j]),LDu(I1[j]))) );
      }
      for( ; j<h; j++ ) {
        T[j]=nrm*(I0[j]+I1[j]);
      }
      conv11Y(T,O,h,side,s);
      O+=h/s;
    }
  }
  alFree(T);
}
Пример #4
0
// convolve I by a [1 p 1] filter (uses SSE)
//
// For every channel and every s-th column (starting at column s/2) the
// left, middle and right columns are combined as nrm*(left + p*middle + right)
// with nrm = 1/(p+2)^2, and the result is passed to convTri1Y. At the first
// and last column the missing neighbour is replaced by the column itself.
// O advances by h/s per processed column.
void convTri1( float *I, float *O, int h, int w, int d, float p, int s ) {
  const float nrm = 1.0f/((p+2)*(p+2));
  // rows below h0 are handled four at a time
  const int h0 = h-(h%4);
  float *T = (float*) alMalloc(h*sizeof(float),16);
  for( int d0=0; d0<d; d0++ ) {
    for( int i=s/2; i<w; i+=s ) {
      float *Im = I+i*h+d0*h*w;
      float *Il = Im;
      float *Ir = Im;
      if( i>0 ) Il-=h;
      if( i<w-1 ) Ir+=h;
      int j = 0;
      for( ; j<h0; j+=4 ) {
        STR(T[j],MUL(nrm,ADD(ADD(LDu(Il[j]),MUL(p,LDu(Im[j]))),LDu(Ir[j]))));
      }
      for( j=h0; j<h; j++ ) {
        T[j]=nrm*(Il[j]+p*Im[j]+Ir[j]);
      }
      convTri1Y(T,O,h,p,s);
      O+=h/s;
    }
  }
  alFree(T);
}
Пример #5
0
// compute gradient magnitude and orientation at each location (uses sse)
//
//   I    - input with d channels; column x of channel c starts at I + x*h + c*w*h
//   M    - output magnitude, h floats written per column at M + x*h
//   O    - output orientation (same layout as M), or 0 to skip orientation
//   full - when set, PI is added to O wherever the selected Gy is negative
//
// For each pixel the channel with the largest squared gradient magnitude
// supplies Gx, Gy and M. Orientation is read from the table returned by
// acosTable(), indexed by the normalized Gx scaled by acMult (with its sign
// flipped where Gy is negative).
void gradMag( float *I, float *M, float *O, int h, int w, int d, bool full ) {
    int x, y, y1, c, h4, s;
    float *Gx, *Gy, *M2;
    __m128 *_Gx, *_Gy, *_M2, _m;
    float *acost = acosTable(), acMult = 10000.0f;
    // allocate memory for storing one column of output (padded so h4%4==0)
    h4 = (h % 4 == 0) ? h : h - (h % 4) + 4;
    s = d * h4 * sizeof(float);
    M2 = (float*) alMalloc(s, 16);
    _M2 = (__m128*) M2;
    Gx = (float*) alMalloc(s, 16);
    _Gx = (__m128*) Gx;
    Gy = (float*) alMalloc(s, 16);
    _Gy = (__m128*) Gy;
    // compute gradient magnitude and orientation for each column
    for ( x = 0; x < w; x++ ) {
        // compute gradients (Gx, Gy) with maximum squared magnitude (M2)
        for (c = 0; c < d; c++) {
            grad1( I + x * h + c * w * h, Gx + c * h4, Gy + c * h4, h, w, x );
            for ( y = 0; y < h4 / 4; y++ ) {
                // y1 indexes channel c's slice; y indexes channel 0's slice,
                // which doubles as the running per-pixel maximum
                y1 = h4 / 4 * c + y;
                _M2[y1] = ADD(MUL(_Gx[y1], _Gx[y1]), MUL(_Gy[y1], _Gy[y1]));
                if ( c == 0 ) { continue; }
                // where channel c is stronger, overwrite the running maximum
                _m = CMPGT( _M2[y1], _M2[y] );
                _M2[y] = OR( AND(_m, _M2[y1]), ANDNOT(_m, _M2[y]) );
                _Gx[y] = OR( AND(_m, _Gx[y1]), ANDNOT(_m, _Gx[y]) );
                _Gy[y] = OR( AND(_m, _Gy[y1]), ANDNOT(_m, _Gy[y]) );
            }
        }
        // compute gradient magnitude (M) and normalize Gx
        for ( y = 0; y < h4 / 4; y++ ) {
            // reciprocal square root capped at 1e10 so a zero gradient stays finite
            _m = MINsse( RCPSQRT(_M2[y]), SET(1e10f) );
            _M2[y] = RCP(_m);
            if (O) { _Gx[y] = MUL( MUL(_Gx[y], _m), SET(acMult) ); }
            // flip the sign of Gx wherever Gy has its sign bit set
            if (O) { _Gx[y] = XOR( _Gx[y], AND(_Gy[y], SET(-0.f)) ); }
        };
        memcpy( M + x * h, M2, h * sizeof(float) );
        // compute and store gradient orientation (O) via table lookup
        if ( O != 0 ) for ( y = 0; y < h; y++ ) { O[x * h + y] = acost[(int)Gx[y]]; }
        if ( O != 0 && full ) {
            // y1 = number of floats before the next 16-byte boundary of this
            // column of O; clamp it to h so that columns shorter than that
            // prologue are not overrun (the two later loops already stop at h)
            y1 = ((~size_t(O + x * h) + 1) & 15) / 4;
            if ( y1 > h ) { y1 = h; }
            y = 0;
            for ( ; y < y1; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; }
            for ( ; y < h - 4; y += 4 ) STRu( O[y + x * h],
                                                  ADD( LDu(O[y + x * h]), AND(CMPLT(LDu(Gy[y]), SET(0.f)), SET(PI)) ) );
            for ( ; y < h; y++ ) { O[y + x * h] += (Gy[y] < 0) * PI; }
        }
    }
    alFree(Gx);
    alFree(Gy);
    alFree(M2);
}
Пример #6
0
Файл: hog.cpp Проект: WuNL/pcl
// Quantize orientations O and magnitudes M of n locations into two bin
// offsets (O0, O1) and two interpolation weights (M0, M1).
//
// Each O[i] is scaled by n_orients/M_PI and truncated to pick the lower bin;
// O1 is the following bin. Bin offsets are multiples of nb and wrap to 0 once
// they reach n_orients*nb, so both O0 and O1 stay below n_orients*nb even when
// the scaled orientation reaches n_orients. The magnitude M[i]*norm is split
// between M0 and M1 by the fractional part of the scaled orientation.
void
pcl::people::HOG::gradQuantize (float *O, float *M, int *O0, int *O1, float *M0, float *M1, int n_orients, int nb, int n, float norm) const
{
  // define useful constants
  const float oMult=(float)n_orients/M_PI; const int oMax=n_orients*nb;
  int i=0, o0, o1; float o, od, m;

#if defined(__SSE2__)
  // assumes all *OUTPUT* matrices are 4-byte aligned
  // NOTE(review): outputs are stored through __m128/__m128i pointers, which
  // usually requires 16-byte alignment -- confirm against the callers.
  __m128i _o0, _o1, *_O0, *_O1; __m128 _o, _o0f, _m, *_M0, *_M1;
  const __m128 _norm=SET(norm), _oMult=SET(oMult), _nbf=SET((float)nb);
  const __m128i _oMax=SET(oMax), _nb=SET(nb);

  // perform the majority of the work with sse
  _O0=(__m128i*) O0; _O1=(__m128i*) O1; _M0=(__m128*) M0; _M1=(__m128*) M1;
  for( ; i<=n-4; i+=4 ) {
    _o=MUL(LDu(O[i]),_oMult); _o0f=CVT(CVT(_o)); _o0=CVT(MUL(_o0f,_nbf));
    // wrap the lower bin to 0 when it reaches oMax (keeps the offset in range)
    _o0=AND(CMPGT(_oMax,_o0),_o0);
    _o1=ADD(_o0,_nb); _o1=AND(CMPGT(_oMax,_o1),_o1);
    *_O0++=_o0; *_O1++=_o1; _m=MUL(LDu(M[i]),_norm);
    *_M1=MUL(SUB(_o,_o0f),_m); *_M0=SUB(_m,*_M1); _M0++; _M1++;
  }
#endif

  // scalar path: trailing locations after the sse loop, or every location
  // when SSE2 is not available
  for( ; i<n; i++ ) {
    o=O[i]*oMult; m=M[i]*norm; o0=(int) o; od=o-o0;
    o0*=nb; if(o0>=oMax) o0=0;
    o1=o0+nb; if(o1==oMax) o1=0;
    O0[i]=o0; O1[i]=o1; M1[i]=od*m; M0[i]=m-M1[i];
  }
}
Пример #7
0
// compute x and y gradients for just one column (uses sse)
//
//   I      - start of the column (h floats); neighbouring columns are at I-h and I+h
//   Gx, Gy - outputs, h floats each
//   h, w   - column height and number of columns
//   x      - index of this column, used only to detect the first/last column
//
// Gx is a centred difference between the next and previous column scaled by
// .5; at x==0 or x==w-1 it becomes a one-sided difference scaled by 1.
// Gy is the same scheme along the column: one-sided at the first and last
// row, centred (scaled by .5) in between.
// NOTE(review): the Gy section executes GRADY(1) twice unconditionally, so it
// appears to need h >= 2 (h == 1 would read I[1] and write Gy[1]) -- confirm.
void grad1( float *I, float *Gx, float *Gy, int h, int w, int x ) {
    int y, y1;
    float *Ip, *In, r;
    __m128 *_Ip, *_In, *_G, _r;
    // compute column of Gx
    Ip = I - h;
    In = I + h;
    r = .5f;
    // at the image border fall back to a one-sided difference with weight 1
    if (x == 0) { r = 1; Ip += h; }
    else if (x == w - 1) { r = 1; In -= h; }
    // scalar path unless h is a positive multiple of 4 and both I and Gx are
    // 16-byte aligned; the vector path dereferences __m128 pointers directly
    if ( h < 4 || h % 4 > 0 || (size_t(I) & 15) || (size_t(Gx) & 15) ) {
        for ( y = 0; y < h; y++ ) { *Gx++ = (*In++ -*Ip++) * r; }
    } else {
        _G = (__m128*) Gx;
        _Ip = (__m128*) Ip;
        _In = (__m128*) In;
        _r = SET(r);
        for (y = 0; y < h; y += 4) { *_G++ = MUL(SUB(*_In++, *_Ip++), _r); }
    }
    // compute column of Gy
    // GRADY writes one output and advances Gy, In and Ip by one float each
#define GRADY(r) *Gy++=(*In++-*Ip++)*r;
    Ip = I;
    In = Ip + 1;
    // scalar reference for the code below:
    // GRADY(1); Ip--; for(y=1; y<h-1; y++) GRADY(.5f); In--; GRADY(1);
    // y1 = number of floats until Gy reaches a 16-byte boundary (4 if it is
    // already on one), limited to h-1 so the last row is left for the epilogue
    y1 = ((~((size_t) Gy) + 1) & 15) / 4;
    if (y1 == 0) { y1 = 4; }
    if (y1 > h - 1) { y1 = h - 1; }
    // first row: I[1]-I[0]; stepping Ip back makes later rows use I[y+1]-I[y-1]
    GRADY(1);
    Ip--;
    for (y = 1; y < y1; y++) { GRADY(.5f); }
    _r = SET(.5f);
    _G = (__m128*) Gy;
    // four interior rows per step; stores go through the __m128 pointer _G
    // while Gy, Ip and In are advanced alongside it
    for (; y + 4 < h - 1; y += 4, Ip += 4, In += 4, Gy += 4)
    { *_G++ = MUL(SUB(LDu(*In), LDu(*Ip)), _r); }
    // remaining interior rows one at a time
    for (; y < h - 1; y++) { GRADY(.5f); }
    // last row: stepping In back gives I[h-1]-I[h-2]
    In--;
    GRADY(1);
#undef GRADY
}
Пример #8
0
// convolve I by a 2r+1 x 2r+1 ones filter (uses SSE)
//
// I is read as d channels of w columns of h floats: column i of the current
// channel starts at I+i*h and channels are w*h floats apart. T holds a running
// horizontal sum scaled by nrm = 1/(2r+1)^2; it is updated incrementally from
// column to column and passed to convBoxY every s-th column, after which O
// advances by h/s.
// NOTE(review): LD/LDu/STR/INC/DEC/SUB/MUL are SSE helpers defined outside
// this block; comments assume they operate on 4 packed floats -- confirm.
void convBox( float *I, float *O, int h, int w, int d, int r, int s ) {
  float nrm = 1.0f/((2*r+1)*(2*r+1)); int i, j, k=(s-1)/2, h0, h1, w0; // k counts columns modulo s
  // h0: h rounded down to a multiple of 4 (rows processed four at a time)
  // h1: h rounded up to a multiple of 4 (length of T); w0: w rounded down to a multiple of s
  if(h%4==0) h0=h1=h; else { h0=h-(h%4); h1=h0+4; } w0=(w/s)*s;
  float *T=(float*) alMalloc(h1*sizeof(float),16);
  while(d-- > 0) {
    // initialize T
    // T = nrm*(2*(sum of columns 0..r) - column r), rows 0..h0-1 four at a time
    memset( T, 0, h1*sizeof(float) );
    for(i=0; i<=r; i++) for(j=0; j<h0; j+=4) INC(T[j],LDu(I[j+i*h]));
    for(j=0; j<h0; j+=4) STR(T[j],MUL(nrm,SUB(MUL(2,LD(T[j])),LDu(I[j+r*h]))));
    for(i=0; i<=r; i++) for(j=h0; j<h; j++ ) T[j]+=I[j+i*h]; // rows h0..h-1: same initialization, one float at a time
    for(j=h0; j<h; j++ ) T[j]=nrm*(2*T[j]-I[j+r*h]);
    // prepare and convolve each column in turn
    k++; if(k==s) { k=0; convBoxY(T,O,h,r,s); O+=h/s; }
    for( i=1; i<w0; i++ ) {
      // column leaving (Il) and column entering (Ir) the window; when either
      // index would fall outside [0,w) it is mirrored back into the image
      float *Il=I+(i-1-r)*h; if(i<=r) Il=I+(r-i)*h;
      float *Ir=I+(i+r)*h; if(i>=w-r) Ir=I+(2*w-r-i-1)*h;
      // incremental update: T -= nrm*(Il - Ir)
      for(j=0; j<h0; j+=4) DEC(T[j],MUL(nrm,SUB(LDu(Il[j]),LDu(Ir[j]))));
      for(j=h0; j<h; j++ ) T[j]-=nrm*(Il[j]-Ir[j]);
      // emit one output column every s input columns
      k++; if(k==s) { k=0; convBoxY(T,O,h,r,s); O+=h/s; }
    }
    // advance to the next channel
    I+=w*h;
  }
  alFree(T);
}
Пример #9
0
// run nIter iterations of Horn & Schunk optical flow (alters Vx, Vy)
//
//   Vx, Vy     - flow fields of h*w floats, updated in place
//   Ex, Ey, Et - per-pixel terms multiplied with / added to the averaged flow
//   Z          - per-pixel factor applied to (Ex*mx + Ey*my + Et)
//   h, w       - column height and number of columns (element (x,y) is at x*h+y)
//   nIter      - number of iterations
//
// Each iteration copies the current flow, then for every interior pixel
// replaces the flow by the 4-neighbour average of the copy minus Ex (resp. Ey)
// times m, where m = (Ex*mx + Ey*my + Et)*Z. Border pixels are left untouched.
void opticalFlowHsMex( float *Vx, float *Vy, const float *Ex, const float *Ey,
  const float *Et, const float *Z, const int h, const int w, const int nIter )
{
  int x, y, x1, i, t; float my, mx, m, *Vx0, *Vy0;
  // n is the element count and s the byte size of one flow field; the scratch
  // copies need n floats each (not s floats, which would be four times as many)
  const size_t n=(size_t) w*(size_t) h, s=n*sizeof(float);
  Vx0=new float[n]; Vy0=new float[n];
  for( t=0; t<nIter; t++ ) {
    // snapshot of the previous iterate, so updates do not feed into neighbours
    memcpy(Vx0,Vx,s); memcpy(Vy0,Vy,s);
    for( x=1; x<w-1; x++ ) {
      // do as much work as possible in SSE (assume non-aligned memory)
      for( y=1; y<h-4; y+=4 ) {
        x1=x*h; i=x1+y; __m128 _mx, _my, _m;
        _my=MUL(ADD(LDu(Vy0[x1-h+y]),LDu(Vy0[x1+h+y]),
          LDu(Vy0[x1+y-1]),LDu(Vy0[x1+y+1])),.25f);
        _mx=MUL(ADD(LDu(Vx0[x1-h+y]),LDu(Vx0[x1+h+y]),
          LDu(Vx0[x1+y-1]),LDu(Vx0[x1+y+1])),.25f);
        _m=MUL(ADD(MUL(LDu(Ey[i]),_my),MUL(LDu(Ex[i]),_mx),
          LDu(Et[i])),LDu(Z[i]));
        STRu(Vx[i],SUB(_mx,MUL(LDu(Ex[i]),_m)));
        STRu(Vy[i],SUB(_my,MUL(LDu(Ey[i]),_m)));
      }
      // do remainder of work in regular loop
      for( ; y<h-1; y++ ) {
        x1=x*h; i=x1+y;
        mx=.25f*(Vx0[x1-h+y]+Vx0[x1+h+y]+Vx0[x1+y-1]+Vx0[x1+y+1]);
        my=.25f*(Vy0[x1-h+y]+Vy0[x1+h+y]+Vy0[x1+y-1]+Vy0[x1+y+1]);
        m = (Ex[i]*mx + Ey[i]*my + Et[i])*Z[i];
        Vx[i]=mx-Ex[i]*m; Vy[i]=my-Ey[i]*m;
      }
    }
  }
  delete [] Vx0; delete [] Vy0;
}