static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum[2], t;
  const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_pd ();

  for (; i < len; i += 4) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
    t = _mm_loadu_pd (a + i + 2);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
  }
  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
示例#2
0
__m128d test_mm_unpackhi_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_unpackhi_pd
  // DAG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
  //
  // ASM-LABEL: test_mm_unpackhi_pd
  // ASM: unpckhpd
  return _mm_unpackhi_pd(A, B);
}
	void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2,
		__m128 &io_data3)
	{
		__m128 ccdd1 = _mm_unpackhi_ps(io_data0, io_data1);
		__m128 ccdd2 = _mm_unpackhi_ps(io_data2, io_data3);
		__m128 aabb1 = _mm_unpacklo_ps(io_data0, io_data1);
		__m128 aabb2 = _mm_unpacklo_ps(io_data2, io_data3);

		io_data0 = 
			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
		io_data1 =
			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
		io_data2 =
			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
		io_data3 = 
			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
	}
示例#4
0
static __inline __m128d ZMUL(__m128d a, __m128d b)
{
    __m128d ar, ai;

    ar = _mm_movedup_pd(a);       /* ar = [a.r a.r] */
    ar = _mm_mul_pd(ar, b);       /* ar = [a.r*b.r a.r*b.i] */
    ai = _mm_unpackhi_pd(a, a);   /* ai = [a.i a.i] */
    b = _mm_shuffle_pd(b, b, 1);  /* b = [b.i b.r] */
    ai = _mm_mul_pd(ai, b);       /* ai = [a.i*b.i a.i*b.r] */

    return _mm_addsub_pd(ar, ai); /* [a.r*b.r-a.i*b.i a.r*b.i+a.i*b.r] */
}
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i;
  __m128d f[2], sum[4], t;
  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
    (gdouble *) ((gint8 *) b + 1 * bstride),
    (gdouble *) ((gint8 *) b + 2 * bstride),
    (gdouble *) ((gint8 *) b + 3 * bstride)
  };

  f[0] = _mm_loadu_pd (icoeff + 0);
  f[1] = _mm_loadu_pd (icoeff + 2);
  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();

  for (i = 0; i < len; i += 2) {
    t = _mm_loadu_pd (a + i + 0);
    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
  }
  sum[0] =
      _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
  sum[1] =
      _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
  sum[2] =
      _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
  sum[3] =
      _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
  sum[0] = _mm_add_pd (sum[0], sum[1]);
  sum[2] = _mm_add_pd (sum[2], sum[3]);
  sum[0] = _mm_add_pd (sum[0], sum[2]);
  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
  _mm_store_sd (o, sum[0]);
}
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  gint i = 0;
  __m128d sum = _mm_setzero_pd ();

  for (; i < len; i += 8) {
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
            _mm_load_pd (b + i + 0)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
            _mm_load_pd (b + i + 2)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
            _mm_load_pd (b + i + 4)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
            _mm_load_pd (b + i + 6)));
  }
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
  _mm_store_sd (o, sum);
}
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    // ****************   PARSE THE INPUTS  ******************* //
    myAssert(nrhs==4,"fitStumpUINT8_c: bad nrhs");
    const mxArray* mxX = prhs[0];
    myAssert(mxIsUint8(mxX), "fitStumpUINT8_c: X must be uint8");
    unsigned char* X = (unsigned char*) mxGetPr(mxX);
    int N = mxGetM(mxX);    int p = mxGetN(mxX);
    
    const mxArray* mxWWY = prhs[1];
    myAssert(mxIsDouble(prhs[1]), "fitStumpUINT8_c: wwy must be double");
    double* wwy = (double*) mxGetPr(mxWWY);
    myAssert(mxGetM(mxWWY)==2, "fitStumpUINT8_c: wwy must be 2 x N");
    myAssert(mxGetN(mxWWY)==N, "fitStumpUINT8_c: wwy must be 2 x N");
    const mxArray* mxCandVars = prhs[2];
    myAssert(mxIsUint32(mxCandVars), "fitStumpUINT8_c: mxCandVars must be uint32");
    unsigned int* candVars = (unsigned int*) mxGetPr(mxCandVars);
    int nCand = mxGetNumberOfElements(mxCandVars);
    
    const mxArray* mxGoodInd = prhs[3];
    int nGd = mxGetNumberOfElements(mxGoodInd);
    myAssert(nGd==0 || mxIsUint32(mxGoodInd), "fitStumpUINT8_c: mxGoodInd must be uint32");
    unsigned int* goodInd = (unsigned int*) mxGetPr(mxGoodInd);
        
    // ******************  SET UP THE OUTPUTS  ******************* //
    plhs[0] = mxCreateNumericMatrix(1,nCand,mxINT32_CLASS,mxREAL);
    int* cutInd = (int*) mxGetPr(plhs[0]);
	plhs[1] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS, mxREAL);
    double* ssxBest = (double*) mxGetPr(plhs[1]);
	plhs[2] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS,mxREAL);
	double* muL = (double*) mxGetPr(plhs[2]);
	plhs[3] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS,mxREAL);
    double* muR = (double*) mxGetPr(plhs[3]);
    // ************** MAIN LOOP OVER ALL CANDIDATE VARS *********** //
    for(int m=0; m<nCand; m++) {
        unsigned char* x = X + N*candVars[m];
        double* wwyBuck = (double *) mxMalloc(2*256*sizeof(double));
        // fill weights with small epsilon for numerical stability
        for(int i=0; i<256; i++) {
            wwyBuck[i*2] = 1.0E-10;
            wwyBuck[i*2+1] = 0;
        }
        // make weighted histogram of w and wy
        buckSums(wwyBuck, wwy, N, x, goodInd, nGd);
        // cumsum
        __m128d* wwyBuck128 = (__m128d*) wwyBuck;
        for(int i=1; i<256; i++)
            wwyBuck128[i] = _mm_add_pd(wwyBuck128[i], wwyBuck128[i-1]);
        // compute -ssx
        __m128d wCumEnd = _mm_set_pd(wwyBuck[256*2-2], wwyBuck[256*2-2]);
        __m128d wyCumEnd = _mm_set_pd(wwyBuck[256*2-1], wwyBuck[256*2-1]);

        __m128d* ssx128 = (__m128d*) mxMalloc(1*256*sizeof(__m128d));
        for(int i=0; i<128; i++) {
            __m128d wwyBuck1 = wwyBuck128[i*2];
            __m128d wwyBuck2 = wwyBuck128[i*2+1];
            __m128d wyBuck = _mm_unpackhi_pd(wwyBuck1,wwyBuck2);
            __m128d wBuck = _mm_unpacklo_pd(wwyBuck1,wwyBuck2);
            ssx128[i] = _mm_div_pd(_mm_mul_pd(wyBuck,wyBuck),wBuck);

            __m128d tmp1 = _mm_sub_pd(wyCumEnd,wyBuck);
            tmp1 = _mm_mul_pd(tmp1,tmp1);
            __m128d tmp2 = _mm_sub_pd(wCumEnd,wBuck);

            ssx128[i] = _mm_add_pd(ssx128[i],_mm_div_pd(tmp1,tmp2));
        }
        // find best split location for this candidate variable
        double* ssx = (double*) ssx128;
        double mx = ssx[0];     cutInd[m] = 0;
        
        for(int i=1;i<256;i++) {
            if(ssx[i] > mx) {
                mx = ssx[i];	cutInd[m] = i;
            }
        }
        ssxBest[m] = -mx;
        muL[m] = wwyBuck[cutInd[m]*2+1] / wwyBuck[cutInd[m]*2];
        muR[m] = (wwyBuck[256*2-1] - wwyBuck[cutInd[m]*2+1]) / (wwyBuck[256*2-2] - wwyBuck[cutInd[m]*2]);
    }
}
/* 
 * Intel single precision, _mm_stream_pd version, used for transposing
 * from a stripe buffer to columns. 
 */
static void fftOPSubTrans(
  const FFTComplex	*_src,
  FFTComplex		*_dst,
  size_t			srcRowSize,		// src, in FFTComplex, a.k.a. src numCols
  size_t			dstRowSize)		// dst, in FFTComplex, a.k.a. dst numCols
{
	double *src = (double *)_src;
	double *dst = (double *)_dst;
	
	dumpSub("fftOPSubTrans start", _src, srcRowSize);
	
	/* 
	 * row and col refer to coordinates in src 
	 * row size of dst is dstRowSize
	 */
	unsigned curcol;
	
	for(curcol=0; curcol<FFT_COMPLEX_PER_SUBMATRIX; curcol+=2) {
		__m128d vin1;
		__m128d vin2;
		__m128d vin3;
		__m128d vin4;
		__m128d vin5;
		__m128d vin6;
		__m128d vin7;
		__m128d vin8;
		
		__m128d vOut_row1_1;
		__m128d vOut_row1_2;
		__m128d vOut_row1_3;
		__m128d vOut_row1_4;
		__m128d vOut_row2_1;
		__m128d vOut_row2_2;
		__m128d vOut_row2_3;
		__m128d vOut_row2_4;
		
		const double *pIn = src + curcol;
		double *pOut = dst + curcol*dstRowSize;
		
		// load in two columns from src at curcol
		vin1 = _mm_load_pd(pIn+0*srcRowSize);
		vin2 = _mm_load_pd(pIn+1*srcRowSize);
		vin3 = _mm_load_pd(pIn+2*srcRowSize);
		vin4 = _mm_load_pd(pIn+3*srcRowSize);
		vin5 = _mm_load_pd(pIn+4*srcRowSize);
		vin6 = _mm_load_pd(pIn+5*srcRowSize);
		vin7 = _mm_load_pd(pIn+6*srcRowSize);
		vin8 = _mm_load_pd(pIn+7*srcRowSize);
		
		///////////////////////////////////////////////
		// transpose for first row out
		
		vOut_row1_1 = _mm_unpacklo_pd(vin1, vin2);
		vOut_row1_2 = _mm_unpacklo_pd(vin3, vin4);
		vOut_row1_3 = _mm_unpacklo_pd(vin5, vin6);
		vOut_row1_4 = _mm_unpacklo_pd(vin7, vin8);
		
		_mm_stream_pd(pOut+(0*FFT_COMPLEX_PER_VECTOR), vOut_row1_1);
		_mm_stream_pd(pOut+(1*FFT_COMPLEX_PER_VECTOR), vOut_row1_2);
		_mm_stream_pd(pOut+(2*FFT_COMPLEX_PER_VECTOR), vOut_row1_3);
		_mm_stream_pd(pOut+(3*FFT_COMPLEX_PER_VECTOR), vOut_row1_4);
		
		///////////////////////////////////////////////
		// transpose for second row out
		pOut += dstRowSize;
		
		vOut_row2_1 = _mm_unpackhi_pd(vin1, vin2);
		vOut_row2_2 = _mm_unpackhi_pd(vin3, vin4);
		vOut_row2_3 = _mm_unpackhi_pd(vin5, vin6);
		vOut_row2_4 = _mm_unpackhi_pd(vin7, vin8);
		
		_mm_stream_pd(pOut+(0*FFT_COMPLEX_PER_VECTOR), vOut_row2_1);
		_mm_stream_pd(pOut+(1*FFT_COMPLEX_PER_VECTOR), vOut_row2_2);
		_mm_stream_pd(pOut+(2*FFT_COMPLEX_PER_VECTOR), vOut_row2_3);
		_mm_stream_pd(pOut+(3*FFT_COMPLEX_PER_VECTOR), vOut_row2_4);
	}
	
	dumpSub("fftOPSubTrans end", _dst, dstRowSize);
}
示例#9
0
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
                /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
示例#10
0
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
            if (sum < 0.0) sum = 0.0;
            if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
示例#11
0
static inline void   sacEvaluateModelSPRT(PROSAC_HEST* p){
	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (    p->epsilon  ));
	float    distSq = p->maxD*p->maxD;
	float*   src = (float*)p->src;
	float*   dst = (float*)p->dst;
	float*   H   = p->H;
	
	
	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;
	
	
	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);
	
	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);
	
	for(i=0;i<(p->N-3) && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);
		y=_mm_load_ps(src+2*i+4);
		X=_mm_load_ps(dst+2*i);
		Y=_mm_load_ps(dst+2*i+4);
		
		inter0=_mm_unpacklo_ps(x,y);// y1 y0 x1 x0
		inter1=_mm_unpackhi_ps(x,y);// y3 y2 x3 x2
		inter2=_mm_unpacklo_ps(X,Y);// Y1 Y0 X1 X0
		inter3=_mm_unpackhi_ps(X,Y);// Y3 Y2 X3 X2
		
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		
		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);
		
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);
		//reprojX = _mm_div_ps(reprojX, reprojZ);
		//reprojY = _mm_div_ps(reprojY, reprojZ);
		
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);
		
		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);
		
		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);
		
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);
		
		/* ... */
		/*                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15*/
		unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
		p->inl     += bitCnt[msk];
		
		
		/* SPRT */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];
		
		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)
		
		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;
		
		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;
		
		/* ... */
		isInlier    = reprojDist <= distSq;
		p->inl     += isInlier;
		
		
		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	
	p->N_tested = i;
}
test (__m128d s1, __m128d s2)
{
  return _mm_unpackhi_pd (s1, s2); 
}