/*
 * Inner product of `a` against two coefficient rows, blended linearly.
 *
 * The two rows start at `b` and at `b` + `bstride` bytes.  With S0 and S1
 * the dot products of `a` with row 0 and row 1, the value written to `*o` is
 *
 *     icoeff[0] * (S0 - S1) + S1
 *
 * `a` is read with unaligned loads; both coefficient rows are read with
 * aligned loads and therefore must be 16-byte aligned.  Samples are consumed
 * four at a time, so the buffers must be readable up to the next multiple of
 * four when `len` is not one.
 */
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  const gdouble *taps0 = (gdouble *) ((gint8 *) b + 0 * bstride);
  const gdouble *taps1 = (gdouble *) ((gint8 *) b + 1 * bstride);
  __m128d acc0 = _mm_setzero_pd ();
  __m128d acc1 = _mm_setzero_pd ();
  __m128d blended;
  gint pos, k;

  for (pos = 0; pos < len; pos += 4) {
    /* Two 2-wide steps per iteration, each feeding both accumulators. */
    for (k = 0; k < 4; k += 2) {
      const __m128d in = _mm_loadu_pd (a + pos + k);

      acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos + k)));
      acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos + k)));
    }
  }

  /* Blend the two partial sums with the single interpolation coefficient. */
  blended = _mm_mul_pd (_mm_sub_pd (acc0, acc1), _mm_load1_pd (icoeff));
  blended = _mm_add_pd (blended, acc1);

  /* Horizontal add: low lane + high lane, result in the low lane. */
  blended = _mm_add_sd (blended, _mm_unpackhi_pd (blended, blended));
  _mm_store_sd (o, blended);
}
/*
 * Codegen test stub: wraps _mm_unpackhi_pd so the checker directives below
 * can match the emitted IR / assembly.  Returns { A[1], B[1] }.
 *
 * The directives are line comments and must each sit on their own line;
 * otherwise the first `//` would comment out the return statement and the
 * closing brace.
 */
__m128d test_mm_unpackhi_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_unpackhi_pd
  // DAG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 3>
  //
  // ASM-LABEL: test_mm_unpackhi_pd
  // ASM: unpckhpd
  return _mm_unpackhi_pd(A, B);
}
// In-place transpose of a 4x4 block of 32-bit lanes held in four registers.
//
// On entry io_dataN holds row N; on exit io_dataN holds column N, i.e.
// io_data0 = { r0[0], r1[0], r2[0], r3[0] } and so on.
void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2, __m128 &io_data3)
{
    // Interleave row pairs: lo* carry lanes 0/1, hi* carry lanes 2/3.
    // All four inputs are consumed before any output is overwritten.
    const __m128 lo01 = _mm_unpacklo_ps(io_data0, io_data1);
    const __m128 lo23 = _mm_unpacklo_ps(io_data2, io_data3);
    const __m128 hi01 = _mm_unpackhi_ps(io_data0, io_data1);
    const __m128 hi23 = _mm_unpackhi_ps(io_data2, io_data3);

    // Combine 64-bit halves: movelh takes both low halves, movehl (with the
    // operands swapped) takes both high halves in first/second order.
    io_data0 = _mm_movelh_ps(lo01, lo23);
    io_data1 = _mm_movehl_ps(lo23, lo01);
    io_data2 = _mm_movelh_ps(hi01, hi23);
    io_data3 = _mm_movehl_ps(hi23, hi01);
}
/*
 * Complex multiply of two values packed as [re, im] in a __m128d.
 *
 * Returns [a.r*b.r - a.i*b.i, a.r*b.i + a.i*b.r].
 * Uses SSE3 (_mm_movedup_pd, _mm_addsub_pd).
 */
static __inline __m128d ZMUL(__m128d a, __m128d b)
{
    const __m128d a_re = _mm_movedup_pd(a);           /* [a.r a.r] */
    const __m128d a_im = _mm_unpackhi_pd(a, a);       /* [a.i a.i] */
    const __m128d b_swapped = _mm_shuffle_pd(b, b, 1);  /* [b.i b.r] */

    /* addsub subtracts in the low lane and adds in the high lane. */
    return _mm_addsub_pd(_mm_mul_pd(a_re, b),           /* [a.r*b.r a.r*b.i] */
                         _mm_mul_pd(a_im, b_swapped));  /* [a.i*b.i a.i*b.r] */
}
/*
 * Inner product of `a` against four coefficient rows, combined with four
 * weights.
 *
 * Row k starts `k * bstride` bytes after `b`.  With Sk the dot product of
 * `a` with row k, the value written to `*o` is
 *
 *     (S0*icoeff[0] + S1*icoeff[1]) + (S2*icoeff[2] + S3*icoeff[3])
 *
 * `a` and `icoeff` are read with unaligned loads; the coefficient rows are
 * read with aligned loads and must be 16-byte aligned.  Samples are consumed
 * two at a time.
 */
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  const gdouble *taps[4];
  __m128d acc[4], weight[4];
  __m128d w01, w23, total;
  gint pos, k;

  for (k = 0; k < 4; k++) {
    taps[k] = (gdouble *) ((gint8 *) b + k * bstride);
    acc[k] = _mm_setzero_pd ();
  }

  /* Broadcast each of the four weights across both lanes. */
  w01 = _mm_loadu_pd (icoeff + 0);
  w23 = _mm_loadu_pd (icoeff + 2);
  weight[0] = _mm_unpacklo_pd (w01, w01);
  weight[1] = _mm_unpackhi_pd (w01, w01);
  weight[2] = _mm_unpacklo_pd (w23, w23);
  weight[3] = _mm_unpackhi_pd (w23, w23);

  for (pos = 0; pos < len; pos += 2) {
    const __m128d in = _mm_loadu_pd (a + pos);

    for (k = 0; k < 4; k++)
      acc[k] = _mm_add_pd (acc[k], _mm_mul_pd (in, _mm_load_pd (taps[k] + pos)));
  }

  for (k = 0; k < 4; k++)
    acc[k] = _mm_mul_pd (acc[k], weight[k]);

  /* Pairwise reduction, keeping the (0+1) + (2+3) association. */
  total = _mm_add_pd (_mm_add_pd (acc[0], acc[1]), _mm_add_pd (acc[2], acc[3]));

  /* Horizontal add: low lane + high lane, result in the low lane. */
  total = _mm_add_sd (total, _mm_unpackhi_pd (total, total));
  _mm_store_sd (o, total);
}
/*
 * Plain dot product of `a` and `b`, written to `*o`.
 *
 * `a` is read with unaligned loads; `b` is read with aligned loads and must
 * be 16-byte aligned.  Samples are consumed eight at a time, so both buffers
 * must be readable up to the next multiple of eight when `len` is not one.
 * `icoeff` and `bstride` are not used by this variant.
 */
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  __m128d acc = _mm_setzero_pd ();
  gint pos, k;

  for (pos = 0; pos < len; pos += 8) {
    const gdouble *in = a + pos;
    const gdouble *taps = b + pos;

    /* Four 2-wide multiply-accumulate steps, in ascending order. */
    for (k = 0; k < 8; k += 2)
      acc = _mm_add_pd (acc,
          _mm_mul_pd (_mm_loadu_pd (in + k), _mm_load_pd (taps + k)));
  }

  /* Horizontal add: low lane + high lane, result in the low lane. */
  acc = _mm_add_sd (acc, _mm_unpackhi_pd (acc, acc));
  _mm_store_sd (o, acc);
}
/*
 * MEX gateway: for every candidate column of a uint8 matrix, pick the best
 * split point among the 256 possible byte values.
 *
 * Inputs (prhs):
 *   [0] X        uint8 matrix, N rows by p columns
 *   [1] wwy      double matrix, 2 x N
 *   [2] candVars uint32 array of 0-based column indices into X
 *   [3] goodInd  uint32 array (may be empty), forwarded to buckSums
 *
 * Outputs (plhs), each 1 x nCand:
 *   [0] cutInd   int32, index (0..255) of the bucket with the largest score
 *   [1] ssxBest  double, negated best score
 *   [2] muL      double, cumulative wy / cumulative w up to the cut
 *   [3] muR      double, the same ratio for the remainder
 *
 * NOTE(review): all four outputs are written regardless of nlhs - confirm
 * that callers always request four outputs.
 */
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[])
{
    // **************** PARSE THE INPUTS ******************* //
    myAssert(nrhs==4,"fitStumpUINT8_c: bad nrhs");

    const mxArray* mxX = prhs[0];
    myAssert(mxIsUint8(mxX), "fitStumpUINT8_c: X must be uint8");
    unsigned char* X = (unsigned char*) mxGetPr(mxX);
    int N = mxGetM(mxX);
    int p = mxGetN(mxX);

    const mxArray* mxWWY = prhs[1];
    myAssert(mxIsDouble(prhs[1]), "fitStumpUINT8_c: wwy must be double");
    double* wwy = (double*) mxGetPr(mxWWY);
    myAssert(mxGetM(mxWWY)==2, "fitStumpUINT8_c: wwy must be 2 x N");
    myAssert(mxGetN(mxWWY)==N, "fitStumpUINT8_c: wwy must be 2 x N");

    const mxArray* mxCandVars = prhs[2];
    myAssert(mxIsUint32(mxCandVars), "fitStumpUINT8_c: mxCandVars must be uint32");
    unsigned int* candVars = (unsigned int*) mxGetPr(mxCandVars);
    int nCand = mxGetNumberOfElements(mxCandVars);

    const mxArray* mxGoodInd = prhs[3];
    int nGd = mxGetNumberOfElements(mxGoodInd);
    myAssert(nGd==0 || mxIsUint32(mxGoodInd), "fitStumpUINT8_c: mxGoodInd must be uint32");
    unsigned int* goodInd = (unsigned int*) mxGetPr(mxGoodInd);

    // ****************** SET UP THE OUTPUTS ******************* //
    plhs[0] = mxCreateNumericMatrix(1,nCand,mxINT32_CLASS,mxREAL);
    int* cutInd = (int*) mxGetPr(plhs[0]);
    plhs[1] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS, mxREAL);
    double* ssxBest = (double*) mxGetPr(plhs[1]);
    plhs[2] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS,mxREAL);
    double* muL = (double*) mxGetPr(plhs[2]);
    plhs[3] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS,mxREAL);
    double* muR = (double*) mxGetPr(plhs[3]);

    // Scratch buffers are allocated once and released at the end instead of
    // being re-allocated (and never freed) for every candidate.
    //   wwyBuck: 256 interleaved (w, wy) pairs
    //   ssx128 : 256 scores, stored two per __m128d
    // Both are accessed through __m128d pointers, which requires 16-byte
    // alignment - presumably guaranteed by mxMalloc on the target platform.
    double* wwyBuck = (double *) mxMalloc(2*256*sizeof(double));
    __m128d* wwyBuck128 = (__m128d*) wwyBuck;
    __m128d* ssx128 = (__m128d*) mxMalloc(128*sizeof(__m128d));
    double* ssx = (double*) ssx128;

    // ************** MAIN LOOP OVER ALL CANDIDATE VARS *********** //
    for(int m=0; m<nCand; m++) {
        // Reject column indices that would point past the end of X.
        myAssert(candVars[m] < (unsigned int) p, "fitStumpUINT8_c: candVars out of range");

        // Column-major storage: column j starts N elements after column j-1.
        // The offset is computed in size_t to avoid 32-bit wrap-around.
        unsigned char* x = X + (size_t) N*candVars[m];

        // fill weights with small epsilon for numerical stability
        for(int i=0; i<256; i++) {
            wwyBuck[i*2] = 1.0E-10;
            wwyBuck[i*2+1] = 0;
        }

        // make weighted histogram of w and wy (buckSums is defined elsewhere)
        buckSums(wwyBuck, wwy, N, x, goodInd, nGd);

        // cumsum over buckets; each __m128d holds one (w, wy) pair
        for(int i=1; i<256; i++)
            wwyBuck128[i] = _mm_add_pd(wwyBuck128[i], wwyBuck128[i-1]);

        // compute -ssx for two buckets at a time
        __m128d wCumEnd = _mm_set1_pd(wwyBuck[256*2-2]);
        __m128d wyCumEnd = _mm_set1_pd(wwyBuck[256*2-1]);
        for(int i=0; i<128; i++) {
            __m128d wwyBuck1 = wwyBuck128[i*2];
            __m128d wwyBuck2 = wwyBuck128[i*2+1];
            // De-interleave: wBuck = (w[2i], w[2i+1]), wyBuck = (wy[2i], wy[2i+1])
            __m128d wyBuck = _mm_unpackhi_pd(wwyBuck1,wwyBuck2);
            __m128d wBuck = _mm_unpacklo_pd(wwyBuck1,wwyBuck2);

            // left part: wy^2 / w
            ssx128[i] = _mm_div_pd(_mm_mul_pd(wyBuck,wyBuck),wBuck);

            // right part: (wyTotal - wy)^2 / (wTotal - w).  For the last
            // bucket this is 0/0 = NaN, which never wins the `>` test below.
            __m128d tmp1 = _mm_sub_pd(wyCumEnd,wyBuck);
            tmp1 = _mm_mul_pd(tmp1,tmp1);
            __m128d tmp2 = _mm_sub_pd(wCumEnd,wBuck);
            ssx128[i] = _mm_add_pd(ssx128[i],_mm_div_pd(tmp1,tmp2));
        }

        // find best split location for this candidate variable
        double mx = ssx[0];
        cutInd[m] = 0;
        for(int i=1;i<256;i++) {
            if(ssx[i] > mx) {
                mx = ssx[i];
                cutInd[m] = i;
            }
        }
        ssxBest[m] = -mx;
        muL[m] = wwyBuck[cutInd[m]*2+1] / wwyBuck[cutInd[m]*2];
        muR[m] = (wwyBuck[256*2-1] - wwyBuck[cutInd[m]*2+1]) / (wwyBuck[256*2-2] - wwyBuck[cutInd[m]*2]);
    }

    mxFree(ssx128);
    mxFree(wwyBuck);
}
/*
 * Intel single precision, _mm_stream_pd version, used for transposing
 * from a stripe buffer to columns.
 *
 * Both buffers are addressed as arrays of double; presumably one FFTComplex
 * occupies one double-sized slot - TODO confirm against the FFTComplex
 * definition.  For every pair of adjacent source columns, eight source rows
 * are loaded and written out as two destination rows with non-temporal
 * stores.  All loads and stores are the aligned variants, so every address
 * touched must be 16-byte aligned.
 *
 *   _src        source submatrix
 *   _dst        destination submatrix
 *   srcRowSize  src row size in FFTComplex, a.k.a. src numCols
 *   dstRowSize  dst row size in FFTComplex, a.k.a. dst numCols
 */
static void fftOPSubTrans(
	const FFTComplex *_src,
	FFTComplex *_dst,
	size_t srcRowSize,		// src, in FFTComplex, a.k.a. src numCols
	size_t dstRowSize)		// dst, in FFTComplex, a.k.a. dst numCols
{
	enum { ROWS_PER_PASS = 8, PAIRS_PER_PASS = ROWS_PER_PASS / 2 };

	const double *srcBase = (const double *)_src;
	double *dstBase = (double *)_dst;
	unsigned col;

	dumpSub("fftOPSubTrans start", _src, srcRowSize);

	/*
	 * `col` is a column coordinate in src; it becomes a row coordinate
	 * in dst, whose row size is dstRowSize.
	 */
	for(col=0; col<FFT_COMPLEX_PER_SUBMATRIX; col+=2) {
		__m128d rows[ROWS_PER_PASS];
		const double *in = srcBase + col;
		double *outFirst = dstBase + col*dstRowSize;
		double *outSecond = outFirst + dstRowSize;
		unsigned k;

		/* Load two adjacent columns from eight consecutive src rows. */
		for(k=0; k<ROWS_PER_PASS; k++) {
			rows[k] = _mm_load_pd(in + k*srcRowSize);
		}

		/* First output row: the low lane of each consecutive row pair. */
		for(k=0; k<PAIRS_PER_PASS; k++) {
			_mm_stream_pd(outFirst+(k*FFT_COMPLEX_PER_VECTOR),
				_mm_unpacklo_pd(rows[2*k], rows[2*k+1]));
		}

		/* Second output row: the high lane of each consecutive row pair. */
		for(k=0; k<PAIRS_PER_PASS; k++) {
			_mm_stream_pd(outSecond+(k*FFT_COMPLEX_PER_VECTOR),
				_mm_unpackhi_pd(rows[2*k], rows[2*k+1]));
		}
	}

	dumpSub("fftOPSubTrans end", _dst, dstRowSize);
}
// Vector-valued noise at EPoint, written to result, using SSE/SSE4.1 and
// FMA4 (_mm_nmacc_pd) intrinsics.
//
// Steps visible in this function:
//   1. Split each coordinate into an integer lattice cell and a fractional
//      offset inside that cell.
//   2. Hash the eight surrounding lattice corners into RTable entries.
//   3. Build smooth blend weights s = t*t*(3 - 2*t) and their complements.
//   4. Accumulate the weighted corner contributions via INCRSUMP2 and store
//      the X/Y pair and the Z component into result.
//
// NOTE(review): INCRSUMP2, Hash2d, Hash1dRTableIndex and RTable are defined
// elsewhere.  INCRSUMP2 presumably uses the scratch locals mp_t1, mp_t2,
// mp1_mm, mp2_mm, mp4_mm, mp6_mm and sum_p by name - confirm before renaming
// or removing any of them.
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
    /* JB fix for the range problem */

    // Pack (x, y) into one register and z into the low lane of another.
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);

    // blendv selects on the sign bit of its third operand: negative lanes
    // use (value - (1 - EPSILON)) so that the truncating conversion rounds
    // them downwards instead of towards zero.
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // Fractional offsets inside the lattice cell.
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // Lattice indices, shifted by the NOISE_MIN* offsets and masked to 12 bits.
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // Hash the four (x, y) corner combinations; "i" is the base index and
    // "j" is the base index + 1.
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // Table entries for the eight corners: mp1..mp4 at iz, mp5..mp8 at iz + 1.
    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // Broadcast each fractional offset to both lanes; the j* variants are
    // the same offsets measured from the opposite corner (offset - 1).
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);
    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // Blend weights: s = t*t*(3 - 2*t), where _mm_nmacc_pd(a, b, c) yields
    // c - a*b; the complement is 1 - s.
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));
    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);

    // Pairwise weight products; the variable names spell out the lane
    // contents, e.g. mm_txty_txsy = (tx*ty, tx*sy).
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    // Scratch registers, not used directly in this function body.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    // X/Y accumulation: one INCRSUMP2 per corner, with that corner's weight
    // broadcast to both lanes and table pointers mpN and mpN + 8.
    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    // Z accumulation: two corners per INCRSUMP2 (table pointers at + 16),
    // one per lane, with the y offset differing per lane: (iy, jy).
    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    // Fold the two Z lanes into one value.
    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    // Two doubles go to the address yielded by *result (presumably the
    // vector's X and Y storage - confirm); Z is stored separately.
    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
// Scalar noise at EPoint, clamped to [0, 1], using SSE/SSE4.1 and FMA4
// intrinsics.
//
// noise_generator selects the variant:
//   kNoiseGen_Perlin         - delegate to SolidNoise() and rescale
//   kNoiseGen_RangeCorrected - lattice noise rescaled as (sum + 1.05242) * 0.48985582
//   anything else            - lattice noise + 0.5
//
// NOTE(review): INCRSUMP2, Hash2d, Hash1dRTableIndex and RTable are defined
// elsewhere.  INCRSUMP2 presumably uses the scratch locals mp_t1, mp_t2,
// mp1_mm, mp2_mm, mp4_mm, mp6_mm and sum_p by name - confirm before renaming
// or removing any of them.
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    // Pack (x, y) into one register and z into the low lane of another.
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);

    // blendv selects on the sign bit of its third operand: negative lanes
    // use (value - (1 - EPSILON)) so that the truncating conversion rounds
    // them downwards instead of towards zero.
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // Fractional offsets inside the lattice cell.
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // Lattice indices, shifted by the NOISE_MIN* offsets and masked to 12 bits.
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // Hash the four (x, y) corner combinations; "i" is the base index and
    // "j" is the base index + 1.
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // Table entries for the eight corners, grouped in pairs that differ
    // only in y (iy vs iy + 1) so each INCRSUMP2 call handles two corners.
    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // Broadcast each fractional offset to both lanes; the j* variants are
    // the same offsets measured from the opposite corner (offset - 1).
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);
    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // Blend weights: s = t*t*(3 - 2*t), where _mm_nmacc_pd(a, b, c) yields
    // c - a*b; the complement is 1 - s.
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));
    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);

    // Pairwise weight products; the variable names spell out the lane
    // contents, e.g. mm_txty_txsy = (tx*ty, tx*sy).
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    // y offset per lane: (iy, jy), matching the corner pairs above.
    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    // Scratch registers; apart from s_mm they are not used directly in this
    // function body.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    // Four accumulation steps, two corners each, with the matching weights.
    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    // Fold the two lanes into one value.
    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        // _mm_macc_sd(a, b, c) yields a*b + c, i.e. (sum + 1.05242) * 0.48985582.
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    // Clamp the low lane to [0, 1].
    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
/*
 * Evaluate the current homography p->H against the point correspondences
 * using a sequential probability ratio test (SPRT).
 *
 * For every correspondence the source point is projected through H, the
 * squared reprojection error is compared against p->maxD^2, and the running
 * likelihood ratio `lambda` is updated.  Evaluation stops early as soon as
 * lambda exceeds p->A.
 *
 * Reads:  p->delta, p->epsilon, p->maxD, p->src, p->dst, p->H, p->N,
 *         p->lambdaTBL, p->A
 * Writes: p->inl      (number of inliers seen)
 *         p->N_tested (number of correspondences examined)
 *         p->good     (non-zero while the model has not been rejected)
 *
 * Points are stored interleaved (x0, y0, x1, y1, ...) in p->src and p->dst.
 * H is read at indices 0-2, 4-6 and 8-10, i.e. a 3x3 matrix with a row
 * stride of four floats.  The vector path uses aligned loads, so p->src and
 * p->dst must be 16-byte aligned.
 */
static inline void sacEvaluateModelSPRT(PROSAC_HEST* p){
	/* Number of set bits for each 4-bit comparison mask. */
	/*                               0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
	static const unsigned bitCnt[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};

	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (   p->epsilon   ));
	float    distSq       = p->maxD*p->maxD;
	float*   src          = (float*)p->src;
	float*   dst          = (float*)p->dst;
	float*   H            = p->H;

	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;

	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);

	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);

	/*
	 * Four points per iteration.  The bound is written as i+3 < N rather
	 * than i < N-3 so that it cannot wrap around when N is smaller than 3.
	 */
	for(i=0;(i+3)<p->N && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);   /* x0 y0 x1 y1 */
		y=_mm_load_ps(src+2*i+4); /* x2 y2 x3 y3 */
		X=_mm_load_ps(dst+2*i);   /* X0 Y0 X1 Y1 */
		Y=_mm_load_ps(dst+2*i+4); /* X2 Y2 X3 Y3 */

		inter0=_mm_unpacklo_ps(x,y); /* x0 x2 y0 y2 */
		inter1=_mm_unpackhi_ps(x,y); /* x1 x3 y1 y3 */
		inter2=_mm_unpacklo_ps(X,Y); /* X0 X2 Y0 Y2 */
		inter3=_mm_unpackhi_ps(X,Y); /* X1 X3 Y1 Y3 */

		/*
		 * De-interleaved coordinates.  Lanes hold points 0,2,1,3 in that
		 * order, consistently for all four registers.
		 */
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));

		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);

		/* Approximate reciprocal instead of a full division. */
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);

		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);

		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);

		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);

		/* One mask bit per lane: set when the point is an inlier. */
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);

		/* ... */
		p->inl += bitCnt[msk];

		/* SPRT */
		/*
		 * NOTE(review): mask bits follow the 0,2,1,3 lane order; this is
		 * harmless if lambdaTBL[msk] depends only on the number of set
		 * bits - confirm against the code that fills lambdaTBL.
		 */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}

	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];

		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)

		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;

		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;

		/* ... */
		isInlier = reprojDist <= distSq;
		p->inl += isInlier;

		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}

	p->N_tested = i;
}
/*
 * Thin wrapper around _mm_unpackhi_pd: returns { s1[1], s2[1] }.
 *
 * The return type is spelled out explicitly; an implicit `int` return type
 * is not valid for a function returning a __m128d.
 */
__m128d
test (__m128d s1, __m128d s2)
{
  return _mm_unpackhi_pd (s1, s2);
}