__m128d test_mm_unpacklo_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_unpacklo_pd
  // DAG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
  //
  // ASM-LABEL: test_mm_unpacklo_pd
  // ASM: unpcklpd
  return _mm_unpacklo_pd(A, B);
}
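/*
 * A minimal standalone sketch (hypothetical driver, SSE2 only) of the lane
 * semantics the shufflevector above encodes: the result takes element 0 of
 * each operand, <a0, b0>.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d a = _mm_set_pd(11.0, 10.0);  /* lanes: a0 = 10.0, a1 = 11.0 */
    __m128d b = _mm_set_pd(21.0, 20.0);  /* lanes: b0 = 20.0, b1 = 21.0 */
    __m128d lo = _mm_unpacklo_pd(a, b);  /* <a0, b0> = <10.0, 20.0> */
    double out[2];
    _mm_storeu_pd(out, lo);
    printf("%.1f %.1f\n", out[0], out[1]); /* prints: 10.0 20.0 */
    return 0;
}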
void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2, __m128 &io_data3) {
    // Interleave row pairs: unpackhi yields the c/d halves, unpacklo the a/b halves.
    __m128 ccdd1 = _mm_unpackhi_ps(io_data0, io_data1);
    __m128 ccdd2 = _mm_unpackhi_ps(io_data2, io_data3);
    __m128 aabb1 = _mm_unpacklo_ps(io_data0, io_data1);
    __m128 aabb2 = _mm_unpacklo_ps(io_data2, io_data3);
    // Recombine 64-bit halves via pd unpacks to finish the 4x4 transpose.
    io_data0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
    io_data1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
    io_data2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
    io_data3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
}
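/*
 * A minimal driver (hypothetical, not from the original source) showing that
 * Shuffle16Elems transposes a row-major 4x4 float matrix held in four rows.
 */
#include <xmmintrin.h>
#include <emmintrin.h>
#include <stdio.h>

int main() {
    alignas(16) float m[16];
    for (int i = 0; i < 16; ++i) m[i] = (float)i;  /* rows = {0..3},{4..7},{8..11},{12..15} */
    __m128 r0 = _mm_load_ps(m),     r1 = _mm_load_ps(m + 4);
    __m128 r2 = _mm_load_ps(m + 8), r3 = _mm_load_ps(m + 12);
    Shuffle16Elems(r0, r1, r2, r3);  /* rows now hold the columns */
    _mm_store_ps(m, r0);     _mm_store_ps(m + 4, r1);
    _mm_store_ps(m + 8, r2); _mm_store_ps(m + 12, r3);
    for (int r = 0; r < 4; ++r)      /* prints 0 4 8 12 / 1 5 9 13 / ... */
        printf("%g %g %g %g\n", m[4*r], m[4*r+1], m[4*r+2], m[4*r+3]);
    return 0;
}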
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor) {
    int i, j, k, k1, it, jt, itt, jtt, it_bound, jt_bound, itt_bound, jtt_bound;
    int conflict, tmp, tmpN, offset, line_offset, setnum, set[8192/(4*sizeof(double))];
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1, fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt = itt+5*tilesize) {
        jtt_bound = (N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt = jtt+5*tilesize) {
            it_bound = (itt+5*tilesize > itt_bound) ? itt_bound : itt+5*tilesize;
            for (it = itt; it < it_bound; it = it+tilesize) {
                jt_bound = (jtt+5*tilesize > jtt_bound) ? jtt_bound : jtt+5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt+tilesize) {
                    k = 0;
                    for (j = jt; j < jt+tilesize; j = j+2) {
                        for (i = it; i < it+tilesize; i = i+2) {
                            /* scale and transpose one 2x2 block */
                            pA = a+i*N2+j;
                            pB = b+j*N1+i;
                            x = _mm_loadu_pd(pA);
                            x = _mm_mul_pd(x, fac_vector);
                            y = _mm_loadu_pd(pA + N2);
                            y = _mm_mul_pd(y, fac_vector);
                            z = _mm_shuffle_pd(x, y, 0);
                            w = _mm_shuffle_pd(x, y, 3);
                            _mm_storeu_pd(pB, z);
                            _mm_storeu_pd(pB + N1, w);
                        }
                    }
                }
            }
        }
        /* leftover columns for this stripe of rows */
        for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1+i] = factor * a[i*N2+j];
            }
        }
    }
    /* leftover rows */
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1+i] = factor * a[i*N2+j];
        }
    }
}
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) {
    int i, j, k, k1, it, jt, itt, jtt, conflict, tmp, tmpN;
    double *pA, *pB;
    register __m128d x, y, z, w, fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    for (it = 0; it < N1; it = it+tilesize) {
        for (jt = 0; jt < N2; jt = jt+tilesize) {
            k = 0;
            /* transpose one tile into buf (assumed to be a 16-byte-aligned
               file-scope scratch buffer) */
            for (j = jt; j < jt+tilesize; j = j+2) {
                for (i = it; i < it+tilesize; i = i+2) {
                    pA = a+i*N2+j;
                    x = _mm_load_pd(pA);
                    y = _mm_load_pd(pA + N2);
                    x = _mm_mul_pd(x, fac_vector);
                    y = _mm_mul_pd(y, fac_vector);
                    z = _mm_shuffle_pd(x, y, 0);
                    w = _mm_shuffle_pd(x, y, 3);
                    k = (j-jt)*tilesize + (i-it);
                    _mm_store_pd(buf + k, z);
                    _mm_store_pd(buf + k + tilesize, w);
                }
            }
            k = 0;
            k1 = 0;
            /* stream the tile rows out to b */
            for (j = jt; j < jt+tilesize; j++) {
                pB = b+j*N1+it;
                k = (j-jt)*tilesize;
                x = _mm_load_pd(&buf[k]);
                y = _mm_load_pd(&buf[k]+2);
                z = _mm_load_pd(&buf[k]+2*2);
                w = _mm_load_pd(&buf[k]+3*2);
                _mm_stream_pd(pB, x);
                _mm_stream_pd(pB+2, y);
                _mm_stream_pd(pB+2*2, z);
                _mm_stream_pd(pB+3*2, w);
            }
        }
    }
}
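/*
 * The inner kernel of the two transpose routines above is a 2x2 block
 * transpose: two loads, two _mm_shuffle_pd picks (mask 0 keeps the low lanes,
 * mask 3 the high lanes; these are exactly _mm_unpacklo_pd/_mm_unpackhi_pd),
 * two stores. A minimal sketch of just that step on a hypothetical 2x2 block:
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    double blk[4] = { 1.0, 2.0,    /* row 0 */
                      3.0, 4.0 };  /* row 1 */
    double out[4];
    __m128d x = _mm_loadu_pd(blk);        /* <1, 2> */
    __m128d y = _mm_loadu_pd(blk + 2);    /* <3, 4> */
    __m128d z = _mm_shuffle_pd(x, y, 0);  /* <x0, y0> = <1, 3> */
    __m128d w = _mm_shuffle_pd(x, y, 3);  /* <x1, y1> = <2, 4> */
    _mm_storeu_pd(out, z);                /* transposed row 0 */
    _mm_storeu_pd(out + 2, w);            /* transposed row 1 */
    printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]); /* 1 3 / 2 4 */
    return 0;
}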
void matMult_opt(int N, const double *matA, const double *matB, double *matC) {
    int i, j, k;
    if ((N%2) == 0) {
        //int TwoN=N;
        //N=N/2;
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                /* broadcast matA[j*N+i] into both lanes */
                __m128d matA_value = _mm_load_sd(&matA[j*N+i]);
                matA_value = _mm_unpacklo_pd(matA_value, matA_value);
                for (k = 0; k < N; k += 2) {
                    __m128d matB_value = _mm_load_pd(&matB[i*N+k]);
                    __m128d matC_value = _mm_load_pd(&matC[j*N+k]);
                    _mm_store_pd(&matC[j*N+k],
                                 _mm_add_pd(_mm_mul_pd(matA_value, matB_value), matC_value));
                }
            }
        }
    } else {
        /* odd N: plain scalar fallback */
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                for (k = 0; k < N; k++) {
                    matC[j*N+k] += matA[j*N+i]*matB[i*N+k];
                }
            }
        }
    }
}
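/*
 * The _mm_load_sd + _mm_unpacklo_pd(v, v) pair used above (and in several
 * routines below) is the stock SSE2 idiom for broadcasting one double into
 * both lanes; _mm_load1_pd is equivalent, and SSE3 added _mm_loaddup_pd.
 * A sketch of the equivalence (hypothetical; the last form needs SSE3):
 */
#include <emmintrin.h>   /* SSE2 */
#include <pmmintrin.h>   /* SSE3: _mm_loaddup_pd */
#include <stdio.h>

int main(void) {
    double factor = 2.5;
    __m128d v1 = _mm_load_sd(&factor);
    v1 = _mm_unpacklo_pd(v1, v1);         /* <2.5, 2.5> */
    __m128d v2 = _mm_load1_pd(&factor);   /* same result, SSE2 */
    __m128d v3 = _mm_loaddup_pd(&factor); /* same result, SSE3 */
    double o[6];
    _mm_storeu_pd(o, v1); _mm_storeu_pd(o + 2, v2); _mm_storeu_pd(o + 4, v3);
    printf("%g %g | %g %g | %g %g\n", o[0], o[1], o[2], o[3], o[4], o[5]);
    return 0;
}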
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    // **************** PARSE THE INPUTS ******************* //
    myAssert(nrhs==4, "fitStumpUINT8_c: bad nrhs");
    const mxArray* mxX = prhs[0];
    myAssert(mxIsUint8(mxX), "fitStumpUINT8_c: X must be uint8");
    unsigned char* X = (unsigned char*) mxGetPr(mxX);
    int N = mxGetM(mxX);
    int p = mxGetN(mxX);

    const mxArray* mxWWY = prhs[1];
    myAssert(mxIsDouble(prhs[1]), "fitStumpUINT8_c: wwy must be double");
    double* wwy = (double*) mxGetPr(mxWWY);
    myAssert(mxGetM(mxWWY)==2, "fitStumpUINT8_c: wwy must be 2 x N");
    myAssert(mxGetN(mxWWY)==N, "fitStumpUINT8_c: wwy must be 2 x N");

    const mxArray* mxCandVars = prhs[2];
    myAssert(mxIsUint32(mxCandVars), "fitStumpUINT8_c: mxCandVars must be uint32");
    unsigned int* candVars = (unsigned int*) mxGetPr(mxCandVars);
    int nCand = mxGetNumberOfElements(mxCandVars);

    const mxArray* mxGoodInd = prhs[3];
    int nGd = mxGetNumberOfElements(mxGoodInd);
    myAssert(nGd==0 || mxIsUint32(mxGoodInd), "fitStumpUINT8_c: mxGoodInd must be uint32");
    unsigned int* goodInd = (unsigned int*) mxGetPr(mxGoodInd);

    // ****************** SET UP THE OUTPUTS ******************* //
    plhs[0] = mxCreateNumericMatrix(1, nCand, mxINT32_CLASS, mxREAL);
    int* cutInd = (int*) mxGetPr(plhs[0]);
    plhs[1] = mxCreateNumericMatrix(1, nCand, mxDOUBLE_CLASS, mxREAL);
    double* ssxBest = (double*) mxGetPr(plhs[1]);
    plhs[2] = mxCreateNumericMatrix(1, nCand, mxDOUBLE_CLASS, mxREAL);
    double* muL = (double*) mxGetPr(plhs[2]);
    plhs[3] = mxCreateNumericMatrix(1, nCand, mxDOUBLE_CLASS, mxREAL);
    double* muR = (double*) mxGetPr(plhs[3]);

    // ************** MAIN LOOP OVER ALL CANDIDATE VARS *********** //
    for (int m = 0; m < nCand; m++) {
        unsigned char* x = X + N*candVars[m];
        double* wwyBuck = (double*) mxMalloc(2*256*sizeof(double));
        // fill weights with small epsilon for numerical stability
        for (int i = 0; i < 256; i++) {
            wwyBuck[i*2] = 1.0E-10;
            wwyBuck[i*2+1] = 0;
        }
        // make weighted histogram of w and wy
        buckSums(wwyBuck, wwy, N, x, goodInd, nGd);
        // cumsum
        __m128d* wwyBuck128 = (__m128d*) wwyBuck;
        for (int i = 1; i < 256; i++)
            wwyBuck128[i] = _mm_add_pd(wwyBuck128[i], wwyBuck128[i-1]);
        // compute -ssx
        __m128d wCumEnd  = _mm_set_pd(wwyBuck[256*2-2], wwyBuck[256*2-2]);
        __m128d wyCumEnd = _mm_set_pd(wwyBuck[256*2-1], wwyBuck[256*2-1]);
        __m128d* ssx128 = (__m128d*) mxMalloc(1*256*sizeof(__m128d));
        for (int i = 0; i < 128; i++) {
            __m128d wwyBuck1 = wwyBuck128[i*2];
            __m128d wwyBuck2 = wwyBuck128[i*2+1];
            // deinterleave the (w, wy) pairs
            __m128d wyBuck = _mm_unpackhi_pd(wwyBuck1, wwyBuck2);
            __m128d wBuck  = _mm_unpacklo_pd(wwyBuck1, wwyBuck2);
            ssx128[i] = _mm_div_pd(_mm_mul_pd(wyBuck, wyBuck), wBuck);
            __m128d tmp1 = _mm_sub_pd(wyCumEnd, wyBuck);
            tmp1 = _mm_mul_pd(tmp1, tmp1);
            __m128d tmp2 = _mm_sub_pd(wCumEnd, wBuck);
            ssx128[i] = _mm_add_pd(ssx128[i], _mm_div_pd(tmp1, tmp2));
        }
        // find best split location for this candidate variable
        double* ssx = (double*) ssx128;
        double mx = ssx[0];
        cutInd[m] = 0;
        for (int i = 1; i < 256; i++) {
            if (ssx[i] > mx) { mx = ssx[i]; cutInd[m] = i; }
        }
        ssxBest[m] = -mx;
        muL[m] = wwyBuck[cutInd[m]*2+1] / wwyBuck[cutInd[m]*2];
        muR[m] = (wwyBuck[256*2-1] - wwyBuck[cutInd[m]*2+1])
               / (wwyBuck[256*2-2] - wwyBuck[cutInd[m]*2]);
        mxFree(ssx128);   // release the per-candidate scratch buffers
        mxFree(wwyBuck);
    }
}
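/*
 * The wBuck/wyBuck step above is a stock AoS deinterleave: the buffer holds
 * (w, wy) pairs, and one _mm_unpacklo_pd/_mm_unpackhi_pd per two pairs
 * separates the w's from the wy's. A minimal sketch on hypothetical data:
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    double pairs[4] = { 1.0, 10.0,    /* w0, wy0 */
                        2.0, 20.0 };  /* w1, wy1 */
    __m128d p0 = _mm_loadu_pd(pairs);      /* <w0, wy0> */
    __m128d p1 = _mm_loadu_pd(pairs + 2);  /* <w1, wy1> */
    __m128d w  = _mm_unpacklo_pd(p0, p1);  /* <w0,  w1 > = <1, 2>   */
    __m128d wy = _mm_unpackhi_pd(p0, p1);  /* <wy0, wy1> = <10, 20> */
    double ow[2], owy[2];
    _mm_storeu_pd(ow, w); _mm_storeu_pd(owy, wy);
    printf("w = %g %g, wy = %g %g\n", ow[0], ow[1], owy[0], owy[1]);
    return 0;
}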
void transpose_4321_loop_3241_(double *unsorted, double *sorted,
                               int *p_dim1, int *p_dim2, int *p_dim3, int *p_dim4,
                               double *p_factor) {
    int dim1, dim2, dim3, dim4;
    int dim1mod, dim2mod, dim3mod, dim4mod;
    unsigned int old_offset, new_offset;
    unsigned int j1, j2, j3, j4;
    double factor = *p_factor;
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1, fac_vector;
    unsigned int N1, N2;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    dim1 = *p_dim1; dim2 = *p_dim2; dim3 = *p_dim3; dim4 = *p_dim4;
    N1 = dim1*dim2*dim3;  /* j4 stride in sorted */
    N2 = dim2*dim3*dim4;  /* j1 stride in unsorted */
    dim1mod = (int) floor((float)dim1 / (float)4);
    dim2mod = (int) floor((float)dim2 / (float)4);
    dim3mod = (int) floor((float)dim3 / (float)4);
    dim4mod = (int) floor((float)dim4 / (float)4);

    /* pluto start (dim1,dim2,dim3,dim4) */
    #pragma ivdep
    #pragma parallel
    #pragma loop count min(10) max(80) avg(40)
    #pragma unroll
    for (j3 = 0; j3 < dim3; j3++) {
        #pragma loop count min(10) max(80) avg(40)
        #pragma unroll
        for (j2 = 0; j2 < dim2; j2++) {
            #pragma loop count min(10) max(80) avg(40)
            #pragma unroll
            #pragma vector always
            for (j4 = 0; j4 < dim4; j4 += 2) {
                #pragma loop count min(10) max(80) avg(40)
                #pragma unroll
                #pragma vector always
                for (j1 = 0; j1 < dim1; j1 += 2) {
                    //sorted[j1+dim1*(j2+dim2*(j3+dim3*j4))] = unsorted[j4+dim4*(j3+dim3*(j2+dim2*j1))] * factor;
                    pA = unsorted + j4+dim4*(j3+dim3*(j2+dim2*j1));
                    pB = sorted + j1+dim1*(j2+dim2*(j3+dim3*j4));
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x, fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y, fac_vector);
                    z = _mm_shuffle_pd(x, y, 0);
                    w = _mm_shuffle_pd(x, y, 3);
                    _mm_storeu_pd(pB, z);
                    _mm_storeu_pd(pB + N1, w);
                }
            }
        }
    }
    /* pluto end */
    return;
}
/*
 * Intel single precision, _mm_stream_pd version, used for transposing
 * from a stripe buffer to columns. Each single-precision FFTComplex
 * (two floats) is moved as one 64-bit double.
 */
static void fftOPSubTrans(
    const FFTComplex *_src,
    FFTComplex *_dst,
    size_t srcRowSize,   // src, in FFTComplex, a.k.a. src numCols
    size_t dstRowSize)   // dst, in FFTComplex, a.k.a. dst numCols
{
    double *src = (double *)_src;
    double *dst = (double *)_dst;

    dumpSub("fftOPSubTrans start", _src, srcRowSize);

    /*
     * row and col refer to coordinates in src
     * row size of dst is dstRowSize
     */
    unsigned curcol;
    for (curcol = 0; curcol < FFT_COMPLEX_PER_SUBMATRIX; curcol += 2) {
        __m128d vin1, vin2, vin3, vin4, vin5, vin6, vin7, vin8;
        __m128d vOut_row1_1, vOut_row1_2, vOut_row1_3, vOut_row1_4;
        __m128d vOut_row2_1, vOut_row2_2, vOut_row2_3, vOut_row2_4;

        const double *pIn = src + curcol;
        double *pOut = dst + curcol*dstRowSize;

        // load in two columns from src at curcol
        vin1 = _mm_load_pd(pIn+0*srcRowSize);
        vin2 = _mm_load_pd(pIn+1*srcRowSize);
        vin3 = _mm_load_pd(pIn+2*srcRowSize);
        vin4 = _mm_load_pd(pIn+3*srcRowSize);
        vin5 = _mm_load_pd(pIn+4*srcRowSize);
        vin6 = _mm_load_pd(pIn+5*srcRowSize);
        vin7 = _mm_load_pd(pIn+6*srcRowSize);
        vin8 = _mm_load_pd(pIn+7*srcRowSize);

        ///////////////////////////////////////////////
        // transpose for first row out
        vOut_row1_1 = _mm_unpacklo_pd(vin1, vin2);
        vOut_row1_2 = _mm_unpacklo_pd(vin3, vin4);
        vOut_row1_3 = _mm_unpacklo_pd(vin5, vin6);
        vOut_row1_4 = _mm_unpacklo_pd(vin7, vin8);

        _mm_stream_pd(pOut+(0*FFT_COMPLEX_PER_VECTOR), vOut_row1_1);
        _mm_stream_pd(pOut+(1*FFT_COMPLEX_PER_VECTOR), vOut_row1_2);
        _mm_stream_pd(pOut+(2*FFT_COMPLEX_PER_VECTOR), vOut_row1_3);
        _mm_stream_pd(pOut+(3*FFT_COMPLEX_PER_VECTOR), vOut_row1_4);

        ///////////////////////////////////////////////
        // transpose for second row out
        pOut += dstRowSize;

        vOut_row2_1 = _mm_unpackhi_pd(vin1, vin2);
        vOut_row2_2 = _mm_unpackhi_pd(vin3, vin4);
        vOut_row2_3 = _mm_unpackhi_pd(vin5, vin6);
        vOut_row2_4 = _mm_unpackhi_pd(vin7, vin8);

        _mm_stream_pd(pOut+(0*FFT_COMPLEX_PER_VECTOR), vOut_row2_1);
        _mm_stream_pd(pOut+(1*FFT_COMPLEX_PER_VECTOR), vOut_row2_2);
        _mm_stream_pd(pOut+(2*FFT_COMPLEX_PER_VECTOR), vOut_row2_3);
        _mm_stream_pd(pOut+(3*FFT_COMPLEX_PER_VECTOR), vOut_row2_4);
    }
    dumpSub("fftOPSubTrans end", _dst, dstRowSize);
}
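/*
 * _mm_stream_pd performs a non-temporal store: it bypasses the cache, needs a
 * 16-byte-aligned destination, and should be followed by _mm_sfence once the
 * streamed data must be visible elsewhere. A hedged sketch of that pattern
 * (hypothetical helper, not the FFT code above):
 */
#include <emmintrin.h>
#include <stddef.h>

void stream_scaled_copy(const double *src, double *dst, size_t n, double f) {
    /* assumes dst is 16-byte aligned and n is even */
    __m128d fv = _mm_set1_pd(f);
    for (size_t i = 0; i < n; i += 2)
        _mm_stream_pd(dst + i, _mm_mul_pd(_mm_loadu_pd(src + i), fv));
    _mm_sfence();  /* make the non-temporal stores globally visible */
}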
void tce_sort_6_simd(double* unsorted, double* sorted,
                     int a, int b, int c, int d, int e, int f,
                     int i, int j, int k, int l, int m, int n, double factor) {
    int id[6], jd[6], ia, ib, j1, j2, j3, j4, j5, j6;
    int l1, l2, l3, l4, l5, l6;
    int ia1, ia2, ia3, ia4, ia5, ia6;
    int ib1, ib2, ib3, ib4, ib5, ib6;
    int rangea1, rangea2, rangea3, rangea4, rangea5, rangea6;
    int rangeb1, rangeb2, rangeb3, rangeb4, rangeb5, rangeb6;
    int range[6], order[6], order_r[6];
    int jj1, jj2, jj3, jj4, jj5, jj6;
    int jj1_bound, jj2_bound, jj3_bound, jj4_bound, jj5_bound, jj6_bound;
    int N1, N2;
    double *pA, *pB;
    register __m128d x, y, z, w, p, q, fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    jd[0] = a; jd[1] = b; jd[2] = c; jd[3] = d; jd[4] = e; jd[5] = f;
    // prefer writes
    range[0] = b*c*d*e*f; range[1] = c*d*e*f; range[2] = d*e*f;
    range[3] = e*f; range[4] = f; range[5] = 1;
    l1 = jd[i]; l2 = jd[j]; l3 = jd[k]; l4 = jd[l]; l5 = jd[m]; l6 = jd[n];
    rangea1 = range[i]; rangea2 = range[j]; rangea3 = range[k];
    rangea4 = range[l]; rangea5 = range[m]; rangea6 = range[n];
    rangeb1 = l2*l3*l4*l5*l6; rangeb2 = l3*l4*l5*l6; rangeb3 = l4*l5*l6;
    rangeb4 = l5*l6; rangeb5 = l6; rangeb6 = 1;

    // here vectorization can rely on the compiler
    if (n == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6++) {
                                ia = ia5 + j6*rangea6; ib = ib5 + j6*rangeb6;
                                sorted[ib] = unsorted[ia] * factor;
                            }
                        }
                    }
                }
            }
        }
    }
    if (m == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5 += tilesize) {
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj5_bound = (j5 + tilesize > l5) ? l5 : j5+tilesize;
                                for (jj5 = j5; jj5 < jj5_bound; jj5 += 2) {
                                    ia5 = ia4 + jj5*rangea5; ib5 = ib4 + jj5*rangeb5;
                                    jj6_bound = (j6 + tilesize > l6) ? l6 : j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia5 + jj6*rangea6; ib = ib5 + jj6*rangeb6;
                                        N1 = rangeb5; N2 = rangea6;
                                        pA = unsorted+ia; pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                        y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                        z = _mm_shuffle_pd(x, y, 0);
                                        w = _mm_shuffle_pd(x, y, 3);
                                        _mm_storeu_pd(pB, z);
                                        _mm_storeu_pd(pB + N1, w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    if (l == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia3 + j5*rangea5; ib5 = ib3 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj4_bound = (j4 + tilesize > l4) ? l4 : j4+tilesize;
                                for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                    ia4 = ia5 + jj4*rangea4; ib4 = ib5 + jj4*rangeb4;
                                    jj6_bound = (j6 + tilesize > l6) ? l6 : j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia4 + jj6*rangea6; ib = ib4 + jj6*rangeb6;
                                        N1 = rangeb4; N2 = rangea6;
                                        pA = unsorted+ia; pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                        y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                        z = _mm_shuffle_pd(x, y, 0);
                                        w = _mm_shuffle_pd(x, y, 3);
                                        _mm_storeu_pd(pB, z);
                                        _mm_storeu_pd(pB + N1, w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    if (k == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia2 + j4*rangea4; ib4 = ib2 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj3_bound = (j3 + tilesize > l3) ? l3 : j3+tilesize;
                                for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                                    ia3 = ia5 + jj3*rangea3; ib3 = ib5 + jj3*rangeb3;
                                    jj6_bound = (j6 + tilesize > l6) ? l6 : j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia3 + jj6*rangea6; ib = ib3 + jj6*rangeb6;
                                        N1 = rangeb3; N2 = rangea6;
                                        pA = unsorted+ia; pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                        y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                        z = _mm_shuffle_pd(x, y, 0);
                                        w = _mm_shuffle_pd(x, y, 3);
                                        _mm_storeu_pd(pB, z);
                                        _mm_storeu_pd(pB + N1, w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    if (j == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3; ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj2_bound = (j2 + tilesize > l2) ? l2 : j2+tilesize;
                                for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                                    ia2 = ia5 + jj2*rangea2; ib2 = ib5 + jj2*rangeb2;
                                    jj6_bound = (j6 + tilesize > l6) ? l6 : j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia2 + jj6*rangea6; ib = ib2 + jj6*rangeb6;
                                        N1 = rangeb2; N2 = rangea6;
                                        pA = unsorted+ia; pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                        y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                        z = _mm_shuffle_pd(x, y, 0);
                                        w = _mm_shuffle_pd(x, y, 3);
                                        _mm_storeu_pd(pB, z);
                                        _mm_storeu_pd(pB + N1, w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
    if (i == 5) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2; ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj1_bound = (j1 + tilesize > l1) ? l1 : j1+tilesize;
                                for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                                    ia1 = ia5 + jj1*rangea1; ib1 = ib5 + jj1*rangeb1;
                                    jj6_bound = (j6 + tilesize > l6) ? l6 : j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia1 + jj6*rangea6; ib = ib1 + jj6*rangeb6;
                                        N1 = rangeb1; N2 = rangea6;
                                        pA = unsorted+ia; pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                        y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                        z = _mm_shuffle_pd(x, y, 0);
                                        w = _mm_shuffle_pd(x, y, 3);
                                        _mm_storeu_pd(pB, z);
                                        _mm_storeu_pd(pB + N1, w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
void tce_sort_4_simd(double* unsorted, double* sorted,
                     int a, int b, int c, int d,
                     int i, int j, int k, int l, double factor) {
    int id[4], jd[4], ia, ib, j1, j2, j3, j4;
    int l1, l2, l3, l4;
    int ia1, ia2, ia3, ia4;
    int ib1, ib2, ib3, ib4;
    int rangea1, rangea2, rangea3, rangea4;
    int rangeb1, rangeb2, rangeb3, rangeb4;
    int range[4], order[4], order_r[4];
    int jj1, jj2, jj3, jj4;
    int jj1_bound, jj2_bound, jj3_bound, jj4_bound;
    int count, ir, jr, kr, lr, N1, N2;
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1, fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    jd[0] = a; jd[1] = b; jd[2] = c; jd[3] = d;
    // prefer writes
    range[0] = b*c*d; range[1] = c*d; range[2] = d; range[3] = 1;
    l1 = jd[i]; l2 = jd[j]; l3 = jd[k]; l4 = jd[l];
    rangea1 = range[i]; rangea2 = range[j]; rangea3 = range[k]; rangea4 = range[l];
    rangeb1 = l2*l3*l4; rangeb2 = l3*l4; rangeb3 = l4; rangeb4 = 1;

    // here vectorization can rely on the compiler
    if (l == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia = ia3 + j4*rangea4; ib = ib3 + j4*rangeb4;
                        sorted[ib] = unsorted[ia] * factor;
                    }
                }
            }
        }
    }
    if (k == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj3_bound = (j3 + tilesize > l3) ? l3 : j3+tilesize;
                        for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                            ia3 = ia2 + jj3*rangea3; ib3 = ib2 + jj3*rangeb3;
                            jj4_bound = (j4 + tilesize > l4) ? l4 : j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia3 + jj4*rangea4; ib = ib3 + jj4*rangeb4;
                                N1 = rangeb3; N2 = rangea4;
                                pA = unsorted+ia; pB = sorted+ib;
                                x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                z = _mm_shuffle_pd(x, y, 0);
                                w = _mm_shuffle_pd(x, y, 3);
                                _mm_storeu_pd(pB, z);
                                _mm_storeu_pd(pB + N1, w);
                            }
                        }
                    }
                }
            }
        }
    }
    if (j == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1; ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3; ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj2_bound = (j2 + tilesize > l2) ? l2 : j2+tilesize;
                        for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                            ia2 = ia3 + jj2*rangea2; ib2 = ib3 + jj2*rangeb2;
                            jj4_bound = (j4 + tilesize > l4) ? l4 : j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia2 + jj4*rangea4; ib = ib2 + jj4*rangeb4;
                                N1 = rangeb2; N2 = rangea4;
                                pA = unsorted+ia; pB = sorted+ib;
                                x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                z = _mm_shuffle_pd(x, y, 0);
                                w = _mm_shuffle_pd(x, y, 3);
                                _mm_storeu_pd(pB, z);
                                _mm_storeu_pd(pB + N1, w);
                            }
                        }
                    }
                }
            }
        }
    }
    if (i == 3) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2; ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj1_bound = (j1 + tilesize > l1) ? l1 : j1+tilesize;
                        for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                            ia1 = ia3 + jj1*rangea1; ib1 = ib3 + jj1*rangeb1;
                            jj4_bound = (j4 + tilesize > l4) ? l4 : j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia1 + jj4*rangea4; ib = ib1 + jj4*rangeb4;
                                N1 = rangeb1; N2 = rangea4;
                                pA = unsorted+ia; pB = sorted+ib;
                                x = _mm_loadu_pd(pA);      x = _mm_mul_pd(x, fac_vector);
                                y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y, fac_vector);
                                z = _mm_shuffle_pd(x, y, 0);
                                w = _mm_shuffle_pd(x, y, 3);
                                _mm_storeu_pd(pB, z);
                                _mm_storeu_pd(pB + N1, w);
                            }
                        }
                    }
                }
            }
        }
    }
}
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint) {
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
    /* JB fix for the range problem */
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    /* splat the fractional coordinates across both lanes */
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));
    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);

    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    /* scratch registers used inside the INCRSUMP2 macro */
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
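/*
 * The noise kernels above lean on one more unpack idiom: splatting a single
 * lane across the register. _mm_unpacklo_pd(v, v) broadcasts lane 0 and
 * _mm_unpackhi_pd(v, v) lane 1 (SSE3's _mm_movedup_pd covers the lane-0 case).
 * A minimal sketch:
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d v  = _mm_set_pd(7.0, 3.0);    /* lane0 = 3, lane1 = 7 */
    __m128d lo = _mm_unpacklo_pd(v, v);   /* <3, 3> */
    __m128d hi = _mm_unpackhi_pd(v, v);   /* <7, 7> */
    double a[2], b[2];
    _mm_storeu_pd(a, lo); _mm_storeu_pd(b, hi);
    printf("%g %g / %g %g\n", a[0], a[1], b[0], b[1]); /* 3 3 / 7 7 */
    return 0;
}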
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator) {
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin) {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables. Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two   = _mm_set1_pd(2.0);
    const __m128d one   = _mm_set1_pd(1.0);

    /* splat the fractional coordinates across both lanes */
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz  = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz  = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);

    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    /* mp_t1 .. sum_p are scratch registers used inside the INCRSUMP2 macro */
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if (noise_generator == kNoiseGen_RangeCorrected) {
        /* details of range here:
           Min, max: -1.05242, 0.988997
           Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

           We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2   = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
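/*
 * _mm_nmacc_pd and _mm_macc_sd above are AMD FMA4 intrinsics. On hardware
 * without FMA4 the same arithmetic can be written, unfused, with SSE2
 * multiply/add; a hedged sketch of the two substitutions (assumed helper
 * names, not part of the original source):
 */
#include <emmintrin.h>

/* c - a*b per lane, as _mm_nmacc_pd(a, b, c) computes (e.g. 3 - 2*x above) */
static inline __m128d nmacc_pd_sse2(__m128d a, __m128d b, __m128d c) {
    return _mm_sub_pd(c, _mm_mul_pd(a, b));
}

/* a*b + c in the low lane, as _mm_macc_sd(a, b, c) computes */
static inline __m128d macc_sd_sse2(__m128d a, __m128d b, __m128d c) {
    return _mm_add_sd(_mm_mul_sd(a, b), c);
}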
static inline void sacEvaluateModelSPRT(PROSAC_HEST* p){
    unsigned i;
    unsigned isInlier;
    double lambda = 1.0;
    double lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
    double lambdaAccept = ((      p->delta) / (      p->epsilon));
    float distSq = p->maxD*p->maxD;
    float* src = (float*)p->src;
    float* dst = (float*)p->dst;
    float* H = p->H;

    p->inl      = 0;
    p->N_tested = 0;
    p->good     = 1;

    /* VECTOR */
    const __m128 distSqV=_mm_set1_ps(distSq);

    const __m128 H00=_mm_set1_ps(H[0]);
    const __m128 H01=_mm_set1_ps(H[1]);
    const __m128 H02=_mm_set1_ps(H[2]);
    const __m128 H10=_mm_set1_ps(H[4]);
    const __m128 H11=_mm_set1_ps(H[5]);
    const __m128 H12=_mm_set1_ps(H[6]);
    const __m128 H20=_mm_set1_ps(H[8]);
    const __m128 H21=_mm_set1_ps(H[9]);
    const __m128 H22=_mm_set1_ps(H[10]);

    for(i=0; i<(p->N-3) && p->good; i+=4){
        /* Backproject */
        __m128 x, y, X, Y, inter0, inter1, inter2, inter3;
        x=_mm_load_ps(src+2*i);
        y=_mm_load_ps(src+2*i+4);
        X=_mm_load_ps(dst+2*i);
        Y=_mm_load_ps(dst+2*i+4);

        inter0=_mm_unpacklo_ps(x,y);// y1 y0 x1 x0
        inter1=_mm_unpackhi_ps(x,y);// y3 y2 x3 x2
        inter2=_mm_unpacklo_ps(X,Y);// Y1 Y0 X1 X0
        inter3=_mm_unpackhi_ps(X,Y);// Y3 Y2 X3 X2

        x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
        y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
        X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
        Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));

        __m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
        __m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
        __m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);

        __m128 recipZ = _mm_rcp_ps(reprojZ);
        reprojX = _mm_mul_ps(reprojX, recipZ);
        reprojY = _mm_mul_ps(reprojY, recipZ);
        //reprojX = _mm_div_ps(reprojX, reprojZ);
        //reprojY = _mm_div_ps(reprojY, reprojZ);

        reprojX = _mm_sub_ps(reprojX, X);
        reprojY = _mm_sub_ps(reprojY, Y);

        reprojX = _mm_mul_ps(reprojX, reprojX);
        reprojY = _mm_mul_ps(reprojY, reprojY);

        __m128 reprojDistV = _mm_add_ps(reprojX, reprojY);

        __m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
        int msk = _mm_movemask_ps(cmp);

        /* ... */
        /*                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
        unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
        p->inl += bitCnt[msk];

        /* SPRT */
        lambda *= p->lambdaTBL[msk];
        p->good = lambda <= p->A;
        /* If !p->good, the threshold A was exceeded, so we're rejecting */
    }

    /* SCALAR */
    for(; i<p->N && p->good; i++){
        /* Backproject */
        float x=src[i*2], y=src[i*2+1];
        float X=dst[i*2], Y=dst[i*2+1];

        float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
        float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
        float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)

        //reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
        reprojX/=reprojZ;
        reprojY/=reprojZ;

        //Compute distance
        reprojX-=X;
        reprojY-=Y;
        reprojX*=reprojX;
        reprojY*=reprojY;
        float reprojDist = reprojX+reprojY;

        /* ... */
        isInlier = reprojDist <= distSq;
        p->inl  += isInlier;

        /* SPRT */
        lambda *= isInlier ? lambdaAccept : lambdaReject;
        p->good = lambda <= p->A;
        /* If !p->good, the threshold A was exceeded, so we're rejecting */
    }

    p->N_tested = i;
}
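/*
 * The unpack network in the vector path above (unpacklo/hi_ps followed by
 * pd-cast unpacks) is the standard 4x4 float transpose, used there to
 * deinterleave packed point coordinates; SSE's _MM_TRANSPOSE4_PS macro
 * implements the same network. A sketch (hypothetical data) checking one
 * output row of the two variants against each other:
 */
#include <xmmintrin.h>
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128 r0 = _mm_setr_ps( 0.f,  1.f,  2.f,  3.f);
    __m128 r1 = _mm_setr_ps( 4.f,  5.f,  6.f,  7.f);
    __m128 r2 = _mm_setr_ps( 8.f,  9.f, 10.f, 11.f);
    __m128 r3 = _mm_setr_ps(12.f, 13.f, 14.f, 15.f);

    /* pd-cast variant, first output row only */
    __m128 i0 = _mm_unpacklo_ps(r0, r1), i2 = _mm_unpacklo_ps(r2, r3);
    __m128 c0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(i0), _mm_castps_pd(i2)));

    /* macro variant, transposes all four rows in place */
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);

    float a[4], b[4];
    _mm_storeu_ps(a, c0); _mm_storeu_ps(b, r0);
    printf("%g %g %g %g == %g %g %g %g\n",
           a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]); /* 0 4 8 12 twice */
    return 0;
}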
static inline __m128d my_invrsq_pd(__m128d x)
{
    const __m128d three = (const __m128d) {3.0f, 3.0f};
    const __m128d half  = (const __m128d) {0.5f, 0.5f};

    __m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
    __m128d t1 = _mm_cvtps_pd(t);               /* Convert back to double precision */

    /* First Newton-Raphson step, accuracy is now 24 bits */
    __m128d t2 = _mm_mul_pd(half, _mm_mul_pd(t1, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t1, t1)))));

    /* Return second Newton-Raphson step, accuracy 48 bits */
    return (__m128d) _mm_mul_pd(half, _mm_mul_pd(t2, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(t2, t2)))));
}

/* to extract single integers from a __m128i datatype */
#define _mm_extract_epi64(x, imm) \
    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))

void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr,
                              int * shift, double * shiftvec, double * fshift,
                              int * gid, double * pos, double * faction,
                              double * charge, double * p_facel, double * p_krf,
                              double * p_crf, double * Vc, int * type,
                              int * p_ntype, double * vdwparam, double * Vvdw,
                              double * p_tabscale, double * VFtab,
                              double * invsqrta, double * dvda,
                              double * p_gbtabscale, double * GBtab,
                              int * p_nthreads, int * count, void * mtx,
                              int * outeriter, int * inneriter, double * work)
{
    int nri, ntype, nthreads, offset;
    int n, ii, is3, ii3, k, nj0, nj1, jnr1, jnr2, j13, j23, ggid;
    double facel, krf, crf, tabscl, gbtabscl, vct, vgbt;
    double shX, shY, shZ, isai_d, dva;
    gmx_gbdata_t *gbdata;
    float *gpol;

    __m128d ix, iy, iz, jx, jy, jz;
    __m128d dx, dy, dz, t1, t2, t3;
    __m128d fix, fiy, fiz, rsq11, rinv, r, fscal, rt, eps, eps2;
    __m128d q, iq, qq, isai, isaj, isaprod, vcoul, gbscale, dvdai, dvdaj;
    __m128d Y, F, G, H, Fp, VV, FF, vgb, fijC, dvdatmp, dvdasum, vctot, vgbtot, n0d;
    __m128d xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
    __m128d fac, tabscale, gbtabscale;
    __m128i n0, nnn;

    const __m128d neg   = {-1.0f, -1.0f};
    const __m128d zero  = { 0.0f,  0.0f};
    const __m128d half  = { 0.5f,  0.5f};
    const __m128d two   = { 2.0f,  2.0f};
    const __m128d three = { 3.0f,  3.0f};

    gbdata = (gmx_gbdata_t *)work;
    gpol   = gbdata->gpol;

    nri      = *p_nri;
    ntype    = *p_ntype;
    nthreads = *p_nthreads;
    facel    = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));
    krf      = *p_krf;
    crf      = *p_crf;
    tabscl   = *p_tabscale;
    gbtabscl = *p_gbtabscale;
    nj1      = 0;

    /* Splat variables */
    fac        = _mm_load1_pd(&facel);
    tabscale   = _mm_load1_pd(&tabscl);
    gbtabscale = _mm_load1_pd(&gbtabscl);

    /* Keep compiler happy */
    dvdatmp = _mm_setzero_pd();
    vgb     = _mm_setzero_pd();
    dvdaj   = _mm_setzero_pd();
    isaj    = _mm_setzero_pd();
    vcoul   = _mm_setzero_pd();
    t1      = _mm_setzero_pd();
    t2      = _mm_setzero_pd();
    t3      = _mm_setzero_pd();

    jnr1 = jnr2 = 0;
    j13  = j23  = 0;

    for (n = 0; n < nri; n++)
    {
        is3 = 3*shift[n];
        shX = shiftvec[is3];
        shY = shiftvec[is3+1];
        shZ = shiftvec[is3+2];

        nj0 = jindex[n];
        nj1 = jindex[n+1];
        offset = (nj1-nj0)%2;

        ii  = iinr[n];
        ii3 = ii*3;

        ix = _mm_set1_pd(shX+pos[ii3+0]);
        iy = _mm_set1_pd(shY+pos[ii3+1]);
        iz = _mm_set1_pd(shZ+pos[ii3+2]);

        q  = _mm_set1_pd(charge[ii]);
        iq = _mm_mul_pd(fac, q);

        isai_d = invsqrta[ii];
        isai   = _mm_load1_pd(&isai_d);

        fix = _mm_setzero_pd();
        fiy = _mm_setzero_pd();
        fiz = _mm_setzero_pd();
        dvdasum = _mm_setzero_pd();
        vctot   = _mm_setzero_pd();
        vgbtot  = _mm_setzero_pd();

        for (k = nj0; k < nj1-offset; k += 2)
        {
            jnr1 = jjnr[k];
            jnr2 = jjnr[k+1];
            j13 = jnr1 * 3;
            j23 = jnr2 * 3;

            /* Load coordinates */
            xmm1 = _mm_loadu_pd(pos+j13);   /* x1 y1 */
            xmm2 = _mm_loadu_pd(pos+j23);   /* x2 y2 */
            xmm5 = _mm_load_sd(pos+j13+2);  /* z1 -  */
            xmm6 = _mm_load_sd(pos+j23+2);  /* z2 -  */

            /* transpose */
            jx = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0,0));
            jy = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1,1));
            jz = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(0,0));

            /* distances */
            dx = _mm_sub_pd(ix, jx);
            dy = _mm_sub_pd(iy, jy);
            dz = _mm_sub_pd(iz, jz);

            rsq11 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy)), _mm_mul_pd(dz,dz));
            rinv  = my_invrsq_pd(rsq11);

            /* Load invsqrta */
            isaj    = _mm_loadl_pd(isaj, invsqrta+jnr1);
            isaj    = _mm_loadh_pd(isaj, invsqrta+jnr2);
            isaprod = _mm_mul_pd(isai, isaj);

            /* Load charges */
            q  = _mm_loadl_pd(q, charge+jnr1);
            q  = _mm_loadh_pd(q, charge+jnr2);
            qq = _mm_mul_pd(iq, q);

            vcoul   = _mm_mul_pd(qq, rinv);
            fscal   = _mm_mul_pd(vcoul, rinv);
            qq      = _mm_mul_pd(isaprod, qq);
            qq      = _mm_mul_pd(qq, neg);
            gbscale = _mm_mul_pd(isaprod, gbtabscale);

            /* Load dvdaj */
            dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1);
            dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2);

            r    = _mm_mul_pd(rsq11, rinv);
            rt   = _mm_mul_pd(r, gbscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_pd(rt, n0d);
            eps2 = _mm_mul_pd(eps, eps);

            nnn  = _mm_slli_epi64(n0, 2);

            xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
            xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
            xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
            xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */

            Y = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0,0)); /* Y1 Y2 */
            F = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1,1)); /* F1 F2 */
            G = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(0,0)); /* G1 G2 */
            H = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(1,1)); /* H1 H2 */

            G  = _mm_mul_pd(G, eps);
            H  = _mm_mul_pd(H, eps2);
            Fp = _mm_add_pd(F, G);
            Fp = _mm_add_pd(Fp, H);
            VV = _mm_mul_pd(Fp, eps);
            VV = _mm_add_pd(Y, VV);
            H  = _mm_mul_pd(two, H);
            FF = _mm_add_pd(Fp, G);
            FF = _mm_add_pd(FF, H);

            vgb  = _mm_mul_pd(qq, VV);
            fijC = _mm_mul_pd(qq, FF);
            fijC = _mm_mul_pd(fijC, gbscale);

            dvdatmp = _mm_mul_pd(fijC, r);
            dvdatmp = _mm_add_pd(vgb, dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp, neg);
            dvdatmp = _mm_mul_pd(dvdatmp, half);
            dvdasum = _mm_add_pd(dvdasum, dvdatmp);

            xmm1  = _mm_mul_pd(dvdatmp, isaj);
            xmm1  = _mm_mul_pd(xmm1, isaj);
            dvdaj = _mm_add_pd(dvdaj, xmm1);

            /* store dvda */
            _mm_storel_pd(dvda+jnr1, dvdaj);
            _mm_storeh_pd(dvda+jnr2, dvdaj);

            vctot  = _mm_add_pd(vctot, vcoul);
            vgbtot = _mm_add_pd(vgbtot, vgb);

            fscal = _mm_sub_pd(fijC, fscal);
            fscal = _mm_mul_pd(fscal, neg);
            fscal = _mm_mul_pd(fscal, rinv);

            /* calculate partial force terms */
            t1 = _mm_mul_pd(fscal, dx);
            t2 = _mm_mul_pd(fscal, dy);
            t3 = _mm_mul_pd(fscal, dz);

            /* update the i force */
            fix = _mm_add_pd(fix, t1);
            fiy = _mm_add_pd(fiy, t2);
            fiz = _mm_add_pd(fiz, t3);

            /* accumulate forces from memory */
            xmm1 = _mm_loadu_pd(faction+j13);   /* fx1 fy1 */
            xmm2 = _mm_loadu_pd(faction+j23);   /* fx2 fy2 */
            xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
            xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */

            /* transpose */
            xmm7 = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(0,0)); /* fz1 fz2 */
            xmm5 = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0,0)); /* fx1 fx2 */
            xmm6 = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1,1)); /* fy1 fy2 */

            /* subtract partial forces */
            xmm5 = _mm_sub_pd(xmm5, t1);
            xmm6 = _mm_sub_pd(xmm6, t2);
            xmm7 = _mm_sub_pd(xmm7, t3);

            xmm1 = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(0,0)); /* fx1 fy1 */
            xmm2 = _mm_shuffle_pd(xmm5, xmm6, _MM_SHUFFLE2(1,1)); /* fx2 fy2 */

            /* store fx and fy */
            _mm_storeu_pd(faction+j13, xmm1);
            _mm_storeu_pd(faction+j23, xmm2);

            /* .. then fz */
            _mm_storel_pd(faction+j13+2, xmm7);
            _mm_storeh_pd(faction+j23+2, xmm7);
        }

        /* In double precision, offset can only be either 0 or 1 */
        if (offset != 0)
        {
            jnr1 = jjnr[k];
            j13  = jnr1*3;

            jx = _mm_load_sd(pos+j13);
            jy = _mm_load_sd(pos+j13+1);
            jz = _mm_load_sd(pos+j13+2);

            isaj    = _mm_load_sd(invsqrta+jnr1);
            isaprod = _mm_mul_sd(isai, isaj);
            dvdaj   = _mm_load_sd(dvda+jnr1);
            q       = _mm_load_sd(charge+jnr1);
            qq      = _mm_mul_sd(iq, q);

            dx = _mm_sub_sd(ix, jx);
            dy = _mm_sub_sd(iy, jy);
            dz = _mm_sub_sd(iz, jz);

            rsq11 = _mm_add_pd(_mm_add_pd(_mm_mul_pd(dx,dx), _mm_mul_pd(dy,dy)), _mm_mul_pd(dz,dz));
            rinv  = my_invrsq_pd(rsq11);

            vcoul   = _mm_mul_sd(qq, rinv);
            fscal   = _mm_mul_sd(vcoul, rinv);
            qq      = _mm_mul_sd(isaprod, qq);
            qq      = _mm_mul_sd(qq, neg);
            gbscale = _mm_mul_sd(isaprod, gbtabscale);

            r    = _mm_mul_sd(rsq11, rinv);
            rt   = _mm_mul_sd(r, gbscale);
            n0   = _mm_cvttpd_epi32(rt);
            n0d  = _mm_cvtepi32_pd(n0);
            eps  = _mm_sub_sd(rt, n0d);
            eps2 = _mm_mul_sd(eps, eps);

            nnn  = _mm_slli_epi64(n0, 2);

            xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));
            xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));
            xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2);
            xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2);

            Y = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(0,0));
            F = _mm_shuffle_pd(xmm1, xmm2, _MM_SHUFFLE2(1,1));
            G = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(0,0));
            H = _mm_shuffle_pd(xmm3, xmm4, _MM_SHUFFLE2(1,1));

            G  = _mm_mul_sd(G, eps);
            H  = _mm_mul_sd(H, eps2);
            Fp = _mm_add_sd(F, G);
            Fp = _mm_add_sd(Fp, H);
            VV = _mm_mul_sd(Fp, eps);
            VV = _mm_add_sd(Y, VV);
            H  = _mm_mul_sd(two, H);
            FF = _mm_add_sd(Fp, G);
            FF = _mm_add_sd(FF, H);

            vgb  = _mm_mul_sd(qq, VV);
            fijC = _mm_mul_sd(qq, FF);
            fijC = _mm_mul_sd(fijC, gbscale);

            dvdatmp = _mm_mul_sd(fijC, r);
            dvdatmp = _mm_add_sd(vgb, dvdatmp);
            dvdatmp = _mm_mul_sd(dvdatmp, neg);
            dvdatmp = _mm_mul_sd(dvdatmp, half);
            dvdasum = _mm_add_sd(dvdasum, dvdatmp);

            xmm1  = _mm_mul_sd(dvdatmp, isaj);
            xmm1  = _mm_mul_sd(xmm1, isaj);
            dvdaj = _mm_add_sd(dvdaj, xmm1);

            /* store dvda */
            _mm_storel_pd(dvda+jnr1, dvdaj);

            vctot  = _mm_add_sd(vctot, vcoul);
            vgbtot = _mm_add_sd(vgbtot, vgb);

            fscal = _mm_sub_sd(fijC, fscal);
            fscal = _mm_mul_sd(fscal, neg);
            fscal = _mm_mul_sd(fscal, rinv);

            /* calculate partial force terms */
            t1 = _mm_mul_sd(fscal, dx);
            t2 = _mm_mul_sd(fscal, dy);
            t3 = _mm_mul_sd(fscal, dz);

            /* update the i force */
            fix = _mm_add_sd(fix, t1);
            fiy = _mm_add_sd(fiy, t2);
            fiz = _mm_add_sd(fiz, t3);

            /* accumulate forces from memory */
            xmm5 = _mm_load_sd(faction+j13);   /* fx */
            xmm6 = _mm_load_sd(faction+j13+1); /* fy */
            xmm7 = _mm_load_sd(faction+j13+2); /* fz */

            /* subtract partial forces */
            xmm5 = _mm_sub_sd(xmm5, t1);
            xmm6 = _mm_sub_sd(xmm6, t2);
            xmm7 = _mm_sub_sd(xmm7, t3);

            /* store forces */
            _mm_store_sd(faction+j13,   xmm5);
            _mm_store_sd(faction+j13+1, xmm6);
            _mm_store_sd(faction+j13+2, xmm7);
        }

        /* fix/fiy/fiz now contain two partial force terms, that both should be
         * added to the i particle forces
         */
        t1 = _mm_unpacklo_pd(t1, fix);
        t2 = _mm_unpacklo_pd(t2, fiy);
        t3 = _mm_unpacklo_pd(t3, fiz);

        fix = _mm_add_pd(fix, t1);
        fiy = _mm_add_pd(fiy, t2);
        fiz = _mm_add_pd(fiz, t3);

        fix = _mm_shuffle_pd(fix, fix, _MM_SHUFFLE2(1,1));
        fiy = _mm_shuffle_pd(fiy, fiy, _MM_SHUFFLE2(1,1));
        fiz = _mm_shuffle_pd(fiz, fiz, _MM_SHUFFLE2(1,1));

        /* Load i forces from memory */
        xmm1 = _mm_load_sd(faction+ii3);
        xmm2 = _mm_load_sd(faction+ii3+1);
        xmm3 = _mm_load_sd(faction+ii3+2);

        /* Add to i force */
        fix = _mm_add_sd(fix, xmm1);
        fiy = _mm_add_sd(fiy, xmm2);
        fiz = _mm_add_sd(fiz, xmm3);

        /* store i forces to memory */
        _mm_store_sd(faction+ii3,   fix);
        _mm_store_sd(faction+ii3+1, fiy);
        _mm_store_sd(faction+ii3+2, fiz);

        /* now do dvda */
        dvdatmp = _mm_unpacklo_pd(dvdatmp, dvdasum);
        dvdasum = _mm_add_pd(dvdasum, dvdatmp);
        _mm_storeh_pd(&dva, dvdasum);
        dvda[ii] = dvda[ii] + dva*isai_d*isai_d;

        ggid = gid[n];

        /* Coulomb potential */
        vcoul = _mm_unpacklo_pd(vcoul, vctot);
        vctot = _mm_add_pd(vctot, vcoul);
        _mm_storeh_pd(&vct, vctot);
        Vc[ggid] = Vc[ggid] + vct;

        /* GB potential */
        vgb    = _mm_unpacklo_pd(vgb, vgbtot);
        vgbtot = _mm_add_pd(vgbtot, vgb);
        _mm_storeh_pd(&vgbt, vgbtot);
        gpol[ggid] = gpol[ggid] + vgbt;
    }

    *outeriter = nri;
    *inneriter = nj1;
}
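/*
 * The reductions at the end of the kernel fold the two lanes of each
 * accumulator into a scalar via _mm_unpacklo_pd + add + shuffle. A more
 * direct horizontal add, SSE2 only (hypothetical helper, not part of the
 * kernel above):
 */
#include <emmintrin.h>

/* returns acc0 + acc1 */
static inline double hsum_pd_sse2(__m128d acc) {
    __m128d hi = _mm_unpackhi_pd(acc, acc);  /* <acc1, acc1> */
    double s;
    _mm_store_sd(&s, _mm_add_sd(acc, hi));   /* acc0 + acc1 in lane 0 */
    return s;
}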
static NPT_INLINE __m128d npt_mm_loaddup_pd(const double* ptr) {
    __m128d temp = _mm_load_sd(ptr);
    return _mm_unpacklo_pd(temp, temp);
}
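/*
 * Typical call site (hypothetical driver; the helper is restated here so the
 * sketch is self-contained): broadcast a scale factor and multiply.
 */
#include <emmintrin.h>
#include <stdio.h>

static __m128d loaddup_pd(const double* ptr) {  /* same body as above */
    __m128d temp = _mm_load_sd(ptr);
    return _mm_unpacklo_pd(temp, temp);
}

int main(void) {
    double scale = 4.0, v[2] = { 1.5, 2.5 }, out[2];
    _mm_storeu_pd(out, _mm_mul_pd(_mm_loadu_pd(v), loaddup_pd(&scale)));
    printf("%g %g\n", out[0], out[1]);  /* prints: 6 10 */
    return 0;
}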