Example #1
__m128d test_mm_unpacklo_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_unpacklo_pd
  // DAG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 2>
  //
  // ASM-LABEL: test_mm_unpacklo_pd
  // ASM: unpcklpd
  return _mm_unpacklo_pd(A, B);
}
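For reference, _mm_unpacklo_pd(A, B) returns (A[0], B[0]) and _mm_unpackhi_pd(A, B) returns (A[1], B[1]), where index 0 is the low lane. A minimal standalone check of these semantics (a sketch; assumes an SSE2-capable target):

#include <stdio.h>
#include <emmintrin.h>

int main(void) {
    __m128d A = _mm_set_pd(2.0, 1.0);  /* A = (1.0, 2.0): _mm_set_pd takes high, low */
    __m128d B = _mm_set_pd(4.0, 3.0);  /* B = (3.0, 4.0) */
    double lo[2], hi[2];
    _mm_storeu_pd(lo, _mm_unpacklo_pd(A, B)); /* (A[0], B[0]) = (1.0, 3.0) */
    _mm_storeu_pd(hi, _mm_unpackhi_pd(A, B)); /* (A[1], B[1]) = (2.0, 4.0) */
    printf("lo = (%g, %g)  hi = (%g, %g)\n", lo[0], lo[1], hi[0], hi[1]);
    return 0;
}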
	void Shuffle16Elems(__m128 &io_data0, __m128 &io_data1, __m128 &io_data2,
		__m128 &io_data3)
	{
		__m128 ccdd1 = _mm_unpackhi_ps(io_data0, io_data1);
		__m128 ccdd2 = _mm_unpackhi_ps(io_data2, io_data3);
		__m128 aabb1 = _mm_unpacklo_ps(io_data0, io_data1);
		__m128 aabb2 = _mm_unpacklo_ps(io_data2, io_data3);

		io_data0 = 
			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
		io_data1 =
			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(aabb1), _mm_castps_pd(aabb2)));
		io_data2 =
			_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
		io_data3 = 
			_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(ccdd1), _mm_castps_pd(ccdd2)));
	}
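Taken together, Shuffle16Elems transposes a 4x4 matrix of floats held in four registers: the _ps unpacks interleave pairs of rows, and the _pd unpacks then move 64-bit (two-float) halves into the final rows. A small driver to observe this (a sketch; compile as C++ with SSE2, with Shuffle16Elems in scope):

#include <cstdio>
#include <emmintrin.h>

int main() {
	float m[16];
	for (int i = 0; i < 16; i++) m[i] = (float)i;  // row-major 4x4: m[r*4+c]
	__m128 r0 = _mm_loadu_ps(m + 0),  r1 = _mm_loadu_ps(m + 4);
	__m128 r2 = _mm_loadu_ps(m + 8),  r3 = _mm_loadu_ps(m + 12);
	Shuffle16Elems(r0, r1, r2, r3);                // rows become columns
	_mm_storeu_ps(m + 0, r0);  _mm_storeu_ps(m + 4, r1);
	_mm_storeu_ps(m + 8, r2);  _mm_storeu_ps(m + 12, r3);
	for (int i = 0; i < 16; i++)
		std::printf("%g%c", m[i], (i % 4 == 3) ? '\n' : ' ');
	return 0;
}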
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor) {

    int i,j,k,k1,it,jt,itt,jtt,it_bound,jt_bound,itt_bound,jtt_bound;
    int conflict,tmp,tmpN,offset,line_offset,setnum,set[8192/(4*sizeof(double))];
    double *pA, *pB;


    register __m128d x, y, z, w, t, t1,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt=itt+5*tilesize) {
        jtt_bound =(N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt=jtt+5*tilesize) {
            it_bound = (itt+5*tilesize > itt_bound)?itt_bound:itt+5*tilesize;
            for (it = itt; it < it_bound; it = it+tilesize) {
                jt_bound = (jtt+5*tilesize>jtt_bound)?jtt_bound:jtt+5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt+tilesize) {
                    k = 0;
                    for (j = jt; j < jt+tilesize; j=j+2) {
                        for (i = it; i < it+tilesize; i=i+2) {
                            pA = a+i*N2+j;
                            pB = b+j*N1+i;
                            x = _mm_loadu_pd(pA);
                            x = _mm_mul_pd(x,fac_vector);
                            y = _mm_loadu_pd(pA + N2);
                            y = _mm_mul_pd(y,fac_vector);
                            z = _mm_shuffle_pd( x, y, 0);
                            w = _mm_shuffle_pd( x, y, 3);
                            _mm_storeu_pd(pB,z);
                            _mm_storeu_pd(pB + N1,w);
                        }
                    }
                }
            }
        }
        for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1+i] = factor * a[i*N2+j];
            }
        }
    }
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1+i] = factor * a[i*N2+j];
        }
    }
}
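The inner loop body above is the 2x2 scale-and-transpose micro-kernel that recurs throughout this file: two unaligned loads fetch rows i and i+1 of the tile, _mm_shuffle_pd(x, y, 0) gathers the two column-j elements, and _mm_shuffle_pd(x, y, 3) the two column-(j+1) elements. The same kernel in isolation (a sketch; the function name and stride parameters are illustrative):

#include <emmintrin.h>

/* one 2x2 block of B = factor * A^T; lda and ldb are the row strides
 * (in doubles) of A and B */
static void scale_transpose_2x2(const double *A, double *B,
                                int lda, int ldb, __m128d factor)
{
    __m128d x = _mm_mul_pd(_mm_loadu_pd(A),       factor); /* A[i][j]   A[i][j+1]   */
    __m128d y = _mm_mul_pd(_mm_loadu_pd(A + lda), factor); /* A[i+1][j] A[i+1][j+1] */
    _mm_storeu_pd(B,       _mm_shuffle_pd(x, y, 0));       /* A[i][j]   A[i+1][j]   */
    _mm_storeu_pd(B + ldb, _mm_shuffle_pd(x, y, 3));       /* A[i][j+1] A[i+1][j+1] */
}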
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) {

    int i,j,k,k1,it,jt,itt,jtt,conflict,tmp,tmpN;
    double *pA, *pB;


    register __m128d x, y, z, w,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    for (it = 0; it < N1; it=it+tilesize) {
        for (jt = 0; jt < N2; jt=jt+tilesize) {

            k = 0;
            for (j = jt; j < jt+tilesize; j=j+2) {
                for (i = it; i < it+tilesize; i=i+2) {
                    pA = a+i*N2+j;
                    x = _mm_load_pd(pA);
                    y = _mm_load_pd(pA + N2);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    k = (j-jt)*tilesize + (i-it);
                    _mm_store_pd(buf + k,z);
                    _mm_store_pd(buf + k + tilesize,w);
                }
            }

            k = 0;
            k1 = 0;
            for (j = jt; j < jt+tilesize; j++) {
                pB = b+j*N1+it;
                k = (j-jt)*tilesize;
                x = _mm_load_pd(&buf[k]);
                y = _mm_load_pd(&buf[k]+2);
                z = _mm_load_pd(&buf[k]+2*2);
                w = _mm_load_pd(&buf[k]+3*2);
                _mm_stream_pd(pB,x);
                _mm_stream_pd(pB+2,y);
                _mm_stream_pd(pB+2*2,z);
                _mm_stream_pd(pB+3*2,w);

            }
        }
    }
}
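Both transpose routines rely on a file-scope tilesize constant, and transpose_aligned stages each tile through a scratch buffer buf; neither definition is shown here. The write-back loop issues four two-double stores per row, so it assumes tilesize == 8. Plausible definitions (an assumption, not taken from the original source):

#define tilesize 8                 /* assumed: the store loop is unrolled for 8 */
static double buf[tilesize * tilesize]
    __attribute__((aligned(16)));  /* 16-byte alignment for _mm_load_pd/_mm_store_pd */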
void matMult_opt(int N, const double *matA, const double *matB, double *matC)
{
    int i, j, k;
    if((N%2)==0)
    {
        //int TwoN=N;
        //N=N/2;
        for(i=0; i<N; i++)
        {
            for(j=0;j<N;j++)
            {
                __m128d matA_value = _mm_load_sd(&matA[j*N+i]);
                matA_value = _mm_unpacklo_pd(matA_value, matA_value);
                for(k=0; k<N; k+=2)
                {
                   __m128d matB_value = _mm_load_pd(&matB[i*N+k]);
                   __m128d matC_value = _mm_load_pd(&matC[j*N+k]);
                   _mm_store_pd(&matC[j*N+k], _mm_add_pd(_mm_mul_pd(matA_value, matB_value), matC_value));
                }
            }
        }
    }
    else
    {
        for(i=0; i<N; i++)
        {
            for(j=0;j<N;j++)
            {
                for(k=0; k<N; k++)
                {
                    matC[j*N+k] += matA[j*N+i]*matB[i*N+k];
                }
            }
        }
    }

}
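matMult_opt accumulates into matC rather than overwriting it, so callers must zero the output first; the _mm_load_pd/_mm_store_pd path also expects 16-byte aligned matrices and even N. A minimal driver (a sketch, not from the original):

#include <stdio.h>
#include <string.h>

int main(void) {
    enum { N = 4 };
    static double A[N*N] __attribute__((aligned(16)));
    static double B[N*N] __attribute__((aligned(16)));
    static double C[N*N] __attribute__((aligned(16)));
    for (int i = 0; i < N*N; i++) { A[i] = (double)i; B[i] = (double)(i % 3) - 1.0; }
    memset(C, 0, sizeof C);        /* matMult_opt accumulates into C */
    matMult_opt(N, A, B, C);
    printf("C[0][0] = %g\n", C[0]);
    return 0;
}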
void mexFunction( int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) {
    // ****************   PARSE THE INPUTS  ******************* //
    myAssert(nrhs==4,"fitStumpUINT8_c: bad nrhs");
    const mxArray* mxX = prhs[0];
    myAssert(mxIsUint8(mxX), "fitStumpUINT8_c: X must be uint8");
    unsigned char* X = (unsigned char*) mxGetPr(mxX);
    int N = mxGetM(mxX);    int p = mxGetN(mxX);
    
    const mxArray* mxWWY = prhs[1];
    myAssert(mxIsDouble(prhs[1]), "fitStumpUINT8_c: wwy must be double");
    double* wwy = (double*) mxGetPr(mxWWY);
    myAssert(mxGetM(mxWWY)==2, "fitStumpUINT8_c: wwy must be 2 x N");
    myAssert(mxGetN(mxWWY)==N, "fitStumpUINT8_c: wwy must be 2 x N");
    const mxArray* mxCandVars = prhs[2];
    myAssert(mxIsUint32(mxCandVars), "fitStumpUINT8_c: mxCandVars must be uint32");
    unsigned int* candVars = (unsigned int*) mxGetPr(mxCandVars);
    int nCand = mxGetNumberOfElements(mxCandVars);
    
    const mxArray* mxGoodInd = prhs[3];
    int nGd = mxGetNumberOfElements(mxGoodInd);
    myAssert(nGd==0 || mxIsUint32(mxGoodInd), "fitStumpUINT8_c: mxGoodInd must be uint32");
    unsigned int* goodInd = (unsigned int*) mxGetPr(mxGoodInd);
        
    // ******************  SET UP THE OUTPUTS  ******************* //
    plhs[0] = mxCreateNumericMatrix(1,nCand,mxINT32_CLASS,mxREAL);
    int* cutInd = (int*) mxGetPr(plhs[0]);
	plhs[1] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS, mxREAL);
    double* ssxBest = (double*) mxGetPr(plhs[1]);
	plhs[2] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS,mxREAL);
	double* muL = (double*) mxGetPr(plhs[2]);
	plhs[3] = mxCreateNumericMatrix(1,nCand,mxDOUBLE_CLASS,mxREAL);
    double* muR = (double*) mxGetPr(plhs[3]);
    // ************** MAIN LOOP OVER ALL CANDIDATE VARS *********** //
    for(int m=0; m<nCand; m++) {
        unsigned char* x = X + N*candVars[m];
        double* wwyBuck = (double *) mxMalloc(2*256*sizeof(double));
        // fill weights with small epsilon for numerical stability
        for(int i=0; i<256; i++) {
            wwyBuck[i*2] = 1.0E-10;
            wwyBuck[i*2+1] = 0;
        }
        // make weighted histogram of w and wy
        buckSums(wwyBuck, wwy, N, x, goodInd, nGd);
        // cumsum
        __m128d* wwyBuck128 = (__m128d*) wwyBuck;
        for(int i=1; i<256; i++)
            wwyBuck128[i] = _mm_add_pd(wwyBuck128[i], wwyBuck128[i-1]);
        // compute -ssx
        __m128d wCumEnd = _mm_set_pd(wwyBuck[256*2-2], wwyBuck[256*2-2]);
        __m128d wyCumEnd = _mm_set_pd(wwyBuck[256*2-1], wwyBuck[256*2-1]);

        __m128d* ssx128 = (__m128d*) mxMalloc(1*256*sizeof(__m128d));
        for(int i=0; i<128; i++) {
            __m128d wwyBuck1 = wwyBuck128[i*2];
            __m128d wwyBuck2 = wwyBuck128[i*2+1];
            __m128d wyBuck = _mm_unpackhi_pd(wwyBuck1,wwyBuck2);
            __m128d wBuck = _mm_unpacklo_pd(wwyBuck1,wwyBuck2);
            ssx128[i] = _mm_div_pd(_mm_mul_pd(wyBuck,wyBuck),wBuck);

            __m128d tmp1 = _mm_sub_pd(wyCumEnd,wyBuck);
            tmp1 = _mm_mul_pd(tmp1,tmp1);
            __m128d tmp2 = _mm_sub_pd(wCumEnd,wBuck);

            ssx128[i] = _mm_add_pd(ssx128[i],_mm_div_pd(tmp1,tmp2));
        }
        // find best split location for this candidate variable
        double* ssx = (double*) ssx128;
        double mx = ssx[0];     cutInd[m] = 0;
        
        for(int i=1;i<256;i++) {
            if(ssx[i] > mx) {
                mx = ssx[i];	cutInd[m] = i;
            }
        }
        ssxBest[m] = -mx;
        muL[m] = wwyBuck[cutInd[m]*2+1] / wwyBuck[cutInd[m]*2];
        muR[m] = (wwyBuck[256*2-1] - wwyBuck[cutInd[m]*2+1]) / (wwyBuck[256*2-2] - wwyBuck[cutInd[m]*2]);
    }
}
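For reference, the score maximized above is the (negated) two-segment sum-of-squares criterion of a stump fit. Writing W_i for the cumulative weight sum over buckets 0..i and S_i for the cumulative weighted response sum (the two lanes of wwyBuck128[i] after the cumsum), each iteration computes

    ssx_i = S_i^2 / W_i  +  (S_255 - S_i)^2 / (W_255 - W_i)

Since the weighted residual sum of squares of a two-mean fit equals sum(w*y^2) minus this quantity, maximizing ssx_i picks the best cut; muL and muR are then the left and right segment means S_i / W_i and (S_255 - S_i) / (W_255 - W_i).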
void transpose_4321_loop_3241_( double *unsorted, double *sorted,
        int *p_dim1, int *p_dim2, int *p_dim3, int *p_dim4, double *p_factor ) {

    int dim1,dim2,dim3,dim4;
    int dim1mod,dim2mod,dim3mod,dim4mod;
    unsigned int old_offset,new_offset;
    unsigned int j1,j2,j3,j4;
    double factor = *p_factor;
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1,fac_vector;
    unsigned int N1,N2;
    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);


    dim1 = *p_dim1;
    dim2 = *p_dim2;
    dim3 = *p_dim3;
    dim4 = *p_dim4;

    N1 = dim1*dim2*dim3; /* stride of j4 in sorted   */
    N2 = dim2*dim3*dim4; /* stride of j1 in unsorted */

    dim1mod = (int) floor( (float)dim1 / (float) 4);
    dim2mod = (int) floor( (float)dim2 / (float) 4);
    dim3mod = (int) floor( (float)dim3 / (float) 4);
    dim4mod = (int) floor( (float)dim4 / (float) 4);

    /* pluto start (dim1,dim2,dim3,dim4) */
#pragma ivdep
#pragma parallel
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
    for( j3 = 0; j3<dim3; j3++) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
        for( j2 = 0; j2<dim2; j2++) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
#pragma vector always
            for( j4 = 0; j4<dim4; j4+=2) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
#pragma vector always
                for( j1 = 0; j1<dim1; j1+=2) {
                    //sorted[j1+dim1*(j2+dim2*(j3+dim3*j4))] = unsorted[j4+dim4*(j3+dim3*(j2+dim2*j1))] * factor;

                    pA = unsorted + j4+dim4*(j3+dim3*(j2+dim2*j1));
                    pB = sorted   + j1+dim1*(j2+dim2*(j3+dim3*j4));
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);

                }
            }
        }
    }
    /* pluto end */
    return;
}
/* 
 * Single-precision complex, _mm_stream_pd version: each FFTComplex (two
 * 32-bit floats) is 64 bits wide and is moved as one double. Used for
 * transposing from a stripe buffer to columns. 
 */
static void fftOPSubTrans(
  const FFTComplex	*_src,
  FFTComplex		*_dst,
  size_t			srcRowSize,		// src, in FFTComplex, a.k.a. src numCols
  size_t			dstRowSize)		// dst, in FFTComplex, a.k.a. dst numCols
{
	double *src = (double *)_src;
	double *dst = (double *)_dst;
	
	dumpSub("fftOPSubTrans start", _src, srcRowSize);
	
	/* 
	 * row and col refer to coordinates in src 
	 * row size of dst is dstRowSize
	 */
	unsigned curcol;
	
	for(curcol=0; curcol<FFT_COMPLEX_PER_SUBMATRIX; curcol+=2) {
		__m128d vin1;
		__m128d vin2;
		__m128d vin3;
		__m128d vin4;
		__m128d vin5;
		__m128d vin6;
		__m128d vin7;
		__m128d vin8;
		
		__m128d vOut_row1_1;
		__m128d vOut_row1_2;
		__m128d vOut_row1_3;
		__m128d vOut_row1_4;
		__m128d vOut_row2_1;
		__m128d vOut_row2_2;
		__m128d vOut_row2_3;
		__m128d vOut_row2_4;
		
		const double *pIn = src + curcol;
		double *pOut = dst + curcol*dstRowSize;
		
		// load in two columns from src at curcol
		vin1 = _mm_load_pd(pIn+0*srcRowSize);
		vin2 = _mm_load_pd(pIn+1*srcRowSize);
		vin3 = _mm_load_pd(pIn+2*srcRowSize);
		vin4 = _mm_load_pd(pIn+3*srcRowSize);
		vin5 = _mm_load_pd(pIn+4*srcRowSize);
		vin6 = _mm_load_pd(pIn+5*srcRowSize);
		vin7 = _mm_load_pd(pIn+6*srcRowSize);
		vin8 = _mm_load_pd(pIn+7*srcRowSize);
		
		///////////////////////////////////////////////
		// transpose for first row out
		
		vOut_row1_1 = _mm_unpacklo_pd(vin1, vin2);
		vOut_row1_2 = _mm_unpacklo_pd(vin3, vin4);
		vOut_row1_3 = _mm_unpacklo_pd(vin5, vin6);
		vOut_row1_4 = _mm_unpacklo_pd(vin7, vin8);
		
		_mm_stream_pd(pOut+(0*FFT_COMPLEX_PER_VECTOR), vOut_row1_1);
		_mm_stream_pd(pOut+(1*FFT_COMPLEX_PER_VECTOR), vOut_row1_2);
		_mm_stream_pd(pOut+(2*FFT_COMPLEX_PER_VECTOR), vOut_row1_3);
		_mm_stream_pd(pOut+(3*FFT_COMPLEX_PER_VECTOR), vOut_row1_4);
		
		///////////////////////////////////////////////
		// transpose for second row out
		pOut += dstRowSize;
		
		vOut_row2_1 = _mm_unpackhi_pd(vin1, vin2);
		vOut_row2_2 = _mm_unpackhi_pd(vin3, vin4);
		vOut_row2_3 = _mm_unpackhi_pd(vin5, vin6);
		vOut_row2_4 = _mm_unpackhi_pd(vin7, vin8);
		
		_mm_stream_pd(pOut+(0*FFT_COMPLEX_PER_VECTOR), vOut_row2_1);
		_mm_stream_pd(pOut+(1*FFT_COMPLEX_PER_VECTOR), vOut_row2_2);
		_mm_stream_pd(pOut+(2*FFT_COMPLEX_PER_VECTOR), vOut_row2_3);
		_mm_stream_pd(pOut+(3*FFT_COMPLEX_PER_VECTOR), vOut_row2_4);
	}
	
	dumpSub("fftOPSubTrans end", _dst, dstRowSize);
}
void tce_sort_6_simd(double* unsorted,double* sorted,
                     int a, int b, int c, int d, int e, int f,
                     int i, int j, int k, int l, int m, int n,
                     double factor) {
    int id[6],jd[6],ia,ib,j1,j2,j3,j4,j5,j6;
    int l1,l2,l3,l4,l5,l6;
    int ia1,ia2,ia3,ia4,ia5,ia6;
    int ib1,ib2,ib3,ib4,ib5,ib6;
    int rangea1,rangea2,rangea3,rangea4,rangea5,rangea6;
    int rangeb1,rangeb2,rangeb3,rangeb4,rangeb5,rangeb6;
    int range[6],order[6],order_r[6];
    int jj1,jj2,jj3,jj4,jj5,jj6;
    int jj1_bound,jj2_bound,jj3_bound,jj4_bound,jj5_bound,jj6_bound;
    int N1,N2;

    double *pA, *pB;
    register __m128d x, y, z, w, p, q,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    jd[0] = a;
    jd[1] = b;
    jd[2] = c;
    jd[3] = d;
    jd[4] = e;
    jd[5] = f;

    // prefer writes
    range[0] = b*c*d*e*f;
    range[1] = c*d*e*f;
    range[2] = d*e*f;
    range[3] = e*f;
    range[4] = f;
    range[5] = 1;

    l1 = jd[i];
    l2 = jd[j];
    l3 = jd[k];
    l4 = jd[l];
    l5 = jd[m];
    l6 = jd[n];


    rangea1 = range[i];
    rangea2 = range[j];
    rangea3 = range[k];
    rangea4 = range[l];
    rangea5 = range[m];
    rangea6 = range[n];


    rangeb1 = l2*l3*l4*l5*l6;
    rangeb2 = l3*l4*l5*l6;
    rangeb3 = l4*l5*l6;
    rangeb4 = l5*l6;
    rangeb5 = l6;
    rangeb6 = 1;

    // here we rely on the compiler for vectorization
    if (n == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6++) {
                                ia = ia5 + j6*rangea6;
                                ib = ib5 + j6*rangeb6;
                                sorted[ib] = unsorted[ia] * factor;
                            }
                        }
                    }
                }
            }
        }
    }

    if (m == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5 += tilesize) {
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj5_bound = (j5 + tilesize > l5)? l5 :j5+tilesize;
                                for (jj5 = j5; jj5 < jj5_bound; jj5 += 2) {
                                    ia5 = ia4 + jj5*rangea5;
                                    ib5 = ib4 + jj5*rangeb5;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia5 + jj6*rangea6;
                                        ib = ib5 + jj6*rangeb6;
                                        N1 = rangeb5;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (l == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia3 + j5*rangea5;
                            ib5 = ib3 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj4_bound = (j4 + tilesize > l4)? l4 :j4+tilesize;
                                for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                    ia4 = ia5 + jj4*rangea4;
                                    ib4 = ib5 + jj4*rangeb4;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia4 + jj6*rangea6;
                                        ib = ib4 + jj6*rangeb6;
                                        N1 = rangeb4;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (k == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia2 + j4*rangea4;
                        ib4 = ib2 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
                                for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                                    ia3 = ia5 + jj3*rangea3;
                                    ib3 = ib5 + jj3*rangeb3;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia3 + jj6*rangea6;
                                        ib = ib3 + jj6*rangeb6;
                                        N1 = rangeb3;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }


    if (j == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3;
                    ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
                                for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                                    ia2 = ia5 + jj2*rangea2;
                                    ib2 = ib5 + jj2*rangeb2;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia2 + jj6*rangea6;
                                        ib = ib2 + jj6*rangeb6;
                                        N1 = rangeb2;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (i == 5) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2;
                ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
                                for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                                    ia1 = ia5 + jj1*rangea1;
                                    ib1 = ib5 + jj1*rangeb1;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia1 + jj6*rangea6;
                                        ib = ib1 + jj6*rangeb6;
                                        N1 = rangeb1;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

}
void tce_sort_4_simd(double* unsorted,double* sorted,
                     int a, int b, int c, int d,
                     int i, int j, int k, int l,
                     double factor) {
    int id[4],jd[4],ia,ib,j1,j2,j3,j4;
    int l1,l2,l3,l4;
    int ia1,ia2,ia3,ia4;
    int ib1,ib2,ib3,ib4;
    int rangea1,rangea2,rangea3,rangea4;
    int rangeb1,rangeb2,rangeb3,rangeb4;
    int range[4],order[4],order_r[4];
    int jj1,jj2,jj3,jj4;
    int jj1_bound,jj2_bound,jj3_bound,jj4_bound;
    int count,ir,jr,kr,lr,N1,N2;

    double *pA, *pB;
    register __m128d x, y, z, w, t, t1,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    jd[0] = a;
    jd[1] = b;
    jd[2] = c;
    jd[3] = d;
    // prefer writes

    range[0] = b*c*d;
    range[1] = c*d;
    range[2] = d;
    range[3] = 1;

    l1 = jd[i];
    l2 = jd[j];
    l3 = jd[k];
    l4 = jd[l];

    rangea1 = range[i];
    rangea2 = range[j];
    rangea3 = range[k];
    rangea4 = range[l];

    rangeb1 = l2*l3*l4;
    rangeb2 = l3*l4;
    rangeb3 = l4;
    rangeb4 = 1;


    // here we rely on the compiler for vectorization
    if (l == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia = ia3 + j4*rangea4;
                        ib = ib3 + j4*rangeb4;
                        sorted[ib] = unsorted[ia] * factor;
                    }
                }
            }
        }
    }

    if (k == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
                        for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                            ia3 = ia2 + jj3*rangea3;
                            ib3 = ib2 + jj3*rangeb3;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia3 + jj4*rangea4;
                                ib = ib3 + jj4*rangeb4;
                                N1 = rangeb3;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

    if (j == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3;
                    ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
                        for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                            ia2 = ia3 + jj2*rangea2;
                            ib2 = ib3 + jj2*rangeb2;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia2 + jj4*rangea4;
                                ib = ib2 + jj4*rangeb4;
                                N1 = rangeb2;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

    if (i == 3) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2;
                ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
                        for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                            ia1 = ia3 + jj1*rangea1;
                            ib1 = ib3 + jj1*rangeb1;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia1 + jj4*rangea4;
                                ib = ib1 + jj4*rangeb4;
                                N1 = rangeb1;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

}
Example #11
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
                /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    /* for negative inputs, pick the (value - (1-EPSILON)) variant so the
       truncation toward zero below behaves like floor() */
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(&result[X], sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
Example #12
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
Example #13
static inline void   sacEvaluateModelSPRT(PROSAC_HEST* p){
	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (    p->epsilon  ));
	float    distSq = p->maxD*p->maxD;
	float*   src = (float*)p->src;
	float*   dst = (float*)p->dst;
	float*   H   = p->H;
	
	
	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;
	
	
	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);
	
	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);
	
	for(i=0;i<(p->N-3) && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);
		y=_mm_load_ps(src+2*i+4);
		X=_mm_load_ps(dst+2*i);
		Y=_mm_load_ps(dst+2*i+4);
		
		inter0=_mm_unpacklo_ps(x,y);// (x0, x2, y0, y2) in lanes 0..3
		inter1=_mm_unpackhi_ps(x,y);// (x1, x3, y1, y3)
		inter2=_mm_unpacklo_ps(X,Y);// (X0, X2, Y0, Y2)
		inter3=_mm_unpackhi_ps(X,Y);// (X1, X3, Y1, Y3)
		
		/* deinterleave; the resulting lane order (0,2,1,3) is the same for
		   all four registers, so the elementwise math below is unaffected */
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		
		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);
		
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);
		//reprojX = _mm_div_ps(reprojX, reprojZ);
		//reprojY = _mm_div_ps(reprojY, reprojZ);
		
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);
		
		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);
		
		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);
		
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);
		
		/* ... */
		/*                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15*/
		unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
		p->inl     += bitCnt[msk];
		
		
		/* SPRT */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];
		
		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)
		
		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;
		
		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;
		
		/* ... */
		isInlier    = reprojDist <= distSq;
		p->inl     += isInlier;
		
		
		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	
	p->N_tested = i;
}
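The 16-entry bitCnt table above is a 4-bit population count of the movemask result. An equivalent bit-twiddling form (a sketch, not from the original):

/* equivalent to bitCnt[msk] for 0 <= msk < 16 */
static inline unsigned popcount4(unsigned msk){
	msk = msk - ((msk >> 1) & 0x5u);           /* 2-bit pair sums */
	return (msk & 0x3u) + ((msk >> 2) & 0x3u); /* add the pairs   */
}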
static inline __m128d
my_invrsq_pd(__m128d x)
{
	const __m128d three = (const __m128d) {3.0f, 3.0f};
	const __m128d half  = (const __m128d) {0.5f, 0.5f};
	
	__m128  t  = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */
	__m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */
	
	/* First Newton-Raphson step, accuracy is now 24 bits */
	__m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1)))));
	
	/* Return second Newton-Raphson step, accuracy 48 bits */
	return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2)))));
}
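Each step above applies the Newton-Raphson recurrence y' = 0.5*y*(3 - x*y*y) for 1/sqrt(x), roughly doubling the number of correct bits per iteration. A quick sanity check (a sketch; assumes my_invrsq_pd above is in scope):

#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
	double out[2];
	_mm_storeu_pd(out, my_invrsq_pd(_mm_set_pd(4.0, 2.0)));
	printf("%.6f %.6f\n", out[0], out[1]); /* expect ~0.707107 ~0.500000 */
	return 0;
}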

/* extract one of the two 32-bit table indices packed in the low half of a
   __m128i (each index is 32 bits wide, despite the epi64 name) */
#define _mm_extract_epi64(x, imm) \
    _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm)))
	
void nb_kernel400_x86_64_sse2(int *           p_nri,
                    int *           iinr,
                    int *           jindex,
                    int *           jjnr,
                    int *           shift,
                    double *         shiftvec,
                    double *         fshift,
                    int *           gid,
                    double *         pos,
                    double *         faction,
                    double *         charge,
                    double *         p_facel,
                    double *         p_krf,
                    double *         p_crf,
                    double *         Vc,
                    int *           type,
                    int *           p_ntype,
                    double *         vdwparam,
                    double *         Vvdw,
                    double *         p_tabscale,
                    double *         VFtab,
                    double *         invsqrta,
                    double *         dvda,
                    double *         p_gbtabscale,
                    double *         GBtab,
                    int *           p_nthreads,
                    int *           count,
                    void *          mtx,
                    int *           outeriter,
                    int *           inneriter,
                    double *         work)
{
	int           nri,ntype,nthreads,offset;
	int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double        facel,krf,crf,tabscl,gbtabscl,vct,vgbt;
	double        shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	float *        gpol;

	__m128d       ix,iy,iz,jx,jy,jz;
	__m128d		  dx,dy,dz,t1,t2,t3;
	__m128d		  fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d		  q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
	__m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d;
	__m128d		  xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
	__m128d       fac,tabscale,gbtabscale;
	__m128i       n0,nnn;
	
	const __m128d neg    = {-1.0f,-1.0f};
	const __m128d zero   = {0.0f,0.0f};
	const __m128d half   = {0.5f,0.5f};
	const __m128d two    = {2.0f,2.0f};
	const __m128d three  = {3.0f,3.0f};
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;

	nri        = *p_nri;
	ntype      = *p_ntype;
	nthreads   = *p_nthreads; 
    facel      = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));       
	krf        = *p_krf;
	crf        = *p_crf;
	tabscl     = *p_tabscale;
	gbtabscl   = *p_gbtabscale;
	nj1        = 0;
	
	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	tabscale   = _mm_load1_pd(&tabscl);
	gbtabscale = _mm_load1_pd(&gbtabscl);
		
	/* Keep compiler happy */
	dvdatmp = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();

	jnr1=jnr2=0;
	j13=j23=0;
	
	for(n=0;n<nri;n++)
	{
		is3     = 3*shift[n];
		shX     = shiftvec[is3];
		shY     = shiftvec[is3+1];
		shZ     = shiftvec[is3+2];
		
		nj0     = jindex[n];      
        nj1     = jindex[n+1];  
		offset  = (nj1-nj0)%2;
		
		ii      = iinr[n];
		ii3     = ii*3;
		
		ix      = _mm_set1_pd(shX+pos[ii3+0]);
		iy      = _mm_set1_pd(shY+pos[ii3+1]);
		iz      = _mm_set1_pd(shZ+pos[ii3+2]); 
		q       = _mm_set1_pd(charge[ii]);
		
		iq      = _mm_mul_pd(fac,q); 
		isai_d  = invsqrta[ii];
		isai    = _mm_load1_pd(&isai_d);
		
		fix     = _mm_setzero_pd();
		fiy     = _mm_setzero_pd();
		fiz     = _mm_setzero_pd();
		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();
		
		for(k=nj0;k<nj1-offset; k+=2)
		{
			jnr1    = jjnr[k];
			jnr2    = jjnr[k+1];
						
			j13     = jnr1 * 3;
			j23     = jnr2 * 3;
			
			/* Load coordinates */
			xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
			xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
			
			xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
			xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
			
			/* transpose */
			jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
			
			/* distances */
			dx      = _mm_sub_pd(ix,jx);
			dy		= _mm_sub_pd(iy,jy);
			dz		= _mm_sub_pd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
						
			/* Load invsqrta */
			isaj	= _mm_loadl_pd(isaj,invsqrta+jnr1);
			isaj	= _mm_loadh_pd(isaj,invsqrta+jnr2);
			isaprod = _mm_mul_pd(isai,isaj);
			
			/* Load charges */
			q		= _mm_loadl_pd(q,charge+jnr1);
			q		= _mm_loadh_pd(q,charge+jnr2);
			qq		= _mm_mul_pd(iq,q);
			
			vcoul	= _mm_mul_pd(qq,rinv);
			fscal	= _mm_mul_pd(vcoul,rinv);
			qq		= _mm_mul_pd(isaprod,qq);
			qq		= _mm_mul_pd(qq,neg);
			gbscale	= _mm_mul_pd(isaprod,gbtabscale);
			
			/* Load dvdaj */
			dvdaj	= _mm_loadl_pd(dvdaj, dvda+jnr1);
			dvdaj	= _mm_loadh_pd(dvdaj, dvda+jnr2);
			
			r		= _mm_mul_pd(rsq11,rinv);
			rt		= _mm_mul_pd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_pd(rt,n0d);
			eps2	= _mm_mul_pd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);  /* index*4: each GBtab entry holds Y,F,G,H */
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G		= _mm_mul_pd(G,eps);
			H		= _mm_mul_pd(H,eps2);
			Fp		= _mm_add_pd(F,G);
			Fp		= _mm_add_pd(Fp,H);
			VV		= _mm_mul_pd(Fp,eps);
			VV		= _mm_add_pd(Y,VV);
			H		= _mm_mul_pd(two,H);
			FF		= _mm_add_pd(Fp,G);
			FF		= _mm_add_pd(FF,H);
			vgb		= _mm_mul_pd(qq,VV);
			fijC	= _mm_mul_pd(qq,FF);
			fijC	= _mm_mul_pd(fijC,gbscale);
			
			dvdatmp = _mm_mul_pd(fijC,r);
			dvdatmp	= _mm_add_pd(vgb,dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp,neg);
			dvdatmp = _mm_mul_pd(dvdatmp,half);
			dvdasum	= _mm_add_pd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_pd(dvdatmp,isaj);
			xmm1	= _mm_mul_pd(xmm1,isaj);
			dvdaj	= _mm_add_pd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			_mm_storeh_pd(dvda+jnr2,dvdaj);
			
			vctot	= _mm_add_pd(vctot,vcoul);
			vgbtot  = _mm_add_pd(vgbtot,vgb);
					
			fscal	= _mm_sub_pd(fijC,fscal);
			fscal	= _mm_mul_pd(fscal,neg);
			fscal	= _mm_mul_pd(fscal,rinv);
						
			/* calculate partial force terms */
			t1		= _mm_mul_pd(fscal,dx);
			t2		= _mm_mul_pd(fscal,dy);
			t3		= _mm_mul_pd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_pd(fix,t1);
			fiy		= _mm_add_pd(fiy,t2);
			fiz		= _mm_add_pd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm1	= _mm_loadu_pd(faction+j13); /* fx1 fy1 */
			xmm2	= _mm_loadu_pd(faction+j23); /* fx2 fy2 */
			
			xmm5	= _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6	= _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
			
			/* transpose */
			xmm7	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_pd(xmm5,t1);
			xmm6	= _mm_sub_pd(xmm6,t2);
			xmm7	= _mm_sub_pd(xmm7,t3);
			
			xmm1	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */
			
			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);
			
			/* .. then fz */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}

		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			jnr1	= jjnr[k];
			j13		= jnr1*3;
			
			jx      = _mm_load_sd(pos+j13);
			jy      = _mm_load_sd(pos+j13+1);
			jz      = _mm_load_sd(pos+j13+2);
						
			isaj	= _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj	= _mm_load_sd(dvda+jnr1);
			q		= _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);
			
			dx      = _mm_sub_sd(ix,jx);
			dy		= _mm_sub_sd(iy,jy);
			dz		= _mm_sub_sd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
						
			vcoul	= _mm_mul_sd(qq,rinv);
			fscal	= _mm_mul_sd(vcoul,rinv);
			qq		= _mm_mul_sd(isaprod,qq);
			qq		= _mm_mul_sd(qq,neg);
			gbscale	= _mm_mul_sd(isaprod,gbtabscale);
			
			r		= _mm_mul_sd(rsq11,rinv);
			rt		= _mm_mul_sd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_sd(rt,n0d);
			eps2	= _mm_mul_sd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); 
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); 
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); 
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); 
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
			
			G		= _mm_mul_sd(G,eps);
			H		= _mm_mul_sd(H,eps2);
			Fp		= _mm_add_sd(F,G);
			Fp		= _mm_add_sd(Fp,H);
			VV		= _mm_mul_sd(Fp,eps);
			VV		= _mm_add_sd(Y,VV);
			H		= _mm_mul_sd(two,H);
			FF		= _mm_add_sd(Fp,G);
			FF		= _mm_add_sd(FF,H);
			vgb		= _mm_mul_sd(qq,VV);
			fijC	= _mm_mul_sd(qq,FF);
			fijC	= _mm_mul_sd(fijC,gbscale);
			
			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp	= _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum	= _mm_add_sd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_sd(dvdatmp,isaj);
			xmm1	= _mm_mul_sd(xmm1,isaj);
			dvdaj	= _mm_add_sd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			
			vctot	= _mm_add_sd(vctot,vcoul);
			vgbtot  = _mm_add_sd(vgbtot,vgb);
						
			fscal	= _mm_sub_sd(fijC,fscal);
			fscal	= _mm_mul_sd(fscal,neg);
			fscal	= _mm_mul_sd(fscal,rinv);
								
			/* calculate partial force terms */
			t1		= _mm_mul_sd(fscal,dx);
			t2		= _mm_mul_sd(fscal,dy);
			t3		= _mm_mul_sd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_sd(fix,t1);
			fiy		= _mm_add_sd(fiy,t2);
			fiz		= _mm_add_sd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm5	= _mm_load_sd(faction+j13);   /* fx */
			xmm6    = _mm_load_sd(faction+j13+1); /* fy */
			xmm7    = _mm_load_sd(faction+j13+2); /* fz */
						
			/* subtract partial forces */
			xmm5	= _mm_sub_sd(xmm5,t1);
			xmm6	= _mm_sub_sd(xmm6,t2);
			xmm7	= _mm_sub_sd(xmm7,t3);
			
			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}
		
		/* fix/fiy/fiz each hold two partial terms, which should both be
		 * added to the i particle forces
		 */
		t1		 = _mm_unpacklo_pd(t1,fix);
		t2		 = _mm_unpacklo_pd(t2,fiy);
		t3		 = _mm_unpacklo_pd(t3,fiz);
				
		fix		 = _mm_add_pd(fix,t1);
		fiy		 = _mm_add_pd(fiy,t2);
		fiz		 = _mm_add_pd(fiz,t3);
		
		fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
		
		/* Load i forces from memory */
		xmm1     = _mm_load_sd(faction+ii3);
		xmm2     = _mm_load_sd(faction+ii3+1);
		xmm3     = _mm_load_sd(faction+ii3+2);
		
		/* Add to i force */
		fix      = _mm_add_sd(fix,xmm1);
		fiy      = _mm_add_sd(fiy,xmm2);
		fiz      = _mm_add_sd(fiz,xmm3);
	
		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);
				
		/* now do dvda */
		dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
		
		ggid	 = gid[n];
		
		/* Coulomb potential */
		vcoul	 = _mm_unpacklo_pd(vcoul,vctot);
		vctot	 = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;
		
		/* GB potential */
		vgb  	 = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot	 = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}
	
	*outeriter   = nri;            
    *inneriter   = nj1; 
	
}
	static NPT_INLINE __m128d npt_mm_loaddup_pd(const double* ptr) {
		__m128d temp = _mm_load_sd(ptr);
		return _mm_unpacklo_pd(temp, temp);
	}
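On SSE3 and later targets the same broadcast is available as the single intrinsic _mm_loaddup_pd; a minimal equivalent (assumes SSE3 and that NPT_INLINE is defined as above):

	#include <pmmintrin.h>  /* SSE3 */

	static NPT_INLINE __m128d npt_mm_loaddup_pd_sse3(const double* ptr) {
		return _mm_loaddup_pd(ptr);  /* broadcast *ptr into both lanes */
	}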