Example #1
void SetSolution_8(double value) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
/* Statements in this Scop: S185 */
for (int i0 = (iterationOffsetBegin[1][1]-1); (i0<=(iterationOffsetEnd[1][1]+257)); i0 += 1) {
double* fieldData_Solution_8_p1 = (&fieldData_Solution[8][(i0*260)]);
int i1 = (iterationOffsetBegin[1][0]-1);
for (; (i1<(iterationOffsetBegin[1][0]&(~1))); i1 += 1) {
fieldData_Solution_8_p1[(i1+262)] = value;
}
__m128d vec0 = _mm_set1_pd(value);
for (; (i1<(iterationOffsetEnd[1][0]+255)); i1 += 4) {
/* fieldData_Solution_8_p1[(i1+262)] = value; */
__m128d vec1;
__m128d vec1_2;
vec1 = vec0;
vec1_2 = vec0;
_mm_storeu_pd((&fieldData_Solution_8_p1[(i1+262)]), vec1);
_mm_storeu_pd((&fieldData_Solution_8_p1[(i1+264)]), vec1_2);
}
for (; (i1<(iterationOffsetEnd[1][0]+258)); i1 += 1) {
fieldData_Solution_8_p1[(i1+262)] = value;
}
}
}
}
}
Example #2
void SetSolution_GMRF_7(double value) {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
/* Statements in this Scop: S193 */
for (int i0 = (iterationOffsetBegin[0][1]-1); (i0<=(iterationOffsetEnd[0][1]+129)); i0 += 1) {
double* fieldData_Solution_GMRF_7_p1 = (&fieldData_Solution_GMRF[7][(i0*132)]);
int i1 = (iterationOffsetBegin[0][0]-1);
for (; (i1<(iterationOffsetBegin[0][0]&(~1))); i1 += 1) {
fieldData_Solution_GMRF_7_p1[(i1+134)] = value;
}
__m128d vec0 = _mm_set1_pd(value);
for (; (i1<(iterationOffsetEnd[0][0]+127)); i1 += 4) {
/* fieldData_Solution_GMRF_7_p1[(i1+134)] = value; */
__m128d vec1;
__m128d vec1_2;
vec1 = vec0;
vec1_2 = vec0;
_mm_storeu_pd((&fieldData_Solution_GMRF_7_p1[(i1+134)]), vec1);
_mm_storeu_pd((&fieldData_Solution_GMRF_7_p1[(i1+136)]), vec1_2);
}
for (; (i1<(iterationOffsetEnd[0][0]+130)); i1 += 1) {
fieldData_Solution_GMRF_7_p1[(i1+134)] = value;
}
}
}
}
}
Example #3
void InitRHS_GMRF() {
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
/* Statements in this Scop: S279 */
for (int i0 = iterationOffsetBegin[0][1]; (i0<=(iterationOffsetEnd[0][1]+256)); i0 += 1) {
double* fieldData_RHS_GMRF_8_p1 = (&fieldData_RHS_GMRF[8][(i0*258)]);
int i1 = iterationOffsetBegin[0][0];
for (; (i1<((iterationOffsetBegin[0][0]+1)&(~1))); i1 += 1) {
fieldData_RHS_GMRF_8_p1[i1] = 0.000000e+00;
}
__m128d vec0 = _mm_set1_pd(0.000000e+00);
for (; (i1<(iterationOffsetEnd[0][0]+254)); i1 += 4) {
/* fieldData_RHS_GMRF_8_p1[i1] = 0.000000e+00; */
__m128d vec1;
__m128d vec1_2;
vec1 = vec0;
vec1_2 = vec0;
_mm_storeu_pd((&fieldData_RHS_GMRF_8_p1[i1]), vec1);
_mm_storeu_pd((&fieldData_RHS_GMRF_8_p1[(i1+2)]), vec1_2);
}
for (; (i1<(iterationOffsetEnd[0][0]+257)); i1 += 1) {
fieldData_RHS_GMRF_8_p1[i1] = 0.000000e+00;
}
}
}
}
}
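Examples #1-#3 appear to be machine-generated stencil kernels (note the /* Statements in this Scop */ markers), and all three share the same shape: a short scalar prologue that peels iterations until the index is even, a main loop that broadcasts the constant with _mm_set1_pd and issues two unaligned _mm_storeu_pd stores per iteration (four doubles per step), and a scalar epilogue for the remainder. Below is a minimal hand-written sketch of the same pattern; fill_f64 and its names are illustrative, not part of the generated code, and the alignment prologue is omitted because _mm_storeu_pd tolerates any alignment.

#include <stddef.h>
#include <emmintrin.h>  /* SSE2 */

/* Sketch: fill n doubles at p with value, SIMD body plus scalar remainder. */
static void fill_f64(double *p, size_t n, double value)
{
    size_t i = 0;
    __m128d v = _mm_set1_pd(value);
    for (; i + 4 <= n; i += 4) {        /* vectorized body, unrolled by 2 */
        _mm_storeu_pd(p + i,     v);
        _mm_storeu_pd(p + i + 2, v);
    }
    for (; i < n; ++i)                  /* scalar remainder */
        p[i] = value;
}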
Example #4
#include <stdio.h>
#include <emmintrin.h>   /* SSE2 intrinsics */

int main( int argc, char **argv ) {
    /* set   A  =   |1 3|,     B  =   |3 0|       C =   |0 0|
                    |2 4|             |0 2|             |0 0|  */
    double A[4] = {1,2,3,4}, B[4] = {3,0,0,2}, C[4] = {0,0,0,0};

    /*   We are computing C = C + A x B, which means:
         C[0] += A[0]*B[0] + A[2]*B[1]
         C[1] += A[1]*B[0] + A[3]*B[1]
         C[2] += A[0]*B[2] + A[2]*B[3]
         C[3] += A[1]*B[2] + A[3]*B[3] */

    /* load entire matrix C into SIMD variables */
    __m128d c1 = _mm_loadu_pd( C+0 ); /* c1 = (C[0],C[1]) */
    __m128d c2 = _mm_loadu_pd( C+2 ); /* c2 = (C[2],C[3]) */

    for( int i = 0; i < 2; i++ ) {
        __m128d a  = _mm_loadu_pd( A+i*2 ); /* load next column of A */
        __m128d b1 = _mm_load1_pd( B+0+i );
        __m128d b2 = _mm_load1_pd( B+2+i ); /* load next row of B */

        c1 = _mm_add_pd( c1, _mm_mul_pd( a, b1 ) ); /* multiply and add */
        c2 = _mm_add_pd( c2, _mm_mul_pd( a, b2 ) );
    }

    /* store the result back to the C array */
    _mm_storeu_pd( C+0, c1 ); /* (C[0],C[1]) = c1 */
    _mm_storeu_pd( C+2, c2 ); /* (C[2],C[3]) = c2 */

    /* output whatever we've got */
    printf( "|%g %g| * |%g %g| = |%g %g|\n", A[0], A[2], B[0], B[2], C[0], C[2] );
    printf( "|%g %g|   |%g %g|   |%g %g|\n", A[1], A[3], B[1], B[3], C[1], C[3] );

    return 0;
}
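For reference, this prints C = A x B = {3, 6, 6, 8} in column-major order, i.e. |3 6; 6 8|: e.g. C[0] = A[0]*B[0] + A[2]*B[1] = 1*3 + 3*0 = 3, and C[3] = A[1]*B[2] + A[3]*B[3] = 2*0 + 4*2 = 8. Each loop iteration adds column i of A, scaled by the two entries of row i of B, into both columns of C (a rank-1 update), which is why a single broadcast per B entry suffices.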
Example #5
// multiply each of the n doubles at p by v
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)

	const __m256d v4 = _mm256_set1_pd(v);

	switch ((size_t)p & 0x1F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x10:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x18:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 4; n-=4)
		{
			_mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
		break;
	default:
		for (; n >= 4; n-=4)
		{
			_mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
			p += 4;
		}
		if (n >= 2)
		{
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4)));
			p += 2; n -= 2;
		}
	}

#elif defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) *= v; n--; }
	case 0x00:
		for (; n >= 2; n-=2, p+=2)
			_mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
		break;
	default:
		for (; n >= 2; n-=2, p+=2)
			_mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
	}

#endif

	for (; n > 0; n--) (*p++) *= v;
}
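The switch on (size_t)p & 0x1F falls through on purpose: starting from a residue of 0x08, 0x10, or 0x18 it peels three, two, or one scalar element(s) until p is 32-byte aligned, so the AVX path can use aligned _mm256_load_pd/_mm256_store_pd; any other residue falls to default and uses the unaligned variants. A usage sketch (array and values are illustrative):

double x[5] = {1, 2, 3, 4, 5};
vec_f64_mul(x, 5, 2.0);   /* x becomes {2, 4, 6, 8, 10} */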
Example #6
void
mlib_FIR_tap2f_d64s(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	mlib_s32 j;
	mlib_d64 src1_1, src2_1;
	mlib_d64 src1_2, src2_2;
	mlib_d64 dflt1 = pflt[0], dflt2 = pflt[1];
	__m128d sdflt1, sdflt2, ssrc1, ssrc2, smul1, smul2;

	sdflt2 = _mm_set1_pd(dflt2);
	sdflt1 = _mm_set1_pd(dflt1);

	if ((mlib_addr)psrc & 15) {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (j = 0; j < n; j++) {
			ssrc1 = _mm_loadu_pd(psrc);
			ssrc2 = _mm_loadu_pd(psrc + 2);

			smul1 = _mm_mul_pd(sdflt2, ssrc1);
			smul2 = _mm_mul_pd(sdflt1, ssrc2);

			smul1 = _mm_add_pd(smul1, smul2);

			_mm_storeu_pd(pdst, smul1);

			psrc += 2;
			pdst += 2;
		}
	} else {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (j = 0; j < n; j++) {
			ssrc1 = _mm_load_pd(psrc);
			ssrc2 = _mm_load_pd(psrc + 2);

			smul1 = _mm_mul_pd(sdflt2, ssrc1);
			smul2 = _mm_mul_pd(sdflt1, ssrc2);

			smul1 = _mm_add_pd(smul1, smul2);

			_mm_storeu_pd(pdst, smul1);

			psrc += 2;
			pdst += 2;
		}
	}
}
Example #7
/* This function uses SSE to implement the multiplication. */
void square_dgemm(int lda, double* A, double* B, double* C) {
    // define the variables here

    register __m128d cTmp, aTmp, bTmp; 

    for (int j = 0; j < lda; j++) {
        for (int k = 0; k < lda; k++) {
            // broadcast B(k, j) into both lanes of bTmp
            bTmp = _mm_load1_pd(B + k + j*lda);

            double* adda_mid = A + k*lda;
            double* addc_mid = C + j*lda;
            for (int i = 0; i < lda/8*8; i += 8) {
                double* adda = adda_mid + i;
                double* addc = addc_mid + i;
                
                aTmp = _mm_loadu_pd(adda);
                cTmp = _mm_loadu_pd(addc);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc, cTmp);

                aTmp = _mm_loadu_pd(adda + 2);
                cTmp = _mm_loadu_pd(addc + 2);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd((addc + 2), cTmp);

                aTmp = _mm_loadu_pd(adda + 4);
                cTmp = _mm_loadu_pd(addc + 4);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd((addc + 4), cTmp);

                aTmp = _mm_loadu_pd(adda + 6);
                cTmp = _mm_loadu_pd(addc + 6);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd((addc + 6), cTmp);
            }

            for (int i = lda/8*8; i < lda/2*2; i += 2) {
                double* adda = adda_mid + i;
                double* addc = addc_mid + i;
                
                aTmp = _mm_loadu_pd(adda);
                cTmp = _mm_loadu_pd(addc);
                cTmp = _mm_add_pd(cTmp, _mm_mul_pd(bTmp, aTmp));
                _mm_storeu_pd(addc, cTmp);
            }

            // scalar remainder for any leftover element
            for (int i = lda/2*2; i < lda; i ++) {
                C[i + j*lda] += A[i + k*lda] * B[k+j*lda];

            }
        }
    }
}
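A usage sketch for square_dgemm (hypothetical values; the routine expects column-major lda x lda matrices and accumulates into C):

double A[4] = {1, 2, 3, 4};   /* column-major 2x2 */
double B[4] = {3, 0, 0, 2};
double C[4] = {0, 0, 0, 0};
square_dgemm(2, A, B, C);     /* C += A * B, same matrices as Example #4 */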
Example #8
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor) {

    int i,j,k,k1,it,jt,itt,jtt,it_bound,jt_bound,itt_bound,jtt_bound;
    int conflict,tmp,tmpN,offset,line_offset,setnum,set[8192/(4*sizeof(double))];
    double *pA, *pB;


    register __m128d x, y, z, w, t, t1,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt=itt+5*tilesize) {
        jtt_bound =(N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt=jtt+5*tilesize) {
            it_bound = (itt+5*tilesize > itt_bound)?itt_bound:itt+5*tilesize;
            for (it = itt; it < it_bound; it = it+tilesize) {
                jt_bound = (jtt+5*tilesize > jtt_bound) ? jtt_bound : jtt+5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt+tilesize) {
                    k = 0;
                    for (j = jt; j < jt+tilesize; j=j+2) {
                        for (i = it; i < it+tilesize; i=i+2) {
                            pA = a+i*N2+j;
                            pB = b+j*N1+i;
                            x = _mm_loadu_pd(pA);
                            x = _mm_mul_pd(x,fac_vector);
                            y = _mm_loadu_pd(pA + N2);
                            y = _mm_mul_pd(y,fac_vector);
                            z = _mm_shuffle_pd( x, y, 0);
                            w = _mm_shuffle_pd( x, y, 3);
                            _mm_storeu_pd(pB,z);
                            _mm_storeu_pd(pB + N1,w);
                        }
                    }
                }
            }
        }
        for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1+i] = factor * a[i*N2+j];
            }
        }
    }
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1+i] = factor * a[i*N2+j];
        }
    }
}
Example #9
// *p += (*s) * v
COREARRAY_DLL_DEFAULT double *vec_f64_addmul(double *p, const double *s,
	size_t n, double v)
{
#if defined(COREARRAY_SIMD_SSE2)

	const __m128d v2 = _mm_set1_pd(v);

	switch ((size_t)p & 0x0F)
	{
	case 0x08:
		if (n > 0) { (*p++) += (*s++) * v; n--; }
	case 0x00:
		for (; n >= 2; n -= 2)
		{
			_mm_store_pd(p, _mm_add_pd(_mm_load_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
		break;
	default:
		for (; n >= 2; n-=2)
		{
			_mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p),
				_mm_mul_pd(_mm_loadu_pd(s), v2)));
			p += 2; s += 2;
		}
	}

#endif

	for (; n > 0; n--) (*p++) += (*s++) * v;
	return p;
}
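A usage sketch for vec_f64_addmul (illustrative values):

double p[3] = {1, 1, 1};
double s[3] = {1, 2, 3};
vec_f64_addmul(p, s, 3, 2.0);   /* p becomes {3, 5, 7} */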
Example #10
void sgemm( int m, int n, float *A, float *C ) {
    int i, j, k, itn, jtn, ceiling;
    float B[n * m];
    float buf[4];
    __m128 sum, a, b;
    transpose(m, n, A, B);   /* assuming transpose() stores B[k + i*n] = A[i + k*m] */
    for (i = 0; i < m; i += 1) {
        itn = i * n;
        for (j = 0; j < m; j += 1) {
            jtn = j * n;
            sum = _mm_setzero_ps();   /* the accumulator must start at zero */
            for (k = 0, ceiling = n - 3; k < ceiling; k += 4) {
                a = _mm_loadu_ps(B + k + itn);   /* A(i, k..k+3) */
                b = _mm_loadu_ps(B + k + jtn);   /* A(j, k..k+3) */
                sum = _mm_add_ps(sum, _mm_mul_ps(a, b));
            }
            _mm_storeu_ps(buf, sum);
            C[i + j * m] = buf[0] + buf[1] + buf[2] + buf[3];
            for ( ; k < n; k += 1) {   /* scalar remainder */
                C[i + j * m] += B[k + itn] * B[k + jtn];
            }
        }
    }
}
Example #11
static void
sse3_test_movddup_mem (double *i1, double *r)
{
  __m128d t1 = _mm_loaddup_pd (i1);

  _mm_storeu_pd (r, t1);
}
Example #12
static void
sse3_test_movddup_reg_subsume (double *i1, double *r)
{
  __m128d t1 = _mm_load_pd (i1);
  __m128d t2 = _mm_movedup_pd (t1);

  _mm_storeu_pd (r, t2);
}
Example #13
void test_mm_storeu_pd(double* A, __m128d B) {
  // DAG-LABEL: test_mm_storeu_pd
  // DAG: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1
  //
  // ASM-LABEL: test_mm_storeu_pd
  // ASM: movupd
  _mm_storeu_pd(A, B);
}
Example #14
static void
sse2_test (void)
{
  double a[2];
  __m128d x = _mm_set1_pd(3);
  _mm_storeu_pd(a,x);
  if (a[0] != 3.0 || a[1] != 3.0)
    __builtin_abort ();
}
Example #15
value complex_add(value vx, value vy)
{
    CAMLparam2(vx, vy);
    CAMLlocal1(vz);
    vz = caml_alloc(Double_array_tag, 2);
    _mm_storeu_pd((double*) vz,
                  _mm_loadu_pd((double const*) vx) + _mm_loadu_pd((double const*) vy));
    CAMLreturn(vz);
}
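The + applied to two __m128d values in complex_add relies on GCC/Clang vector-extension operators; with portable intrinsics the store would read:

_mm_storeu_pd((double*) vz,
              _mm_add_pd(_mm_loadu_pd((double const*) vx),
                         _mm_loadu_pd((double const*) vy)));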
Example #16
static void
sse3_test_haddpd (double *i1, double *i2, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_loadu_pd (i2);

  t1 = _mm_hadd_pd (t1, t2);

  _mm_storeu_pd (r, t1);
}
Example #17
static void
sse3_test_hsubpd_subsume (double *i1, double *i2, double *r)
{
    __m128d t1 = _mm_load_pd (i1);
    __m128d t2 = _mm_load_pd (i2);

    t1 = _mm_hsub_pd (t1, t2);

    _mm_storeu_pd (r, t1);
}
Example #18
static void
sse3_test_movddup_reg (double *i1, double *r)
{
  __m128d t1 = _mm_loadu_pd (i1);
  __m128d t2 = _mm_loadu_pd (&cnst1[0]);

  t1  = _mm_mul_pd (t1, t2);
  t2  = _mm_movedup_pd (t1);

  _mm_storeu_pd (r, t2);
}
Example #19
value complex_mul(value vab, value vcd)
{
    CAMLparam2(vab, vcd);
    CAMLlocal1(vz);
    vz = caml_alloc(Double_array_tag, 2);
    /* (a + b*i) * (c + d*i) = (a*c - b*d) + (a*d + b*c)*i */
    __m128d ab, cd, aa, bb, dc;
    ab = _mm_loadu_pd((double const*) vab);   /* (a, b) */
    cd = _mm_loadu_pd((double const*) vcd);   /* (c, d) */
    aa = _mm_shuffle_pd(ab, ab, 0);           /* (a, a) */
    bb = _mm_shuffle_pd(ab, ab, 3);           /* (b, b) */
    dc = _mm_shuffle_pd(cd, cd, 1);           /* (d, c) */
    /* addsub yields (a*c - b*d, a*d + b*c) */
    _mm_storeu_pd((double*) vz,
                  _mm_addsub_pd(_mm_mul_pd(aa, cd), _mm_mul_pd(bb, dc)));
    CAMLreturn(vz);
}
Example #20
/* Copy new_matrix back to the old_matrix. */
void copy_padding_back(int old_size, double* old_matrix, int new_size, double* new_matrix) {
    if (old_size % 2 == 1) {
        for (int i = 0; i < old_size; i++) {
            double* addr_new = new_matrix + i * new_size;
            double* addr_old = old_matrix + i * old_size;
            for (int j = 0; j < old_size - 1; j += 2) {
                __m128d v1 = _mm_load_pd(addr_new + j);
                _mm_storeu_pd(addr_old + j, v1);
            }
            /* odd row length: copy the last element separately */
            old_matrix[(i+1)*old_size-1] = new_matrix[i*new_size+old_size-1];
        }
    } else {
        for (int i = 0; i < old_size; i++) {
            double* addr_new = new_matrix + i * new_size;
            double* addr_old = old_matrix + i * old_size;
            for (int j = 0; j < old_size; j += 2) {
                __m128d v1 = _mm_load_pd(addr_new + j);
                _mm_storeu_pd(addr_old + j, v1);
            }
        }
    }
    free(new_matrix);
}
Example #21
static void add_block(double* new_A, double*  A, int M, int N, int lda, int M_even) {

    __m128d a; 
    int i_step;
    for (int j=0; j<N; j++) {
        for (int i=0; i<M; i+=I_STRIDE) {
            i_step = min(I_STRIDE,M-i); 
            if (i_step == 1) {
                A[i+j*lda] = new_A[i+j*M_even];
            } 
            else {
                a = _mm_load_pd(new_A + i + j*M_even);
                _mm_storeu_pd(A+i+j*lda,a);
            }
        }
    }
}
Example #22
ALGEBRA_INLINE void		vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	
	__m128d l1 = _mm_load1_pd(&lambda);

	size_t q = n / 2;
	size_t r = n % 2;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* Load 2 values from each array */
				__m128d i1 = _mm_load_pd(v1);
				__m128d j1 = _mm_load_pd(v2);
				/* multiply */
				j1 = _mm_mul_pd(j1, l1);
				/* add */
				i1 = _mm_add_pd(i1, j1);
				/* store */
				_mm_store_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
		else {		
			for (k=0;k<q;k++) {
				/* Load 2 values from each array */
				__m128d i1 = _mm_loadu_pd(v1);
				__m128d j1 = _mm_loadu_pd(v2);
				j1 = _mm_mul_pd(j1, l1);
				/* add */
				i1 = _mm_add_pd(i1, j1);
				/* store */
				_mm_storeu_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
	}
	
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];

}
Example #23
// from Intel's sample intrin_double_sample.c
void multiply_SSE3(double xr, double xi, double yr, double yi,
    complex_num *z)
{
    __m128d num1, num2, num3;

    // Duplicates lower vector element into upper vector element.
    //   num1: [x.real, x.real]

    num1 = _mm_loaddup_pd(&xr);

    // Move y elements into a vector
    //   num2: [y.img, y.real]

    num2 = _mm_set_pd(yi, yr);

    // Multiplies vector elements
    //   num3: [(x.real*y.img), (x.real*y.real)]

    num3 = _mm_mul_pd(num2, num1);

    //   num1: [x.img, x.img]

    num1 = _mm_loaddup_pd(&xi);

    // Swaps the vector elements
    //   num2: [y.real, y.img]

    num2 = _mm_shuffle_pd(num2, num2, 1);

    //   num2: [(x.img*y.real), (x.img*y.img)]

    num2 = _mm_mul_pd(num2, num1);

    // Adds upper vector element while subtracting lower vector element
    //   num3: [((x.real *y.img)+(x.img*y.real)),
    //          ((x.real*y.real)-(x.img*y.img))]

    num3 = _mm_addsub_pd(num3, num2);

    // Stores the elements of num3 into z

    _mm_storeu_pd((double *)z, num3);

}
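A usage sketch, assuming complex_num is the two-double struct from Intel's sample (its definition is not shown in this snippet):

typedef struct { double real, img; } complex_num;   /* assumed layout */
complex_num z;
multiply_SSE3(1.0, 2.0, 3.0, 4.0, &z);   /* (1+2i)*(3+4i) = -5+10i */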
Example #24
void
mlib_FIR_tap1f_d64(
    mlib_d64 *pdst,
    const mlib_d64 *psrc,
    mlib_d64 *pflt,
    mlib_s32 n)
{
	mlib_s32 j;
	mlib_d64 dflt1 = pflt[0];
	__m128d sdflt1;
	__m128d ssrc1;
	__m128d smul1;

	j = 0;
	if ((mlib_addr)psrc & 15) {
		pdst[0] = dflt1 * psrc[0];
		psrc++;
		pdst++;
		j++;
	}

	sdflt1 = _mm_set1_pd(dflt1);

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; j < (n - 1); j += 2) {
		ssrc1 = _mm_load_pd(psrc);
		smul1 = _mm_mul_pd(sdflt1, ssrc1);
		_mm_storeu_pd(pdst, smul1);
		psrc += 2;
		pdst += 2;
	}

	for (; j < n; j++) {
		pdst[0] = dflt1 * psrc[0];
		psrc++;
		pdst++;
	}
}
Example #25
void tce_sort_4_simd(double* unsorted,double* sorted,
                     int a, int b, int c, int d,
                     int i, int j, int k, int l,
                     double factor) {
    int id[4],jd[4],ia,ib,j1,j2,j3,j4;
    int l1,l2,l3,l4;
    int ia1,ia2,ia3,ia4;
    int ib1,ib2,ib3,ib4;
    int rangea1,rangea2,rangea3,rangea4;
    int rangeb1,rangeb2,rangeb3,rangeb4;
    int range[4],order[4],order_r[4];
    int jj1,jj2,jj3,jj4;
    int jj1_bound,jj2_bound,jj3_bound,jj4_bound;
    int count,ir,jr,kr,lr,N1,N2;

    double *pA, *pB;
    register __m128d x, y, z, w, t, t1,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    jd[0] = a;
    jd[1] = b;
    jd[2] = c;
    jd[3] = d;
    // prefer writes

    range[0] = b*c*d;
    range[1] = c*d;
    range[2] = d;
    range[3] = 1;

    l1 = jd[i];
    l2 = jd[j];
    l3 = jd[k];
    l4 = jd[l];

    rangea1 = range[i];
    rangea2 = range[j];
    rangea3 = range[k];
    rangea4 = range[l];

    rangeb1 = l2*l3*l4;
    rangeb2 = l3*l4;
    rangeb3 = l4;
    rangeb4 = 1;


    // here vectorization can rely on the compiler
    if (l == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia = ia3 + j4*rangea4;
                        ib = ib3 + j4*rangeb4;
                        sorted[ib] = unsorted[ia] * factor;
                    }
                }
            }
        }
    }

    if (k == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
                        for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                            ia3 = ia2 + jj3*rangea3;
                            ib3 = ib2 + jj3*rangeb3;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia3 + jj4*rangea4;
                                ib = ib3 + jj4*rangeb4;
                                N1 = rangeb3;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

    if (j == 3) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3;
                    ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
                        for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                            ia2 = ia3 + jj2*rangea2;
                            ib2 = ib3 + jj2*rangeb2;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia2 + jj4*rangea4;
                                ib = ib2 + jj4*rangeb4;
                                N1 = rangeb2;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

    if (i == 3) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2;
                ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
                        for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                            ia1 = ia3 + jj1*rangea1;
                            ib1 = ib3 + jj1*rangeb1;
                            jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
                            for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                ia = ia1 + jj4*rangea4;
                                ib = ib1 + jj4*rangeb4;
                                N1 = rangeb1;
                                N2 = rangea4;

                                pA = unsorted+ia;
                                pB = sorted+ib;
                                x = _mm_loadu_pd(pA);
                                x = _mm_mul_pd(x,fac_vector);
                                y = _mm_loadu_pd(pA + N2);
                                y = _mm_mul_pd(y,fac_vector);
                                z = _mm_shuffle_pd( x, y, 0);
                                w = _mm_shuffle_pd( x, y, 3);
                                _mm_storeu_pd(pB,z);
                                _mm_storeu_pd(pB + N1,w);
                            }
                        }
                    }
                }
            }
        }
    }

}
Example #26
void UpResidual_GMRF_3() {
exchsolution_gmrfData_3(0);
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[0]) {
/* Statements in this Scop: S139 */
for (int i0 = iterationOffsetBegin[0][1]; (i0<=(iterationOffsetEnd[0][1]+8)); i0 += 1) {
double* fieldData_Solution_GMRF_3_p1 = (&fieldData_Solution_GMRF[3][(i0*11)]);
double* fieldData_RHS_GMRF_3_p1 = (&fieldData_RHS_GMRF[3][(i0*9)]);
double* fieldData_LaplaceCoeff_GMRF_3_p1 = (&fieldData_LaplaceCoeff_GMRF[3][(i0*11)]);
double* fieldData_Residual_GMRF_3_p1 = (&fieldData_Residual_GMRF[3][(i0*11)]);
int i1 = (iterationOffsetBegin[0][0]+i0);
for (; (i1<(((iterationOffsetBegin[0][0]+i0)+1)&(~1))); i1 += 1) {
fieldData_Residual_GMRF_3_p1[(i1+14)] = (fieldData_RHS_GMRF_3_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_3_p1[(i1+14)]*fieldData_Solution_GMRF_3_p1[(i1+14)])+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+146)]*fieldData_Solution_GMRF_3_p1[(i1+15)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+278)]*fieldData_Solution_GMRF_3_p1[(i1+13)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+410)]*fieldData_Solution_GMRF_3_p1[(i1+26)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+542)]*fieldData_Solution_GMRF_3_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+674)]*fieldData_Solution_GMRF_3_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+806)]*fieldData_Solution_GMRF_3_p1[(i1+25)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+938)]*fieldData_Solution_GMRF_3_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1070)]*fieldData_Solution_GMRF_3_p1[(i1+27)])));
}
for (; (i1<((iterationOffsetEnd[0][0]+i0)+6)); i1 += 4) {
/* fieldData_Residual_GMRF_3_p1[(i1+14)] = (fieldData_RHS_GMRF_3_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_3_p1[(i1+14)]*fieldData_Solution_GMRF_3_p1[(i1+14)])+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+146)]*fieldData_Solution_GMRF_3_p1[(i1+15)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+278)]*fieldData_Solution_GMRF_3_p1[(i1+13)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+410)]*fieldData_Solution_GMRF_3_p1[(i1+26)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+542)]*fieldData_Solution_GMRF_3_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+674)]*fieldData_Solution_GMRF_3_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+806)]*fieldData_Solution_GMRF_3_p1[(i1+25)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+938)]*fieldData_Solution_GMRF_3_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1070)]*fieldData_Solution_GMRF_3_p1[(i1+27)]))); */
__m128d vec0 = _mm_loadu_pd((&fieldData_RHS_GMRF_3_p1[i1]));
__m128d vec0_2 = _mm_loadu_pd((&fieldData_RHS_GMRF_3_p1[(i1+2)]));
__m128d vec1 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1070)]));
__m128d vec1_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1072)]));
__m128d vec2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+27)]));
__m128d vec2_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+29)]));
__m128d vec3 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+938)]));
__m128d vec3_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+940)]));
__m128d vec4 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+3)]));
__m128d vec4_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+5)]));
__m128d vec5 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+806)]));
__m128d vec5_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+808)]));
__m128d vec6 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+25)]));
__m128d vec6_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+27)]));
__m128d vec7 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+674)]));
__m128d vec7_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+676)]));
__m128d vec8 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+1)]));
__m128d vec8_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+3)]));
__m128d vec9 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+542)]));
__m128d vec9_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+544)]));
__m128d vec10 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+2)]));
__m128d vec10_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+4)]));
__m128d vec11 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+410)]));
__m128d vec11_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+412)]));
__m128d vec12 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+26)]));
__m128d vec12_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+28)]));
__m128d vec13 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+278)]));
__m128d vec13_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+280)]));
__m128d vec14 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+13)]));
__m128d vec14_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+15)]));
__m128d vec15 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+14)]));
__m128d vec15_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+16)]));
__m128d vec16 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+14)]));
__m128d vec16_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+16)]));
__m128d vec17 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+146)]));
__m128d vec17_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+148)]));
__m128d vec18 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+15)]));
__m128d vec18_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+17)]));
__m128d vec19;
__m128d vec19_2;
vec19 = _mm_sub_pd(vec0, _mm_add_pd(_mm_mul_pd(vec1, vec2), _mm_add_pd(_mm_mul_pd(vec3, vec4), _mm_add_pd(_mm_mul_pd(vec5, vec6), _mm_add_pd(_mm_mul_pd(vec7, vec8), _mm_add_pd(_mm_mul_pd(vec9, vec10), _mm_add_pd(_mm_mul_pd(vec11, vec12), _mm_add_pd(_mm_mul_pd(vec13, vec14), _mm_add_pd(_mm_mul_pd(vec15, vec16), _mm_mul_pd(vec17, vec18))))))))));
vec19_2 = _mm_sub_pd(vec0_2, _mm_add_pd(_mm_mul_pd(vec1_2, vec2_2), _mm_add_pd(_mm_mul_pd(vec3_2, vec4_2), _mm_add_pd(_mm_mul_pd(vec5_2, vec6_2), _mm_add_pd(_mm_mul_pd(vec7_2, vec8_2), _mm_add_pd(_mm_mul_pd(vec9_2, vec10_2), _mm_add_pd(_mm_mul_pd(vec11_2, vec12_2), _mm_add_pd(_mm_mul_pd(vec13_2, vec14_2), _mm_add_pd(_mm_mul_pd(vec15_2, vec16_2), _mm_mul_pd(vec17_2, vec18_2))))))))));
_mm_storeu_pd((&fieldData_Residual_GMRF_3_p1[(i1+14)]), vec19);
_mm_storeu_pd((&fieldData_Residual_GMRF_3_p1[(i1+16)]), vec19_2);
}
for (; (i1<((iterationOffsetEnd[0][0]+i0)+9)); i1 += 1) {
fieldData_Residual_GMRF_3_p1[(i1+14)] = (fieldData_RHS_GMRF_3_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_3_p1[(i1+14)]*fieldData_Solution_GMRF_3_p1[(i1+14)])+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+146)]*fieldData_Solution_GMRF_3_p1[(i1+15)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+278)]*fieldData_Solution_GMRF_3_p1[(i1+13)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+410)]*fieldData_Solution_GMRF_3_p1[(i1+26)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+542)]*fieldData_Solution_GMRF_3_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+674)]*fieldData_Solution_GMRF_3_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+806)]*fieldData_Solution_GMRF_3_p1[(i1+25)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+938)]*fieldData_Solution_GMRF_3_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1070)]*fieldData_Solution_GMRF_3_p1[(i1+27)])));
}
}
}
}
}
Example #27
void UpResidual_0() {
exchsolutionData_0(0);
for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
if (isValidForSubdomain[1]) {
/* Statements in this Scop: S127 */
for (int i0 = iterationOffsetBegin[1][1]; (i0<=(iterationOffsetEnd[1][1]+1)); i0 += 1) {
double* fieldData_Solution_0_p1 = (&fieldData_Solution[0][(i0*5)]);
double* fieldData_LaplaceCoeff_0_p1 = (&fieldData_LaplaceCoeff[0][(i0*5)]);
double* fieldData_RHS_0_p1 = (&fieldData_RHS[0][i0]);
double* fieldData_Residual_0_p1 = (&fieldData_Residual[0][(i0*5)]);
int i1 = (iterationOffsetBegin[1][0]+i0);
for (; (i1<(((iterationOffsetBegin[1][0]+i0)+1)&(~1))); i1 += 1) {
fieldData_Residual_0_p1[(i1+8)] = (fieldData_RHS_0_p1[i1]-(((((((((fieldData_LaplaceCoeff_0_p1[(i1+8)]*fieldData_Solution_0_p1[(i1+8)])+(fieldData_LaplaceCoeff_0_p1[(i1+32)]*fieldData_Solution_0_p1[(i1+9)]))+(fieldData_LaplaceCoeff_0_p1[(i1+56)]*fieldData_Solution_0_p1[(i1+7)]))+(fieldData_LaplaceCoeff_0_p1[(i1+80)]*fieldData_Solution_0_p1[(i1+14)]))+(fieldData_LaplaceCoeff_0_p1[(i1+104)]*fieldData_Solution_0_p1[(i1+2)]))+(fieldData_LaplaceCoeff_0_p1[(i1+128)]*fieldData_Solution_0_p1[(i1+1)]))+(fieldData_LaplaceCoeff_0_p1[(i1+152)]*fieldData_Solution_0_p1[(i1+13)]))+(fieldData_LaplaceCoeff_0_p1[(i1+176)]*fieldData_Solution_0_p1[(i1+3)]))+(fieldData_LaplaceCoeff_0_p1[(i1+200)]*fieldData_Solution_0_p1[(i1+15)])));
}
for (; (i1<((iterationOffsetEnd[1][0]+i0)-1)); i1 += 4) {
/* fieldData_Residual_0_p1[(i1+8)] = (fieldData_RHS_0_p1[i1]-(((((((((fieldData_LaplaceCoeff_0_p1[(i1+8)]*fieldData_Solution_0_p1[(i1+8)])+(fieldData_LaplaceCoeff_0_p1[(i1+32)]*fieldData_Solution_0_p1[(i1+9)]))+(fieldData_LaplaceCoeff_0_p1[(i1+56)]*fieldData_Solution_0_p1[(i1+7)]))+(fieldData_LaplaceCoeff_0_p1[(i1+80)]*fieldData_Solution_0_p1[(i1+14)]))+(fieldData_LaplaceCoeff_0_p1[(i1+104)]*fieldData_Solution_0_p1[(i1+2)]))+(fieldData_LaplaceCoeff_0_p1[(i1+128)]*fieldData_Solution_0_p1[(i1+1)]))+(fieldData_LaplaceCoeff_0_p1[(i1+152)]*fieldData_Solution_0_p1[(i1+13)]))+(fieldData_LaplaceCoeff_0_p1[(i1+176)]*fieldData_Solution_0_p1[(i1+3)]))+(fieldData_LaplaceCoeff_0_p1[(i1+200)]*fieldData_Solution_0_p1[(i1+15)]))); */
__m128d vec0 = _mm_loadu_pd((&fieldData_RHS_0_p1[i1]));
__m128d vec0_2 = _mm_loadu_pd((&fieldData_RHS_0_p1[(i1+2)]));
__m128d vec1 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+200)]));
__m128d vec1_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+202)]));
__m128d vec2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+15)]));
__m128d vec2_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+17)]));
__m128d vec3 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+176)]));
__m128d vec3_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+178)]));
__m128d vec4 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+3)]));
__m128d vec4_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+5)]));
__m128d vec5 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+152)]));
__m128d vec5_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+154)]));
__m128d vec6 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+13)]));
__m128d vec6_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+15)]));
__m128d vec7 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+128)]));
__m128d vec7_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+130)]));
__m128d vec8 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+1)]));
__m128d vec8_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+3)]));
__m128d vec9 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+104)]));
__m128d vec9_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+106)]));
__m128d vec10 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+2)]));
__m128d vec10_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+4)]));
__m128d vec11 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+80)]));
__m128d vec11_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+82)]));
__m128d vec12 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+14)]));
__m128d vec12_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+16)]));
__m128d vec13 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+56)]));
__m128d vec13_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+58)]));
__m128d vec14 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+7)]));
__m128d vec14_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+9)]));
__m128d vec15 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+8)]));
__m128d vec15_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+10)]));
__m128d vec16 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+8)]));
__m128d vec16_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+10)]));
__m128d vec17 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+32)]));
__m128d vec17_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+34)]));
__m128d vec18 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+9)]));
__m128d vec18_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+11)]));
__m128d vec19;
__m128d vec19_2;
vec19 = _mm_sub_pd(vec0, _mm_add_pd(_mm_mul_pd(vec1, vec2), _mm_add_pd(_mm_mul_pd(vec3, vec4), _mm_add_pd(_mm_mul_pd(vec5, vec6), _mm_add_pd(_mm_mul_pd(vec7, vec8), _mm_add_pd(_mm_mul_pd(vec9, vec10), _mm_add_pd(_mm_mul_pd(vec11, vec12), _mm_add_pd(_mm_mul_pd(vec13, vec14), _mm_add_pd(_mm_mul_pd(vec15, vec16), _mm_mul_pd(vec17, vec18))))))))));
vec19_2 = _mm_sub_pd(vec0_2, _mm_add_pd(_mm_mul_pd(vec1_2, vec2_2), _mm_add_pd(_mm_mul_pd(vec3_2, vec4_2), _mm_add_pd(_mm_mul_pd(vec5_2, vec6_2), _mm_add_pd(_mm_mul_pd(vec7_2, vec8_2), _mm_add_pd(_mm_mul_pd(vec9_2, vec10_2), _mm_add_pd(_mm_mul_pd(vec11_2, vec12_2), _mm_add_pd(_mm_mul_pd(vec13_2, vec14_2), _mm_add_pd(_mm_mul_pd(vec15_2, vec16_2), _mm_mul_pd(vec17_2, vec18_2))))))))));
_mm_storeu_pd((&fieldData_Residual_0_p1[(i1+8)]), vec19);
_mm_storeu_pd((&fieldData_Residual_0_p1[(i1+10)]), vec19_2);
}
for (; (i1<((iterationOffsetEnd[1][0]+i0)+2)); i1 += 1) {
fieldData_Residual_0_p1[(i1+8)] = (fieldData_RHS_0_p1[i1]-(((((((((fieldData_LaplaceCoeff_0_p1[(i1+8)]*fieldData_Solution_0_p1[(i1+8)])+(fieldData_LaplaceCoeff_0_p1[(i1+32)]*fieldData_Solution_0_p1[(i1+9)]))+(fieldData_LaplaceCoeff_0_p1[(i1+56)]*fieldData_Solution_0_p1[(i1+7)]))+(fieldData_LaplaceCoeff_0_p1[(i1+80)]*fieldData_Solution_0_p1[(i1+14)]))+(fieldData_LaplaceCoeff_0_p1[(i1+104)]*fieldData_Solution_0_p1[(i1+2)]))+(fieldData_LaplaceCoeff_0_p1[(i1+128)]*fieldData_Solution_0_p1[(i1+1)]))+(fieldData_LaplaceCoeff_0_p1[(i1+152)]*fieldData_Solution_0_p1[(i1+13)]))+(fieldData_LaplaceCoeff_0_p1[(i1+176)]*fieldData_Solution_0_p1[(i1+3)]))+(fieldData_LaplaceCoeff_0_p1[(i1+200)]*fieldData_Solution_0_p1[(i1+15)])));
}
}
}
}
}
Example #28
void transpose_4321_loop_3241_( double *unsorted, double *sorted,
        int *p_dim1, int *p_dim2, int *p_dim3, int *p_dim4, double *p_factor ) {

    int dim1,dim2,dim3,dim4;
    int dim1mod,dim2mod,dim3mod,dim4mod;
    unsigned int old_offset,new_offset;
    unsigned int j1,j2,j3,j4;
    double factor = *p_factor;
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1,fac_vector;
    unsigned int N1,N2;
    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);


    dim1 = *p_dim1;
    dim2 = *p_dim2;
    dim3 = *p_dim3;
    dim4 = *p_dim4;

    N1 = dim1*dim2*dim3;   /* stride of j4 in sorted */
    N2 = dim2*dim3*dim4;   /* stride of j1 in unsorted */

    dim1mod = (int) floor( (float)dim1 / (float) 4);
    dim2mod = (int) floor( (float)dim2 / (float) 4);
    dim3mod = (int) floor( (float)dim3 / (float) 4);
    dim4mod = (int) floor( (float)dim4 / (float) 4);

    /* pluto start (dim1,dim2,dim3,dim4) */
#pragma ivdep
#pragma parallel
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
    for( j3 = 0; j3<dim3; j3++) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
        for( j2 = 0; j2<dim2; j2++) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
#pragma vector always
            for( j4 = 0; j4<dim4; j4+=2) {
#pragma loop count min(10) max(80) avg(40)
#pragma unroll
#pragma vector always
                for( j1 = 0; j1<dim1; j1+=2) {
                    //sorted[j1+dim1*(j2+dim2*(j3+dim3*j4))] = unsorted[j4+dim4*(j3+dim3*(j2+dim2*j1))] * factor;

                    pA = unsorted + j4+dim4*(j3+dim3*(j2+dim2*j1));
                    pB = sorted   + j1+dim1*(j2+dim2*(j3+dim3*j4));
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);

                }
            }
        }
    }
    /* pluto end */
    return;
}
Example #29
AABB3d TriangleItemHandler::clip(
    const size_t                        item_index,
    const size_t                        dimension,
    const double                        slab_min,
    const double                        slab_max) const
{
    const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index];

    if (vertex_info.m_motion_segment_count > 0)
    {
        AABB3d triangle_bbox = m_triangle_bboxes[item_index];

        if (triangle_bbox.min[dimension] < slab_min)
            triangle_bbox.min[dimension] = slab_min;

        if (triangle_bbox.max[dimension] > slab_max)
            triangle_bbox.max[dimension] = slab_max;

        return triangle_bbox;
    }

#ifdef APPLESEED_USE_SSE

    APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const double v0d = v0[dimension];
    const double v1d = v1[dimension];
    const double v2d = v2[dimension];

    const int v0_ge_min = v0d >= slab_min ? 1 : 0;
    const int v0_le_max = v0d <= slab_max ? 1 : 0;
    const int v1_ge_min = v1d >= slab_min ? 1 : 0;
    const int v1_le_max = v1d <= slab_max ? 1 : 0;
    const int v2_ge_min = v2d >= slab_min ? 1 : 0;
    const int v2_le_max = v2d <= slab_max ? 1 : 0;

    __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max());
    __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max());

    const __m128d v0_xy = _mm_load_pd(&v0.x);
    const __m128d v0_zz = _mm_set1_pd(v0.z);
    const __m128d v1_xy = _mm_load_pd(&v1.x);
    const __m128d v1_zz = _mm_set1_pd(v1.z);
    const __m128d v2_xy = _mm_load_pd(&v2.x);
    const __m128d v2_zz = _mm_set1_pd(v2.z);

    if (v0_ge_min & v0_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz);
    }

    if (v1_ge_min & v1_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz);
    }

    if (v2_ge_min & v2_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz);
    }

    const int v0v1_cross_min = v0_ge_min ^ v1_ge_min;
    const int v0v1_cross_max = v0_le_max ^ v1_le_max;
    const int v1v2_cross_min = v1_ge_min ^ v2_ge_min;
    const int v1v2_cross_max = v1_le_max ^ v2_le_max;
    const int v2v0_cross_min = v2_ge_min ^ v0_ge_min;
    const int v2v0_cross_max = v2_le_max ^ v0_le_max;

    if (v0v1_cross_min | v0v1_cross_max)
    {
        const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]);

        if (v0v1_cross_min)
        {
            const double t = (slab_min - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v0v1_cross_max)
        {
            const double t = (slab_max - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v1v2_cross_min | v1v2_cross_max)
    {
        const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]);

        if (v1v2_cross_min)
        {
            const double t = (slab_min - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v1v2_cross_max)
        {
            const double t = (slab_max - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    if (v2v0_cross_min | v2v0_cross_max)
    {
        const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]);

        if (v2v0_cross_min)
        {
            const double t = (slab_min - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v2v0_cross_max)
        {
            const double t = (slab_max - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);
            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    APPLESEED_SIMD4_ALIGN AABB3d bbox;

    _mm_store_pd(&bbox.min.x, bbox_min_xy);
    _mm_store_sd(&bbox.min.z, bbox_min_zz);
    _mm_storeu_pd(&bbox.max.x, bbox_max_xy);
    _mm_store_sd(&bbox.max.z, bbox_max_zz);

    if (bbox.min[dimension] < slab_min)
        bbox.min[dimension] = slab_min;

    if (bbox.max[dimension] > slab_max)
        bbox.max[dimension] = slab_max;

#else

    const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0;
    const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0;
    const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0;
    const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0;
    const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0;
    const int v2_le_max = v2[dimension] <= slab_max ? 1 : 0;

    AABB3d bbox;
    bbox.invalidate();

    if (v0_ge_min & v0_le_max)
        bbox.insert(v0);

    if (v1_ge_min & v1_le_max)
        bbox.insert(v1);

    if (v2_ge_min & v2_le_max)
        bbox.insert(v2);

    if (v0_ge_min != v1_ge_min)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min));

    if (v0_le_max != v1_le_max)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max));

    if (v1_ge_min != v2_ge_min)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min));

    if (v1_le_max != v2_le_max)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max));

    if (v2_ge_min != v0_ge_min)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min));

    if (v2_le_max != v0_le_max)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max));

#endif

    return bbox;
}
Example #30
void tce_sort_6_simd(double* unsorted,double* sorted,
                     int a, int b, int c, int d, int e, int f,
                     int i, int j, int k, int l, int m, int n,
                     double factor) {
    int id[6],jd[6],ia,ib,j1,j2,j3,j4,j5,j6;
    int l1,l2,l3,l4,l5,l6;
    int ia1,ia2,ia3,ia4,ia5,ia6;
    int ib1,ib2,ib3,ib4,ib5,ib6;
    int rangea1,rangea2,rangea3,rangea4,rangea5,rangea6;
    int rangeb1,rangeb2,rangeb3,rangeb4,rangeb5,rangeb6;
    int range[6],order[6],order_r[6];
    int jj1,jj2,jj3,jj4,jj5,jj6;
    int jj1_bound,jj2_bound,jj3_bound,jj4_bound,jj5_bound,jj6_bound;
    int N1,N2;

    double *pA, *pB;
    register __m128d x, y, z, w, p, q,fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    jd[0] = a;
    jd[1] = b;
    jd[2] = c;
    jd[3] = d;
    jd[4] = e;
    jd[5] = f;

    // prefer writes
    range[0] = b*c*d*e*f;
    range[1] = c*d*e*f;
    range[2] = d*e*f;
    range[3] = e*f;
    range[4] = f;
    range[5] = 1;

    l1 = jd[i];
    l2 = jd[j];
    l3 = jd[k];
    l4 = jd[l];
    l5 = jd[m];
    l6 = jd[n];


    rangea1 = range[i];
    rangea2 = range[j];
    rangea3 = range[k];
    rangea4 = range[l];
    rangea5 = range[m];
    rangea6 = range[n];


    rangeb1 = l2*l3*l4*l5*l6;
    rangeb2 = l3*l4*l5*l6;
    rangeb3 = l4*l5*l6;
    rangeb4 = l5*l6;
    rangeb5 = l6;
    rangeb6 = 1;

    // here vectorization can rely on the compiler
    if (n == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6++) {
                                ia = ia5 + j6*rangea6;
                                ib = ib5 + j6*rangeb6;
                                sorted[ib] = unsorted[ia] * factor;
                            }
                        }
                    }
                }
            }
        }
    }

    if (m == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5 += tilesize) {
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj5_bound = (j5 + tilesize > l5)? l5 :j5+tilesize;
                                for (jj5 = j5; jj5 < jj5_bound; jj5 += 2) {
                                    ia5 = ia4 + jj5*rangea5;
                                    ib5 = ib4 + jj5*rangeb5;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia5 + jj6*rangea6;
                                        ib = ib5 + jj6*rangeb6;
                                        N1 = rangeb5;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (l == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4 += tilesize) {
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia3 + j5*rangea5;
                            ib5 = ib3 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj4_bound = (j4 + tilesize > l4)? l4 :j4+tilesize;
                                for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
                                    ia4 = ia5 + jj4*rangea4;
                                    ib4 = ib5 + jj4*rangeb4;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia4 + jj6*rangea6;
                                        ib = ib4 + jj6*rangeb6;
                                        N1 = rangeb4;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (k == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = ia1 + j2*rangea2;
                ib2 = ib1 + j2*rangeb2;
                for (j3 = 0; j3 < l3; j3 += tilesize) {
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia2 + j4*rangea4;
                        ib4 = ib2 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
                                for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
                                    ia3 = ia5 + jj3*rangea3;
                                    ib3 = ib5 + jj3*rangeb3;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia3 + jj6*rangea6;
                                        ib = ib3 + jj6*rangeb6;
                                        N1 = rangeb3;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }


    if (j == 5) {
        for (j1 = 0; j1 < l1; j1++) {
            ia1 = j1*rangea1;
            ib1 = j1*rangeb1;
            for (j2 = 0; j2 < l2; j2 += tilesize) {
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia1 + j3*rangea3;
                    ib3 = ib1 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
                                for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
                                    ia2 = ia5 + jj2*rangea2;
                                    ib2 = ib5 + jj2*rangeb2;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia2 + jj6*rangea6;
                                        ib = ib2 + jj6*rangeb6;
                                        N1 = rangeb2;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    if (i == 5) {
        for (j1 = 0; j1 < l1; j1 += tilesize) {
            for (j2 = 0; j2 < l2; j2++) {
                ia2 = j2*rangea2;
                ib2 = j2*rangeb2;
                for (j3 = 0; j3 < l3; j3++) {
                    ia3 = ia2 + j3*rangea3;
                    ib3 = ib2 + j3*rangeb3;
                    for (j4 = 0; j4 < l4; j4++) {
                        ia4 = ia3 + j4*rangea4;
                        ib4 = ib3 + j4*rangeb4;
                        for (j5 = 0; j5 < l5; j5++) {
                            ia5 = ia4 + j5*rangea5;
                            ib5 = ib4 + j5*rangeb5;
                            for (j6 = 0; j6 < l6; j6 += tilesize) {
                                jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
                                for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
                                    ia1 = ia5 + jj1*rangea1;
                                    ib1 = ib5 + jj1*rangeb1;
                                    jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                                    for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) {
                                        ia = ia1 + jj6*rangea6;
                                        ib = ib1 + jj6*rangeb6;
                                        N1 = rangeb1;
                                        N2 = rangea6;

                                        pA = unsorted+ia;
                                        pB = sorted+ib;
                                        x = _mm_loadu_pd(pA);
                                        x = _mm_mul_pd(x,fac_vector);
                                        y = _mm_loadu_pd(pA + N2);
                                        y = _mm_mul_pd(y,fac_vector);
                                        z = _mm_shuffle_pd( x, y, 0);
                                        w = _mm_shuffle_pd( x, y, 3);
                                        _mm_storeu_pd(pB,z);
                                        _mm_storeu_pd(pB + N1,w);
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

}