/*
 * Generated kernel (polyhedral "Scop" S185): set every point of field
 * Solution[8] -- including its ghost layers -- to `value`.
 * The level-8 field has a row stride of 260 doubles; +262 is the linearised
 * offset of the first written cell relative to the row base pointer.
 * Per row: a scalar peel loop up to an even i1, an SSE2 body storing
 * 2x2 doubles per iteration (2-wide, unrolled by two), then a scalar
 * remainder loop.
 * NOTE(review): machine-generated -- prefer regenerating over hand edits.
 */
void SetSolution_8(double value) {
	for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
		if (isValidForSubdomain[1]) {
			/* Statements in this Scop: S185 */
			for (int i0 = (iterationOffsetBegin[1][1]-1); (i0<=(iterationOffsetEnd[1][1]+257)); i0 += 1) {
				double* fieldData_Solution_8_p1 = (&fieldData_Solution[8][(i0*260)]);
				int i1 = (iterationOffsetBegin[1][0]-1);
				/* scalar peel: advance i1 to an even index */
				for (; (i1<(iterationOffsetBegin[1][0]&(~1))); i1 += 1) {
					fieldData_Solution_8_p1[(i1+262)] = value;
				}
				__m128d vec0 = _mm_set1_pd(value);
				/* vectorised body: four doubles (two 2-wide stores) per iteration */
				for (; (i1<(iterationOffsetEnd[1][0]+255)); i1 += 4) {
					/* fieldData_Solution_8_p1[(i1+262)] = value; */
					__m128d vec1;
					__m128d vec1_2;
					vec1 = vec0;
					vec1_2 = vec0;
					_mm_storeu_pd((&fieldData_Solution_8_p1[(i1+262)]), vec1);
					_mm_storeu_pd((&fieldData_Solution_8_p1[(i1+264)]), vec1_2);
				}
				/* scalar remainder */
				for (; (i1<(iterationOffsetEnd[1][0]+258)); i1 += 1) {
					fieldData_Solution_8_p1[(i1+262)] = value;
				}
			}
		}
	}
}
/*
 * Generated kernel (Scop S193): set every point of field Solution_GMRF[7]
 * -- including its ghost layers -- to `value`.
 * The level-7 field has a row stride of 132 doubles; +134 is the linearised
 * offset of the first written cell relative to the row base pointer.
 * Same peel / SSE2 / remainder structure as SetSolution_8 above.
 * NOTE(review): machine-generated -- prefer regenerating over hand edits.
 */
void SetSolution_GMRF_7(double value) {
	for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
		if (isValidForSubdomain[0]) {
			/* Statements in this Scop: S193 */
			for (int i0 = (iterationOffsetBegin[0][1]-1); (i0<=(iterationOffsetEnd[0][1]+129)); i0 += 1) {
				double* fieldData_Solution_GMRF_7_p1 = (&fieldData_Solution_GMRF[7][(i0*132)]);
				int i1 = (iterationOffsetBegin[0][0]-1);
				/* scalar peel: advance i1 to an even index */
				for (; (i1<(iterationOffsetBegin[0][0]&(~1))); i1 += 1) {
					fieldData_Solution_GMRF_7_p1[(i1+134)] = value;
				}
				__m128d vec0 = _mm_set1_pd(value);
				/* vectorised body: four doubles per iteration */
				for (; (i1<(iterationOffsetEnd[0][0]+127)); i1 += 4) {
					/* fieldData_Solution_GMRF_7_p1[(i1+134)] = value; */
					__m128d vec1;
					__m128d vec1_2;
					vec1 = vec0;
					vec1_2 = vec0;
					_mm_storeu_pd((&fieldData_Solution_GMRF_7_p1[(i1+134)]), vec1);
					_mm_storeu_pd((&fieldData_Solution_GMRF_7_p1[(i1+136)]), vec1_2);
				}
				/* scalar remainder */
				for (; (i1<(iterationOffsetEnd[0][0]+130)); i1 += 1) {
					fieldData_Solution_GMRF_7_p1[(i1+134)] = value;
				}
			}
		}
	}
}
/*
 * Generated kernel (Scop S279): zero the right-hand-side field
 * RHS_GMRF[8] (row stride 258 doubles, no ghost offset in i1).
 * Per row: scalar peel to an even i1, SSE2 body writing four zeros per
 * iteration, scalar remainder.
 * NOTE(review): machine-generated -- prefer regenerating over hand edits.
 */
void InitRHS_GMRF() {
	for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
		if (isValidForSubdomain[0]) {
			/* Statements in this Scop: S279 */
			for (int i0 = iterationOffsetBegin[0][1]; (i0<=(iterationOffsetEnd[0][1]+256)); i0 += 1) {
				double* fieldData_RHS_GMRF_8_p1 = (&fieldData_RHS_GMRF[8][(i0*258)]);
				int i1 = iterationOffsetBegin[0][0];
				/* scalar peel: advance i1 to an even index */
				for (; (i1<((iterationOffsetBegin[0][0]+1)&(~1))); i1 += 1) {
					fieldData_RHS_GMRF_8_p1[i1] = 0.000000e+00;
				}
				__m128d vec0 = _mm_set1_pd(0.000000e+00);
				/* vectorised body: four zeros per iteration */
				for (; (i1<(iterationOffsetEnd[0][0]+254)); i1 += 4) {
					/* fieldData_RHS_GMRF_8_p1[i1] = 0.000000e+00; */
					__m128d vec1;
					__m128d vec1_2;
					vec1 = vec0;
					vec1_2 = vec0;
					_mm_storeu_pd((&fieldData_RHS_GMRF_8_p1[i1]), vec1);
					_mm_storeu_pd((&fieldData_RHS_GMRF_8_p1[(i1+2)]), vec1_2);
				}
				/* scalar remainder */
				for (; (i1<(iterationOffsetEnd[0][0]+257)); i1 += 1) {
					fieldData_RHS_GMRF_8_p1[i1] = 0.000000e+00;
				}
			}
		}
	}
}
/*
 * SSE2 demo: compute C = C + A x B for 2x2 matrices of doubles stored
 * column-major, then print the result.
 *
 *   A = |1 3|   B = |3 0|   C starts as |0 0|
 *       |2 4|       |0 2|               |0 0|
 *
 * Each column of C is kept in one __m128d for the whole computation:
 * column j of C accumulates (column k of A) * B[k][j] over k.
 */
int main( int argc, char **argv )
{
    double A[4] = {1,2,3,4}, B[4] = {3,0,0,2}, C[4] = {0,0,0,0};

    /* both columns of C live in registers until the final store */
    __m128d ccol0 = _mm_loadu_pd( C + 0 );   /* (C[0], C[1]) */
    __m128d ccol1 = _mm_loadu_pd( C + 2 );   /* (C[2], C[3]) */

    for ( int k = 0; k < 2; k++ )
    {
        __m128d acol = _mm_loadu_pd( A + 2*k );   /* column k of A          */
        __m128d b0   = _mm_load1_pd( B + k );     /* B[k][0] in both lanes  */
        __m128d b1   = _mm_load1_pd( B + k + 2 ); /* B[k][1] in both lanes  */
        ccol0 = _mm_add_pd( ccol0, _mm_mul_pd( acol, b0 ) );
        ccol1 = _mm_add_pd( ccol1, _mm_mul_pd( acol, b1 ) );
    }

    /* write the accumulated columns back */
    _mm_storeu_pd( C + 0, ccol0 );
    _mm_storeu_pd( C + 2, ccol1 );

    /* print the three matrices row by row */
    printf( "|%g %g| * |%g %g| = |%g %g|\n", A[0], A[2], B[0], B[2], C[0], C[2] );
    printf( "|%g %g| |%g %g| |%g %g|\n", A[1], A[3], B[1], B[3], C[1], C[3] );
    return 0;
}
// multiply *p by v and applied to all n COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v) { #if defined(COREARRAY_SIMD_AVX) const __m256d v4 = _mm256_set1_pd(v); switch ((size_t)p & 0x1F) { case 0x08: if (n > 0) { (*p++) *= v; n--; } case 0x10: if (n > 0) { (*p++) *= v; n--; } case 0x18: if (n > 0) { (*p++) *= v; n--; } case 0x00: for (; n >= 4; n-=4) { _mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4)); p += 4; } if (n >= 2) { _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4))); p += 2; n -= 2; } break; default: for (; n >= 4; n-=4) { _mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4)); p += 4; } if (n >= 2) { _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4))); p += 2; n -= 2; } } #elif defined(COREARRAY_SIMD_SSE2) const __m128d v2 = _mm_set1_pd(v); switch ((size_t)p & 0x0F) { case 0x08: if (n > 0) { (*p++) *= v; n--; } case 0x00: for (; n >= 2; n-=2, p+=2) _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2)); break; default: for (; n >= 2; n-=2, p+=2) _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2)); } #endif for (; n > 0; n--) (*p++) *= v; }
/*
 * 2-tap FIR step for interleaved stereo doubles:
 *   pdst[2j+c] = pflt[1]*psrc[2j+c] + pflt[0]*psrc[2j+2+c]   (c in {0,1})
 * for j in [0, n); one SSE2 iteration produces one stereo frame.
 * The branch only selects aligned vs unaligned loads for psrc; stores to
 * pdst are always unaligned.
 * NOTE(review): reads up to psrc[2n+1], i.e. one frame beyond the n-th
 * input frame -- callers must provide that lookahead pair.
 */
void mlib_FIR_tap2f_d64s(
	mlib_d64 *pdst,
	const mlib_d64 *psrc,
	mlib_d64 *pflt,
	mlib_s32 n)
{
	mlib_s32 j;
	mlib_d64 src1_1, src2_1;   /* NOTE(review): unused, kept from template */
	mlib_d64 src1_2, src2_2;   /* NOTE(review): unused, kept from template */
	mlib_d64 dflt1 = pflt[0], dflt2 = pflt[1];
	__m128d sdflt1, sdflt2, ssrc1, ssrc2, smul1, smul2;

	/* broadcast both filter taps */
	sdflt2 = _mm_set1_pd(dflt2);
	sdflt1 = _mm_set1_pd(dflt1);

	if ((mlib_addr)psrc & 15) {
		/* source not 16-byte aligned: unaligned loads */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (j = 0; j < n; j++) {
			ssrc1 = _mm_loadu_pd(psrc);
			ssrc2 = _mm_loadu_pd(psrc + 2);
			smul1 = _mm_mul_pd(sdflt2, ssrc1);
			smul2 = _mm_mul_pd(sdflt1, ssrc2);
			smul1 = _mm_add_pd(smul1, smul2);
			_mm_storeu_pd(pdst, smul1);
			psrc += 2;
			pdst += 2;
		}
	}
	else {
		/* aligned source: same arithmetic with aligned loads */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (j = 0; j < n; j++) {
			ssrc1 = _mm_load_pd(psrc);
			ssrc2 = _mm_load_pd(psrc + 2);
			smul1 = _mm_mul_pd(sdflt2, ssrc1);
			smul2 = _mm_mul_pd(sdflt1, ssrc2);
			smul1 = _mm_add_pd(smul1, smul2);
			_mm_storeu_pd(pdst, smul1);
			psrc += 2;
			pdst += 2;
		}
	}
}
/*
 * SSE2 square matrix multiply-accumulate: C += A * B, all lda x lda,
 * column-major, unaligned data allowed.
 * For every (j, k) the scalar B[k][j] is broadcast and a column-sized
 * AXPY is applied to column j of C: eight rows per outer step (as four
 * 2-wide multiply-adds), then pairs, then a scalar tail for an odd lda.
 */
void square_dgemm(int lda, double* A, double* B, double* C)
{
    __m128d b_bcast, a_pair, c_pair;
    const int bound8 = lda / 8 * 8;   /* end of the 8-row unrolled region */
    const int bound2 = lda / 2 * 2;   /* end of the pairwise region       */

    for (int j = 0; j < lda; j++) {
        for (int k = 0; k < lda; k++) {
            /* broadcast B[k][j] into both lanes */
            b_bcast = _mm_load1_pd(B + k + j*lda);
            double* a_col = A + k*lda;   /* column k of A */
            double* c_col = C + j*lda;   /* column j of C */

            /* eight rows per iteration, as four 2-wide multiply-adds */
            for (int i = 0; i < bound8; i += 8) {
                for (int u = 0; u < 8; u += 2) {
                    a_pair = _mm_loadu_pd(a_col + i + u);
                    c_pair = _mm_loadu_pd(c_col + i + u);
                    c_pair = _mm_add_pd(c_pair, _mm_mul_pd(b_bcast, a_pair));
                    _mm_storeu_pd(c_col + i + u, c_pair);
                }
            }
            /* leftover pairs */
            for (int i = bound8; i < bound2; i += 2) {
                a_pair = _mm_loadu_pd(a_col + i);
                c_pair = _mm_loadu_pd(c_col + i);
                c_pair = _mm_add_pd(c_pair, _mm_mul_pd(b_bcast, a_pair));
                _mm_storeu_pd(c_col + i, c_pair);
            }
            /* odd final row */
            for (int i = bound2; i < lda; i++) {
                C[i + j*lda] += A[i + k*lda] * B[k + j*lda];
            }
        }
    }
}
/*
 * b = factor * a^T for an N1 x N2 row-major matrix a (b is N2 x N1).
 * The (N1/tilesize) x (N2/tilesize) aligned region is processed in
 * 5*tilesize super-tiles of tilesize tiles, each tile as 2x2 SSE2
 * micro-blocks (load two rows, scale, shuffle into two columns, store);
 * the ragged right columns and bottom rows are handled scalar.
 *
 * Bug fixed: jt_bound was clamped against itt_bound (the ROW tile bound)
 * instead of jtt_bound (the COLUMN tile bound), letting jt run past the
 * tiled column region when jtt+5*tilesize exceeded jtt_bound but not
 * itt_bound -- out-of-range tile reads/writes and double-writing of the
 * edge columns.  Also removed the unused locals (k, k1, conflict, tmp,
 * set[], t, t1, ...) from the original.
 */
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor)
{
    int i, j, it, jt, itt, jtt;
    int it_bound, jt_bound, itt_bound, jtt_bound;
    double *pA, *pB;
    __m128d x, y, z, w, fac_vector;

    /* broadcast factor into both lanes */
    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector, fac_vector);

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt = itt + 5*tilesize) {
        jtt_bound = (N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt = jtt + 5*tilesize) {
            it_bound = (itt + 5*tilesize > itt_bound) ? itt_bound : itt + 5*tilesize;
            for (it = itt; it < it_bound; it = it + tilesize) {
                /* FIX: clamp against jtt_bound, not itt_bound */
                jt_bound = (jtt + 5*tilesize > jtt_bound) ? jtt_bound : jtt + 5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt + tilesize) {
                    for (j = jt; j < jt + tilesize; j = j + 2) {
                        for (i = it; i < it + tilesize; i = i + 2) {
                            /* scale a 2x2 sub-block of a and store it transposed into b */
                            pA = a + i*N2 + j;
                            pB = b + j*N1 + i;
                            x = _mm_mul_pd(_mm_loadu_pd(pA), fac_vector);
                            y = _mm_mul_pd(_mm_loadu_pd(pA + N2), fac_vector);
                            z = _mm_shuffle_pd(x, y, 0);   /* column j   of the pair */
                            w = _mm_shuffle_pd(x, y, 3);   /* column j+1 of the pair */
                            _mm_storeu_pd(pB, z);
                            _mm_storeu_pd(pB + N1, w);
                        }
                    }
                }
            }
        }
        /* columns to the right of the tiled region, for this stripe of rows */
        for (i = itt; i < itt + 5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1 + i] = factor * a[i*N2 + j];
            }
        }
    }
    /* rows below the tiled region */
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1 + i] = factor * a[i*N2 + j];
        }
    }
}
// *p += (*s) * v COREARRAY_DLL_DEFAULT double *vec_f64_addmul(double *p, const double *s, size_t n, double v) { #if defined(COREARRAY_SIMD_SSE2) const __m128d v2 = _mm_set1_pd(v); switch ((size_t)p & 0x0F) { case 0x08: if (n > 0) { (*p++) += (*s++) * v; n--; } case 0x00: for (; n >= 2; n -= 2) { _mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_mul_pd(_mm_loadu_pd(s), v2))); p += 2; s += 2; } break; default: for (; n >= 2; n-=2) { _mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_mul_pd(_mm_loadu_pd(s), v2))); p += 2; s += 2; } } #endif for (; n > 0; n--) (*p++) += (*s++) * v; return p; }
/*
 * C = A * B with B = transpose(A), i.e. C[i + j*m] = sum_k A[i+k*m]*A[j+k*m]
 * (C = A * A^T).  A is m x n column-major; C is m x m column-major and is
 * fully overwritten.  B is a transposed working copy of A so the inner
 * loop reads both operands with unit stride.
 * NOTE(review): assumes `transpose(m, n, A, B)` writes the n x m transpose
 * of A into B so that B[k + j*n] == A[j + k*m] -- confirm against its
 * definition elsewhere in this file.
 *
 * Fixes over the previous version:
 *  - `sum` was read without ever being initialized (undefined behavior);
 *  - float data was loaded through double-precision intrinsics
 *    (_mm_load1_pd / _mm_loadu_pd on float*), reinterpreting the bits of
 *    two floats as one double and computing garbage;
 *  - _mm_mul_sd multiplied only the low lane, so the packed loads were
 *    pointless and the high lane of `sum` was meaningless;
 *  - the remainder loop read A[k + jtn] where the transposed copy
 *    B[k + jtn] was intended.
 */
void sgemm( int m, int n, float *A, float *C )
{
    float B[n * m];          /* B = A^T; VLA sized like the original */
    transpose(m, n, A, B);

    for (int i = 0; i < m; i += 1) {
        for (int j = 0; j < m; j += 1) {
            const int jtn = j * n;   /* start of row j of A inside B */
            float sum = 0.0f;        /* accumulator, properly initialized */
            for (int k = 0; k < n; k += 1) {
                sum += A[i + k * m] * B[k + jtn];
            }
            C[i + j * m] = sum;
        }
    }
}
static void sse3_test_movddup_mem (double *i1, double *r) { __m128d t1 = _mm_loaddup_pd (i1); _mm_storeu_pd (r, t1); }
static void sse3_test_movddup_reg_subsume (double *i1, double *r) { __m128d t1 = _mm_load_pd (i1); __m128d t2 = _mm_movedup_pd (t1); _mm_storeu_pd (r, t2); }
/*
 * Clang codegen test for _mm_storeu_pd.  The CHECK comments below are
 * FileCheck directives executed by the test harness (not documentation):
 * they pin that the unaligned store lowers to an align-1 IR store and to
 * a movupd instruction.  Do not edit them casually.
 */
void test_mm_storeu_pd(double* A, __m128d B) {
  // DAG-LABEL: test_mm_storeu_pd
  // DAG: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 1
  //
  // ASM-LABEL: test_mm_storeu_pd
  // ASM: movupd
  _mm_storeu_pd(A, B);
}
/* Self-checking SSE2 smoke test: broadcast 3.0, store the pair
   unaligned, and abort if either lane read back differs. */
static void
sse2_test (void)
{
  double out[2];
  _mm_storeu_pd (out, _mm_set1_pd (3));
  if (out[0] != 3.0 || out[1] != 3.0)
    __builtin_abort ();
}
/* OCaml stub: element-wise sum of two boxed (re, im) double pairs,
   returned as a fresh Double_array_tag block. */
value complex_add(value vx, value vy)
{
  CAMLparam2(vx, vy);
  CAMLlocal1(vz);
  vz = caml_alloc(Double_array_tag, 2);
  __m128d x = _mm_loadu_pd((double const*) vx);
  __m128d y = _mm_loadu_pd((double const*) vy);
  _mm_storeu_pd((double*) vz, _mm_add_pd(x, y));
  CAMLreturn(vz);
}
static void sse3_test_haddpd (double *i1, double *i2, double *r) { __m128d t1 = _mm_loadu_pd (i1); __m128d t2 = _mm_loadu_pd (i2); t1 = _mm_hadd_pd (t1, t2); _mm_storeu_pd (r, t1); }
static void sse3_test_hsubpd_subsume (double *i1, double *i2, double *r) { __m128d t1 = _mm_load_pd (i1); __m128d t2 = _mm_load_pd (i2); t1 = _mm_hsub_pd (t1, t2); _mm_storeu_pd (r, t1); }
/* Multiply the pair at i1 by the first pair of the file-scope table
   cnst1, duplicate the low lane of the product (register movddup), and
   store the result unaligned at r. */
static void
sse3_test_movddup_reg (double *i1, double *r)
{
  __m128d prod = _mm_mul_pd (_mm_loadu_pd (i1), _mm_loadu_pd (&cnst1[0]));
  _mm_storeu_pd (r, _mm_movedup_pd (prod));
}
/* OCaml stub: SSE3 complex product (a+bi)(c+di) = (ac-bd) + (bc+ad)i,
   returned as a fresh Double_array_tag block. */
value complex_mul(value vab, value vcd)
{
  CAMLparam2(vab, vcd);
  CAMLlocal1(vz);
  vz = caml_alloc(Double_array_tag, 2);
  __m128d x = _mm_loadu_pd((double const*) vab);   /* (a, b) */
  __m128d y = _mm_loadu_pd((double const*) vcd);   /* (c, d) */
  __m128d direct = _mm_mul_pd(x, y);               /* (ac, bd) */
  __m128d xswap  = _mm_shuffle_pd(x, x, 1);        /* (b, a)  */
  __m128d cross  = _mm_mul_pd(xswap, y);           /* (bc, ad) */
  /* addsub: low lane subtracts, high lane adds -> (ac-bd, bc+ad) */
  _mm_storeu_pd((double*) vz, _mm_addsub_pd(direct, cross));
  CAMLreturn(vz);
}
/*
 * Copy the old_size x old_size payload of the padded new_size x new_size
 * matrix back into old_matrix, then release the padded copy.
 * Rows are copied two doubles at a time (aligned load / unaligned store);
 * for odd old_size the last element of each row is copied separately so
 * the paired loop never crosses the row boundary.
 * NOTE(review): _mm_load_pd assumes each new_matrix row start is 16-byte
 * aligned (aligned allocation and even new_size) -- confirm at the
 * allocation site.
 * Ownership: new_matrix is freed here; callers must not touch it after.
 */
void copy_padding_back(int old_size, double* old_matrix, int new_size, double* new_matrix)
{
	if(old_size%2 == 1) {
		for(int i=0; i<old_size; i++) {
			double* addr_new = new_matrix + i * new_size;
			double* addr_old = old_matrix + i * old_size;
			for(int j=0; j<old_size - 1; j+=2) {
				__m128d v1 = _mm_load_pd(addr_new + j);
				_mm_storeu_pd(addr_old + j, v1);
			}
			/* odd tail: final element of row i */
			old_matrix[(i+1)*old_size-1]=new_matrix[i*new_size+old_size-1];
		}
	}
	else {
		for(int i=0; i<old_size; i++) {
			double* addr_new = new_matrix + i * new_size;
			double* addr_old = old_matrix + i * old_size;
			for(int j=0; j<old_size; j+=2) {
				__m128d v1 = _mm_load_pd(addr_new + j);
				_mm_storeu_pd(addr_old + j, v1);
			}
		}
	}
	free(new_matrix);
}
/*
 * Copy an M x N block from the packed buffer new_A (leading dimension
 * M_even) into A (leading dimension lda).
 * Rows advance by I_STRIDE -- presumably I_STRIDE == 2 to match the
 * 2-wide SSE2 copy (TODO confirm where I_STRIDE is defined); a ragged
 * final row (i_step == 1) falls back to a scalar copy.
 * NOTE(review): _mm_load_pd requires new_A + i + j*M_even to be 16-byte
 * aligned -- presumably guaranteed by the packed layout; verify.
 */
static void add_block(double* new_A, double* A, int M, int N, int lda, int M_even)
{
	__m128d a;
	int i_step;
	for (int j=0; j<N; j++) {
		for (int i=0; i<M; i+=I_STRIDE) {
			i_step = min(I_STRIDE,M-i);
			if (i_step == 1) {
				/* single leftover row: scalar copy */
				A[i+j*lda] = new_A[i+j*M_even];
			}
			else {
				a = _mm_load_pd(new_A + i + j*M_even);
				_mm_storeu_pd(A+i+j*lda,a);
			}
		}
	}
}
/*
 * v1[k] += lambda * v2[k] for k in [0, n).
 * Pairs are processed with aligned SSE2 ops when both pointers pass
 * ALGEBRA_IS_ALIGNED, otherwise with unaligned ops; the scalar loop
 * handles the odd remainder (v1/v2 have already advanced past the pairs).
 * NOTE(review): despite the _aligned_32 suffix, alignment is checked at
 * run time and unaligned input is handled.
 */
ALGEBRA_INLINE void vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	__m128d l1 = _mm_load1_pd(&lambda);   /* broadcast lambda */
	size_t q = n / 2;
	size_t r = n % 2;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* load 2 values from each array */
				__m128d i1 = _mm_load_pd(v1);
				__m128d j1 = _mm_load_pd(v2);
				/* multiply */
				j1 = _mm_mul_pd(j1, l1);
				/* add */
				i1 = _mm_add_pd(i1,j1);
				/* store */
				_mm_store_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
		else {
			for (k=0;k<q;k++) {
				/* load 2 values from each array (unaligned) */
				__m128d i1 = _mm_loadu_pd(v1);
				__m128d j1 = _mm_loadu_pd(v2);
				j1 = _mm_mul_pd(j1, l1);
				/* add (the original comment said "subtract" -- the code adds) */
				i1 = _mm_add_pd(i1,j1);
				/* store */
				_mm_storeu_pd(v1, i1);
				v1 += 2;
				v2 += 2;
			}
		}
	}
	/* scalar remainder */
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];
}
// from Intel's sample intrin_double_sample.c void multiply_SSE3(double xr, double xi, double yr, double yi, complex_num *z) { __m128d num1, num2, num3; // Duplicates lower vector element into upper vector element. // num1: [x.real, x.real] num1 = _mm_loaddup_pd(&xr); // Move y elements into a vector // num2: [y.img, y.real] num2 = _mm_set_pd(yi, yr); // Multiplies vector elements // num3: [(x.real*y.img), (x.real*y.real)] num3 = _mm_mul_pd(num2, num1); // num1: [x.img, x.img] num1 = _mm_loaddup_pd(&xi); // Swaps the vector elements // num2: [y.real, y.img] num2 = _mm_shuffle_pd(num2, num2, 1); // num2: [(x.img*y.real), (x.img*y.img)] num2 = _mm_mul_pd(num2, num1); // Adds upper vector element while subtracting lower vector element // num3: [((x.real *y.img)+(x.img*y.real)), // ((x.real*y.real)-(x.img*y.img))] num3 = _mm_addsub_pd(num3, num2); // Stores the elements of num3 into z _mm_storeu_pd((double *)z, num3); }
/*
 * 1-tap FIR (pure scaling) for doubles: pdst[j] = pflt[0] * psrc[j] for
 * j in [0, n).  One scalar element is peeled if psrc is not 16-byte
 * aligned so the main loop can use aligned loads; a trailing scalar loop
 * covers the final odd element.
 */
void mlib_FIR_tap1f_d64(
	mlib_d64 *pdst,
	const mlib_d64 *psrc,
	mlib_d64 *pflt,
	mlib_s32 n)
{
	mlib_s32 j;
	mlib_d64 dflt1 = pflt[0];
	__m128d sdflt1;
	__m128d ssrc1;
	__m128d smul1;

	j = 0;
	/* alignment peel */
	if ((mlib_addr)psrc & 15) {
		pdst[0] = dflt1 * psrc[0];
		psrc++;
		pdst++;
		j++;
	}

	sdflt1 = _mm_set1_pd(dflt1);   /* broadcast the tap */
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (; j < (n - 1); j += 2) {
		ssrc1 = _mm_load_pd(psrc);
		smul1 = _mm_mul_pd(sdflt1, ssrc1);
		_mm_storeu_pd(pdst, smul1);
		psrc += 2;
		pdst += 2;
	}
	/* scalar remainder */
	for (; j < n; j++) {
		pdst[0] = dflt1 * psrc[0];
		psrc++;
		pdst++;
	}
}
/*
 * 4D tensor permutation with scaling: sorted[ib] = unsorted[ia] * factor.
 * (i, j, k, l) name which input dimension feeds each output dimension;
 * the output extents are l1..l4 = jd[i], jd[j], jd[k], jd[l] and
 * rangea*/rangeb* are the corresponding input/output strides.
 * The l == 3 case (fastest input dim is also the fastest output dim) is a
 * plain scalar copy left for the compiler to vectorise.  The k == 3,
 * j == 3 and i == 3 cases use a tiled 2x2 SSE2 transpose micro-kernel:
 * load two stride-N2 rows, scale, shuffle into two columns, store with
 * stride N1.
 * NOTE(review): machine-generated (TCE); the 2-wide tile loops assume the
 * involved extents are even -- regenerate rather than hand-edit.
 */
void tce_sort_4_simd(double* unsorted,double* sorted, int a, int b, int c, int d, int i, int j, int k, int l, double factor)
{
	int id[4],jd[4],ia,ib,j1,j2,j3,j4;
	int l1,l2,l3,l4;
	int ia1,ia2,ia3,ia4;
	int ib1,ib2,ib3,ib4;
	int rangea1,rangea2,rangea3,rangea4;
	int rangeb1,rangeb2,rangeb3,rangeb4;
	int range[4],order[4],order_r[4];
	int jj1,jj2,jj3,jj4;
	int jj1_bound,jj2_bound,jj3_bound,jj4_bound;
	int count,ir,jr,kr,lr,N1,N2;
	double *pA, *pB;
	register __m128d x, y, z, w, t, t1,fac_vector;

	/* broadcast factor into both lanes */
	fac_vector = _mm_load_sd(&factor);
	fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

	jd[0] = a;
	jd[1] = b;
	jd[2] = c;
	jd[3] = d;
	// prefer writes
	range[0] = b*c*d;
	range[1] = c*d;
	range[2] = d;
	range[3] = 1;
	l1 = jd[i];
	l2 = jd[j];
	l3 = jd[k];
	l4 = jd[l];
	rangea1 = range[i];
	rangea2 = range[j];
	rangea3 = range[k];
	rangea4 = range[l];
	rangeb1 = l2*l3*l4;
	rangeb2 = l3*l4;
	rangeb3 = l4;
	rangeb4 = 1;
	// here vectorization can rely on the compiler
	if (l == 3) {
		for (j1 = 0; j1 < l1; j1++) {
			ia1 = j1*rangea1;
			ib1 = j1*rangeb1;
			for (j2 = 0; j2 < l2; j2++) {
				ia2 = ia1 + j2*rangea2;
				ib2 = ib1 + j2*rangeb2;
				for (j3 = 0; j3 < l3; j3++) {
					ia3 = ia2 + j3*rangea3;
					ib3 = ib2 + j3*rangeb3;
					for (j4 = 0; j4 < l4; j4++) {
						ia = ia3 + j4*rangea4;
						ib = ib3 + j4*rangeb4;
						sorted[ib] = unsorted[ia] * factor;
					}
				}
			}
		}
	}
	if (k == 3) {
		/* transpose the (j3, j4) plane in tilesize x tilesize tiles */
		for (j1 = 0; j1 < l1; j1++) {
			ia1 = j1*rangea1;
			ib1 = j1*rangeb1;
			for (j2 = 0; j2 < l2; j2++) {
				ia2 = ia1 + j2*rangea2;
				ib2 = ib1 + j2*rangeb2;
				for (j3 = 0; j3 < l3; j3 += tilesize) {
					for (j4 = 0; j4 < l4; j4 += tilesize) {
						jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
						for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) {
							ia3 = ia2 + jj3*rangea3;
							ib3 = ib2 + jj3*rangeb3;
							jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
							for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
								ia = ia3 + jj4*rangea4;
								ib = ib3 + jj4*rangeb4;
								N1 = rangeb3;
								N2 = rangea4;
								pA = unsorted+ia;
								pB = sorted+ib;
								/* 2x2 tile: load 2 rows, scale, shuffle into 2 columns */
								x = _mm_loadu_pd(pA);
								x = _mm_mul_pd(x,fac_vector);
								y = _mm_loadu_pd(pA + N2);
								y = _mm_mul_pd(y,fac_vector);
								z = _mm_shuffle_pd( x, y, 0);
								w = _mm_shuffle_pd( x, y, 3);
								_mm_storeu_pd(pB,z);
								_mm_storeu_pd(pB + N1,w);
							}
						}
					}
				}
			}
		}
	}
	if (j == 3) {
		/* transpose the (j2, j4) plane in tiles */
		for (j1 = 0; j1 < l1; j1++) {
			ia1 = j1*rangea1;
			ib1 = j1*rangeb1;
			for (j2 = 0; j2 < l2; j2 += tilesize) {
				for (j3 = 0; j3 < l3; j3++) {
					ia3 = ia1 + j3*rangea3;
					ib3 = ib1 + j3*rangeb3;
					for (j4 = 0; j4 < l4; j4 += tilesize) {
						jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
						for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) {
							ia2 = ia3 + jj2*rangea2;
							ib2 = ib3 + jj2*rangeb2;
							jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
							for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
								ia = ia2 + jj4*rangea4;
								ib = ib2 + jj4*rangeb4;
								N1 = rangeb2;
								N2 = rangea4;
								pA = unsorted+ia;
								pB = sorted+ib;
								/* 2x2 tile kernel (as above) */
								x = _mm_loadu_pd(pA);
								x = _mm_mul_pd(x,fac_vector);
								y = _mm_loadu_pd(pA + N2);
								y = _mm_mul_pd(y,fac_vector);
								z = _mm_shuffle_pd( x, y, 0);
								w = _mm_shuffle_pd( x, y, 3);
								_mm_storeu_pd(pB,z);
								_mm_storeu_pd(pB + N1,w);
							}
						}
					}
				}
			}
		}
	}
	if (i == 3) {
		/* transpose the (j1, j4) plane in tiles */
		for (j1 = 0; j1 < l1; j1 += tilesize) {
			for (j2 = 0; j2 < l2; j2++) {
				ia2 = j2*rangea2;
				ib2 = j2*rangeb2;
				for (j3 = 0; j3 < l3; j3++) {
					ia3 = ia2 + j3*rangea3;
					ib3 = ib2 + j3*rangeb3;
					for (j4 = 0; j4 < l4; j4 += tilesize) {
						jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
						for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) {
							ia1 = ia3 + jj1*rangea1;
							ib1 = ib3 + jj1*rangeb1;
							jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize;
							for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) {
								ia = ia1 + jj4*rangea4;
								ib = ib1 + jj4*rangeb4;
								N1 = rangeb1;
								N2 = rangea4;
								pA = unsorted+ia;
								pB = sorted+ib;
								/* 2x2 tile kernel (as above) */
								x = _mm_loadu_pd(pA);
								x = _mm_mul_pd(x,fac_vector);
								y = _mm_loadu_pd(pA + N2);
								y = _mm_mul_pd(y,fac_vector);
								z = _mm_shuffle_pd( x, y, 0);
								w = _mm_shuffle_pd( x, y, 3);
								_mm_storeu_pd(pB,z);
								_mm_storeu_pd(pB + N1,w);
							}
						}
					}
				}
			}
		}
	}
}
/*
 * Generated kernel (Scop S139): residual update on the level-3 GMRF grid,
 *   Residual = RHS - L(Solution)
 * where L is a 9-term stencil.  Each stencil entry has its own coefficient
 * plane inside LaplaceCoeff (the coefficient offsets step by 132 per
 * entry: +14, +146, +278, ...); the solution offsets (+14, +15, +13, +26,
 * +2, +1, +25, +3, +27) encode the neighbourhood in a skewed index space
 * (note i1 starts at iterationOffsetBegin + i0).
 * Ghost values are refreshed first via exchsolution_gmrfData_3(0).
 * Per row: scalar peel to an even i1, an SSE2 body producing 4 residuals
 * per iteration (2-wide, unrolled by two, unaligned loads), then a scalar
 * remainder.
 * NOTE(review): machine-generated -- regenerate rather than hand-edit.
 */
void UpResidual_GMRF_3() {
	exchsolution_gmrfData_3(0);
	for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
		if (isValidForSubdomain[0]) {
			/* Statements in this Scop: S139 */
			for (int i0 = iterationOffsetBegin[0][1]; (i0<=(iterationOffsetEnd[0][1]+8)); i0 += 1) {
				double* fieldData_Solution_GMRF_3_p1 = (&fieldData_Solution_GMRF[3][(i0*11)]);
				double* fieldData_RHS_GMRF_3_p1 = (&fieldData_RHS_GMRF[3][(i0*9)]);
				double* fieldData_LaplaceCoeff_GMRF_3_p1 = (&fieldData_LaplaceCoeff_GMRF[3][(i0*11)]);
				double* fieldData_Residual_GMRF_3_p1 = (&fieldData_Residual_GMRF[3][(i0*11)]);
				int i1 = (iterationOffsetBegin[0][0]+i0);
				/* scalar peel: advance i1 to an even index */
				for (; (i1<(((iterationOffsetBegin[0][0]+i0)+1)&(~1))); i1 += 1) {
					fieldData_Residual_GMRF_3_p1[(i1+14)] = (fieldData_RHS_GMRF_3_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_3_p1[(i1+14)]*fieldData_Solution_GMRF_3_p1[(i1+14)])+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+146)]*fieldData_Solution_GMRF_3_p1[(i1+15)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+278)]*fieldData_Solution_GMRF_3_p1[(i1+13)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+410)]*fieldData_Solution_GMRF_3_p1[(i1+26)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+542)]*fieldData_Solution_GMRF_3_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+674)]*fieldData_Solution_GMRF_3_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+806)]*fieldData_Solution_GMRF_3_p1[(i1+25)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+938)]*fieldData_Solution_GMRF_3_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1070)]*fieldData_Solution_GMRF_3_p1[(i1+27)])));
				}
				/* vectorised body: scalar reference is
				 *   Residual[i1+14] = RHS[i1] - sum of the nine coeff*solution
				 *   products (see the peel loop above). */
				for (; (i1<((iterationOffsetEnd[0][0]+i0)+6)); i1 += 4) {
					__m128d vec0 = _mm_loadu_pd((&fieldData_RHS_GMRF_3_p1[i1]));
					__m128d vec0_2 = _mm_loadu_pd((&fieldData_RHS_GMRF_3_p1[(i1+2)]));
					__m128d vec1 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1070)]));
					__m128d vec1_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1072)]));
					__m128d vec2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+27)]));
					__m128d vec2_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+29)]));
					__m128d vec3 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+938)]));
					__m128d vec3_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+940)]));
					__m128d vec4 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+3)]));
					__m128d vec4_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+5)]));
					__m128d vec5 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+806)]));
					__m128d vec5_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+808)]));
					__m128d vec6 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+25)]));
					__m128d vec6_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+27)]));
					__m128d vec7 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+674)]));
					__m128d vec7_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+676)]));
					__m128d vec8 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+1)]));
					__m128d vec8_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+3)]));
					__m128d vec9 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+542)]));
					__m128d vec9_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+544)]));
					__m128d vec10 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+2)]));
					__m128d vec10_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+4)]));
					__m128d vec11 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+410)]));
					__m128d vec11_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+412)]));
					__m128d vec12 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+26)]));
					__m128d vec12_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+28)]));
					__m128d vec13 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+278)]));
					__m128d vec13_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+280)]));
					__m128d vec14 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+13)]));
					__m128d vec14_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+15)]));
					__m128d vec15 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+14)]));
					__m128d vec15_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+16)]));
					__m128d vec16 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+14)]));
					__m128d vec16_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+16)]));
					__m128d vec17 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+146)]));
					__m128d vec17_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_GMRF_3_p1[(i1+148)]));
					__m128d vec18 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+15)]));
					__m128d vec18_2 = _mm_loadu_pd((&fieldData_Solution_GMRF_3_p1[(i1+17)]));
					__m128d vec19;
					__m128d vec19_2;
					vec19 = _mm_sub_pd(vec0, _mm_add_pd(_mm_mul_pd(vec1, vec2), _mm_add_pd(_mm_mul_pd(vec3, vec4), _mm_add_pd(_mm_mul_pd(vec5, vec6), _mm_add_pd(_mm_mul_pd(vec7, vec8), _mm_add_pd(_mm_mul_pd(vec9, vec10), _mm_add_pd(_mm_mul_pd(vec11, vec12), _mm_add_pd(_mm_mul_pd(vec13, vec14), _mm_add_pd(_mm_mul_pd(vec15, vec16), _mm_mul_pd(vec17, vec18))))))))));
					vec19_2 = _mm_sub_pd(vec0_2, _mm_add_pd(_mm_mul_pd(vec1_2, vec2_2), _mm_add_pd(_mm_mul_pd(vec3_2, vec4_2), _mm_add_pd(_mm_mul_pd(vec5_2, vec6_2), _mm_add_pd(_mm_mul_pd(vec7_2, vec8_2), _mm_add_pd(_mm_mul_pd(vec9_2, vec10_2), _mm_add_pd(_mm_mul_pd(vec11_2, vec12_2), _mm_add_pd(_mm_mul_pd(vec13_2, vec14_2), _mm_add_pd(_mm_mul_pd(vec15_2, vec16_2), _mm_mul_pd(vec17_2, vec18_2))))))))));
					_mm_storeu_pd((&fieldData_Residual_GMRF_3_p1[(i1+14)]), vec19);
					_mm_storeu_pd((&fieldData_Residual_GMRF_3_p1[(i1+16)]), vec19_2);
				}
				/* scalar remainder */
				for (; (i1<((iterationOffsetEnd[0][0]+i0)+9)); i1 += 1) {
					fieldData_Residual_GMRF_3_p1[(i1+14)] = (fieldData_RHS_GMRF_3_p1[i1]-(((((((((fieldData_LaplaceCoeff_GMRF_3_p1[(i1+14)]*fieldData_Solution_GMRF_3_p1[(i1+14)])+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+146)]*fieldData_Solution_GMRF_3_p1[(i1+15)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+278)]*fieldData_Solution_GMRF_3_p1[(i1+13)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+410)]*fieldData_Solution_GMRF_3_p1[(i1+26)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+542)]*fieldData_Solution_GMRF_3_p1[(i1+2)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+674)]*fieldData_Solution_GMRF_3_p1[(i1+1)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+806)]*fieldData_Solution_GMRF_3_p1[(i1+25)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+938)]*fieldData_Solution_GMRF_3_p1[(i1+3)]))+(fieldData_LaplaceCoeff_GMRF_3_p1[(i1+1070)]*fieldData_Solution_GMRF_3_p1[(i1+27)])));
				}
			}
		}
	}
}
/*
 * Generated kernel (Scop S127): residual update on the coarsest (level-0)
 * grid, Residual = RHS - L(Solution), with a 9-term stencil.  Coefficient
 * planes step by 24 inside LaplaceCoeff (+8, +32, +56, ...); the solution
 * offsets (+8, +9, +7, +14, +2, +1, +13, +3, +15) encode the
 * neighbourhood in a skewed index space (i1 starts at offsetBegin + i0).
 * Ghost values are refreshed first via exchsolutionData_0(0).
 * Same peel / SSE2 / remainder structure as UpResidual_GMRF_3.
 * NOTE(review): machine-generated -- regenerate rather than hand-edit.
 */
void UpResidual_0() {
	exchsolutionData_0(0);
	for (int fragmentIdx = 0; fragmentIdx < 1; ++fragmentIdx) {
		if (isValidForSubdomain[1]) {
			/* Statements in this Scop: S127 */
			for (int i0 = iterationOffsetBegin[1][1]; (i0<=(iterationOffsetEnd[1][1]+1)); i0 += 1) {
				double* fieldData_Solution_0_p1 = (&fieldData_Solution[0][(i0*5)]);
				double* fieldData_LaplaceCoeff_0_p1 = (&fieldData_LaplaceCoeff[0][(i0*5)]);
				double* fieldData_RHS_0_p1 = (&fieldData_RHS[0][i0]);
				double* fieldData_Residual_0_p1 = (&fieldData_Residual[0][(i0*5)]);
				int i1 = (iterationOffsetBegin[1][0]+i0);
				/* scalar peel: advance i1 to an even index */
				for (; (i1<(((iterationOffsetBegin[1][0]+i0)+1)&(~1))); i1 += 1) {
					fieldData_Residual_0_p1[(i1+8)] = (fieldData_RHS_0_p1[i1]-(((((((((fieldData_LaplaceCoeff_0_p1[(i1+8)]*fieldData_Solution_0_p1[(i1+8)])+(fieldData_LaplaceCoeff_0_p1[(i1+32)]*fieldData_Solution_0_p1[(i1+9)]))+(fieldData_LaplaceCoeff_0_p1[(i1+56)]*fieldData_Solution_0_p1[(i1+7)]))+(fieldData_LaplaceCoeff_0_p1[(i1+80)]*fieldData_Solution_0_p1[(i1+14)]))+(fieldData_LaplaceCoeff_0_p1[(i1+104)]*fieldData_Solution_0_p1[(i1+2)]))+(fieldData_LaplaceCoeff_0_p1[(i1+128)]*fieldData_Solution_0_p1[(i1+1)]))+(fieldData_LaplaceCoeff_0_p1[(i1+152)]*fieldData_Solution_0_p1[(i1+13)]))+(fieldData_LaplaceCoeff_0_p1[(i1+176)]*fieldData_Solution_0_p1[(i1+3)]))+(fieldData_LaplaceCoeff_0_p1[(i1+200)]*fieldData_Solution_0_p1[(i1+15)])));
				}
				/* vectorised body: scalar reference is
				 *   Residual[i1+8] = RHS[i1] - sum of the nine coeff*solution
				 *   products (see the peel loop above). */
				for (; (i1<((iterationOffsetEnd[1][0]+i0)-1)); i1 += 4) {
					__m128d vec0 = _mm_loadu_pd((&fieldData_RHS_0_p1[i1]));
					__m128d vec0_2 = _mm_loadu_pd((&fieldData_RHS_0_p1[(i1+2)]));
					__m128d vec1 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+200)]));
					__m128d vec1_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+202)]));
					__m128d vec2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+15)]));
					__m128d vec2_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+17)]));
					__m128d vec3 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+176)]));
					__m128d vec3_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+178)]));
					__m128d vec4 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+3)]));
					__m128d vec4_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+5)]));
					__m128d vec5 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+152)]));
					__m128d vec5_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+154)]));
					__m128d vec6 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+13)]));
					__m128d vec6_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+15)]));
					__m128d vec7 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+128)]));
					__m128d vec7_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+130)]));
					__m128d vec8 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+1)]));
					__m128d vec8_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+3)]));
					__m128d vec9 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+104)]));
					__m128d vec9_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+106)]));
					__m128d vec10 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+2)]));
					__m128d vec10_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+4)]));
					__m128d vec11 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+80)]));
					__m128d vec11_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+82)]));
					__m128d vec12 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+14)]));
					__m128d vec12_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+16)]));
					__m128d vec13 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+56)]));
					__m128d vec13_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+58)]));
					__m128d vec14 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+7)]));
					__m128d vec14_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+9)]));
					__m128d vec15 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+8)]));
					__m128d vec15_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+10)]));
					__m128d vec16 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+8)]));
					__m128d vec16_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+10)]));
					__m128d vec17 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+32)]));
					__m128d vec17_2 = _mm_loadu_pd((&fieldData_LaplaceCoeff_0_p1[(i1+34)]));
					__m128d vec18 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+9)]));
					__m128d vec18_2 = _mm_loadu_pd((&fieldData_Solution_0_p1[(i1+11)]));
					__m128d vec19;
					__m128d vec19_2;
					vec19 = _mm_sub_pd(vec0, _mm_add_pd(_mm_mul_pd(vec1, vec2), _mm_add_pd(_mm_mul_pd(vec3, vec4), _mm_add_pd(_mm_mul_pd(vec5, vec6), _mm_add_pd(_mm_mul_pd(vec7, vec8), _mm_add_pd(_mm_mul_pd(vec9, vec10), _mm_add_pd(_mm_mul_pd(vec11, vec12), _mm_add_pd(_mm_mul_pd(vec13, vec14), _mm_add_pd(_mm_mul_pd(vec15, vec16), _mm_mul_pd(vec17, vec18))))))))));
					vec19_2 = _mm_sub_pd(vec0_2, _mm_add_pd(_mm_mul_pd(vec1_2, vec2_2), _mm_add_pd(_mm_mul_pd(vec3_2, vec4_2), _mm_add_pd(_mm_mul_pd(vec5_2, vec6_2), _mm_add_pd(_mm_mul_pd(vec7_2, vec8_2), _mm_add_pd(_mm_mul_pd(vec9_2, vec10_2), _mm_add_pd(_mm_mul_pd(vec11_2, vec12_2), _mm_add_pd(_mm_mul_pd(vec13_2, vec14_2), _mm_add_pd(_mm_mul_pd(vec15_2, vec16_2), _mm_mul_pd(vec17_2, vec18_2))))))))));
					_mm_storeu_pd((&fieldData_Residual_0_p1[(i1+8)]), vec19);
					_mm_storeu_pd((&fieldData_Residual_0_p1[(i1+10)]), vec19_2);
				}
				/* scalar remainder */
				for (; (i1<((iterationOffsetEnd[1][0]+i0)+2)); i1 += 1) {
					fieldData_Residual_0_p1[(i1+8)] = (fieldData_RHS_0_p1[i1]-(((((((((fieldData_LaplaceCoeff_0_p1[(i1+8)]*fieldData_Solution_0_p1[(i1+8)])+(fieldData_LaplaceCoeff_0_p1[(i1+32)]*fieldData_Solution_0_p1[(i1+9)]))+(fieldData_LaplaceCoeff_0_p1[(i1+56)]*fieldData_Solution_0_p1[(i1+7)]))+(fieldData_LaplaceCoeff_0_p1[(i1+80)]*fieldData_Solution_0_p1[(i1+14)]))+(fieldData_LaplaceCoeff_0_p1[(i1+104)]*fieldData_Solution_0_p1[(i1+2)]))+(fieldData_LaplaceCoeff_0_p1[(i1+128)]*fieldData_Solution_0_p1[(i1+1)]))+(fieldData_LaplaceCoeff_0_p1[(i1+152)]*fieldData_Solution_0_p1[(i1+13)]))+(fieldData_LaplaceCoeff_0_p1[(i1+176)]*fieldData_Solution_0_p1[(i1+3)]))+(fieldData_LaplaceCoeff_0_p1[(i1+200)]*fieldData_Solution_0_p1[(i1+15)])));
				}
			}
		}
	}
}
/*
 * transpose_4321_loop_3241_: 4D tensor transpose with scaling.
 *
 * Computes, for all index tuples,
 *
 *     sorted[j1 + dim1*(j2 + dim2*(j3 + dim3*j4))] =
 *         unsorted[j4 + dim4*(j3 + dim3*(j2 + dim2*j1))] * factor;
 *
 * i.e. `unsorted` is laid out as [j1][j2][j3][j4] (j4 fastest-varying) and
 * `sorted` as [j4][j3][j2][j1] (j1 fastest-varying).
 *
 * All scalar arguments are passed by pointer (Fortran-callable, hence the
 * trailing underscore in the name).
 *
 * The kernel moves 2x2 blocks with SSE2 and has no scalar remainder loops,
 * so dim1 and dim4 MUST be even.
 *
 * Fixes vs. the previous version:
 *  - N1, the stride of j4 in `sorted` used by the second vector store, was
 *    dim2*dim3*dim4; the correct value is dim1*dim2*dim3.  The old code was
 *    only correct when dim1 == dim4.
 *  - Removed dead code: dim1mod..dim4mod (computed through a lossy
 *    float-floor and never read) and the unused registers t, t1.
 */
void transpose_4321_loop_3241_( double *unsorted, double *sorted, int *p_dim1, int *p_dim2, int *p_dim3, int *p_dim4, double *p_factor )
{
    const int dim1 = *p_dim1;
    const int dim2 = *p_dim2;
    const int dim3 = *p_dim3;
    const int dim4 = *p_dim4;
    const double factor = *p_factor;

    /* N2: stride of j1 in `unsorted`; N1: stride of j4 in `sorted` (doubles). */
    const int N2 = dim2 * dim3 * dim4;
    const int N1 = dim1 * dim2 * dim3;

    /* Broadcast the scale factor into both lanes once. */
    const __m128d fac_vector = _mm_set1_pd(factor);

    for (int j3 = 0; j3 < dim3; j3++) {
        for (int j2 = 0; j2 < dim2; j2++) {
            for (int j4 = 0; j4 < dim4; j4 += 2) {
                for (int j1 = 0; j1 < dim1; j1 += 2) {
                    /* 2x2 micro-transpose of the (j1, j4) plane:
                     * load two j4-pairs (for j1 and j1+1), swap lanes, and
                     * store two j1-pairs (for j4 and j4+1). */
                    const double *pA = unsorted + j4 + dim4*(j3 + dim3*(j2 + dim2*j1));
                    double *pB = sorted + j1 + dim1*(j2 + dim2*(j3 + dim3*j4));

                    __m128d x = _mm_mul_pd(_mm_loadu_pd(pA), fac_vector);      /* (j1,   j4), (j1,   j4+1) */
                    __m128d y = _mm_mul_pd(_mm_loadu_pd(pA + N2), fac_vector); /* (j1+1, j4), (j1+1, j4+1) */

                    __m128d z = _mm_shuffle_pd(x, y, 0); /* lane j4   for j1, j1+1 */
                    __m128d w = _mm_shuffle_pd(x, y, 3); /* lane j4+1 for j1, j1+1 */

                    _mm_storeu_pd(pB, z);
                    _mm_storeu_pd(pB + N1, w);
                }
            }
        }
    }
}
// Clip triangle #item_index against the axis-aligned slab
// [slab_min, slab_max] along `dimension` and return the bounding box of the
// clipped triangle.
AABB3d TriangleItemHandler::clip(
    const size_t item_index,
    const size_t dimension,
    const double slab_min,
    const double slab_max) const
{
    const TriangleVertexInfo& vertex_info = m_triangle_vertex_infos[item_index];

    // Motion-blurred triangle: don't clip geometrically, just clamp the
    // precomputed (whole-motion) bbox to the slab.  Conservative but valid.
    if (vertex_info.m_motion_segment_count > 0)
    {
        AABB3d triangle_bbox = m_triangle_bboxes[item_index];
        if (triangle_bbox.min[dimension] < slab_min)
            triangle_bbox.min[dimension] = slab_min;
        if (triangle_bbox.max[dimension] > slab_max)
            triangle_bbox.max[dimension] = slab_max;
        return triangle_bbox;
    }

#ifdef APPLESEED_USE_SSE

    // Aligned local copies of the three vertices so &v.x can be read with an
    // aligned SSE load below.
    APPLESEED_SIMD4_ALIGN const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    APPLESEED_SIMD4_ALIGN const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    APPLESEED_SIMD4_ALIGN const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const double v0d = v0[dimension];
    const double v1d = v1[dimension];
    const double v2d = v2[dimension];

    // Classify each vertex against the two slab planes (1 = inside that side).
    const int v0_ge_min = v0d >= slab_min ? 1 : 0;
    const int v0_le_max = v0d <= slab_max ? 1 : 0;
    const int v1_ge_min = v1d >= slab_min ? 1 : 0;
    const int v1_le_max = v1d <= slab_max ? 1 : 0;
    const int v2_ge_min = v2d >= slab_min ? 1 : 0;
    const int v2_le_max = v2d <= slab_max ? 1 : 0;

    // Running bbox kept in registers as (x, y) pairs plus a duplicated z
    // lane; start from an inverted (empty) box.
    __m128d bbox_min_xy = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_min_zz = _mm_set1_pd(+numeric_limits<double>::max());
    __m128d bbox_max_xy = _mm_set1_pd(-numeric_limits<double>::max());
    __m128d bbox_max_zz = _mm_set1_pd(-numeric_limits<double>::max());

    const __m128d v0_xy = _mm_load_pd(&v0.x);
    const __m128d v0_zz = _mm_set1_pd(v0.z);
    const __m128d v1_xy = _mm_load_pd(&v1.x);
    const __m128d v1_zz = _mm_set1_pd(v1.z);
    const __m128d v2_xy = _mm_load_pd(&v2.x);
    const __m128d v2_zz = _mm_set1_pd(v2.z);

    // Grow the bbox by every vertex that lies inside the slab.
    if (v0_ge_min & v0_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v0_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v0_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v0_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v0_zz);
    }

    if (v1_ge_min & v1_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v1_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v1_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v1_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v1_zz);
    }

    if (v2_ge_min & v2_le_max)
    {
        bbox_min_xy = _mm_min_pd(bbox_min_xy, v2_xy);
        bbox_max_xy = _mm_max_pd(bbox_max_xy, v2_xy);
        bbox_min_zz = _mm_min_pd(bbox_min_zz, v2_zz);
        bbox_max_zz = _mm_max_pd(bbox_max_zz, v2_zz);
    }

    // An edge crosses a slab plane iff its endpoints classify differently
    // (XOR of the inside/outside flags).
    const int v0v1_cross_min = v0_ge_min ^ v1_ge_min;
    const int v0v1_cross_max = v0_le_max ^ v1_le_max;
    const int v1v2_cross_min = v1_ge_min ^ v2_ge_min;
    const int v1v2_cross_max = v1_le_max ^ v2_le_max;
    const int v2v0_cross_min = v2_ge_min ^ v0_ge_min;
    const int v2v0_cross_max = v2_le_max ^ v0_le_max;

    // For each edge crossing a plane, insert the edge/plane intersection
    // point p = (1-t)*a + t*b into the bbox.  The reciprocal of the edge
    // extent along `dimension` is shared by both plane tests of an edge.

    // Edge v0 -> v1.
    if (v0v1_cross_min | v0v1_cross_max)
    {
        const double rcp_v0v1 = 1.0 / (v1[dimension] - v0[dimension]);

        if (v0v1_cross_min)
        {
            const double t = (slab_min - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);

            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v0v1_cross_max)
        {
            const double t = (slab_max - v0[dimension]) * rcp_v0v1;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);

            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v0_xy, mt1), _mm_mul_pd(v1_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v0_zz, mt1), _mm_mul_pd(v1_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Edge v1 -> v2.
    if (v1v2_cross_min | v1v2_cross_max)
    {
        const double rcp_v1v2 = 1.0 / (v2[dimension] - v1[dimension]);

        if (v1v2_cross_min)
        {
            const double t = (slab_min - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);

            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v1v2_cross_max)
        {
            const double t = (slab_max - v1[dimension]) * rcp_v1v2;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);

            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v1_xy, mt1), _mm_mul_pd(v2_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v1_zz, mt1), _mm_mul_pd(v2_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Edge v2 -> v0.
    if (v2v0_cross_min | v2v0_cross_max)
    {
        const double rcp_v2v0 = 1.0 / (v0[dimension] - v2[dimension]);

        if (v2v0_cross_min)
        {
            const double t = (slab_min - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);

            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }

        if (v2v0_cross_max)
        {
            const double t = (slab_max - v2[dimension]) * rcp_v2v0;
            assert(t >= 0.0 && t <= 1.0);

            const __m128d mt = _mm_set1_pd(t);
            const __m128d mt1 = _mm_set1_pd(1.0 - t);

            const __m128d p_xy = _mm_add_pd(_mm_mul_pd(v2_xy, mt1), _mm_mul_pd(v0_xy, mt));
            const __m128d p_zz = _mm_add_pd(_mm_mul_pd(v2_zz, mt1), _mm_mul_pd(v0_zz, mt));

            bbox_min_xy = _mm_min_pd(bbox_min_xy, p_xy);
            bbox_max_xy = _mm_max_pd(bbox_max_xy, p_xy);
            bbox_min_zz = _mm_min_pd(bbox_min_zz, p_zz);
            bbox_max_zz = _mm_max_pd(bbox_max_zz, p_zz);
        }
    }

    // Spill the register bbox back to memory.  min.x is at the (aligned)
    // start of the box; max.x presumably sits 3 doubles in and is therefore
    // not 16-byte aligned, hence the unaligned store -- confirm AABB3d layout.
    APPLESEED_SIMD4_ALIGN AABB3d bbox;
    _mm_store_pd(&bbox.min.x, bbox_min_xy);
    _mm_store_sd(&bbox.min.z, bbox_min_zz);
    _mm_storeu_pd(&bbox.max.x, bbox_max_xy);
    _mm_store_sd(&bbox.max.z, bbox_max_zz);

    // Guard against numerical drift: the result must stay within the slab.
    if (bbox.min[dimension] < slab_min)
        bbox.min[dimension] = slab_min;
    if (bbox.max[dimension] > slab_max)
        bbox.max[dimension] = slab_max;

#else

    // Scalar fallback: same algorithm, expressed with AABB3d/Vector3d ops.
    const Vector3d v0(m_triangle_vertices[vertex_info.m_vertex_index + 0]);
    const Vector3d v1(m_triangle_vertices[vertex_info.m_vertex_index + 1]);
    const Vector3d v2(m_triangle_vertices[vertex_info.m_vertex_index + 2]);

    const int v0_ge_min = v0[dimension] >= slab_min ? 1 : 0;
    const int v0_le_max = v0[dimension] <= slab_max ? 1 : 0;
    const int v1_ge_min = v1[dimension] >= slab_min ? 1 : 0;
    const int v1_le_max = v1[dimension] <= slab_max ? 1 : 0;
    const int v2_ge_min = v2[dimension] >= slab_min ? 1 : 0;
    const int v2_le_max = v2[dimension] <= slab_max ? 1 : 0;

    AABB3d bbox;
    bbox.invalidate();

    // Vertices inside the slab.
    if (v0_ge_min & v0_le_max)
        bbox.insert(v0);
    if (v1_ge_min & v1_le_max)
        bbox.insert(v1);
    if (v2_ge_min & v2_le_max)
        bbox.insert(v2);

    // Edge/plane intersections for every edge that crosses a slab plane.
    if (v0_ge_min != v1_ge_min)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_min));
    if (v0_le_max != v1_le_max)
        bbox.insert(segment_plane_intersection(v0, v1, dimension, slab_max));
    if (v1_ge_min != v2_ge_min)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_min));
    if (v1_le_max != v2_le_max)
        bbox.insert(segment_plane_intersection(v1, v2, dimension, slab_max));
    if (v2_ge_min != v0_ge_min)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_min));
    if (v2_le_max != v0_le_max)
        bbox.insert(segment_plane_intersection(v2, v0, dimension, slab_max));

#endif

    return bbox;
}
/*
 * tce_sort_6_simd: 6D tensor permutation with scaling (TCE-style sort):
 * copies `unsorted` into `sorted` with its six dimensions permuted, scaling
 * every element by `factor`.
 *
 * a..f are the extents of the source tensor (f fastest-varying).
 * i..n select, for destination dimensions 1..6 respectively, which source
 * dimension (0-based) each is drawn from.
 *
 * Exactly one of the six branches below runs, chosen by which destination
 * slot receives source dimension 5 (the source-contiguous one):
 *  - n == 5: source and destination are both contiguous in the same
 *    dimension, so a plain scalar loop nest suffices (compiler-vectorizable).
 *  - otherwise: a tiled loop nest that moves 2x2 blocks with SSE2,
 *    transposing between the source-contiguous dimension and the
 *    destination-contiguous one.
 *
 * NOTE(review): `tilesize` is defined elsewhere in this file/project.
 * NOTE(review): the SIMD branches advance the two innermost indices by 2
 * with no scalar remainder loop -- presumably the relevant extents are
 * always even; confirm with callers.
 */
void tce_sort_6_simd(double* unsorted,double* sorted, int a, int b, int c, int d, int e, int f, int i, int j, int k, int l, int m, int n, double factor)
{
  /* id[], order[], order_r[], ia6, ib6, p and q are unused; kept as-is. */
  int id[6],jd[6],ia,ib,j1,j2,j3,j4,j5,j6;
  int l1,l2,l3,l4,l5,l6;
  int ia1,ia2,ia3,ia4,ia5,ia6;
  int ib1,ib2,ib3,ib4,ib5,ib6;
  int rangea1,rangea2,rangea3,rangea4,rangea5,rangea6;
  int rangeb1,rangeb2,rangeb3,rangeb4,rangeb5,rangeb6;
  int range[6],order[6],order_r[6];
  int jj1,jj2,jj3,jj4,jj5,jj6;
  int jj1_bound,jj2_bound,jj3_bound,jj4_bound,jj5_bound,jj6_bound;
  int N1,N2;
  double *pA, *pB;
  register __m128d x, y, z, w, p, q,fac_vector;

  /* Broadcast the scale factor into both SSE lanes. */
  fac_vector = _mm_load_sd(&factor);
  fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

  /* Source extents, indexable by source-dimension number. */
  jd[0] = a; jd[1] = b; jd[2] = c; jd[3] = d; jd[4] = e; jd[5] = f;
  // prefer writes
  /* range[s] = linear stride of source dimension s in `unsorted`. */
  range[0] = b*c*d*e*f;
  range[1] = c*d*e*f;
  range[2] = d*e*f;
  range[3] = e*f;
  range[4] = f;
  range[5] = 1;

  /* lX  = extent of destination dimension X,
   * rangeaX = stride in `unsorted` of the source dim feeding dest dim X,
   * rangebX = stride in `sorted` of destination dimension X. */
  l1 = jd[i]; l2 = jd[j]; l3 = jd[k]; l4 = jd[l]; l5 = jd[m]; l6 = jd[n];
  rangea1 = range[i]; rangea2 = range[j]; rangea3 = range[k]; rangea4 = range[l]; rangea5 = range[m]; rangea6 = range[n];
  rangeb1 = l2*l3*l4*l5*l6;
  rangeb2 = l3*l4*l5*l6;
  rangeb3 = l4*l5*l6;
  rangeb4 = l5*l6;
  rangeb5 = l6;
  rangeb6 = 1;

  // here vectorization can rely on the compiler
  /* Innermost dims coincide: straight scalar copy-and-scale. */
  if (n == 5) {
    for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1;
      for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
        for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
          for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
            for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
              for (j6 = 0; j6 < l6; j6++) { ia = ia5 + j6*rangea6; ib = ib5 + j6*rangeb6;
                sorted[ib] = unsorted[ia] * factor;
              } } } } } }
  }

  /* Dest dim 5 is source-contiguous: tile over (j5, j6) and transpose
   * 2x2 blocks between dest dims 5 and 6 with SSE. */
  if (m == 5) {
    for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1;
      for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
        for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
          for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
            for (j5 = 0; j5 < l5; j5 += tilesize) {
              for (j6 = 0; j6 < l6; j6 += tilesize) {
                jj5_bound = (j5 + tilesize > l5)? l5 :j5+tilesize;
                for (jj5 = j5; jj5 < jj5_bound; jj5 += 2) { ia5 = ia4 + jj5*rangea5; ib5 = ib4 + jj5*rangeb5;
                  jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                  for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia5 + jj6*rangea6; ib = ib5 + jj6*rangeb6;
                    /* N1: sorted stride of the x-lane dim; N2: unsorted
                     * stride of the y-register dim. */
                    N1 = rangeb5; N2 = rangea6;
                    pA = unsorted+ia; pB = sorted+ib;
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    /* 2x2 in-register transpose. */
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);
                  } } } } } } } }
  }

  /* Dest dim 4 is source-contiguous: tile over (j4, j6). */
  if (l == 5) {
    for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1;
      for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
        for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
          for (j4 = 0; j4 < l4; j4 += tilesize) {
            for (j5 = 0; j5 < l5; j5++) { ia5 = ia3 + j5*rangea5; ib5 = ib3 + j5*rangeb5;
              for (j6 = 0; j6 < l6; j6 += tilesize) {
                jj4_bound = (j4 + tilesize > l4)? l4 :j4+tilesize;
                for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia4 = ia5 + jj4*rangea4; ib4 = ib5 + jj4*rangeb4;
                  jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                  for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia4 + jj6*rangea6; ib = ib4 + jj6*rangeb6;
                    N1 = rangeb4; N2 = rangea6;
                    pA = unsorted+ia; pB = sorted+ib;
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);
                  } } } } } } } }
  }

  /* Dest dim 3 is source-contiguous: tile over (j3, j6). */
  if (k == 5) {
    for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1;
      for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2;
        for (j3 = 0; j3 < l3; j3 += tilesize) {
          for (j4 = 0; j4 < l4; j4++) { ia4 = ia2 + j4*rangea4; ib4 = ib2 + j4*rangeb4;
            for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
              for (j6 = 0; j6 < l6; j6 += tilesize) {
                jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize;
                for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) { ia3 = ia5 + jj3*rangea3; ib3 = ib5 + jj3*rangeb3;
                  jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                  for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia3 + jj6*rangea6; ib = ib3 + jj6*rangeb6;
                    N1 = rangeb3; N2 = rangea6;
                    pA = unsorted+ia; pB = sorted+ib;
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);
                  } } } } } } } }
  }

  /* Dest dim 2 is source-contiguous: tile over (j2, j6). */
  if (j == 5) {
    for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1;
      for (j2 = 0; j2 < l2; j2 += tilesize) {
        for (j3 = 0; j3 < l3; j3++) { ia3 = ia1 + j3*rangea3; ib3 = ib1 + j3*rangeb3;
          for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
            for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
              for (j6 = 0; j6 < l6; j6 += tilesize) {
                jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize;
                for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) { ia2 = ia5 + jj2*rangea2; ib2 = ib5 + jj2*rangeb2;
                  jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                  for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia2 + jj6*rangea6; ib = ib2 + jj6*rangeb6;
                    N1 = rangeb2; N2 = rangea6;
                    pA = unsorted+ia; pB = sorted+ib;
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);
                  } } } } } } } }
  }

  /* Dest dim 1 is source-contiguous: tile over (j1, j6). */
  if (i == 5) {
    for (j1 = 0; j1 < l1; j1 += tilesize) {
      for (j2 = 0; j2 < l2; j2++) { ia2 = j2*rangea2; ib2 = j2*rangeb2;
        for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3;
          for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4;
            for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5;
              for (j6 = 0; j6 < l6; j6 += tilesize) {
                jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize;
                for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) { ia1 = ia5 + jj1*rangea1; ib1 = ib5 + jj1*rangeb1;
                  jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize;
                  for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia1 + jj6*rangea6; ib = ib1 + jj6*rangeb6;
                    N1 = rangeb1; N2 = rangea6;
                    pA = unsorted+ia; pB = sorted+ib;
                    x = _mm_loadu_pd(pA);
                    x = _mm_mul_pd(x,fac_vector);
                    y = _mm_loadu_pd(pA + N2);
                    y = _mm_mul_pd(y,fac_vector);
                    z = _mm_shuffle_pd( x, y, 0);
                    w = _mm_shuffle_pd( x, y, 3);
                    _mm_storeu_pd(pB,z);
                    _mm_storeu_pd(pB + N1,w);
                  } } } } } } } }
  }
}