void trigo_vsin_vml_sse2(double* dst, const double* src, size_t length) { size_t i = length; while (i) { if (!SimdUtils::isAligned(dst, 16) || i == 1) { __m128d d = _mm_load_sd(src); _mm_store_sd(dst, sin_vml_pd(d)); dst++; src++; if (--i == 0) break; } while (i >= 2) { __m128d d = _mm_loadu_pd(src); _mm_store_pd(dst, sin_vml_pd(d)); dst += 2; src += 2; i -= 2; } } }
__m128d test_mm_load_sd(double const* A) { // DAG-LABEL: test_mm_load_sd // DAG: load double, double* %{{.*}}, align 1 // // ASM-LABEL: test_mm_load_sd // ASM: movsd return _mm_load_sd(A); }
static void sse3_test_movddup_reg_subsume_ldsd (double *i1, double *r) { __m128d t1 = _mm_load_sd (i1); __m128d t2 = _mm_movedup_pd (t1); _mm_storeu_pd (r, t2); }
static double rcp_d(double x) { __m128d xd = _mm_load_sd(&x); double xi = _mm_cvtss_f32(_mm_rcp_ss(_mm_cvtsd_ss(_mm_setzero_ps(), xd))); xi = xi + xi * (1.0 - x * xi); xi = xi + xi * (1.0 - x * xi); return xi; }
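/* A minimal standalone sketch (not part of the original source) of the same technique rcp_d uses above:
   seed 1/x with the ~12-bit RCPSS estimate, then refine with two Newton-Raphson steps
   xi = xi + xi*(1 - x*xi), each of which roughly doubles the number of correct bits.
   The harness and its names below are illustrative assumptions only. */
#include <emmintrin.h>  /* SSE2: _mm_load_sd, _mm_cvtsd_ss; pulls in SSE for _mm_rcp_ss */
#include <math.h>
#include <stdio.h>

static double rcp_newton(double x)
{
    __m128d xd = _mm_load_sd(&x);
    /* ~12-bit single-precision estimate of 1/x */
    double xi = _mm_cvtss_f32(_mm_rcp_ss(_mm_cvtsd_ss(_mm_setzero_ps(), xd)));
    xi = xi + xi * (1.0 - x * xi);  /* ~24 correct bits */
    xi = xi + xi * (1.0 - x * xi);  /* ~48 correct bits */
    return xi;
}

int main(void)
{
    const double v[] = { 3.0, 0.1, 12345.678 };
    for (int i = 0; i < 3; i++) {
        double approx = rcp_newton(v[i]), exact = 1.0 / v[i];
        printf("1/%g: approx=%.17g rel.err=%.3g\n", v[i], approx,
               fabs(approx - exact) / exact);
    }
    return 0;
}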
void foo (unsigned int x, double *y, const double *z) { __m128d tmp; while (x) { tmp = _mm_load_sd (z); _mm_store_sd (y, tmp); --x; ++z; ++y; } }
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_vector_set( const double i_scalar, double* io_c, const int i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; /* init the trip counts */ stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for ( ; l_n < l_trip_prolog; l_n++ ) { io_c[l_n] = i_scalar; } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) { /* we need manual unrolling as the compiler otherwise generates too many dependencies */ const __m256d vec_scalar = _mm256_broadcast_sd(&i_scalar); for ( ; l_n < l_trip_stream; l_n+=8 ) { #ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), vec_scalar ); _mm256_store_pd( &(io_c[l_n+4]), vec_scalar ); #else _mm256_stream_pd( &(io_c[l_n]), vec_scalar ); _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar ); #endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(&i_scalar)); for ( ; l_n < l_trip_stream; l_n+=8 ) { #ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), vec_scalar ); #else _mm512_stream_pd( &(io_c[l_n]), vec_scalar ); #endif } } #else for ( ; l_n < l_trip_stream; l_n++ ) { io_c[l_n] = i_scalar; } #endif /* run the epilogue */ for ( ; l_n < i_length; l_n++ ) { io_c[l_n] = i_scalar; } }
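/* A hedged sketch (not libxsmm code) of the prologue/stream/epilogue pattern used by
   stream_vector_set above. stream_init is not shown in this listing, so split_stream below
   is a hypothetical stand-in: it peels scalar iterations until the pointer is 32-byte aligned
   and makes the bulk a multiple of 8 doubles; the real trip-count logic may differ. */
#include <immintrin.h>
#include <stdint.h>

static void split_stream(int n, const double *p, int *prolog, int *stream)
{
    int mis = (int)(((uintptr_t)p >> 3) & 3);       /* doubles past the last 32-byte boundary */
    *prolog = mis ? 4 - mis : 0;
    if (*prolog > n) *prolog = n;
    *stream = *prolog + ((n - *prolog) / 8) * 8;    /* aligned bulk ends here */
}

void fill_nt(double s, double *c, int n)
{
    int prolog, stream, i = 0;
    split_stream(n, c, &prolog, &stream);
    for (; i < prolog; i++) c[i] = s;               /* scalar prologue up to alignment */
#if defined(__AVX__)
    {
        const __m256d v = _mm256_set1_pd(s);
        for (; i < stream; i += 8) {                /* non-temporal (streaming) bulk */
            _mm256_stream_pd(c + i, v);
            _mm256_stream_pd(c + i + 4, v);
        }
        _mm_sfence();                               /* order NT stores before later loads */
    }
#endif
    for (; i < n; i++) c[i] = s;                    /* scalar epilogue */
}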
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor) { int i,j,k,k1,it,jt,itt,jtt,it_bound,jt_bound,itt_bound,jtt_bound; int conflict,tmp,tmpN,offset,line_offset,setnum,set[8192/(4*sizeof(double))]; double *pA, *pB; register __m128d x, y, z, w, t, t1,fac_vector; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); itt_bound = (N1/tilesize)*tilesize; for (itt = 0; itt < itt_bound; itt=itt+5*tilesize) { jtt_bound =(N2/tilesize)*tilesize; for (jtt = 0; jtt < jtt_bound; jtt=jtt+5*tilesize) { it_bound = (itt+5*tilesize > itt_bound)?itt_bound:itt+5*tilesize; for (it = itt; it < it_bound; it = it+tilesize) { jt_bound = (jtt+5*tilesize>jtt_bound)?jtt_bound:jtt+5*tilesize; for (jt = jtt; jt < jt_bound; jt = jt+tilesize) { k = 0; for (j = jt; j < jt+tilesize; j=j+2) { for (i = it; i < it+tilesize; i=i+2) { pA = a+i*N2+j; pB = b+j*N1+i; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) { for (j = jtt_bound; j < N2; j++) { b[j*N1+i] = factor * a[i*N2+j]; } } } for (i = itt_bound; i < N1; i++) { for (j = 0; j < N2; j++) { b[j*N1+i] = factor * a[i*N2+j]; } } }
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) { int i,j,k,k1,it,jt,itt,jtt,conflict,tmp,tmpN; double *pA, *pB; register __m128d x, y, z, w,fac_vector; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); for (it = 0; it < N1; it=it+tilesize) { for (jt = 0; jt < N2; jt=jt+tilesize) { k = 0; for (j = jt; j < jt+tilesize; j=j+2) { for (i = it; i < it+tilesize; i=i+2) { pA = a+i*N2+j; x = _mm_load_pd(pA); y = _mm_load_pd(pA + N2); x = _mm_mul_pd(x,fac_vector); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); k = (j-jt)*tilesize + (i-it); _mm_store_pd(buf + k,z); _mm_store_pd(buf + k + tilesize,w); } } k = 0; k1 = 0; for (j = jt; j < jt+tilesize; j++) { pB = b+j*N1+it; k = (j-jt)*tilesize; x = _mm_load_pd(&buf[k]); y = _mm_load_pd(&buf[k]+2); z = _mm_load_pd(&buf[k]+2*2); w = _mm_load_pd(&buf[k]+3*2); _mm_stream_pd(pB,x); _mm_stream_pd(pB+2,y); _mm_stream_pd(pB+2*2,z); _mm_stream_pd(pB+3*2,w); } } } }
void matVecMult_opt(int N, const double *matA, const double *vecB, double *vecC) { int i; int j; double c; if((N%2)==0) { for(i=0; i<N; i++, matA+=N) { //__m128d vecB_value = _mm_load_sd(&vecB[i]); //vecB_value = _mm_unpacklo_pd(vecB_value, vecB_value); for(j=0; j<N; j+=2) { //__m128d matA_value = _mm_load_pd(&matA[j]); //__m128d vecC_value = _mm_load_sd(&vecC[i]); //__m128d vecB_value = _mm_load_pd(&vecB[j]); //__m128d mulResult = _mm_mul_pd(matA_value, vecB_value); //mulResult = _mm_hadd_pd(mulResult, _mm_set_pd(0,0)); _mm_store_sd(&vecC[i], _mm_add_sd(_mm_hadd_pd( _mm_mul_pd(_mm_load_pd(&matA[j]), _mm_load_pd(&vecB[j])), _mm_set_pd(0,0)), _mm_load_sd(&vecC[i]))); } } } else { for(i = 0; i < N; i++, matA+=N) { c=0; for(j=0; j < N; j++) { //vecC[i] += matA[i*N+j]*vecB[j]; //_mm_prefetch(&matA[j], _MM_HINT_NTA); c += matA[j]*vecB[j]; //_mm_prefetch(&matA[j+1], _MM_HINT_T2); //_mm_store_pd(&vecC[i], _mm_load_pd(&vecB[i])); //_mm_store_pd(&vecC[i],_mm_add_pd(_mm_mul_pd(_mm_load_pd(&matA[j]),_mm_load_pd(&vecB[i])),_mm_load_pd(&vecC[i]))); } vecC[i]=c; } } }
static void TEST (void) { union128d u, s1; double e[2]; int i; s1.x = _mm_set_pd (2134.3343,1234.635654); u.x = test (s1.x); for (i = 0; i < 2; i++) { __m128d tmp = _mm_load_sd (&s1.a[i]); tmp = _mm_sqrt_sd (tmp, tmp); _mm_store_sd (&e[i], tmp); } if (check_union128d (u, e)) abort (); }
static void TEST (void) { union128d u, s; double e[2] = {0.0}; int i; s.x = _mm_set_pd (1.1234, -2.3478); u.x = _mm_round_pd (s.x, iRoundMode); for (i = 0; i < 2; i++) { __m128d tmp = _mm_load_sd (&s.a[i]); tmp = _mm_round_sd (tmp, tmp, iRoundMode); _mm_store_sd (&e[i], tmp); } if (check_union128d (u, e)) abort (); }
/*SSE2 contains an instruction SQRTSD. This instruction computes the square root of the low-order double-precision floating-point value in an XMM register or in a 64-bit memory location and writes the result in the low-order quadword of another XMM register. The corresponding intrinsic is _mm_sqrt_sd()*/ double FN_PROTOTYPE(sqrt)(double x) { __m128d X128; double result; UT64 uresult; if(x < 0.0) { uresult.u64 = 0xfff8000000000000; __amd_handle_error(DOMAIN, EDOM, "sqrt", x, 0.0 , uresult.f64); return uresult.f64; } /*Load x into an XMM register*/ X128 = _mm_load_sd(&x); /*Calculate sqrt using the SQRTSD instruction*/ X128 = _mm_sqrt_sd(X128, X128); /*Store back the result into a double precision floating point number*/ _mm_store_sd(&result, X128); return result; }
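/* A small illustrative check (not from the AMD libm source) of why the routine above passes
   X128 as both arguments: _mm_sqrt_sd(a, b) writes the square root of b's low element into
   the low lane and copies a's high lane through unchanged. */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d hi = _mm_set_pd(99.0, 0.0);   /* high lane = 99.0, low lane unused */
    __m128d lo = _mm_set_sd(2.0);         /* low lane = 2.0 */
    double out[2];
    _mm_storeu_pd(out, _mm_sqrt_sd(hi, lo));
    printf("low = %.17g (sqrt 2), high = %g (passed through)\n", out[0], out[1]);
    return 0;
}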
void static avx_test (void) { union256d u, s1; double source [4] = {2134.3343,1234.635654,453.345635,54646.464356}; double e[4] = {0.0}; int i; s1.x = _mm256_loadu_pd (source); u.x = _mm256_floor_pd (s1.x); for (i = 0; i < 4; i++) { __m128d tmp = _mm_load_sd (&s1.a[i]); tmp = _mm_floor_sd (tmp, tmp); _mm_store_sd (&e[i], tmp); } if (check_union256d (u, e)) abort (); }
void matMult_opt(int N, const double *matA, const double *matB, double *matC) { int i, j, k; if((N%2)==0) { //int TwoN=N; //N=N/2; for(i=0; i<N; i++) { for(j=0;j<N;j++) { __m128d matA_value = _mm_load_sd(&matA[j*N+i]); matA_value = _mm_unpacklo_pd(matA_value, matA_value); for(k=0; k<N; k+=2) { __m128d matB_value = _mm_load_pd(&matB[i*N+k]); __m128d matC_value = _mm_load_pd(&matC[j*N+k]); _mm_store_pd(&matC[j*N+k], _mm_add_pd(_mm_mul_pd(matA_value, matB_value), matC_value)); } } } } else { for(i=0; i<N; i++) { for(j=0;j<N;j++) { for(k=0; k<N; k++) { matC[j*N+k] += matA[j*N+i]*matB[i*N+k]; } } } } }
double GetResult(double * LeftMatrix, double * RightMatrix, int N, int L, int M) { // LeftMatrix is stored row by row // RightMatrix is stored row by row // L is the number of columns of LeftMatrix and the number of rows of RightMatrix // N is the number of rows of LeftMatrix // M is the number of columns of RightMatrix // the return value is the sum of all elements of the product LeftMatrix times RightMatrix, multiplied left to right int i=0; int j=0; int k=0; int k0,ktop; int leftindex=0; int rightindex=0; double sum=0.0; #ifdef __SSE2__ int MX = (M&1) ? M : 0; int M2 = M & ~1; #endif int kstride = MIN(L2_CACHE*3/L/sizeof(double)/4, TLB_SIZE*PAGE_SIZE*3/L/sizeof(double)/4); int istride = TLB_SIZE/4; int jstride = L1_CACHE*3/sizeof(double)/4; #pragma omp parallel private(i, j, k, k0, ktop) reduction(+: sum) { #ifdef __SSE2__ double temp[2]; __m128d sum2 = _mm_set1_pd(0.0); __m128d sum3 = _mm_set1_pd(0.0); __m128d sum4 = _mm_set1_pd(0.0); __m128d sum5 = _mm_set1_pd(0.0); __m128d sum6 = _mm_set1_pd(0.0); __m128d sum7 = _mm_set1_pd(0.0); #endif for(k0=0;k0<L;k0+=kstride) { ktop = MIN(k0+kstride,L); #ifdef _OPENMP for(int i0=omp_get_thread_num()*istride;i0<N;i0+=omp_get_num_threads()*istride) #else for(int i0=0;i0<N;i0+=istride) #endif { int itop = MIN(i0+istride,N); for(k=k0;k<ktop;k++) { for (int j0=0;j0<M;j0+=jstride) { #ifdef __SSE2__ int jtop = MIN(jstride,M2-j0); int MX2 = (jtop < jstride ? MX-j0 : 0); #else int jtop = MIN(jstride,M-j0); #endif double *pright = RightMatrix + k*M + j0; for(i=i0;i<itop;i++) { double left = LeftMatrix[i*L+k]; #ifdef __SSE2__ __m128d left2 = _mm_set1_pd(left); if (((long)pright)&0xF) { for(j=0;j<jtop-10;j+=12) { sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_loadu_pd(pright+j))); sum3 = _mm_add_pd(sum3, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+2))); sum4 = _mm_add_pd(sum4, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+4))); sum5 = _mm_add_pd(sum5, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+6))); sum6 = _mm_add_pd(sum6, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+8))); sum7 = _mm_add_pd(sum7, _mm_mul_pd(left2, _mm_loadu_pd(pright+j+10))); } for(;j<jtop;j+=2) sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_loadu_pd(pright+j))); } else { for(j=0;j<jtop-10;j+=12) { sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_load_pd(pright+j))); sum3 = _mm_add_pd(sum3, _mm_mul_pd(left2, _mm_load_pd(pright+j+2))); sum4 = _mm_add_pd(sum4, _mm_mul_pd(left2, _mm_load_pd(pright+j+4))); sum5 = _mm_add_pd(sum5, _mm_mul_pd(left2, _mm_load_pd(pright+j+6))); sum6 = _mm_add_pd(sum6, _mm_mul_pd(left2, _mm_load_pd(pright+j+8))); sum7 = _mm_add_pd(sum7, _mm_mul_pd(left2, _mm_load_pd(pright+j+10))); } for(;j<jtop;j+=2) sum2 = _mm_add_pd(sum2, _mm_mul_pd(left2, _mm_load_pd(pright+j))); } if (MX2) sum3 = _mm_add_sd(sum3, _mm_mul_sd(left2, _mm_load_sd(pright+MX2-1))); #else double s1=0,s2=0,s3=0,s4=0; for(j=0;j<jtop-3;j+=4) { s1 += left*pright[j]; s2 += left*pright[j+1]; s3 += left*pright[j+2]; s4 += left*pright[j+3]; } for(;j<jtop;j++) sum += left*pright[j]; sum += s1 + s2 + s3 + s4; #endif } } } } } #ifdef __SSE2__ _mm_storeu_pd(temp, _mm_add_pd(_mm_add_pd(sum2,_mm_add_pd(sum3,sum6)),_mm_add_pd(sum4,_mm_add_pd(sum5,sum7)))); sum += temp[0]+temp[1]; #endif } return sum; }
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_update_helmholtz_no_h2( const double* i_g1, const double* i_g2, const double* i_g3, const double* i_tm1, const double* i_tm2, const double* i_tm3, double* io_c, const double i_h1, const int i_length) { int l_n = 0; int l_trip_prolog = 0; int l_trip_stream = 0; /* init the trip counts */ stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream ); /* run the prologue */ for ( ; l_n < l_trip_prolog; l_n++ ) { io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } /* run the bulk, hopefully using streaming stores */ #if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__) { const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1); /* we need manual unrolling as the compiler otherwise generates too many dependencies */ for ( ; l_n < l_trip_stream; l_n+=8 ) { __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1; __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2; vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n])); vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n])); vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4])); vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4])); vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1); vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n])); vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2); vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4])); vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n])); vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1); vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4])); vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2); vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n])); vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n])); vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4])); vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4])); vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1); vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1); vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2); vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1); #ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) ); #else _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) ); #endif vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2); #ifdef DISABLE_NONTEMPORAL_STORES _mm256_store_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) ); #else _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) ); #endif } } #elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__) { const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1)); for ( ; l_n < l_trip_stream; l_n+=8 ) { __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3; vec_g1 = _mm512_loadu_pd(&(i_g1[l_n])); vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n])); vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1); vec_g2 = _mm512_loadu_pd(&(i_g2[l_n])); vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n])); vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2); vec_g3 = _mm512_loadu_pd(&(i_g3[l_n])); vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n])); vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3); vec_g1 = _mm512_add_pd(vec_g1, vec_g2); vec_g1 = _mm512_add_pd(vec_g1, vec_g3); #ifdef DISABLE_NONTEMPORAL_STORES _mm512_store_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) ); #else _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) ); #endif } } #else for ( ; l_n < l_trip_stream; l_n++ ) { io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } #endif /* run the epilogue */ for ( ; l_n < i_length; l_n++ ) { io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]); } }
void transpose_4321_loop_3241_( double *unsorted, double *sorted, int *p_dim1, int *p_dim2, int *p_dim3, int *p_dim4, double *p_factor ) { int dim1,dim2,dim3,dim4; int dim1mod,dim2mod,dim3mod,dim4mod; unsigned int old_offset,new_offset; unsigned int j1,j2,j3,j4; double factor = *p_factor; double *pA, *pB; register __m128d x, y, z, w, t, t1,fac_vector; unsigned int N1,N2; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); dim1 = *p_dim1; dim2 = *p_dim2; dim3 = *p_dim3; dim4 = *p_dim4; N1 = dim2*dim3*dim4; N2 = dim2*dim3*dim4; dim1mod = (int) floor( (float)dim1 / (float) 4); dim2mod = (int) floor( (float)dim2 / (float) 4); dim3mod = (int) floor( (float)dim3 / (float) 4); dim4mod = (int) floor( (float)dim4 / (float) 4); /* pluto start (dim1,dim2,dim3,dim4) */ #pragma ivdep #pragma parallel #pragma loop count min(10) max(80) avg(40) #pragma unroll for( j3 = 0; j3<dim3; j3++) { #pragma loop count min(10) max(80) avg(40) #pragma unroll for( j2 = 0; j2<dim2; j2++) { #pragma loop count min(10) max(80) avg(40) #pragma unroll #pragma vector always for( j4 = 0; j4<dim4; j4+=2) { #pragma loop count min(10) max(80) avg(40) #pragma unroll #pragma vector always for( j1 = 0; j1<dim1; j1+=2) { //sorted[j1+dim1*(j2+dim2*(j3+dim3*j4))] = unsorted[j4+dim4*(j3+dim3*(j2+dim2*j1))] * factor; pA = unsorted + j4+dim4*(j3+dim3*(j2+dim2*j1)); pB = sorted + j1+dim1*(j2+dim2*(j3+dim3*j4)); x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } /* pluto end */ return; }
// it moves vertically across blocks void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg) { if(kmax<=0) return; /*printf("\nciao %d\n", kmax); */ const int bs = 4; __builtin_prefetch( A + bs*0 ); __builtin_prefetch( A + bs*2 ); int k, ka; ka = kmax; // number from aligned positon double k_left; // double *sA, *sy_n, *sx_t; static double d_mask[4] = {0.5, 1.5, 2.5, 3.5}; __m256d v_mask, zeros, temp, a_00, a_01, a_02, a_03, x_n_0, x_n_1, x_n_2, x_n_3, y_n_0, x_t_0, y_t_0, y_t_1, y_t_2, y_t_3; __m256i i_mask; #if 0 __m128d stemp, sa_00, sa_01, sa_02, sa_03, sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0, sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3; #endif zeros = _mm256_setzero_pd(); x_n_0 = _mm256_broadcast_sd( &x_n[0] ); x_n_1 = _mm256_broadcast_sd( &x_n[1] ); x_n_2 = _mm256_broadcast_sd( &x_n[2] ); x_n_3 = _mm256_broadcast_sd( &x_n[3] ); if(alg==-1) // TODO xor { x_n_0 = _mm256_sub_pd( zeros, x_n_0 ); x_n_1 = _mm256_sub_pd( zeros, x_n_1 ); x_n_2 = _mm256_sub_pd( zeros, x_n_2 ); x_n_3 = _mm256_sub_pd( zeros, x_n_3 ); } y_t_0 = _mm256_setzero_pd(); y_t_1 = _mm256_setzero_pd(); y_t_2 = _mm256_setzero_pd(); y_t_3 = _mm256_setzero_pd(); #if 0 sx_n_0 = _mm256_castpd256_pd128( x_n_0 ); sx_n_1 = _mm256_castpd256_pd128( x_n_1 ); sx_n_2 = _mm256_castpd256_pd128( x_n_2 ); sx_n_3 = _mm256_castpd256_pd128( x_n_3 ); sy_t_0 = _mm256_castpd256_pd128( y_t_0 ); sy_t_1 = _mm256_castpd256_pd128( y_t_1 ); sy_t_2 = _mm256_castpd256_pd128( y_t_2 ); sy_t_3 = _mm256_castpd256_pd128( y_t_3 ); k = bs*(ka/bs); sA = A + (ka/bs)*sda*bs; sy_n = y_n + (ka/bs)*bs; sx_t = x_t + (ka/bs)*bs; for(; k<ka; k++) { sy_n_0 = _mm_load_sd( &sy_n[0] ); sx_t_0 = _mm_load_sd( &sx_t[0] ); sa_00 = _mm_load_sd( &sA[0+bs*0] ); sa_01 = _mm_load_sd( &sA[0+bs*1] ); sa_02 = _mm_load_sd( &sA[0+bs*2] ); sa_03 = _mm_load_sd( &sA[0+bs*3] ); stemp = _mm_mul_sd( sa_00, sx_n_0 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_00, sx_t_0 ); sy_t_0 = _mm_add_sd( sy_t_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_n_1 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_t_0 ); sy_t_1 = _mm_add_sd( sy_t_1, stemp ); stemp = _mm_mul_sd( sa_02, sx_n_2 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_02, sx_t_0 ); sy_t_2 = _mm_add_sd( sy_t_2, stemp ); stemp = _mm_mul_sd( sa_03, sx_n_3 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_03, sx_t_0 ); sy_t_3 = _mm_add_sd( sy_t_3, stemp ); _mm_store_sd( &sy_n[0], sy_n_0 ); sA += 1; sy_n += 1; sx_t += 1; } y_t_0 = _mm256_castpd128_pd256( sy_t_0 ); y_t_1 = _mm256_castpd128_pd256( sy_t_1 ); y_t_2 = _mm256_castpd128_pd256( sy_t_2 ); y_t_3 = _mm256_castpd128_pd256( sy_t_3 ); #endif k=0; // corner if(tri==1) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 12 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); temp = 
_mm256_blend_pd( zeros, temp, 12 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; k += 4; } for(; k<ka-7; k+=2*bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } for(; k<ka-3; k+=bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } if(k<ka) { k_left = ka-k; v_mask = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) ); i_mask = _mm256_castpd_si256( v_mask ); // 
__builtin_prefetch( A + sda*bs +bs*0 ); // __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 ); // A += sda*bs; // y_n += 4; // z_n += 4; // x_t += 4; } __m256d y_0_1_2_3; y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 ); y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 ); y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2 ); y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 ); y_t_0 = _mm256_add_pd( y_t_0, y_t_1 ); if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } }
void tce_sort_6_simd(double* unsorted,double* sorted, int a, int b, int c, int d, int e, int f, int i, int j, int k, int l, int m, int n, double factor) { int id[6],jd[6],ia,ib,j1,j2,j3,j4,j5,j6; int l1,l2,l3,l4,l5,l6; int ia1,ia2,ia3,ia4,ia5,ia6; int ib1,ib2,ib3,ib4,ib5,ib6; int rangea1,rangea2,rangea3,rangea4,rangea5,rangea6; int rangeb1,rangeb2,rangeb3,rangeb4,rangeb5,rangeb6; int range[6],order[6],order_r[6]; int jj1,jj2,jj3,jj4,jj5,jj6; int jj1_bound,jj2_bound,jj3_bound,jj4_bound,jj5_bound,jj6_bound; int N1,N2; double *pA, *pB; register __m128d x, y, z, w, p, q,fac_vector; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); jd[0] = a; jd[1] = b; jd[2] = c; jd[3] = d; jd[4] = e; jd[5] = f; // prefer writes range[0] = b*c*d*e*f; range[1] = c*d*e*f; range[2] = d*e*f; range[3] = e*f; range[4] = f; range[5] = 1; l1 = jd[i]; l2 = jd[j]; l3 = jd[k]; l4 = jd[l]; l5 = jd[m]; l6 = jd[n]; rangea1 = range[i]; rangea2 = range[j]; rangea3 = range[k]; rangea4 = range[l]; rangea5 = range[m]; rangea6 = range[n]; rangeb1 = l2*l3*l4*l5*l6; rangeb2 = l3*l4*l5*l6; rangeb3 = l4*l5*l6; rangeb4 = l5*l6; rangeb5 = l6; rangeb6 = 1; // here vectorization can rely on the compiler if (n == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6++) { ia = ia5 + j6*rangea6; ib = ib5 + j6*rangeb6; sorted[ib] = unsorted[ia] * factor; } } } } } } } if (m == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5 += tilesize) { for (j6 = 0; j6 < l6; j6 += tilesize) { jj5_bound = (j5 + tilesize > l5)? l5 :j5+tilesize; for (jj5 = j5; jj5 < jj5_bound; jj5 += 2) { ia5 = ia4 + jj5*rangea5; ib5 = ib4 + jj5*rangeb5; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia5 + jj6*rangea6; ib = ib5 + jj6*rangeb6; N1 = rangeb5; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (l == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4 += tilesize) { for (j5 = 0; j5 < l5; j5++) { ia5 = ia3 + j5*rangea5; ib5 = ib3 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj4_bound = (j4 + tilesize > l4)? l4 :j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia4 = ia5 + jj4*rangea4; ib4 = ib5 + jj4*rangeb4; jj6_bound = (j6 + tilesize > l6)? 
l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia4 + jj6*rangea6; ib = ib4 + jj6*rangeb6; N1 = rangeb4; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (k == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3 += tilesize) { for (j4 = 0; j4 < l4; j4++) { ia4 = ia2 + j4*rangea4; ib4 = ib2 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize; for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) { ia3 = ia5 + jj3*rangea3; ib3 = ib5 + jj3*rangeb3; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia3 + jj6*rangea6; ib = ib3 + jj6*rangeb6; N1 = rangeb3; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (j == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2 += tilesize) { for (j3 = 0; j3 < l3; j3++) { ia3 = ia1 + j3*rangea3; ib3 = ib1 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize; for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) { ia2 = ia5 + jj2*rangea2; ib2 = ib5 + jj2*rangeb2; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia2 + jj6*rangea6; ib = ib2 + jj6*rangeb6; N1 = rangeb2; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (i == 5) { for (j1 = 0; j1 < l1; j1 += tilesize) { for (j2 = 0; j2 < l2; j2++) { ia2 = j2*rangea2; ib2 = j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize; for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) { ia1 = ia5 + jj1*rangea1; ib1 = ib5 + jj1*rangeb1; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia1 + jj6*rangea6; ib = ib1 + jj6*rangeb6; N1 = rangeb1; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } }
void ATL_UGEMV(ATL_CINT M, ATL_CINT N, const TYPE *A, ATL_CINT lda1, const TYPE *X, TYPE *Y) {/* BEGIN GEMV: nMU=1, MU=2, NU=8 */ ATL_INT i, j; ATL_CINT MAp = ((((size_t)A)&0xF) || M==1) ? 1 : 2; ATL_CINT MA = M - MAp; #define A0 A const TYPE *A1=A0+lda1, *A2=A1+lda1, *A3=A2+lda1, *A4=A3+lda1, *A5=A4+lda1, *A6=A5+lda1, *A7=A6+lda1; ATL_CINT M2=((((((MA) >> 1)) << 1)))+MAp, N8=(((((N) >> 3)) << 3)), lda8=(((lda1) << 3)); __m128d x0, x1, y0, y1, y2, y3, y4, y5, y6, y7, a0_0, a0_1, a0_2, a0_3, a0_4, a0_5, a0_6, a0_7; if (!M || !N) return; for (j=0; j < N8; j += 8, A0 += lda8, A1 += lda8, A2 += lda8, A3 += lda8, A4 += lda8, A5 += lda8, A6 += lda8, A7 += lda8) {/* BEGIN N-LOOP UR=8 */ if (MAp != 1) {/* peel to zero Y */ i=0; x0 = _mm_load_pd(X+i+0); y0 = _mm_load_pd(A0+i); y0 = _mm_mul_pd(y0, x0); y1 = _mm_load_pd(A1+i); y1 = _mm_mul_pd(y1, x0); y2 = _mm_load_pd(A2+i); y2 = _mm_mul_pd(y2, x0); y3 = _mm_load_pd(A3+i); y3 = _mm_mul_pd(y3, x0); y4 = _mm_load_pd(A4+i); y4 = _mm_mul_pd(y4, x0); y5 = _mm_load_pd(A5+i); y5 = _mm_mul_pd(y5, x0); y6 = _mm_load_pd(A6+i); y6 = _mm_mul_pd(y6, x0); y7 = _mm_load_pd(A7+i); y7 = _mm_mul_pd(y7, x0); } /* end zero Y peel */ else /* if (MAp == 1)*/ {/* peel to force X/A alignment, zero Y */ i=0; x0 = _mm_load_sd(X+i+0); y0 = _mm_load_sd(A0+i); y0 = _mm_mul_sd(y0, x0); y1 = _mm_load_sd(A1+i); y1 = _mm_mul_sd(y1, x0); y2 = _mm_load_sd(A2+i); y2 = _mm_mul_sd(y2, x0); y3 = _mm_load_sd(A3+i); y3 = _mm_mul_sd(y3, x0); y4 = _mm_load_sd(A4+i); y4 = _mm_mul_sd(y4, x0); y5 = _mm_load_sd(A5+i); y5 = _mm_mul_sd(y5, x0); y6 = _mm_load_sd(A6+i); y6 = _mm_mul_sd(y6, x0); y7 = _mm_load_sd(A7+i); y7 = _mm_mul_sd(y7, x0); } /* end force-align/zeroY peel */ for (i=MAp; i < M2; i += 2) {/* ----- BEGIN M-LOOP BODY ----- */ /* --- BEGIN MUxNU UNROLL 0 --- */ x0 = _mm_load_pd(X+i+0); a0_0 = _mm_load_pd(A0+i); a0_0 = _mm_mul_pd(a0_0, x0); y0 = _mm_add_pd(y0, a0_0); a0_1 = _mm_load_pd(A1+i); a0_1 = _mm_mul_pd(a0_1, x0); y1 = _mm_add_pd(y1, a0_1); a0_2 = _mm_load_pd(A2+i); a0_2 = _mm_mul_pd(a0_2, x0); y2 = _mm_add_pd(y2, a0_2); a0_3 = _mm_load_pd(A3+i); a0_3 = _mm_mul_pd(a0_3, x0); y3 = _mm_add_pd(y3, a0_3); a0_4 = _mm_load_pd(A4+i); a0_4 = _mm_mul_pd(a0_4, x0); y4 = _mm_add_pd(y4, a0_4); a0_5 = _mm_load_pd(A5+i); a0_5 = _mm_mul_pd(a0_5, x0); y5 = _mm_add_pd(y5, a0_5); a0_6 = _mm_load_pd(A6+i); a0_6 = _mm_mul_pd(a0_6, x0); y6 = _mm_add_pd(y6, a0_6); a0_7 = _mm_load_pd(A7+i); a0_7 = _mm_mul_pd(a0_7, x0); y7 = _mm_add_pd(y7, a0_7); /* --- END MUxNU UNROLL 0 --- */ }/* ----- END M-LOOP BODY ----- */ if (M != M2) {/* ----- BEGIN SCALAR M CLEANUP ----- */ x0 = _mm_load_sd(X+i+0); a0_0 = _mm_load_sd(A0+i); a0_0 = _mm_mul_sd(a0_0, x0); y0 = _mm_add_sd(y0, a0_0); a0_1 = _mm_load_sd(A1+i); a0_1 = _mm_mul_sd(a0_1, x0); y1 = _mm_add_sd(y1, a0_1); a0_2 = _mm_load_sd(A2+i); a0_2 = _mm_mul_sd(a0_2, x0); y2 = _mm_add_sd(y2, a0_2); a0_3 = _mm_load_sd(A3+i); a0_3 = _mm_mul_sd(a0_3, x0); y3 = _mm_add_sd(y3, a0_3); a0_4 = _mm_load_sd(A4+i); a0_4 = _mm_mul_sd(a0_4, x0); y4 = _mm_add_sd(y4, a0_4); a0_5 = _mm_load_sd(A5+i); a0_5 = _mm_mul_sd(a0_5, x0); y5 = _mm_add_sd(y5, a0_5); a0_6 = _mm_load_sd(A6+i); a0_6 = _mm_mul_sd(a0_6, x0); y6 = _mm_add_sd(y6, a0_6); a0_7 = _mm_load_sd(A7+i); a0_7 = _mm_mul_sd(a0_7, x0); y7 = _mm_add_sd(y7, a0_7); }/* ----- END SCALAR M CLEANUP ----- */ _my_hadd_pd(y0, y1); #ifndef BETA0 a0_0 = _mm_load_pd(Y+j+0); y0 = _mm_add_pd(y0, a0_0); #endif _mm_store_pd(Y+j+0, y0); _my_hadd_pd(y2, y3); #ifndef BETA0 a0_1 = _mm_load_pd(Y+j+2); y2 = _mm_add_pd(y2, a0_1); #endif 
_mm_store_pd(Y+j+2, y2); _my_hadd_pd(y4, y5); #ifndef BETA0 a0_2 = _mm_load_pd(Y+j+4); y4 = _mm_add_pd(y4, a0_2); #endif _mm_store_pd(Y+j+4, y4); _my_hadd_pd(y6, y7); #ifndef BETA0 a0_3 = _mm_load_pd(Y+j+6); y6 = _mm_add_pd(y6, a0_3); #endif _mm_store_pd(Y+j+6, y6); }/* END N-LOOP UR=8 */ for (j=N8; j < N; j++, A0 += lda1) {/* BEGIN N-LOOP UR=1 */ if (MAp != 1) {/* peel to zero Y */ i=0; x0 = _mm_load_pd(X+i+0); y0 = _mm_load_pd(A0+i); y0 = _mm_mul_pd(y0, x0); } /* end zero Y peel */ else /* if (MAp == 1)*/ {/* peel to force X/A alignment, zero Y */ i=0; x0 = _mm_load_sd(X+i+0); y0 = _mm_load_sd(A0+i); y0 = _mm_mul_sd(y0, x0); } /* end force-align/zeroY peel */ for (i=MAp; i < M2; i += 2) {/* ----- BEGIN M-LOOP BODY ----- */ /* --- BEGIN MUxNU UNROLL 0 --- */ x0 = _mm_load_pd(X+i+0); a0_0 = _mm_load_pd(A0+i); a0_0 = _mm_mul_pd(a0_0, x0); y0 = _mm_add_pd(y0, a0_0); /* --- END MUxNU UNROLL 0 --- */ }/* ----- END M-LOOP BODY ----- */ if (M != M2) {/* ----- BEGIN SCALAR M CLEANUP ----- */ x0 = _mm_load_sd(X+i+0); a0_0 = _mm_load_sd(A0+i); a0_0 = _mm_mul_sd(a0_0, x0); y0 = _mm_add_sd(y0, a0_0); }/* ----- END SCALAR M CLEANUP ----- */ _my_hadd_pd(y0, y0); #ifndef BETA0 a0_0 = _mm_load_sd(Y+j+0); y0 = _mm_add_sd(y0, a0_0); #endif _mm_store_sd(Y+j+0, y0); }/* END N-LOOP UR=1 */ }/* END GEMV: nMU=1, MU=2, NU=8 */
void tce_sort_4_simd(double* unsorted,double* sorted, int a, int b, int c, int d, int i, int j, int k, int l, double factor) { int id[4],jd[4],ia,ib,j1,j2,j3,j4; int l1,l2,l3,l4; int ia1,ia2,ia3,ia4; int ib1,ib2,ib3,ib4; int rangea1,rangea2,rangea3,rangea4; int rangeb1,rangeb2,rangeb3,rangeb4; int range[4],order[4],order_r[4]; int jj1,jj2,jj3,jj4; int jj1_bound,jj2_bound,jj3_bound,jj4_bound; int count,ir,jr,kr,lr,N1,N2; double *pA, *pB; register __m128d x, y, z, w, t, t1,fac_vector; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); jd[0] = a; jd[1] = b; jd[2] = c; jd[3] = d; // prefer writes range[0] = b*c*d; range[1] = c*d; range[2] = d; range[3] = 1; l1 = jd[i]; l2 = jd[j]; l3 = jd[k]; l4 = jd[l]; rangea1 = range[i]; rangea2 = range[j]; rangea3 = range[k]; rangea4 = range[l]; rangeb1 = l2*l3*l4; rangeb2 = l3*l4; rangeb3 = l4; rangeb4 = 1; // here vectorization can rely on the compiler if (l == 3) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia = ia3 + j4*rangea4; ib = ib3 + j4*rangeb4; sorted[ib] = unsorted[ia] * factor; } } } } } if (k == 3) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3 += tilesize) { for (j4 = 0; j4 < l4; j4 += tilesize) { jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize; for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) { ia3 = ia2 + jj3*rangea3; ib3 = ib2 + jj3*rangeb3; jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia = ia3 + jj4*rangea4; ib = ib3 + jj4*rangeb4; N1 = rangeb3; N2 = rangea4; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } if (j == 3) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2 += tilesize) { for (j3 = 0; j3 < l3; j3++) { ia3 = ia1 + j3*rangea3; ib3 = ib1 + j3*rangeb3; for (j4 = 0; j4 < l4; j4 += tilesize) { jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize; for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) { ia2 = ia3 + jj2*rangea2; ib2 = ib3 + jj2*rangeb2; jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia = ia2 + jj4*rangea4; ib = ib2 + jj4*rangeb4; N1 = rangeb2; N2 = rangea4; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } if (i == 3) { for (j1 = 0; j1 < l1; j1 += tilesize) { for (j2 = 0; j2 < l2; j2++) { ia2 = j2*rangea2; ib2 = j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4 += tilesize) { jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize; for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) { ia1 = ia3 + jj1*rangea1; ib1 = ib3 + jj1*rangeb1; jj4_bound = (j4 + tilesize > l4)? 
l4:j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia = ia1 + jj4*rangea4; ib = ib1 + jj4*rangeb4; N1 = rangeb1; N2 = rangea4; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } }
inline int FloatToInt( double x ) { return _mm_cvttsd_si32( _mm_load_sd( &x) ); }
static inline int lrint(double d) { return _mm_cvtsd_si32(_mm_load_sd(&d)); }
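/* FloatToInt and lrint above differ only in the conversion instruction: _mm_cvttsd_si32
   (CVTTSD2SI) truncates toward zero, while _mm_cvtsd_si32 (CVTSD2SI) uses the current MXCSR
   rounding mode, round-to-nearest-even by default. A minimal comparison, assuming the default
   rounding mode has not been changed: */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    const double v[] = { 2.5, 3.5, -1.7 };
    for (int i = 0; i < 3; i++) {
        int t = _mm_cvttsd_si32(_mm_load_sd(&v[i]));  /* truncate toward zero */
        int r = _mm_cvtsd_si32(_mm_load_sd(&v[i]));   /* round to nearest even */
        printf("%5.1f -> trunc %2d, rint %2d\n", v[i], t, r);
    }
    return 0;
}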
// it moves vertically across blocks void kernel_dtrmv_u_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; double *tA, *tx; int k; __m256d tmp0, a_00_10_20_30, x_0_1_2_3, y_00; y_00 = _mm256_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); A += 4 + (sda-1)*lda; x += 4; } __m128d tm0, a_00_10, a_01_11, x_0_1, y_0, y_1, y_0_1; tm0 = _mm256_extractf128_pd( y_00, 0x1 ); y_0 = _mm256_castpd256_pd128( y_00 ); y_0 = _mm_add_pd( y_0, tm0 ); if(k<kmax-1) { x_0_1 = _mm_loadu_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); tm0 = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd( y_0, tm0 ); A += 2; x += 2; } x_0_1 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tm0 = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd( y_0, tm0 ); y_0 = _mm_hadd_pd( y_0, y_0 ); if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_add_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_sub_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } }
// it moves horizontally inside a block void kernel_dtrmv_u_n_2_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0, x_1, x_2, x_3, y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1; /* y_0_1 = _mm_setzero_pd(); */ // second col (avoid zero y_0_1) x_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, x_0 ); // first col x_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); ax_temp = _mm_mul_sd( a_00_10, x_0 ); y_0_1 = _mm_add_sd( y_0_1, ax_temp ); A += 2*lda; x += 2; k=2; for(; k<kmax-1; k+=2) { x_0 = _mm_loaddup_pd( &x[0] ); x_1 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); a_01_11 = _mm_load_pd( &A[0+lda*1] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_1 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); A += 2*lda; x += 2; } if(kmax%2==1) { x_0 = _mm_loaddup_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); ax_temp = _mm_mul_pd( a_00_10, x_0 ); y_0_1 = _mm_add_pd( y_0_1, ax_temp ); } if(alg==0) { _mm_storeu_pd(&y[0], y_0_1); } else if(alg==1) { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_add_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } else // alg==-1 { z_0_1 = _mm_loadu_pd( &y[0] ); z_0_1 = _mm_sub_pd( z_0_1, y_0_1 ); _mm_storeu_pd(&y[0], z_0_1); } }
static inline __m128d my_invrsq_pd(__m128d x) { const __m128d three = (const __m128d) {3.0f, 3.0f}; const __m128d half = (const __m128d) {0.5f, 0.5f}; __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */ __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */ /* First Newton-Rapson step, accuracy is now 24 bits */ __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1))))); /* Return second Newton-Rapson step, accuracy 48 bits */ return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2))))); } /* to extract single integers from a __m128i datatype */ #define _mm_extract_epi64(x, imm) \ _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm))) void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * Vc, int * type, int * p_ntype, double * vdwparam, double * Vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads,offset; int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid; double facel,krf,crf,tabscl,gbtabscl,vct,vgbt; double shX,shY,shZ,isai_d,dva; gmx_gbdata_t *gbdata; float * gpol; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3; __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2; __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj; __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d; __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d fac,tabscale,gbtabscale; __m128i n0,nnn; const __m128d neg = {-1.0f,-1.0f}; const __m128d zero = {0.0f,0.0f}; const __m128d half = {0.5f,0.5f}; const __m128d two = {2.0f,2.0f}; const __m128d three = {3.0f,3.0f}; gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent)); krf = *p_krf; crf = *p_crf; tabscl = *p_tabscale; gbtabscl = *p_gbtabscale; nj1 = 0; /* Splat variables */ fac = _mm_load1_pd(&facel); tabscale = _mm_load1_pd(&tabscl); gbtabscale = _mm_load1_pd(&gbtabscl); /* Keep compiler happy */ dvdatmp = _mm_setzero_pd(); vgb = _mm_setzero_pd(); dvdaj = _mm_setzero_pd(); isaj = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); t1 = _mm_setzero_pd(); t2 = _mm_setzero_pd(); t3 = _mm_setzero_pd(); jnr1=jnr2=0; j13=j23=0; for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; offset = (nj1-nj0)%2; ii = iinr[n]; ii3 = ii*3; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shX+pos[ii3+1]); iz = _mm_set1_pd(shX+pos[ii3+2]); q = _mm_set1_pd(charge[ii]); iq = _mm_mul_pd(fac,q); isai_d = invsqrta[ii]; isai = _mm_load1_pd(&isai_d); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = 
_mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); jz = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* distances */ dx = _mm_sub_pd(ix,jx); dy = _mm_sub_pd(iy,jy); dz = _mm_sub_pd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); /* Load invsqrta */ isaj = _mm_loadl_pd(isaj,invsqrta+jnr1); isaj = _mm_loadh_pd(isaj,invsqrta+jnr2); isaprod = _mm_mul_pd(isai,isaj); /* Load charges */ q = _mm_loadl_pd(q,charge+jnr1); q = _mm_loadh_pd(q,charge+jnr2); qq = _mm_mul_pd(iq,q); vcoul = _mm_mul_pd(qq,rinv); fscal = _mm_mul_pd(vcoul,rinv); qq = _mm_mul_pd(isaprod,qq); qq = _mm_mul_pd(qq,neg); gbscale = _mm_mul_pd(isaprod,gbtabscale); /* Load dvdaj */ dvdaj = _mm_loadl_pd(dvdaj, dvda+jnr1); dvdaj = _mm_loadh_pd(dvdaj, dvda+jnr2); r = _mm_mul_pd(rsq11,rinv); rt = _mm_mul_pd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_pd(rt,n0d); eps2 = _mm_mul_pd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); /* Y1 F1 */ xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); /* Y2 F2 */ xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */ xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */ Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */ F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */ G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */ H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */ G = _mm_mul_pd(G,eps); H = _mm_mul_pd(H,eps2); Fp = _mm_add_pd(F,G); Fp = _mm_add_pd(Fp,H); VV = _mm_mul_pd(Fp,eps); VV = _mm_add_pd(Y,VV); H = _mm_mul_pd(two,H); FF = _mm_add_pd(Fp,G); FF = _mm_add_pd(FF,H); vgb = _mm_mul_pd(qq,VV); fijC = _mm_mul_pd(qq,FF); fijC = _mm_mul_pd(fijC,gbscale); dvdatmp = _mm_mul_pd(fijC,r); dvdatmp = _mm_add_pd(vgb,dvdatmp); dvdatmp = _mm_mul_pd(dvdatmp,neg); dvdatmp = _mm_mul_pd(dvdatmp,half); dvdasum = _mm_add_pd(dvdasum,dvdatmp); xmm1 = _mm_mul_pd(dvdatmp,isaj); xmm1 = _mm_mul_pd(xmm1,isaj); dvdaj = _mm_add_pd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); _mm_storeh_pd(dvda+jnr2,dvdaj); vctot = _mm_add_pd(vctot,vcoul); vgbtot = _mm_add_pd(vgbtot,vgb); fscal = _mm_sub_pd(fijC,fscal); fscal = _mm_mul_pd(fscal,neg); fscal = _mm_mul_pd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_pd(fscal,dx); t2 = _mm_mul_pd(fscal,dy); t3 = _mm_mul_pd(fscal,dz); /* update the i force */ fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); /* accumulate forces from memory */ xmm1 = _mm_loadu_pd(faction+j13); /* fx1 fy1 */ xmm2 = _mm_loadu_pd(faction+j23); /* fx2 fy2 */ xmm5 = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */ xmm6 = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */ /* transpose */ xmm7 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */ xmm5 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */ xmm6 = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* subtract partial forces */ xmm5 = _mm_sub_pd(xmm5,t1); xmm6 = _mm_sub_pd(xmm6,t2); xmm7 = _mm_sub_pd(xmm7,t3); xmm1 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */ xmm2 = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */ /* store fx and fy */ _mm_storeu_pd(faction+j13,xmm1); _mm_storeu_pd(faction+j23,xmm2); /* .. 
then fz */ _mm_storel_pd(faction+j13+2,xmm7); _mm_storel_pd(faction+j23+2,xmm7); } /* In double precision, offset can only be either 0 or 1 */ if(offset!=0) { jnr1 = jjnr[k]; j13 = jnr1*3; jx = _mm_load_sd(pos+j13); jy = _mm_load_sd(pos+j13+1); jz = _mm_load_sd(pos+j13+2); isaj = _mm_load_sd(invsqrta+jnr1); isaprod = _mm_mul_sd(isai,isaj); dvdaj = _mm_load_sd(dvda+jnr1); q = _mm_load_sd(charge+jnr1); qq = _mm_mul_sd(iq,q); dx = _mm_sub_sd(ix,jx); dy = _mm_sub_sd(iy,jy); dz = _mm_sub_sd(iz,jz); rsq11 = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) ); rinv = my_invrsq_pd(rsq11); vcoul = _mm_mul_sd(qq,rinv); fscal = _mm_mul_sd(vcoul,rinv); qq = _mm_mul_sd(isaprod,qq); qq = _mm_mul_sd(qq,neg); gbscale = _mm_mul_sd(isaprod,gbtabscale); r = _mm_mul_sd(rsq11,rinv); rt = _mm_mul_sd(r,gbscale); n0 = _mm_cvttpd_epi32(rt); n0d = _mm_cvtepi32_pd(n0); eps = _mm_sub_sd(rt,n0d); eps2 = _mm_mul_sd(eps,eps); nnn = _mm_slli_epi64(n0,2); xmm1 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); xmm2 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); xmm3 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); xmm4 = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); Y = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); F = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); G = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); H = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); G = _mm_mul_sd(G,eps); H = _mm_mul_sd(H,eps2); Fp = _mm_add_sd(F,G); Fp = _mm_add_sd(Fp,H); VV = _mm_mul_sd(Fp,eps); VV = _mm_add_sd(Y,VV); H = _mm_mul_sd(two,H); FF = _mm_add_sd(Fp,G); FF = _mm_add_sd(FF,H); vgb = _mm_mul_sd(qq,VV); fijC = _mm_mul_sd(qq,FF); fijC = _mm_mul_sd(fijC,gbscale); dvdatmp = _mm_mul_sd(fijC,r); dvdatmp = _mm_add_sd(vgb,dvdatmp); dvdatmp = _mm_mul_sd(dvdatmp,neg); dvdatmp = _mm_mul_sd(dvdatmp,half); dvdasum = _mm_add_sd(dvdasum,dvdatmp); xmm1 = _mm_mul_sd(dvdatmp,isaj); xmm1 = _mm_mul_sd(xmm1,isaj); dvdaj = _mm_add_sd(dvdaj,xmm1); /* store dvda */ _mm_storel_pd(dvda+jnr1,dvdaj); vctot = _mm_add_sd(vctot,vcoul); vgbtot = _mm_add_sd(vgbtot,vgb); fscal = _mm_sub_sd(fijC,fscal); fscal = _mm_mul_sd(fscal,neg); fscal = _mm_mul_sd(fscal,rinv); /* calculate partial force terms */ t1 = _mm_mul_sd(fscal,dx); t2 = _mm_mul_sd(fscal,dy); t3 = _mm_mul_sd(fscal,dz); /* update the i force */ fix = _mm_add_sd(fix,t1); fiy = _mm_add_sd(fiy,t2); fiz = _mm_add_sd(fiz,t3); /* accumulate forces from memory */ xmm5 = _mm_load_sd(faction+j13); /* fx */ xmm6 = _mm_load_sd(faction+j13+1); /* fy */ xmm7 = _mm_load_sd(faction+j13+2); /* fz */ /* subtract partial forces */ xmm5 = _mm_sub_sd(xmm5,t1); xmm6 = _mm_sub_sd(xmm6,t2); xmm7 = _mm_sub_sd(xmm7,t3); /* store forces */ _mm_store_sd(faction+j13,xmm5); _mm_store_sd(faction+j13+1,xmm6); _mm_store_sd(faction+j13+2,xmm7); } /* fix/fiy/fiz now contain four partial terms, that all should be * added to the i particle forces */ t1 = _mm_unpacklo_pd(t1,fix); t2 = _mm_unpacklo_pd(t2,fiy); t3 = _mm_unpacklo_pd(t3,fiz); fix = _mm_add_pd(fix,t1); fiy = _mm_add_pd(fiy,t2); fiz = _mm_add_pd(fiz,t3); fix = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1)); fiy = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1)); fiz = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1)); /* Load i forces from memory */ xmm1 = _mm_load_sd(faction+ii3); xmm2 = _mm_load_sd(faction+ii3+1); xmm3 = _mm_load_sd(faction+ii3+2); /* Add to i force */ fix = _mm_add_sd(fix,xmm1); fiy = _mm_add_sd(fiy,xmm2); fiz = _mm_add_sd(fiz,xmm3); /* store i forces to memory */ _mm_store_sd(faction+ii3,fix); _mm_store_sd(faction+ii3+1,fiy); 
_mm_store_sd(faction+ii3+2,fiz); /* now do dvda */ dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum); dvdasum = _mm_add_pd(dvdasum,dvdatmp); _mm_storeh_pd(&dva,dvdasum); dvda[ii] = dvda[ii] + dva*isai_d*isai_d; ggid = gid[n]; /* Coulomb potential */ vcoul = _mm_unpacklo_pd(vcoul,vctot); vctot = _mm_add_pd(vctot,vcoul); _mm_storeh_pd(&vct,vctot); Vc[ggid] = Vc[ggid] + vct; /* GB potential */ vgb = _mm_unpacklo_pd(vgb,vgbtot); vgbtot = _mm_add_pd(vgbtot,vgb); _mm_storeh_pd(&vgbt,vgbtot); gpol[ggid] = gpol[ggid] + vgbt; } *outeriter = nri; *inneriter = nj1; }
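/* A standalone sketch (not GROMACS code) of the idea behind the my_invrsq_pd helper at the top
   of the kernel above: seed 1/sqrt(x) with the single-precision RSQRTPS estimate, then apply the
   Newton-Raphson step y = 0.5*y*(3 - x*y*y) twice, taking roughly 12 correct bits to ~24 and
   then ~48. Constants are written as double literals here; the original used float literals. */
#include <emmintrin.h>
#include <math.h>
#include <stdio.h>

static __m128d invrsq_pd(__m128d x)
{
    const __m128d three = _mm_set1_pd(3.0);
    const __m128d half  = _mm_set1_pd(0.5);
    __m128d y = _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(x)));  /* ~12-bit estimate */
    y = _mm_mul_pd(half, _mm_mul_pd(y, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(y, y)))));  /* ~24 bits */
    y = _mm_mul_pd(half, _mm_mul_pd(y, _mm_sub_pd(three, _mm_mul_pd(x, _mm_mul_pd(y, y)))));  /* ~48 bits */
    return y;
}

int main(void)
{
    double in[2] = { 2.0, 123.456 }, out[2];
    _mm_storeu_pd(out, invrsq_pd(_mm_loadu_pd(in)));
    for (int i = 0; i < 2; i++)
        printf("1/sqrt(%g) = %.17g (rel.err %.3g)\n", in[i], out[i],
               fabs(out[i] * sqrt(in[i]) - 1.0));
    return 0;
}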
mlib_status __mlib_ImageBlendRGBA2ARGB( mlib_image *dst, const mlib_image *src) { mlib_type type; mlib_u8 *sl, *dl; mlib_s32 slb, dlb, nchan, width, height; mlib_s32 i, j, ii, off; P_TYPE *sp, *dp; P_TYPE ss, aa, ds, dd, d_h, d_l; P_TYPE mzero, const255, mask64, d_half; MLIB_IMAGE_CHECK(dst); MLIB_IMAGE_CHECK(src); MLIB_IMAGE_FULL_EQUAL(dst, src); MLIB_IMAGE_GET_ALL_PARAMS(dst, type, nchan, width, height, dlb, dl); slb = mlib_ImageGetStride(src); sl = mlib_ImageGetData(src); if (type != MLIB_BYTE || nchan != 4) { return (MLIB_FAILURE); } mzero = _mm_setzero_si128(); const255 = _mm_set1_epi32(0x00ff00ff); mask64 = _mm_set1_epi32(0xffffff00); d_half = _mm_set1_epi32(0x00800080); for (j = 0; j < height; j++) { P_TYPE alp, a0, a1, ralp, s0, s1, d0, d1, drnd; mlib_m128 s0u, s1u; sp = (void *)sl; dp = (void *)dl; if (!(((mlib_s32)sp | (mlib_s32)dp) & 15)) { for (i = 0; i < (width / 4); i++) { ss = _mm_load_si128(sp); dd = _mm_load_si128(dp); s0 = _mm_unpacklo_epi8(ss, mzero); a0 = _mm_shufflelo_epi16(s0, 0xff); a0 = _mm_shufflehi_epi16(a0, 0xff); s0 = _mm_shufflelo_epi16(s0, 0x93); s0 = _mm_shufflehi_epi16(s0, 0x93); BLEND(d_h, a0, s0, _mm_unpacklo_epi8(dd, mzero)); s1 = _mm_unpackhi_epi8(ss, mzero); a1 = _mm_shufflelo_epi16(s1, 0xff); a1 = _mm_shufflehi_epi16(a1, 0xff); s1 = _mm_shufflelo_epi16(s1, 0x93); s1 = _mm_shufflehi_epi16(s1, 0x93); BLEND(d_l, a1, s1, _mm_unpackhi_epi8(dd, mzero)); d_h = _mm_packus_epi16(d_h, d_l); d_h = _mm_or_si128(_mm_and_si128(mask64, d_h), _mm_andnot_si128(mask64, dd)); _mm_store_si128(dp, d_h); sp++; dp++; } } else { for (i = 0; i < (width / 4); i++) { #if 0 ss = _mm_loadu_si128(sp); s0 = _mm_unpacklo_epi8(ss, mzero); s1 = _mm_unpackhi_epi8(ss, mzero); #else s0u.m128d = _mm_load_sd((mlib_d64 *)sp); s1u.m128d = _mm_load_sd((mlib_d64 *)sp + 1); s0 = _mm_unpacklo_epi8(s0u.m128i, mzero); s1 = _mm_unpacklo_epi8(s1u.m128i, mzero); #endif dd = _mm_loadu_si128(dp); a0 = _mm_shufflelo_epi16(s0, 0xff); a0 = _mm_shufflehi_epi16(a0, 0xff); s0 = _mm_shufflelo_epi16(s0, 0x93); s0 = _mm_shufflehi_epi16(s0, 0x93); BLEND(d_h, a0, s0, _mm_unpacklo_epi8(dd, mzero)); a1 = _mm_shufflelo_epi16(s1, 0xff); a1 = _mm_shufflehi_epi16(a1, 0xff); s1 = _mm_shufflelo_epi16(s1, 0x93); s1 = _mm_shufflehi_epi16(s1, 0x93); BLEND(d_l, a1, s1, _mm_unpackhi_epi8(dd, mzero)); d_h = _mm_packus_epi16(d_h, d_l); d_h = _mm_or_si128(_mm_and_si128(mask64, d_h), _mm_andnot_si128(mask64, dd)); #if 1 _mm_storeu_si128(dp, d_h); #else s0u.m128i = d_h; s1u.m128i = _mm_shuffle_epi32(d_h, 0x3e); _mm_store_sd((mlib_d64 *)dp, s0u.m128d); _mm_store_sd((mlib_d64 *)dp + 1, s1u.m128d); #endif sp++; dp++; } } if (width & 3) { s0u.m128d = _mm_load_sd((mlib_d64 *)sp); s1u.m128d = _mm_load_sd((mlib_d64 *)sp + 1); s0 = _mm_unpacklo_epi8(s0u.m128i, mzero); s1 = _mm_unpacklo_epi8(s1u.m128i, mzero); dd = _mm_loadu_si128(dp); a0 = _mm_shufflelo_epi16(s0, 0xff); a0 = _mm_shufflehi_epi16(a0, 0xff); s0 = _mm_shufflelo_epi16(s0, 0x93); s0 = _mm_shufflehi_epi16(s0, 0x93); BLEND(d_h, a0, s0, _mm_unpacklo_epi8(dd, mzero)); a1 = _mm_shufflelo_epi16(s1, 0xff); a1 = _mm_shufflehi_epi16(a1, 0xff); s1 = _mm_shufflelo_epi16(s1, 0x93); s1 = _mm_shufflehi_epi16(s1, 0x93); BLEND(d_l, a1, s1, _mm_unpackhi_epi8(dd, mzero)); d_h = _mm_packus_epi16(d_h, d_l); d_h = _mm_or_si128(_mm_and_si128(mask64, d_h), _mm_andnot_si128(mask64, dd)); for (ii = 0; ii < (width & 3); ii++) { ((mlib_s32 *)dp)[ii] = ((mlib_s32 *)&d_h)[ii]; } } sl += slb; dl += dlb; } return (MLIB_SUCCESS); }
__m128d test_load_sd(void* y) { // CHECK: define {{.*}} @test_load_sd // CHECK: load double* {{.*}}, align 1{{$}} return _mm_load_sd(y); }
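A tiny stand-alone check (illustrative, not one of the compiler tests quoted here) of the semantics that the test above verifies: _mm_load_sd performs an unaligned load of one double into the low lane and zeroes the high lane, which is why the IR check only requires align 1.
#include <emmintrin.h>
#include <assert.h>

int main(void)
{
    double buf[3] = {0.0, 1.5, 2.5};
    __m128d v = _mm_load_sd(&buf[1]);   /* &buf[1] need not be 16-byte aligned */
    double out[2];
    _mm_storeu_pd(out, v);
    assert(out[0] == 1.5);              /* low lane: the loaded value */
    assert(out[1] == 0.0);              /* high lane: zeroed by _mm_load_sd */
    return 0;
}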
// it moves horizontally inside a block void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg) { if(kmax<=0) return; double *A1 = A0 + 4*sda; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_40_50_60_70, a_41_51_61_71, x_0, x_1, y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3, y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7; /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_4_5_6_7 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_4_5_6_7_b = _mm256_setzero_pd(); */ zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ /* y_0_1_2_3_d = _mm256_setzero_pd();*/ // upper triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A0[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A0[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc ); /* zero the upper lanes left undefined by the cast */ // fourth col (avoid zero y_0_1_2_3) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // third col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; // upper square block x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); // lower triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A1[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A1[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 ); y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc ); /* zero the upper lanes left undefined by the cast */ // fourth col (avoid zero y_4_5_6_7) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] ); y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // third col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; k=8; for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A0 + 4*lda );*/ /* __builtin_prefetch( A1 + 4*lda );*/ x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); /* __builtin_prefetch( A0 + 5*lda );*/ /* __builtin_prefetch( A1 + 5*lda );*/ x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; } if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 2*lda; A1 += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); /* A0 += 1*lda;*/ /* A1 += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } }
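For checking the kernel above against plain C, a scalar reference is sketched below; the layout it assumes (rows 0-3 in the panel at A0, rows 4-7 in the panel at A1 = A0 + 4*sda, element (i,j) of a panel at index i + 4*j, leading 8x8 block upper triangular, alg selecting store/add/subtract) is inferred from the kernel itself, not taken from the library's documentation.
/* Illustrative scalar reference for kernel_dtrmv_u_n_8_lib4 under the layout
   assumptions stated above (hypothetical helper, not library code). */
static void ref_dtrmv_u_n_8(int kmax, const double *A0, int sda,
                            const double *x, double *y, int alg)
{
    const double *A1 = A0 + 4 * sda;
    double t[8] = { 0.0 };
    int i, j;
    for (j = 0; j < kmax; j++) {
        for (i = 0; i < 8; i++) {
            if (j < 8 && i > j)
                continue;   /* below-diagonal entries of the 8x8 head are zero */
            t[i] += ((i < 4) ? A0[i + 4 * j] : A1[i - 4 + 4 * j]) * x[j];
        }
    }
    for (i = 0; i < 8; i++)
        y[i] = (alg == 0) ? t[i] : (alg == 1) ? y[i] + t[i] : y[i] - t[i];
}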
// it moves horizontally inside a block (A upper triangular) void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0, x_1, x_2, x_3, y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3; zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ y_0_1_2_3_d = _mm256_setzero_pd(); // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc ); // fourth col (avoid zero y_0_1_2_3) x_3 = _mm256_broadcast_sd( &x[3] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 ); // third col x_2 = _mm256_broadcast_sd( &x[2] ); x_2 = _mm256_blend_pd( x_2, zeros, 0x8 ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 ); A += 4*lda; x += 4; k=4; for(; k<kmax-3; k+=4) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); x_2 = _mm256_broadcast_sd( &x[2] ); x_3 = _mm256_broadcast_sd( &x[3] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 ); y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp ); ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 ); y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp ); A += 4*lda; x += 4; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d ); if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } }
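The corresponding scalar sketch for the 4-row kernel above, under the same assumed panel layout (element (i,j) at A[i + 4*j], leading 4x4 block upper triangular, alg selecting store/add/subtract); again a hypothetical reference for testing, not code from the library.
/* Illustrative scalar reference for kernel_dtrmv_u_n_4_lib4 (hypothetical). */
static void ref_dtrmv_u_n_4(int kmax, const double *A,
                            const double *x, double *y, int alg)
{
    double t[4] = { 0.0 };
    int i, j;
    for (j = 0; j < kmax; j++)
        for (i = 0; i < 4; i++)
            if (j >= 4 || i <= j)   /* leading 4x4 block is upper triangular */
                t[i] += A[i + 4 * j] * x[j];
    for (i = 0; i < 4; i++)
        y[i] = (alg == 0) ? t[i] : (alg == 1) ? y[i] + t[i] : y[i] - t[i];
}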