dcomplex zdotc_( int* n, dcomplex* x, int* inc_x, dcomplex* z, int* inc_z )
{
	dcomplex* restrict x1;
	dcomplex* restrict z1;
	int       i;
	v2df_t    rho1v;
	v2df_t    z11v, z12v;
	v2df_t    x1v, x1rv;
	dcomplex  rho;
	int       n1   = *n;
	int       incx = *inc_x;
	int       incz = *inc_z;

	x1 = x;
	z1 = z;

	rho1v.v = _mm_setzero_pd();

	{
		v2df_t bcac, adbd;

		for ( i = 0; i < n1; ++i )
		{
			// Broadcast the real and imaginary parts of *z1.
			z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) );
			z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) );

			// Load [x.real, x.imag] and a swapped copy of it.
			x1v.v  = _mm_load_pd( ( double* )x1 );
			x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) );

			// Accumulate the partial products of conj(x1) * z1.
			bcac.v  = x1rv.v * z11v.v;
			adbd.v  = x1v.v  * z12v.v;
			rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v );

			x1 += incx;
			z1 += incz;
		}

		// Move the real part into lane 0 and fix the sign of the
		// imaginary part.
		rho1v.v    = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) );
		rho1v.d[1] = -rho1v.d[1];
	}

	rho.real = rho1v.d[0];
	rho.imag = rho1v.d[1];

	return rho;
}
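The kernel above leans on two definitions the snippet does not show. A minimal sketch of them, assuming the usual BLIS-style layout (my reconstruction, not the library's actual headers); note that the `_mm_load_pd` in the loop additionally requires `x` to be 16-byte aligned.

#include <pmmintrin.h>

/* Assumed complex type: real part first, imaginary part second. */
typedef struct { double real; double imag; } dcomplex;

/* Assumed union giving both vector and per-element views of an XMM
   register, as used by rho1v.d[0] / rho1v.d[1] above. */
typedef union
{
	__m128d v;
	double  d[2];
} v2df_t;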
__m128d test_mm_loaddup_pd(double const* P) {
  // CHECK-LABEL: test_mm_loaddup_pd
  // CHECK: load double*
  // CHECK: insertelement <2 x double> undef, double %{{.*}}, i32 0
  // CHECK: insertelement <2 x double> %{{.*}}, double %{{.*}}, i32 1
  return _mm_loaddup_pd(P);
}
static void
sse3_test_movddup_mem (double *i1, double *r)
{
  __m128d t1 = _mm_loaddup_pd (i1);
  _mm_storeu_pd (r, t1);
}
Point operator*(double scalar, const Point &point) {
#ifdef __SSE3__
  return Point(point.v * _mm_loaddup_pd(&scalar));
#else
  return Point(point.x * scalar, point.y * scalar);
#endif
}
Point Point::operator/(double scalar) const {
#ifdef __SSE3__
  return Point(v / _mm_loaddup_pd(&scalar));
#else
  return Point(x / scalar, y / scalar);
#endif
}
// from Intel's sample intrin_double_sample.c
void multiply_SSE3(double xr, double xi, double yr, double yi,
                   complex_num *z)
{
    __m128d num1, num2, num3;

    // Duplicates lower vector element into upper vector element.
    //   num1: [x.real, x.real]
    num1 = _mm_loaddup_pd(&xr);

    // Move y elements into a vector
    //   num2: [y.img, y.real]
    num2 = _mm_set_pd(yi, yr);

    // Multiplies vector elements
    //   num3: [(x.real*y.img), (x.real*y.real)]
    num3 = _mm_mul_pd(num2, num1);

    //   num1: [x.img, x.img]
    num1 = _mm_loaddup_pd(&xi);

    // Swaps the vector elements
    //   num2: [y.real, y.img]
    num2 = _mm_shuffle_pd(num2, num2, 1);

    //   num2: [(x.img*y.real), (x.img*y.img)]
    num2 = _mm_mul_pd(num2, num1);

    // Adds upper vector element while subtracting lower vector element
    //   num3: [((x.real*y.img)+(x.img*y.real)),
    //          ((x.real*y.real)-(x.img*y.img))]
    num3 = _mm_addsub_pd(num3, num2);

    // Stores the elements of num3 into z
    _mm_storeu_pd((double *)z, num3);
}
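A small driver for the sample above, assuming `complex_num` is a plain real-first pair of doubles (which is what the final `_mm_storeu_pd` requires); (1 + 2i)(3 + 4i) should print -5 + 10i.

#include <pmmintrin.h>
#include <stdio.h>

typedef struct { double real, imag; } complex_num;  /* assumed layout */

void multiply_SSE3(double xr, double xi, double yr, double yi, complex_num *z);

int main(void)
{
    complex_num z;
    multiply_SSE3(1.0, 2.0, 3.0, 4.0, &z);   /* (1+2i)*(3+4i) */
    printf("%f + %fi\n", z.real, z.imag);    /* expected: -5 + 10i */
    return 0;
}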
Point &Point::operator/=(double scalar) {
#ifdef __SSE3__
  v /= _mm_loaddup_pd(&scalar);
#else
  x /= scalar;
  y /= scalar;
#endif
  return *this;
}
static NPT_INLINE __m128d npt_mm_loaddup_pd(const double* ptr)
{
    return _mm_loaddup_pd(ptr);
}
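Wrappers like this usually exist so the SSE3 instruction can be compiled out on SSE2-only targets; a hedged sketch of such a fallback (`NPT_INLINE` kept from the snippet and assumed to expand to an inline qualifier). On SSE2, `_mm_load1_pd` gives the same broadcast semantics as `_mm_loaddup_pd`, just without the single-instruction MOVDDUP.

#ifdef __SSE3__
#include <pmmintrin.h>
static NPT_INLINE __m128d npt_mm_loaddup_pd(const double* ptr)
{
    return _mm_loaddup_pd(ptr);   /* one MOVDDUP */
}
#else
#include <emmintrin.h>
static NPT_INLINE __m128d npt_mm_loaddup_pd(const double* ptr)
{
    return _mm_load1_pd(ptr);     /* load + duplicate on SSE2 */
}
#endif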
void bl1_ddotaxmyv2( int       n,
                     double*   alpha,
                     double*   beta,
                     double*   x, int inc_x,
                     double*   u, int inc_u,
                     double*   rho,
                     double*   y, int inc_y,
                     double*   z, int inc_z )
#if BLIS1_VECTOR_INTRINSIC_TYPE == BLIS1_SSE_INTRINSICS
{
	double* restrict chi1;
	double* restrict upsilon1;
	double* restrict psi1;
	double* restrict zeta1;
	double           rho_c;
	int              i;
	int              n_pre;
	int              n_run;
	int              n_left;

	v2df_t           a1v, b1v;
	v2df_t           rho1v;
	v2df_t           x1v, u1v, y1v, z1v;

	if ( inc_x != 1 || inc_u != 1 || inc_y != 1 || inc_z != 1 ) bl1_abort();

	// Peel off one element if z is not 16-byte aligned; in that case
	// the other operands must be misaligned by the same amount.
	n_pre = 0;
	if ( ( unsigned long ) z % 16 != 0 )
	{
		if ( ( unsigned long ) x % 16 == 0 ||
		     ( unsigned long ) u % 16 == 0 ||
		     ( unsigned long ) y % 16 == 0 ) bl1_abort();

		n_pre = 1;
	}

	n_run  = ( n - n_pre ) / 2;
	n_left = ( n - n_pre ) % 2;

	chi1     = x;
	upsilon1 = u;
	psi1     = y;
	zeta1    = z;

	rho_c = 0.0;

	if ( n_pre == 1 )
	{
		double alpha_c   = *alpha;
		double beta_c    = *beta;
		double chi1_c    = *chi1;
		double upsilon_c = *upsilon1;

		rho_c  += chi1_c * upsilon_c;
		*psi1  -= alpha_c * chi1_c;
		*zeta1 -= beta_c  * chi1_c;

		chi1     += inc_x;
		upsilon1 += inc_u;
		psi1     += inc_y;
		zeta1    += inc_z;
	}

	// Broadcast alpha and beta into both lanes.
	a1v.v = _mm_loaddup_pd( ( double* )alpha );
	b1v.v = _mm_loaddup_pd( ( double* )beta );

	rho1v.v = _mm_setzero_pd();

	for ( i = 0; i < n_run; ++i )
	{
		x1v.v = _mm_load_pd( ( double* )chi1 );
		u1v.v = _mm_load_pd( ( double* )upsilon1 );
		y1v.v = _mm_load_pd( ( double* )psi1 );
		z1v.v = _mm_load_pd( ( double* )zeta1 );

		rho1v.v += x1v.v * u1v.v;
		y1v.v   -= a1v.v * x1v.v;
		z1v.v   -= b1v.v * x1v.v;

		_mm_store_pd( ( double* )psi1,  y1v.v );
		_mm_store_pd( ( double* )zeta1, z1v.v );

		chi1     += 2;
		upsilon1 += 2;
		psi1     += 2;
		zeta1    += 2;
	}

	rho_c += rho1v.d[0] + rho1v.d[1];

	if ( n_left > 0 )
	{
		double alpha_c = *alpha;
		double beta_c  = *beta;

		for ( i = 0; i < n_left; ++i )
		{
			double chi1_c    = *chi1;
			double upsilon_c = *upsilon1;

			rho_c  += chi1_c * upsilon_c;
			*psi1  -= alpha_c * chi1_c;
			*zeta1 -= beta_c  * chi1_c;

			chi1     += inc_x;
			upsilon1 += inc_u;
			psi1     += inc_y;
			zeta1    += inc_z;
		}
	}

	*rho = rho_c;
}
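For reference, the fused operation the kernel implements, written out in scalar form (a sketch for the unit-stride case the intrinsic path handles):

/* Scalar sketch of bl1_ddotaxmyv2 for unit strides:
   rho = x' * u;  y = y - alpha * x;  z = z - beta * x. */
void ddotaxmyv2_ref( int n, double alpha, double beta,
                     double* x, double* u,
                     double* rho, double* y, double* z )
{
	double rho_c = 0.0;
	int    i;

	for ( i = 0; i < n; ++i )
	{
		rho_c += x[i] * u[i];
		y[i]  -= alpha * x[i];
		z[i]  -= beta  * x[i];
	}

	*rho = rho_c;
}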
void AddDot4x4( int k, double *a, int lda, double *b, int ldb,
                double *c, int ldc )
{
  /* So, this routine computes a 4x4 block of matrix C

           C( 0, 0 ), C( 0, 1 ), C( 0, 2 ), C( 0, 3 ).
           C( 1, 0 ), C( 1, 1 ), C( 1, 2 ), C( 1, 3 ).
           C( 2, 0 ), C( 2, 1 ), C( 2, 2 ), C( 2, 3 ).
           C( 3, 0 ), C( 3, 1 ), C( 3, 2 ), C( 3, 3 ).

     Notice that this routine is called with c = C( i, j ) in the
     previous routine, so these are actually the elements

           C( i  , j ), C( i  , j+1 ), C( i  , j+2 ), C( i  , j+3 )
           C( i+1, j ), C( i+1, j+1 ), C( i+1, j+2 ), C( i+1, j+3 )
           C( i+2, j ), C( i+2, j+1 ), C( i+2, j+2 ), C( i+2, j+3 )
           C( i+3, j ), C( i+3, j+1 ), C( i+3, j+2 ), C( i+3, j+3 )

     in the original matrix C

     And now we use vector registers and instructions */

  int p;
  v2df_t
    c_00_c_10_vreg, c_01_c_11_vreg, c_02_c_12_vreg, c_03_c_13_vreg,
    c_20_c_30_vreg, c_21_c_31_vreg, c_22_c_32_vreg, c_23_c_33_vreg,
    a_0p_a_1p_vreg, a_2p_a_3p_vreg,
    b_p0_vreg, b_p1_vreg, b_p2_vreg, b_p3_vreg;

  c_00_c_10_vreg.v = _mm_setzero_pd();
  c_01_c_11_vreg.v = _mm_setzero_pd();
  c_02_c_12_vreg.v = _mm_setzero_pd();
  c_03_c_13_vreg.v = _mm_setzero_pd();
  c_20_c_30_vreg.v = _mm_setzero_pd();
  c_21_c_31_vreg.v = _mm_setzero_pd();
  c_22_c_32_vreg.v = _mm_setzero_pd();
  c_23_c_33_vreg.v = _mm_setzero_pd();

  for ( p=0; p<k; p++ ){
    a_0p_a_1p_vreg.v = _mm_load_pd( (double *) a );
    a_2p_a_3p_vreg.v = _mm_load_pd( (double *) ( a+2 ) );
    a += 4;

    b_p0_vreg.v = _mm_loaddup_pd( (double *) b );       /* load and duplicate */
    b_p1_vreg.v = _mm_loaddup_pd( (double *) (b+1) );   /* load and duplicate */
    b_p2_vreg.v = _mm_loaddup_pd( (double *) (b+2) );   /* load and duplicate */
    b_p3_vreg.v = _mm_loaddup_pd( (double *) (b+3) );   /* load and duplicate */
    b += 4;

    /* First and second rows */
    c_00_c_10_vreg.v += a_0p_a_1p_vreg.v * b_p0_vreg.v;
    c_01_c_11_vreg.v += a_0p_a_1p_vreg.v * b_p1_vreg.v;
    c_02_c_12_vreg.v += a_0p_a_1p_vreg.v * b_p2_vreg.v;
    c_03_c_13_vreg.v += a_0p_a_1p_vreg.v * b_p3_vreg.v;

    /* Third and fourth rows */
    c_20_c_30_vreg.v += a_2p_a_3p_vreg.v * b_p0_vreg.v;
    c_21_c_31_vreg.v += a_2p_a_3p_vreg.v * b_p1_vreg.v;
    c_22_c_32_vreg.v += a_2p_a_3p_vreg.v * b_p2_vreg.v;
    c_23_c_33_vreg.v += a_2p_a_3p_vreg.v * b_p3_vreg.v;
  }

  C( 0, 0 ) += c_00_c_10_vreg.d[0];  C( 0, 1 ) += c_01_c_11_vreg.d[0];
  C( 0, 2 ) += c_02_c_12_vreg.d[0];  C( 0, 3 ) += c_03_c_13_vreg.d[0];

  C( 1, 0 ) += c_00_c_10_vreg.d[1];  C( 1, 1 ) += c_01_c_11_vreg.d[1];
  C( 1, 2 ) += c_02_c_12_vreg.d[1];  C( 1, 3 ) += c_03_c_13_vreg.d[1];

  C( 2, 0 ) += c_20_c_30_vreg.d[0];  C( 2, 1 ) += c_21_c_31_vreg.d[0];
  C( 2, 2 ) += c_22_c_32_vreg.d[0];  C( 2, 3 ) += c_23_c_33_vreg.d[0];

  C( 3, 0 ) += c_20_c_30_vreg.d[1];  C( 3, 1 ) += c_21_c_31_vreg.d[1];
  C( 3, 2 ) += c_22_c_32_vreg.d[1];  C( 3, 3 ) += c_23_c_33_vreg.d[1];
}
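The kernel assumes the tutorial's column-major macro, something like `#define C(i,j) c[ (j)*ldc + (i) ]`, plus a `v2df_t` union like the one sketched earlier. In scalar terms, each iteration over p adds the outer product of a packed column of A and a packed row of B into the 4x4 block (a sketch, assuming the same packed panel layout):

/* Scalar sketch of AddDot4x4: C(0:3,0:3) += sum_p a(:,p) * b(p,:),
   with A and B already packed in 4-wide panels. */
void AddDot4x4_ref( int k, double *a, int lda, double *b, int ldb,
                    double *c, int ldc )
{
  int i, j, p;

  for ( p = 0; p < k; p++ )
    for ( j = 0; j < 4; j++ )
      for ( i = 0; i < 4; i++ )
        c[ j*ldc + i ] += a[ p*4 + i ] * b[ p*4 + j ];
}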
CPS_START_NAMESPACE
//--------------------------------------------------------------------
// CVS keywords
//
// $Source: /home/chulwoo/CPS/repo/CVS/cps_only/cps_pp/src/util/dirac_op/d_op_dwf/sse/dwf_dslash_5_plus.C,v $
// $State: Exp $
//
//--------------------------------------------------------------------
//------------------------------------------------------------------
//
// dwf_dslash_5_plus.C
//
// dwf_dslash_5_plus is the derivative part of the 5th direction
// part of the fermion matrix. This routine accumulates the result
// on the out field.
// The in, out fields are defined on the checkerboard lattice.
// The action of this operator is the same for even/odd
// checkerboard fields because there is no gauge field along
// the 5th direction.
// dag = 0/1 <--> Dslash/Dslash^dagger is calculated.
//
//
// Storage order for DWF fermions
//------------------------------------------------------------------
//
//  |     |
//  | |r| |
//  | |i| |
//  |     |
//  | |r| |  =  |spin comp|
//  | |i| |
//  |     |
//  | |r| |
//  | |i| |
//  |     |
//
//
//  |             |
//  | |spin comp| |
//  |             |
//  |             |
//  | |spin comp| |
//  |             |  =  |spinor|
//  |             |
//  | |spin comp| |
//  |             |
//  |             |
//  | |spin comp| |
//  |             |
//
//
//  |             |
//  | |spinor|    |
//  | |spinor|    |
//  | |spinor|    |
//  | |spinor|    |
//  | |spinor|    |
//  |    .        |  =  |s-block|   The spinors are arranged in Wilson
//  |    .        |                 order with odd - even 4d-checkerboard
//  |    .        |                 storage.
//  | evn/odd vol |
//  |    .        |
//  | |spinor|    |
//  |             |
//
//
//  |                 |
//  | |s-block even|  |    For even checkerboard
//  | |s-block odd|   |
//  | |s-block even|  |
//  | |s-block odd|   |
//  |        .        |
//  |        .        |
//  |        .        |
//  |                 |
//
//
//  |                 |
//  | |s-block odd|   |    For odd checkerboard
//  | |s-block even|  |
//  | |s-block odd|   |
//  | |s-block even|  |
//  |        .        |
//  |        .        |
//  |        .        |
//  |                 |
//
//------------------------------------------------------------------
CPS_END_NAMESPACE

#include<util/dwf.h>
#include<util/gjp.h>
#include<util/dirac_op.h>
#include<util/vector.h>
#include<util/verbose.h>
#include<util/error.h>
#include<util/smalloc.h>
#include<util/qblas_extend.h>
#include<comms/scu.h>
CPS_START_NAMESPACE

void dwf_dslash_5_plus_dag0(Vector *out,
                            Vector *in,
                            Float mass,
                            Dwf *dwf_lib_arg)
{
#pragma omp parallel default(shared)
  {
    // Initializations
    //------------------------------------------------------------------
    int idx;
    IFloat *f_in;
    IFloat *f_out;
    int x;
    int s;

    const IFloat two_over_a5          =  2.0 * GJP.DwfA5Inv();
    const IFloat neg_mass_two_over_a5 = -2.0 * mass * GJP.DwfA5Inv();
    const int local_ls    = GJP.SnodeSites();
    const int s_nodes     = GJP.Snodes();
    const int s_node_coor = GJP.SnodeCoor();
    const int vol_4d_cb   = dwf_lib_arg->vol_4d / 2;
    const int max_dex((local_ls-1)*vol_4d_cb);
    const int ls_stride = 24 * vol_4d_cb;
    IFloat *comm_buf = dwf_lib_arg->comm_buf;

    // [1 + gamma_5] term (if dag=1 [1 - gamma_5] term)
    //
    // out[s] = [1 + gamma_5] in[s-1]
    //------------------------------------------------------------------
    f_in  = (IFloat *) in;
    f_out = (IFloat *) out + ls_stride;

    register __m128d al;
    al  = _mm_loaddup_pd(&two_over_a5);
    register __m128d nal;
    nal = _mm_loaddup_pd(&neg_mass_two_over_a5);
    register __m128d x0, x1, x2, x3, x4, x5;
    register __m128d y0, y1, y2, y3, y4, y5;
    register __m128d z0, z1;

#define DAXPY( _A, _X, _Y, _x, _y )            \
    _x = _mm_load_pd( _X );                    \
    _y = _mm_load_pd( _Y );                    \
    _x = _mm_add_pd( _y, _mm_mul_pd(_A, _x) ); \
    _mm_store_pd( _Y, _x );

#define DIST_XY 6

#define DAXPY12( _A, _X, _Y )                             \
    DAXPY(_A, _X+0,  _Y+0,  x0, y0);                      \
    DAXPY(_A, _X+2,  _Y+2,  x1, y1);                      \
    DAXPY(_A, _X+4,  _Y+4,  x2, y2);                      \
    DAXPY(_A, _X+6,  _Y+6,  x3, y3);                      \
    DAXPY(_A, _X+8,  _Y+8,  x4, y4);                      \
    DAXPY(_A, _X+10, _Y+10, x5, y5);                      \
                                                          \
    _mm_prefetch((char *)(_X + DIST_XY*24), _MM_HINT_T0); \
    _mm_prefetch((char *)(_Y + DIST_XY*24), _MM_HINT_T0);

#pragma omp for schedule(static)
    for (idx=0;idx<max_dex;idx++) {
      // cblas_daxpy(12,two_over_a5,f_in+24*idx,f_out+24*idx);
      DAXPY12( al, f_in+24*idx, f_out+24*idx );
    }

    // [1 + gamma_5] for lower boundary term (if dag=1 [1 - gamma_5] term)
    // If there's only one node along the fifth direction, no communication
    // is necessary; otherwise data from the adjacent node in the minus
    // direction will be needed.
    // If the lower boundary is the s=0 term
    //   out[0] = - m_f * [1 + gamma_5] in[ls-1]
    // else
    //   out[s] = [1 + gamma_5] in[s-1]
    //------------------------------------------------------------------
    f_in  = (IFloat *) in + (local_ls-1)*ls_stride;
    f_out = (IFloat *) out;

    if(s_node_coor == 0) {
#pragma omp for schedule(static)
      for(x=0; x<vol_4d_cb; x++) {
        const int shift(24*x);
        //IFloat *f_temp(f_in+shift);
        //cblas_daxpy(12,neg_mass_two_over_a5,f_temp,f_out+shift);
        DAXPY12( nal, f_in+shift, f_out+shift );
      }
    } else {
#pragma omp for schedule(static)
      for(x=0; x<vol_4d_cb; x++) {
        const int shift(24*x);
        //IFloat *f_temp(f_in+shift);
        //cblas_daxpy(12,two_over_a5,f_temp,f_out+shift);
        DAXPY12( al, f_in+shift, f_out+shift );
      }
    }

    // [1 - gamma_5] term (if dag=1 [1 + gamma_5] term)
    //
    // out[s] = [1 - gamma_5] in[s+1]
    //------------------------------------------------------------------
    f_in  = (IFloat *) in + 12 + ls_stride;
    f_out = (IFloat *) out + 12;

#pragma omp for schedule(static)
    for (idx=0;idx<max_dex;idx++) {
      int shift(24*idx);
      //cblas_daxpy(12,two_over_a5,f_in+shift,f_out+shift);
      DAXPY12( al, f_in+shift, f_out+shift );
    }

    // [1 - gamma_5] for upper boundary term (if dag=1 [1 + gamma_5] term)
    // If there's only one node along the fifth direction, no communication
    // is necessary; otherwise data from the adjacent node in the minus
    // direction will be needed.
    // If the upper boundary is the s=ls term
    //   out[ls-1] = - m_f * [1 - gamma_5] in[0]
    // else
    //   out[s] = [1 - gamma_5] in[s+1]
    //------------------------------------------------------------------
    f_in  = (IFloat *) in + 12;
    f_out = (IFloat *) out + 12 + (local_ls-1)*ls_stride;

    if(s_node_coor == s_nodes - 1) {
#pragma omp for schedule(static)
      for(x=0; x<vol_4d_cb; x++){
        const int shift(24*x);
        //IFloat *f_temp (f_in+shift);
        //cblas_daxpy(12,neg_mass_two_over_a5,f_temp,f_out+shift);
        DAXPY12( nal, f_in+shift, f_out+shift );
      }
    } else {
#pragma omp for schedule(static)
      for(x=0; x<vol_4d_cb; x++){
        const int shift(24*x);
        //IFloat *f_temp (f_in+shift);
        //cblas_daxpy(12,two_over_a5,f_temp,f_out+shift);
        DAXPY12( al, f_in+shift, f_out+shift );
      }
    }
  }

  const int local_ls  = GJP.SnodeSites();
  const int vol_4d_cb = dwf_lib_arg->vol_4d / 2;
  DiracOp::CGflops += 2*2*vol_4d_cb*local_ls*12;
}
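The DAXPY12 macro above is just a fused y += a*x over the 12 doubles of one spin-projected half-spinor (two spin components, three colors, real and imaginary parts); a scalar sketch of the same update, minus the prefetch hints:

static inline void daxpy12_ref(double a, const double *x, double *y)
{
    int i;
    for (i = 0; i < 12; ++i)   /* 12 = 2 spins x 3 colors x (re,im) */
        y[i] += a * x[i];
}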
__m128d test_mm_loaddup_pd(double const* P) {
  // CHECK-LABEL: test_mm_loaddup_pd
  // CHECK: load double*
  return _mm_loaddup_pd(P);
}
// it moves horizontally inside a block
void kernel_dgemv_n_2_lib4(int kmax, double *A, double *x, double *y, int alg)
{
	if(kmax<=0)
		return;

	const int lda = 4;

	int k;

	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0, x_1, x_2, x_3,
		y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1;

	y_0_1   = _mm_setzero_pd();
	y_0_1_b = _mm_setzero_pd();
	y_0_1_c = _mm_setzero_pd();
	y_0_1_d = _mm_setzero_pd();

	k=0;
	for(; k<kmax-3; k+=4)
	{
		x_0 = _mm_loaddup_pd( &x[0] );
		x_1 = _mm_loaddup_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		x_2 = _mm_loaddup_pd( &x[2] );
		x_3 = _mm_loaddup_pd( &x[3] );

		a_02_12 = _mm_load_pd( &A[0+lda*2] );
		a_03_13 = _mm_load_pd( &A[0+lda*3] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1   = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp );

		ax_temp = _mm_mul_pd( a_02_12, x_2 );
		y_0_1_c = _mm_add_pd( y_0_1_c, ax_temp );
		ax_temp = _mm_mul_pd( a_03_13, x_3 );
		y_0_1_d = _mm_add_pd( y_0_1_d, ax_temp );

		A += 4*lda;
		x += 4;
	}

	y_0_1   = _mm_add_pd( y_0_1,   y_0_1_c );
	y_0_1_b = _mm_add_pd( y_0_1_b, y_0_1_d );

	if(kmax%4>=2)
	{
		x_0 = _mm_loaddup_pd( &x[0] );
		x_1 = _mm_loaddup_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1   = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1_b = _mm_add_pd( y_0_1_b, ax_temp );

		A += 2*lda;
		x += 2;
	}

	y_0_1 = _mm_add_pd( y_0_1, y_0_1_b );

	if(kmax%2==1)
	{
		x_0 = _mm_loaddup_pd( &x[0] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1   = _mm_add_pd( y_0_1, ax_temp );
	}

	if(alg==0)
	{
		_mm_storeu_pd(&y[0], y_0_1);
	}
	else if(alg==1)
	{
		z_0_1 = _mm_loadu_pd( &y[0] );
		z_0_1 = _mm_add_pd( z_0_1, y_0_1 );
		_mm_storeu_pd(&y[0], z_0_1);
	}
	else // alg==-1
	{
		z_0_1 = _mm_loadu_pd( &y[0] );
		z_0_1 = _mm_sub_pd( z_0_1, y_0_1 );
		_mm_storeu_pd(&y[0], z_0_1);
	}
}
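What the kernel computes for the first two rows of the panel, as a scalar sketch (alg selects store, accumulate, or subtract):

/* Scalar sketch of kernel_dgemv_n_2_lib4: rows 0..1 of y = A*x with
   A stored in 4-wide column-major panels (lda == 4). */
void kernel_dgemv_n_2_ref(int kmax, double *A, double *x, double *y, int alg)
{
	double y_0 = 0.0, y_1 = 0.0;
	int k;

	for(k=0; k<kmax; k++)
	{
		y_0 += A[0+4*k] * x[k];
		y_1 += A[1+4*k] * x[k];
	}

	if(alg==0)      { y[0]  = y_0; y[1]  = y_1; }
	else if(alg==1) { y[0] += y_0; y[1] += y_1; }
	else            { y[0] -= y_0; y[1] -= y_1; }  // alg==-1
}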
// it moves horizontally inside a block (A upper triangular)
void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
{
	if(kmax<=0)
		return;

	const int lda = 4;

	int k;

	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0, x_1, x_2, x_3,
		y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;

	zeros = _mm256_setzero_pd();

/*	y_0_1_2_3   = _mm256_setzero_pd(); */
/*	y_0_1_2_3_b = _mm256_setzero_pd(); */
/*	y_0_1_2_3_c = _mm256_setzero_pd(); */
	y_0_1_2_3_d = _mm256_setzero_pd();

	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );

	y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc );

	// fourth col (avoid zero y_0_1_2_3)
	x_3           = _mm256_broadcast_sd( &x[3] );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	y_0_1_2_3     = _mm256_mul_pd( a_03_13_23_33, x_3 );

	// third col
	x_2           = _mm256_broadcast_sd( &x[2] );
	x_2           = _mm256_blend_pd( x_2, zeros, 0x8 );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	y_0_1_2_3_b   = _mm256_mul_pd( a_02_12_22_32, x_2 );

	A += 4*lda;
	x += 4;

	k=4;
	for(; k<kmax-3; k+=4)
	{
		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		x_2 = _mm256_broadcast_sd( &x[2] );
		x_3 = _mm256_broadcast_sd( &x[3] );

		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

		ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		ax_temp     = _mm256_mul_pd( a_02_12_22_32, x_2 );
		y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
		ax_temp     = _mm256_mul_pd( a_03_13_23_33, x_3 );
		y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );

		A += 4*lda;
		x += 4;
	}

	y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3,   y_0_1_2_3_c );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

	if(kmax%4>=2)
	{
		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		A += 2*lda;
		x += 2;
	}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

	if(kmax%2==1)
	{
		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

		ax_temp   = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	}

	if(alg==0)
	{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
	}
	else if(alg==1)
	{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		_mm256_storeu_pd(&y[0], z_0_1_2_3);
	}
	else // alg==-1
	{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		_mm256_storeu_pd(&y[0], z_0_1_2_3);
	}
}
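The "avoid zero" comments mark a small trick: instead of zeroing the accumulators and adding every column, the first nonzero products initialize them directly, with `_mm_load_sd` and the blends masking out the below-diagonal entries. A scalar sketch of what the kernel computes:

/* Scalar sketch of kernel_dtrmv_u_n_4_lib4: rows 0..3 of y = U*x,
   U upper triangular, stored in 4-wide column-major panels (lda == 4). */
void kernel_dtrmv_u_n_4_ref(int kmax, double *A, double *x, double *y, int alg)
{
	double yt[4] = {0.0, 0.0, 0.0, 0.0};
	int i, k;

	for(k=0; k<kmax; k++)
		for(i=0; i<4; i++)
			if(i<=k)                    /* upper triangle only */
				yt[i] += A[i+4*k] * x[k];

	for(i=0; i<4; i++)
	{
		if(alg==0)      y[i]  = yt[i];
		else if(alg==1) y[i] += yt[i];
		else            y[i] -= yt[i];  /* alg==-1 */
	}
}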
// it moves horizontally inside a block
void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg)
{
	if(kmax<=0)
		return;

	double *A1 = A0 + 4*sda;

	const int lda = 4;

	int k;

	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31,
		a_40_50_60_70, a_41_51_61_71,
		x_0, x_1,
		y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
		y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;

/*	y_0_1_2_3   = _mm256_setzero_pd(); */
/*	y_4_5_6_7   = _mm256_setzero_pd(); */
/*	y_0_1_2_3_b = _mm256_setzero_pd(); */
/*	y_4_5_6_7_b = _mm256_setzero_pd(); */

	zeros = _mm256_setzero_pd();

	// upper triangular

	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A0[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A0[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );

	// clear the two upper lanes left undefined by the cast
	y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc );

	// fourth col (avoid zero y_0_1_2_3)
	x_1           = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
	y_0_1_2_3     = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0           = _mm256_broadcast_sd( &x[2] );
	x_0           = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	ax_temp       = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3_b   = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;

	// upper squared
	x_0 = _mm256_broadcast_sd( &x[0] );
	x_1 = _mm256_broadcast_sd( &x[1] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );

	ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

	x_0 = _mm256_broadcast_sd( &x[2] );
	x_1 = _mm256_broadcast_sd( &x[3] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );

	ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

	// lower triangular

	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A1[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A1[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );

	// clear the two upper lanes left undefined by the cast
	y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 );
	y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc );

	// fourth col (avoid zero y_4_5_6_7)
	x_1           = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] );
	y_4_5_6_7     = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0           = _mm256_broadcast_sd( &x[2] );
	x_0           = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] );
	ax_temp       = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_4_5_6_7_b   = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;

	k=8;
	for(; k<kmax-3; k+=4)
	{
/*		__builtin_prefetch( A0 + 4*lda );*/
/*		__builtin_prefetch( A1 + 4*lda );*/

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*		__builtin_prefetch( A0 + 5*lda );*/
/*		__builtin_prefetch( A1 + 5*lda );*/

		x_0 = _mm256_broadcast_sd( &x[2] );
		x_1 = _mm256_broadcast_sd( &x[3] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

		ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

		A0 += 4*lda;
		A1 += 4*lda;
		x  += 4;
	}

	if(kmax%4>=2)
	{
		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp     = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3   = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp     = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7   = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp     = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp     = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

		A0 += 2*lda;
		A1 += 2*lda;
		x  += 2;
	}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
	y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

	if(kmax%2==1)
	{
		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

		ax_temp   = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp   = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );

/*		A0 += 1*lda;*/
/*		A1 += 1*lda;*/
/*		x  += 1;*/
	}

	if(alg==0)
	{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
	}
	else if(alg==1)
	{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
	}
	else // alg==-1
	{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
	}
}
// it moves horizontally inside a block
void kernel_dtrmv_u_n_2_lib4(int kmax, double *A, double *x, double *y, int alg)
{
	if(kmax<=0)
		return;

	const int lda = 4;

	int k;

	__m128d
		ax_temp,
		a_00_10, a_01_11, a_02_12, a_03_13,
		x_0, x_1, x_2, x_3,
		y_0_1, y_0_1_b, y_0_1_c, y_0_1_d, z_0_1;

/*	y_0_1 = _mm_setzero_pd(); */

	// second col (avoid zero y_0_1)
	x_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, x_0 );

	// first col
	x_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	ax_temp = _mm_mul_sd( a_00_10, x_0 );
	y_0_1   = _mm_add_sd( y_0_1, ax_temp );

	A += 2*lda;
	x += 2;

	k=2;
	for(; k<kmax-1; k+=2)
	{
		x_0 = _mm_loaddup_pd( &x[0] );
		x_1 = _mm_loaddup_pd( &x[1] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );
		a_01_11 = _mm_load_pd( &A[0+lda*1] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1   = _mm_add_pd( y_0_1, ax_temp );
		ax_temp = _mm_mul_pd( a_01_11, x_1 );
		y_0_1   = _mm_add_pd( y_0_1, ax_temp );

		A += 2*lda;
		x += 2;
	}

	if(kmax%2==1)
	{
		x_0 = _mm_loaddup_pd( &x[0] );

		a_00_10 = _mm_load_pd( &A[0+lda*0] );

		ax_temp = _mm_mul_pd( a_00_10, x_0 );
		y_0_1   = _mm_add_pd( y_0_1, ax_temp );
	}

	if(alg==0)
	{
		_mm_storeu_pd(&y[0], y_0_1);
	}
	else if(alg==1)
	{
		z_0_1 = _mm_loadu_pd( &y[0] );
		z_0_1 = _mm_add_pd( z_0_1, y_0_1 );
		_mm_storeu_pd(&y[0], z_0_1);
	}
	else // alg==-1
	{
		z_0_1 = _mm_loadu_pd( &y[0] );
		z_0_1 = _mm_sub_pd( z_0_1, y_0_1 );
		_mm_storeu_pd(&y[0], z_0_1);
	}
}
__m128d test_mm_loaddup_pd(double const* P) {
  // CHECK-LABEL: test_mm_loaddup_pd
  // CHECK: load double*
  // CHECK-ASM: movddup %xmm{{.*}}
  return _mm_loaddup_pd(P);
}
void bli_daxpyf_int_var1
     (
       conj_t  conja,
       conj_t  conjx,
       dim_t   m,
       dim_t   b_n,
       double* alpha,
       double* a, inc_t inca, inc_t lda,
       double* x, inc_t incx,
       double* y, inc_t incy,
       cntx_t* cntx
     )
{
	double* restrict alpha_cast = alpha;
	double* restrict a_cast = a;
	double* restrict x_cast = x;
	double* restrict y_cast = y;
	dim_t            i;

	const dim_t      n_elem_per_reg = 2;
	const dim_t      n_iter_unroll  = 2;

	dim_t            m_pre;
	dim_t            m_run;
	dim_t            m_left;

	double* restrict a0;
	double* restrict a1;
	double* restrict a2;
	double* restrict a3;
	double* restrict y0;
	double           a0c, a1c, a2c, a3c;
	double           chi0, chi1, chi2, chi3;

	v2df_t           a00v, a01v, a02v, a03v, y0v;
	v2df_t           a10v, a11v, a12v, a13v, y1v;
	v2df_t           chi0v, chi1v, chi2v, chi3v;

	bool_t           use_ref = FALSE;

	if ( bli_zero_dim2( m, b_n ) ) return;

	m_pre = 0;

	// If there is anything that would interfere with our use of aligned
	// vector loads/stores, call the reference implementation.
	if ( b_n < bli_cntx_get_blksz_def_dt( BLIS_DOUBLE, BLIS_AF, cntx ) )
	{
		use_ref = TRUE;
	}
	else if ( inca != 1 || incx != 1 || incy != 1 ||
	          bli_is_unaligned_to( lda*sizeof(double), 16 ) )
	{
		use_ref = TRUE;
	}
	else if ( bli_is_unaligned_to( a, 16 ) ||
	          bli_is_unaligned_to( y, 16 ) )
	{
		use_ref = TRUE;

		if ( bli_is_unaligned_to( a, 16 ) &&
		     bli_is_unaligned_to( y, 16 ) )
		{
			use_ref = FALSE;
			m_pre   = 1;
		}
	}

	// Call the reference implementation if needed.
	if ( use_ref == TRUE )
	{
		BLIS_DAXPYF_KERNEL_REF( conja,
		                        conjx,
		                        m,
		                        b_n,
		                        alpha_cast,
		                        a_cast, inca, lda,
		                        x_cast, incx,
		                        y_cast, incy,
		                        cntx );
		return;
	}

	m_run  = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
	m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

	a0 = a_cast + 0*lda;
	a1 = a_cast + 1*lda;
	a2 = a_cast + 2*lda;
	a3 = a_cast + 3*lda;
	y0 = y_cast;

	chi0 = *(x_cast + 0*incx);
	chi1 = *(x_cast + 1*incx);
	chi2 = *(x_cast + 2*incx);
	chi3 = *(x_cast + 3*incx);

	PASTEMAC2(d,d,scals)( *alpha_cast, chi0 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi1 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi2 );
	PASTEMAC2(d,d,scals)( *alpha_cast, chi3 );

	if ( m_pre == 1 )
	{
		a0c = *a0;
		a1c = *a1;
		a2c = *a2;
		a3c = *a3;

		*y0 += chi0 * a0c +
		       chi1 * a1c +
		       chi2 * a2c +
		       chi3 * a3c;

		a0 += inca;
		a1 += inca;
		a2 += inca;
		a3 += inca;
		y0 += incy;
	}

	chi0v.v = _mm_loaddup_pd( ( double* )&chi0 );
	chi1v.v = _mm_loaddup_pd( ( double* )&chi1 );
	chi2v.v = _mm_loaddup_pd( ( double* )&chi2 );
	chi3v.v = _mm_loaddup_pd( ( double* )&chi3 );

	for ( i = 0; i < m_run; ++i )
	{
		y0v.v  = _mm_load_pd( ( double* )(y0 + 0*n_elem_per_reg) );

		a00v.v = _mm_load_pd( ( double* )(a0 + 0*n_elem_per_reg) );
		a01v.v = _mm_load_pd( ( double* )(a1 + 0*n_elem_per_reg) );

		y0v.v += chi0v.v * a00v.v;
		y0v.v += chi1v.v * a01v.v;

		a02v.v = _mm_load_pd( ( double* )(a2 + 0*n_elem_per_reg) );
		a03v.v = _mm_load_pd( ( double* )(a3 + 0*n_elem_per_reg) );

		y0v.v += chi2v.v * a02v.v;
		y0v.v += chi3v.v * a03v.v;

		_mm_store_pd( ( double* )(y0 + 0*n_elem_per_reg), y0v.v );

		y1v.v  = _mm_load_pd( ( double* )(y0 + 1*n_elem_per_reg) );

		a10v.v = _mm_load_pd( ( double* )(a0 + 1*n_elem_per_reg) );
		a11v.v = _mm_load_pd( ( double* )(a1 + 1*n_elem_per_reg) );

		y1v.v += chi0v.v * a10v.v;
		y1v.v += chi1v.v * a11v.v;

		a12v.v = _mm_load_pd( ( double* )(a2 + 1*n_elem_per_reg) );
		a13v.v = _mm_load_pd( ( double* )(a3 + 1*n_elem_per_reg) );

		y1v.v += chi2v.v * a12v.v;
		y1v.v += chi3v.v * a13v.v;

		_mm_store_pd( ( double* )(y0 + 1*n_elem_per_reg), y1v.v );

		a0 += n_elem_per_reg * n_iter_unroll;
		a1 += n_elem_per_reg * n_iter_unroll;
		a2 += n_elem_per_reg * n_iter_unroll;
		a3 += n_elem_per_reg * n_iter_unroll;
		y0 += n_elem_per_reg * n_iter_unroll;
	}

	if ( m_left > 0 )
	{
		for ( i = 0; i < m_left; ++i )
		{
			a0c = *a0;
			a1c = *a1;
			a2c = *a2;
			a3c = *a3;

			*y0 += chi0 * a0c +
			       chi1 * a1c +
			       chi2 * a2c +
			       chi3 * a3c;

			a0 += inca;
			a1 += inca;
			a2 += inca;
			a3 += inca;
			y0 += incy;
		}
	}
}
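In scalar terms the kernel fuses four axpy updates into one pass over y (a sketch for the real domain, where conja/conjx have no effect):

/* Scalar sketch of daxpyf with b_n == 4 columns:
   y := y + alpha * ( chi0*a0 + chi1*a1 + chi2*a2 + chi3*a3 ). */
void daxpyf_ref( int m, double alpha,
                 double* a, int inca, int lda,
                 double* x, int incx,
                 double* y, int incy )
{
	int i, j;

	for ( j = 0; j < 4; ++j )
	{
		double chi = alpha * x[ j*incx ];

		for ( i = 0; i < m; ++i )
			y[ i*incy ] += chi * a[ i*inca + j*lda ];
	}
}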