void THDoubleVector_copy_AVX(double *y, const double *x, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; for (i=0; i<=((n)-8); i+=8) { _mm256_storeu_pd(y+i, _mm256_loadu_pd(x+i)); _mm256_storeu_pd(y+i+4, _mm256_loadu_pd(x+i+4)); } off = (n) - ((n)%8); for (i=0; i<((n)%8); i++) { y[off+i] = x[off+i]; } }
void THDoubleVector_fill_AVX(double *x, const double c, const ptrdiff_t n) { ptrdiff_t i; ptrdiff_t off; __m256d YMM0 = _mm256_set_pd(c, c, c, c); for (i=0; i<=((n)-16); i+=16) { _mm256_storeu_pd((x)+i , YMM0); _mm256_storeu_pd((x)+i+4, YMM0); _mm256_storeu_pd((x)+i+8, YMM0); _mm256_storeu_pd((x)+i+12, YMM0); } off = (n) - ((n)%16); for (i=0; i<((n)%16); i++) { x[off+i] = c; } }
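/* Editor's illustrative sketch (not part of the TH library above): the same
 * unaligned-load/unaligned-store pattern with a scalar tail, applied to a
 * hypothetical "add a constant" kernel. The name and the 4-wide step are
 * assumptions made for the example. */
#include <immintrin.h>
#include <stddef.h>

static void example_adds_avx(double *y, const double *x, const double c, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256d C = _mm256_set1_pd(c);
    for (i = 0; i <= n - 4; i += 4) {
        /* main body: 4 doubles per iteration, no alignment assumed */
        _mm256_storeu_pd(y + i, _mm256_add_pd(_mm256_loadu_pd(x + i), C));
    }
    for (; i < n; i++) {
        /* scalar tail for the last n % 4 elements */
        y[i] = x[i] + c;
    }
}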
// multiply each of the n doubles at p by v
COREARRAY_DLL_DEFAULT void vec_f64_mul(double *p, size_t n, double v)
{
#if defined(COREARRAY_SIMD_AVX)
    const __m256d v4 = _mm256_set1_pd(v);
    switch ((size_t)p & 0x1F)
    {
    case 0x08:
        if (n > 0) { (*p++) *= v; n--; }
    case 0x10:
        if (n > 0) { (*p++) *= v; n--; }
    case 0x18:
        if (n > 0) { (*p++) *= v; n--; }
    case 0x00:
        for (; n >= 4; n-=4)
        {
            _mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));
            p += 4;
        }
        if (n >= 2)
        {
            _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), _mm256_castpd256_pd128(v4)));
            p += 2; n -= 2;
        }
        break;
    default:
        for (; n >= 4; n-=4)
        {
            _mm256_storeu_pd(p, _mm256_mul_pd(_mm256_loadu_pd(p), v4));
            p += 4;
        }
        if (n >= 2)
        {
            _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), _mm256_castpd256_pd128(v4)));
            p += 2; n -= 2;
        }
    }
#elif defined(COREARRAY_SIMD_SSE2)
    const __m128d v2 = _mm_set1_pd(v);
    switch ((size_t)p & 0x0F)
    {
    case 0x08:
        if (n > 0) { (*p++) *= v; n--; }
    case 0x00:
        for (; n >= 2; n-=2, p+=2)
            _mm_store_pd(p, _mm_mul_pd(_mm_load_pd(p), v2));
        break;
    default:
        for (; n >= 2; n-=2, p+=2)
            _mm_storeu_pd(p, _mm_mul_pd(_mm_loadu_pd(p), v2));
    }
#endif
    for (; n > 0; n--) (*p++) *= v;
}
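/* Editor's illustrative sketch of the alignment dispatch used by vec_f64_mul
 * above: peel scalar elements until p reaches a 32-byte boundary, then use
 * aligned 256-bit ops, then finish with a scalar tail. The function name is
 * hypothetical and it assumes AVX is available unconditionally. */
#include <immintrin.h>
#include <stddef.h>

static void example_scale_f64(double *p, size_t n, double v)
{
    /* peel until 32-byte aligned (or until the data runs out) */
    while (n > 0 && ((size_t)p & 0x1F) != 0) { *p++ *= v; n--; }
    const __m256d v4 = _mm256_set1_pd(v);
    for (; n >= 4; n -= 4, p += 4)
        _mm256_store_pd(p, _mm256_mul_pd(_mm256_load_pd(p), v4));  /* aligned path */
    for (; n > 0; n--)
        *p++ *= v;                                                 /* scalar tail */
}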
void THDoubleVector_muls_AVX(double *y, const double *x, const double c, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM15 = _mm256_set_pd(c, c, c, c); __m256d YMM0, YMM1; for (i=0; i<=((n)-8); i+=8) { YMM0 = _mm256_loadu_pd(x+i); YMM1 = _mm256_loadu_pd(x+i+4); YMM0 = _mm256_mul_pd(YMM0, YMM15); YMM1 = _mm256_mul_pd(YMM1, YMM15); _mm256_storeu_pd(y+i, YMM0); _mm256_storeu_pd(y+i+4, YMM1); } for (; i<n; i++) { y[i] = x[i] * c; } }
void SimpleClean::PartialSubtractImageAVX(double *image, size_t imgWidth, size_t imgHeight, const double *psf, size_t psfWidth, size_t psfHeight, size_t x, size_t y, double factor, size_t startY, size_t endY)
{
    size_t startX, endX;
    int offsetX = (int) x - (int) (psfWidth/2), offsetY = (int) y - (int) (psfHeight/2);

    if(offsetX > 0)
        startX = offsetX;
    else
        startX = 0;

    if(offsetY > (int) startY)
        startY = offsetY;

    endX = std::min(x + psfWidth/2, imgWidth);
    size_t unAlignedCount = (endX - startX) % 4;
    endX -= unAlignedCount;

    endY = std::min(y + psfHeight/2, endY);

    const __m256d mFactor = _mm256_set1_pd(-factor);
    for(size_t ypos = startY; ypos < endY; ++ypos)
    {
        double *imageIter = image + ypos * imgWidth + startX;
        const double *psfIter = psf + (ypos - offsetY) * psfWidth + startX - offsetX;
        for(size_t xpos = startX; xpos != endX; xpos+=4)
        {
            __m256d imgVal = _mm256_loadu_pd(imageIter), psfVal = _mm256_loadu_pd(psfIter);
#ifdef __FMA__
            _mm256_storeu_pd(imageIter, _mm256_fmadd_pd(psfVal, mFactor, imgVal));
#else
            _mm256_storeu_pd(imageIter, _mm256_add_pd(imgVal, _mm256_mul_pd(psfVal, mFactor)));
#endif
            imageIter+=4;
            psfIter+=4;
        }
        for(size_t xpos = endX; xpos!=endX + unAlignedCount; ++xpos)
        {
            *imageIter -= *psfIter * factor;
            ++imageIter;
            ++psfIter;
        }
    }
}
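/* Editor's note (illustrative): the fused branch above relies on
 * _mm256_fmadd_pd, which is the FMA3 intrinsic and is normally gated on
 * __FMA__ (-mfma); the fallback uses a separate multiply and add. A minimal
 * self-contained version of the per-vector update image += psf * (-factor),
 * with hypothetical names: */
#include <immintrin.h>

static inline void example_subtract4(double *img, const double *psf, __m256d neg_factor)
{
    __m256d imgVal = _mm256_loadu_pd(img), psfVal = _mm256_loadu_pd(psf);
#ifdef __FMA__
    _mm256_storeu_pd(img, _mm256_fmadd_pd(psfVal, neg_factor, imgVal));   /* one rounding */
#else
    _mm256_storeu_pd(img, _mm256_add_pd(imgVal, _mm256_mul_pd(psfVal, neg_factor)));
#endif
}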
void THDoubleVector_cmul_AVX(double *z, const double *x, const double *y, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-8); i+=8) { YMM0 = _mm256_loadu_pd(x+i); YMM1 = _mm256_loadu_pd(x+i+4); YMM2 = _mm256_loadu_pd(y+i); YMM3 = _mm256_loadu_pd(y+i+4); YMM2 = _mm256_mul_pd(YMM0, YMM2); YMM3 = _mm256_mul_pd(YMM1, YMM3); _mm256_storeu_pd(z+i, YMM2); _mm256_storeu_pd(z+i+4, YMM3); } for (; i<n; i++) { z[i] = x[i] * y[i]; } }
static inline void matmul_4xkxkx4(int lda, int K, double* a, double* b, double* c) { __m256d a_coli, bi0, bi1, bi2, bi3; __m256d c_col0, c_col1, c_col2, c_col3; /* layout of 4x4 c matrix 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33 */ double* c01_ptr = c + lda; double* c02_ptr = c01_ptr + lda; double* c03_ptr = c02_ptr + lda; // load old value of c c_col0 = _mm256_loadu_pd(c); c_col1 = _mm256_loadu_pd(c01_ptr); c_col2 = _mm256_loadu_pd(c02_ptr); c_col3 = _mm256_loadu_pd(c03_ptr); // for every column of a (or every row of b) for (int i = 0; i < K; ++i) { a_coli = _mm256_load_pd(a); a += 4; bi0 = _mm256_broadcast_sd(b++); bi1 = _mm256_broadcast_sd(b++); bi2 = _mm256_broadcast_sd(b++); bi3 = _mm256_broadcast_sd(b++); c_col0 = _mm256_add_pd(c_col0, _mm256_mul_pd(a_coli, bi0)); c_col1 = _mm256_add_pd(c_col1, _mm256_mul_pd(a_coli, bi1)); c_col2 = _mm256_add_pd(c_col2, _mm256_mul_pd(a_coli, bi2)); c_col3 = _mm256_add_pd(c_col3, _mm256_mul_pd(a_coli, bi3)); } _mm256_storeu_pd(c, c_col0); _mm256_storeu_pd(c01_ptr, c_col1); _mm256_storeu_pd(c02_ptr, c_col2); _mm256_storeu_pd(c03_ptr, c_col3); }
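/* Editor's illustrative scalar reference for the micro-kernel above: it
 * accumulates C(0:3,0:3) += Apanel(0:3,0:K-1) * Bpanel(0:K-1,0:3), where the A
 * panel is packed column-major 4 x K, the B panel is packed so that the four
 * values b(k,0..3) are contiguous for each k, and C is column-major with
 * leading dimension lda. The _ref name is hypothetical. */
static void matmul_4xkxkx4_ref(int lda, int K, const double *a, const double *b, double *c)
{
    for (int k = 0; k < K; ++k)
        for (int j = 0; j < 4; ++j)       /* column of C, value b[4*k + j] */
            for (int i = 0; i < 4; ++i)   /* row of C, value a[4*k + i]    */
                c[i + j * lda] += a[4 * k + i] * b[4 * k + j];
}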
void ComplexToDouble(Complex *src, double *dstI, double *dstQ, const unsigned int len) { __m128 avxA, avxB; __m256d avxA_D, avxB_D, avxX_D, avxY_D, avxR_D, avxI_D; for (unsigned int i=0; i+4<=len; i+=4) { avxA = _mm_maskload_ps((float*)(src+i), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1)); //load float avxB = _mm_maskload_ps((float*)(src+i+2), _mm_set_epi32(SET_1, SET_1, SET_1, SET_1)); avxA_D = _mm256_cvtps_pd(avxA); //float to double avxB_D = _mm256_cvtps_pd(avxB); avxX_D = _mm256_permute2f128_pd(avxA_D, avxB_D, 0x20); avxY_D = _mm256_permute2f128_pd(avxA_D, avxB_D, 0x31); avxR_D = _mm256_shuffle_pd(avxX_D, avxY_D, 0x00); avxI_D = _mm256_shuffle_pd(avxX_D, avxY_D, 0x0f); _mm256_storeu_pd(dstI+i, avxR_D); //store _mm256_storeu_pd(dstQ+i, avxI_D); } for (unsigned int i=len-(len&0x03); i<len; ++i) { dstI[i] = static_cast<double>(src[i].m_real); dstQ[i] = static_cast<double>(src[i].m_imag); } }
extern "C" void product32x32_avx(double *a, double *b, double *c, int n) { for(int i=0; i<n; i++) { __m256d t1 = _mm256_loadu_pd(&c[i*n + 0]); __m256d t2 = _mm256_loadu_pd(&c[i*n + 4]); __m256d t3 = _mm256_loadu_pd(&c[i*n + 8]); __m256d t4 = _mm256_loadu_pd(&c[i*n + 12]); __m256d t5 = _mm256_loadu_pd(&c[i*n + 16]); __m256d t6 = _mm256_loadu_pd(&c[i*n + 20]); __m256d t7 = _mm256_loadu_pd(&c[i*n + 24]); __m256d t8 = _mm256_loadu_pd(&c[i*n + 28]); for(int k=0; k<n; k++) { __m256d a1 = _mm256_set1_pd(a[k*n+i]); __m256d b1 = _mm256_loadu_pd(&b[k*n+0]); t1 = _mm256_sub_pd(t1,_mm256_mul_pd(a1,b1)); __m256d b2 = _mm256_loadu_pd(&b[k*n+4]); t2 = _mm256_sub_pd(t2,_mm256_mul_pd(a1,b2)); __m256d b3 = _mm256_loadu_pd(&b[k*n+8]); t3 = _mm256_sub_pd(t3,_mm256_mul_pd(a1,b3)); __m256d b4 = _mm256_loadu_pd(&b[k*n+12]); t4 = _mm256_sub_pd(t4,_mm256_mul_pd(a1,b4)); __m256d b5 = _mm256_loadu_pd(&b[k*n+16]); t5 = _mm256_sub_pd(t5,_mm256_mul_pd(a1,b5)); __m256d b6 = _mm256_loadu_pd(&b[k*n+20]); t6 = _mm256_sub_pd(t6,_mm256_mul_pd(a1,b6)); __m256d b7 = _mm256_loadu_pd(&b[k*n+24]); t7 = _mm256_sub_pd(t7,_mm256_mul_pd(a1,b7)); __m256d b8 = _mm256_loadu_pd(&b[k*n+28]); t8 = _mm256_sub_pd(t8,_mm256_mul_pd(a1,b8)); } _mm256_storeu_pd(&c[i*n + 0], t1); _mm256_storeu_pd(&c[i*n + 4], t2); _mm256_storeu_pd(&c[i*n + 8], t3); _mm256_storeu_pd(&c[i*n + 12], t4); _mm256_storeu_pd(&c[i*n + 16], t5); _mm256_storeu_pd(&c[i*n + 20], t6); _mm256_storeu_pd(&c[i*n + 24], t7); _mm256_storeu_pd(&c[i*n + 28], t8); } }
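/* Editor's illustrative scalar reference for product32x32_avx above: with
 * row-major n x n matrices it performs the trailing update C -= A^T * B,
 * i.e. c[i][j] -= sum_k a[k][i] * b[k][j]. The vectorized version hard-codes
 * eight 4-wide column blocks, so it expects n == 32. */
static void product32x32_ref(const double *a, const double *b, double *c, int n)
{
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++) {
            double s = 0.0;
            for (int k = 0; k < n; k++)
                s += a[k * n + i] * b[k * n + j];
            c[i * n + j] -= s;
        }
}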
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) { unsigned int m = n >> 2; unsigned int k = n & 3, j; unsigned int l = n & (~3); for (j = 0; j < m; j++) { v4sd src = _mm256_loadu_pd(a + 4 * j); v4sd tem = simd_exp4d(src); _mm256_storeu_pd(y + 4 * j, tem); } for (j = 0; j < k; j++) { y[j + l] = exp(a[j + l]); } }
void THDoubleVector_cadd_AVX(double *z, const double *x, const double *y, const double c, const ptrdiff_t n) { ptrdiff_t i; __m256d YMM15 = _mm256_set_pd(c, c, c, c); __m256d YMM0, YMM1, YMM2, YMM3; for (i=0; i<=((n)-4); i+=4) { YMM0 = _mm256_loadu_pd(y+i); YMM1 = _mm256_loadu_pd(x+i); YMM2 = _mm256_mul_pd(YMM0, YMM15); YMM3 = _mm256_add_pd(YMM1, YMM2); _mm256_storeu_pd(z+i, YMM3); } for (; i<(n); i++) { z[i] = x[i] + y[i] * c; } }
void static avx_test (void) { int i; union256d u, s1, s2; long long source1[4]={34545, 95567, 23443, 5675}; long long source2[4]={674, 57897, 93459, 45624}; long long d[4]; long long e[4]; s1.x = _mm256_loadu_pd ((double *)source1); s2.x = _mm256_loadu_pd ((double *)source2); u.x = _mm256_andnot_pd (s1.x, s2.x); _mm256_storeu_pd ((double *)d, u.x); for (i = 0; i < 4; i++) e[i] = (~source1[i]) & source2[i]; if (checkVl (d, e, 4)) abort (); }
/* sum double vectors ----------------------------------------------------------
* sum double vectors: out=data1.+data2
* args   : double *data1    I   input double array
*          double *data2    I   input double array
*          int    n         I   number of input data
*          double *out      O   output double array
* return : none
* note   : AVX code is used if "AVX_ENABLE" is defined
*-----------------------------------------------------------------------------*/
extern void sumvd(const double *data1, const double *data2, int n, double *out)
{
    int i;
#if !defined(AVX_ENABLE)
    for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
#else
    int m=n/4;
    __m256d xmm1,xmm2,xmm3;

    if (n<8) {
        for (i=0;i<n;i++) out[i]=data1[i]+data2[i];
    }
    else {
        for (i=0;i<4*m;i+=4) {
            xmm1=_mm256_loadu_pd(&data1[i]);
            xmm2=_mm256_loadu_pd(&data2[i]);
            xmm3=_mm256_add_pd(xmm1,xmm2);
            _mm256_storeu_pd(&out[i],xmm3);
        }
        for (;i<n;i++) out[i]=data1[i]+data2[i];
    }
#endif
}
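/* Editor's illustrative call of sumvd above (array contents and sizes are
 * arbitrary); with n >= 8 and AVX_ENABLE defined, the vector loop handles the
 * first 4*(n/4) elements and the scalar loop the rest. */
#include <stdio.h>

static void sumvd_demo(void)
{
    double a[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    double b[9] = {9, 8, 7, 6, 5, 4, 3, 2, 1};
    double out[9];
    sumvd(a, b, 9, out);
    printf("%f %f\n", out[0], out[8]);   /* prints 10.000000 10.000000 */
}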
void jacobi_avx(GRID_T *oldGrid, GRID_T *newGrid, int width, int height){
    int remainder;
    remainder = (width-2)%4;

    /* Each of the four vectors holds one kind of neighbour value for the Jacobi
     * step: the upper, lower, left or right neighbours of four consecutive points. */
    __m256d up_row, below_row, right_row, left_row;
    __m256d factor = _mm256_set1_pd(0.25);

    for(int i = 1; i < height-1; i++){
        for(int j = 1; j < width-4; j += 4){
            up_row    = _mm256_loadu_pd(&(oldGrid[(i-1)*width + j]));
            below_row = _mm256_loadu_pd(&(oldGrid[(i+1)*width + j]));
            right_row = _mm256_loadu_pd(&(oldGrid[i*width + (j+1)]));
            left_row  = _mm256_loadu_pd(&(oldGrid[i*width + (j-1)]));

            /* Sum up the n-th element of each vector */
            __m256d dest;
            __m256d add_1 = _mm256_add_pd(up_row, below_row);
            __m256d add_2 = _mm256_add_pd(left_row, right_row);
            dest = _mm256_add_pd(add_2, add_1);

            /* Multiply with 0.25 */
            dest = _mm256_mul_pd(dest, factor);

            // Use the unaligned store; the aligned store faults here because the
            // destination is not guaranteed to be 32-byte aligned.
            _mm256_storeu_pd(&(newGrid[i*width + j]), dest);
        }
        for(int j = width - remainder - 1; j < width -1; j++){
            newGrid[i*width + j] = (oldGrid[i*width + (j-1)] + oldGrid[i*width + (j+1)]
                + oldGrid[(i-1)*width + j] + oldGrid[(i+1)*width + j]) * 0.25;
        }
    }
    return;
}
// it moves horizontally inside a block (A upper triangular) void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0, x_1, x_2, x_3, y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3; zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ y_0_1_2_3_d = _mm256_setzero_pd(); // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc ); // forth col (avoid zero y_0_1_2_3) x_3 = _mm256_broadcast_sd( &x[3] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 ); // first col x_2 = _mm256_broadcast_sd( &x[2] ); x_2 = _mm256_blend_pd( x_2, zeros, 0x8 ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 ); A += 4*lda; x += 4; k=4; for(; k<kmax-3; k+=4) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); x_2 = _mm256_broadcast_sd( &x[2] ); x_3 = _mm256_broadcast_sd( &x[3] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 ); y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp ); ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 ); y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp ); A += 4*lda; x += 4; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d ); if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } }
void kernel_dgemv_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); double *tA, *tx; int k; int ka = kmax; // number from aligned positon __m256d aaxx_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0_1, y_0, y_1, y_2, y_3; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); y_1 = _mm256_castpd256_pd128(y_11); y_2 = _mm256_castpd256_pd128(y_22); y_3 = _mm256_castpd256_pd128(y_33); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; for(; k<ka; k++) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); a_01_11 = _mm_load_sd( &tA[0+lda*1] ); a_02_12 = _mm_load_sd( &tA[0+lda*2] ); a_03_13 = _mm_load_sd( &tA[0+lda*3] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_1 = _mm_add_sd (y_1, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_2 = _mm_add_sd (y_2, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_3 = _mm_add_sd (y_3, ax_temp ); tA += 1; tx += 1; } y_00 = _mm256_castpd128_pd256(y_0); y_11 = _mm256_castpd128_pd256(y_1); y_22 = _mm256_castpd128_pd256(y_2); y_33 = _mm256_castpd128_pd256(y_3); k=0; for(; k<ka-7; k+=8) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } for(; k<ka-3; k+=4) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); 
aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } __m256d y_0_1_2_3; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_00 = _mm256_add_pd( y_00, y_11 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } }
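/* Editor's illustrative helper isolating the reduction used at the end of the
 * dgemv_t kernels above: four accumulators, each holding partial sums of one
 * dot product, are reduced to {sum(a0), sum(a1), sum(a2), sum(a3)} with
 * hadd + permute2f128. The helper name is hypothetical. */
#include <immintrin.h>

static inline __m256d example_hsum4(__m256d a0, __m256d a1, __m256d a2, __m256d a3)
{
    __m256d h01 = _mm256_hadd_pd(a0, a1);                 /* [a0:0+1, a1:0+1, a0:2+3, a1:2+3] */
    __m256d h23 = _mm256_hadd_pd(a2, a3);                 /* [a2:0+1, a3:0+1, a2:2+3, a3:2+3] */
    __m256d lo  = _mm256_permute2f128_pd(h23, h01, 0x02); /* [a0:0+1, a1:0+1, a2:0+1, a3:0+1] */
    __m256d hi  = _mm256_permute2f128_pd(h23, h01, 0x13); /* [a0:2+3, a1:2+3, a2:2+3, a3:2+3] */
    return _mm256_add_pd(lo, hi);
}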
void kernel_dgemv_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); double *tA, *tx; int k; int ka = kmax; // number from aligned positon __m256d aaxx_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0_1, y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_44 = _mm256_setzero_pd(); y_55 = _mm256_setzero_pd(); y_66 = _mm256_setzero_pd(); y_77 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); y_1 = _mm256_castpd256_pd128(y_11); y_2 = _mm256_castpd256_pd128(y_22); y_3 = _mm256_castpd256_pd128(y_33); y_4 = _mm256_castpd256_pd128(y_44); y_5 = _mm256_castpd256_pd128(y_55); y_6 = _mm256_castpd256_pd128(y_66); y_7 = _mm256_castpd256_pd128(y_77); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; if(ka-k>0) // it can be only ka-k = {1, 2, 3} { if((ka-k)>=2) { x_0_1 = _mm_load_pd( &tx[0] ); a_00_10 = _mm_load_pd( &tA[0+lda*0] ); a_01_11 = _mm_load_pd( &tA[0+lda*1] ); a_02_12 = _mm_load_pd( &tA[0+lda*2] ); a_03_13 = _mm_load_pd( &tA[0+lda*3] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd (y_0, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_1 = _mm_add_pd (y_1, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_2 = _mm_add_pd (y_2, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_3 = _mm_add_pd (y_3, ax_temp ); a_00_10 = _mm_load_pd( &tA[0+lda*4] ); a_01_11 = _mm_load_pd( &tA[0+lda*5] ); a_02_12 = _mm_load_pd( &tA[0+lda*6] ); a_03_13 = _mm_load_pd( &tA[0+lda*7] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_4 = _mm_add_pd (y_4, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_5 = _mm_add_pd (y_5, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_6 = _mm_add_pd (y_6, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_7 = _mm_add_pd (y_7, ax_temp ); tA += 2; tx += 2; k+=2; } if((ka-k)==1) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); a_01_11 = _mm_load_sd( &tA[0+lda*1] ); a_02_12 = _mm_load_sd( &tA[0+lda*2] ); a_03_13 = _mm_load_sd( &tA[0+lda*3] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_1 = _mm_add_sd (y_1, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_2 = _mm_add_sd (y_2, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_3 = _mm_add_sd (y_3, ax_temp ); a_00_10 = _mm_load_sd( &tA[0+lda*4] ); a_01_11 = _mm_load_sd( &tA[0+lda*5] ); a_02_12 = _mm_load_sd( &tA[0+lda*6] ); a_03_13 = _mm_load_sd( &tA[0+lda*7] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_4 = _mm_add_sd (y_4, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_5 = _mm_add_sd (y_5, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_6 = _mm_add_sd (y_6, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_7 = _mm_add_sd (y_7, ax_temp ); tA += 1; tx += 1; k++; } } y_00 = _mm256_castpd128_pd256(y_0); y_11 = _mm256_castpd128_pd256(y_1); y_22 = _mm256_castpd128_pd256(y_2); y_33 = _mm256_castpd128_pd256(y_3); y_44 = _mm256_castpd128_pd256(y_4); y_55 = _mm256_castpd128_pd256(y_5); y_66 = _mm256_castpd128_pd256(y_6); y_77 = _mm256_castpd128_pd256(y_7); k=0; for(; k<ka-7; k+=8) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); 
x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } for(; k<ka-3; k+=4) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = 
_mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } __m256d y_0_1_2_3, y_4_5_6_7; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_44 = _mm256_hadd_pd(y_44, y_55); y_66 = _mm256_hadd_pd(y_66, y_77); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 ); y_44 = _mm256_permute2f128_pd(y_66, y_44, 19); y_00 = _mm256_add_pd( y_00, y_11 ); y_44 = _mm256_add_pd( y_44, y_55 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); _mm256_storeu_pd(&y[4], y_44); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } }
// it moves horizontally inside a block void kernel_dgemv_n_8_lib4(int kmax, double *A0, double *A1, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m256d ax_temp, a_00_10_20_30, a_01_11_21_31, a_40_50_60_70, a_41_51_61_71, x_0, x_1, y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3, y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7; y_0_1_2_3 = _mm256_setzero_pd(); y_4_5_6_7 = _mm256_setzero_pd(); y_0_1_2_3_b = _mm256_setzero_pd(); y_4_5_6_7_b = _mm256_setzero_pd(); if(kmax<=64) { k=0; for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A0 + 4*lda );*/ /* __builtin_prefetch( A1 + 4*lda );*/ x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); /* __builtin_prefetch( A0 + 5*lda );*/ /* __builtin_prefetch( A1 + 5*lda );*/ x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; } } else { k=0; for(; k<kmax-3; k+=4) { __builtin_prefetch( A0 + 4*lda ); __builtin_prefetch( A1 + 4*lda ); x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); /* __builtin_prefetch( A0 + 5*lda );*/ /* __builtin_prefetch( A1 + 5*lda );*/ __builtin_prefetch( A0 + 6*lda ); __builtin_prefetch( A1 + 6*lda ); x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = 
_mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; } } if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 2*lda; A1 += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); /* A0 += 1*lda;*/ /* A1 += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } }
/*! * \brief Unaligned store of the given packed vector at the * given memory position */ ETL_STATIC_INLINE(void) storeu(etl::complex<double>* memory, avx_simd_complex_double<etl::complex<double>> value) { _mm256_storeu_pd(reinterpret_cast<double*>(memory), value.value); }
/*! * \brief Unaligned store of the given packed vector at the * given memory position */ ETL_STATIC_INLINE(void) storeu(double* memory, avx_simd_double value) { _mm256_storeu_pd(memory, value.value); }
// it moves horizontally inside a block void kernel_dgemv_n_4_lib4(int kmax, double *A, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; int k; __m256d ax_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0, x_1, x_2, x_3, y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3; y_0_1_2_3 = _mm256_setzero_pd(); y_0_1_2_3_b = _mm256_setzero_pd(); y_0_1_2_3_c = _mm256_setzero_pd(); y_0_1_2_3_d = _mm256_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); x_2 = _mm256_broadcast_sd( &x[2] ); x_3 = _mm256_broadcast_sd( &x[3] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 ); y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp ); ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 ); y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp ); A += 4*lda; x += 4; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d ); if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); /* A += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); } }
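/* Editor's illustrative scalar reference for kernel_dgemv_n_4_lib4 above: A is
 * a packed 4 x kmax panel (columns of 4 doubles, stride lda == 4) and alg
 * selects y = A*x (alg==0), y += A*x (alg==1) or y -= A*x (alg==-1). */
static void dgemv_n_4_ref(int kmax, const double *A, const double *x, double *y, int alg)
{
    double t[4] = {0.0, 0.0, 0.0, 0.0};
    for (int k = 0; k < kmax; k++)
        for (int i = 0; i < 4; i++)
            t[i] += A[i + 4 * k] * x[k];
    for (int i = 0; i < 4; i++)
        y[i] = (alg == 0) ? t[i] : (alg == 1) ? y[i] + t[i] : y[i] - t[i];
}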
inline void vector4d::store_unaligned(double* dst) const { _mm256_storeu_pd(dst, m_value); }
// it moves vertically across blocks void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg) { if(kmax<=0) return; /*printf("\nciao %d\n", kmax); */ const int bs = 4; __builtin_prefetch( A + bs*0 ); __builtin_prefetch( A + bs*2 ); int k, ka; ka = kmax; // number from aligned positon double k_left; // double *sA, *sy_n, *sx_t; static double d_mask[4] = {0.5, 1.5, 2.5, 3.5}; __m256d v_mask, zeros, temp, a_00, a_01, a_02, a_03, x_n_0, x_n_1, x_n_2, x_n_3, y_n_0, x_t_0, y_t_0, y_t_1, y_t_2, y_t_3; __m256i i_mask; #if 0 __m128d stemp, sa_00, sa_01, sa_02, sa_03, sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0, sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3; #endif zeros = _mm256_setzero_pd(); x_n_0 = _mm256_broadcast_sd( &x_n[0] ); x_n_1 = _mm256_broadcast_sd( &x_n[1] ); x_n_2 = _mm256_broadcast_sd( &x_n[2] ); x_n_3 = _mm256_broadcast_sd( &x_n[3] ); if(alg==-1) // TODO xor { x_n_0 = _mm256_sub_pd( zeros, x_n_0 ); x_n_1 = _mm256_sub_pd( zeros, x_n_1 ); x_n_2 = _mm256_sub_pd( zeros, x_n_2 ); x_n_3 = _mm256_sub_pd( zeros, x_n_3 ); } y_t_0 = _mm256_setzero_pd(); y_t_1 = _mm256_setzero_pd(); y_t_2 = _mm256_setzero_pd(); y_t_3 = _mm256_setzero_pd(); #if 0 sx_n_0 = _mm256_castpd256_pd128( x_n_0 ); sx_n_1 = _mm256_castpd256_pd128( x_n_1 ); sx_n_2 = _mm256_castpd256_pd128( x_n_2 ); sx_n_3 = _mm256_castpd256_pd128( x_n_3 ); sy_t_0 = _mm256_castpd256_pd128( y_t_0 ); sy_t_1 = _mm256_castpd256_pd128( y_t_1 ); sy_t_2 = _mm256_castpd256_pd128( y_t_2 ); sy_t_3 = _mm256_castpd256_pd128( y_t_3 ); k = bs*(ka/bs); sA = A + (ka/bs)*sda*bs; sy_n = y_n + (ka/bs)*bs; sx_t = x_t + (ka/bs)*bs; for(; k<ka; k++) { sy_n_0 = _mm_load_sd( &sy_n[0] ); sx_t_0 = _mm_load_sd( &sx_t[0] ); sa_00 = _mm_load_sd( &sA[0+bs*0] ); sa_01 = _mm_load_sd( &sA[0+bs*1] ); sa_02 = _mm_load_sd( &sA[0+bs*2] ); sa_03 = _mm_load_sd( &sA[0+bs*3] ); stemp = _mm_mul_sd( sa_00, sx_n_0 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_00, sx_t_0 ); sy_t_0 = _mm_add_sd( sy_t_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_n_1 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_t_0 ); sy_t_1 = _mm_add_sd( sy_t_1, stemp ); stemp = _mm_mul_sd( sa_02, sx_n_2 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_02, sx_t_0 ); sy_t_2 = _mm_add_sd( sy_t_2, stemp ); stemp = _mm_mul_sd( sa_03, sx_n_3 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_03, sx_t_0 ); sy_t_3 = _mm_add_sd( sy_t_3, stemp ); _mm_store_sd( &sy_n[0], sy_n_0 ); sA += 1; sy_n += 1; sx_t += 1; } y_t_0 = _mm256_castpd128_pd256( sy_t_0 ); y_t_1 = _mm256_castpd128_pd256( sy_t_1 ); y_t_2 = _mm256_castpd128_pd256( sy_t_2 ); y_t_3 = _mm256_castpd128_pd256( sy_t_3 ); #endif k=0; // corner if(tri==1) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 12 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); temp = 
_mm256_blend_pd( zeros, temp, 12 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; k += 4; } for(; k<ka-7; k+=2*bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } for(; k<ka-3; k+=bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } if(k<ka) { k_left = ka-k; v_mask = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) ); i_mask = _mm256_castpd_si256( v_mask ); // 
__builtin_prefetch( A + sda*bs +bs*0 ); // __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 ); // A += sda*bs; // y_n += 4; // z_n += 4; // x_t += 4; } __m256d y_0_1_2_3; y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 ); y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 ); y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2 ); y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 ); y_t_0 = _mm256_add_pd( y_t_0, y_t_1 ); if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } }
void test (double *e, __m256d a)
{
  _mm256_storeu_pd (e, a);
}
KFR_INTRINSIC void write(cunaligned_t, f64* ptr, const f64avx& x) { _mm256_storeu_pd(ptr, x.v); }
// it moves vertically across blocks void kernel_dtrmv_u_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; /* __builtin_prefetch( A + 0*lda );*/ /* __builtin_prefetch( A + 2*lda );*/ /* double *tA, *tx;*/ int k; __m256d zeros, tmp0, tmp1, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); k=0; for(; k<kmax-7; k+=8) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } zeros = _mm256_setzero_pd(); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); __m256d y_0_1_2_3; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_11 = 
_mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_00 = _mm256_add_pd( y_00, y_11 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } }
// it moves vertically across blocks void kernel_dtrmv_u_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; /* __builtin_prefetch( A + 0*lda );*/ /* __builtin_prefetch( A + 2*lda );*/ /* __builtin_prefetch( A + 4*lda );*/ /* __builtin_prefetch( A + 6*lda );*/ /* double *tA, *tx;*/ int k; /* int ka = kmax-kna; // number from aligned positon*/ __m256d zeros, tmp0, tmp1, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77; /* __m128d*/ /* ax_temp,*/ /* a_00_10, a_01_11, a_02_12, a_03_13,*/ /* x_0_1,*/ /* y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;*/ y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_44 = _mm256_setzero_pd(); y_55 = _mm256_setzero_pd(); y_66 = _mm256_setzero_pd(); y_77 = _mm256_setzero_pd(); k=0; for(; k<kmax-7; k+=8) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); A += 4 + (sda-1)*lda; x += 4; /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 
); A += 4 + (sda-1)*lda; x += 4; } /* for(; k<ka-3; k+=4)*/ /* {*/ /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ /* x_0_1_2_3 = _mm256_loadu_pd( &x[0] );*/ /* a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );*/ /* a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );*/ /* a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );*/ /* a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );*/ /* */ /* aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/ /* y_00 = _mm256_add_pd( y_00, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/ /* y_11 = _mm256_add_pd( y_11, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/ /* y_22 = _mm256_add_pd( y_22, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/ /* y_33 = _mm256_add_pd( y_33, aaxx_temp );*/ /* */ /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ /* a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );*/ /* a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );*/ /* a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );*/ /* a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );*/ /* */ /* aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/ /* y_44 = _mm256_add_pd( y_44, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/ /* y_55 = _mm256_add_pd( y_55, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/ /* y_66 = _mm256_add_pd( y_66, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/ /* y_77 = _mm256_add_pd( y_77, aaxx_temp );*/ /* A += 4 + (sda-1)*lda;*/ /* x += 4;*/ /* }*/ zeros = _mm256_setzero_pd(); // top triangle x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); // top square a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); A += 4 + (sda-1)*lda; x += 4; // bottom triangle x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = 
_mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); // store __m256d y_0_1_2_3, y_4_5_6_7; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_44 = _mm256_hadd_pd(y_44, y_55); y_66 = _mm256_hadd_pd(y_66, y_77); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 ); y_44 = _mm256_permute2f128_pd(y_66, y_44, 19); y_00 = _mm256_add_pd( y_00, y_11 ); y_44 = _mm256_add_pd( y_44, y_55 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); _mm256_storeu_pd(&y[4], y_44); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } }
// it moves horizontally inside a block void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg) { if(kmax<=0) return; double *A1 = A0 + 4*sda; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_40_50_60_70, a_41_51_61_71, x_0, x_1, y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3, y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7; /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_4_5_6_7 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_4_5_6_7_b = _mm256_setzero_pd(); */ zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ /* y_0_1_2_3_d = _mm256_setzero_pd();*/ // upper triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A0[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A0[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, y_0_1_2_3_b, 0xc ); // forth col (avoid zero y_0_1_2_3) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // first col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; // upper squared x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); // lower triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A1[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A1[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 ); y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, y_4_5_6_7_b, 0xc ); // forth col (avoid zero y_4_5_6_7) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] ); y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // first col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; k=8; for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A0 + 4*lda );*/ /* __builtin_prefetch( A1 + 4*lda );*/ x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); 
a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); /* __builtin_prefetch( A0 + 5*lda );*/ /* __builtin_prefetch( A1 + 5*lda );*/ x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; } if(kmax%4>=2) { x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 2*lda; A1 += 2*lda; x += 2; } y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b ); if(kmax%2==1) { x_0 = _mm256_broadcast_sd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp ); /* A0 += 1*lda;*/ /* A1 += 1*lda;*/ /* x += 1;*/ } if(alg==0) { _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else if(alg==1) { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } else // alg==-1 { z_0_1_2_3 = _mm256_loadu_pd( &y[0] ); z_4_5_6_7 = _mm256_loadu_pd( &y[4] ); z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 ); z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 ); _mm256_storeu_pd(&y[0], z_0_1_2_3); _mm256_storeu_pd(&y[4], z_4_5_6_7); } }
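/*
 * A plain-C reference sketch (not from the library) of the operation kernel_dtrmv_u_n_8_lib4
 * appears to compute: the first 8 entries of y receive the product of an upper-triangular
 * matrix U with x over kmax columns, merged into y according to alg as above. For readability
 * this sketch assumes ordinary column-major storage with leading dimension ldu, not the
 * panel-major (lib4) layout with panel stride sda that the kernel itself expects.
 */
static void ref_dtrmv_u_n_8(int kmax, const double *U, int ldu, const double *x,
                            double *y, int alg)
{
    double res[8] = { 0.0 };
    int i, j;
    for (j = 0; j < kmax; j++)
        for (i = 0; i < 8 && i <= j; i++)   /* U is upper triangular: U(i,j) == 0 for i > j */
            res[i] += U[i + j * ldu] * x[j];
    for (i = 0; i < 8; i++) {
        if (alg == 0)      y[i]  = res[i];
        else if (alg == 1) y[i] += res[i];
        else               y[i] -= res[i];
    }
}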
ALGEBRA_INLINE void vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n) { size_t k; __m256d l1 = _mm256_broadcast_sd(&lambda); __m256d l2 = _mm256_broadcast_sd(&lambda); __m256d l3 = _mm256_broadcast_sd(&lambda); __m256d l4 = _mm256_broadcast_sd(&lambda); size_t q = n / 16; size_t r = n % 16; if(q > 0) { if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) { for (k=0;k<q;k++) { /* Load 16 values (four 4-double vectors) from each array */ __m256d i1 = _mm256_load_pd(v1); __m256d j1 = _mm256_load_pd(v2); __m256d i2 = _mm256_load_pd(v1+4); __m256d j2 = _mm256_load_pd(v2+4); __m256d i3 = _mm256_load_pd(v1+8); __m256d j3 = _mm256_load_pd(v2+8); __m256d i4 = _mm256_load_pd(v1+12); __m256d j4 = _mm256_load_pd(v2+12); /* Multiply by lambda */ j1 = _mm256_mul_pd(j1, l1); j2 = _mm256_mul_pd(j2, l2); j3 = _mm256_mul_pd(j3, l3); j4 = _mm256_mul_pd(j4, l4); /* Add */ i1 = _mm256_add_pd(i1,j1); i2 = _mm256_add_pd(i2,j2); i3 = _mm256_add_pd(i3,j3); i4 = _mm256_add_pd(i4,j4); /* Store */ _mm256_store_pd(v1, i1); _mm256_store_pd(v1+4, i2); _mm256_store_pd(v1+8, i3); _mm256_store_pd(v1+12, i4); v1 += 16; v2 += 16; } } else { for (k=0;k<q;k++) { /* Load 16 values (four 4-double vectors) from each array */ __m256d i1 = _mm256_loadu_pd(v1); __m256d j1 = _mm256_loadu_pd(v2); __m256d i2 = _mm256_loadu_pd(v1+4); __m256d j2 = _mm256_loadu_pd(v2+4); __m256d i3 = _mm256_loadu_pd(v1+8); __m256d j3 = _mm256_loadu_pd(v2+8); __m256d i4 = _mm256_loadu_pd(v1+12); __m256d j4 = _mm256_loadu_pd(v2+12); /* Multiply by lambda */ j1 = _mm256_mul_pd(j1, l1); j2 = _mm256_mul_pd(j2, l2); j3 = _mm256_mul_pd(j3, l3); j4 = _mm256_mul_pd(j4, l4); /* Add */ i1 = _mm256_add_pd(i1,j1); i2 = _mm256_add_pd(i2,j2); i3 = _mm256_add_pd(i3,j3); i4 = _mm256_add_pd(i4,j4); /* Store */ _mm256_storeu_pd(v1, i1); _mm256_storeu_pd(v1+4, i2); _mm256_storeu_pd(v1+8, i3); _mm256_storeu_pd(v1+12, i4); v1 += 16; v2 += 16; } } } for(k = 0 ; k<r ; k++) v1[k] += lambda*v2[k]; }
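/*
 * A hypothetical usage sketch for vector_addm_double_aligned_32, which performs the
 * daxpy-style update v1[k] += lambda * v2[k] for all k. The aligned_alloc calls are an
 * assumption used here to obtain the 32-byte alignment that lets the faster
 * _mm256_load_pd/_mm256_store_pd branch trigger; unaligned buffers still work through
 * the _mm256_loadu_pd/_mm256_storeu_pd branch, and the scalar tail handles n % 16.
 */
#include <stdlib.h>
static void example_vector_addm(void)
{
    size_t n = 1000, i;
    double *a = (double *)aligned_alloc(32, n * sizeof(double));
    double *b = (double *)aligned_alloc(32, n * sizeof(double));
    if (!a || !b) { free(a); free(b); return; }
    for (i = 0; i < n; i++) { a[i] = (double)i; b[i] = 1.0; }
    vector_addm_double_aligned_32(a, 0.5, b, n);   /* a[i] += 0.5 * b[i] */
    free(a);
    free(b);
}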
// add *s to *p element-wise, applied to all n COREARRAY_DLL_DEFAULT void vec_f64_add(double *p, const double *s, size_t n) { #if defined(COREARRAY_SIMD_AVX) switch ((size_t)p & 0x1F) { case 0x08: if (n > 0) { (*p++) += (*s++); n--; } case 0x10: if (n > 0) { (*p++) += (*s++); n--; } case 0x18: if (n > 0) { (*p++) += (*s++); n--; } case 0x00: for (; n >= 4; n-=4) { _mm256_store_pd(p, _mm256_add_pd(_mm256_load_pd(p), _mm256_loadu_pd(s))); p += 4; s += 4; } if (n >= 2) { _mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s))); p += 2; s += 2; n -= 2; } break; default: for (; n >= 4; n-=4) { _mm256_storeu_pd(p, _mm256_add_pd(_mm256_loadu_pd(p), _mm256_loadu_pd(s))); p += 4; s += 4; } if (n >= 2) { _mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s))); p += 2; s += 2; n -= 2; } } #elif defined(COREARRAY_SIMD_SSE2) switch ((size_t)p & 0x0F) { case 0x08: if (n > 0) { (*p++) += (*s++); n--; } case 0x00: for (; n >= 2; n-=2) { _mm_store_pd(p, _mm_add_pd(_mm_load_pd(p), _mm_loadu_pd(s))); p += 2; s += 2; } break; default: for (; n >= 2; n-=2) { _mm_storeu_pd(p, _mm_add_pd(_mm_loadu_pd(p), _mm_loadu_pd(s))); p += 2; s += 2; } } #endif for (; n > 0; n--) (*p++) += (*s++); }
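/*
 * A plain-C reference (a sketch, not part of the CoreArray sources) of what vec_f64_add
 * computes: the element-wise update p[i] += s[i]. In the AVX branch above, the fall-through
 * switch on (size_t)p & 0x1F peels one to three leading elements when p is 8-byte but not
 * 32-byte aligned, so that the aligned _mm256_store_pd path can be used afterwards; this
 * reference ignores alignment entirely and is intended only for checking the SIMD paths.
 */
static void vec_f64_add_ref(double *p, const double *s, size_t n)
{
    size_t i;
    for (i = 0; i < n; i++)
        p[i] += s[i];
}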