inline vector4d haddp(const vector4d* row)
{
    // row = (a, b, c, d)
    // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
    __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
    // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
    __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
    // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
    __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
    // tmp1 = (a2+a3, b2+b3, c0+c1, d0+d1)
    tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
    return _mm256_add_pd(tmp1, tmp2);
}
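// Reference semantics for the kernel above: element i of the returned vector
// is the full horizontal sum of row[i]. A scalar sketch for testing, written
// against plain arrays instead of the vector4d wrapper (whose definition is
// not shown here):
static void haddp_ref(const double row[4][4], double out[4])
{
    for (int i = 0; i < 4; ++i)
        out[i] = row[i][0] + row[i][1] + row[i][2] + row[i][3];
}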
inline float64x4_t dot(const float64x4_t ymm1, const float64x4_t ymm2)
{
    float64x4_t mul0  = _mm256_mul_pd(ymm1, ymm2);
    // hadd adds within each 128-bit lane: (m0+m1, m0+m1, m2+m3, m2+m3)
    float64x4_t hadd0 = _mm256_hadd_pd(mul0, mul0);
    float64x2_t ext0  = _mm256_extractf128_pd(hadd0, 0);
    float64x2_t ext1  = _mm256_extractf128_pd(hadd0, 1);
    // combine the two lane sums, then broadcast the full dot product to all four elements
    float64x2_t add0  = _mm_add_pd(ext0, ext1);
    return _mm256_broadcast_pd(&add0);
}
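// Minimal usage sketch for the broadcast dot product above, assuming
// float64x4_t / float64x2_t are typedefs for __m256d / __m128d (as the
// intrinsics imply) and <immintrin.h> is included. The returned vector holds
// the scalar dot product in all four elements, which is convenient when the
// result feeds further vector arithmetic.
static double dot_usage_example(void)
{
    __m256d a = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); // (1, 2, 3, 4); set_pd lists elements high to low
    __m256d b = _mm256_set_pd(8.0, 7.0, 6.0, 5.0); // (5, 6, 7, 8)
    __m256d d = dot(a, b);                         // every element == 1*5 + 2*6 + 3*7 + 4*8 == 70
    return _mm_cvtsd_f64(_mm256_castpd256_pd128(d));
}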
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d
        curr,
        forw,
        coef_0,
        coef_1,
        end = _mm256_load_pd((const double *)cords),
        accum_sum = _mm256_setzero_pd();
    double accum_sum_aux;

    unsigned long index;
    for (index = 0; index < (cords_len - 4); index += 4) {
        curr = end;                                               // x0, y0, x1, y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]); // x2, y2, x3, y3
        end  = _mm256_load_pd((const double *)&cords[index + 4]); // x4, y4, x5, y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001);  // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end,  0b00100000);  // x2, y2, x4, y4

        // _mm256_hsub_pd(a, b) == (a0 - a1, b0 - b1, a2 - a3, b2 - b3)
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1*x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000), // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)       // y1, x1, y3, x3
                ),
                // coef_0 * (y2, x2, y4, x4) == x1*y2, y1*x2, x3*y4, y3*x4
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101))
            )
        );
    }

    // accum_sum = (a0+a1, a2+a3, a2+a3, a0+a1)
    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1));
    // accum_sum = (a0+a1+a2+a3, ...)
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum);

    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
    // return scalar_half(scalar_abs(accum_sum_aux));
}
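// Scalar shoelace reference for cross-checking the vectorized accumulation
// above. It assumes the layout the intrinsics imply (cords is an array of
// (x, y) pairs) and, like the snippet, leaves the final abs/half to the
// caller. irreg_poly_area_func_sign and _calc_diff_of_adj_prods are project
// macros/helpers not shown here, so this is only a reference sketch.
typedef struct { double x, y; } point2d_ref;

static double poly_area_accum_ref(const point2d_ref *cords, unsigned long cords_len)
{
    double acc = 0.0;
    for (unsigned long i = 0; i + 1 < cords_len; ++i)
        acc += cords[i].x * cords[i + 1].y - cords[i].y * cords[i + 1].x; // x_i*y_{i+1} - y_i*x_{i+1}
    return acc; // caller applies scalar_half(scalar_abs(...)) as in the commented-out return
}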
double hadd(const vector4d& rhs)
{
    // rhs = (x0, x1, x2, x3)
    // tmp = (x2, x3, x0, x1)
    __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
    // tmp = (x0+x2, x1+x3, -, -)
    tmp = _mm256_add_pd(rhs, tmp);
    // tmp = (x0+x2+x1+x3, -, -, -)
    tmp = _mm256_hadd_pd(tmp, tmp);
    return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
}
void gvrotg_avx(double *c, double *s, double *r, double a, double b)
{
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;

    if (b == 0.0) { *c = 1.0; *s = 0.0; *r = a; return; }
    if (a == 0.0) { *c = 0.0; *s = 1.0; *r = b; return; }

    // _mm256_set_pd() lists elements from index 3 down to 0
    // x[0], x[1]: |a| > |b|; x[2], x[3]: |b| > |a|
    x0 = _mm256_set_pd(1.0, a, b, 1.0);  // x0 = {1, a, b, 1}
    x1 = _mm256_set_pd(1.0, b, a, 1.0);  // x1 = {1, b, a, 1}
    t0 = _mm256_div_pd(x0, x1);          // t0 = {1, a/b, b/a, 1}
    x0 = _mm256_mul_pd(t0, t0);          // x0 = {1, (a/b)^2, (b/a)^2, 1}
    t2 = _mm256_hadd_pd(x0, x0);         // t2 = {1+(a/b)^2, .., (b/a)^2+1, ..}
    u0 = _mm256_sqrt_pd(t2);             // u0 = {sqrt(1+(a/b)^2), .., sqrt((b/a)^2+1), ..}
    one = _mm256_set1_pd(1.0);
    u1 = _mm256_div_pd(one, u0);
    b0 = _mm256_blend_pd(u0, u1, 0x9);   // b0 = {1/u(b), u(b), u(a), 1/u(a)}
    b0 = _mm256_mul_pd(b0, x1);          // b0 = {1/u(b), b*u(b), a*u(a), 1/u(a)}
    b1 = _mm256_mul_pd(t0, u1);          // b1 = {1/u(b), t*u(b), t*u(a), 1/u(a)}
    if (fabs(b) > fabs(a)) {
        *s = b0[3];                      // = 1/u(b)
        *r = b0[2];                      // = b*u(b)
        *c = b1[2];                      // = t*u(b)
        if (signbit(b)) {
            *s = -(*s);
            *c = -(*c);
            *r = -(*r);
        }
    } else {
        *c = b0[0];
        *r = b0[1];
        *s = b1[1];
    }
}
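// Scalar reference with the same contract as gvrotg_avx (compute c, s, r so
// that the Givens rotation maps (a, b) to (r, 0)), useful for checking the
// vector version. This is the classic overflow-safe formulation; it mirrors
// the branch structure and sign handling of the snippet above.
#include <math.h>

static void gvrotg_ref(double *c, double *s, double *r, double a, double b)
{
    if (b == 0.0) { *c = 1.0; *s = 0.0; *r = a; return; }
    if (a == 0.0) { *c = 0.0; *s = 1.0; *r = b; return; }
    if (fabs(b) > fabs(a)) {
        double t = a / b;                 // |t| <= 1, so 1 + t*t cannot overflow
        double u = sqrt(1.0 + t * t);
        *s = 1.0 / u;
        *c = t * (*s);
        *r = b * u;
        if (signbit(b)) { *s = -(*s); *c = -(*c); *r = -(*r); }
    } else {
        double t = b / a;
        double u = sqrt(1.0 + t * t);
        *c = 1.0 / u;
        *s = t * (*c);
        *r = a * u;
    }
}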
double HodgkinHuxley::dV(double *V, double I) { const double C = 1.0; const double gNa = 120.0; const double gK = 36.0; const double gL = 0.3; const double ENa = 50.0; const double EK = -77.0; const double EL = -54.4; #ifdef __AVX__ /* AVX is an instruction set from Intel which allows simultaneous operation on 4 doubles. Seems to be slower than optimized FPU, though. */ double Va[] __attribute__ ((aligned (32))) = {V[0], V[0], V[0], 1.0}, Ea[] __attribute__ ((aligned (32))) = {EL, ENa, EK, 0.0}, Ga[] __attribute__ ((aligned (32))) = {-gL, -gNa * pow(V[2], 3.0) * V[3], -gK * pow(V[1], 4.0), I}; // load V __m256d Vr = _mm256_load_pd(Va); // load E __m256d Er = _mm256_load_pd(Ea); // load G __m256d Gr = _mm256_load_pd(Ga); // subtract Vr = _mm256_sub_pd(Vr, Er); // dot product (why does intel not have _mm256_dp_pd ?) Vr = _mm256_mul_pd(Vr, Gr); __m256d temp = _mm256_hadd_pd(Vr, Vr); __m128d lo128 = _mm256_extractf128_pd(temp, 0); __m128d hi128 = _mm256_extractf128_pd(temp, 1); __m128d dotproduct = _mm_add_pd(lo128, hi128); double sseVal; // store _mm_storel_pd(&sseVal, dotproduct); sseVal /= C; return sseVal; #else return (-gL * (V[0] - EL) - gNa * pow(V[2], 3.0) * V[3] * (V[0] - ENa) - gK * pow(V[1], 4.0) * (V[0] - EK) + I) / C; #endif }
ALGEBRA_INLINE double vector_ps_double (const double* pa, const double* pb, size_t n)
{
    if(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
        size_t q = n / 4;
        size_t r = n % 4;
        double w = 0;

        if(q > 0) {
            __m256d acc = _mm256_setzero_pd();
            __m256d i1 = _mm256_load_pd(pa);
            __m256d j1 = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            __m256d s = _mm256_mul_pd(i1, j1);
            acc = _mm256_add_pd(acc, s);

            while(--q != 0) {
                // load
                i1 = _mm256_load_pd(pa);
                j1 = _mm256_load_pd(pb);
                pa += 4;
                pb += 4;
                // multiply
                s = _mm256_mul_pd(i1, j1);
                // accumulate
                acc = _mm256_add_pd(acc, s);
            }

            // final sum
            // horizontal add within each 128-bit lane
            acc = _mm256_hadd_pd(acc, acc);
            // swap the high and low 128-bit halves
            __m256d accp = _mm256_permute2f128_pd(acc, acc, 1);
            // vertical add
            acc = _mm256_add_pd(acc, accp);
            // extract
            _mm_store_sd(&w, _mm256_extractf128_pd(acc, 0));
        }
        return w + vector_ps_double_basic(pa, pb, r);
    }
    return vector_ps_double_basic(pa, pb, n);
}
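// The scalar fallback called above (for unaligned inputs and for the n % 4
// tail) is not shown. A minimal version consistent with the call sites, a
// plain dot product over n elements, might look like this; the real project
// definition may differ.
ALGEBRA_INLINE double vector_ps_double_basic(const double* pa, const double* pb, size_t n)
{
    double s = 0.0;
    for (size_t i = 0; i < n; ++i)
        s += pa[i] * pb[i];
    return s;
}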
void core::Vector3::normalize(void) { #if defined(VTX_USE_AVX) ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0}; ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0}; __m256d simdvector; __m256d result; __m256d recp; simdvector = _mm256_load_pd(vector); recp = _mm256_load_pd(reciprocalVector); result = _mm256_mul_pd(simdvector, simdvector); result = _mm256_hadd_pd(result, result); result = _mm256_hadd_pd(result, result); result = _mm256_sqrt_pd(result); result = _mm256_div_pd(recp, result); simdvector = _mm256_mul_pd(simdvector, result); _mm256_store_pd(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #elif defined(VTX_USE_SSE) // Must pad with a trailing 0, to store in 128-bit register ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0}; __m128 simdvector; __m128 result; simdvector = _mm_load_ps(vector); // (X^2, Y^2, Z^2, 0^2) result = _mm_mul_ps(simdvector, simdvector); // Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2) result = _mm_hadd_ps(result, result); result = _mm_hadd_ps(result, result); // Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_sqrt_ps(result); // Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2) result = _mm_rcp_ps(result); // Finally, multiply the result with our original vector. simdvector = _mm_mul_ps(simdvector, result); _mm_store_ps(vector, simdvector); this->x = vector[0]; this->y = vector[1]; this->z = vector[2]; #else core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2)); this->x *= num; this->y *= num; this->z *= num; #endif }
/*!
 * \brief Perform a horizontal sum of the given vector.
 * \param in The input vector type
 * \return the horizontal sum of the vector
 */
ETL_STATIC_INLINE(double) hadd(avx_simd_double in) {
    const __m256d t1 = _mm256_hadd_pd(in.value, _mm256_permute2f128_pd(in.value, in.value, 1));
    const __m256d t2 = _mm256_hadd_pd(t1, t1);
    return _mm_cvtsd_f64(_mm256_castpd256_pd128(t2));
}
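// The same reduction, written against a raw __m256d for use outside ETL's
// avx_simd_double wrapper (ETL_STATIC_INLINE and avx_simd_double are
// ETL-specific and not defined here). Note that _mm256_hadd_pd only adds
// within each 128-bit lane, hence the preceding lane swap.
#include <immintrin.h>

static inline double hsum256_pd(__m256d v)
{
    // swap the 128-bit halves and hadd: (v0+v1, v2+v3, v2+v3, v0+v1)
    __m256d t1 = _mm256_hadd_pd(v, _mm256_permute2f128_pd(v, v, 1));
    // fold the two partial sums: every element now holds v0+v1+v2+v3
    __m256d t2 = _mm256_hadd_pd(t1, t1);
    return _mm_cvtsd_f64(_mm256_castpd256_pd128(t2));
}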
void kernel_dgemv_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; double *tA, *tx; int k; int ka = kmax; // number from aligned positon __m256d aaxx_temp, a_00_10_20_30, x_0_1_2_3, y_00; __m128d ax_temp, a_00_10, x_0_1, y_0, y_1, y_0_1; y_00 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; for(; k<ka; k++) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); /* y_0 += a_00_10 * x_0_1;*/ ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); tA += 1; tx += 1; } y_00 = _mm256_castpd128_pd256(y_0); k=0; for(; k<ka-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); /* y_00 += a_00_10_20_30 * x_0_1_2_3;*/ aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } y_00 = _mm256_hadd_pd(y_00, y_00); y_1 = _mm256_extractf128_pd(y_00, 1); y_0 = _mm256_castpd256_pd128(y_00); /* y_0 += y_1;*/ y_0 = _mm_add_sd( y_0, y_1 ); if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { y_0_1 = _mm_load_sd( &y[0] ); /* y_0_1 += y_0;*/ y_0_1 = _mm_add_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_load_sd( &y[0] ); /* y_0_1 -= y_0;*/ y_0_1 = _mm_sub_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } }
// Computes and returns the dot product of the n-vectors u and v. // Uses Intel AVX intrinsics to access the SIMD instruction set. double DotProductAVX(const double* u, const double* v, int n) { int max_offset = n - 4; int offset = 0; // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and // v, and multiplying them together in parallel. __m256d sum = _mm256_setzero_pd(); if (offset <= max_offset) { offset = 4; // Aligned load is reputedly faster but requires 32 byte aligned input. if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 && (reinterpret_cast<const uintptr_t>(v) & 31) == 0) { // Use aligned load. __m256d floats1 = _mm256_load_pd(u); __m256d floats2 = _mm256_load_pd(v); // Multiply. sum = _mm256_mul_pd(floats1, floats2); while (offset <= max_offset) { floats1 = _mm256_load_pd(u + offset); floats2 = _mm256_load_pd(v + offset); offset += 4; __m256d product = _mm256_mul_pd(floats1, floats2); sum = _mm256_add_pd(sum, product); } } else { // Use unaligned load. __m256d floats1 = _mm256_loadu_pd(u); __m256d floats2 = _mm256_loadu_pd(v); // Multiply. sum = _mm256_mul_pd(floats1, floats2); while (offset <= max_offset) { floats1 = _mm256_loadu_pd(u + offset); floats2 = _mm256_loadu_pd(v + offset); offset += 4; __m256d product = _mm256_mul_pd(floats1, floats2); sum = _mm256_add_pd(sum, product); } } } // Add the 4 product sums together horizontally. Not so easy as with sse, as // there is no add across the upper/lower 128 bit boundary, so permute to // move the upper 128 bits to lower in another register. __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1); sum = _mm256_hadd_pd(sum, sum2); sum = _mm256_hadd_pd(sum, sum); double result; // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse // instruction, as that introduces a 70 cycle delay. All this casting is to // fool the instrinsics into thinking we are extracting the bottom int64. auto cast_sum = _mm256_castpd_si256(sum); *(reinterpret_cast<inT64*>(&result)) = #if defined(_WIN32) || defined(__i386__) // This is a very simple workaround that is activated // for all platforms that do not have _mm256_extract_epi64. // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y] ((uint64_t*)&cast_sum)[0] #else _mm256_extract_epi64(cast_sum, 0) #endif ; while (offset < n) { result += u[offset] * v[offset]; ++offset; } return result; }
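// Quick consistency check for DotProductAVX against a plain loop; only the
// signature shown above is assumed. With 103 elements of 0.5 * 2.0 the AVX
// path exercises both the vector loop and the scalar tail, and both results
// are exact, so they should match.
static double DotProductNaive(const double* u, const double* v, int n) {
  double total = 0.0;
  for (int i = 0; i < n; ++i) total += u[i] * v[i];
  return total;
}

// Usage sketch:
//   std::vector<double> u(103, 0.5), v(103, 2.0);
//   double d_avx = DotProductAVX(u.data(), v.data(), static_cast<int>(u.size()));
//   double d_ref = DotProductNaive(u.data(), v.data(), static_cast<int>(u.size()));
//   // d_avx == d_ref == 103.0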
void ntt_transform(poly out, const poly o) { int s, pos = 0, offset; __m256d vt,vo0,vo10,vo11,vo20,vo21,vo22,vo23,vc,vp,vpinv,neg2,neg4; __m256d vx0,vx1,vx2,vx3,vx4,vx5,vx6,vx7; vpinv = _mm256_set_pd(PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE, PARAM_APPROX_P_INVERSE); vp = _mm256_set_pd(8383489., 8383489., 8383489., 8383489.); bitrev(out); vo10 = _mm256_load_pd(o+pos); vo20 = _mm256_load_pd(o+pos+4); neg2 = _mm256_load_pd(_neg2); neg4 = _mm256_load_pd(_neg4); // m = 2, m = 4, m = 8 (3 levels merged) for(s = 0; s<POLY_DEG; s+=8) { // No multiplication with omega required, respective value is 1 vx0 = _mm256_load_pd(out+s); vt = _mm256_mul_pd(vx0,neg2); vx0 = _mm256_hadd_pd(vx0,vt); vx1 = _mm256_load_pd(out+s+4); vt = _mm256_mul_pd(vx1,neg2); vx1 = _mm256_hadd_pd(vx1,vt); vx0 = _mm256_mul_pd(vx0, vo10); vc = _mm256_mul_pd(vx0, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vx0 = _mm256_sub_pd(vx0,vc); vt = _mm256_permute2f128_pd (vx0, vx0, 0x01); // now contains x2,x3,x0,x1 vx0 = _mm256_mul_pd(vx0, neg4); vx0 = _mm256_add_pd(vx0, vt); vx1 = _mm256_mul_pd(vx1, vo10); vc = _mm256_mul_pd(vx1, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vx1 = _mm256_sub_pd(vx1,vc); vt = _mm256_permute2f128_pd (vx1, vx1, 0x01); // now contains x2,x3,x0,x1 vx1 = _mm256_mul_pd(vx1, neg4); vx1 = _mm256_add_pd(vx1, vt); vt = _mm256_mul_pd(vx1, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx1 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+s+4, vx1); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+s+0, vx0); } pos += 8; // m = 16, m = 32, m = 64 (3 levels merged) for(offset = 0; offset < 8; offset+=4) { vo0 = _mm256_load_pd(o+pos+offset); vo10 = _mm256_load_pd(o+pos+offset+8); vo11 = _mm256_load_pd(o+pos+offset+16); for(s = 0; s<POLY_DEG; s+=64) { vx1 = _mm256_load_pd(out+offset+s+8); vt = _mm256_mul_pd(vx1, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx0 = _mm256_load_pd(out+offset+s+0); vx1 = _mm256_sub_pd(vx0, vt); // _mm256_store_pd(out+offset+s+8, vx1); vx0 = _mm256_add_pd(vx0, vt); // _mm256_store_pd(out+offset+s+0, vx0); vx3 = _mm256_load_pd(out+offset+s+24); vt = _mm256_mul_pd(vx3, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx2 = _mm256_load_pd(out+offset+s+16); vx3 = _mm256_sub_pd(vx2, vt); // _mm256_store_pd(out+offset+s+24, vx3); vx2 = _mm256_add_pd(vx2, vt); // _mm256_store_pd(out+offset+s+16, vx2); vx5 = _mm256_load_pd(out+offset+s+40); vt = _mm256_mul_pd(vx5, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx4 = _mm256_load_pd(out+offset+s+32); vx5 = _mm256_sub_pd(vx4, vt); // _mm256_store_pd(out+offset+s+40, vx5); vx4 = _mm256_add_pd(vx4, vt); // _mm256_store_pd(out+offset+s+32, vx4); vx7 = _mm256_load_pd(out+offset+s+56); vt = _mm256_mul_pd(vx7, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx6 = _mm256_load_pd(out+offset+s+48); vx7 = _mm256_sub_pd(vx6, vt); // _mm256_store_pd(out+offset+s+56, vx7); vx6 = _mm256_add_pd(vx6, vt); // _mm256_store_pd(out+offset+s+48, vx6); // vx2 = _mm256_load_pd(out+offset+s+16); vt = _mm256_mul_pd(vx2, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = 
_mm256_sub_pd(vt,vc); // vx0 = _mm256_load_pd(out+offset+s+0); vx2 = _mm256_sub_pd(vx0, vt); // _mm256_store_pd(out+offset+s+16, vx2); vx0 = _mm256_add_pd(vx0, vt); // _mm256_store_pd(out+offset+s+0, vx0); // vx6 = _mm256_load_pd(out+offset+s+48); vt = _mm256_mul_pd(vx6, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx4 = _mm256_load_pd(out+offset+s+32); vx6 = _mm256_sub_pd(vx4, vt); // _mm256_store_pd(out+offset+s+48, vx6); vx4 = _mm256_add_pd(vx4, vt); // _mm256_store_pd(out+offset+s+32, vx4); // vx3 = _mm256_load_pd(out+offset+s+24); vt = _mm256_mul_pd(vx3, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx1 = _mm256_load_pd(out+offset+s+8); vx3 = _mm256_sub_pd(vx1, vt); // _mm256_store_pd(out+offset+s+24, vx3); vx1 = _mm256_add_pd(vx1, vt); // _mm256_store_pd(out+offset+s+8, vx1); // vx7 = _mm256_load_pd(out+offset+s+56); vt = _mm256_mul_pd(vx7, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx5 = _mm256_load_pd(out+offset+s+40); vx7 = _mm256_sub_pd(vx5, vt); // _mm256_store_pd(out+offset+s+56, vx7); vx5 = _mm256_add_pd(vx5, vt); // _mm256_store_pd(out+offset+s+40, vx5); // vx4 = _mm256_load_pd(out+offset+s+32); vo20 = _mm256_load_pd(o+pos+offset+24); vt = _mm256_mul_pd(vx4, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx0 = _mm256_load_pd(out+offset+s+0); vx4 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+offset+s+32, vx4); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+offset+s+0, vx0); // vx5 = _mm256_load_pd(out+offset+s+40); vo21 = _mm256_load_pd(o+pos+offset+32); vt = _mm256_mul_pd(vx5, vo21); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx1 = _mm256_load_pd(out+offset+s+8); vx5 = _mm256_sub_pd(vx1, vt); _mm256_store_pd(out+offset+s+40, vx5); vx1 = _mm256_add_pd(vx1, vt); _mm256_store_pd(out+offset+s+8, vx1); // vx6 = _mm256_load_pd(out+offset+s+48); vo22 = _mm256_load_pd(o+pos+offset+40); vt = _mm256_mul_pd(vx6, vo22); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx2 = _mm256_load_pd(out+offset+s+16); vx6 = _mm256_sub_pd(vx2, vt); _mm256_store_pd(out+offset+s+48, vx6); vx2 = _mm256_add_pd(vx2, vt); _mm256_store_pd(out+offset+s+16, vx2); // vx7 = _mm256_load_pd(out+offset+s+56); vo23 = _mm256_load_pd(o+pos+offset+48); vt = _mm256_mul_pd(vx7, vo23); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); // vx3 = _mm256_load_pd(out+offset+s+24); vx7 = _mm256_sub_pd(vx3, vt); _mm256_store_pd(out+offset+s+56, vx7); vx3 = _mm256_add_pd(vx3, vt); _mm256_store_pd(out+offset+s+24, vx3); } } pos += 56; // m = 128, m=256, m=512 (3 levels merged) for(offset=0;offset<64;offset+=4) { vo0 = _mm256_load_pd(o+pos+offset); vo10 = _mm256_load_pd(o+pos+offset+64); vo11 = _mm256_load_pd(o+pos+offset+128); for(s = 0; s<POLY_DEG; s+=512) { vx1 = _mm256_load_pd(out+offset+s+64); vt = _mm256_mul_pd(vx1, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx0 = _mm256_load_pd(out+offset+s+0); vx1 = _mm256_sub_pd(vx0, vt); //_mm256_store_pd(out+offset+s+64, vx1); vx0 = _mm256_add_pd(vx0, vt); 
//_mm256_store_pd(out+offset+s+0, vx0); vx3 = _mm256_load_pd(out+offset+s+192); vt = _mm256_mul_pd(vx3, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx2 = _mm256_load_pd(out+offset+s+128); vx3 = _mm256_sub_pd(vx2, vt); //_mm256_store_pd(out+offset+s+192, vx3); vx2 = _mm256_add_pd(vx2, vt); //_mm256_store_pd(out+offset+s+128, vx2); vx5 = _mm256_load_pd(out+offset+s+320); vt = _mm256_mul_pd(vx5, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx4 = _mm256_load_pd(out+offset+s+256); vx5 = _mm256_sub_pd(vx4, vt); //_mm256_store_pd(out+offset+s+320, vx5); vx4 = _mm256_add_pd(vx4, vt); //_mm256_store_pd(out+offset+s+256, vx4); vx7 = _mm256_load_pd(out+offset+s+448); vt = _mm256_mul_pd(vx7, vo0); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); vx6 = _mm256_load_pd(out+offset+s+384); vx7 = _mm256_sub_pd(vx6, vt); //_mm256_store_pd(out+offset+s+448, vx7); vx6 = _mm256_add_pd(vx6, vt); //_mm256_store_pd(out+offset+s+384, vx6); //vx2 = _mm256_load_pd(out+offset+s+128); vt = _mm256_mul_pd(vx2, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx0 = _mm256_load_pd(out+offset+s+0); vx2 = _mm256_sub_pd(vx0, vt); //_mm256_store_pd(out+offset+s+128, vx2); vx0 = _mm256_add_pd(vx0, vt); //_mm256_store_pd(out+offset+s+0, vx0); //vx3 = _mm256_load_pd(out+offset+s+192); vt = _mm256_mul_pd(vx3, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx1 = _mm256_load_pd(out+offset+s+64); vx3 = _mm256_sub_pd(vx1, vt); //_mm256_store_pd(out+offset+s+192, vx3); vx1 = _mm256_add_pd(vx1, vt); //_mm256_store_pd(out+offset+s+64, vx1); //vx6 = _mm256_load_pd(out+offset+s+384); vt = _mm256_mul_pd(vx6, vo10); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx4 = _mm256_load_pd(out+offset+s+256); vx6 = _mm256_sub_pd(vx4, vt); //_mm256_store_pd(out+offset+s+384, vx6); vx4 = _mm256_add_pd(vx4, vt); //_mm256_store_pd(out+offset+s+256, vx4); //vx7 = _mm256_load_pd(out+offset+s+448); vt = _mm256_mul_pd(vx7, vo11); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx5 = _mm256_load_pd(out+offset+s+320); vx7 = _mm256_sub_pd(vx5, vt); //_mm256_store_pd(out+offset+s+448, vx7); vx5 = _mm256_add_pd(vx5, vt); //_mm256_store_pd(out+offset+s+320, vx5); //vx4 = _mm256_load_pd(out+offset+s+256); vo20 = _mm256_load_pd(o+pos+offset+192); vt = _mm256_mul_pd(vx4, vo20); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx0 = _mm256_load_pd(out+offset+s+0); vx4 = _mm256_sub_pd(vx0, vt); _mm256_store_pd(out+offset+s+256, vx4); vx0 = _mm256_add_pd(vx0, vt); _mm256_store_pd(out+offset+s+0, vx0); //vx5 = _mm256_load_pd(out+offset+s+320); vo21 = _mm256_load_pd(o+pos+offset+256); vt = _mm256_mul_pd(vx5, vo21); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx1 = _mm256_load_pd(out+offset+s+64); vx5 = _mm256_sub_pd(vx1, vt); _mm256_store_pd(out+offset+s+320, vx5); vx1 = _mm256_add_pd(vx1, vt); _mm256_store_pd(out+offset+s+64, vx1); //vx6 = _mm256_load_pd(out+offset+s+384); vo22 = _mm256_load_pd(o+pos+offset+320); 
vt = _mm256_mul_pd(vx6, vo22); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx2 = _mm256_load_pd(out+offset+s+128); vx6 = _mm256_sub_pd(vx2, vt); _mm256_store_pd(out+offset+s+384, vx6); vx2 = _mm256_add_pd(vx2, vt); _mm256_store_pd(out+offset+s+128, vx2); //vx7 = _mm256_load_pd(out+offset+s+448); vo23 = _mm256_load_pd(o+pos+offset+384); vt = _mm256_mul_pd(vx7, vo23); vc = _mm256_mul_pd(vt, vpinv); vc = _mm256_round_pd(vc,0x08); vc = _mm256_mul_pd(vc, vp); vt = _mm256_sub_pd(vt,vc); //vx3 = _mm256_load_pd(out+offset+s+192); vx7 = _mm256_sub_pd(vx3, vt); _mm256_store_pd(out+offset+s+448, vx7); vx3 = _mm256_add_pd(vx3, vt); _mm256_store_pd(out+offset+s+192, vx3); } } }
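// The five-instruction pattern repeated throughout the transform above
// (multiply by the precomputed approximate reciprocal of the modulus, round
// to nearest, multiply by the modulus, subtract) reduces a product back to a
// small representative modulo p, keeping coefficients representable in
// doubles. A stand-alone sketch of that step, with vp / vpinv filled as in
// the snippet (vp = 8383489.0, vpinv = PARAM_APPROX_P_INVERSE):
#include <immintrin.h>

static inline __m256d reduce_mod_p(__m256d t, __m256d vp, __m256d vpinv)
{
    __m256d c = _mm256_mul_pd(t, vpinv); // approximately t / p
    c = _mm256_round_pd(c, 0x08);        // round to nearest integer (0x08 = _MM_FROUND_NO_EXC | to-nearest)
    c = _mm256_mul_pd(c, vp);            // nearest multiple of p
    return _mm256_sub_pd(t, c);          // remainder, roughly in (-p/2, p/2]
}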
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint) { #if CHECK_FUNCTIONAL Vector3d param(EPoint); #endif AVX2TABLETYPE *mp; // TODO FIXME - global statistics reference // Stats[Calls_To_DNoise]++; const __m256d ONE_PD = _mm256_set1_pd(1.0); const __m128i short_si128 = _mm_set1_epi32(0xffff); const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0); const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON); const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy); const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn)); const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0); const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn)); const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD); const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff)); const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn)))); const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn); const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20); const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0)); const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1)); const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy); const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); int ints[4]; _mm_storeu_si128((__m128i*)(ints), i_xyzn); const int ixiy_hash = Hash2d(ints[0], ints[1]); const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]); const int ixjy_hash = Hash2d(ints[0], ints[1] + 1); const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1); const int iz = ints[2]; const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); __m256d ss; __m256d blend; __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd(); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)); // blend = _mm256_blend_pd(iii, jjj, 0); INCSUMAVX_VECTOR(mp, ss, iii); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 2); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)); blend = _mm256_blend_pd(iii, jjj, 6); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 4); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 12); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)); // blend = _mm256_blend_pd(iii, jjj, 14); INCSUMAVX_VECTOR(mp, ss, jjj); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 10); INCSUMAVX_VECTOR(mp, ss, blend); mp = 
&AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)); blend = _mm256_blend_pd(iii, jjj, 8); INCSUMAVX_VECTOR(mp, ss, blend); __m256d xy = _mm256_hadd_pd(x,y); __m128d xy_up = _mm256_extractf128_pd(xy,1); xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up); _mm_storeu_pd(&result[X],xy_up); __m128d z_up = _mm256_extractf128_pd(z,1); z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up); z_up = _mm_hadd_pd(z_up,z_up); result[Z] = _mm_cvtsd_f64(z_up); #if CHECK_FUNCTIONAL { Vector3d portable_res; PortableDNoise(portable_res , param); if (fabs(portable_res[X] - result[X]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise X error"); } if (fabs(portable_res[Y] - result[Y]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Y error"); } if (fabs(portable_res[Z] - result[Z]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Z error"); } } #endif _mm256_zeroupper(); return; }
void kernel_dgemv_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { if(kmax<=0) return; const int lda = 4; __builtin_prefetch( A + 0*lda ); __builtin_prefetch( A + 2*lda ); __builtin_prefetch( A + 4*lda ); __builtin_prefetch( A + 6*lda ); double *tA, *tx; int k; int ka = kmax; // number from aligned positon __m256d aaxx_temp, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77; __m128d ax_temp, a_00_10, a_01_11, a_02_12, a_03_13, x_0_1, y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_44 = _mm256_setzero_pd(); y_55 = _mm256_setzero_pd(); y_66 = _mm256_setzero_pd(); y_77 = _mm256_setzero_pd(); y_0 = _mm256_castpd256_pd128(y_00); y_1 = _mm256_castpd256_pd128(y_11); y_2 = _mm256_castpd256_pd128(y_22); y_3 = _mm256_castpd256_pd128(y_33); y_4 = _mm256_castpd256_pd128(y_44); y_5 = _mm256_castpd256_pd128(y_55); y_6 = _mm256_castpd256_pd128(y_66); y_7 = _mm256_castpd256_pd128(y_77); k = lda*(ka/lda); tA = A + (ka/lda)*sda*lda; tx = x + (ka/lda)*lda; if(ka-k>0) // it can be only ka-k = {1, 2, 3} { if((ka-k)>=2) { x_0_1 = _mm_load_pd( &tx[0] ); a_00_10 = _mm_load_pd( &tA[0+lda*0] ); a_01_11 = _mm_load_pd( &tA[0+lda*1] ); a_02_12 = _mm_load_pd( &tA[0+lda*2] ); a_03_13 = _mm_load_pd( &tA[0+lda*3] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd (y_0, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_1 = _mm_add_pd (y_1, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_2 = _mm_add_pd (y_2, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_3 = _mm_add_pd (y_3, ax_temp ); a_00_10 = _mm_load_pd( &tA[0+lda*4] ); a_01_11 = _mm_load_pd( &tA[0+lda*5] ); a_02_12 = _mm_load_pd( &tA[0+lda*6] ); a_03_13 = _mm_load_pd( &tA[0+lda*7] ); ax_temp = _mm_mul_pd( a_00_10, x_0_1 ); y_4 = _mm_add_pd (y_4, ax_temp ); ax_temp = _mm_mul_pd( a_01_11, x_0_1 ); y_5 = _mm_add_pd (y_5, ax_temp ); ax_temp = _mm_mul_pd( a_02_12, x_0_1 ); y_6 = _mm_add_pd (y_6, ax_temp ); ax_temp = _mm_mul_pd( a_03_13, x_0_1 ); y_7 = _mm_add_pd (y_7, ax_temp ); tA += 2; tx += 2; k+=2; } if((ka-k)==1) { x_0_1 = _mm_load_sd( &tx[0] ); a_00_10 = _mm_load_sd( &tA[0+lda*0] ); a_01_11 = _mm_load_sd( &tA[0+lda*1] ); a_02_12 = _mm_load_sd( &tA[0+lda*2] ); a_03_13 = _mm_load_sd( &tA[0+lda*3] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd (y_0, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_1 = _mm_add_sd (y_1, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_2 = _mm_add_sd (y_2, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_3 = _mm_add_sd (y_3, ax_temp ); a_00_10 = _mm_load_sd( &tA[0+lda*4] ); a_01_11 = _mm_load_sd( &tA[0+lda*5] ); a_02_12 = _mm_load_sd( &tA[0+lda*6] ); a_03_13 = _mm_load_sd( &tA[0+lda*7] ); ax_temp = _mm_mul_sd( a_00_10, x_0_1 ); y_4 = _mm_add_sd (y_4, ax_temp ); ax_temp = _mm_mul_sd( a_01_11, x_0_1 ); y_5 = _mm_add_sd (y_5, ax_temp ); ax_temp = _mm_mul_sd( a_02_12, x_0_1 ); y_6 = _mm_add_sd (y_6, ax_temp ); ax_temp = _mm_mul_sd( a_03_13, x_0_1 ); y_7 = _mm_add_sd (y_7, ax_temp ); tA += 1; tx += 1; k++; } } y_00 = _mm256_castpd128_pd256(y_0); y_11 = _mm256_castpd128_pd256(y_1); y_22 = _mm256_castpd128_pd256(y_2); y_33 = _mm256_castpd128_pd256(y_3); y_44 = _mm256_castpd128_pd256(y_4); y_55 = _mm256_castpd128_pd256(y_5); y_66 = _mm256_castpd128_pd256(y_6); y_77 = _mm256_castpd128_pd256(y_7); k=0; for(; k<ka-7; k+=8) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); 
x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } for(; k<ka-3; k+=4) { __builtin_prefetch( A + sda*lda + 0*lda ); __builtin_prefetch( A + sda*lda + 2*lda ); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_11 = _mm256_add_pd( y_11, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_33 = _mm256_add_pd( y_33, aaxx_temp ); __builtin_prefetch( A + sda*lda + 4*lda ); __builtin_prefetch( A + sda*lda + 6*lda ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = 
_mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_55 = _mm256_add_pd( y_55, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, aaxx_temp ); aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_77 = _mm256_add_pd( y_77, aaxx_temp ); A += 4 + (sda-1)*lda; x += 4; } __m256d y_0_1_2_3, y_4_5_6_7; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_44 = _mm256_hadd_pd(y_44, y_55); y_66 = _mm256_hadd_pd(y_66, y_77); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 ); y_44 = _mm256_permute2f128_pd(y_66, y_44, 19); y_00 = _mm256_add_pd( y_00, y_11 ); y_44 = _mm256_add_pd( y_44, y_55 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); _mm256_storeu_pd(&y[4], y_44); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } }
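// Reference for the accumulator reduction used at the end of this kernel
// family: four per-column accumulators are folded into one vector holding the
// four column sums, using the same hadd / permute2f128 immediates (2 and 19)
// as above. A stand-alone sketch:
#include <immintrin.h>

static inline __m256d reduce_cols4(__m256d y0, __m256d y1, __m256d y2, __m256d y3)
{
    __m256d h01 = _mm256_hadd_pd(y0, y1); // y0[0]+y0[1], y1[0]+y1[1], y0[2]+y0[3], y1[2]+y1[3]
    __m256d h23 = _mm256_hadd_pd(y2, y3); // y2[0]+y2[1], y3[0]+y3[1], y2[2]+y2[3], y3[2]+y3[3]
    __m256d lo  = _mm256_permute2f128_pd(h23, h01, 2 ); // { low 128 of h01, low 128 of h23 }
    __m256d hi  = _mm256_permute2f128_pd(h23, h01, 19); // { high 128 of h01, high 128 of h23 }
    return _mm256_add_pd(lo, hi); // { sum(y0), sum(y1), sum(y2), sum(y3) }
}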
/* Naive implementation of Matrix Matrix Multiplication @param A input matrix @param B input matrix @param C output matrix */ inline void naive(const Matrix& A, const Matrix& B, Matrix& C){ //preload dimensions for faster access int dimM = C.getDimM(); int dimN = C.getDimN(); int dimL = A.getDimN(); for (int m = 0; m < dimM; m+=4){ ///rows of c for (int n = 0; n < dimN; n+=4){ ///cols of c //do calculation of a 4x4 block //std::cout << m << "\t" << n << std::endl; __m256d* pA = A.get(m, 0); __m256d* pB = A.get(m+1, 0); __m256d* pC = A.get(m+2, 0); __m256d* pD = A.get(m+3, 0); __m256d* pK = B.getT(0, n); __m256d* pL = B.getT(0, n+1); __m256d* pM = B.getT(0, n+2); __m256d* pN = B.getT(0, n+3); //std::cout << pA << "\t" << pB << "\t" << pC << "\t" << pD << std::endl; __m256d K = _mm256_setzero_pd(); __m256d L = _mm256_setzero_pd(); __m256d M = _mm256_setzero_pd(); __m256d N = _mm256_setzero_pd(); __m256d O = _mm256_setzero_pd(); __m256d P = _mm256_setzero_pd(); __m256d Q = _mm256_setzero_pd(); __m256d R = _mm256_setzero_pd(); __m256d S = _mm256_setzero_pd(); __m256d T = _mm256_setzero_pd(); __m256d U = _mm256_setzero_pd(); __m256d V = _mm256_setzero_pd(); __m256d W = _mm256_setzero_pd(); __m256d X = _mm256_setzero_pd(); __m256d Y = _mm256_setzero_pd(); __m256d Z = _mm256_setzero_pd(); for (int l = 0; l < dimL; l+=4){ //std::cout <<"mul" << std::endl; K = K + (*pA) * (*pK); L = L + (*pA) * (*pL); M = M + (*pA) * (*pM); N = N + (*pA) * (*pN); O = O + (*pB) * (*pK); P = P + (*pB) * (*pL); Q = Q + (*pB) * (*pM); R = R + (*pB) * (*pN); S = S + (*pC) * (*pK); T = T + (*pC) * (*pL); U = U + (*pC) * (*pM); V = V + (*pC) * (*pN); W = W + (*pD) * (*pK); X = X + (*pD) * (*pL); Y = Y + (*pD) * (*pM); Z = Z + (*pD) * (*pN); //std::cout << "inc" <<std::endl; pA++; pB++; pC++; pD++; pK++; pL++; pM++; pN++; } // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} __m256d sumab = _mm256_hadd_pd(K, L); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} __m256d sumcd = _mm256_hadd_pd(M, N); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); __m256d sum = _mm256_add_pd(perm, blend); C.set(m, n, sum); //C(m , n) = K[0] + K[1] + K[2] + K[3]; //C(m , n+1) = L[0] + L[1] + L[2] + L[3]; //C(m , n+2) = M[0] + M[1] + M[2] + M[3]; //C(m , n+3) = N[0] + N[1] + N[2] + N[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(O, P); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(Q, R); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+1, n, sum); //C(m+1, n ) = O[0] + O[1] + O[2] + O[3]; //C(m+1, n+1) = P[0] + P[1] + P[2] + P[3]; //C(m+1, n+2) = Q[0] + Q[1] + Q[2] + Q[3]; //C(m+1, n+3) = R[0] + R[1] + R[2] + R[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(S, T); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(U, V); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+2, n, sum); //C(m+2, n ) = S[0] + S[1] + S[2] + S[3]; //C(m+2, n+1) = T[0] + T[1] + T[2] + T[3]; //C(m+2, n+2) = U[0] + U[1] + U[2] + U[3]; //C(m+2, n+3) = 
V[0] + V[1] + V[2] + V[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(W, X); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(Y, Z); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+3, n, sum); //C(m+3, n ) = W[0] + W[1] + W[2] + W[3]; //C(m+3, n+1) = X[0] + X[1] + X[2] + X[3]; //C(m+3, n+2) = Y[0] + Y[1] + Y[2] + Y[3]; //C(m+3, n+3) = Z[0] + Z[1] + Z[2] + Z[3]; } } }
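// Plain reference for one 4x4 output block of the blocked product above,
// written against flat row-major arrays instead of the project's Matrix /
// getT interface (which is not shown). Handy for validating C[m..m+3][n..n+3].
static void matmul_block4x4_ref(const double* A, const double* B, double* C,
                                int dimN, int dimL, int m, int n)
{
    for (int i = 0; i < 4; ++i)
        for (int j = 0; j < 4; ++j) {
            double acc = 0.0;
            for (int l = 0; l < dimL; ++l)
                acc += A[(m + i) * dimL + l] * B[l * dimN + (n + j)];
            C[(m + i) * dimN + (n + j)] = acc;
        }
}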
// it moves vertically across blocks void kernel_dtrmv_u_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; /* __builtin_prefetch( A + 0*lda );*/ /* __builtin_prefetch( A + 2*lda );*/ /* __builtin_prefetch( A + 4*lda );*/ /* __builtin_prefetch( A + 6*lda );*/ /* double *tA, *tx;*/ int k; /* int ka = kmax-kna; // number from aligned positon*/ __m256d zeros, tmp0, tmp1, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77; /* __m128d*/ /* ax_temp,*/ /* a_00_10, a_01_11, a_02_12, a_03_13,*/ /* x_0_1,*/ /* y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;*/ y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_44 = _mm256_setzero_pd(); y_55 = _mm256_setzero_pd(); y_66 = _mm256_setzero_pd(); y_77 = _mm256_setzero_pd(); k=0; for(; k<kmax-7; k+=8) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); A += 4 + (sda-1)*lda; x += 4; /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 
); A += 4 + (sda-1)*lda; x += 4; } /* for(; k<ka-3; k+=4)*/ /* {*/ /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ /* x_0_1_2_3 = _mm256_loadu_pd( &x[0] );*/ /* a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );*/ /* a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );*/ /* a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );*/ /* a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );*/ /* */ /* aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/ /* y_00 = _mm256_add_pd( y_00, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/ /* y_11 = _mm256_add_pd( y_11, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/ /* y_22 = _mm256_add_pd( y_22, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/ /* y_33 = _mm256_add_pd( y_33, aaxx_temp );*/ /* */ /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ /* a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );*/ /* a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );*/ /* a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );*/ /* a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );*/ /* */ /* aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/ /* y_44 = _mm256_add_pd( y_44, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/ /* y_55 = _mm256_add_pd( y_55, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/ /* y_66 = _mm256_add_pd( y_66, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/ /* y_77 = _mm256_add_pd( y_77, aaxx_temp );*/ /* A += 4 + (sda-1)*lda;*/ /* x += 4;*/ /* }*/ zeros = _mm256_setzero_pd(); // top triangle x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); // top square a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); A += 4 + (sda-1)*lda; x += 4; // bottom triangle x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = 
_mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); // store __m256d y_0_1_2_3, y_4_5_6_7; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_44 = _mm256_hadd_pd(y_44, y_55); y_66 = _mm256_hadd_pd(y_66, y_77); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 ); y_44 = _mm256_permute2f128_pd(y_66, y_44, 19); y_00 = _mm256_add_pd( y_00, y_11 ); y_44 = _mm256_add_pd( y_44, y_55 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); _mm256_storeu_pd(&y[4], y_44); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } }
// it moves vertically across blocks void kernel_dtrmv_u_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; /* __builtin_prefetch( A + 0*lda );*/ /* __builtin_prefetch( A + 2*lda );*/ /* double *tA, *tx;*/ int k; __m256d zeros, tmp0, tmp1, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); k=0; for(; k<kmax-7; k+=8) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } zeros = _mm256_setzero_pd(); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); __m256d y_0_1_2_3; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_11 = 
_mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_00 = _mm256_add_pd( y_00, y_11 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } }
// it moves vertically across blocks
// dsymv-style kernel: each 4-column block of the packed panel A is used twice, once for the
// product with x_n (streamed out to z_n) and once, column by column, for the product with x_t
// (accumulated in y_t_0..y_t_3 and reduced into z_t at the end).
void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg)
{
    if(kmax<=0)
        return;

    /*printf("\nciao %d\n", kmax); */

    const int bs = 4;

    __builtin_prefetch( A + bs*0 );
    __builtin_prefetch( A + bs*2 );

    int k, ka;
    ka = kmax; // number from aligned position

    double k_left;

//  double *sA, *sy_n, *sx_t;

    static double d_mask[4] = {0.5, 1.5, 2.5, 3.5};

    __m256d
        v_mask, zeros, temp,
        a_00, a_01, a_02, a_03,
        x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
        x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;

    __m256i
        i_mask;

#if 0
    __m128d
        stemp,
        sa_00, sa_01, sa_02, sa_03,
        sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0,
        sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3;
#endif

    zeros = _mm256_setzero_pd();

    x_n_0 = _mm256_broadcast_sd( &x_n[0] );
    x_n_1 = _mm256_broadcast_sd( &x_n[1] );
    x_n_2 = _mm256_broadcast_sd( &x_n[2] );
    x_n_3 = _mm256_broadcast_sd( &x_n[3] );

    if(alg==-1) // TODO xor
    {
        x_n_0 = _mm256_sub_pd( zeros, x_n_0 );
        x_n_1 = _mm256_sub_pd( zeros, x_n_1 );
        x_n_2 = _mm256_sub_pd( zeros, x_n_2 );
        x_n_3 = _mm256_sub_pd( zeros, x_n_3 );
    }

    y_t_0 = _mm256_setzero_pd();
    y_t_1 = _mm256_setzero_pd();
    y_t_2 = _mm256_setzero_pd();
    y_t_3 = _mm256_setzero_pd();

#if 0
    sx_n_0 = _mm256_castpd256_pd128( x_n_0 );
    sx_n_1 = _mm256_castpd256_pd128( x_n_1 );
    sx_n_2 = _mm256_castpd256_pd128( x_n_2 );
    sx_n_3 = _mm256_castpd256_pd128( x_n_3 );
    sy_t_0 = _mm256_castpd256_pd128( y_t_0 );
    sy_t_1 = _mm256_castpd256_pd128( y_t_1 );
    sy_t_2 = _mm256_castpd256_pd128( y_t_2 );
    sy_t_3 = _mm256_castpd256_pd128( y_t_3 );

    k = bs*(ka/bs);
    sA = A + (ka/bs)*sda*bs;
    sy_n = y_n + (ka/bs)*bs;
    sx_t = x_t + (ka/bs)*bs;

    for(; k<ka; k++)
    {
        sy_n_0 = _mm_load_sd( &sy_n[0] );
        sx_t_0 = _mm_load_sd( &sx_t[0] );

        sa_00 = _mm_load_sd( &sA[0+bs*0] );
        sa_01 = _mm_load_sd( &sA[0+bs*1] );
        sa_02 = _mm_load_sd( &sA[0+bs*2] );
        sa_03 = _mm_load_sd( &sA[0+bs*3] );

        stemp = _mm_mul_sd( sa_00, sx_n_0 );  sy_n_0 = _mm_add_sd( sy_n_0, stemp );
        stemp = _mm_mul_sd( sa_00, sx_t_0 );  sy_t_0 = _mm_add_sd( sy_t_0, stemp );
        stemp = _mm_mul_sd( sa_01, sx_n_1 );  sy_n_0 = _mm_add_sd( sy_n_0, stemp );
        stemp = _mm_mul_sd( sa_01, sx_t_0 );  sy_t_1 = _mm_add_sd( sy_t_1, stemp );
        stemp = _mm_mul_sd( sa_02, sx_n_2 );  sy_n_0 = _mm_add_sd( sy_n_0, stemp );
        stemp = _mm_mul_sd( sa_02, sx_t_0 );  sy_t_2 = _mm_add_sd( sy_t_2, stemp );
        stemp = _mm_mul_sd( sa_03, sx_n_3 );  sy_n_0 = _mm_add_sd( sy_n_0, stemp );
        stemp = _mm_mul_sd( sa_03, sx_t_0 );  sy_t_3 = _mm_add_sd( sy_t_3, stemp );

        _mm_store_sd( &sy_n[0], sy_n_0 );

        sA += 1;  sy_n += 1;  sx_t += 1;
    }

    y_t_0 = _mm256_castpd128_pd256( sy_t_0 );
    y_t_1 = _mm256_castpd128_pd256( sy_t_1 );
    y_t_2 = _mm256_castpd128_pd256( sy_t_2 );
    y_t_3 = _mm256_castpd128_pd256( sy_t_3 );
#endif

    k=0;

    // corner: diagonal 4x4 block; the blends mask out the lanes that must not be counted twice
    if(tri==1)
    {
        __builtin_prefetch( A + sda*bs +bs*0 );
        __builtin_prefetch( A + sda*bs +bs*2 );

        y_n_0 = _mm256_loadu_pd( &y_n[0] );
        x_t_0 = _mm256_loadu_pd( &x_t[0] );

        a_00 = _mm256_load_pd( &A[0+bs*0] );
        a_01 = _mm256_load_pd( &A[0+bs*1] );
        a_02 = _mm256_load_pd( &A[0+bs*2] );
        a_03 = _mm256_load_pd( &A[0+bs*3] );

        temp = _mm256_mul_pd( a_00, x_n_0 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_00, x_t_0 );  temp = _mm256_blend_pd( zeros, temp, 14 );  y_t_0 = _mm256_add_pd( y_t_0, temp );
        temp = _mm256_mul_pd( a_01, x_n_1 );  temp = _mm256_blend_pd( zeros, temp, 14 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_01, x_t_0 );  temp = _mm256_blend_pd( zeros, temp, 12 );  y_t_1 = _mm256_add_pd( y_t_1, temp );
        temp = _mm256_mul_pd( a_02, x_n_2 );  temp = _mm256_blend_pd( zeros, temp, 12 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_02, x_t_0 );  temp = _mm256_blend_pd( zeros, temp, 8 );   y_t_2 = _mm256_add_pd( y_t_2, temp );
        temp = _mm256_mul_pd( a_03, x_n_3 );  temp = _mm256_blend_pd( zeros, temp, 8 );   y_n_0 = _mm256_add_pd( y_n_0, temp );

        _mm256_storeu_pd( &z_n[0], y_n_0 );

        A += sda*bs;
        y_n += 4;  z_n += 4;  x_t += 4;

        k += 4;
    }

    // main loop, unrolled twice (2*bs = 8 rows per iteration)
    for(; k<ka-7; k+=2*bs)
    {
        __builtin_prefetch( A + sda*bs +bs*0 );
        __builtin_prefetch( A + sda*bs +bs*2 );

        y_n_0 = _mm256_loadu_pd( &y_n[0] );
        x_t_0 = _mm256_loadu_pd( &x_t[0] );

        a_00 = _mm256_load_pd( &A[0+bs*0] );
        a_01 = _mm256_load_pd( &A[0+bs*1] );
        a_02 = _mm256_load_pd( &A[0+bs*2] );
        a_03 = _mm256_load_pd( &A[0+bs*3] );

        temp = _mm256_mul_pd( a_00, x_n_0 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_00, x_t_0 );  y_t_0 = _mm256_add_pd( y_t_0, temp );
        temp = _mm256_mul_pd( a_01, x_n_1 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_01, x_t_0 );  y_t_1 = _mm256_add_pd( y_t_1, temp );
        temp = _mm256_mul_pd( a_02, x_n_2 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_02, x_t_0 );  y_t_2 = _mm256_add_pd( y_t_2, temp );
        temp = _mm256_mul_pd( a_03, x_n_3 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_03, x_t_0 );  y_t_3 = _mm256_add_pd( y_t_3, temp );

        _mm256_storeu_pd( &z_n[0], y_n_0 );

        A += sda*bs;
        y_n += 4;  z_n += 4;  x_t += 4;

        __builtin_prefetch( A + sda*bs +bs*0 );
        __builtin_prefetch( A + sda*bs +bs*2 );

        y_n_0 = _mm256_loadu_pd( &y_n[0] );
        x_t_0 = _mm256_loadu_pd( &x_t[0] );

        a_00 = _mm256_load_pd( &A[0+bs*0] );
        a_01 = _mm256_load_pd( &A[0+bs*1] );
        a_02 = _mm256_load_pd( &A[0+bs*2] );
        a_03 = _mm256_load_pd( &A[0+bs*3] );

        temp = _mm256_mul_pd( a_00, x_n_0 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_00, x_t_0 );  y_t_0 = _mm256_add_pd( y_t_0, temp );
        temp = _mm256_mul_pd( a_01, x_n_1 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_01, x_t_0 );  y_t_1 = _mm256_add_pd( y_t_1, temp );
        temp = _mm256_mul_pd( a_02, x_n_2 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_02, x_t_0 );  y_t_2 = _mm256_add_pd( y_t_2, temp );
        temp = _mm256_mul_pd( a_03, x_n_3 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_03, x_t_0 );  y_t_3 = _mm256_add_pd( y_t_3, temp );

        _mm256_storeu_pd( &z_n[0], y_n_0 );

        A += sda*bs;
        y_n += 4;  z_n += 4;  x_t += 4;
    }

    for(; k<ka-3; k+=bs)
    {
        __builtin_prefetch( A + sda*bs +bs*0 );
        __builtin_prefetch( A + sda*bs +bs*2 );

        y_n_0 = _mm256_loadu_pd( &y_n[0] );
        x_t_0 = _mm256_loadu_pd( &x_t[0] );

        a_00 = _mm256_load_pd( &A[0+bs*0] );
        a_01 = _mm256_load_pd( &A[0+bs*1] );
        a_02 = _mm256_load_pd( &A[0+bs*2] );
        a_03 = _mm256_load_pd( &A[0+bs*3] );

        temp = _mm256_mul_pd( a_00, x_n_0 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_00, x_t_0 );  y_t_0 = _mm256_add_pd( y_t_0, temp );
        temp = _mm256_mul_pd( a_01, x_n_1 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_01, x_t_0 );  y_t_1 = _mm256_add_pd( y_t_1, temp );
        temp = _mm256_mul_pd( a_02, x_n_2 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_02, x_t_0 );  y_t_2 = _mm256_add_pd( y_t_2, temp );
        temp = _mm256_mul_pd( a_03, x_n_3 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_03, x_t_0 );  y_t_3 = _mm256_add_pd( y_t_3, temp );

        _mm256_storeu_pd( &z_n[0], y_n_0 );

        A += sda*bs;
        y_n += 4;  z_n += 4;  x_t += 4;
    }

    // remainder: 1..3 rows left, handled with a masked load/store
    if(k<ka)
    {
        k_left = ka-k;
        v_mask = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) );
        i_mask = _mm256_castpd_si256( v_mask );

//      __builtin_prefetch( A + sda*bs +bs*0 );
//      __builtin_prefetch( A + sda*bs +bs*2 );

        y_n_0 = _mm256_loadu_pd( &y_n[0] );
        x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask );

        a_00 = _mm256_load_pd( &A[0+bs*0] );
        a_01 = _mm256_load_pd( &A[0+bs*1] );
        a_02 = _mm256_load_pd( &A[0+bs*2] );
        a_03 = _mm256_load_pd( &A[0+bs*3] );

        temp = _mm256_mul_pd( a_00, x_n_0 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_00, x_t_0 );  y_t_0 = _mm256_add_pd( y_t_0, temp );
        temp = _mm256_mul_pd( a_01, x_n_1 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_01, x_t_0 );  y_t_1 = _mm256_add_pd( y_t_1, temp );
        temp = _mm256_mul_pd( a_02, x_n_2 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_02, x_t_0 );  y_t_2 = _mm256_add_pd( y_t_2, temp );
        temp = _mm256_mul_pd( a_03, x_n_3 );  y_n_0 = _mm256_add_pd( y_n_0, temp );
        temp = _mm256_mul_pd( a_03, x_t_0 );  y_t_3 = _mm256_add_pd( y_t_3, temp );

        _mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 );

//      A += sda*bs;
//      y_n += 4;  z_n += 4;  x_t += 4;
    }

    // reduce the four column accumulators into z_t
    __m256d
        y_0_1_2_3;

    y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 );
    y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 );

    y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2 );
    y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 );

    y_t_0 = _mm256_add_pd( y_t_0, y_t_1 );

    if(alg==1)
    {
        y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 );
        _mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
    }
    else // alg==-1
    {
        y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
        y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 );
        _mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
    }
}
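The remainder branch above builds its load/store mask arithmetically: subtracting k_left from the offsets {0.5, 1.5, 2.5, 3.5} makes the result negative (sign bit set) exactly in the lanes i < k_left, and _mm256_maskload_pd / _mm256_maskstore_pd only inspect that sign bit. A minimal standalone sketch of the trick, outside the kernel (the demo itself and its array names are illustrative, not from the library; build with e.g. gcc -mavx):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    static double d_mask[4] = {0.5, 1.5, 2.5, 3.5};
    double src[4] = {10.0, 20.0, 30.0, 40.0};

    for(double k_left=1.0; k_left<4.0; k_left+=1.0)
    {
        // lane i becomes negative (MSB set) iff d_mask[i] - k_left < 0, i.e. iff i < k_left
        __m256d v_mask = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) );
        __m256i i_mask = _mm256_castpd_si256( v_mask );

        double dst[4] = {0.0, 0.0, 0.0, 0.0};
        __m256d v = _mm256_maskload_pd( src, i_mask );   // masked-off lanes read as 0.0
        _mm256_maskstore_pd( dst, i_mask, v );           // masked-off lanes are not written

        printf("k_left=%.0f -> dst = %g %g %g %g\n", k_left, dst[0], dst[1], dst[2], dst[3]);
    }
    return 0;
}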
/**
 * Calculate all values in one step per pixel. Requires grabbing the neighboring pixels.
 */
FORCE_INLINE double single_pixel(
        double *im, int center, int top, int left, int right, int bottom,
        const __m256i mask1110, const __m256d rgb0W, const __m256d onehalf,
        const __m256d minustwelvehalf){
//    double r = im[center];    double g = im[center+1];    double b = im[center+2];
//    double r1 = im[top];      double g1 = im[top+1];      double b1 = im[top+2];
//    double r2 = im[left];     double g2 = im[left+1];     double b2 = im[left+2];
//    double r3 = im[right];    double g3 = im[right+1];    double b3 = im[right+2];
//    double r4 = im[bottom];   double g4 = im[bottom+1];   double b4 = im[bottom+2];

    __m256d c  = _mm256_maskload_pd(&(im[center]),mask1110);
    __m256d c1 = _mm256_loadu_pd(&(im[top]));
    __m256d c2 = _mm256_loadu_pd(&(im[left]));
    __m256d c3 = _mm256_loadu_pd(&(im[right]));
    __m256d c4 = _mm256_loadu_pd(&(im[bottom]));
    COST_INC_LOAD(20);

//    double grey  = rw * r  + gw * g  + bw * b;
//    double grey1 = rw * r1 + gw * g1 + bw * b1;
//    double grey2 = rw * r2 + gw * g2 + bw * b2;
//    double grey3 = rw * r3 + gw * g3 + bw * b3;
//    double grey4 = rw * r4 + gw * g4 + bw * b4;
    __m256d greyc = _mm256_mul_pd(c,rgb0W);
    __m256d grey1 = _mm256_mul_pd(c1,rgb0W);
    __m256d grey2 = _mm256_mul_pd(c2,rgb0W);
    __m256d grey3 = _mm256_mul_pd(c3,rgb0W);
    __m256d grey4 = _mm256_mul_pd(c4,rgb0W);

    //AVX: double: horizontal add for 1 vector
    __m256d c_perm = _mm256_permute2f128_pd(c, c, 0b00100001);//1,2
    __m256d c_h = _mm256_hadd_pd(c,c_perm);
    __m128d c_h_lo = _mm256_extractf128_pd (c_h, 0);// lo
    __m128d c_h_hi = _mm256_extractf128_pd (c_h, 1);// hi
    double c_hsum_lo = _mm_cvtsd_f64(c_h_lo);
    double c_hsum_hi = _mm_cvtsd_f64(c_h_hi);
    double c_hsum = c_hsum_lo + c_hsum_hi;

    //AVX: double: horizontal add for 1 vector
    __m256d greyc_perm = _mm256_permute2f128_pd(greyc, greyc, 0b00100001);//1,2
    __m256d greyc_h = _mm256_hadd_pd(greyc,greyc_perm);
    __m128d greyc_h_lo = _mm256_extractf128_pd (greyc_h, 0);// lo
    __m128d greyc_h_hi = _mm256_extractf128_pd (greyc_h, 1);// hi
    double greyc_hsum_lo = _mm_cvtsd_f64(greyc_h_lo);
    double greyc_hsum_hi = _mm_cvtsd_f64(greyc_h_hi);
    double greyc_hsum = greyc_hsum_lo + greyc_hsum_hi;

    //AVX: _m256d: horizontal add for 4 vectors at once
    __m256d grey12 = _mm256_hadd_pd(grey1,grey2);
    __m256d grey34 = _mm256_hadd_pd(grey3,grey4);
    __m256d grey_1234_blend = _mm256_blend_pd(grey12, grey34, 0b1100); //0011
    __m256d grey_1234_perm = _mm256_permute2f128_pd(grey12, grey34, 0b00100001);//1,2
    __m256d grey_1234 = _mm256_add_pd(grey_1234_perm, grey_1234_blend);

    //AVX: double: horizontal add for 1 vector
    __m256d grey1234_perm = _mm256_permute2f128_pd(grey_1234, grey_1234, 0b00100001);//1,2
    __m256d grey1234_h = _mm256_hadd_pd(grey_1234,grey1234_perm);
    __m128d grey1234_h_lo = _mm256_extractf128_pd (grey1234_h, 0);// lo
    __m128d grey1234_h_hi = _mm256_extractf128_pd (grey1234_h, 1);// hi
    double grey1234_hsum_lo = _mm_cvtsd_f64(grey1234_h_lo);
    double grey1234_hsum_hi = _mm_cvtsd_f64(grey1234_h_hi);
    double grey1234_sum = grey1234_hsum_lo + grey1234_hsum_hi;
    COST_INC_ADD(10); //+ operations wasted on AVX
    COST_INC_MUL(15); //+ operations wasted on AVX

    double mu = c_hsum / 3.0;
    COST_INC_ADD(2);
    COST_INC_DIV(1);

//    double rmu = r-mu;    double gmu = g-mu;    double bmu = b-mu;
    __m256d c_mu = _mm256_set1_pd(mu);
    __m256d c_rgbmu = _mm256_sub_pd(c,c_mu);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rz = r-0.5;    double gz = g-0.5;    double bz = b-0.5;
    __m256d c_rgbz = _mm256_sub_pd(c,onehalf);
    COST_INC_ADD(3); //+1 operations wasted on AVX

//    double rzrz = rz*rz;  double gzgz = gz*gz;  double bzbz = bz*bz;
    __m256d c_rgbz_sq = _mm256_mul_pd(c_rgbz,c_rgbz);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double re = exp(-12.5*rzrz);  double ge = exp(-12.5*gzgz);  double be = exp(-12.5*bzbz);
    __m256d c_rgbe_tmp = _mm256_mul_pd(minustwelvehalf,c_rgbz_sq);
    __m128 c_rgbe_tmp_ps = _mm256_cvtpd_ps(c_rgbe_tmp);
    __m128 c_rgbe_ps = exp_ps(c_rgbe_tmp_ps);
    __m256d c_rgbe = _mm256_cvtps_pd(c_rgbe_ps);
    COST_INC_EXP(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX

//    double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);
    __m256d c_rgbmu_sq = _mm256_mul_pd(c_rgbmu,c_rgbmu);
    __m128d t1_tmp1_lo = _mm256_extractf128_pd (c_rgbmu_sq, 0);// lo
    __m128d t1_tmp1_hi = _mm256_extractf128_pd (c_rgbmu_sq, 1);// hi
    __m128d t1_tmp1_lo_sum = _mm_hadd_pd (t1_tmp1_lo, t1_tmp1_lo);
    double t1_tmp1_hi_lo = _mm_cvtsd_f64(t1_tmp1_hi);
    double t1_tmp1_lo_sum_lo = _mm_cvtsd_f64(t1_tmp1_lo_sum);
    double t1_tmp1 = t1_tmp1_lo_sum_lo + t1_tmp1_hi_lo;
    double t1_tmp2 = t1_tmp1 / 3.0;
    double t1 = sqrt(t1_tmp2);
    COST_INC_SQRT(1);
    COST_INC_ADD(3);
    COST_INC_MUL(3); //+1 operations wasted on AVX
    COST_INC_DIV(1);
    double t2 = fabs(t1);
    COST_INC_ABS(1);

//    double t3 = re*ge*be;
    __m128d t3_tmp1_lo = _mm256_extractf128_pd (c_rgbe, 0);// lo
    __m128d t3_tmp1_hi = _mm256_extractf128_pd (c_rgbe, 1);// hi
    double t3_tmp1_lo_lo = _mm_cvtsd_f64(t3_tmp1_lo);
    double t3_tmp1_hi_lo = _mm_cvtsd_f64(t3_tmp1_hi);
    __m128d t3_tmp1_lo_swapped = _mm_permute_pd(t3_tmp1_lo, 1);// swap
    double t3_tmp1_lo_hi = _mm_cvtsd_f64(t3_tmp1_lo_swapped);
    double t3 = t3_tmp1_lo_lo * t3_tmp1_lo_hi * t3_tmp1_hi_lo;
    COST_INC_MUL(2);
    double t4 = fabs(t3);
    COST_INC_ABS(1);

    double t5 = t2 * t4;
    COST_INC_MUL(1);

//    double t6 = -4.0*grey+grey1+grey2+grey3+grey4;
    double minusfour_times_grey = -4.0*greyc_hsum;
    double t6 = minusfour_times_grey+grey1234_sum;
    COST_INC_MUL(1);
    COST_INC_ADD(2); //2 operations saved due to AVX
    double t7 = fabs(t6);
    COST_INC_ABS(1);

    double t8 = t5 * t7;
    COST_INC_MUL(1);
    double t9 = t8 + 1.0E-12;
    COST_INC_ADD(1);

    return t9;
}
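For sanity-checking single_pixel(), the commented-out scalar expressions above can be collected into a plain C reference. The sketch below is an assumption-laden reconstruction, not part of the original source: it presumes rgb0W carries the grey weights {rw, gw, bw, 0}, onehalf carries 0.5 and minustwelvehalf carries -12.5 in every lane, and it uses double-precision exp(), so it will only approximately match the AVX path, which evaluates the exponential in single precision through exp_ps.

#include <math.h>

// Hypothetical scalar reference of the pixel metric; single_pixel_ref and the
// rw/gw/bw parameters are illustrative names taken from the commented-out lines.
static double single_pixel_ref(const double *im, int center, int top, int left,
                               int right, int bottom, double rw, double gw, double bw)
{
    double r = im[center], g = im[center+1], b = im[center+2];

    double grey  = rw*r            + gw*g            + bw*b;
    double grey1 = rw*im[top]      + gw*im[top+1]    + bw*im[top+2];
    double grey2 = rw*im[left]     + gw*im[left+1]   + bw*im[left+2];
    double grey3 = rw*im[right]    + gw*im[right+1]  + bw*im[right+2];
    double grey4 = rw*im[bottom]   + gw*im[bottom+1] + bw*im[bottom+2];

    double mu = (r + g + b) / 3.0;
    double rmu = r-mu, gmu = g-mu, bmu = b-mu;
    double rz = r-0.5, gz = g-0.5, bz = b-0.5;
    double re = exp(-12.5*rz*rz), ge = exp(-12.5*gz*gz), be = exp(-12.5*bz*bz);

    double t1 = sqrt((rmu*rmu + gmu*gmu + bmu*bmu)/3.0);      // spread of r,g,b around their mean
    double t3 = re*ge*be;                                     // product of the three exp terms
    double t6 = -4.0*grey + grey1 + grey2 + grey3 + grey4;    // 5-point Laplacian of the grey values
    return fabs(t1) * fabs(t3) * fabs(t6) + 1.0E-12;
}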
// y = A'*x (alg==0), y += A'*x (alg==1) or y -= A'*x (alg==-1) for one 4-column panel
// of the packed, panel-major matrix A (panel height 4, panel stride sda).
void kernel_dgemv_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg)
{
    if(kmax<=0)
        return;

    const int lda = 4;

    __builtin_prefetch( A + 0*lda );
    __builtin_prefetch( A + 2*lda );

    double *tA, *tx;

    int k;
    int ka = kmax; // number from aligned position

    __m256d
        aaxx_temp,
        a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
        x_0_1_2_3,
        y_00, y_11, y_22, y_33;

    __m128d
        ax_temp,
        a_00_10, a_01_11, a_02_12, a_03_13,
        x_0_1,
        y_0, y_1, y_2, y_3;

    y_00 = _mm256_setzero_pd();
    y_11 = _mm256_setzero_pd();
    y_22 = _mm256_setzero_pd();
    y_33 = _mm256_setzero_pd();

    y_0 = _mm256_castpd256_pd128(y_00);
    y_1 = _mm256_castpd256_pd128(y_11);
    y_2 = _mm256_castpd256_pd128(y_22);
    y_3 = _mm256_castpd256_pd128(y_33);

    // scalar tail first: rows beyond the last full 4-row block
    k = lda*(ka/lda);
    tA = A + (ka/lda)*sda*lda;
    tx = x + (ka/lda)*lda;

    for(; k<ka; k++)
    {
        x_0_1 = _mm_load_sd( &tx[0] );

        a_00_10 = _mm_load_sd( &tA[0+lda*0] );
        a_01_11 = _mm_load_sd( &tA[0+lda*1] );
        a_02_12 = _mm_load_sd( &tA[0+lda*2] );
        a_03_13 = _mm_load_sd( &tA[0+lda*3] );

        ax_temp = _mm_mul_sd( a_00_10, x_0_1 );  y_0 = _mm_add_sd (y_0, ax_temp );
        ax_temp = _mm_mul_sd( a_01_11, x_0_1 );  y_1 = _mm_add_sd (y_1, ax_temp );
        ax_temp = _mm_mul_sd( a_02_12, x_0_1 );  y_2 = _mm_add_sd (y_2, ax_temp );
        ax_temp = _mm_mul_sd( a_03_13, x_0_1 );  y_3 = _mm_add_sd (y_3, ax_temp );

        tA += 1;
        tx += 1;
    }

    y_00 = _mm256_castpd128_pd256(y_0);
    y_11 = _mm256_castpd128_pd256(y_1);
    y_22 = _mm256_castpd128_pd256(y_2);
    y_33 = _mm256_castpd128_pd256(y_3);

    // main loop, unrolled twice (8 rows per iteration)
    k=0;
    for(; k<ka-7; k+=8)
    {
        __builtin_prefetch( A + sda*lda + 0*lda );
        __builtin_prefetch( A + sda*lda + 2*lda );

        x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

        a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
        a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
        a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

        aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );  y_00 = _mm256_add_pd( y_00, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );  y_11 = _mm256_add_pd( y_11, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );  y_22 = _mm256_add_pd( y_22, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );  y_33 = _mm256_add_pd( y_33, aaxx_temp );

        A += 4 + (sda-1)*lda;
        x += 4;

        __builtin_prefetch( A + sda*lda + 0*lda );
        __builtin_prefetch( A + sda*lda + 2*lda );

        x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

        a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
        a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
        a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

        aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );  y_00 = _mm256_add_pd( y_00, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );  y_11 = _mm256_add_pd( y_11, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );  y_22 = _mm256_add_pd( y_22, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );  y_33 = _mm256_add_pd( y_33, aaxx_temp );

        A += 4 + (sda-1)*lda;
        x += 4;
    }
    for(; k<ka-3; k+=4)
    {
        __builtin_prefetch( A + sda*lda + 0*lda );
        __builtin_prefetch( A + sda*lda + 2*lda );

        x_0_1_2_3 = _mm256_loadu_pd( &x[0] );

        a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );
        a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
        a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

        aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );  y_00 = _mm256_add_pd( y_00, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );  y_11 = _mm256_add_pd( y_11, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );  y_22 = _mm256_add_pd( y_22, aaxx_temp );
        aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );  y_33 = _mm256_add_pd( y_33, aaxx_temp );

        A += 4 + (sda-1)*lda;
        x += 4;
    }

    // reduce the four column accumulators and store according to alg
    __m256d y_0_1_2_3;

    y_00 = _mm256_hadd_pd(y_00, y_11);
    y_22 = _mm256_hadd_pd(y_22, y_33);

    y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 );
    y_00 = _mm256_permute2f128_pd(y_22, y_00, 19);

    y_00 = _mm256_add_pd( y_00, y_11 );

    if(alg==0)
    {
        _mm256_storeu_pd(&y[0], y_00);
    }
    else if(alg==1)
    {
        y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 );
        _mm256_storeu_pd(&y[0], y_0_1_2_3);
    }
    else // alg==-1
    {
        y_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 );
        _mm256_storeu_pd(&y[0], y_0_1_2_3);
    }
}
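Both kernel_dsymv_4_lib4 and kernel_dgemv_t_4_lib4 finish with the same four-accumulator reduction: two _mm256_hadd_pd pair up the partial sums, two _mm256_permute2f128_pd separate the low and high 128-bit halves, and a final add leaves {sum(acc0), sum(acc1), sum(acc2), sum(acc3)} in lane order. A minimal standalone check of that sequence (reduce4 is an illustrative helper name, not part of the library; build with e.g. gcc -mavx):

#include <immintrin.h>
#include <stdio.h>

// Reduce four AVX accumulators to their per-vector totals, using the same
// hadd/permute2f128 sequence the kernels apply before storing y / z_t.
static __m256d reduce4(__m256d acc0, __m256d acc1, __m256d acc2, __m256d acc3)
{
    acc0 = _mm256_hadd_pd( acc0, acc1 );              // {a0+a1, b0+b1, a2+a3, b2+b3}
    acc2 = _mm256_hadd_pd( acc2, acc3 );              // {c0+c1, d0+d1, c2+c3, d2+d3}
    acc1 = _mm256_permute2f128_pd( acc2, acc0, 2  );  // low halves:  {a0+a1, b0+b1, c0+c1, d0+d1}
    acc0 = _mm256_permute2f128_pd( acc2, acc0, 19 );  // high halves: {a2+a3, b2+b3, c2+c3, d2+d3}
    return _mm256_add_pd( acc0, acc1 );               // {sum a, sum b, sum c, sum d}
}

int main(void)
{
    __m256d a = _mm256_set_pd(  4.0,  3.0,  2.0,  1.0 );  // sums to 10
    __m256d b = _mm256_set_pd(  8.0,  7.0,  6.0,  5.0 );  // sums to 26
    __m256d c = _mm256_set_pd( 12.0, 11.0, 10.0,  9.0 );  // sums to 42
    __m256d d = _mm256_set_pd( 16.0, 15.0, 14.0, 13.0 );  // sums to 58
    double out[4];
    _mm256_storeu_pd( out, reduce4(a, b, c, d) );
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // expect: 10 26 42 58
    return 0;
}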