/* Approximate pi by integrating 4/(1+x^2) over [0,1] with dt left-endpoint rectangles, 4 per AVX step. */
double compute_pi(size_t dt)
{
    double pi = 0.0;
    double delta = 1.0 / dt;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

    ymm0 = _mm256_set1_pd(1.0);
    ymm1 = _mm256_set1_pd(delta);
    ymm2 = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);  /* lane offsets 0, delta, 2*delta, 3*delta */
    ymm4 = _mm256_setzero_pd();

    /* "i + 4 <= dt" avoids the unsigned wrap-around that "int i <= dt - 4" hits when dt < 4 */
    for (size_t i = 0; i + 4 <= dt; i += 4) {
        ymm3 = _mm256_set1_pd(i * delta);
        ymm3 = _mm256_add_pd(ymm3, ymm2);   /* x, x+delta, x+2*delta, x+3*delta */
        ymm3 = _mm256_mul_pd(ymm3, ymm3);   /* x^2 */
        ymm3 = _mm256_add_pd(ymm0, ymm3);   /* 1 + x^2 */
        ymm3 = _mm256_div_pd(ymm1, ymm3);   /* delta / (1 + x^2) */
        ymm4 = _mm256_add_pd(ymm4, ymm3);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
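/*
 * Hedged scalar reference (compute_pi_ref is an illustrative name, not part of
 * the original): the same left-endpoint Riemann sum of 4/(1+x^2) over [0,1],
 * handy for cross-checking the AVX kernel above when dt is a multiple of 4.
 */
double compute_pi_ref(size_t dt)
{
    double delta = 1.0 / dt;
    double sum = 0.0;
    for (size_t i = 0; i < dt; i++) {
        double x = i * delta;
        sum += delta / (1.0 + x * x);
    }
    return sum * 4.0;
}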
Color3 evalFourier3(float * const coeffs[3], size_t nCoeffs, Float phi) {
#if FOURIER_SCALAR == 1
    double cosPhi      = std::cos((double) phi),
           cosPhi_prev = cosPhi,
           cosPhi_cur  = 1.0f;
    double Y = 0, R = 0, B = 0;

    for (size_t i=0; i<nCoeffs; ++i) {
        Y += coeffs[0][i] * cosPhi_cur;
        R += coeffs[1][i] * cosPhi_cur;
        B += coeffs[2][i] * cosPhi_cur;

        double cosPhi_next = 2*cosPhi*cosPhi_cur - cosPhi_prev;
        cosPhi_prev = cosPhi_cur;
        cosPhi_cur = cosPhi_next;
    }

    double G = 1.39829f*Y -0.100913f*B - 0.297375f*R;
    return Color3((Float) R, (Float) G, (Float) B);
#else
    double cosPhi = std::cos((double) phi);

    __m256d cosPhi_prev = _mm256_set1_pd(cosPhi),
            cosPhi_cur  = _mm256_set1_pd(1.0),
            Y           = _mm256_set_sd((double) coeffs[0][0]),
            R           = _mm256_set_sd((double) coeffs[1][0]),
            B           = _mm256_set_sd((double) coeffs[2][0]),
            factorPhi_prev, factorPhi_cur;

    initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

    for (size_t i=1; i<nCoeffs; i+=4) {
        __m256d cosPhi_next = _mm256_add_pd(
            _mm256_mul_pd(factorPhi_prev, cosPhi_prev),
            _mm256_mul_pd(factorPhi_cur,  cosPhi_cur));

        Y = _mm256_add_pd(Y, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[0]+i))));
        R = _mm256_add_pd(R, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[1]+i))));
        B = _mm256_add_pd(B, _mm256_mul_pd(cosPhi_next,
                _mm256_cvtps_pd(_mm_load_ps(coeffs[2]+i))));

        cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
        cosPhi_cur  = _mm256_splat3_pd(cosPhi_next);
    }

    MM_ALIGN32 struct { double Y; double R; double B; double unused; } tmp;
    simd::hadd(Y, R, B, _mm256_setzero_pd(), (double *) &tmp);

    double G = 1.39829*tmp.Y -0.100913*tmp.B - 0.297375*tmp.R;
    return Color3((Float) tmp.R, (Float) G, (Float) tmp.B);
#endif
}
int main(int, char**) { volatile __m256d a = _mm256_setzero_pd(); volatile __m256d b = _mm256_set1_pd(42.42); volatile __m256d result = _mm256_add_pd(a, b); (void)result; return 0; }
double compute_pi_leibniz_avx_opt(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
    register __m256d ymm9, ymm10, ymm11, ymm12, ymm13;

    ymm0 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);   /* alternating signs of the Leibniz terms */
    ymm1 = _mm256_set_pd(1.0, 3.0, 5.0, 7.0);     /* four independent streams of odd denominators */
    ymm2 = _mm256_set_pd(9.0, 11.0, 13.0, 15.0);
    ymm3 = _mm256_set_pd(17.0, 19.0, 21.0, 23.0);
    ymm4 = _mm256_set_pd(25.0, 27.0, 29.0, 31.0);
    ymm13 = _mm256_set1_pd(32.0);                 /* 16 terms per iteration -> denominators advance by 32 */
    ymm5 = _mm256_setzero_pd();
    ymm6 = _mm256_setzero_pd();
    ymm7 = _mm256_setzero_pd();
    ymm8 = _mm256_setzero_pd();

    /* "i + 16 <= n" avoids the signed/unsigned wrap of "int i <= n - 16" for small n */
    for (size_t i = 0; i + 16 <= n; i += 16) {
        ymm9 = _mm256_div_pd(ymm0, ymm1);
        ymm1 = _mm256_add_pd(ymm1, ymm13);
        ymm10 = _mm256_div_pd(ymm0, ymm2);
        ymm2 = _mm256_add_pd(ymm2, ymm13);
        ymm11 = _mm256_div_pd(ymm0, ymm3);
        ymm3 = _mm256_add_pd(ymm3, ymm13);
        ymm12 = _mm256_div_pd(ymm0, ymm4);
        ymm4 = _mm256_add_pd(ymm4, ymm13);

        ymm5 = _mm256_add_pd(ymm5, ymm9);
        ymm6 = _mm256_add_pd(ymm6, ymm10);
        ymm7 = _mm256_add_pd(ymm7, ymm11);
        ymm8 = _mm256_add_pd(ymm8, ymm12);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm5);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm6);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm7);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm8);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
/* Unconjugated complex dot product, split (SoA) layout: (da, db) hold the real
 * and imaginary parts of x, (dc, dd) those of y. The real part of the result is
 * written to res[0], the imaginary part to res[1]; nothing is returned, so the
 * return type is void rather than the original (unused) double. */
void zdotu_soa( const int N,
                const double* da, const double* db, const int ix,
                const double* dc, const double* dd, const int iy,
                double* res )
{
    __m256d ymm0;
    __m256d ymm1;
    __m256d ymm2;
    __m256d ymm3;
    __m256d ymm4 = _mm256_setzero_pd();
    __m256d ymm5 = _mm256_setzero_pd();

    int ii;
    //#pragma unroll
    for(ii = 0; ii < N/4; ii++) {
        _mm_prefetch((const char*) da + 0x200, 1);
        _mm_prefetch((const char*) db + 0x200, 1);
        _mm_prefetch((const char*) dc + 0x200, 1);
        _mm_prefetch((const char*) dd + 0x200, 1);

        //IACA_START;

        // 8*4*4 = 128 bytes
        ymm0 = _mm256_loadu_pd(da + 4*ii);
        ymm1 = _mm256_loadu_pd(db + 4*ii);
        ymm2 = _mm256_loadu_pd(dc + 4*ii);
        ymm3 = _mm256_loadu_pd(dd + 4*ii);

        // 2*4*4 = 32 flops
        ymm4 = _mm256_fmsub_pd(ymm0, ymm2, _mm256_fmsub_pd(ymm1, ymm3, ymm4));
        ymm5 = _mm256_fmadd_pd(ymm0, ymm3, _mm256_fmadd_pd(ymm1, ymm2, ymm5));

        // flops/byte ratio = 1/4
        //IACA_END
    }

    double* re = (double*)&ymm4;
    double* im = (double*)&ymm5;

    res[0] = re[0] + re[1] + re[2] + re[3];
    res[1] = im[0] + im[1] + im[2] + im[3];
}
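/*
 * Hedged scalar companion (zdotu_soa_tail is an illustrative name, not in the
 * original): the vector loop above only covers 4*(N/4) elements, so the last
 * N%4 entries of the split real/imaginary arrays can be folded into res with
 * plain scalar code like this.
 */
void zdotu_soa_tail(const int N, const double* da, const double* db,
                    const double* dc, const double* dd, double* res)
{
    for (int i = (N/4)*4; i < N; i++) {
        res[0] += da[i]*dc[i] - db[i]*dd[i];   /* real part */
        res[1] += da[i]*dd[i] + db[i]*dc[i];   /* imaginary part */
    }
}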
//for 20 depth
void conv_forward_1(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 20; d++) {
      vol_t* f = l->filters[d];
      int x = -2;
      int y = -2;
      for(int ay = 0; ay < 8; y += 1, ay++) {
        x = -2;
        for(int ax=0; ax < 8; x += 1, ax++) {
          double a = 0.0;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            int oy = y + fy;
            for(int fx = 0; fx < 5; fx++) {
              int ox = x + fx;
              if(oy >= 0 && oy < 8 && ox >=0 && ox < 8) {
                /* depth 20 = five 4-wide vectors per (fy, fx) position */
                __m256d vector = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20]));
                __m256d vector2 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20]));
                __m256d vectorMult = _mm256_mul_pd(vector, vector2);
                sum =_mm256_add_pd (vectorMult, sum);
                __m256d vector0 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+4]));
                __m256d vector9 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+4]));
                __m256d vectorMult0 = _mm256_mul_pd(vector0, vector9);
                sum =_mm256_add_pd (vectorMult0, sum);
                __m256d vector3 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+8]));
                __m256d vector4 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+8]));
                __m256d vectorMult2 = _mm256_mul_pd(vector3, vector4);
                sum =_mm256_add_pd (vectorMult2, sum);
                __m256d vector5 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+12]));
                __m256d vector6 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+12]));
                __m256d vectorMult3 = _mm256_mul_pd(vector5, vector6);
                sum =_mm256_add_pd (vectorMult3, sum);
                __m256d vector7 = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*20+16]));
                __m256d vector8 = _mm256_loadu_pd (&(V->w[((8 * oy)+ox)*20+16]));
                __m256d vectorMult4 = _mm256_mul_pd(vector7, vector8);
                sum =_mm256_add_pd (vectorMult4, sum);
              }
            }
          }
          for(int i = 0; i < 4; i++) {
            a+= sum[i];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}
// this function assumes data is stored in col-major
// if data is in row major, call it like matmul4x4(B, A, C)
void matmul4x4(double *A, double *B, double *C) {
    __m256d col[4], sum[4];

    //load every column into registers
    for(int i=0; i<4; i++)
        col[i] = _mm256_load_pd(&A[i*4]);

    for(int i=0; i<4; i++) {
        sum[i] = _mm256_setzero_pd();
        for(int j=0; j<4; j++) {
            sum[i] = _mm256_add_pd(_mm256_mul_pd(_mm256_set1_pd(B[i*4+j]), col[j]), sum[i]);
        }
    }

    for(int i=0; i<4; i++)
        _mm256_store_pd(&C[i*4], sum[i]);
}
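/*
 * Hedged usage sketch (not from the original source): matmul4x4 uses aligned
 * loads/stores, so A, B and C must be 32-byte aligned, e.g. via C11 _Alignas.
 * It computes C = A * B with all three matrices in column-major order; assumes
 * <immintrin.h> and the function above are available.
 */
#include <stdio.h>
#include <immintrin.h>

int main(void) {
    _Alignas(32) double A[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1 };  /* identity */
    _Alignas(32) double B[16] = { 1,2,3,4,  5,6,7,8,  9,10,11,12,  13,14,15,16 };
    _Alignas(32) double C[16];

    matmul4x4(A, B, C);   /* C should equal B since A is the identity */

    for (int i = 0; i < 16; i++)
        printf("%g%c", C[i], (i % 4 == 3) ? '\n' : ' ');
    return 0;
}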
int main()
{
    __m256d a;
    __m256i mask;
    double d[4]={1,2,3,4};

    a = _mm256_setzero_pd();
    mask = _mm256_castpd_si256(a);

#ifdef GMX_SIMD_X86_AVX_GCC_MASKLOAD_BUG
    a = _mm256_maskload_pd(d,_mm256_castsi256_pd(mask));
#else
    a = _mm256_maskload_pd(d,mask);
#endif
}
/* Unconjugated complex dot product, interleaved (AoS) layout: dx and dy hold
 * N complex numbers as (re, im) pairs. The real part of the result goes to
 * res[0], the imaginary part to res[1]; the return type is void rather than
 * the original (unused) double. */
void zdotu_aos( const int N,
                const double* dx, const int ix,
                const double* dy, const int iy,
                double* res )
{
    __m256d ymm0;
    __m256d ymm1;
    __m256d ymm2;
    __m256d ymm3;
    __m256d ymm4 = _mm256_setzero_pd();
    __m256d ymm5 = _mm256_setzero_pd();

    int ii = 0;
    //for(ii = 0; ii < N/2; ii++)
    do {
        //IACA_START;
        ymm0 = _mm256_loadu_pd(dx + 4*ii);           /* two interleaved complex numbers of x */
        ymm1 = _mm256_loadu_pd(dy + 4*ii);           /* two interleaved complex numbers of y */

        ymm4 = _mm256_fmadd_pd(ymm1, ymm0, ymm4);    /* xr*yr and xi*yi pairs */
        ymm2 = _mm256_permute_pd(ymm1, 0x5);         /* swap re/im within each complex of y */
        ymm5 = _mm256_fmadd_pd(ymm2, ymm0, ymm5);    /* xr*yi and xi*yr pairs */
        ii++;
    } while (ii < N/2);
    //IACA_END

    double* re = (double*)&ymm4;
    double* im = (double*)&ymm5;

    res[0] = re[0] - re[1] + re[2] - re[3];          /* Re: xr*yr - xi*yi */
    res[1] = im[0] + im[1] + im[2] + im[3];          /* Im: xr*yi + xi*yr */
}
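/*
 * Hedged scalar reference for zdotu_aos (zdotu_aos_ref is an illustrative name,
 * not in the original): the unconjugated complex dot product over interleaved
 * (re, im) pairs, assuming N complex elements at unit stride. Useful for
 * validating the AVX kernel above.
 */
void zdotu_aos_ref(const int N, const double* dx, const double* dy, double* res)
{
    double re = 0.0, im = 0.0;
    for (int i = 0; i < N; i++) {
        double xr = dx[2*i], xi = dx[2*i + 1];
        double yr = dy[2*i], yi = dy[2*i + 1];
        re += xr*yr - xi*yi;
        im += xr*yi + xi*yr;
    }
    res[0] = re;
    res[1] = im;
}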
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d
        curr,
        forw,
        coef_0,
        coef_1,
        end = _mm256_load_pd((const double *)cords),
        accum_sum = _mm256_setzero_pd();
    double accum_sum_aux;

    unsigned long index;
    for (index = 0; index < (cords_len - 4); index += 4) {
        curr = end;                                                // x0,y0,x1,y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]);  // x2,y2,x3,y3
        end  = _mm256_load_pd((const double *)&cords[index + 4]);  // x4,y4,x5,y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001);   // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000);    // x2, y2, x4, y4

        //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1*x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000), // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)       // y1, x1, y3, x3
                ),
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4
                //              ^ products: x1*y2, y1*x2, x3*y4, y3*x4
            )
        );
    }

    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ...

    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux;
    // return scalar_half(scalar_abs(accum_sum_aux));
}
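/*
 * Hedged scalar reference (polygon_signed_area_ref is an illustrative name, not
 * part of the original): the plain shoelace sum that the AVX routine above
 * vectorizes, assuming cords is an array of (x, y) pairs and, as in the routine
 * above, that the closing edge is handled by the caller (e.g. by repeating the
 * first vertex at the end). Half the absolute value of the result is the area.
 */
double polygon_signed_area_ref(const double (*cords)[2], unsigned long cords_len)
{
    double sum = 0.0;
    for (unsigned long i = 0; i + 1 < cords_len; i++)
        sum += cords[i][0] * cords[i + 1][1] - cords[i][1] * cords[i + 1][0];
    return sum;
}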
/* Basel-series approximation: pi = sqrt(6 * sum_{k=1..n} 1/k^2). */
double compute_pi_euler_avx(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3;

    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(1.0);
    ymm2 = _mm256_set1_pd(6.0);

    for (size_t i = 0; i + 4 <= n; i += 4) {
        /* k = i+1 .. i+4; starting at 1 avoids the 1/0 term that the original i, i+1, i+2, i+3 produced */
        ymm3 = _mm256_set_pd(i + 1.0, i + 2.0, i + 3.0, i + 4.0);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);   /* k^2 */
        ymm3 = _mm256_div_pd(ymm1, ymm3);   /* 1/k^2 */
        ymm0 = _mm256_add_pd(ymm0, ymm3);
    }
    ymm3 = _mm256_mul_pd(ymm2, ymm0);       /* 6 * sum */

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm3);             /* store the scaled sum, not the raw accumulator */
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return sqrt( pi );
}
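/*
 * Hedged scalar check (compute_pi_euler_ref is an illustrative name): the same
 * Basel sum, sqrt(6 * sum 1/k^2), without SIMD; assumes <math.h> is available
 * as in the snippet above. Useful for verifying the corrected kernel.
 */
double compute_pi_euler_ref(size_t n)
{
    double sum = 0.0;
    for (size_t k = 1; k <= n; k++)
        sum += 1.0 / ((double)k * (double)k);
    return sqrt(6.0 * sum);
}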
ALGEBRA_INLINE double vector_ps_double (const double* pa,const double* pb,size_t n) {
    if(ALGEBRA_IS_ALIGNED(pa) && ALGEBRA_IS_ALIGNED(pb)) {
        size_t q = n/4;
        size_t r = n%4;
        double w = 0;

        if(q>0) {
            __m256d acc = _mm256_setzero_pd();
            __m256d i1 = _mm256_load_pd(pa);
            __m256d j1 = _mm256_load_pd(pb);
            pa += 4;
            pb += 4;
            __m256d s = _mm256_mul_pd(i1, j1);
            acc = _mm256_add_pd(acc, s);

            while(--q != 0) {
                // load
                i1 = _mm256_load_pd(pa);
                j1 = _mm256_load_pd(pb);
                pa += 4;
                pb += 4;
                // multiply
                s = _mm256_mul_pd(i1, j1);
                // accumulate
                acc = _mm256_add_pd(acc, s);
            }

            // final sum
            // horizontal add
            acc = _mm256_hadd_pd(acc, acc);
            // swap the high and low 128-bit halves
            __m256d accp = _mm256_permute2f128_pd(acc, acc, 1);
            // vertical add
            acc = _mm256_add_pd(acc, accp);
            // extract
            _mm_store_sd(&w, _mm256_extractf128_pd(acc,0));
        }
        return w + vector_ps_double_basic(pa, pb, r);
    }
    return vector_ps_double_basic(pa, pb, n);
}
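/*
 * The hadd / permute2f128 / add sequence above is a generic horizontal sum of a
 * __m256d; a hedged stand-alone helper (hsum256_pd is an illustrative name, not
 * part of any of the snippets) makes the reduction pattern explicit.
 */
static inline double hsum256_pd(__m256d v)
{
    __m256d t   = _mm256_hadd_pd(v, v);                 /* {v0+v1, v0+v1, v2+v3, v2+v3} */
    __m256d sw  = _mm256_permute2f128_pd(t, t, 0x01);   /* swap the 128-bit halves */
    __m256d sum = _mm256_add_pd(t, sw);                 /* every lane = v0+v1+v2+v3 */
    return _mm_cvtsd_f64(_mm256_castpd256_pd128(sum));  /* take lane 0 */
}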
double compute_pi_leibniz_fma(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(2.0);
    ymm2 = _mm256_set1_pd(1.0);
    ymm3 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);   /* alternating signs of the series */

    /* "i + 4 <= n" avoids the signed/unsigned wrap of "int i <= n - 4" for small n */
    for (size_t i = 0; i + 4 <= n; i += 4) {
        ymm4 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
        ymm4 = _mm256_fmadd_pd(ymm1, ymm4, ymm2); /* 2k + 1 */
        ymm4 = _mm256_div_pd(ymm3, ymm4);         /* +/- 1 / (2k + 1) */
        ymm0 = _mm256_add_pd(ymm0, ymm4);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm0);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
void conv_forward(conv_layer_t* l, vol_t** in, vol_t** out, int start, int end) {
  uint64_t tempTime = timestamp_us();
  for (int i = start; i <= end; i++) {
    vol_t* V = in[i];
    vol_t* A = out[i];
    for(int d = 0; d < 16; d++) {
      vol_t* f = l->filters[d];
      int x = -2;
      int y = -2;
      for(int ay = 0; ay < 32; y += 1, ay++) {
        x = -2;
        for(int ax=0; ax < 32; x += 1, ax++) {
          double a = 0.0;
          __m256d sum = _mm256_setzero_pd();
          for(int fy = 0; fy < 5; fy++) {
            int oy = y + fy;
            for(int fx = 0; fx < 5; fx++) {
              int ox = x + fx;
              if(oy >= 0 && oy < 32 && ox >=0 && ox < 32) {
                /* depth is 3, but a full 4-wide unaligned load is used; the 4th
                   lane is discarded below (and reads one double past the three
                   channels of this position) */
                __m256d vector = _mm256_loadu_pd (&(f->w[((5 * fy)+fx)*3]));
                __m256d vector2 = _mm256_loadu_pd (&(V->w[((32 * oy)+ox)*3]));
                __m256d vectorMult = _mm256_mul_pd(vector, vector2);
                sum =_mm256_add_pd (vectorMult, sum);
              }
            }
          }
          for(int i = 0; i < 3; i++) {
            a+= sum[i];
          }
          a += l->biases->w[d];
          set_vol(A, ax, ay, d, a);
        }
      }
    }
  }
  l->myTime += timestamp_us() - tempTime;
}
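/*
 * Hedged alternative for the 3-channel load above (not from the original code):
 * _mm256_maskload_pd with a 3-lane mask loads exactly the three channel values
 * and zeroes the 4th lane, so nothing is read past the end of the buffer and
 * the partial horizontal sum over sum[0..2] could become a full-lane sum.
 */
static inline __m256d load3_pd(const double* p)
{
    const __m256i mask3 = _mm256_set_epi64x(0, -1, -1, -1); /* lanes 0..2 active */
    return _mm256_maskload_pd(p, mask3);
}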
DBL AVX2FMA3Noise(const Vector3d& EPoint, int noise_generator) { AVX2TABLETYPE *mp; DBL sum = 0.0; // TODO FIXME - global statistics reference // Stats[Calls_To_Noise]++; if (noise_generator == kNoiseGen_Perlin) { // The 1.59 and 0.985 are to correct for some biasing problems with // the random # generator used to create the noise tables. Final // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean // value is 0.49 (ideally it would be 0.5). sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985); // Clamp final value to 0-1 range if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; return sum; } const __m256d ONE_PD = _mm256_set1_pd(1); const __m128i short_si128 = _mm_set1_epi32(0xffff); const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0); const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON); const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy); const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn)); const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0); const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn)); const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD); const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff)); const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn)))); const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn); const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20); const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0)); const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1)); const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy); const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); int ints[4]; _mm_storeu_si128((__m128i*)(ints), i_xyzn); const int ixiy_hash = Hash2d(ints[0], ints[1]); const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]); const int ixjy_hash = Hash2d(ints[0], ints[1] + 1); const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1); const int iz = ints[2]; const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); __m256d sumr = _mm256_setzero_pd(); __m256d sumr1 = _mm256_setzero_pd(); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)]; INCSUMAVX_NOBLEND(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)), iii); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 2); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 4); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)), iii, jjj, 6); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)), iii, jjj, 8); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)]; INCSUMAVX(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)), iii, jjj, 10); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)]; INCSUMAVX(sumr, mp, PERMUTE4x64(incrsump_s2, 
_MM_SHUFFLE(2, 2, 2, 2)), iii, jjj, 12); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)]; INCSUMAVX_NOBLEND(sumr1, mp, PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)), jjj); { sumr = _mm256_add_pd(sumr, sumr1); __m128d sumr_up = _mm256_extractf128_pd(sumr,1); sumr_up = _mm_add_pd(_mm256_castpd256_pd128(sumr),sumr_up); sumr_up = _mm_hadd_pd(sumr_up,sumr_up); sum = _mm_cvtsd_f64(sumr_up); } if (noise_generator == kNoiseGen_RangeCorrected) { /* details of range here: Min, max: -1.05242, 0.988997 Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828 We want to change it to as close to [0,1] as possible. */ sum += 1.05242; sum *= 0.48985582; /*sum *= 0.5; sum += 0.5;*/ if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; } else { sum = sum + 0.5; /* range at this point -0.5 - 0.5... */ if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; } #if CHECK_FUNCTIONAL { DBL orig_sum = PortableNoise(EPoint, noise_generator); if (fabs(orig_sum - sum) >= EPSILON) { throw POV_EXCEPTION_STRING("Noise error"); } } #endif _mm256_zeroupper(); return (sum); }
/* Naive implementation of Matrix Matrix Multiplication @param A input matrix @param B input matrix @param C output matrix */ inline void naive(const Matrix& A, const Matrix& B, Matrix& C){ //preload dimensions for faster access int dimM = C.getDimM(); int dimN = C.getDimN(); int dimL = A.getDimN(); for (int m = 0; m < dimM; m+=4){ ///rows of c for (int n = 0; n < dimN; n+=4){ ///cols of c //do calculation of a 4x4 block //std::cout << m << "\t" << n << std::endl; __m256d* pA = A.get(m, 0); __m256d* pB = A.get(m+1, 0); __m256d* pC = A.get(m+2, 0); __m256d* pD = A.get(m+3, 0); __m256d* pK = B.getT(0, n); __m256d* pL = B.getT(0, n+1); __m256d* pM = B.getT(0, n+2); __m256d* pN = B.getT(0, n+3); //std::cout << pA << "\t" << pB << "\t" << pC << "\t" << pD << std::endl; __m256d K = _mm256_setzero_pd(); __m256d L = _mm256_setzero_pd(); __m256d M = _mm256_setzero_pd(); __m256d N = _mm256_setzero_pd(); __m256d O = _mm256_setzero_pd(); __m256d P = _mm256_setzero_pd(); __m256d Q = _mm256_setzero_pd(); __m256d R = _mm256_setzero_pd(); __m256d S = _mm256_setzero_pd(); __m256d T = _mm256_setzero_pd(); __m256d U = _mm256_setzero_pd(); __m256d V = _mm256_setzero_pd(); __m256d W = _mm256_setzero_pd(); __m256d X = _mm256_setzero_pd(); __m256d Y = _mm256_setzero_pd(); __m256d Z = _mm256_setzero_pd(); for (int l = 0; l < dimL; l+=4){ //std::cout <<"mul" << std::endl; K = K + (*pA) * (*pK); L = L + (*pA) * (*pL); M = M + (*pA) * (*pM); N = N + (*pA) * (*pN); O = O + (*pB) * (*pK); P = P + (*pB) * (*pL); Q = Q + (*pB) * (*pM); R = R + (*pB) * (*pN); S = S + (*pC) * (*pK); T = T + (*pC) * (*pL); U = U + (*pC) * (*pM); V = V + (*pC) * (*pN); W = W + (*pD) * (*pK); X = X + (*pD) * (*pL); Y = Y + (*pD) * (*pM); Z = Z + (*pD) * (*pN); //std::cout << "inc" <<std::endl; pA++; pB++; pC++; pD++; pK++; pL++; pM++; pN++; } // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} __m256d sumab = _mm256_hadd_pd(K, L); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} __m256d sumcd = _mm256_hadd_pd(M, N); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} __m256d blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} __m256d perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); __m256d sum = _mm256_add_pd(perm, blend); C.set(m, n, sum); //C(m , n) = K[0] + K[1] + K[2] + K[3]; //C(m , n+1) = L[0] + L[1] + L[2] + L[3]; //C(m , n+2) = M[0] + M[1] + M[2] + M[3]; //C(m , n+3) = N[0] + N[1] + N[2] + N[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(O, P); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(Q, R); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+1, n, sum); //C(m+1, n ) = O[0] + O[1] + O[2] + O[3]; //C(m+1, n+1) = P[0] + P[1] + P[2] + P[3]; //C(m+1, n+2) = Q[0] + Q[1] + Q[2] + Q[3]; //C(m+1, n+3) = R[0] + R[1] + R[2] + R[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(S, T); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(U, V); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+2, n, sum); //C(m+2, n ) = S[0] + S[1] + S[2] + S[3]; //C(m+2, n+1) = T[0] + T[1] + T[2] + T[3]; //C(m+2, n+2) = U[0] + U[1] + U[2] + U[3]; //C(m+2, n+3) = 
V[0] + V[1] + V[2] + V[3]; // {a[0]+a[1], b[0]+b[1], a[2]+a[3], b[2]+b[3]} sumab = _mm256_hadd_pd(W, X); // {c[0]+c[1], d[0]+d[1], c[2]+c[3], d[2]+d[3]} sumcd = _mm256_hadd_pd(Y, Z); // {a[0]+a[1], b[0]+b[1], c[2]+c[3], d[2]+d[3]} blend = _mm256_blend_pd(sumab, sumcd, 0b1100); // {a[2]+a[3], b[2]+b[3], c[0]+c[1], d[0]+d[1]} perm = _mm256_permute2f128_pd(sumab, sumcd, 0x21); sum = _mm256_add_pd(perm, blend); C.set(m+3, n, sum); //C(m+3, n ) = W[0] + W[1] + W[2] + W[3]; //C(m+3, n+1) = X[0] + X[1] + X[2] + X[3]; //C(m+3, n+2) = Y[0] + Y[1] + Y[2] + Y[3]; //C(m+3, n+3) = Z[0] + Z[1] + Z[2] + Z[3]; } } }
void ks_gaussian_int_d8x4( int k, double alpha, double *u, double *aa, double *a, double *bb, double *b, double *w, aux_t *aux ) { int i; double neg2 = -2.0; double dzero = 0.0; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t c_tmp; v4df_t u03; v4df_t u47; v4df_t a03, a47; v4df_t A03, A47; // prefetched A v4df_t b0, b1, b2, b3; v4df_t B0; // prefetched B v4df_t aa_tmp, bb_tmp; v4df_t w_tmp; //// Inline vdExp() //const double log2e = 1.4426950408889634073599; //const double maxlog = 7.09782712893383996843e2; // log( 2**1024 ) //const double minlog = -7.08396418532264106224e2; // log( 2**-1024 ) //const double one = 1.0; //const double c1 = 6.93145751953125E-1; //const double c2 = 1.42860682030941723212E-6; //// Original Remez Order 11 coefficients //const double w11 = 3.5524625185478232665958141148891055719216674475023e-8; //const double w10 = 2.5535368519306500343384723775435166753084614063349e-7; //const double w9 = 2.77750562801295315877005242757916081614772210463065e-6; //const double w8 = 2.47868893393199945541176652007657202642495832996107e-5; //const double w7 = 1.98419213985637881240770890090795533564573406893163e-4; //const double w6 = 1.3888869684178659239014256260881685824525255547326e-3; //const double w5 = 8.3333337052009872221152811550156335074160546333973e-3; //const double w4 = 4.1666666621080810610346717440523105184720007971655e-2; //const double w3 = 0.166666666669960803484477734308515404418108830469798; //const double w2 = 0.499999999999877094481580370323249951329122224389189; //const double w1 = 1.0000000000000017952745258419615282194236357388884; //const double w0 = 0.99999999999999999566016490920259318691496540598896; // Remez Order 11 polynomail approximation //const double w0 = 9.9999999999999999694541216787022234814339814028865e-1; //const double w1 = 1.0000000000000013347525109964212249781265243645457; //const double w2 = 4.9999999999990426011279542064313207349934058355357e-1; //const double w3 = 1.6666666666933781279020916199156875162816850273886e-1; //const double w4 = 4.1666666628388978913396218847247771982698350546174e-2; //const double w5 = 8.3333336552944126722390410619859929515740995889372e-3; //const double w6 = 1.3888871805082296012945081624687544823497126781709e-3; //const double w7 = 1.9841863599469418342286677256362193951266072398489e-4; //const double w8 = 2.4787899938611697691690479138150629377630767114546e-5; //const double w9 = 2.7764095757136528235740765949934667970688427190168e-6; //const double w10 = 2.5602485412126369546033948405199058329040797134573e-7; //const double w11 = 3.5347283721656121939634391175390704621351283546671e-8; // Remez Order 9 polynomail approximation // const double w0 = 9.9999999999998657717890998293462356769270934668652e-1; // const double w1 = 1.0000000000041078023971691258305486059867172736079; // const double w2 = 4.9999999979496223000111361187419539211772440139043e-1; // const double w3 = 1.6666667059968250851708016603646727895353772273675e-1; // const double w4 = 4.1666628655740875994884332519499013211594753124142e-2; // const double w5 = 8.3335428149736685441705398632467122758546893330069e-3; // const double w6 = 1.3881912931358424526285652289974115047170651985345e-3; // const double w7 = 1.9983735415194021112767942931416179152416729204150e-4; // const double w8 = 2.3068467290270483679711135625155862511780587976925e-5; // const double w9 = 3.8865682386514872192656192137071689334005518164704e-6; 
//v4df_t a03_0, a03_1, a03_2, a03_3; //v4df_t a47_0, a47_1, a47_2, a47_3; //v4df_t p03_0, p03_1, p03_2, p03_3; //v4df_t p47_0, p47_1, p47_2, p47_3; //v4df_t y, l2e, tmp, p; //v4li_t k03_0, k03_1, k03_2, k03_3; //v4li_t k47_0, k47_1, k47_2, k47_3; //v4li_t offset; //v4li_t k1, k2; //__m128d p1, p2; int k_iter = k / 2; int k_left = k % 2; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( aux->b_next ) ); c03_0.v = _mm256_setzero_pd(); c03_1.v = _mm256_setzero_pd(); c03_2.v = _mm256_setzero_pd(); c03_3.v = _mm256_setzero_pd(); c47_0.v = _mm256_setzero_pd(); c47_1.v = _mm256_setzero_pd(); c47_2.v = _mm256_setzero_pd(); c47_3.v = _mm256_setzero_pd(); // Load a03 a03.v = _mm256_load_pd( (double*)a ); // Load a47 a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // Load (b0,b1,b2,b3) b0.v = _mm256_load_pd( (double*)b ); for ( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Preload A03 A03.v = _mm256_load_pd( (double*)( a + 8 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Preload A47 A47.v = _mm256_load_pd( (double*)( a + 12 ) ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); // Preload B0 B0.v = _mm256_load_pd( (double*)( b + 4 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Iteration #1 __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Preload a03 ( next iteration ) a03.v = _mm256_load_pd( (double*)( a + 16 ) ); c_tmp.v = _mm256_mul_pd( A03.v , B0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , B0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c_tmp.v = _mm256_mul_pd( A03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // Preload a47 ( next iteration ) a47.v = _mm256_load_pd( (double*)( a + 20 ) ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( A47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c_tmp.v = _mm256_mul_pd( A03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Load b0 ( next iteration ) b0.v = _mm256_load_pd( (double*)( b + 8 ) ); c_tmp.v = _mm256_mul_pd( A03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( A47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 16; b += 8; } for ( i = 0; i < k_left; ++i ) { a03.v = _mm256_load_pd( (double*)a ); //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] ); a47.v = _mm256_load_pd( (double*)( a + 4 ) ); //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] 
); b0.v = _mm256_load_pd( (double*)b ); //printf( "b0 = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 8; b += 4; } // Prefetch aa and bb __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) ); tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 ); tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 ); tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 ); tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 ); tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 ); tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 ); tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 ); tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 ); //printf( "rank-k\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < k; i++ ) { // a03.v = _mm256_load_pd( (double*)a ); // a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // b0.v = _mm256_broadcast_sd( (double*)b ); // b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) ); // b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) ); // b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) ); // a += DKS_MR; // b += DKS_NR; // c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); // c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); // c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); // c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); // c03_3.v = _mm256_add_pd( 
c_tmp.v, c03_3.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); // c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); // c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); // c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); // c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); //} aa_tmp.v = _mm256_broadcast_sd( &neg2 ); //c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); //c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); //c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); //c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); //c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); //c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); //c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); //c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); //printf( "scale -2 \n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); aa_tmp.v = _mm256_load_pd( (double*)aa ); c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v ); //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] ); //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] ); aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) ); c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v ); //printf( "add a^2\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // Prefetch u __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); bb_tmp.v = _mm256_broadcast_sd( (double*)bb ); c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v ); c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) ); c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v ); c47_1.v = _mm256_add_pd( 
bb_tmp.v, c47_1.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) ); c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v ); c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) ); c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v ); c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v ); // Check if there is any illegle value c_tmp.v = _mm256_broadcast_sd( &dzero ); c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v ); // Scale before the kernel evaluation aa_tmp.v = _mm256_broadcast_sd( &alpha ); c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // Preload u03, u47 u03.v = _mm256_load_pd( (double*)u ); u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // Prefetch w __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); #include "ks_exp_int_d8x4.h" //printf( "square distance\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error Nan: c03_0[ %d ]\n", i ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error Nan: c03_1[ %d ]\n", i ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error Nan: c03_2[ %d ]\n", i ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error Nan: c03_3[ %d ]\n", i ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error Nan: c47_0[ %d ]\n", i ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error Nan: c47_1[ %d ]\n", i ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error Nan: c47_2[ %d ]\n", i ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error Nan: c47_3[ %d ]\n", i ); // } //} // tmp.v = _mm256_broadcast_sd( &maxlog ); // c03_0.v = _mm256_min_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_min_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_min_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_min_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_min_pd( tmp.v, c47_0.v ); // c47_1.v = _mm256_min_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_min_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_min_pd( tmp.v, c47_3.v ); // tmp.v = _mm256_broadcast_sd( &minlog ); // c03_0.v = _mm256_max_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_max_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_max_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_max_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_max_pd( tmp.v, c47_0.v ); // 
c47_1.v = _mm256_max_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_max_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_max_pd( tmp.v, c47_3.v ); // // // a = c / log2e // // c = a * ln2 = k * ln2 + w, ( w in [ -ln2, ln2 ] ) // l2e.v = _mm256_broadcast_sd( &log2e ); // a03_0.v = _mm256_mul_pd( l2e.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( l2e.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( l2e.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( l2e.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( l2e.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( l2e.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( l2e.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( l2e.v, c47_3.v ); // // // Check if a < 0 // tmp.v = _mm256_setzero_pd(); // p03_0.v = _mm256_cmp_pd( a03_0.v, tmp.v, 1 ); // p03_1.v = _mm256_cmp_pd( a03_1.v, tmp.v, 1 ); // p03_2.v = _mm256_cmp_pd( a03_2.v, tmp.v, 1 ); // p03_3.v = _mm256_cmp_pd( a03_3.v, tmp.v, 1 ); // p47_0.v = _mm256_cmp_pd( a47_0.v, tmp.v, 1 ); // p47_1.v = _mm256_cmp_pd( a47_1.v, tmp.v, 1 ); // p47_2.v = _mm256_cmp_pd( a47_2.v, tmp.v, 1 ); // p47_3.v = _mm256_cmp_pd( a47_3.v, tmp.v, 1 ); // tmp.v = _mm256_broadcast_sd( &one ); // p03_0.v = _mm256_and_pd( tmp.v, p03_0.v ); // p03_1.v = _mm256_and_pd( tmp.v, p03_1.v ); // p03_2.v = _mm256_and_pd( tmp.v, p03_2.v ); // p03_3.v = _mm256_and_pd( tmp.v, p03_3.v ); // p47_0.v = _mm256_and_pd( tmp.v, p47_0.v ); // p47_1.v = _mm256_and_pd( tmp.v, p47_1.v ); // p47_2.v = _mm256_and_pd( tmp.v, p47_2.v ); // p47_3.v = _mm256_and_pd( tmp.v, p47_3.v ); // // If a < 0 ( w < 0 ), then a - 1 = ( k - 1 ) + w / ln2 // a03_0.v = _mm256_sub_pd( a03_0.v, p03_0.v ); // a03_1.v = _mm256_sub_pd( a03_1.v, p03_1.v ); // a03_2.v = _mm256_sub_pd( a03_2.v, p03_2.v ); // a03_3.v = _mm256_sub_pd( a03_3.v, p03_3.v ); // a47_0.v = _mm256_sub_pd( a47_0.v, p47_0.v ); // a47_1.v = _mm256_sub_pd( a47_1.v, p47_1.v ); // a47_2.v = _mm256_sub_pd( a47_2.v, p47_2.v ); // a47_3.v = _mm256_sub_pd( a47_3.v, p47_3.v ); // // Compute floor( a ) by two conversions // // if a < 0, p = k - 1 // // else , p = k // k03_0.v = _mm256_cvttpd_epi32( a03_0.v ); // k03_1.v = _mm256_cvttpd_epi32( a03_1.v ); // k03_2.v = _mm256_cvttpd_epi32( a03_2.v ); // k03_3.v = _mm256_cvttpd_epi32( a03_3.v ); // k47_0.v = _mm256_cvttpd_epi32( a47_0.v ); // k47_1.v = _mm256_cvttpd_epi32( a47_1.v ); // k47_2.v = _mm256_cvttpd_epi32( a47_2.v ); // k47_3.v = _mm256_cvttpd_epi32( a47_3.v ); // p03_0.v = _mm256_cvtepi32_pd( k03_0.v ); // p03_1.v = _mm256_cvtepi32_pd( k03_1.v ); // p03_2.v = _mm256_cvtepi32_pd( k03_2.v ); // p03_3.v = _mm256_cvtepi32_pd( k03_3.v ); // p47_0.v = _mm256_cvtepi32_pd( k47_0.v ); // p47_1.v = _mm256_cvtepi32_pd( k47_1.v ); // p47_2.v = _mm256_cvtepi32_pd( k47_2.v ); // p47_3.v = _mm256_cvtepi32_pd( k47_3.v ); // // // --------------------- // // x -= p * ln2 // // --------------------- // // c1 = ln2 // // if a < 0, a = ( k - 1 ) * ln2 // // else , a = k * ln2 // // if a < 0, x -= ( k - 1 ) * ln2 // // else , x -= k * ln2 // // // tmp.v = _mm256_broadcast_sd( &c1 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // 
c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // tmp.v = _mm256_broadcast_sd( &c2 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // // // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // // // // Prefetch u // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); // // // // // Compute e^x using polynomial approximation // // a = w10 + w11 * x // tmp.v = _mm256_broadcast_sd( &w11 ); // //tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( c03_0.v, tmp.v ); // a03_1.v = _mm256_mul_pd( c03_1.v, tmp.v ); // a03_2.v = _mm256_mul_pd( c03_2.v, tmp.v ); // a03_3.v = _mm256_mul_pd( c03_3.v, tmp.v ); // a47_0.v = _mm256_mul_pd( c47_0.v, tmp.v ); // a47_1.v = _mm256_mul_pd( c47_1.v, tmp.v ); // a47_2.v = _mm256_mul_pd( c47_2.v, tmp.v ); // a47_3.v = _mm256_mul_pd( c47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w10 ); // //tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x // tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // 
a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w7 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w6 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w5 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w4 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // 
a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Prefetch w // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); // // Preload u03 // u03.v = _mm256_load_pd( (double*)u ); // // // tmp.v = _mm256_broadcast_sd( &w3 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w2 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w1 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w0 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // 
a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Preload u47 // u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // // // offset.v = _mm_setr_epi32( 1023, 1023, 0, 0 ); // k1.v = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_2.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_3.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]); // k2.v = _mm_set_epi32( 
0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_2.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_3.v = _mm256_set_m128d( p2, p1 ); // // // //u03.v = _mm256_load_pd( (double*)u ); // //u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // // // c03_0.v = _mm256_mul_pd( a03_0.v, p03_0.v ); // c03_1.v = _mm256_mul_pd( a03_1.v, p03_1.v ); // c03_2.v = _mm256_mul_pd( a03_2.v, p03_2.v ); // c03_3.v = _mm256_mul_pd( a03_3.v, p03_3.v ); // c47_0.v = _mm256_mul_pd( a47_0.v, p47_0.v ); // c47_1.v = _mm256_mul_pd( a47_1.v, p47_1.v ); // c47_2.v = _mm256_mul_pd( a47_2.v, p47_2.v ); // c47_3.v = _mm256_mul_pd( a47_3.v, p47_3.v ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error exp Nan: c03_0[ %d ]\n", i ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error exp Nan: c03_1[ %d ]\n", i ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error exp Nan: c03_2[ %d ]\n", i ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error exp Nan: c03_3[ %d ]\n", i ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error exp Nan: c47_0[ %d ]\n", i ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error exp Nan: c47_1[ %d ]\n", i ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error exp Nan: c47_2[ %d ]\n", i ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error exp Nan: c47_3[ %d ]\n", i ); // } //} //printf( "exp\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //printf( "w\n" ); //printf( "%lf, %lf, %lf, %lf\n", w[0], w[3], w[3], w[3] ); //u03.v = _mm256_load_pd( (double*)u ); //u47.v = _mm256_load_pd( (double*)( u + 4 ) ); w_tmp.v = _mm256_broadcast_sd( (double*)w ); c03_0.v = _mm256_mul_pd( w_tmp.v, c03_0.v ); c47_0.v = _mm256_mul_pd( w_tmp.v, c47_0.v ); u03.v = _mm256_add_pd( u03.v, c03_0.v ); u47.v = _mm256_add_pd( u47.v, c47_0.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 1 ) ); c03_1.v = _mm256_mul_pd( w_tmp.v, c03_1.v ); c47_1.v = 
_mm256_mul_pd( w_tmp.v, c47_1.v ); u03.v = _mm256_add_pd( u03.v, c03_1.v ); u47.v = _mm256_add_pd( u47.v, c47_1.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 2 ) ); c03_2.v = _mm256_mul_pd( w_tmp.v, c03_2.v ); c47_2.v = _mm256_mul_pd( w_tmp.v, c47_2.v ); u03.v = _mm256_add_pd( u03.v, c03_2.v ); u47.v = _mm256_add_pd( u47.v, c47_2.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 3 ) ); c03_3.v = _mm256_mul_pd( w_tmp.v, c03_3.v ); c47_3.v = _mm256_mul_pd( w_tmp.v, c47_3.v ); u03.v = _mm256_add_pd( u03.v, c03_3.v ); u47.v = _mm256_add_pd( u47.v, c47_3.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} _mm256_store_pd( (double*)u, u03.v ); _mm256_store_pd( (double*)( u + 4 ), u47.v ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error gemv Nan: c03_0[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error gemv Nan: c03_1[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error gemv Nan: c03_2[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error gemv Nan: c03_3[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error gemv Nan: c47_0[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error gemv Nan: c47_1[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error gemv Nan: c47_2[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error gemv Nan: c47_3[ %d ]\n", i ); // exit( 1 ); // } //} //for ( i = 0; i < 4; i ++ ) { // if ( w[ i ] != w[ i ] ) { // printf( "GSKS error w Nan: w03[ %d ]\n", i ); // } //} //for ( i = 0; i < 4; i++ ) { // if ( u03.d[ i ] != u03.d[ i ] ) { // printf( "GSKS error u Nan: u03[ %d ]\n", i ); // } // if ( u47.d[ i ] != u47.d[ i ] ) { // printf( "GSKS error u Nan: u47[ %d ]\n", i ); // } //} //printf( "%lf\n", u03.d[0] ); //printf( "%lf\n", u03.d[1] ); //printf( "%lf\n", u03.d[2] ); //printf( "%lf\n", u03.d[3] ); //printf( "%lf\n", u47.d[0] ); //printf( "%lf\n", u47.d[1] ); //printf( "%lf\n", u47.d[2] ); //printf( "%lf\n", u47.d[3] ); }
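/* Editor's sketch (not part of the original kernel): the commented-out block in the
 * kernel above builds 2^k for integer exponents k by adding the IEEE-754 bias (1023),
 * shifting it into the exponent field with _mm_slli_epi32(..., 20) on the high 32-bit
 * word of each double, and reinterpreting the bits as doubles. A minimal scalar version
 * of the same bit trick, assuming -1022 <= k <= 1023; the function name is illustrative. */
#include <stdint.h>
#include <string.h>

static inline double pow2_from_exponent_bits(int k)
{
    uint64_t bits = (uint64_t)(k + 1023) << 52;   /* biased exponent into bits 52..62 */
    double result;
    memcpy(&result, &bits, sizeof(result));       /* bit-cast without aliasing issues */
    return result;                                /* equals ldexp(1.0, k) */
}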
static inline PetscErrorCode TensorContract_FMA(PetscInt dof,PetscInt P,PetscInt Q,const PetscReal Rf[],const PetscReal Sf[],const PetscReal Tf[],TensorMode tmode,const PetscScalar xx[],PetscScalar yy[]) { PetscFunctionBegin; if (tmode == TENSOR_TRANSPOSE) {PetscInt tmp = Q; Q = P; P = tmp;} { PetscReal R[Q][P],S[Q][P],T[Q][P]; const PetscScalar (*x)[P*P*P][NE] = (const PetscScalar(*)[P*P*P][NE])xx; PetscScalar (*y)[Q*Q*Q][NE] = (PetscScalar(*)[Q*Q*Q][NE])yy; PetscScalar u[dof][Q*P*P][NE]_align,v[dof][Q*Q*P][NE]_align; for (PetscInt i=0; i<Q; i++) { for (PetscInt j=0; j<P; j++) { R[i][j] = tmode == TENSOR_EVAL ? Rf[i*P+j] : Rf[j*Q+i]; S[i][j] = tmode == TENSOR_EVAL ? Sf[i*P+j] : Sf[j*Q+i]; T[i][j] = tmode == TENSOR_EVAL ? Tf[i*P+j] : Tf[j*Q+i]; } } // u[l,a,j,k] = R[a,i] x[l,i,j,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt a=0; a<Q; a++) { __m256d r[P]; for (PetscInt i=0; i<P; i++) r[i] = _mm256_set1_pd(R[a][i]); for (PetscInt jk=0; jk<P*P; jk++) { __m256d u_lajk = _mm256_setzero_pd(); for (PetscInt i=0; i<P; i++) { u_lajk = _mm256_fmadd_pd(r[i],_mm256_load_pd(x[l][i*P*P+jk]),u_lajk); } _mm256_store_pd(u[l][a*P*P+jk],u_lajk); } } } // v[l,a,b,k] = S[b,j] u[l,a,j,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt b=0; b<Q; b++) { __m256d s[P]; for (int j=0; j<P; j++) s[j] = _mm256_set1_pd(S[b][j]); for (PetscInt a=0; a<Q; a++) { for (PetscInt k=0; k<P; k++) { __m256d v_labk = _mm256_setzero_pd(); for (PetscInt j=0; j<P; j++) { v_labk = _mm256_fmadd_pd(s[j],_mm256_load_pd(u[l][(a*P+j)*P+k]),v_labk); } _mm256_store_pd(v[l][(a*Q+b)*P+k],v_labk); } } } } // y[l,a,b,c] = T[c,k] v[l,a,b,k] for (PetscInt l=0; l<dof; l++) { for (PetscInt c=0; c<Q; c++) { __m256d t[P]; for (int k=0; k<P; k++) t[k] = _mm256_set1_pd(T[c][k]); for (PetscInt ab=0; ab<Q*Q; ab++) { __m256d y_labc = _mm256_load_pd(y[l][ab*Q+c]); for (PetscInt k=0; k<P; k++) { // for (PetscInt e=0; e<NE; e++) y[l][ab*Q+c][e] += T[c][k] * v[l][ab*P+k][e]; y_labc = _mm256_fmadd_pd(t[k],_mm256_load_pd(v[l][ab*P+k]),y_labc); } _mm256_store_pd(y[l][ab*Q+c],y_labc); } } } PetscLogFlops(dof*(Q*P*P*P+Q*Q*P*P+Q*Q*Q*P)*NE*2); } PetscFunctionReturn(0); }
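/* Editor's sketch: the three contraction loops above all reduce to the same
 * fused-multiply-add pattern, acc = _mm256_fmadd_pd(a, b, acc), i.e. a*b + acc per lane.
 * A self-contained illustration (hypothetical name, unaligned loads for generality;
 * requires an FMA-capable target, e.g. -mavx2 -mfma): */
#include <immintrin.h>

static inline __m256d fmadd_accumulate(const double *a, const double *b, int n)
{
    __m256d acc = _mm256_setzero_pd();
    for (int i = 0; i + 4 <= n; i += 4)                 /* 4 doubles per iteration */
        acc = _mm256_fmadd_pd(_mm256_loadu_pd(a + i),
                              _mm256_loadu_pd(b + i), acc);
    return acc;                                         /* caller reduces the 4 lanes */
}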
// Computes and returns the dot product of the n-vectors u and v. // Uses Intel AVX intrinsics to access the SIMD instruction set. double DotProductAVX(const double* u, const double* v, int n) { int max_offset = n - 4; int offset = 0; // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and // v, and multiplying them together in parallel. __m256d sum = _mm256_setzero_pd(); if (offset <= max_offset) { offset = 4; // Aligned load is reputedly faster but requires 32 byte aligned input. if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 && (reinterpret_cast<const uintptr_t>(v) & 31) == 0) { // Use aligned load. __m256d floats1 = _mm256_load_pd(u); __m256d floats2 = _mm256_load_pd(v); // Multiply. sum = _mm256_mul_pd(floats1, floats2); while (offset <= max_offset) { floats1 = _mm256_load_pd(u + offset); floats2 = _mm256_load_pd(v + offset); offset += 4; __m256d product = _mm256_mul_pd(floats1, floats2); sum = _mm256_add_pd(sum, product); } } else { // Use unaligned load. __m256d floats1 = _mm256_loadu_pd(u); __m256d floats2 = _mm256_loadu_pd(v); // Multiply. sum = _mm256_mul_pd(floats1, floats2); while (offset <= max_offset) { floats1 = _mm256_loadu_pd(u + offset); floats2 = _mm256_loadu_pd(v + offset); offset += 4; __m256d product = _mm256_mul_pd(floats1, floats2); sum = _mm256_add_pd(sum, product); } } } // Add the 4 product sums together horizontally. Not so easy as with sse, as // there is no add across the upper/lower 128 bit boundary, so permute to // move the upper 128 bits to lower in another register. __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1); sum = _mm256_hadd_pd(sum, sum2); sum = _mm256_hadd_pd(sum, sum); double result; // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse // instruction, as that introduces a 70 cycle delay. All this casting is to // fool the instrinsics into thinking we are extracting the bottom int64. auto cast_sum = _mm256_castpd_si256(sum); *(reinterpret_cast<inT64*>(&result)) = #if defined(_WIN32) || defined(__i386__) // This is a very simple workaround that is activated // for all platforms that do not have _mm256_extract_epi64. // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y] ((uint64_t*)&cast_sum)[0] #else _mm256_extract_epi64(cast_sum, 0) #endif ; while (offset < n) { result += u[offset] * v[offset]; ++offset; } return result; }
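/* Editor's sketch of an alternative final reduction for the function above: instead of
 * casting the vector to integers and extracting the low 64 bits, the sum can be read
 * back through floating-point intrinsics. When the file is compiled with AVX enabled,
 * these 128-bit intrinsics should be emitted as VEX-encoded instructions, so the
 * SSE-transition penalty the comment above warns about is not expected to apply
 * (an assumption worth verifying on the target compiler). The name is illustrative. */
#include <immintrin.h>

static inline double HorizontalSum256d(__m256d v)
{
    __m128d lo = _mm256_castpd256_pd128(v);         /* v0, v1 */
    __m128d hi = _mm256_extractf128_pd(v, 1);       /* v2, v3 */
    lo = _mm_add_pd(lo, hi);                        /* v0+v2, v1+v3 */
    __m128d swap = _mm_unpackhi_pd(lo, lo);         /* v1+v3 in both lanes */
    return _mm_cvtsd_f64(_mm_add_sd(lo, swap));     /* (v0+v2) + (v1+v3) */
}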
int i, j, k, convolve, nStreams; MW_ALIGN_V(64) double psgt[256], psgf[256], xyzstr[256]; MW_ALIGN_V(64) double xs[256], ys[256], zs[256]; const __m256d REF_XR = _mm256_set1_pd(reff_xr_rp3); const __m256d COSBL = _mm256_set1_pd(lbt.lCosBCos); const __m256d SINB = _mm256_set1_pd(lbt.bSin); const __m256d SINCOSBL = _mm256_set1_pd(lbt.lSinBCos); const __m256d SUNR0 = _mm256_set1_pd(ap->sun_r0); const __m256d R0 = _mm256_set1_pd(ap->r0); const __m256d QV_RECIP = _mm256_set1_pd(ap->q_inv); __m256d RI, QI; ssp_m256 xyz0, xyz1, xyz2, tmp0, tmp1, tmp2, PROD, PBXV, BGP; //xyz0, 1, 2 = x, y, z BGP.d = _mm256_setzero_pd(); convolve = ap->convolve; nStreams = ap->number_streams; for (i = 0; i < convolve; i += 4) { /* Put r_point and qw_r3_n into RI and QI respectively */ RI = _mm256_load_pd(&r_point[i]); QI = _mm256_load_pd(&qw_r3_N[i]); /* Coordinate Transform to Galactic Center XYZ */ xyz0.d = _mm256_sub_pd(_mm256_mul_pd(RI, COSBL), SUNR0); //X Value /* xyz0.d = _mm256_fmadd_pd(RI, COSBL, NSUNR0); */ _mm256_store_pd(&xs[i], xyz0.d);
void rnn_int_d8x4_var2( int k, double *aa, double *a, double *bb, double *b, double *c, aux_t *aux ) { int i; double neg2 = -2.0; double dzero = 0.0; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t c_tmp; v4df_t a03, a47; v4df_t A03, A47; // prefetched A v4df_t b0, b1, b2, b3; v4df_t B0; // prefetched B v4df_t aa_tmp, bb_tmp; int k_iter = k / 2; int k_left = k % 2; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( aux->b_next ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( c ) ); c03_0.v = _mm256_setzero_pd(); c03_1.v = _mm256_setzero_pd(); c03_2.v = _mm256_setzero_pd(); c03_3.v = _mm256_setzero_pd(); c47_0.v = _mm256_setzero_pd(); c47_1.v = _mm256_setzero_pd(); c47_2.v = _mm256_setzero_pd(); c47_3.v = _mm256_setzero_pd(); // Load a03 a03.v = _mm256_load_pd( (double*)a ); // Load a47 a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // Load (b0,b1,b2,b3) b0.v = _mm256_load_pd( (double*)b ); for ( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Preload A03 A03.v = _mm256_load_pd( (double*)( a + 8 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Preload A47 A47.v = _mm256_load_pd( (double*)( a + 12 ) ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); // Preload B0 B0.v = _mm256_load_pd( (double*)( b + 4 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Iteration #1 __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Preload a03 ( next iteration ) a03.v = _mm256_load_pd( (double*)( a + 16 ) ); c_tmp.v = _mm256_mul_pd( A03.v , B0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , B0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c_tmp.v = _mm256_mul_pd( A03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // Preload a47 ( next iteration ) a47.v = _mm256_load_pd( (double*)( a + 20 ) ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( A47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c_tmp.v = _mm256_mul_pd( A03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Load b0 ( next iteration ) b0.v = _mm256_load_pd( (double*)( b + 8 ) ); c_tmp.v = _mm256_mul_pd( A03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( A47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 16; b += 8; } for ( i = 0; i < k_left; ++i ) { a03.v = _mm256_load_pd( (double*)a ); 
//printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] ); a47.v = _mm256_load_pd( (double*)( a + 4 ) ); //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] ); b0.v = _mm256_load_pd( (double*)b ); //printf( "b0 = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 8; b += 4; } // Prefetch aa and bb __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) ); tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 ); tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 ); tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 ); tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 ); tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 ); tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 ); tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 ); tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 ); //printf( "rank-k\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->I ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->D ) ); //for ( i = 0; i < k; i++ ) { // a03.v = _mm256_load_pd( (double*)a ); // a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // b0.v = _mm256_broadcast_sd( (double*)b ); // b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) ); // b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) ); // b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) ); // a += DKS_MR; // b += DKS_NR; // c_tmp.v = 
_mm256_mul_pd( a03.v , b0.v ); // c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); // c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); // c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); // c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); // c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); // c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); // c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); // c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); //} aa_tmp.v = _mm256_broadcast_sd( &neg2 ); //c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); //c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); //c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); //c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); //c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); //c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); //c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); //c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); //printf( "scale -2 \n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); aa_tmp.v = _mm256_load_pd( (double*)aa ); c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v ); //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] ); //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] ); aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) ); c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v ); //printf( "add a^2\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); 
bb_tmp.v = _mm256_broadcast_sd( (double*)bb ); c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v ); c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) ); c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v ); c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) ); c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v ); c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) ); c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v ); c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v ); // Check if there is any illegle value c_tmp.v = _mm256_broadcast_sd( &dzero ); c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v ); // Transpose c03/c47 _0, _1, _2, _3 to be the row vector tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 ); tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF ); tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 ); tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF ); tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 ); tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF ); tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 ); tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 ); // c03_0; // c03_1; // c03_2; // c03_3; // c47_0; // c47_1; // c47_2; // c47_3; _mm256_store_pd( c , c03_0.v ); _mm256_store_pd( c + 4, c03_1.v ); _mm256_store_pd( c + 8, c03_2.v ); _mm256_store_pd( c + 12, c03_3.v ); _mm256_store_pd( c + 16, c47_0.v ); _mm256_store_pd( c + 20, c47_1.v ); _mm256_store_pd( c + 24, c47_2.v ); _mm256_store_pd( c + 28, c47_3.v ); }
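/* Editor's sketch: the shuffle_pd(0x0/0xF) + permute2f128_pd(0x20/0x31) sequence at the
 * end of the kernel above is an in-register 4x4 transpose of doubles. A standalone
 * version of that pattern (name and array interface are illustrative): */
#include <immintrin.h>

static inline void transpose_4x4_pd(__m256d r[4])
{
    __m256d t0 = _mm256_shuffle_pd(r[0], r[1], 0x0);   /* a0 b0 a2 b2 */
    __m256d t1 = _mm256_shuffle_pd(r[0], r[1], 0xF);   /* a1 b1 a3 b3 */
    __m256d t2 = _mm256_shuffle_pd(r[2], r[3], 0x0);   /* c0 d0 c2 d2 */
    __m256d t3 = _mm256_shuffle_pd(r[2], r[3], 0xF);   /* c1 d1 c3 d3 */
    r[0] = _mm256_permute2f128_pd(t0, t2, 0x20);       /* a0 b0 c0 d0 */
    r[1] = _mm256_permute2f128_pd(t1, t3, 0x20);       /* a1 b1 c1 d1 */
    r[2] = _mm256_permute2f128_pd(t0, t2, 0x31);       /* a2 b2 c2 d2 */
    r[3] = _mm256_permute2f128_pd(t1, t3, 0x31);       /* a3 b3 c3 d3 */
}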
// it moves vertically across blocks void kernel_dtrmv_u_t_4_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; /* __builtin_prefetch( A + 0*lda );*/ /* __builtin_prefetch( A + 2*lda );*/ /* double *tA, *tx;*/ int k; __m256d zeros, tmp0, tmp1, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33; y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); k=0; for(; k<kmax-7; k+=8) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); A += 4 + (sda-1)*lda; x += 4; } zeros = _mm256_setzero_pd(); x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); __m256d y_0_1_2_3; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_11 = 
_mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_00 = _mm256_add_pd( y_00, y_11 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); } }
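/* Editor's sketch: the hadd/permute2f128 epilogue above packs the horizontal sums of
 * four accumulators into a single vector. The same reduction as a standalone helper
 * (name is illustrative): */
#include <immintrin.h>

static inline __m256d hsum4_pd(__m256d v0, __m256d v1, __m256d v2, __m256d v3)
{
    __m256d s01 = _mm256_hadd_pd(v0, v1);                  /* pairwise sums of v0, v1 */
    __m256d s23 = _mm256_hadd_pd(v2, v3);                  /* pairwise sums of v2, v3 */
    __m256d lo  = _mm256_permute2f128_pd(s23, s01, 0x02);  /* { s01[0], s01[1], s23[0], s23[1] } */
    __m256d hi  = _mm256_permute2f128_pd(s23, s01, 0x13);  /* { s01[2], s01[3], s23[2], s23[3] } */
    return _mm256_add_pd(lo, hi);       /* { hsum(v0), hsum(v1), hsum(v2), hsum(v3) } */
}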
// it moves vertically across blocks void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg) { if(kmax<=0) return; /*printf("\nciao %d\n", kmax); */ const int bs = 4; __builtin_prefetch( A + bs*0 ); __builtin_prefetch( A + bs*2 ); int k, ka; ka = kmax; // number from aligned positon double k_left; // double *sA, *sy_n, *sx_t; static double d_mask[4] = {0.5, 1.5, 2.5, 3.5}; __m256d v_mask, zeros, temp, a_00, a_01, a_02, a_03, x_n_0, x_n_1, x_n_2, x_n_3, y_n_0, x_t_0, y_t_0, y_t_1, y_t_2, y_t_3; __m256i i_mask; #if 0 __m128d stemp, sa_00, sa_01, sa_02, sa_03, sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0, sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3; #endif zeros = _mm256_setzero_pd(); x_n_0 = _mm256_broadcast_sd( &x_n[0] ); x_n_1 = _mm256_broadcast_sd( &x_n[1] ); x_n_2 = _mm256_broadcast_sd( &x_n[2] ); x_n_3 = _mm256_broadcast_sd( &x_n[3] ); if(alg==-1) // TODO xor { x_n_0 = _mm256_sub_pd( zeros, x_n_0 ); x_n_1 = _mm256_sub_pd( zeros, x_n_1 ); x_n_2 = _mm256_sub_pd( zeros, x_n_2 ); x_n_3 = _mm256_sub_pd( zeros, x_n_3 ); } y_t_0 = _mm256_setzero_pd(); y_t_1 = _mm256_setzero_pd(); y_t_2 = _mm256_setzero_pd(); y_t_3 = _mm256_setzero_pd(); #if 0 sx_n_0 = _mm256_castpd256_pd128( x_n_0 ); sx_n_1 = _mm256_castpd256_pd128( x_n_1 ); sx_n_2 = _mm256_castpd256_pd128( x_n_2 ); sx_n_3 = _mm256_castpd256_pd128( x_n_3 ); sy_t_0 = _mm256_castpd256_pd128( y_t_0 ); sy_t_1 = _mm256_castpd256_pd128( y_t_1 ); sy_t_2 = _mm256_castpd256_pd128( y_t_2 ); sy_t_3 = _mm256_castpd256_pd128( y_t_3 ); k = bs*(ka/bs); sA = A + (ka/bs)*sda*bs; sy_n = y_n + (ka/bs)*bs; sx_t = x_t + (ka/bs)*bs; for(; k<ka; k++) { sy_n_0 = _mm_load_sd( &sy_n[0] ); sx_t_0 = _mm_load_sd( &sx_t[0] ); sa_00 = _mm_load_sd( &sA[0+bs*0] ); sa_01 = _mm_load_sd( &sA[0+bs*1] ); sa_02 = _mm_load_sd( &sA[0+bs*2] ); sa_03 = _mm_load_sd( &sA[0+bs*3] ); stemp = _mm_mul_sd( sa_00, sx_n_0 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_00, sx_t_0 ); sy_t_0 = _mm_add_sd( sy_t_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_n_1 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_01, sx_t_0 ); sy_t_1 = _mm_add_sd( sy_t_1, stemp ); stemp = _mm_mul_sd( sa_02, sx_n_2 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_02, sx_t_0 ); sy_t_2 = _mm_add_sd( sy_t_2, stemp ); stemp = _mm_mul_sd( sa_03, sx_n_3 ); sy_n_0 = _mm_add_sd( sy_n_0, stemp ); stemp = _mm_mul_sd( sa_03, sx_t_0 ); sy_t_3 = _mm_add_sd( sy_t_3, stemp ); _mm_store_sd( &sy_n[0], sy_n_0 ); sA += 1; sy_n += 1; sx_t += 1; } y_t_0 = _mm256_castpd128_pd256( sy_t_0 ); y_t_1 = _mm256_castpd128_pd256( sy_t_1 ); y_t_2 = _mm256_castpd128_pd256( sy_t_2 ); y_t_3 = _mm256_castpd128_pd256( sy_t_3 ); #endif k=0; // corner if(tri==1) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); temp = _mm256_blend_pd( zeros, temp, 14 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 12 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); temp = 
_mm256_blend_pd( zeros, temp, 12 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); temp = _mm256_blend_pd( zeros, temp, 8 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; k += 4; } for(; k<ka-7; k+=2*bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } for(; k<ka-3; k+=bs) { __builtin_prefetch( A + sda*bs +bs*0 ); __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_loadu_pd( &x_t[0] ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_storeu_pd( &z_n[0], y_n_0 ); A += sda*bs; y_n += 4; z_n += 4; x_t += 4; } if(k<ka) { k_left = ka-k; v_mask = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) ); i_mask = _mm256_castpd_si256( v_mask ); // 
__builtin_prefetch( A + sda*bs +bs*0 ); // __builtin_prefetch( A + sda*bs +bs*2 ); y_n_0 = _mm256_loadu_pd( &y_n[0] ); x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask ); a_00 = _mm256_load_pd( &A[0+bs*0] ); a_01 = _mm256_load_pd( &A[0+bs*1] ); a_02 = _mm256_load_pd( &A[0+bs*2] ); a_03 = _mm256_load_pd( &A[0+bs*3] ); temp = _mm256_mul_pd( a_00, x_n_0 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_00, x_t_0 ); y_t_0 = _mm256_add_pd( y_t_0, temp ); temp = _mm256_mul_pd( a_01, x_n_1 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_01, x_t_0 ); y_t_1 = _mm256_add_pd( y_t_1, temp ); temp = _mm256_mul_pd( a_02, x_n_2 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_02, x_t_0 ); y_t_2 = _mm256_add_pd( y_t_2, temp ); temp = _mm256_mul_pd( a_03, x_n_3 ); y_n_0 = _mm256_add_pd( y_n_0, temp ); temp = _mm256_mul_pd( a_03, x_t_0 ); y_t_3 = _mm256_add_pd( y_t_3, temp ); _mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 ); // A += sda*bs; // y_n += 4; // z_n += 4; // x_t += 4; } __m256d y_0_1_2_3; y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 ); y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 ); y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2 ); y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 ); y_t_0 = _mm256_add_pd( y_t_0, y_t_1 ); if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 ); _mm256_storeu_pd( &z_t[0], y_0_1_2_3 ); } }
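/* Editor's sketch: the tail handling above builds a maskload mask by subtracting the
 * remaining count from {0.5, 1.5, 2.5, 3.5}; lanes that go negative get their sign bit
 * set, which is exactly what _mm256_maskload_pd tests. Standalone version (illustrative
 * name; n_left must be in 0..4): */
#include <immintrin.h>

static inline __m256d load_tail_pd(const double *p, int n_left)
{
    const __m256d lane_id = _mm256_set_pd(3.5, 2.5, 1.5, 0.5);   /* lane 0 holds 0.5 */
    __m256d v_mask = _mm256_sub_pd(lane_id, _mm256_set1_pd((double)n_left));
    __m256i i_mask = _mm256_castpd_si256(v_mask);   /* negative lane => element is loaded */
    return _mm256_maskload_pd(p, i_mask);           /* unloaded lanes read as 0.0 */
}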
vector_register<256, double> setZero() { return vector_register<256, double>(_mm256_setzero_pd()); }
// it moves vertically across blocks void kernel_dtrmv_u_t_8_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; /* __builtin_prefetch( A + 0*lda );*/ /* __builtin_prefetch( A + 2*lda );*/ /* __builtin_prefetch( A + 4*lda );*/ /* __builtin_prefetch( A + 6*lda );*/ /* double *tA, *tx;*/ int k; /* int ka = kmax-kna; // number from aligned positon*/ __m256d zeros, tmp0, tmp1, a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33, x_0_1_2_3, y_00, y_11, y_22, y_33, y_44, y_55, y_66, y_77; /* __m128d*/ /* ax_temp,*/ /* a_00_10, a_01_11, a_02_12, a_03_13,*/ /* x_0_1,*/ /* y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;*/ y_00 = _mm256_setzero_pd(); y_11 = _mm256_setzero_pd(); y_22 = _mm256_setzero_pd(); y_33 = _mm256_setzero_pd(); y_44 = _mm256_setzero_pd(); y_55 = _mm256_setzero_pd(); y_66 = _mm256_setzero_pd(); y_77 = _mm256_setzero_pd(); k=0; for(; k<kmax-7; k+=8) { /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); A += 4 + (sda-1)*lda; x += 4; /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 
); A += 4 + (sda-1)*lda; x += 4; } /* for(; k<ka-3; k+=4)*/ /* {*/ /* __builtin_prefetch( A + sda*lda + 0*lda );*/ /* __builtin_prefetch( A + sda*lda + 2*lda );*/ /* x_0_1_2_3 = _mm256_loadu_pd( &x[0] );*/ /* a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );*/ /* a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );*/ /* a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );*/ /* a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );*/ /* */ /* aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/ /* y_00 = _mm256_add_pd( y_00, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/ /* y_11 = _mm256_add_pd( y_11, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/ /* y_22 = _mm256_add_pd( y_22, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/ /* y_33 = _mm256_add_pd( y_33, aaxx_temp );*/ /* */ /* __builtin_prefetch( A + sda*lda + 4*lda );*/ /* __builtin_prefetch( A + sda*lda + 6*lda );*/ /* a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] );*/ /* a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] );*/ /* a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] );*/ /* a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] );*/ /* */ /* aaxx_temp = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 );*/ /* y_44 = _mm256_add_pd( y_44, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 );*/ /* y_55 = _mm256_add_pd( y_55, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 );*/ /* y_66 = _mm256_add_pd( y_66, aaxx_temp );*/ /* aaxx_temp = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 );*/ /* y_77 = _mm256_add_pd( y_77, aaxx_temp );*/ /* A += 4 + (sda-1)*lda;*/ /* x += 4;*/ /* }*/ zeros = _mm256_setzero_pd(); // top triangle x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); y_11 = _mm256_add_pd( y_11, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_22 = _mm256_add_pd( y_22, tmp0 ); y_33 = _mm256_add_pd( y_33, tmp1 ); // top square a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = _mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); A += 4 + (sda-1)*lda; x += 4; // bottom triangle x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*4] ); a_00_10_20_30 = _mm256_blend_pd( a_00_10_20_30, zeros, 0xe ); a_01_11_21_31 = _mm256_load_pd( &A[0+lda*5] ); a_01_11_21_31 = _mm256_blend_pd( a_01_11_21_31, zeros, 0xc ); a_02_12_22_32 = _mm256_load_pd( &A[0+lda*6] ); a_02_12_22_32 = _mm256_blend_pd( a_02_12_22_32, zeros, 0x8 ); a_03_13_23_33 = _mm256_load_pd( &A[0+lda*7] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_01_11_21_31, x_0_1_2_3 ); y_44 = 
_mm256_add_pd( y_44, tmp0 ); y_55 = _mm256_add_pd( y_55, tmp1 ); tmp0 = _mm256_mul_pd( a_02_12_22_32, x_0_1_2_3 ); tmp1 = _mm256_mul_pd( a_03_13_23_33, x_0_1_2_3 ); y_66 = _mm256_add_pd( y_66, tmp0 ); y_77 = _mm256_add_pd( y_77, tmp1 ); // store __m256d y_0_1_2_3, y_4_5_6_7; y_00 = _mm256_hadd_pd(y_00, y_11); y_22 = _mm256_hadd_pd(y_22, y_33); y_44 = _mm256_hadd_pd(y_44, y_55); y_66 = _mm256_hadd_pd(y_66, y_77); y_11 = _mm256_permute2f128_pd(y_22, y_00, 2 ); y_00 = _mm256_permute2f128_pd(y_22, y_00, 19); y_55 = _mm256_permute2f128_pd(y_66, y_44, 2 ); y_44 = _mm256_permute2f128_pd(y_66, y_44, 19); y_00 = _mm256_add_pd( y_00, y_11 ); y_44 = _mm256_add_pd( y_44, y_55 ); if(alg==0) { _mm256_storeu_pd(&y[0], y_00); _mm256_storeu_pd(&y[4], y_44); } else if(alg==1) { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } else // alg==-1 { y_0_1_2_3 = _mm256_loadu_pd( &y[0] ); y_4_5_6_7 = _mm256_loadu_pd( &y[4] ); y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_00 ); y_4_5_6_7 = _mm256_sub_pd( y_4_5_6_7, y_44 ); _mm256_storeu_pd(&y[0], y_0_1_2_3); _mm256_storeu_pd(&y[4], y_4_5_6_7); } }
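/* Editor's sketch (assumed semantics, conventional column-major storage with leading
 * dimension ldu; it deliberately ignores the 4-wide panel layout and sda handling of
 * the kernels above): a plain scalar reference for the y := op(U^T x) operation family,
 * with U upper triangular and alg selecting store (0), add (1) or subtract (-1). */
static void dtrmv_u_t_ref(int n, const double *U, int ldu,
                          const double *x, double *y, int alg)
{
    for (int j = 0; j < n; j++) {
        double acc = 0.0;
        for (int i = 0; i <= j; i++)          /* column j of U, rows 0..j */
            acc += U[i + j * ldu] * x[i];
        if (alg == 0)      y[j]  = acc;
        else if (alg == 1) y[j] += acc;
        else               y[j] -= acc;       /* alg == -1 */
    }
}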
// it moves vertically across blocks void kernel_dtrmv_u_t_1_lib4(int kmax, double *A, int sda, double *x, double *y, int alg) { /* if(kmax<=0) */ /* return;*/ const int lda = 4; double *tA, *tx; int k; __m256d tmp0, a_00_10_20_30, x_0_1_2_3, y_00; y_00 = _mm256_setzero_pd(); k=0; for(; k<kmax-3; k+=4) { x_0_1_2_3 = _mm256_loadu_pd( &x[0] ); a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] ); tmp0 = _mm256_mul_pd( a_00_10_20_30, x_0_1_2_3 ); y_00 = _mm256_add_pd( y_00, tmp0 ); A += 4 + (sda-1)*lda; x += 4; } __m128d tm0, a_00_10, a_01_11, x_0_1, y_0, y_1, y_0_1; tm0 = _mm256_extractf128_pd( y_00, 0x1 ); y_0 = _mm256_castpd256_pd128( y_00 ); y_0 = _mm_add_pd( y_0, tm0 ); if(k<kmax-1) { x_0_1 = _mm_loadu_pd( &x[0] ); a_00_10 = _mm_load_pd( &A[0+lda*0] ); tm0 = _mm_mul_pd( a_00_10, x_0_1 ); y_0 = _mm_add_pd( y_0, tm0 ); A += 2; x += 2; } x_0_1 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A[0+lda*0] ); tm0 = _mm_mul_sd( a_00_10, x_0_1 ); y_0 = _mm_add_sd( y_0, tm0 ); y_0 = _mm_hadd_pd( y_0, y_0 ); if(alg==0) { _mm_store_sd(&y[0], y_0); } else if(alg==1) { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_add_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } else // alg==-1 { y_0_1 = _mm_load_sd( &y[0] ); y_0_1 = _mm_sub_sd( y_0_1, y_0 ); _mm_store_sd(&y[0], y_0_1); } }
/*! * \return Error code. * \ingroup AlgConvolve * \brief Convolves double 1D kernel and data arrays, cnv = krn * data. * The return convolution array must not be aliased to * either the kernel or data arrays. * \param sizeArrayCnv Length of return array must be * >= max(len(dat),len(krn)). * \param arrayCnv Return convolution array. * \param sizeArrayKrn Length of kernel array, must be * odd. * \param arrayKrn Kernel array. * \param sizeArrayDat Length of data array. * \param arrayDat Data array. * \param pad Type of padding. * \param padVal Padding value, only used when * pad == ALG_PAD_VALUE. */ AlgError AlgConvolveD(int sizeArrayCnv, double *arrayCnv, int sizeArrayKrn, double *arrayKrn, int sizeArrayDat, double *arrayDat, AlgPadType pad, double padVal) { int pCnt, kCnt0, kCnt1, halfArrayKrn; double dat0, dat1; AlgError errCode = ALG_ERR_NONE; ALG_DBG((ALG_DBG_LVL_FN|ALG_DBG_LVL_1), ("AlgConvolve FE %d 0x%lx %d 0x%lx %d 0x%lx %d\n", sizeArrayCnv, (unsigned long )arrayCnv, sizeArrayKrn, (unsigned long )arrayKrn, sizeArrayDat, (unsigned long )arrayDat, (int )pad)); halfArrayKrn = sizeArrayKrn / 2; if((sizeArrayCnv <= 0) || (arrayCnv == NULL) || (sizeArrayKrn <= 0) || ((sizeArrayKrn % 2) != 1) || (arrayKrn == NULL) || (sizeArrayDat <= 0) || (arrayDat == NULL)) { errCode = ALG_ERR_FUNC; } else { switch(pad) { case ALG_PAD_NONE: pad = ALG_PAD_ZERO; break; case ALG_PAD_ZERO: break; case ALG_PAD_END: dat0 = arrayDat[0]; dat1 = arrayDat[sizeArrayDat - 1]; break; case ALG_PAD_VALUE: dat0 = padVal; dat1 = padVal; break; default: errCode = ALG_ERR_FUNC; break; } } if(errCode == ALG_ERR_NONE) { /* Pad leading data with zeros or first data value and convolve with the * kernel until the whole of the kernel is within the data. */ int idp; for(idp = 0; idp < halfArrayKrn; ++idp) { int idk; double cnv = 0.0; pCnt = halfArrayKrn - idp; if((pad == ALG_PAD_END) || pad == (ALG_PAD_VALUE)) { for(idk = 0; idk < pCnt; ++idk) { cnv += arrayKrn[idk]; } cnv *= dat0; } kCnt0 = sizeArrayKrn - pCnt; for(idk = 0; idk < kCnt0; ++idk) { cnv += arrayKrn[pCnt + idk] * arrayDat[idk]; } arrayCnv[idp] = cnv; } /* Between leading and trailing padding regions just convolue the data * with the kernel. */ pCnt = sizeArrayDat - sizeArrayKrn + 1; #if defined ALG_FAST_CODE && defined __AVX2__ { int sizeArrayKrn4; sizeArrayKrn4 = sizeArrayKrn - (sizeArrayKrn % 4); for(idp = 0; idp < pCnt; ++idp) { int idk; double *dP; double *cP; __m256d c; c = _mm256_setzero_pd(); dP = arrayDat + idp; for(idk = 0; idk < sizeArrayKrn4; idk += 4) { __m256d d, k; d = _mm256_loadu_pd(dP + idk); k = _mm256_loadu_pd(arrayKrn + idk); c = _mm256_add_pd(c, _mm256_mul_pd(d, k)); } cP = (double *)&c; cP[0] = cP[0] + cP[1] + cP[2] + cP[3]; for(idk = sizeArrayKrn4; idk < sizeArrayKrn; ++idk) { cP[0] += arrayKrn[idk] * dP[idk]; } arrayCnv[halfArrayKrn + idp] = cP[0]; } } #else /* !ALG_FAST_CODE */ for(idp = 0; idp < pCnt; ++idp) { int idk; double cnv = 0.0; for(idk = 0; idk < sizeArrayKrn; ++idk) { cnv += arrayKrn[idk] * arrayDat[idp + idk]; } arrayCnv[halfArrayKrn + idp] = cnv; } #endif /* ALG_FAST_CODE */ /* Pad trailing data with zeros or last data value and convolve with the * kernel until the whole of the kernel is outside the data. 
*/ for(idp = 0; idp < halfArrayKrn; ++idp) { int idk, idt; double cnv = 0.0; kCnt0 = sizeArrayKrn - idp - 1; idt = idp + sizeArrayDat - sizeArrayKrn + 1; for(idk = 0; idk < kCnt0; ++idk) { cnv += arrayKrn[idk] * arrayDat[idt + idk]; } if((pad == ALG_PAD_END) || pad == (ALG_PAD_VALUE)) { double cnv1 = 0.0; kCnt1 = sizeArrayKrn - kCnt0; for(idk = 0; idk < kCnt1; ++idk) { cnv1 += arrayKrn[kCnt0 + idk]; } cnv += cnv1 * dat1; } arrayCnv[sizeArrayDat - halfArrayKrn + idp] = cnv; } } ALG_DBG((ALG_DBG_LVL_FN|ALG_DBG_LVL_1), ("AlgConvolve FX %d\n", (int )errCode)); return(errCode); }
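/* Editor's sketch: a minimal driver for the interface documented above. The 3-tap box
 * kernel and 8-sample signal are made-up test data; the necessary Alg declarations are
 * assumed to be in scope via the library's usual header. */
#include <stdio.h>

int exampleConvolveD(void)
{
    double krn[3] = {1.0 / 3.0, 1.0 / 3.0, 1.0 / 3.0};           /* odd-length kernel */
    double dat[8] = {1.0, 2.0, 4.0, 8.0, 8.0, 4.0, 2.0, 1.0};
    double cnv[8];                                               /* >= max(len(dat), len(krn)) */
    AlgError err = AlgConvolveD(8, cnv, 3, krn, 8, dat, ALG_PAD_END, 0.0);
    if (err == ALG_ERR_NONE) {
        for (int i = 0; i < 8; ++i) printf("%g ", cnv[i]);
        printf("\n");
    }
    return (err == ALG_ERR_NONE) ? 0 : 1;
}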
void AVX2FMA3DNoise(Vector3d& result, const Vector3d& EPoint) { #if CHECK_FUNCTIONAL Vector3d param(EPoint); #endif AVX2TABLETYPE *mp; // TODO FIXME - global statistics reference // Stats[Calls_To_DNoise]++; const __m256d ONE_PD = _mm256_set1_pd(1.0); const __m128i short_si128 = _mm_set1_epi32(0xffff); const __m256d xyzn = _mm256_setr_pd(EPoint[X], EPoint[Y], EPoint[Z], 0); const __m256d epsy = _mm256_set1_pd(1.0 - EPSILON); const __m256d xyzn_e = _mm256_sub_pd(xyzn, epsy); const __m128i tmp_xyzn = _mm256_cvttpd_epi32(_mm256_blendv_pd(xyzn, xyzn_e, xyzn)); const __m128i noise_min_xyzn = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, NOISE_MINZ, 0); const __m256d xyz_ixyzn = _mm256_sub_pd(xyzn, _mm256_cvtepi32_pd(tmp_xyzn)); const __m256d xyz_jxyzn = _mm256_sub_pd(xyz_ixyzn, ONE_PD); const __m128i i_xyzn = _mm_and_si128(_mm_sub_epi32(tmp_xyzn, noise_min_xyzn), _mm_set1_epi32(0xfff)); const __m256d s_xyzn = _mm256_mul_pd(xyz_ixyzn, _mm256_mul_pd(xyz_ixyzn, _mm256_sub_pd(_mm256_set1_pd(3.0), _mm256_add_pd(xyz_ixyzn, xyz_ixyzn)))); const __m256d t_xyzn = _mm256_sub_pd(ONE_PD, s_xyzn); const __m256d txtysxsy = _mm256_permute2f128_pd(t_xyzn, s_xyzn, 0x20); const __m256d txsxtxsx = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(2, 0, 2, 0)); const __m256d tytysysy = PERMUTE4x64(txtysxsy, _MM_SHUFFLE(3, 3, 1, 1)); const __m256d txtysxtytxsysxsy = _mm256_mul_pd(txsxtxsx, tytysysy); const __m256d incrsump_s1 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(t_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); const __m256d incrsump_s2 = _mm256_mul_pd(txtysxtytxsysxsy, PERMUTE4x64(s_xyzn, _MM_SHUFFLE(2, 2, 2, 2))); int ints[4]; _mm_storeu_si128((__m128i*)(ints), i_xyzn); const int ixiy_hash = Hash2d(ints[0], ints[1]); const int jxiy_hash = Hash2d(ints[0] + 1, ints[1]); const int ixjy_hash = Hash2d(ints[0], ints[1] + 1); const int jxjy_hash = Hash2d(ints[0] + 1, ints[1] + 1); const int iz = ints[2]; const __m256d iii = _mm256_blend_pd(PERMUTE4x64(xyz_ixyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); const __m256d jjj = _mm256_blend_pd(PERMUTE4x64(xyz_jxyzn, _MM_SHUFFLE(2, 1, 0, 0)), _mm256_set_pd(0, 0, 0, 0.5), 0x1); __m256d ss; __m256d blend; __m256d x = _mm256_setzero_pd(), y = _mm256_setzero_pd(), z = _mm256_setzero_pd(); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(0, 0, 0, 0)); // blend = _mm256_blend_pd(iii, jjj, 0); INCSUMAVX_VECTOR(mp, ss, iii); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 2); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(3, 3, 3, 3)); blend = _mm256_blend_pd(iii, jjj, 6); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz)]; ss = PERMUTE4x64(incrsump_s1, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 4); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(ixjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(2, 2, 2, 2)); blend = _mm256_blend_pd(iii, jjj, 12); INCSUMAVX_VECTOR(mp, ss, blend); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxjy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(3, 3, 3, 3)); // blend = _mm256_blend_pd(iii, jjj, 14); INCSUMAVX_VECTOR(mp, ss, jjj); mp = &AVX2RTable[Hash1dRTableIndexAVX(jxiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(1, 1, 1, 1)); blend = _mm256_blend_pd(iii, jjj, 10); INCSUMAVX_VECTOR(mp, ss, blend); mp = 
&AVX2RTable[Hash1dRTableIndexAVX(ixiy_hash, iz + 1)]; ss = PERMUTE4x64(incrsump_s2, _MM_SHUFFLE(0, 0, 0, 0)); blend = _mm256_blend_pd(iii, jjj, 8); INCSUMAVX_VECTOR(mp, ss, blend); __m256d xy = _mm256_hadd_pd(x,y); __m128d xy_up = _mm256_extractf128_pd(xy,1); xy_up = _mm_add_pd(_mm256_castpd256_pd128(xy),xy_up); _mm_storeu_pd(&result[X],xy_up); __m128d z_up = _mm256_extractf128_pd(z,1); z_up = _mm_add_pd(_mm256_castpd256_pd128(z),z_up); z_up = _mm_hadd_pd(z_up,z_up); result[Z] = _mm_cvtsd_f64(z_up); #if CHECK_FUNCTIONAL { Vector3d portable_res; PortableDNoise(portable_res , param); if (fabs(portable_res[X] - result[X]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise X error"); } if (fabs(portable_res[Y] - result[Y]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Y error"); } if (fabs(portable_res[Z] - result[Z]) >= EPSILON) { throw POV_EXCEPTION_STRING("DNoise Z error"); } } #endif _mm256_zeroupper(); return; }
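/* Editor's sketch: the lattice-coordinate conversion near the top of AVX2FMA3DNoise
 * gets floor-like behaviour out of the truncating _mm256_cvttpd_epi32 by first nudging
 * negative inputs down by (1 - EPSILON); _mm256_blendv_pd selects its second operand
 * wherever the sign bit of its third operand is set. Standalone version (illustrative
 * name; the epsilon constant is a stand-in for the original EPSILON): */
#include <immintrin.h>

static inline __m128i floor_pd_to_epi32(__m256d v)
{
    const __m256d almost_one = _mm256_set1_pd(1.0 - 1.0e-10);   /* stand-in for 1.0 - EPSILON */
    __m256d nudged = _mm256_sub_pd(v, almost_one);
    return _mm256_cvttpd_epi32(_mm256_blendv_pd(v, nudged, v)); /* negative lanes use the nudged value */
}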
// it moves horizontally inside a block void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg) { if(kmax<=0) return; double *A1 = A0 + 4*sda; const int lda = 4; int k; __m128d tmp0, z_0, y_0_1, a_00_10; __m256d zeros, ax_temp, a_00_10_20_30, a_01_11_21_31, a_40_50_60_70, a_41_51_61_71, x_0, x_1, y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3, y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7; /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_4_5_6_7 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_4_5_6_7_b = _mm256_setzero_pd(); */ zeros = _mm256_setzero_pd(); /* y_0_1_2_3 = _mm256_setzero_pd(); */ /* y_0_1_2_3_b = _mm256_setzero_pd(); */ /* y_0_1_2_3_c = _mm256_setzero_pd(); */ /* y_0_1_2_3_d = _mm256_setzero_pd();*/ // upper triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A0[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A0[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 ); y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, y_0_1_2_3_b, 0xc ); // forth col (avoid zero y_0_1_2_3) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // first col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; // upper squared x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); x_0 = _mm256_broadcast_sd( &x[2] ); x_1 = _mm256_broadcast_sd( &x[3] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp ); ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 ); y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp ); // lower triangular // second col (avoid zero y_0_1) z_0 = _mm_loaddup_pd( &x[1] ); a_00_10 = _mm_load_pd( &A1[0+lda*1] ); y_0_1 = _mm_mul_pd( a_00_10, z_0 ); // first col z_0 = _mm_load_sd( &x[0] ); a_00_10 = _mm_load_sd( &A1[0+lda*0] ); tmp0 = _mm_mul_sd( a_00_10, z_0 ); y_0_1 = _mm_add_sd( y_0_1, tmp0 ); y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 ); y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, y_4_5_6_7_b, 0xc ); // forth col (avoid zero y_4_5_6_7) x_1 = _mm256_broadcast_sd( &x[3] ); a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] ); y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 ); // first col x_0 = _mm256_broadcast_sd( &x[2] ); x_0 = _mm256_blend_pd( x_0, zeros, 0x8 ); a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] ); ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 ); y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp ); A0 += 4*lda; A1 += 4*lda; x += 4; k=8; for(; k<kmax-3; k+=4) { /* __builtin_prefetch( A0 + 4*lda );*/ /* __builtin_prefetch( A1 + 4*lda );*/ x_0 = _mm256_broadcast_sd( &x[0] ); x_1 = _mm256_broadcast_sd( &x[1] ); a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] ); a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] ); a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] ); 
        a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

        ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
        ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
        ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
        y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

        /* __builtin_prefetch( A0 + 5*lda ); */
        /* __builtin_prefetch( A1 + 5*lda ); */

        x_0 = _mm256_broadcast_sd( &x[2] );
        x_1 = _mm256_broadcast_sd( &x[3] );

        a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
        a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
        a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
        a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

        ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
        ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
        ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
        y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

        A0 += 4*lda;
        A1 += 4*lda;
        x  += 4;
    }

    if(kmax%4>=2)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );
        x_1 = _mm256_broadcast_sd( &x[1] );

        a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
        a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
        a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

        ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
        ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
        ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
        y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

        A0 += 2*lda;
        A1 += 2*lda;
        x  += 2;
    }

    y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
    y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

    if(kmax%2==1)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );

        a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
        a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

        ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
        y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );

        /* A0 += 1*lda; */
        /* A1 += 1*lda; */
        /* x  += 1;     */
    }

    if(alg==0)
    {
        _mm256_storeu_pd(&y[0], y_0_1_2_3);
        _mm256_storeu_pd(&y[4], y_4_5_6_7);
    }
    else if(alg==1)
    {
        z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

        z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
        z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

        _mm256_storeu_pd(&y[0], z_0_1_2_3);
        _mm256_storeu_pd(&y[4], z_4_5_6_7);
    }
    else // alg==-1
    {
        z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

        z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
        z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

        _mm256_storeu_pd(&y[0], z_0_1_2_3);
        _mm256_storeu_pd(&y[4], z_4_5_6_7);
    }
}
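For reference, here is a plain scalar version of what the 8-row kernel above is assumed to compute: z = A*x for an upper-triangular A whose 8 rows start at panel A0, reading the panel-major "lib4" layout as element (i, j) at offset (i%4) + 4*j inside panel i/4, with the second panel 4*sda doubles after the first; alg selects store, accumulate, or subtract, and kmax >= 8 is assumed just as in the vectorized head. The name ref_dtrmv_u_n_8 and this reading of the layout are my assumptions, not taken from the source.

/* Minimal scalar sketch (illustrative only), under the layout assumption
 * A(i, j) = pA[i/4][(i%4) + 4*j], second panel at A0 + 4*sda. */
static void ref_dtrmv_u_n_8(int kmax, const double *A0, int sda,
                            const double *x, double *y, int alg)
{
    const double *pA[2] = { A0, A0 + 4*sda };  /* rows 0-3 and rows 4-7 */
    double z[8];

    for (int r = 0; r < 8; r++)
    {
        double acc = 0.0;
        for (int j = r; j < kmax; j++)  /* upper triangular: only columns j >= r */
            acc += pA[r/4][(r%4) + 4*j] * x[j];
        z[r] = acc;
    }

    for (int r = 0; r < 8; r++)
        y[r] = (alg == 0) ? z[r]
             : (alg == 1) ? y[r] + z[r]
             :              y[r] - z[r];
}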
// it moves horizontally inside a block (A upper triangular)
void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
{
    if(kmax<=0)
        return;

    const int lda = 4;

    int k;

    __m128d
        tmp0,
        z_0, y_0_1, a_00_10;

    __m256d
        zeros, ax_temp,
        a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
        x_0, x_1, x_2, x_3,
        y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;

    zeros = _mm256_setzero_pd();

    /* y_0_1_2_3   = _mm256_setzero_pd(); */
    /* y_0_1_2_3_b = _mm256_setzero_pd(); */
    /* y_0_1_2_3_c = _mm256_setzero_pd(); */
    y_0_1_2_3_d = _mm256_setzero_pd();

    // second col (avoid zero y_0_1)
    z_0 = _mm_loaddup_pd( &x[1] );
    a_00_10 = _mm_load_pd( &A[0+lda*1] );
    y_0_1 = _mm_mul_pd( a_00_10, z_0 );

    // first col
    z_0 = _mm_load_sd( &x[0] );
    a_00_10 = _mm_load_sd( &A[0+lda*0] );
    tmp0 = _mm_mul_sd( a_00_10, z_0 );
    y_0_1 = _mm_add_sd( y_0_1, tmp0 );

    y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 );
    y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc );

    // fourth col (avoid zero y_0_1_2_3)
    x_3 = _mm256_broadcast_sd( &x[3] );
    a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
    y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 );

    // third col
    x_2 = _mm256_broadcast_sd( &x[2] );
    x_2 = _mm256_blend_pd( x_2, zeros, 0x8 );
    a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
    y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 );

    A += 4*lda;
    x += 4;

    k = 4;
    for(; k<kmax-3; k+=4)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );
        x_1 = _mm256_broadcast_sd( &x[1] );

        a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

        x_2 = _mm256_broadcast_sd( &x[2] );
        x_3 = _mm256_broadcast_sd( &x[3] );

        a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
        a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

        ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

        ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 );
        y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
        ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 );
        y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );

        A += 4*lda;
        x += 4;
    }

    y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c );
    y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

    if(kmax%4>=2)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );
        x_1 = _mm256_broadcast_sd( &x[1] );

        a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
        a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

        ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
        ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
        y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

        A += 2*lda;
        x += 2;
    }

    y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

    if(kmax%2==1)
    {
        x_0 = _mm256_broadcast_sd( &x[0] );
        a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

        ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
        y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
    }

    if(alg==0)
    {
        _mm256_storeu_pd(&y[0], y_0_1_2_3);
    }
    else if(alg==1)
    {
        z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
        _mm256_storeu_pd(&y[0], z_0_1_2_3);
    }
    else // alg==-1
    {
        z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
        z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
        _mm256_storeu_pd(&y[0], z_0_1_2_3);
    }
}
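A small driver helps sanity-check these kernels. The sketch below packs an 8x8 upper-triangular matrix into the 4-row panel-major layout the *_lib4 kernels appear to expect and calls kernel_dtrmv_u_n_8_lib4 once with alg==0, comparing against a plain column-major product. pack_panel_major, the sda choice, and the test values are illustrative assumptions, not part of the original source; it assumes the kernel above is compiled into the same program.

#include <stdio.h>
#include <stdlib.h>

#define BS 4  /* panel height assumed by the *_lib4 kernels */

void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg);

/* Copy a column-major m x n matrix into the assumed panel-major layout:
 * element (i, j) goes to pA[(i/BS)*BS*sda + (i%BS) + BS*j]. */
static void pack_panel_major(int m, int n, const double *A, int lda_src,
                             double *pA, int sda)
{
    for (int j = 0; j < n; j++)
        for (int i = 0; i < m; i++)
            pA[(i/BS)*BS*sda + (i%BS) + BS*j] = A[i + j*lda_src];
}

int main(void)
{
    enum { N = 8, SDA = N };
    double A[N*N], x[N], y[N];
    /* 32-byte alignment: the kernels use aligned _mm256_load_pd on the panels */
    double *pA = aligned_alloc(32, sizeof(double) * (N/BS) * BS * SDA);

    for (int j = 0; j < N; j++)
        for (int i = 0; i < N; i++)
            A[i + j*N] = (i <= j) ? 1.0 + i + 0.1*j : 0.0;  /* upper triangular */
    for (int i = 0; i < N; i++)
        x[i] = 1.0 + i;

    pack_panel_major(N, N, A, N, pA, SDA);
    kernel_dtrmv_u_n_8_lib4(N, pA, SDA, x, y, 0);  /* alg==0: y = A*x */

    for (int i = 0; i < N; i++)
    {
        double y_ref = 0.0;
        for (int j = i; j < N; j++)
            y_ref += A[i + j*N] * x[j];
        printf("y[%d] = %10.4f   (scalar ref %10.4f)\n", i, y[i], y_ref);
    }

    free(pA);
    return 0;
}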